Strip HTML from Strings in Python

Strip HTML from strings in Python

I always used this function to strip HTML tags, as it requires only the Python stdlib:

For Python 3:

from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text content."""

    def __init__(self):
        # convert_charrefs=True makes the parser decode references such as
        # "&amp;" before the text reaches handle_data(). The base __init__
        # already calls reset(), so no explicit reset() is needed here.
        super().__init__(convert_charrefs=True)
        # In-memory buffer that accumulates every text chunk.
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return *html* with all tags removed and character references decoded."""
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains an unterminated "&" (e.g. "AT&T"), in case the rest of a
    # character reference arrives in a later feed(). close() flushes that
    # pending text; without it the tail of the input is silently lost.
    s.close()
    return s.get_data()

For Python 2:

from HTMLParser import HTMLParser
from StringIO import StringIO

# Python 2 variant: a parser that keeps only the text found between tags.
class MLStripper(HTMLParser):
def __init__(self):
# Calls reset() directly instead of going through a base-class __init__.
self.reset()
# In-memory buffer that accumulates the text chunks.
self.text = StringIO()
# NOTE(review): handle_entityref/handle_charref are not overridden here, so
# references such as "&amp;" are presumably dropped from the output under
# Python 2 -- confirm before relying on this for entity-heavy input.
def handle_data(self, d):
# Append each text chunk reported by the parser to the buffer.
self.text.write(d)
def get_data(self):
# Return everything collected so far as one string.
return self.text.getvalue()

# Feed the markup through a fresh MLStripper and return the collected text.
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()

Python, remove all html tags from string

You could use get_text()

for i in content:
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(i.get_text())

Example below is from the docs:

>>> markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
>>> soup = BeautifulSoup(markup)
>>> soup.get_text()
u'\nI linked to example.com\n'

How do I remove HTML tags from a list of strings that contain the same HTML tags?

You can create a for-loop and call .get_text() from it:

import requests
from bs4 import BeautifulSoup

URL = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=oneplus%206t&_sacat=0&rt=nc&_udlo=150&_udhi=450"
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(URL, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# find_all() is the current name of the deprecated findAll() alias.
for price in soup.find_all("span", {"class": "s-item__price"}):
    # strip=True trims the whitespace around the price text.
    print(price.get_text(strip=True))

Prints:

$449.99
$449.99
$414.46
$399.00
$399.95
$349.99
$449.00
$585.00
...and so on.

EDIT: To print title and price, you could do for example:

# Only visit listing items that contain both a title and a price.
for tag in soup.select('li.s-item:has(.s-item__title):has(.s-item__price)'):
    price_text = tag.select_one('.s-item__price').get_text(strip=True)
    title_text = tag.select_one('.s-item__title').get_text(strip=True, separator=' ')
    # Left-align the price in a 10-character column, followed by the title.
    print('{: <10} {}'.format(price_text, title_text))

Prints:

$449.99    SPONSORED OnePlus 6T 128GB 8GB RAM A6010 - Midnight Black (Unlocked) Global Version
$449.99 OnePlus 6T 128GB 8GB RAM A6010 - Midnight Black (Unlocked) Global Version
$414.46 Oneplus 6t dual sim 256gb midnight black black 6.41" unlocked ram 8gb a6010
$399.00 SPONSORED OnePlus 6T A6013, Clean ESN, Unknown Carrier, Coffee
$399.95 SPONSORED OnePlus 6T 4G LTE 6.41" 128GB ROM 8GB RAM A6013 (T-Mobile) - Mirror Black
$349.99 ONEPLUS 6T - BLACK - 128GB - (T-MOBILE) ~3841
$449.00 OnePlus 6t McLaren Edition Unlocked 256GB 10GB RAM Original Accessories Included
$434.83 OnePlus 6T 8 GB RAM 128 GB UK SIM-Free Smartphone (ML3658)
$265.74 Oneplus 6t
$241.58 New Listing OnePlus 6T 8GB 128GB UNLOCKED
$419.95 NEW IN BOX Oneplus 6T 128GB Mirror Black (T-mobile/Metro PCS/Mint) 8gb RAM
$435.99 OnePlus 6T - 128GB 6GB RAM - Mirror Black (Unlocked) Global Version

... and so on.

Python: strip html from text data

from BeautifulSoup import BeautifulSoup

def removeTags(html, *tags):
    """Parse *html* and blank out every element named in *tags*.

    Each matching element is replaced with an empty string. The parsed
    (and now modified) BeautifulSoup document is returned, not a plain str.
    """
    soup = BeautifulSoup(html)
    for tag_name in tags:
        # A distinct name for the matched element avoids shadowing the
        # tag name that drives the outer loop.
        for element in soup.findAll(tag_name):
            element.replaceWith("")

    return soup


# Sample document used to demonstrate removeTags().
testhtml = '''
<html>
<head>
<title>Page title</title>
</head>
<body>text here<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>'''

# Python 2 print statement: shows the document after removeTags() has
# processed the 'b' and 'p' tag names.
print removeTags(testhtml, 'b', 'p')

How to remove HTML tags from python strings?

With only one line of markup, using a dedicated parser is kind of overkill. However, for larger sets of data, using a parser like BeautifulSoup is the way to go. See an example below.

from bs4 import BeautifulSoup as bsoup
import re

markup = """
<h2 class="debateHeaderProp">This house believes that society benefits when we share personal information online.</h2>
<span class="debateFormat">Oregon-Oxford, Cross Examination</span>
<div class="debateAffirmSide">On the affirmative: Foo Debate Club</div>
<div class="debateOpposeSide">On the opposition: Bar Debate Club</div>
"""
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and emits a warning), so results can vary by machine.
soup = bsoup(markup, "html.parser")

# Explicitly define the tag and class.
motion = soup.find("h2", class_="debateHeaderProp").get_text()
# Or just use the class.
d_format = soup.find(class_="debateFormat").get_text()
# And even use regex for more power.
teams = [t.get_text() for t in soup.find_all("div", class_=re.compile(r".*debate.*Side.*"))]

# print() is called as a function so the example runs on Python 3.
print("Our Debate for Today")
print("Motion:", motion)
print("Format:", d_format)
print(teams[0])
print(teams[1])

# Prints the following:
# Our Debate for Today
# Motion: This house believes that society benefits when we share personal information online.
# Format: Oregon-Oxford, Cross Examination
# On the affirmative: Foo Debate Club
# On the opposition: Bar Debate Club

Another option is to use an XML/HTML parser such as lxml.

Delete HTML Tags from string Python

Delete all tags:

import re

text = "This is the description of <img alt='' height='1' src='http://linkOfARandomImage.of/the/feed' width='1' /> the <br> text"
# Non-greedy match, so each "<...>" run is removed on its own instead of
# everything between the first "<" and the last ">".
tag_pattern = re.compile("<.*?>")
text = tag_pattern.sub("", text)
# text == "This is the description of  the  text"
# (the spaces that surrounded each removed tag are still there)

Collapse unnecessary whitespace:

# \s matches whitespace (\w would match word characters and wipe out the
# text itself); "+" collapses each run of whitespace into a single space.
text = re.sub(r"\s+", " ", text)

EDIT:

# Collapse every run of whitespace into one space. The class must be \s
# (whitespace); \w matches word characters and would replace the words.
text = re.sub(r"\s+", " ", text)

Removing html tags and entities from string in python

Try this regular expression, which matches tags whose angle brackets are written as the entities &lt; and &gt;:

(&lt;).*?(&gt;)



Related Topics



Leave a reply



Submit