Strip HTML from Strings in Python

Strip HTML from strings in Python

I always used this function to strip HTML tags, as it requires only the Python stdlib:

For Python 3:

from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that ignores tags and accumulates the text between them.

    Character references such as ``&amp;`` or ``&#62;`` are converted to the
    characters they stand for before the text reaches ``handle_data``.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), and the old ``strict``
        # flag no longer exists in Python 3.5+, so neither is set here.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: The markup to clean, as a ``str``.

    Returns:
        The text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # could be an unfinished character reference (e.g. "AT&T"). close()
    # flushes it so the end of the input is not silently dropped.
    s.close()
    return s.get_data()

For Python 2:

from HTMLParser import HTMLParser
from StringIO import StringIO

class MLStripper(HTMLParser):
    """HTMLParser subclass that writes every chunk given to handle_data into a buffer."""
    def __init__(self):
        # Only reset() is called here; the base-class __init__ is not invoked.
        self.reset()
        # In-memory buffer that accumulates the text chunks.
        self.text = StringIO()
    def handle_data(self, d):
        # Append the text chunk `d` to the buffer.
        self.text.write(d)
    def get_data(self):
        # Return everything written to the buffer so far as one string.
        return self.text.getvalue()

def strip_tags(html):
    """Feed `html` through a fresh MLStripper and return the text it collected."""
    s = MLStripper()
    s.feed(html)
    return s.get_data()

Python, remove all html tags from string

You could use get_text()

# `content` is presumably an iterable of BeautifulSoup elements from the
# asker's code -- confirm against the original question.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for i in content:
    print(i.get_text())

Example below is from the docs:

>>> markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
>>> soup = BeautifulSoup(markup)
>>> soup.get_text()
u'\nI linked to example.com\n'

How do I remove HTML tags from a list of strings that contain the same HTML tags?

You can create a for-loop and call .get_text() from it:

import requests
from bs4 import BeautifulSoup

URL = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=oneplus%206t&_sacat=0&rt=nc&_udlo=150&_udhi=450"
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'}
# requests has no default timeout; bound the call so a stalled connection
# cannot hang the script forever.
page = requests.get(URL, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of quietly parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# find_all is the current spelling of the deprecated findAll alias.
for price in soup.find_all("span", class_="s-item__price"):
    print(price.get_text(strip=True))

Prints:

$449.99
$449.99
$414.46
$399.00
$399.95
$349.99
$449.00
$585.00
...and so on.

EDIT: To print title and price, you could do for example:

# Visit every listing that has both a title and a price, and print the price
# (left-aligned, padded to 10 characters) followed by the title.
for item in soup.select('li.s-item:has(.s-item__title):has(.s-item__price)'):
    price_text = item.select_one('.s-item__price').get_text(strip=True)
    title_text = item.select_one('.s-item__title').get_text(strip=True, separator=' ')
    print('{: <10} {}'.format(price_text, title_text))

Prints:

$449.99    SPONSORED OnePlus 6T 128GB 8GB RAM A6010 - Midnight Black (Unlocked) Global Version
$449.99    OnePlus 6T 128GB 8GB RAM A6010 - Midnight Black (Unlocked) Global Version
$414.46    Oneplus 6t dual sim 256gb midnight black black 6.41" unlocked ram 8gb a6010
$399.00    SPONSORED OnePlus 6T A6013, Clean ESN, Unknown Carrier, Coffee
$399.95    SPONSORED OnePlus 6T 4G LTE 6.41" 128GB ROM 8GB RAM A6013 (T-Mobile)  - Mirror Black
$349.99    ONEPLUS 6T - BLACK - 128GB - (T-MOBILE) ~3841
$449.00    OnePlus 6t McLaren Edition Unlocked 256GB 10GB RAM Original Accessories Included
$434.83    OnePlus 6T 8 GB RAM 128 GB UK SIM-Free Smartphone (ML3658)
$265.74    Oneplus 6t
$241.58    New Listing OnePlus 6T 8GB 128GB UNLOCKED
$419.95    NEW IN BOX Oneplus 6T  128GB  Mirror Black (T-mobile/Metro PCS/Mint) 8gb RAM
$435.99    OnePlus 6T - 128GB 6GB RAM - Mirror Black (Unlocked) Global Version

... and so on.

Python: strip html from text data

from BeautifulSoup import BeautifulSoup

def removeTags(html, *tags):
    """Parse `html` and replace every element whose name is in `tags` with "".

    Returns the resulting BeautifulSoup object rather than a string.
    """
    soup = BeautifulSoup(html)
    for tag_name in tags:
        # Each matching element is swapped for an empty string, which removes
        # the element together with everything nested inside it.
        for element in soup.findAll(tag_name):
            element.replaceWith("")

    return soup


# Sample document: a <title> plus two <p> paragraphs that each contain a <b> element.
testhtml = '''
<html>
    <head>
        <title>Page title</title>
    </head>
    <body>text here<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
        <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
    </body>
</html>'''

# Print the soup returned after every <b> and <p> element has been replaced with "".
print removeTags(testhtml, 'b', 'p')

How to remove HTML tags from python strings?

With only one line of markup, using a dedicated parser is kind of overkill. However, for larger sets of data, using a parser like BeautifulSoup is the way to go. See an example below.

from bs4 import BeautifulSoup as bsoup
import re

markup = """
<h2 class="debateHeaderProp">This house believes that society benefits when we share personal information online.</h2>
<span class="debateFormat">Oregon-Oxford, Cross Examination</span>
<div class="debateAffirmSide">On the affirmative: Foo Debate Club</div>
<div class="debateOpposeSide">On the opposition: Bar Debate Club</div>
"""
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and emits a warning), so results can vary by machine.
soup = bsoup(markup, "html.parser")

# Explicitly define the tag and class.
motion = soup.find("h2", class_="debateHeaderProp").get_text()
# Or just use the class.
d_format = soup.find(class_="debateFormat").get_text()
# And even use regex for more power.
teams = [t.get_text() for t in soup.find_all("div", class_=re.compile(r".*debate.*Side.*"))]

# print() calls replace the Python 2 print statements, which are a
# SyntaxError on Python 3; the output is unchanged.
print("Our Debate for Today")
print("Motion:", motion)
print("Format:", d_format)
print(teams[0])
print(teams[1])

# Prints the following:
# Our Debate for Today
# Motion: This house believes that society benefits when we share personal information online.
# Format: Oregon-Oxford, Cross Examination
# On the affirmative: Foo Debate Club
# On the opposition: Bar Debate Club

Another option is to use an XML/HTML parser such as lxml.

Delete HTML Tags from string Python

Delete all tags:

import re

# Lazily match everything from a '<' up to the nearest following '>'.
tag_pattern = re.compile("<.*?>")

text = "This is the description of <img alt='' height='1' src='http://linkOfARandomImage.of/the/feed' width='1' /> the <br> text"
text = tag_pattern.sub("", text)
# text is now "This is the description of  the  text"

Delete unnecessary whitespace:

# \s (not \w) is the whitespace class; "+" collapses each run of whitespace
# into one space without matching the empty string between characters.
text = re.sub(r"\s+", " ", text)

EDIT:

# \s (not \w) is the whitespace class: each run of whitespace becomes one space.
text = re.sub(r"\s+", " ", text)

Removing html tags and entities from string in python

Try this regular expression

(\&lt\;).*?(\&gt\;)

Strip HTML from Strings in Python