How to Parse a Website Using Selenium and BeautifulSoup in Python

How can I parse a website using Selenium and BeautifulSoup in Python?

Assuming you are on the page you want to parse, Selenium stores the source HTML in the driver's page_source attribute. You would then load the page_source into BeautifulSoup as follows:

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('http://news.ycombinator.com')

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')  # specify a parser explicitly to avoid bs4's "no parser" warning

for tag in soup.find_all('title'):
    print(tag.text)

Hacker News

Scraping a Dynamic Website using Selenium or Beautiful Soup

You can use this code to get the data you need:

import requests

url = "https://www.utsc.utoronto.ca/regoffice/timetable/view/api.php"

# for winter session
payload = "coursecode=&sessions%5B%5D=20219&instructor=&courseTitle="

headers = {
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
}

response = requests.post(url, headers=headers, data=payload)
print(response.text)
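
If the endpoint returns JSON (I haven't verified this particular API's response format, so treat it as an assumption), you can parse the response directly instead of printing the raw text:

# Assumption: the API responds with JSON
try:
    data = response.json()
    print(type(data))
except ValueError:  # not JSON after all; inspect the raw body instead
    print(response.text[:500])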

login to page with Selenium works - parsing with BS4 works - but not the combination of both

EDIT: In both versions I added saving to a CSV file.


If you have Selenium and requests, then there are three possibilities:

  • use Selenium to log in and to get pages.
  • use requests.Session to log in and to get pages.
  • use Selenium to log in, get the session cookies from Selenium, and use them in requests (a minimal sketch of this appears after the second example below).

Using Selenium to log in and to get pages is much simpler, but it works slower than requests.

You only need to use:

  • browser.get(url) instead of r = session.get(post_url)
  • BeautifulSoup(browser.page_source, ...) instead of BeautifulSoup(r.text, ...)


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import csv

#--| Setup
options = Options()
#options.add_argument("--headless")
#options.add_argument("--window-size=1980,1020")
#options.add_argument('--disable-gpu')
browser = webdriver.Chrome(executable_path=r'C:\chrome\chromedriver.exe', options=options)
#browser = webdriver.Firefox()

# --- login ---

browser.get("https://login.wordpress.org/?locale=en_US")
time.sleep(2)

user_name = browser.find_element_by_css_selector('#user_login')
user_name.send_keys("my_login")
password = browser.find_element_by_css_selector('#user_pass')
password.send_keys("my_password")
#time.sleep(5)
submit = browser.find_elements_by_css_selector('#wp-submit')[0]
submit.click()

# Example: send page source to BeautifulSoup or Selenium for parsing
soup = BeautifulSoup(browser.page_source, 'lxml')
use_bs4 = soup.find('title')
print(use_bs4.text)
#print('*' * 25)
#use_sel = browser.find_elements_by_css_selector('div > div._1vC4OE')
#print(use_sel[0].text)

# --- pages ---

data = []

url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/{}/'

for page in range(1, 3):
    print('\n--- PAGE:', page, '---\n')

    # read page with list of posts
    browser.get(url.format(page))
    soup = BeautifulSoup(browser.page_source, 'html.parser')  # or 'lxml'

    all_uls = soup.find('li', class_="bbp-body").find_all('ul')

    for number, ul in enumerate(all_uls, 1):
        print('\n--- post:', number, '---\n')

        a = ul.find('a')
        if a:
            post_url = a['href']
            post_title = a.text

            print('href:', post_url)
            print('text:', post_title)
            print('---------')

            # read page with post content
            browser.get(post_url)
            sub_soup = BeautifulSoup(browser.page_source, 'html.parser')

            post_content = sub_soup.find(class_='bbp-topic-content').get_text(strip=True, separator='\n')
            print(post_content)

            # keep on list as dictionary
            data.append({
                'href': post_url,
                'text': post_title,
                'content': post_content,
            })

# --- save ---

with open("wp-forum-conversations.csv", "w") as f:
writer = csv.DictWriter(f, ["text", "href", "content"])
writer.writeheader()
writer.writerows(data) # all rows at once

EDIT:

requests works much faster, but it needs more work with DevTools in Firefox/Chrome to see all the fields in the form and what other values it sends to the server. You also need to see where it redirects when the login is correct. BTW: don't forget to turn off JavaScript before using DevTools, because requests doesn't run JavaScript and the page may send different values in the form. (And this page really does send different fields.)

It needs a full User-Agent to work correctly.

First I load the login page and copy all the values from the <input> fields, to send them along with the login and password.

After logging in, I check whether it was redirected to a different page, to confirm that the login succeeded. You can also check whether the page displays your name.

import requests
from bs4 import BeautifulSoup
import csv

s = requests.Session()
s.headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'  # it needs a full user-agent
})

# --- get page with login form ---

r = s.get("https://login.wordpress.org/?locale=en_US")
soup = BeautifulSoup(r.text, 'html.parser')

# get all fields in form

payload = {}

for field in soup.find_all('input'):
    name = field['name']
    value = field['value']
    payload[name] = value
    print(name, '=', value)

# --- login ---

payload['log'] = 'my_login'
payload['pwd'] = 'my_password'

r = s.post('https://login.wordpress.org/wp-login.php', data=payload)
print('redirected to:', r.url)

# --- check if logged in ---

# check if logged in - check if redirected to different page
if r.url.startswith('https://login.wordpress.org/wp-login.php'):
    print('Problem to login')
    exit()

# check if logged in - check displayed name
url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/1/'
r = s.get(url)

soup = BeautifulSoup(r.text, 'html.parser')
name = soup.find('span', {'class': 'display-name'})
if not name:
    print('Problem to login')
    exit()
else:
    print('name:', name.text)

# --- pages ---

data = []

url = 'https://wordpress.org/support/plugin/advanced-gutenberg/page/{}/'

for page in range(1, 3):
    print('\n--- PAGE:', page, '---\n')

    # read page with list of posts
    r = s.get(url.format(page))
    soup = BeautifulSoup(r.text, 'html.parser')  # or 'lxml'

    all_uls = soup.find('li', class_="bbp-body").find_all('ul')

    for number, ul in enumerate(all_uls, 1):
        print('\n--- post:', number, '---\n')

        a = ul.find('a')
        if a:
            post_url = a['href']
            post_title = a.text

            print('href:', post_url)
            print('text:', post_title)
            print('---------')

            # read page with post content
            r = s.get(post_url)
            sub_soup = BeautifulSoup(r.text, 'html.parser')

            post_content = sub_soup.find(class_='bbp-topic-content').get_text(strip=True, separator='\n')
            print(post_content)

            # keep on list as dictionary
            data.append({
                'href': post_url,
                'text': post_title,
                'content': post_content,
            })

# --- save ---

with open("wp-forum-conversations.csv", "w") as f:
writer = csv.DictWriter(f, ["text", "href", "content"])
writer.writeheader()
writer.writerows(data) # all rows at once
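
For the third possibility (log in with Selenium, then continue with requests), the usual approach is to copy the session cookies from the browser into a requests.Session. This is only a minimal sketch, assuming browser is the already-logged-in Selenium driver from the first example:

import requests

# Assumption: `browser` is a Selenium driver that is already logged in
s = requests.Session()
s.headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'
})

# copy every cookie from the Selenium session into the requests session
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))

# from here on, use s.get(...) instead of browser.get(...)
r = s.get('https://wordpress.org/support/plugin/advanced-gutenberg/page/1/')
print(r.status_code)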

How to use BeautifulSoup after using Selenium to navigate to page

For some strange reason browser.implicitly_wait() doesn't work here, but time.sleep() works well. This makes sense: implicitly_wait() only tells the driver how long to poll when a find_element call can't locate its target; it does nothing before reading page_source. I just changed browser.implicitly_wait(30) to time.sleep(30) and it worked perfectly.

I used the Chrome driver. Don't forget to put chromedriver in the working directory to avoid a "driver not found" error.

import time
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.southwest.com/flight/"
browser = webdriver.Chrome()
browser.get(url)

departure = browser.find_element_by_id("originAirport_displayed")
destination = browser.find_element_by_id("destinationAirport_displayed")

departure.send_keys("Chicago (Midway), IL - MDW")
destination.send_keys("New Orleans, LA - MSY")

button = browser.find_element_by_id("submitButton")
button.click()

time.sleep(30)
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())

browser.save_screenshot(browser.title + ".png")  # save_screenshot expects a .png extension

browser.close()
browser.quit()
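
Instead of the fixed time.sleep(30) above, a more robust option is Selenium's explicit wait, which blocks only until a specific element appears. A minimal sketch; the element ID "searchResults" is a placeholder, so substitute an element you know appears on the results page:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 30 seconds for an element that only exists on the results page
WebDriverWait(browser, 30).until(
    EC.presence_of_element_located((By.ID, "searchResults"))  # placeholder ID
)
html = browser.page_source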

EDIT:
The lxml parser is faster than html.parser. The official BeautifulSoup documentation recommends using the lxml parser.
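
Note that lxml is a third-party package and must be installed separately; if it is missing, BeautifulSoup raises a FeatureNotFound error:

# pip install lxml
soup = BeautifulSoup(html, "lxml")         # fast third-party parser
soup = BeautifulSoup(html, "html.parser")  # slower, but ships with Python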

web scraping using selenium and beautifulsoup

You haven't included the driver setup:

from selenium import webdriver 
driver = webdriver.Chrome(executable_path=r'C:\Users\HP\data\chromedriver.exe')

Try

data = soup.select('div.plp-product__name')

Or alternatively

data = soup.find_all("div",class_="plp-product__name")

Note that the correct method is find_all, not findAll, which is deprecated in bs4.
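
Putting the pieces together (the URL below is a placeholder, since the question doesn't show which page is being scraped):

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'C:\Users\HP\data\chromedriver.exe')
driver.get('https://example.com/products')  # placeholder URL

# hand the rendered page over to BeautifulSoup and extract the product names
soup = BeautifulSoup(driver.page_source, 'html.parser')
for item in soup.select('div.plp-product__name'):
    print(item.get_text(strip=True))

driver.quit()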

Navigate with Selenium and scrape with BeautifulSoup in Python

You can use only requests and BeautifulSoup to scrape, without Selenium. It will be much faster and will consume far fewer resources:

import json
import requests
from bs4 import BeautifulSoup

# Get 1000 results
params = {"$filter": "TemplateName eq 'Application Article'", "$orderby": "ArticleDate desc", "$top": "1000",
"$inlinecount": "allpages", }
response = requests.get("https://www.cst.com/odata/Articles", params=params).json()

# iterate over the 1000 results
articles = response["value"]
for article in articles:
    article_json = {}
    article_content = []

    # title of article
    article_title = article["Title"]
    # article url
    article_url = str(article["Url"]).split("|")[1]
    print(article_title)

    # request article page and parse it
    article_page = requests.get(article_url).text
    page = BeautifulSoup(article_page, "html.parser")

    # get header
    header = page.select_one("h1.head--bordered").text
    article_json["Title"] = str(header).strip()
    # get body content with image links and descriptions
    content = page.select("section.content p, section.content img, section.content span.imageDescription, "
                          "section.content em")
    # collect content into json format
    for x in content:
        if x.name == "img":
            article_content.append("https://cst.com/solutions/article/" + x.attrs["src"])
        else:
            article_content.append(x.text)

    article_json["Content"] = article_content

    # write each article to its own json file
    with open(f"{article_json['Title']}.json", 'w') as to_json_file:
        to_json_file.write(json.dumps(article_json))

print("the end")

Beautiful Soup and Selenium cannot scrape website contents

You can simply do:

source = driver.page_source

to get the page source using Selenium, and convert that source into BeautifulSoup as usual:

source = BeautifulSoup(source,"lxml")

Complete code with some improvements:

from selenium import webdriver
from datetime import datetime
import time
from bs4 import BeautifulSoup

now = datetime.today()
format_date= now.strftime("%Y/%m/%d")

driver = webdriver.<>(executable_path=r'<>')

url = "https://odb.org/" + format_date
driver.get(url)

time.sleep(10)  # to load the page completely

content=BeautifulSoup(driver.page_source,"lxml")

print(content)

# Title :
print(content.find("h1",class_="devo-title").text)

# Content :
print(content.find("article",class_="content").text)

