Web Scraping Dynamic Content with Python

Scraping data from a dynamic web table

The data is loaded dynamically from a different URL. You can use this example to see how to load it with just requests/BeautifulSoup:

import json
import requests
from bs4 import BeautifulSoup

data = {
    "sort": "Einfahrtzeit-desc",
    "page": "1",
    "pageSize": "10",
    "group": "",
    "filter": "",
    "__RequestVerificationToken": "",
    "locid": "1",
}

headers = {"X-Requested-With": "XMLHttpRequest"}

url = "https://www.laerm-monitoring.de/zug/"
api_url = "https://www.laerm-monitoring.de/zug/train_read"

with requests.Session() as s:
    # grab the anti-forgery token from the page, then POST the form data
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    data["__RequestVerificationToken"] = soup.select_one(
        '[name="__RequestVerificationToken"]'
    )["value"]
    data = s.post(api_url, data=data, headers=headers).json()

# pretty print the data
print(json.dumps(data, indent=4))

Prints:

{
    "Data": [
        {
            "id": 2536954,
            "Einfahrtzeit": "2021-04-24T20:56:26.1703+02:00",
            "Gleis": 1,
            "Richtung": "Kiel",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 7.3,
            "Zugl\u00e4nge": 181.85884,
            "Geschwindigkeit": 115.57797,
            "Maximalpegel": 88.611084,
            "Vorbeifahrtpegel": 85.421326,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536944,
            "Einfahrtzeit": "2021-04-24T20:52:25.1703+02:00",
            "Gleis": 2,
            "Richtung": "Hamburg",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 6.3,
            "Zugl\u00e4nge": 211.10226,
            "Geschwindigkeit": 152.60104,
            "Maximalpegel": 91.81743,
            "Vorbeifahrtpegel": 87.95224,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536929,
            "Einfahrtzeit": "2021-04-24T20:44:31.4703+02:00",
            "Gleis": 1,
            "Richtung": "Kiel",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 5.3,
            "Zugl\u00e4nge": 104.69964,
            "Geschwindigkeit": 110.10052,
            "Maximalpegel": 82.100815,
            "Vorbeifahrtpegel": 79.98168,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536924,
            "Einfahrtzeit": "2021-04-24T20:42:30.3703+02:00",
            "Gleis": 1,
            "Richtung": "Kiel",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 2.9,
            "Zugl\u00e4nge": 49.305683,
            "Geschwindigkeit": 125.18,
            "Maximalpegel": 98.63289,
            "Vorbeifahrtpegel": 97.25019,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536925,
            "Einfahrtzeit": "2021-04-24T20:42:20.5703+02:00",
            "Gleis": 2,
            "Richtung": "Hamburg",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 0.0,
            "Zugl\u00e4nge": 0.0,
            "Geschwindigkeit": 0.0,
            "Maximalpegel": 0.0,
            "Vorbeifahrtpegel": 0.0,
            "G\u00fcltig": "-"
        },
        {
            "id": 2536911,
            "Einfahrtzeit": "2021-04-24T20:35:19.3703+02:00",
            "Gleis": 1,
            "Richtung": "Kiel",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 4.1,
            "Zugl\u00e4nge": 103.97647,
            "Geschwindigkeit": 132.2034,
            "Maximalpegel": 87.111984,
            "Vorbeifahrtpegel": 85.6776,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536907,
            "Einfahrtzeit": "2021-04-24T20:33:31.2703+02:00",
            "Gleis": 2,
            "Richtung": "Hamburg",
            "Category": "GZ",
            "Zugkategorie": "G\u00fcterzug",
            "Vorbeifahrtdauer": 23.8,
            "Zugl\u00e4nge": 583.19586,
            "Geschwindigkeit": 95.63598,
            "Maximalpegel": 88.02967,
            "Vorbeifahrtpegel": 85.02115,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536890,
            "Einfahrtzeit": "2021-04-24T20:25:36.1703+02:00",
            "Gleis": 2,
            "Richtung": "Hamburg",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 3.5,
            "Zugl\u00e4nge": 104.63446,
            "Geschwindigkeit": 160.47487,
            "Maximalpegel": 88.60612,
            "Vorbeifahrtpegel": 86.46721,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536882,
            "Einfahrtzeit": "2021-04-24T20:22:05.8703+02:00",
            "Gleis": 2,
            "Richtung": "Hamburg",
            "Category": "GZ",
            "Zugkategorie": "G\u00fcterzug",
            "Vorbeifahrtdauer": 26.6,
            "Zugl\u00e4nge": 653.52515,
            "Geschwindigkeit": 94.59859,
            "Maximalpegel": 91.9396,
            "Vorbeifahrtpegel": 85.50632,
            "G\u00fcltig": "OK"
        },
        {
            "id": 2536869,
            "Einfahrtzeit": "2021-04-24T20:16:24.3703+02:00",
            "Gleis": 1,
            "Richtung": "Kiel",
            "Category": "PZ",
            "Zugkategorie": "Personenzug",
            "Vorbeifahrtdauer": 3.3,
            "Zugl\u00e4nge": 87.8222,
            "Geschwindigkeit": 160.01207,
            "Maximalpegel": 91.3928,
            "Vorbeifahrtpegel": 89.54336,
            "G\u00fcltig": "OK"
        }
    ],
    "Total": 8657,
    "AggregateResults": null,
    "Errors": null
}
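
The response also reports "Total": 8657, and the request already carries page and pageSize form fields, so more rows can be pulled by re-posting the same form with a bumped page value. A minimal sketch of that idea, assuming the verification token stays valid for later requests in the same session (not confirmed here):

import requests
from bs4 import BeautifulSoup

form = {
    "sort": "Einfahrtzeit-desc",
    "page": "1",
    "pageSize": "10",
    "group": "",
    "filter": "",
    "__RequestVerificationToken": "",
    "locid": "1",
}
headers = {"X-Requested-With": "XMLHttpRequest"}
url = "https://www.laerm-monitoring.de/zug/"
api_url = "https://www.laerm-monitoring.de/zug/train_read"

all_rows = []
with requests.Session() as s:
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    form["__RequestVerificationToken"] = soup.select_one(
        '[name="__RequestVerificationToken"]'
    )["value"]
    # re-use the session and token, bumping the "page" field each time
    for page in range(1, 4):
        form["page"] = str(page)
        payload = s.post(api_url, data=form, headers=headers).json()
        all_rows.extend(payload["Data"])

print(len(all_rows))  # 30 rows if each page returns pageSize=10 items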

How to scrape dynamic content with beautifulsoup?

Set headers on your request and store the information in a more structured way.

Example

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
URLs = ['https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2']

data = []
for url in URLs:
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")
    data.append({
        'name': soup.find('span', class_='main-detail__name').get_text(strip=True),
        'brand': soup.find('span', class_='main-detail__marque').get_text(strip=True),
        'ref': soup.find('span', class_='main-detail__ref').get_text(strip=True),
        # the price is stored in the "content" attribute of the itemprop span
        'price': soup.find('span', {'itemprop': 'price'}).get('content'),
        'url': url
    })

print(pd.DataFrame(data))

Output

                                             name     brand                        ref price                                                                                        url
0  Montre The Longines Legend Diver L3.774.4.30.2  Longines  Référence : L3.774.4.30.2  2240  https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2
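
On pages where one of these spans is missing, soup.find() returns None and the chained .get_text() call raises AttributeError. A small defensive variant (the text_or_none helper name is mine, not from the original answer):

def text_or_none(soup, *args, **kwargs):
    # soup.find() returns None when nothing matches, so guard before
    # calling get_text()
    tag = soup.find(*args, **kwargs)
    return tag.get_text(strip=True) if tag is not None else None

# usage: name = text_or_none(soup, 'span', class_='main-detail__name')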

Web-scraping dynamic website with user input using Selenium and Python

To start with, you need to send a character sequence to the Swimmer field.

Because the elements are within an iframe, you have to do the following (a consolidated sketch follows the list):

  • Induce WebDriverWait for the desired frame to be available and switch to it.

  • Induce WebDriverWait for the desired element to be clickable.

  • You can use either of the following Locator Strategies:

    • Using CSS_SELECTOR:

      driver.get("https://www.swimming.org.nz/results.html")
      WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe#iframe")))
      WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id^='x-MS_FIELD_MEMBER']"))).send_keys("Joseph Zhang")
    • Using XPATH:

      driver.get("https://www.swimming.org.nz/results.html")
      WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH,"//iframe[@id='iframe']")))
      WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[starts-with(@id, 'x-MS_FIELD_MEMBER')]"))).send_keys("Joseph Zhang")
  • Note: You have to add the following imports:

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
  • Browser Snapshot: [screenshot of the populated Swimmer field omitted]
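
For reference, here is a consolidated, self-contained sketch of the steps above (the driver setup is my assumption; the original answer only shows the waits):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumed setup; any browser/driver works
driver.get("https://www.swimming.org.nz/results.html")
# switch into the iframe before touching anything inside it
WebDriverWait(driver, 20).until(
    EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#iframe"))
)
# wait for the Swimmer field to be clickable, then type the name
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "input[id^='x-MS_FIELD_MEMBER']"))
).send_keys("Joseph Zhang")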



References

You can find a couple of relevant discussions in:

  • Switch to an iframe through Selenium and python
  • selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element while trying to click Next button with selenium
  • selenium in python : NoSuchElementException: Message: no such element: Unable to locate element

Python Web Scraping Dynamic Content

Selenium is suitable for a job like this:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get('https://kith.com/pages/search-results-page?q=nike&tab=products&sort_by=created')

item_titles = driver.find_elements_by_class_name("snize-title")

print(item_titles[0].text)
# NIKE WMNS SHOX TL - NOVA WHITE / TEAM ORANGE / SPRUCE AURA

Edit:

If you want to capture all item info, the div elements with the snize-overhidden class are what you want to capture. You can then iterate through them and their sub-elements, as sketched below.
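
A minimal sketch of that loop, keeping the older find_element_* style used above (the snize-price class name is a guess; only snize-title is confirmed by the answer):

items = driver.find_elements_by_class_name("snize-overhidden")
for item in items:
    title = item.find_element_by_class_name("snize-title").text
    # "snize-price" is an assumed class name for the price element
    price = item.find_element_by_class_name("snize-price").text
    print(title, price)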

Dynamic content from table - can't scrape with Selenium

To extract the data from the Transfers table of the Token Natluk Community - polygonscan webpage, you need to induce WebDriverWait for visibility_of_element_located() and then read the table into a pandas DataFrame, using the following Locator Strategy:

Code Block:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

options = Options()
options.add_argument("start-maximized")
# repeated add_experimental_option() calls overwrite the same key,
# so both switches are excluded in a single call
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
s = Service('C:\\BrowserDrivers\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get("https://polygonscan.com/token/0x64a795562b02830ea4e43992e761c96d208fc58d")
# accept the cookie banner, then switch into the iframe holding the table
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#btnCookie"))).click()
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#tokentxnsiframe")))
data = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table.table.table-md-text-normal"))).get_attribute("outerHTML")
df = pd.read_html(data)
print(df)

Console Output:

[                                             Txn Hash               Method  ...       Quantity Unnamed: 7
0 0x75411962e2e6527f5a032198816cafe4e1a475a4ebdf... Add Liquidity ET... ... 37929.272725 NaN
1 0x27f61026e9df4c0c14c6259f624917a12ce7f6c20eb7... Swap Exact ETH F... ... 50814.040553 NaN
2 0xd9ee0ed46ef8ce891e81787b25176530a30df6d2b98e... Add Liquidity ET... ... 55288.744543 NaN
3 0x3f3982a38ff3f5c5890eff12a9d3f7061fea88942d96... Add Liquidity ET... ... 978.219682 NaN
4 0x503fad1b044b98c58700d185eb8cb9c16a483fd748d7... Unstake ... 8884.911763 NaN
5 0x503fad1b044b98c58700d185eb8cb9c16a483fd748d7... Unstake ... 9026.302437 NaN
6 0xdc75ad4e37e232f8536305ef8c628fd9391c1f2c5d25... Transfer ... 114000.000000 NaN
7 0x218ae4183e632c47edf581705871a3f16dc32cc513ef... Add Liquidity ET... ... 45125.111655 NaN
8 0x9fbe017ebf37aea501050a68c8ab1d78734b576b5585... Swap Exact ETH F... ... 2563.443420 NaN
9 0xd30adcf551285d4b72495d55cc59ffaed82a224b138c... Claim ... 14923.359293 NaN
10 0x65c733e468df90eaed701bc4f1e21a4090924b1225c1... Swap Exact ETH F... ... 33055.752836 NaN
11 0x82c215000f9807a3a40fe3ef3e461ceac007513b49ff... Swap Exact ETH F... ... 6483.182959 NaN
12 0x6155da0b5b206a8ffffa300a5d75e23fa3833b9b079b... Swap Exact ETH F... ... 13005.174783 NaN
13 0x3435579c22e9fc42f6921229449c8cb18d133a207a66... Transfer ... 47500.000000 NaN
14 0x7a57be9b538e0c73df4b608a8323c2f678ba6136f9a9... Swap Exact ETH F... ... 19605.381370 NaN
15 0x8fe7787039c4a382f6420c78b48933dd59b0843c6ab4... Transfer ... 237500.000000 NaN
16 0x0e55aa0740f6c964db13efe52e1af58a35497f9a292d... Swap Exact ETH F... ... 6561.223602 NaN
17 0x9897d4a2f56a49a935a36183eee3dc846fc19610812c... Swap Exact ETH F... ... 19762.821100 NaN
18 0xf9c7d67bf679624640f20d69636f58f634bf66e7daed... Add Liquidity ET... ... 74224.394200 NaN
19 0x89b490947952e37e10a3619f8fbcb5a80b15f0e2f4aa... Add Liquidity ET... ... 14589.910231 NaN
20 0xc94e56bb3be04e610c6a89e934fb84bba58922f6641a... Transfer ... 142500.000000 NaN
21 0x68a5c142bbfa86b0aa4f469eb17f58e26b5251bd83e9... Swap Exact ETH F... ... 3307.607665 NaN
22 0x2597e521fd0a7e4edffe66007129c93d1dc22485b86a... Swap Exact ETH F... ... 66868.030051 NaN
23 0x14cc91039f59fd9143bc94132b9f053970947b79a16f... Swap Exact Token... ... 42683.069577 NaN
24 0xa5ab4179af827c6883e52cbc010509b701795a8136a0... Swap Exact ETH F... ... 3423.618394 NaN

[25 rows x 8 columns]]
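
Note that pd.read_html() returns a list of DataFrames, which is why the output above is wrapped in brackets. Continuing from the Code Block above, a small clean-up sketch (dropping Unnamed: 7 assumes that trailing column is always empty, as it is in this output):

transfers = df[0]  # the Transfers table is the only DataFrame in the list
transfers = transfers.drop(columns=["Unnamed: 7"])  # empty trailing column
print(transfers[["Txn Hash", "Method", "Quantity"]].head())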

