Scraping a Dynamic Ecommerce Page with Infinite Scroll

Scraping a dynamic ecommerce page with infinite scroll

As @nrussell suggested, you can use RSelenium to programmatically scroll down the page before getting the source code.

You could, for example, do:

library(RSelenium)
library(rvest)

# start RSelenium (newer RSelenium versions replace these helpers with rsDriver())
checkForServer()
startServer()
remDr <- remoteDriver()
remDr$open()

# navigate to your page
remDr$navigate("http://www.linio.com.co/tecnologia/celulares-telefonia-gps/")

# scroll down 5 times, waiting for the page to load each time
for (i in 1:5) {
  remDr$executeScript(paste("scroll(0,", i * 10000, ");"))
  Sys.sleep(3)
}

# get the page html
page_source <- remDr$getPageSource()

# parse it (read_html() is the current name for rvest's old html())
read_html(page_source[[1]]) %>%
  html_nodes(".product-itm-price-new") %>%
  html_text()

Scraping data from a website with Infinite Scroll?

It seems to me that in order to get only September, you first want to grab just the section for September:

section = soup.find('section', {'class': 'Y2019-M9 calendar-sections'})

Then, once you have the September section, get all the titles, which are in <a> tags, like this:

for title in section.find_all('a', {'class': 'calendar-item-title subpage-trigg'}):
    titles.append(title.text)

Please note that none of the above has been tested.
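
Putting those two pieces together, a minimal, untested sketch might look like this; fetching the page with requests is my assumption, and the URL is the September 2019 games listing discussed below:

import requests
from bs4 import BeautifulSoup

# fetch the September 2019 games calendar page (assumed entry point)
page = requests.get('https://www.releases.com/l/Games/2019/9/')
soup = BeautifulSoup(page.content, 'html.parser')

titles = []
section = soup.find('section', {'class': 'Y2019-M9 calendar-sections'})
if section:
    for title in section.find_all('a', {'class': 'calendar-item-title subpage-trigg'}):
        titles.append(title.text)
print(titles)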

UPDATE:
The problem is that every time you load the page, it gives you only the very first section, which contains just 24 items; to reach the rest you have to scroll down (infinite scroll).
If you open the browser's developer tools, select Network and then XHR, you will notice that every time you scroll and load the next "page" there is a request with a URL similar to this:

https://www.releases.com/calendar/nextAfter?blockIndex=139&itemIndex=23&category=Games&regionId=us

My guess is that blockIndex identifies the month and itemIndex identifies each loaded page. If you are looking only for the month of September, blockIndex will always be 139 in that request; the challenge is to get the next itemIndex for the next page so you can construct your next request.
The next itemIndex will always be the last itemIndex of the previous request.

I did write a script that does what you want using only requests and BeautifulSoup. Use it at your own discretion; there are some constants that could be extracted dynamically, but I think it will give you a head start:

import json

import requests
from bs4 import BeautifulSoup

DATE_CODE = 'Y2019-M9'
LAST_ITEM_FIRST_PAGE = f'calendar-item col-xs-6 to-append first-item calendar-last-item {DATE_CODE}-None'
LAST_ITEM_PAGES = f'calendar-item col-xs-6 to-append calendar-last-item {DATE_CODE}-None'
INITIAL_LINK = 'https://www.releases.com/l/Games/2019/9/'
BLOCK = 139
titles = []


def get_next_page_link(div: BeautifulSoup):
    index = div['item-index']
    return f'https://www.releases.com/calendar/nextAfter?blockIndex={BLOCK}&itemIndex={index}&category=Games&regionId=us'


def get_content_from_requests(page_link):
    headers = requests.utils.default_headers()
    headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    req = requests.get(page_link, headers=headers)
    return BeautifulSoup(req.content, 'html.parser')


def scroll_pages(link: str):
    print(link)
    page = get_content_from_requests(link)
    for div in page.findAll('div', {'date-code': DATE_CODE}):
        item = div.find('a', {'class': 'calendar-item-title subpage-trigg'})
        if item:
            # print(f'TITLE: {item.getText()}')
            titles.append(item.getText())
    last_index_div = page.find('div', {'class': LAST_ITEM_FIRST_PAGE})
    if not last_index_div:
        last_index_div = page.find('div', {'class': LAST_ITEM_PAGES})
    if last_index_div:
        scroll_pages(get_next_page_link(last_index_div))
    else:
        print(f'Found: {len(titles)} Titles')
        print('No more pages to scroll, finishing...')


scroll_pages(INITIAL_LINK)
with open('titles.json', 'w') as outfile:
    json.dump(titles, outfile)

If your goal is to use Selenium, I think the same principle applies, unless Selenium can scroll the page for you while it loads.
Replacing INITIAL_LINK, DATE_CODE and BLOCK accordingly will get you other months as well.
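
If you do go the Selenium route, a minimal sketch of that scroll-while-loading idea could look like the following; the selectors and date code come from the answer above, while the scrolling loop and its stop condition are my own untested assumption:

import time

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.releases.com/l/Games/2019/9/')

# keep scrolling to the bottom until the page height stops growing
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the next batch of items time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# parse the fully loaded page with the same selectors as the script above
soup = BeautifulSoup(driver.page_source, 'html.parser')
titles = []
for div in soup.find_all('div', {'date-code': 'Y2019-M9'}):
    item = div.find('a', {'class': 'calendar-item-title subpage-trigg'})
    if item:
        titles.append(item.get_text())
driver.quit()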

Scraping an Infinite Scroll Page

What you have to do is:

  1. find the house button you need
  2. scroll down to this button
  3. click the button, switch tab, get the data, go back to the main tab
  4. go to step 1.

Here's the code to do this (except for extracting the data; there's a short sketch for that part after the code):

from selenium import webdriver
import time
import numpy as np


url = 'https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil'
xpath_house_buttons = "//div[@class='sc-1qwl1yl-0 igVsBW']"
x_path_title = "//h1[@class='sc-1q9n36n-0 ghXeyc sc-bdVaJa hgGleC']"
x_path_address = "//p[@data-testid='listing-address-subtitle']"
num_houses = 40
houses = []


def scroll_to_house_button(driver, num_btn) -> bool:
    """
    returns True if it could scroll to the house button
    """
    try:
        house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)
        driver.execute_script("arguments[0].scrollIntoView();", house_buttons[num_btn])
        return True
    except:
        return False


def switch_to_house_tab(driver) -> bool:
    """
    returns True if switching tab was successful
    """
    try:
        driver.switch_to.window(driver.window_handles[1])
        return True
    except:
        return False


def switch_to_main_tab(driver) -> bool:
    """
    returns True if switching tab was successful
    """
    try:
        driver.switch_to.window(driver.window_handles[0])
        return True
    except:
        return False


def get_house_button_index(house_buttons, houses_scraped_text, index):
    """
    returns house_button's index in house_buttons
    """
    # at the beginning, the house to scrape is given by its index
    if len(houses_scraped_text) < 5:
        return index
    # afterwards, we find it by comparing the buttons' content
    else:
        for i in reversed(range(len(house_buttons))):
            if (house_buttons[i].text == houses_scraped_text[-1]) and (house_buttons[i - 1].text == houses_scraped_text[-2]):
                return i + 1


# Initializing the webdriver
driver = webdriver.Chrome()
driver.set_window_size(1600, 1024)
driver.get(url)


# get data
i = 0
houses_scraped_text = []
while len(houses) < num_houses:
    house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)

    # as house_buttons never exceeds a length of 30,
    # we need a smart way of getting the next one
    index_btn = get_house_button_index(house_buttons, houses_scraped_text, i)
    house_button = house_buttons[index_btn]
    houses_scraped_text.append(house_button.text)

    # can't scroll to house button => wait 1 sec
    while not scroll_to_house_button(driver, num_btn=index_btn):
        time.sleep(1)
    print("scroll to house -- house", i + 1)

    # filter out the promotional tiles that are not houses
    if ('Sem tempo pra procurar' in house_button.text) or ('Ainda não encontrou seu lar' in house_button.text):
        print("house filtered -- house", i + 1, "\n")
        i += 1  # you have to increment here to not loop over the same house forever
        continue

    # new house tab not open yet => wait 1 sec
    while len(driver.window_handles) != 2:  # check number of open tabs
        house_button.click()
        time.sleep(1)
    print("new house tab opened -- house", i + 1)

    # can't switch yet => wait 1 sec
    while not switch_to_house_tab(driver):
        time.sleep(1)
    print("switched to new house tab -- house", i + 1)

    ##################
    # LOAD DATA HERE #
    ##################
    print("data loaded -- house", i + 1)

    # close tab
    driver.close()
    # can't switch back to main => wait 1 sec
    while not switch_to_main_tab(driver):
        time.sleep(1)
    print("house tab closed & switched back to main -- house", i + 1)

    print(len(houses), "houses scraped\n")
    i += 1
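
For the LOAD DATA HERE part, a minimal sketch could reuse the two XPaths defined at the top of the script. Note that nothing above ever appends to houses, so the while condition never advances until you add something like this (untested against the live site):

    # inside the house tab, where the LOAD DATA HERE banner sits -- a sketch, untested
    title = driver.find_element_by_xpath(x_path_title).text
    address = driver.find_element_by_xpath(x_path_address).text
    houses.append({'title': title, 'address': address})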

Scrapy for eCommerce with Selenium Infinite Scroll, Help Returning Values

I don't see where you run parse_product; Scrapy will not execute it for you automatically. Besides, a function like your parse_product, which takes a response, is normally used as a callback in something like yield Request(subpage_url, callback=self.parse_product) to parse data from a subpage, not from the page you already have in parse (there's a short sketch of that pattern after the code below). You should instead move the code from parse_product into parse, like this:

def parse(self, response):
    options = Options()
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(executable_path=chrome_path)  #, chrome_options=options)
    driver.set_window_size(1920, 1080)

    p = 0  # Home Depot URLs end in =24, =48, etc.; products come 24 to a page, so this is my way of getting the next page

    start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

    scroll_pause_time = 1
    screen_height = driver.execute_script("return window.screen.height;")  # get the screen height

    while p < 25:
        driver.get(start_url + str(p))

        #sleep(2)
        i = 1

        # scrolling
        while True:  # this is the infinite scroll part which reveals all the JavaScript-generated product tiles
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            sleep(scroll_pause_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        # after scrolling
        self.html = driver.page_source
        p = p + 24
        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
            link = products.xpath(".//div//a//@href").get()
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
            yield {
                'Date scraped': date,
                'Brand': brand,
                'Title': title,
                'Product Link': "https://www.homedepot.com" + remove_tags(link),
                'Price': "$" + price,
                'Model #': model,
                'Review Count': review_count,
            }
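
For reference, the subpage pattern mentioned at the top, where parse_product is used as a callback, looks roughly like this; the link selector and the field are placeholders, not taken from your spider:

def parse(self, response):
    # follow each product link and let parse_product handle the subpage
    for href in response.xpath("//a[@class='product-link']/@href").getall():  # placeholder selector
        yield scrapy.Request(response.urljoin(href), callback=self.parse_product)

def parse_product(self, response):
    yield {'Title': response.xpath('//h1/text()').get()}  # placeholder field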

But I would make other changes: you use p = p + 24, but when I check the page in a browser I see that I need p = p + 48 to get all the products. Instead of p = p + ... I would rather use Selenium to click the > button to get the next page.


EDIT:

My version, with the other changes.

Anyone can run it without creating a Scrapy project.

#!/usr/bin/env python3

import scrapy
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime


class HdSpider(scrapy.Spider):

    name = 'hd'

    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=']  # add ?Nao= to the end of the URL you got from search or category

    def parse(self, response):

        options = Options()

        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)  #, chrome_options=options)
        #driver.set_window_size(1920, 1080)
        driver.maximize_window()

        scroll_pause_time = 1

        # loading first page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=0'
        driver.get(start_url)

        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height

        #while True:  # all pages
        for _ in range(5):  # only 5 pages

            #sleep(scroll_pause_time)

            # scrolling page
            i = 1
            while True:  # this is the infinite scroll part which reveals all the JavaScript-generated product tiles
                driver.execute_script(f"window.scrollBy(0, {screen_height});")
                sleep(scroll_pause_time)

                i += 1

                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if screen_height * i > scroll_height:
                    break

            # after scrolling
            resp = Selector(text=driver.page_source)

            for products in resp.xpath("//div[@class='product-pod--padding']"):
                date = datetime.now().strftime("%m-%d-%y")
                brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
                title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
                link = products.xpath(".//div//a//@href").get()
                model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
                review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
                price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
                yield {
                    'Date scraped': date,
                    'Brand': brand,
                    'Title': title,
                    'Product Link': "https://www.homedepot.com" + remove_tags(link),
                    'Price': "$" + price,
                    'Model #': model,
                    'Review Count': review_count,
                }

            # click button `>` to load next page
            try:
                driver.find_element_by_xpath('//a[@aria-label="Next"]').click()
            except:
                break


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file as CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in Scrapy 2.1
})

c.crawl(HdSpider)
c.start()

