How to Use Multiple Requests and Pass Items in Between Them in Scrapy Python

How can I use multiple requests and pass items between them in Scrapy (Python)?

No problem. The following is a corrected version of your code:

def page_parser(self, response):
    sites = response.xpath('//div[@class="row"]')  # was hxs.select(...); hxs was undefined
    item = MyItem()  # assumes your item class; the original snippet never created `item`

    # option 1: set meta on the request object after creating it
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = item
    yield request

    # option 2: pass meta as a keyword argument
    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription2, meta={'item': item})
    yield request

    # option 3: yield the request directly
    yield Request("http://www.example.com/lin1.cpp", callback=self.parseDescription3, meta={'item': item})

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return item

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    return item

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    return item

Storing items when having multiple requests in scrapy

Scrapy uses a scheduler to run requests: when you call Request(), it puts the request in a queue and loads the page later (when it has free workers, etc.; see: architecture). It doesn't run the request immediately, so you can't get results from parsePhoto back into parse_car_page.

You have to send data from parse_car_page to parsePhoto:

  • in parse_car_page, parse all the data that is on the page
  • in parse_car_page, use Request(..., meta=...) to send this data (or the loader) to parsePhoto

    yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader})
  • in parsePhoto, get this data back

    loader = response.meta['loader']
  • in parsePhoto, scrape the phone number and yield all the data


BTW: in meta= you can use almost any key(s), but some have special meaning: see Request.meta special keys.
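For example (a minimal sketch, not from the original answer; the URL, callback and `item` are placeholders), a custom key can sit next to a special key such as download_timeout:

    # 'item' is a custom meta key; 'download_timeout' is one of Request.meta's special keys
    yield scrapy.Request(
        "http://www.example.com/details",   # placeholder URL
        callback=self.parse_details,        # placeholder callback
        meta={'item': item, 'download_timeout': 30},
    )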


Full working code:

You can put it in a single file and run it with python script.py without creating a project. It will save the data in output.csv.

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
import json


class OtomotoItem(scrapy.Item):
    brand = scrapy.Field()
    model = scrapy.Field()
    year = scrapy.Field()
    url = scrapy.Field()
    number = scrapy.Field()
    features = scrapy.Field()


def filter_out_array(x):
    x = x.strip()
    return None if x == '' else x


class OtomotoCarLoader(ItemLoader):
    default_output_processor = TakeFirst()
    features_out = MapCompose(filter_out_array)


class OtomotoSpider(scrapy.Spider):

    name = 'otomoto'
    start_urls = ['https://www.otomoto.pl/osobowe/']

    def parse(self, response):
        for car_page in response.css('.offer-title__link::attr(href)'):
            yield response.follow(car_page, self.parse_car_page)

        for next_page in response.css('.next.abs a::attr(href)'):
            yield response.follow(next_page, self.parse)

    def parse_car_page(self, response):
        loader = OtomotoCarLoader(OtomotoItem(), response=response)

        property_list_map = {
            'Marka pojazdu': 'brand',
            'Model pojazdu': 'model',
            'Rok produkcji': 'year',
        }

        for params in response.css('.offer-params__item'):
            property_name = params.css('.offer-params__label::text').extract_first().strip()

            if property_name in property_list_map:
                css = params.css('div::text').extract_first().strip()

                if css == '':
                    css = params.css('a::text').extract_first().strip()

                loader.add_value(property_list_map[property_name], css)

        loader.add_css('features', '.offer-features__item::text')
        loader.add_value('url', response.url)

        number_id = self.parse_number(response)
        print('number_id:', len(number_id), '|', number_id)

        for id in number_id:
            phone_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + id + '/0/'
            # use `meta=` to send data to `phone_parse`
            yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader})

    def parse_number(self, response):
        number_id = response.xpath('//a[@data-path="multi_phone"]/@data-id').extract()
        print("NUMBER [before]:", number_id)

        number_id = list(set(number_id))  # use `set()` to get unique values
        print("NUMBER [after] :", number_id)

        return number_id

    def phone_parse(self, response):
        print("[phone_parse] response:", response)

        # get data from `parse_car_page`
        loader = response.meta['loader']

        item = response.xpath('//p/text()').extract()
        print('[phone_parse] item:', type(item), item)

        json_data = json.loads(item[0])
        print('[phone_parse] json:', json_data)

        number = json_data["value"].replace(" ", "")
        print("[phone_parse] number:", number)  # there it is, as a string

        # add the new data to the loader
        loader.add_value('number', number)

        yield loader.load_item()


# --- run without a project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(OtomotoSpider)
c.start()

Scrapy multiple requests and fill single item

Since Scrapy is asynchronous, you need to chain your requests manually. For transferring data between requests you can use the Request's meta attribute:

def parse(self, response):
    item = dict()
    item['name'] = 'foobar'
    yield Request('http://someurl.com', self.parse2,
                  meta={'item': item})

def parse2(self, response):
    print(response.meta['item'])
    # {'name': 'foobar'}

In your case you end up with a split chain when you should have one continuous chain.

Your code should look something like this:

def parse_companies(self, response):
    data = json.loads(response.body)
    if not data:
        return
    for company in data:
        item = ThalamusItem()
        comp_id = company["id"]
        url = self.request_details_URL + str(comp_id) + ".json"
        url2 = self.request_contacts + str(comp_id)
        request = Request(url, callback=self.parse_details,
                          meta={'url2': url2, 'item': item})
        yield request

def parse_details(self, response):
    item = response.meta['item']
    url2 = response.meta['url2']
    item['details'] = ''  # add details
    yield Request(url2, callback=self.parse_contacts, meta={'item': item})

def parse_contacts(self, response):
    item = response.meta['item']
    item['contacts'] = ''  # add contacts
    yield item

Collect items from multiple requests in an array in Scrapy

There are different approaches for this. One is chaining, as you do. Problems occur if one of the requests in the middle of the chain is dropped for any reason, so you have to be really careful about that and handle all possible errors / ignored requests.
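As an illustration (a minimal sketch, not from the original answer; the spider, URLs and field names are placeholders), you can attach an errback to each chained request so that a dropped request does not silently lose the data collected so far:

import scrapy


class ChainSpider(scrapy.Spider):
    name = 'chain-example'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        item = {'first': response.css('title::text').get()}
        yield scrapy.Request(
            'http://www.example.com/second',   # placeholder URL for the next step in the chain
            callback=self.parse_second,
            errback=self.handle_error,         # called on DNS failures, timeouts, non-200 responses, ...
            meta={'item': item},
        )

    def parse_second(self, response):
        item = response.meta['item']
        item['second'] = response.css('title::text').get()
        yield item

    def handle_error(self, failure):
        # yield the partially filled item instead of dropping the whole chain
        item = failure.request.meta['item']
        self.logger.warning('request failed: %r', failure)
        yield item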

Another approach is to use a separate spider for all "grouped" requests.
You can start those spiders programmatically and pass a bucket (e.g. a dict) as a spider attribute. Within your pipeline, you add the items from each request to this bucket. From the "outside", you listen for the spider_closed signal and retrieve this bucket, which then contains all your items.

Look here for how to start a spider programmatically via a crawler runner:
https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process

Pass a bucket to your spider when calling crawl() on your crawler runner:

crawler_runner_object.crawl(YourSpider, bucket=dict())

and catch the spider_closed signal:

from scrapy import signals
from scrapy.signalmanager import dispatcher

def on_spider_closed(spider):
    bucket = spider.bucket

dispatcher.connect(on_spider_closed, signal=signals.spider_closed)

This approach might seem even more complicated than chaining your requests, but it actually takes a lot of complexity out of the problem, since within your spider you can make your requests without caring much about all the other requests.
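Put together, a minimal, self-contained sketch of this approach could look like the following (the spider, pipeline and item fields are placeholders; it assumes the pipeline is where items get appended to spider.bucket):

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher


class BucketPipeline:
    # append every scraped item to the bucket that was passed to the spider
    def process_item(self, item, spider):
        spider.bucket.setdefault('items', []).append(item)
        return item


class GroupSpider(scrapy.Spider):
    name = 'group-example'
    start_urls = ['http://www.example.com/']   # placeholder URL
    custom_settings = {
        'ITEM_PIPELINES': {'__main__.BucketPipeline': 100},
    }

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}


def on_spider_closed(spider):
    # the bucket now holds all items from this "group" of requests
    print('collected items:', spider.bucket.get('items', []))


dispatcher.connect(on_spider_closed, signal=signals.spider_closed)

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(GroupSpider, bucket=dict())      # the bucket becomes a spider attribute
process.start()

The linked documentation uses a CrawlerRunner for running several spiders in the same process; CrawlerProcess is used here only to keep the sketch runnable as a single script.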

Scrapy: Passing item between methods

There is an argument named meta for Request:

yield Request(url, callback=self.detail, meta={'item': item})

Then, in the detail function, access it this way:

item = response.meta['item']

See more details in the Scrapy docs on the jobs topic (pausing and resuming crawls).

Multiple pages per item in Scrapy

If I understand you correctly, you have (at least) two different cases:

  1. The crawled page links to another page containing the data (1+ further request necessary)
  2. The crawled page contains the data (No further request necessary)

In your current code, you call yield bl.load_item() for both cases, but in the parse callback. Note that the request you yield is executed at some later point in time, so at that moment the item is still incomplete; that's why you're missing the place_property key from the item in the first case.

Possible Solution

A possible solution (if I understood you correctly) is to exploit the asynchronous behavior of Scrapy. Only minor changes to your code are involved.

For the first case, you pass the item loader to another request, which will then yield it. This is what you do in the isinstance if clause. You'll need to change the return value of the get_url_property callback to actually yield the loaded item.

For the second case, you can return the item directly, so simply yield the item in the else clause.

The following code contains the changes to your example.
Does this solve your problem?

def parse(self, response):

    # ...

    if isinstance(parse_xpath, dict):  # place_property is at a URL
        url = sel.xpath(parse_xpath['url_elem']).extract()
        yield Request(url, callback=self.get_url_property,
                      meta={'loader': bl, 'parse_xpath': parse_xpath,
                            'place_property': place_property})
    else:  # parse_xpath is just an xpath; process normally
        bl.add_xpath(place_property, parse_xpath)
        yield bl.load_item()

def get_url_property(self, response):

    loader = response.meta['loader']
    # ...
    loader.add_value(place_property, sel.xpath(parse_xpath['xpath']))
    yield loader.load_item()

Related to that problem is the question of chaining requests, for which I have noted a similar solution.

How to request multiple links at once and parse them later with Scrapy?

Try changing CONCURRENT_REQUESTS, which is 16 by default, to a higher number.

As per the Scrapy docs:

The maximum number of concurrent (i.e. simultaneous) requests that will be performed by the Scrapy downloader.

Note that in some cases this results in hardware bottlenecks, so try not to increase it by too much at once. I'd recommend gradually increasing this value and observing system stats (CPU/network).
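For example (a sketch; the spider name, URL and values are arbitrary), the setting can be raised globally in settings.py or per spider via custom_settings:

import scrapy


class FastSpider(scrapy.Spider):
    name = 'fast-example'
    start_urls = ['http://www.example.com/']   # placeholder URL

    # per-spider override; the same keys can also go into settings.py
    custom_settings = {
        'CONCURRENT_REQUESTS': 32,             # global limit, default is 16
        'CONCURRENT_REQUESTS_PER_DOMAIN': 16,  # per-domain limit, default is 8
    }

    def parse(self, response):
        yield {'url': response.url}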

How to pass information from one method to another in scrapy

The item (items = HbsCandidatesItem()) should be created inside the for loop:

for i in range(len(my_list)):
    url_final = urljoin(url, my_list[i])
    temp_url = response.urljoin(url_final)

    items = HbsCandidatesItem()
    items['Candidate Name'] = names_temp[i]
    items['Image ID'] = images_temp[i]
    items['Work Authorization'] = wa_temp[i]

    request = scrapy.Request(temp_url, callback=self.parse_can_contents)
    request.cb_kwargs['items'] = items
    yield request
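Since the item is passed through cb_kwargs rather than meta, the receiving callback takes it as a keyword argument. A minimal sketch (the selector and the extra field are placeholders, not from the original question):

def parse_can_contents(self, response, items):
    # `items` is the partially filled item passed in via cb_kwargs
    items['Candidate Details'] = response.css('div.details::text').get()   # placeholder selector/field
    yield items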

How to crawl multiple-level pages to one item in Scrapy?

I can answer it myself now.

Simply yielding nothing (or omitting the return statements) in parse_C() and parse_D() solves the problem.

Some explanation

Scrapy will not close the spider simply because one of the callbacks returns nothing; it also makes sure there are no new requests left in the queue.

So, since parse_B() does not return None or an item before it finishes yielding all of its requests for subpages C and D, the workflow won't be interrupted.
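For reference, a minimal sketch of such a chain (the URLs, field names and page layout are assumptions, not the asker's original code): intermediate callbacks yield only further requests, and the complete item is yielded once, at the end of the chain:

import scrapy


class MultiLevelSpider(scrapy.Spider):
    name = 'multilevel-example'

    def start_requests(self):
        # placeholder entry point straight into the B level of the chain
        yield scrapy.Request('http://www.example.com/b', callback=self.parse_B)

    def parse_B(self, response):
        item = {'b': response.css('title::text').get()}
        # no item is yielded here, only the next request in the chain
        yield scrapy.Request('http://www.example.com/c',
                             callback=self.parse_C, meta={'item': item})

    def parse_C(self, response):
        item = response.meta['item']
        item['c'] = response.css('title::text').get()
        # again, only a request; the spider stays alive because this request is pending
        yield scrapy.Request('http://www.example.com/d',
                             callback=self.parse_D, meta={'item': item})

    def parse_D(self, response):
        item = response.meta['item']
        item['d'] = response.css('title::text').get()
        yield item   # the single place where the complete item is yielded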


