How can I use multiple requests and pass items in between them in Scrapy (Python)
No problem. The following is a corrected version of your code:
def page_parser(self, response):
    sites = hxs.select('//div[@class="row"]')
    item = MyItem()  # whatever item class you use; `item` has to exist before it is passed on

    request = Request("http://www.example.com/lin1.cpp", callback=self.parseDescription1)
    request.meta['item'] = item
    yield request

    # different URLs, otherwise the duplicate filter would drop the repeated requests
    request = Request("http://www.example.com/lin2.cpp", callback=self.parseDescription2, meta={'item': item})
    yield request

    yield Request("http://www.example.com/lin3.cpp", callback=self.parseDescription3, meta={'item': item})
def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    return item

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    return item

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    return item
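Note that with three independent requests like the above, each callback returns its own copy of the (partial) item. If you want a single item that carries desc1, desc2 and desc3 together, you can chain the requests instead. A minimal sketch of that variant (callback names taken from the code above; MyItem and the URLs are placeholders):

def page_parser(self, response):
    item = MyItem()  # hypothetical item class, stands in for whatever item you use

    # start the chain; each callback adds its field and hands the item to the next request
    yield Request("http://www.example.com/lin1.cpp",
                  callback=self.parseDescription1, meta={'item': item})

def parseDescription1(self, response):
    item = response.meta['item']
    item['desc1'] = "test"
    yield Request("http://www.example.com/lin2.cpp",
                  callback=self.parseDescription2, meta={'item': item})

def parseDescription2(self, response):
    item = response.meta['item']
    item['desc2'] = "test2"
    yield Request("http://www.example.com/lin3.cpp",
                  callback=self.parseDescription3, meta={'item': item})

def parseDescription3(self, response):
    item = response.meta['item']
    item['desc3'] = "test3"
    yield item  # the single, complete item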
Storing items when having multiple requests in scrapy
Scrapy uses a scheduler to run requests, so when you use Request() the request is put into a queue and the page is loaded later (when there are free workers, etc.; see the architecture overview). It doesn't run the request directly, so you can't get results from parsePhoto back into parse_car_page.
You have to send data from parse_car_page to parsePhoto:
- in parse_car_page, parse all the data that is on the page
- in parse_car_page, use Request(..., meta=...) to send this data (or the loader) to parsePhoto:
  yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader})
- in parsePhoto, get this data back: loader = response.meta['loader']
- in parsePhoto, scrape the number and yield all the data
BTW: in meta= you can use almost any key(s), but some have a special meaning: see Request.meta special keys in the docs.
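For example, custom keys travel through untouched while special keys change how Scrapy handles the request. A small illustration, inside a callback (url and loader are assumed to exist in the surrounding code):

yield scrapy.Request(
    url,
    callback=self.phone_parse,
    meta={
        'loader': loader,         # custom key, simply passed to the callback
        'download_timeout': 30,   # special key: per-request download timeout
        'dont_redirect': True,    # special key: disable redirects for this request
    },
)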
Full working code. You can put it in one file and run it with python script.py without creating a project, and it will save the data in output.csv.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from scrapy.shell import inspect_response
import json
import time


class OtomotoItem(scrapy.Item):
    brand = scrapy.Field()
    model = scrapy.Field()
    year = scrapy.Field()
    url = scrapy.Field()
    number = scrapy.Field()
    features = scrapy.Field()


def filter_out_array(x):
    x = x.strip()
    return None if x == '' else x


class OtomotoCarLoader(ItemLoader):
    default_output_processor = TakeFirst()
    features_out = MapCompose(filter_out_array)


class OtomotoSpider(scrapy.Spider):
    name = 'otomoto'
    start_urls = ['https://www.otomoto.pl/osobowe/']

    def parse(self, response):
        for car_page in response.css('.offer-title__link::attr(href)'):
            yield response.follow(car_page, self.parse_car_page)
        for next_page in response.css('.next.abs a::attr(href)'):
            yield response.follow(next_page, self.parse)

    def parse_car_page(self, response):
        loader = OtomotoCarLoader(OtomotoItem(), response=response)

        property_list_map = {
            'Marka pojazdu': 'brand',
            'Model pojazdu': 'model',
            'Rok produkcji': 'year',
        }

        for params in response.css('.offer-params__item'):
            property_name = params.css('.offer-params__label::text').extract_first().strip()
            if property_name in property_list_map:
                css = params.css('div::text').extract_first().strip()
                if css == '':
                    css = params.css('a::text').extract_first().strip()
                loader.add_value(property_list_map[property_name], css)

        loader.add_css('features', '.offer-features__item::text')
        loader.add_value('url', response.url)

        number_id = self.parse_number(response)
        print('number_id:', len(number_id), '|', number_id)

        for id in number_id:
            phone_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + id + '/0/'
            # use `meta=` to send data to `phone_parse`
            yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader})

    def parse_number(self, response):
        number_id = response.xpath('//a[@data-path="multi_phone"]/@data-id').extract()
        print("NUMBER [before]:", number_id)
        number_id = list(set(number_id))  # use `set()` to get unique values
        print("NUMBER [after] :", number_id)
        return number_id

    def phone_parse(self, response):
        print("[phone_parse] response:", response)

        # get data from `parse_car_page`
        loader = response.meta['loader']

        item = response.xpath('//p/text()').extract()
        print('[phone_parse] item:', type(item), item)

        json_data = json.loads(item[0])
        print('[phone_parse] json:', json_data)

        number = json_data["value"].replace(" ", "")
        print("[phone_parse] number:", number)  # there it is, as a string

        # add the new data to the loader
        loader.add_value('number', number)

        yield loader.load_item()


# --- run without creating a project and save data in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(OtomotoSpider)
c.start()
Scrapy multiple requests and fill single item
Since Scrapy is asynchronous you need to chain your requests manually. For transferring data between requests you can use Request's meta attribute:
def parse(self, response):
    item = dict()
    item['name'] = 'foobar'
    yield scrapy.Request('http://someurl.com', self.parse2,
                         meta={'item': item})

def parse2(self, response):
    print(response.meta['item'])
    # {'name': 'foobar'}
In your case you end up with a split chain when you should have one continuous chain.
Your code should look something like this:
def parse_companies(self, response):
    data = json.loads(response.body)
    if not data:
        return
    for company in data:
        item = ThalamusItem()
        comp_id = company["id"]
        url = self.request_details_URL + str(comp_id) + ".json"
        url2 = self.request_contacts + str(comp_id)
        request = Request(url, callback=self.parse_details,
                          meta={'url2': url2, 'item': item})
        yield request

def parse_details(self, response):
    item = response.meta['item']
    url2 = response.meta['url2']
    item['details'] = ''  # add details
    yield Request(url2, callback=self.parse_contacts, meta={'item': item})

def parse_contacts(self, response):
    item = response.meta['item']
    item['contacts'] = ''  # add details
    yield item
Collect items from multiple requests in an array in Scrapy
There are different approaches for this. One is chaining, as you do. Problems occur if one of the requests in the middle of the chain is dropped for any reason, so you have to be really careful about that and handle all possible errors / ignored requests.
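One way to guard a chain is to attach an errback to the chained requests, so a dropped request still yields whatever has been collected so far. A rough sketch of the idea (callback names and fields are illustrative, not your exact code):

import scrapy

def parse_details(self, response):
    item = response.meta['item']
    item['details'] = ''  # add details
    # attach an errback so a failed/ignored request doesn't silently drop the item
    yield scrapy.Request(response.meta['url2'],
                         callback=self.parse_contacts,
                         errback=self.errback_contacts,
                         meta={'item': item})

def errback_contacts(self, failure):
    # the next step in the chain failed; yield the partial item instead of losing it
    item = failure.request.meta['item']
    yield item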
Another approach is to use a separate spider for all "grouped" requests.
You can start those spiders programmatically and pass a bucket (e.g. a dict) as a spider attribute. Within your pipeline you add the items from each request to this bucket. From "outside" you listen for the spider_closed signal and get this bucket, which then contains all your items.
Look here for how to start a spider programmatically via a crawler runner:
https://docs.scrapy.org/en/latest/topics/practices.html#running-multiple-spiders-in-the-same-process
Pass a bucket to your spider when calling crawl() on your crawler runner:
crawler_runner_object.crawl(YourSpider, bucket=dict())
and catch the spider_closed signal:
from scrapy import signals
from scrapy.signalmanager import dispatcher

def on_spider_closed(spider):
    bucket = spider.bucket

dispatcher.connect(on_spider_closed, signal=signals.spider_closed)
This approach might seem even more complicated than chaining your requests, but it actually takes a lot of complexity out of the problem: within your spider you can make your requests without having to care much about all the other requests.
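Putting those pieces together, a minimal sketch of the whole pattern could look like this (GroupSpider, its selector and the bucket['titles'] key are made up for illustration; in a real project a pipeline would fill the bucket):

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

class GroupSpider(scrapy.Spider):
    # hypothetical spider that performs one "group" of requests
    name = 'group'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        # for the sketch, the spider writes directly into the bucket
        self.bucket.setdefault('titles', []).append(
            response.css('title::text').get())

def on_spider_closed(spider):
    # the bucket now holds everything this spider collected
    print(spider.bucket)

dispatcher.connect(on_spider_closed, signal=signals.spider_closed)

configure_logging()
runner = CrawlerRunner()
bucket = dict()
d = runner.crawl(GroupSpider, bucket=bucket)  # `bucket` becomes a spider attribute
d.addBoth(lambda _: reactor.stop())
reactor.run()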
Scrapy: Passing item between methods
There is an argument named meta for Request:
yield Request(url, callback=self.detail, meta={'item': item})
then in the detail function, access it this way:
item = response.meta['item']
See more details in the jobs topic of the Scrapy docs.
Multiple pages per item in Scrapy
If I understand you correctly, you have (at least) two different cases:
- The crawled page links to another page containing the data (1+ further request necessary)
- The crawled page contains the data (No further request necessary)
In your current code, you call yield bl.load_item() for both cases, but in the parse callback. Note that the request you yield is executed at some later point in time, so the item is still incomplete; that's why you're missing the place_property key from the item in the first case.
Possible Solution
A possible solution (if I understood you correctly) is to exploit the asynchronous behavior of Scrapy. Only minor changes to your code are involved.
For the first case, you pass the item loader to another request, which will then yield it. This is what you do in the isinstance if clause. You'll need to change the return value of the get_url_property callback to actually yield the loaded item.
For the second case, you can return the item directly, so simply yield the item in the else clause.
The following code contains the changes to your example.
Does this solve your problem?
def parse(self, response):
    # ...
    if isinstance(parse_xpath, dict):  # place_property is at a URL
        url = sel.xpath(parse_xpath['url_elem']).extract()
        yield Request(url, callback=self.get_url_property,
                      meta={'loader': bl, 'parse_xpath': parse_xpath,
                            'place_property': place_property})
    else:  # parse_xpath is just an xpath; process normally
        bl.add_xpath(place_property, parse_xpath)
        yield bl.load_item()

def get_url_property(self, response):
    loader = response.meta['loader']
    # ...
    loader.add_value(place_property, sel.xpath(parse_xpath['xpath']))
    yield loader.load_item()
Related to that problem is the question of chaining requests, for which I have noted a similar solution.
How to request multiple links at once and parse them later with Scrapy?
Try changing CONCURRENT_REQUESTS, which is 16 by default, to a higher number.
As per the Scrapy docs:
The maximum number of concurrent (ie. simultaneous) requests that will be performed to any single domain.
Note that in some cases this results in hardware bottlenecks, so try not to increase it by a lot. I'd recommend gradually increasing this value and observing system stats (CPU/network).
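These are ordinary Scrapy settings, so you can raise them in settings.py (the values below are only illustrative):

# settings.py -- raise concurrency limits gradually and watch CPU/network load
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32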
How to pass information from one method to another in scrapy
The item (items = HbsCandidatesItem()) should be created inside the for loop:
for i in range(len(my_list)):
    url_final = urljoin(url, my_list[i])
    temp_url = response.urljoin(url_final)

    items = HbsCandidatesItem()
    items['Candidate Name'] = names_temp[i]
    items['Image ID'] = images_temp[i]
    items['Work Authorization'] = wa_temp[i]

    request = scrapy.Request(temp_url, callback=self.parse_can_contents)
    request.cb_kwargs['items'] = items
    yield request
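With cb_kwargs, the item arrives in the callback as a keyword argument rather than via response.meta. A short sketch of the receiving side (the selector and the 'Candidate Details' field are placeholders):

def parse_can_contents(self, response, items):
    # `items` is the object set via request.cb_kwargs['items'] above
    items['Candidate Details'] = response.css('div.details::text').get()  # placeholder selector/field
    yield items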
How to crawl multiple-level pages to one item in Scrapy?
I can answer it myself now.
Just yielding None (or omitting the return statements) in parse_C() and parse_D() will solve the problem.
Some explanation
Scrapy will not close the spider simply because one of the callbacks returns nothing; it also makes sure there are no new requests left in the queue.
So, since parse_B() will not return None or an Item before it has finished yielding all of its requests for subpages C & D, the workflow won't be interrupted.
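For illustration only, a hedged sketch of such a middle-of-the-chain callback (names follow the answer; the selectors and field are placeholders): it adds data to the item and yields further requests, but no item and no return.

import scrapy

def parse_C(self, response):
    item = response.meta['item']
    item['c_data'] = response.css('span.c::text').get()  # placeholder field/selector
    # no `return item` here: yielding only more requests (or nothing at all)
    # doesn't end the crawl, because the scheduler still holds the other
    # requests that parse_B() yielded
    for href in response.css('a.more::attr(href)').getall():  # placeholder selector
        yield scrapy.Request(response.urljoin(href),
                             callback=self.parse_D,
                             meta={'item': item})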