Обработка ошибок Scrapy Spider (scrapy.core.scraper)

Обработка ошибок Scrapy Spider (scrapy.core.scraper) ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Обработка ошибок Scrapy Spider (scrapy.core.scraper)

Цитата

Сообщение Anonymous » 09 ноя 2024, 10:58

Я несколько часов читал возможные решения, но так и не смог найти ответ на свою проблему.
Я пытаюсь выполнить скрапинг веб-страницы супермаркета; думаю, ошибка в функции parse. Пожалуйста, помогите, если кто-то может.
import scrapy
from bs4 import BeautifulSoup
import re
from fakesupermarketPrueba.items import fakesupermarketItem

class FakeSupermarketSpider(scrapy.Spider):
    """Spider that scrapes product data from fakesupermarket.com.py.

    The response for ``start_urls`` only kicks the crawl off: ``parse``
    schedules one request per URL in ``all_link_categories`` and every
    resulting response is converted into a ``fakesupermarketItem`` by
    ``parse_main_item``.

    NOTE(review): ``cleanText``, ``parseText`` and ``listToStr`` are called
    below but are not defined in this excerpt -- presumably they live
    elsewhere in the spider; confirm before running.
    """

    name = 'FakeSupermarketSpider'
    allowed_domains = ['fakesupermarket.com.py']
    start_urls = ['http://www.fakesupermarket.com.py/category/10']

    # Pages to scrape. This is a *class attribute*, so inside methods it is
    # reachable only as ``self.all_link_categories`` (or through the class),
    # never as a bare name.
    all_link_categories = [
        'http://www.fakesupermarket.com.py/category/1218.aspx',
        'http://www.fakesupermarket.com.py/category/1231.aspx',
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then declare the XPath expressions.

        Arguments are forwarded to ``scrapy.Spider.__init__`` so that spider
        arguments keep working; calling with no arguments behaves as before.
        """
        super().__init__(*args, **kwargs)
        self.declare_xpath()

    def declare_xpath(self):
        """Store the absolute XPath expression for each scraped field."""
        self.TitleXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/h1/text()"
        self.LinkXpath = "/html/body/div[1]/form/@action"
        self.PriceXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[5]/span/text()"
        self.NumSerieXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[6]/text()"
        self.LinkImgXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/ul/li/img/@src"
        self.UnitXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[8]/div[1]/div/span/text()"

    def parse(self, response):
        """Yield one request per URL listed in ``all_link_categories``.

        Each request is handled by ``parse_main_item``.
        """
        # Fix: the list must be looked up through ``self``. The bare name
        # ``all_link_categories`` is not visible from method scope and raised
        # "NameError: name 'all_link_categories' is not defined".
        for href in self.all_link_categories:
            url = response.urljoin(href)
            yield scrapy.Request(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        """Extract the product fields from ``response`` and return the item."""
        item = fakesupermarketItem()

        Title = response.xpath(self.TitleXpath).extract()
        Title = self.cleanText(self.parseText(self.listToStr(Title)))

        # NOTE(review): unlike the other fields, Link and Unit hand the raw
        # extracted list to parseText without listToStr -- confirm that
        # parseText accepts a list, otherwise wrap them the same way.
        Link = response.xpath(self.LinkXpath).extract()
        Link = self.cleanText(self.parseText(Link))

        Price = response.xpath(self.PriceXpath).extract()
        Price = self.cleanText(self.parseText(self.listToStr(Price)))

        NumSerie = response.xpath(self.NumSerieXpath).extract()
        NumSerie = self.cleanText(self.parseText(self.listToStr(NumSerie)))

        LinkImg = response.xpath(self.LinkImgXpath).extract()
        LinkImg = self.cleanText(self.parseText(self.listToStr(LinkImg)))

        Unit = response.xpath(self.UnitXpath).extract()
        Unit = self.cleanText(self.parseText(Unit))

        item['Title'] = Title
        item['Link'] = Link
        item['Price'] = Price
        item['NumSerie'] = NumSerie
        item['LinkImg'] = LinkImg
        item['Unit'] = Unit
        return item

Моя идея состоит в том, чтобы скопировать все ссылки (их около 300), по которым находятся конечные продукты.
Эти ссылки находятся в списке all_link_categories.
(Я думаю, ошибка именно здесь.)
Журналы:
2021-01-12 20:14:40 [scrapy.core.engine] INFO: Spider opened
2021-01-12 20:14:40 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-01-12 20:14:40 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-01-12 20:14:40 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)
2021-01-12 20:14:41 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)
2021-01-12 20:14:42 [scrapy.core.scraper] ERROR: Spider error processing (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/python.py", line 353, in __next__
return next(self.data)
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/python.py", line 353, in __next__
return next(self.data)
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/root/fakesupermarketPrueba/fakesupermarketPrueba/spiders/spider.py", line 374, in parse
for href in all_link_categories:
NameError: name 'all_link_categories' is not defined
2021-01-12 20:14:42 [scrapy.core.engine] INFO: Closing spider (finished)

Подробнее здесь: https://stackoverflow.com/questions/65692305/scrapy-spider-error-processing-scrapy-core-scraper

1731139117

Anonymous

Я несколько часов читал возможные решения, но так и не смог найти ответ на свою проблему.
Я пытаюсь выполнить скрапинг веб-страницы супермаркета; думаю, ошибка в функции parse. Пожалуйста, помогите, если кто-то может.
import scrapy
from bs4 import BeautifulSoup
import re
from fakesupermarketPrueba.items import fakesupermarketItem

class FakeSupermarketSpider(scrapy.Spider):
    """Spider that scrapes product data from fakesupermarket.com.py.

    The response for ``start_urls`` only kicks the crawl off: ``parse``
    schedules one request per URL in ``all_link_categories`` and every
    resulting response is converted into a ``fakesupermarketItem`` by
    ``parse_main_item``.

    NOTE(review): ``cleanText``, ``parseText`` and ``listToStr`` are called
    below but are not defined in this excerpt -- presumably they live
    elsewhere in the spider; confirm before running.
    """

    name = 'FakeSupermarketSpider'
    allowed_domains = ['fakesupermarket.com.py']
    start_urls = ['http://www.fakesupermarket.com.py/category/10']

    # Pages to scrape. This is a *class attribute*, so inside methods it is
    # reachable only as ``self.all_link_categories`` (or through the class),
    # never as a bare name.
    all_link_categories = [
        'http://www.fakesupermarket.com.py/category/1218.aspx',
        'http://www.fakesupermarket.com.py/category/1231.aspx',
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then declare the XPath expressions.

        Arguments are forwarded to ``scrapy.Spider.__init__`` so that spider
        arguments keep working; calling with no arguments behaves as before.
        """
        super().__init__(*args, **kwargs)
        self.declare_xpath()

    def declare_xpath(self):
        """Store the absolute XPath expression for each scraped field."""
        self.TitleXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/h1/text()"
        self.LinkXpath = "/html/body/div[1]/form/@action"
        self.PriceXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[5]/span/text()"
        self.NumSerieXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[6]/text()"
        self.LinkImgXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[3]/ul/li/img/@src"
        self.UnitXpath = "/html/body/div[1]/form/div[5]/div[6]/div/div[2]/div[1]/div/div[1]/div[2]/div/div[3]/div/div/div[8]/div[1]/div/span/text()"

    def parse(self, response):
        """Yield one request per URL listed in ``all_link_categories``.

        Each request is handled by ``parse_main_item``.
        """
        # Fix: the list must be looked up through ``self``. The bare name
        # ``all_link_categories`` is not visible from method scope and raised
        # "NameError: name 'all_link_categories' is not defined".
        for href in self.all_link_categories:
            url = response.urljoin(href)
            yield scrapy.Request(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        """Extract the product fields from ``response`` and return the item."""
        item = fakesupermarketItem()

        Title = response.xpath(self.TitleXpath).extract()
        Title = self.cleanText(self.parseText(self.listToStr(Title)))

        # NOTE(review): unlike the other fields, Link and Unit hand the raw
        # extracted list to parseText without listToStr -- confirm that
        # parseText accepts a list, otherwise wrap them the same way.
        Link = response.xpath(self.LinkXpath).extract()
        Link = self.cleanText(self.parseText(Link))

        Price = response.xpath(self.PriceXpath).extract()
        Price = self.cleanText(self.parseText(self.listToStr(Price)))

        NumSerie = response.xpath(self.NumSerieXpath).extract()
        NumSerie = self.cleanText(self.parseText(self.listToStr(NumSerie)))

        LinkImg = response.xpath(self.LinkImgXpath).extract()
        LinkImg = self.cleanText(self.parseText(self.listToStr(LinkImg)))

        Unit = response.xpath(self.UnitXpath).extract()
        Unit = self.cleanText(self.parseText(Unit))

        item['Title'] = Title
        item['Link'] = Link
        item['Price'] = Price
        item['NumSerie'] = NumSerie
        item['LinkImg'] = LinkImg
        item['Unit'] = Unit
        return item

Моя идея состоит в том, чтобы скопировать все ссылки (их около 300), по которым находятся конечные продукты.
Эти ссылки находятся в списке all_link_categories.
(Я думаю, ошибка именно здесь.)
Журналы:
2021-01-12 20:14:40 [scrapy.core.engine] INFO: Spider opened
2021-01-12 20:14:40 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-01-12 20:14:40 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-01-12 20:14:40 [scrapy.core.engine] DEBUG: Crawled (200)  (referer: None)
2021-01-12 20:14:41 [scrapy.core.engine] DEBUG: Crawled (200)  (referer: None)
2021-01-12 20:14:42 [scrapy.core.scraper] ERROR: Spider error processing   (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/defer.py", line 120, in iter_errback
yield next(it)
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/python.py", line 353, in __next__
return next(self.data)
File "/usr/local/lib/python3.8/dist-packages/scrapy/utils/python.py", line 353, in __next__
return next(self.data)
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/referer.py", line 340, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.8/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.8/dist-packages/scrapy/core/spidermw.py", line 62, in _evaluate_iterable
for r in iterable:
File "/root/fakesupermarketPrueba/fakesupermarketPrueba/spiders/spider.py", line 374, in parse
for href in all_link_categories:
NameError: name 'all_link_categories' is not defined
2021-01-12 20:14:42 [scrapy.core.engine] INFO: Closing spider (finished)
 

Подробнее здесь: [url]https://stackoverflow.com/questions/65692305/scrapy-spider-error-processing-scrapy-core-scraper[/url]