2024-12-16 17:25:32 [ABC] ОШИБКА: Ошибка запроса: URL - [Экземпляр ошибки: обратная трассировка: net::ERR_INVALID_ARGUMENT по адресу URL]
=========================== журналы ===========================
переход к «URL», ожидание события «load»
Может ли кто-нибудь помочь мне понять, почему это происходит и как это исправить?
Ниже приведён пример моего кода. Мне пришлось скрыть информацию о сайте, поскольку она конфиденциальна.
Код: Выделить всё
import scrapy
import json
import calendar
import time
import logging
import re
from scrapy_playwright.page import PageMethod
from scrapy import Request
class ABCSpider(scrapy.Spider):
    """Render search result pages with Playwright and print the result links.

    For every (place, category, area type) combination a search URL is
    requested through scrapy-playwright. The Playwright page object is
    attached to the request, so both the callback and the errback are
    responsible for closing it.
    """

    name = 'ABC'

    # Proxy setup, currently disabled. Kept as comments for reference.
    # custom_settings = {
    #     "PLAYWRIGHT_LAUNCH_OPTIONS": {
    #         "headless": True,
    #         "proxy": {
    #             "server": "http://proxy.crawlera.com:8010",
    #             "username": "my_key",
    #             "password": "",
    #         },
    #     },
    #     "PLAYWRIGHT_CONTEXTS": {
    #         "default": {
    #             "ignore_https_errors": True
    #         }
    #     }
    # }

    def start_requests(self):
        """Yield one Playwright-rendered request per search combination."""
        for place in ['01']:
            for zptid in ['church']:
                for tt in ['city', 'countryside']:
                    # NOTE: the host part of this URL is a placeholder.
                    url = f"https://...{tt}/{zptid}.aspx?Paging=12&Sort=0&Page=0&Search={place}"
                    yield Request(
                        url=url,
                        meta={
                            "playwright": True,
                            # Exposes the page as meta["playwright_page"];
                            # it must be closed explicitly afterwards.
                            "playwright_include_page": True,
                            # Current scrapy-playwright releases read page
                            # actions from "playwright_page_methods"; the
                            # older "playwright_page_coroutines" key is the
                            # legacy name.
                            "playwright_page_methods": [
                                PageMethod("wait_for_selector", "#contentHolder_result"),
                                PageMethod("wait_for_load_state", "load"),
                                PageMethod("evaluate", "console.log('Page loaded')"),
                            ],
                        },
                        callback=self.parse_search_url,
                        errback=self.errback,
                    )

    async def parse_search_url(self, response):
        """Print the result count and every result link of a search page.

        The Playwright page attached to the response is always closed, even
        when extraction raises, so browser pages are not leaked.
        """
        page = response.meta.get("playwright_page")
        try:
            print('Analyzing search page:', response.url)
            results_number = response.xpath('//span[@id="contentHolder_result"]/text()').get()
            links = response.css('a.card-img, a.card-img-en').xpath('@href').getall()
            print(f"Found {results_number} results")
            for link in links:
                print(link)
        finally:
            if page is not None:
                await page.close()

    async def errback(self, failure):
        """Log a failed request and close its Playwright page, if any."""
        # Log first so the failure is recorded even if closing the page fails.
        self.logger.error(f"Request failed: {failure.request.url} - {failure}")
        # The page may be missing when the failure happened before it was
        # created, so do not index the meta dict unconditionally.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            # Page.close() is a coroutine; without "await" it never runs.
            await page.close()
Код: Выделить всё
# Scrapy project settings for the "off_abc" project.

BOT_NAME = "off_abc"

SPIDER_MODULES = ["off_abc.spiders"]
NEWSPIDER_MODULE = "off_abc.spiders"

# scrapy-playwright runs on the asyncio-based Twisted reactor.
# This setting was previously assigned twice with the same value; one
# assignment is enough.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# Route both HTTP and HTTPS downloads through the Playwright handler.
DOWNLOAD_HANDLERS = {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
}

# Crawlera proxy middleware, currently disabled.
#DOWNLOADER_MIDDLEWARES = {
# 'scrapy_crawlera.CrawleraMiddleware': 610,
#}
#CRAWLERA_ENABLED = True
#CRAWLERA_APIKEY = 'my_key'
#CRAWLERA_PRESERVE_DELAY = False

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'off_abc.pipelines.DuplicatesPipeline': 100,
}
Подробнее здесь: https://stackoverflow.com/questions/792 ... ed-failure
Мобильная версия