Код: Выделить всё
# spider.py
def start_requests(self):
yield scrapy.Request(url=self.url, method='POST', callback=self.parse, dont_filter=True, flags=['this is the start request method'])
def parse(self, response):
logging.info('started parsing')
l = ItemLoader(FeedItem())
json_response = json.loads(response.text)
l.add_value('feed', json_response)
yield l.load_items()
yield scrapy.Request(url=self.url, method='POST', callback=self.parse, flags=['this is the parse method'])
Код: Выделить всё
# middlewares.py
def process_request(self, request, spider):
sinceId = self.cur.execute('SELECT sinceId FROM proposal').fetchone()
jobT = self.cur.execute('SELECT jobT FROM proposal').fetchone()
if not sinceId:
body = self.body.encode('utf-8')
request = request.replace(body=body)
spider.logger.info(f'{request.flags}')
spider.logger.info('Returning unchanged request')
return request
body = re.sub(r'("sinceId":")(\d+)(")', '"sinceId":' + f'"{sinceId}"', self.body) # change sinceId
body = re.sub(r'("jobTime":")(\d+)(")', '"jobTime":' + f'"{jobT}"', body) # changed jobTime
body = self.body.encode('utf-8')
spider.logger.info('Body changed')
request = request.replace(body=body)
spider.logger.info(f'{request.flags}')
spider.logger.info('Returning changed request')
return request
def spider_opened(self, spider):
self.body = '{"query":"\n query($queryParams: UserSavedSearchesParams) {\n userSavedSearches(params: $queryParams) {\n results {\n id\n uid:id\n title\n ciphertext\n description\n type\n recno\n freelancersToHire\n duration\n durationLabel\n engagement\n amount {\n amount:displayValue\n }\n createdOn:createdDateTime\n publishedOn:publishedDateTime\n renewedOn:renewedDateTime\n prefFreelancerLocation\n prefFreelancerLocationMandatory\n connectPrice\n client {\n totalHires\n totalPostedJobs\n totalSpent {\n rawValue\n currency\n displayValue\n }\n paymentVerificationStatus,\n location {\n country\n }\n totalReviews\n totalFeedback\n companyRid\n edcUserId\n lastContractRid\n companyOrgUid\n hasFinancialPrivacy\n }\n enterpriseJob\n premium\n jobTs:jobTime\n skills {\n id\n name\n prettyName\n highlighted\n }\n contractorTier\n jobStatus\n relevanceEncoded\n totalApplicants\n proposalsTier\n isLocal:local\n locations {\n city\n country\n }\n isApplied:applied\n attrs {\n id\n uid:id\n prettyName:prefLabel\n parentSkillId\n prefLabel\n highlighted\n freeText\n }\n hourlyBudget {\n type\n min\n max\n }\n clientRelation {\n companyRid\n companyName\n edcUserId\n lastContractPlatform\n lastContractRid\n lastContractTitle\n }\n totalFreelancersToHire\n contractToHire\n }\n paging {\n total\n count\n resultSetTs:resultSetTime\n }\n }\n }\n ","variables":{"queryParams":{"sinceId":"1015914410","jobTime":"1728208823055","paging":"0;20"}}}'
self.body = self.body.replace('\n','\\n')
self.con = sqlite3.connect('data.db')
self.cur = self.con.cursor()
self.cur.execute('CREATE TABLE IF NOT EXISTS proposal(title, description, _type, duration, salary, sinceID, jobT, UNIQUE(title, description, _type, duration, salary, sinceID, jobT))')
spider.logger.info("Spider opened: %s" % spider.name)
- Он отправляет первоначальный запрос.
- Получает изменение тела.
- Возвращает обратный вызов методу синтаксического анализа.
- ...
Вот обратная трассировка:
Код: Выделить всё
2024-10-08 11:12:30 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: feed)
2024-10-08 11:12:31 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.12.6 (tags/v3.12.6:a4a2d2b, Sep 6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Windows-11-10.0.22631-SP0
2024-10-08 11:12:31 [scrapy.addons] INFO: Enabled addons:
[]
2024-10-08 11:12:31 [asyncio] DEBUG: Using selector: SelectSelector
2024-10-08 11:12:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-10-08 11:12:31 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2024-10-08 11:12:31 [scrapy.extensions.telnet] INFO: Telnet Password: 443b0e3cce0e2a7f
2024-10-08 11:12:31 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2024-10-08 11:12:31 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'feed',
'CONCURRENT_REQUESTS': 1,
'DOWNLOAD_DELAY': 80,
'FEED_EXPORT_ENCODING': 'utf-8',
'NEWSPIDER_MODULE': 'feed.spiders',
'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
'SPIDER_MODULES': ['feed.spiders'],
'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2024-10-08 11:12:32 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'feed.middlewares.FeedDownloaderMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2024-10-08 11:12:32 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2024-10-08 11:12:32 [scrapy.middleware] INFO: Enabled item pipelines:
['feed.pipelines.FeedPipeline']
2024-10-08 11:12:32 [scrapy.core.engine] INFO: Spider opened
2024-10-08 11:12:32 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2024-10-08 11:12:32 [feed] INFO: Spider opened: feed
2024-10-08 11:12:32 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2024-10-08 11:12:32 [feed] INFO: ['this is the start request method']
2024-10-08 11:12:32 [feed] INFO: Returning unchanged request
2024-10-08 11:12:33 [feed] INFO: ['this is the start request method']
2024-10-08 11:12:33 [feed] INFO: Returning unchanged request
2024-10-08 11:12:34 [feed] INFO: ['this is the start request method']
2024-10-08 11:12:34 [feed] INFO: Returning unchanged request
2024-10-08 11:12:35 [feed] INFO: ['this is the start request method']
2024-10-08 11:12:35 [feed] INFO: Returning unchanged request
2024-10-08 11:12:36 [feed] INFO: ['this is the start request method']
2024-10-08 11:12:36 [feed] INFO: Returning unchanged request
Подробнее здесь: https://stackoverflow.com/questions/790 ... le-to-reac