Оптимизация запросов в моем коде Python с использованием асинхронности

Оптимизация запросов в моем коде Python с использованием асинхронности ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Оптимизация запросов в моем коде Python с использованием асинхронности

Цитата

Сообщение Anonymous » 02 дек 2024, 00:17

В настоящее время я пытаюсь оптимизировать написанный мной скрипт Python, который выполняет несколько HTTP-запросов с использованием библиотеки aiohttp. Я хочу воспользоваться преимуществами асинхронного программирования для одновременной обработки запросов, не блокируя выполнение остальной части программы. Я написал следующий код с целью повышения производительности за счет выполнения неблокирующих HTTP-запросов, но после запуска нового асинхронного кода я не наблюдаю каких-либо значительных улучшений производительности по сравнению с предыдущей синхронной версией скрипта.< /p>
Подозреваю, что я что-то упускаю или делаю неправильно. Я надеюсь понять, что может пойти не так с моей реализацией: проблема в том, как я структурирую асинхронный код, как я управляю параллелизмом или что-то еще, что может повлиять на производительностьimport traceback
import requests
import aiohttp
from bs4 import BeautifulSoup

# Helper function to get BeautifulSoup object
async def get_soup(session, url, headers, cookies):
async with session.get(url, headers=headers, cookies=cookies, timeout=30) as response:
if response.status == 200:
html_text = await response.text()
return BeautifulSoup(html_text, "html.parser")
else:
raise Exception(f"Failed to retrieve {url}, status code: {response.status}")

# Worker function for processing each page
async def process_page(session, site_index, start_page_num, end_page_num, new_listings):
site = sites[site_index]
cur_url = site['url']
site_headers = site['headers']
cassing = site['casings'][0]['cassingIdentity']
link_MD = site['casings'][0]['link']
sitename = site['pageTitle']
individual_Scraper_Metadata = site['individualCasings'][0]
COUNTER = site['URL_Counter']

# Prepare URL for first page
current_page_num = start_page_num
end_page_num *= COUNTER

async with aiohttp.ClientSession() as session:
try:
while True:
# Format the URL dynamically based on the page number
if cur_url.count('{}') > 1:
cur_url2 = cur_url.format(current_page_num-1, current_page_num-1)
else:
cur_url2 = cur_url.format(current_page_num)
print(f"Requesting URL: {cur_url2}")

# Retrieve and parse HTML
soup = await get_soup(session, cur_url2, site_headers, COOKIEJAR)

# Get the main listings from the page
raw_listings = soup.find_all(cassing["tag"], class_=cassing["classname"])
print(f"Found {len(raw_listings)} raw listings on page {current_page_num}")

if len(raw_listings) == 0:
print(f"No listings found at {cur_url2}")
break # Exit if no listings are found

# Extract data from each listing
for raw_listing in raw_listings:
page_listing = individualListingScraper(individual_Scraper_Metadata, raw_listing, link_MD)
page_listing['pageNo'] = current_page_num
page_listing['siteName'] = sitename
new_listings.append(page_listing)

# Terminate the loop based on the page number
if end_page_num != -1 and current_page_num >= end_page_num:
print(f"Reached last page {current_page_num}")
break
current_page_num += COUNTER

return finalReturn(sitename, new_listings)

except Exception as e:
print(f"Error processing page {current_page_num}: {str(e)}")
traceback.print_exc()

# Main function to coordinate scraping tasks
async def autoScraper(SITE_INDEX, START_PAGE_NUM=1, END_PAGE_NUM=-1):
global COOKIEJAR
new_listings = []

if SITE_INDEX >= len(sites):
return "site number not found"

site = sites[SITE_INDEX]
try:
# Set up cookies for subsequent requests
initial_response = await aiohttp.ClientSession().get("https://www.google.com/")
COOKIEJAR = initial_response.cookies

# Start scraping process
await process_page(aiohttp.ClientSession(), SITE_INDEX, START_PAGE_NUM, END_PAGE_NUM, new_listings)

return new_listings

except Exception as e:
print(f"Error in autoScraper: {str(e)}")
return new_listings

def individual_listing_scraper(individual_metadata, raw_listing_html, link_metadata, site_headers, cookie_jar, timeout_sec):
try:
# Extract the listing URL
listing_url_tag = raw_listing_html.find(link_metadata['tag'], class_=link_metadata['classname'])
if not listing_url_tag:
print("Error: Listing URL not found in the raw HTML.")
return {}

url = listing_url_tag.get(link_metadata['attrib'])
if not url:
print("Error: URL attribute not found in the tag.")
return {}

url = link_metadata['baseurlLink'] + url

# Fetch the HTML of the individual listing page
response = requests.get(url, headers=site_headers, cookies=cookie_jar, timeout=timeout_sec, verify=False)
response.raise_for_status() # Raise an error for bad HTTP responses
individual_page_html = BeautifulSoup(response.text, 'html.parser')

# Extract the listing data based on metadata
tag = individual_metadata['cassingIdentity']['tag']
classname = individual_metadata['cassingIdentity']['classname']
_id = individual_metadata['cassingIdentity']['id']
index = individual_metadata['cassingIdentity']['index']

listing_raw_html = individual_page_html.find_all(tag, class_=classname, id=_id)
if not listing_raw_html or len(listing_raw_html)

Подробнее здесь: https://stackoverflow.com/questions/792 ... chronicity

1733087860

Anonymous

В настоящее время я пытаюсь оптимизировать написанный мной скрипт Python, который выполняет несколько HTTP-запросов с использованием библиотеки aiohttp. Я хочу воспользоваться преимуществами асинхронного программирования для одновременной обработки запросов, не блокируя выполнение остальной части программы. Я написал следующий код с целью повышения производительности за счет выполнения неблокирующих HTTP-запросов, но после запуска нового асинхронного кода я не наблюдаю каких-либо значительных улучшений производительности по сравнению с предыдущей синхронной версией скрипта.< /p>
Подозреваю, что я что-то упускаю или делаю неправильно.  Я надеюсь понять, что может пойти не так с моей реализацией: проблема в том, как я структурирую асинхронный код, как я управляю параллелизмом или что-то еще, что может повлиять на производительностьimport traceback
import requests
import aiohttp
from bs4 import BeautifulSoup

# Helper function to get BeautifulSoup object
async def get_soup(session, url, headers, cookies):
async with session.get(url, headers=headers, cookies=cookies, timeout=30) as response:
if response.status == 200:
html_text = await response.text()
return BeautifulSoup(html_text, "html.parser")
else:
raise Exception(f"Failed to retrieve {url}, status code: {response.status}")

# Worker function for processing each page
async def process_page(session, site_index, start_page_num, end_page_num, new_listings):
site = sites[site_index]
cur_url = site['url']
site_headers = site['headers']
cassing = site['casings'][0]['cassingIdentity']
link_MD = site['casings'][0]['link']
sitename = site['pageTitle']
individual_Scraper_Metadata = site['individualCasings'][0]
COUNTER = site['URL_Counter']

# Prepare URL for first page
current_page_num = start_page_num
end_page_num *= COUNTER

async with aiohttp.ClientSession() as session:
try:
while True:
# Format the URL dynamically based on the page number
if cur_url.count('{}') >  1:
cur_url2 = cur_url.format(current_page_num-1, current_page_num-1)
else:
cur_url2 = cur_url.format(current_page_num)
print(f"Requesting URL: {cur_url2}")

# Retrieve and parse HTML
soup = await get_soup(session, cur_url2, site_headers, COOKIEJAR)

# Get the main listings from the page
raw_listings = soup.find_all(cassing["tag"], class_=cassing["classname"])
print(f"Found {len(raw_listings)} raw listings on page {current_page_num}")

if len(raw_listings) == 0:
print(f"No listings found at {cur_url2}")
break  # Exit if no listings are found

# Extract data from each listing
for raw_listing in raw_listings:
page_listing = individualListingScraper(individual_Scraper_Metadata, raw_listing, link_MD)
page_listing['pageNo'] = current_page_num
page_listing['siteName'] = sitename
new_listings.append(page_listing)

# Terminate the loop based on the page number
if end_page_num != -1 and current_page_num >= end_page_num:
print(f"Reached last page {current_page_num}")
break
current_page_num += COUNTER

return finalReturn(sitename, new_listings)

except Exception as e:
print(f"Error processing page {current_page_num}: {str(e)}")
traceback.print_exc()

# Main function to coordinate scraping tasks
async def autoScraper(SITE_INDEX, START_PAGE_NUM=1, END_PAGE_NUM=-1):
global COOKIEJAR
new_listings = []

if SITE_INDEX >= len(sites):
return "site number not found"

site = sites[SITE_INDEX]
try:
# Set up cookies for subsequent requests
initial_response = await aiohttp.ClientSession().get("https://www.google.com/")
COOKIEJAR = initial_response.cookies

# Start scraping process
await process_page(aiohttp.ClientSession(), SITE_INDEX, START_PAGE_NUM, END_PAGE_NUM, new_listings)

return new_listings

except Exception as e:
print(f"Error in autoScraper: {str(e)}")
return new_listings

def individual_listing_scraper(individual_metadata, raw_listing_html, link_metadata, site_headers, cookie_jar, timeout_sec):
try:
# Extract the listing URL
listing_url_tag = raw_listing_html.find(link_metadata['tag'], class_=link_metadata['classname'])
if not listing_url_tag:
print("Error: Listing URL not found in the raw HTML.")
return {}

url = listing_url_tag.get(link_metadata['attrib'])
if not url:
print("Error: URL attribute not found in the tag.")
return {}

url = link_metadata['baseurlLink'] + url

# Fetch the HTML of the individual listing page
response = requests.get(url, headers=site_headers, cookies=cookie_jar, timeout=timeout_sec, verify=False)
response.raise_for_status()  # Raise an error for bad HTTP responses
individual_page_html = BeautifulSoup(response.text, 'html.parser')

# Extract the listing data based on metadata
tag = individual_metadata['cassingIdentity']['tag']
classname = individual_metadata['cassingIdentity']['classname']
_id = individual_metadata['cassingIdentity']['id']
index = individual_metadata['cassingIdentity']['index']

listing_raw_html = individual_page_html.find_all(tag, class_=classname, id=_id)
if not listing_raw_html or len(listing_raw_html) 

Подробнее здесь: [url]https://stackoverflow.com/questions/79242476/optimising-requests-in-my-python-code-using-asynchronicity[/url]