Я ищу способы действительно ускорить этот процесс, чтобы он мог обрабатывать страницы за считанные секунды, а не минуты. Я открыт для корректировок того, как используется Playwright, оптимизации логики скрапинга или даже изучения альтернативных библиотек или фреймворков, если они позволяют быстрее достичь тех же результатов.
Вот скрипт:
Код: Выделить всё
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import pandas as pd
from io import StringIO
import random
# Patch asyncio via nest_asyncio so that an event loop can be re-entered,
# i.e. asyncio.run() may be called while another loop is already running.
# NOTE(review): presumably needed because the script is launched from a
# notebook/IDE console that already owns a loop — confirm; a plain
# `python script.py` run should not require it.
nest_asyncio.apply()
# Base address of the spot-markets listing; the per-page query parameter is
# appended by the scraper. `items=100` presumably sets the page size.
URL = "https://www.coingecko.com/en/coins/1/markets/spot?items=100"

# User-Agent string handed to the browser context.
UA = (
    "Mozilla/5.0 (Linux; Android 10; K) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/130.0.0.0 Mobile Safari/537.3"
)

# Where the resulting CSV file is written.
OUTPUT_PATH = r"C:\Users\Gaming\Downloads\scraped_data_optimized.csv"
# Function to add random delay
async def random_delay(min_seconds: float = 0.5, max_seconds: float = 2.0) -> None:
    """Pause the current task for a random amount of time.

    The pause length is drawn uniformly between the two bounds. The defaults
    reproduce the previously hard-coded 0.5-2.0 second window, so existing
    callers that pass no arguments behave exactly as before.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    await asyncio.sleep(random.uniform(min_seconds, max_seconds))
# Function to scrape a single page
async def scrape_page(context, page_num):
    """Load one listing page and parse every HTML table found on it.

    Args:
        context: Playwright browser context used to open a new page.
        page_num: Page index appended to ``URL`` as the ``page`` query
            parameter.

    Returns:
        The list of DataFrames produced by ``pandas.read_html``, or ``None``
        when the page contains no table.

    Raises:
        Exception: Errors raised while navigating or reading the page are
            propagated to the caller; the page is closed first.
    """
    page_url = f"{URL}&page={page_num}"
    page = await context.new_page()
    try:
        # "commit" resolves as soon as the response starts arriving, so the
        # HTML read right afterwards can be truncated. Waiting for
        # DOMContentLoaded guarantees the document has been fully parsed.
        await page.goto(page_url, wait_until="domcontentloaded")
        html = await page.content()
    finally:
        # Always release the page, even when navigation fails; otherwise
        # every failed attempt leaves an open page in the context.
        await page.close()

    try:
        tables = pd.read_html(StringIO(html))
    except ValueError:
        # pandas raises ValueError when the document contains no tables.
        return None
    return tables or None
# Function to scrape a single page with retries
async def scrape_page_with_retries(context, page_num, retries=3):
    """Scrape one page, retrying when the attempt raises an exception.

    A page that loads but contains no tables is NOT retried: it is reported
    and ``None`` is returned immediately.

    Args:
        context: Playwright browser context forwarded to ``scrape_page``.
        page_num: Page index to scrape.
        retries: Maximum number of attempts.

    Returns:
        The list of tables returned by ``scrape_page``, or ``None`` when the
        page has no tables or every attempt failed.
    """
    for attempt in range(retries):
        try:
            result = await scrape_page(context, page_num)
        except Exception as e:
            print(f"Retry {attempt + 1} for page {page_num} due to error: {e}")
            # Back off only when another attempt will follow; sleeping after
            # the final failure just delays giving up.
            if attempt + 1 < retries:
                await random_delay()
        else:
            if result:
                return result  # Success
            print(f"No tables found on page {page_num}.")
            return None
    print(f"Failed to scrape page {page_num} after {retries} attempts.")
    return None
# Function to scrape multiple pages dynamically
async def scrape_all_pages(consecutive_empty_pages=3, concurrency=5):
    """Scrape listing pages until a run of empty pages is encountered.

    Pages are requested in batches of ``concurrency`` that run at the same
    time. Results are then evaluated in page order, so the collected data and
    the stop condition match a strictly sequential crawl; the only difference
    is that a few pages past the stopping point may be fetched and discarded.

    Args:
        consecutive_empty_pages: Number of empty pages in a row after which
            scraping stops.
        concurrency: Number of pages fetched simultaneously (values below 1
            are treated as 1).

    Returns:
        A single DataFrame concatenating every table found, or an empty
        DataFrame when nothing was scraped.
    """
    batch_size = max(1, concurrency)
    markets = []
    empty_page_count = 0
    next_page = 1
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(user_agent=UA, java_script_enabled=False)
            while empty_page_count < consecutive_empty_pages:
                page_nums = range(next_page, next_page + batch_size)
                next_page += batch_size
                # Launch the whole batch at once; the batch size itself caps
                # how many pages are in flight.
                results = await asyncio.gather(
                    *(scrape_page_with_retries(context, num) for num in page_nums)
                )
                for page_num, result in zip(page_nums, results):
                    if result:
                        markets.extend(result)
                        empty_page_count = 0
                        print(f"Page {page_num} scraped successfully.")
                    else:
                        empty_page_count += 1
                        print(f"Page {page_num} was empty.")
                        if empty_page_count >= consecutive_empty_pages:
                            # Ignore the rest of the batch so the output is
                            # the same as a page-by-page crawl.
                            break
        finally:
            # Close the browser even if scraping raised.
            await browser.close()
    return pd.concat(markets, ignore_index=True) if markets else pd.DataFrame()
# Main function to handle scraping and saving data
async def main_async():
    """Run the scrape and write the collected rows to ``OUTPUT_PATH``.

    Scraping stops after 3 consecutive empty pages and is invoked with a
    concurrency setting of 10. Nothing is written when no data came back.
    """
    df = await scrape_all_pages(consecutive_empty_pages=3, concurrency=10)
    if df.empty:
        print("No data was scraped.")
        return

    # Discard rows in which every column is missing before saving.
    cleaned = df.dropna(how="all")
    cleaned.to_csv(OUTPUT_PATH, index=False)
    print(f"Data saved to '{OUTPUT_PATH}'")
# Entry point for the script
def main():
    """Synchronous entry point: drive ``main_async`` to completion."""
    pipeline = main_async()
    asyncio.run(pipeline)


if __name__ == "__main__":
    main()
Подробнее здесь: https://stackoverflow.com/questions/793 ... s-possible
Мобильная версия