Код: Выделить всё
#!/usr/bin/env python3
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import json
from pathlib import Path
def scrape_urlhaus():
    """Open the URLhaus browse page in headless Chromium and return its HTML.

    Steps, in order: launch Chromium, create a 1366x768 context, install an
    init script that makes ``navigator.webdriver`` read as ``undefined``,
    load cookies from ``urlhaus_cookies.json`` (resolved against the current
    working directory), open https://urlhaus.abuse.ch/, write
    ``debug_screenshot.png``, click the "/browse/" navigation link and wait
    for the results table selector to match.

    Returns:
        The markup returned by ``page.content()`` after the table selector
        has matched.

    Raises:
        FileNotFoundError: if ``urlhaus_cookies.json`` does not exist.
        json.JSONDecodeError: if the cookie file is not valid JSON.
        Playwright errors (for example a selector wait timing out) propagate
        unchanged; the browser is still closed in that case.
    """
    browse_link = 'a.nav-link[href="/browse/"]'
    results_table = 'table.table.table-sm.table-hover.table-bordered'

    with sync_playwright() as p:
        # slow_mo delays every Playwright operation by 1200 ms.
        browser = p.chromium.launch(headless=True, slow_mo=1200)
        try:
            context = browser.new_context(
                viewport={'width': 1366, 'height': 768},
            )
            # Runs before any page script: hides the automation flag that
            # navigator.webdriver would otherwise expose.
            context.add_init_script(
                "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
            )
            # Explicit UTF-8 so the read does not depend on the locale's
            # default encoding.
            cookies = json.loads(
                Path("urlhaus_cookies.json").read_text(encoding="utf-8")
            )
            context.add_cookies(cookies)

            page = context.new_page()
            page.goto("https://urlhaus.abuse.ch/")
            # Taken right after navigation, before any wait, so that a later
            # failure still leaves a picture of what the site served.
            page.screenshot(path="debug_screenshot.png")

            page.wait_for_selector(browse_link)
            page.click(browse_link)
            page.wait_for_selector(results_table)

            # The original assigned this to a local and dropped it; return it
            # so the caller can actually parse the page.
            return page.content()
        finally:
            # Close the browser (and with it the context and page) even when
            # a navigation or wait step raises.
            browser.close()
Подробнее здесь: https://stackoverflow.com/questions/793 ... 405-banned