Ошибка:
Код: Выделить всё
Scraping CH1...
Error fetching URL https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=54&papers[0]=7: ('Connection aborted.', HTTPException("Failed to execute 'send' on 'XMLHttpRequest': Failed to load 'https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=54&papers[0]=7'."))
Scraping CH2...
Error fetching URL https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=55&papers[0]=7: ('Connection aborted.', HTTPException("Failed to execute 'send' on 'XMLHttpRequest': Failed to load 'https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=55&papers[0]=7'."))
Scraping completed!
Код: Выделить всё
import re
import os
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
# Request headers that make the script look like a desktop browser.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.88 Safari/537.36"
    ),
}

# Topic name -> listing page URL for that topic.
topic_urls = {
    "CH1": "https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=54&papers[0]=7",
    "CH2": "https://www.exam-mate.com/topicalpastpapers?cat=3&sub=29&t[0]=55&papers[0]=7",
    # Add more topics as needed
}

# Site root; prepended to the relative image paths found on the pages.
BASE_URL = "https://www.exam-mate.com"
# Retry mechanism for downloading images
def download_image(image_url, save_path, retries=3):
    """Download ``image_url`` to ``save_path``, retrying on failure.

    Each attempt issues a GET request with the module-level ``headers`` and a
    10-second timeout. A response with status 200 is written to ``save_path``
    in binary mode; any other status, or a ``requests`` exception, counts as
    a failed attempt and is reported on stdout.

    Args:
        image_url: Absolute URL of the image to fetch.
        save_path: Destination file path; its directory must already exist.
        retries: Maximum number of attempts. With zero or a negative value
            no request is made and the download is reported as failed.

    Returns:
        True if the image was saved, False if every attempt failed.

    Raises:
        OSError: If ``save_path`` cannot be opened or written.
    """
    for attempt in range(retries):
        try:
            response = requests.get(image_url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error on attempt {attempt + 1} for {image_url}: {e}")
        else:
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(response.content)
                print(f"Downloaded: {save_path}")
                return True
            print(f"Failed to download {image_url}: Status {response.status_code}")
        # Random delay between retries; skipped after the last attempt,
        # where waiting would only slow the script down.
        if attempt + 1 < retries:
            sleep(randint(1, 3))
    print(f"Failed to download after {retries} attempts: {image_url}")
    return False
# Scrape each topic
# Path of the image embedded in a link's onclick handler. Compiled once
# because it is reused for every link on every page.
_IMAGE_PATH_RE = re.compile(r"/questions.*\.png")

for topic, url in topic_urls.items():
    print(f"Scraping {topic}...")
    # Create topic-specific folders
    os.makedirs(f"images/{topic}/questions", exist_ok=True)
    os.makedirs(f"images/{topic}/answers", exist_ok=True)
    try:
        # Get the webpage
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    # Extract image links
    for idx, tag in enumerate(soup.select("td:nth-of-type(1) a")):
        # A link without an onclick handler, or whose handler holds no image
        # path, is reported and skipped instead of raising inside the loop.
        match = _IMAGE_PATH_RE.search(tag.get("onclick", ""))
        if match is None:
            print(f"Error processing tag: {tag}, no image path found in onclick")
            continue
        full_url = BASE_URL + match.group()
        # Links are treated as alternating pairs: even index is a question,
        # odd index is its answer; both share the number idx // 2 + 1.
        if idx % 2 == 0:
            save_path = f"images/{topic}/questions/question_{idx // 2 + 1}.png"
        else:
            save_path = f"images/{topic}/answers/answer_{idx // 2 + 1}.png"
        try:
            # Download the image
            download_image(full_url, save_path)
        except Exception as e:
            # Deliberate best-effort boundary: one failing image (e.g. an
            # unwritable file) must not abort the remaining downloads.
            print(f"Error processing tag: {tag}, {e}")
print("Scraping completed!")
Я хочу скачать вопросы и ответы в виде PNG-изображений и сохранить их в отдельные папки (вопросы и ответы) для каждой темы.
Подробнее здесь: https://stackoverflow.com/questions/792 ... ing-python