I'm getting the data into my CSV just fine, every column populated, but for many records the text comes out in Arabic or another language, even though the same record is shown in English on the website. The PDF link likewise resolves to a PDF in a different language even when an English version exists. How can I filter for English only? Otherwise I lose my data.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from http.client import RemoteDisconnected
BASE_URL = "https://iris.who.int"
PAGE_COUNT = 51 # Total pages
RPP = 100 # Results per page
# List of user-agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; rv:90.0) Gecko/20100101 Firefox/90.0",
]
session = requests.Session()
def get_with_retries(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            headers = {
                "User-Agent": random.choice(USER_AGENTS)
            }
            res = session.get(url, headers=headers, timeout=20)
            if res.status_code == 429:
                wait = 30 + random.randint(10, 30)
                print(f"[!] Rate limited (429), waiting {wait}s before retrying...")
                time.sleep(wait)
                continue
            res.raise_for_status()
            return res
        except RemoteDisconnected:
            wait = 10 + attempt * 10
            print(f"[!] RemoteDisconnected — retrying in {wait}s...")
            time.sleep(wait)
        except requests.exceptions.RequestException as e:
            print(f"[!] Attempt {attempt + 1}/{max_retries} failed for {url}: {e}")
            time.sleep(10 * (attempt + 1))  # linear backoff: 10s, 20s, 30s
    return None
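One idea I'm considering: if the site picks the language through HTTP content negotiation, sending an Accept-Language header with every request might keep both the pages and the PDF redirects on the English version. I haven't confirmed the server actually honors it; the change would go inside get_with_retries:

headers = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept-Language": "en-US,en;q=0.9",  # explicitly ask the server for English
}
res = session.get(url, headers=headers, timeout=20)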
def extract_details(item):
    # Check if the item's language attribute is English
    lang = item.get("lang", "en")
    if lang != "en":
        return None  # Skip non-English
    title_tag = item.find("h4", class_="artifact-title")
    title = title_tag.get_text(strip=True) if title_tag else "N/A"
    author_tag = item.find("span", class_="author")
    if author_tag and author_tag.get("lang") and author_tag["lang"] != "en":
        return None  # Skip if author explicitly marked non-English
    author = author_tag.get_text(strip=True) if author_tag else "N/A"
    date_tag = item.find("span", class_="govdoc-date")
    dates = date_tag.get_text(strip=True) if date_tag else "N/A"
    href = item.find("a", href=True)
    child_url = BASE_URL + href["href"] if href else "N/A"
    return title, author, dates, child_url
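Since the lang attribute may simply be missing from the list items (which would make the check above a no-op), another idea is to test the title text itself for non-Latin scripts. This is a rough, dependency-free sketch; the Unicode ranges are an assumption about which scripts actually appear, and Latin-script languages like French would still slip through:

def looks_english(text):
    # Heuristic: reject the record if any character falls in the Arabic,
    # Cyrillic, or CJK Unicode blocks. Not a real language detector:
    # Latin-script languages (French, Spanish, ...) still pass.
    for ch in text:
        if "\u0600" <= ch <= "\u06FF":  # Arabic
            return False
        if "\u0400" <= ch <= "\u04FF":  # Cyrillic
            return False
        if "\u4e00" <= ch <= "\u9fff":  # CJK Unified Ideographs
            return False
    return True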
def extract_pdf_link(child_url):
    response = get_with_retries(child_url)
    if not response:
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    # Prefer links that are clearly English
    pdf_links = soup.find_all("a", href=lambda x: x and ".pdf" in x)
    for link in pdf_links:
        href = link['href']
        text = link.get_text(strip=True).lower()
        if "english" in text or "en" in href:
            return BASE_URL + href
    # Fallback if no clear English link
    if pdf_links:
        return BASE_URL + pdf_links[0]['href']
    return None
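The "en" in href check above is very loose: it matches any URL that contains the letters "en" anywhere. IRIS bitstream filenames appear to end in "-eng.pdf" for English documents, so matching that suffix might be more reliable; the naming convention is an assumption on my part, not something I've verified for every record:

import re

ENG_PDF = re.compile(r"-eng\.pdf$", re.IGNORECASE)

def pick_english_pdf(pdf_links):
    # Return the first link whose filename ends in "-eng.pdf" (assumed
    # IRIS naming convention); return None so the caller can fall back.
    for link in pdf_links:
        href = link["href"].split("?")[0]  # drop any query string
        if ENG_PDF.search(href):
            return BASE_URL + href
    return None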
# Main scraping loop
data = []
for page_num in range(PAGE_COUNT):
    print(f"\n[*] Scraping page {page_num + 1}/{PAGE_COUNT}...")
    url = f"https://iris.who.int/discover?filtertyp ... ={page_num}"
    res = get_with_retries(url)
    if not res:
        print(f"[!] Skipping page {page_num + 1} due to repeated failures.")
        continue
    soup = BeautifulSoup(res.text, "html.parser")
    items = soup.select(".ds-artifact-item")
    for item in items:
        result = extract_details(item)
        if not result:
            continue  # Skip non-English
        title, author, dates, child_url = result
        pdf_link = extract_pdf_link(child_url) if child_url != "N/A" else "N/A"
        if pdf_link is None:
            continue  # Skip if no valid PDF found
        row = {
            "Title": title,
            "Author": author,
            "Dates": dates,
            "PDF_Link": pdf_link,
            "Child_Page": child_url
        }
        data.append(row)
        print(f"[+] Saved: {title}")
        time.sleep(random.uniform(2, 5))  # Random polite pause between items
    time.sleep(random.uniform(4, 8))  # Random pause between pages
# Save to file
df = pd.DataFrame(data)
df.to_csv("who_data_english_only.csv", index=False, encoding="utf-8-sig")
print(f"\nDone. Saved {len(df)} rows to who_data_english_only.csv")