BeautifulSoup: невозможно извлечь основной текст статьи и текст темы, несмотря на использование правильных имен классов.

BeautifulSoup: невозможно извлечь основной текст статьи и текст темы, несмотря на использование правильных имен классов. ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

BeautifulSoup: невозможно извлечь основной текст статьи и текст темы, несмотря на использование правильных имен классов.

Цитата

Сообщение Anonymous » 04 янв 2026, 14:51

Я пишу скрипт Python для сбора новостных статей с определенного веб-сайта. Хотя мне удалось извлечь теги заголовка, автора и категории (секторы, регионы, темы), я не могу захватить текст тела статьи. Он возвращает «Н/Д», хотя я определил классы в инспекторе браузера.
Что я пробовал:
Я пытался выбрать элемент div, используя class_="paywall" и class_="styles_leadArticleText__etnRf".
Я добавил User-Agent в свои заголовки, чтобы убедиться, что меня не заблокировали.
Я использовал verify=False, чтобы обойти проблемы с сертификатом SSL на моем локальном компьютере.
Проблема: переменная article_body постоянно возвращает None, из-за чего full_text получает значение «N/A». Я подозреваю, что на веб-сайте используются динамические классы или структура, по которой я неправильно перемещаюсь.
Ссылка на веб-сайт: https://renewablesnow.com/news/india-boosts-renewables-capacity-by-record-44-5-gw-in-2025-1287332/
Возникли проблемы с извлечением выделенного текста на прикрепленной фотографии.

Мой код:
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 4 12:25:27 2026

@author: hbasamh
"""

import requests
from bs4 import BeautifulSoup
import urllib3
import csv
import re

# This line hides the warning that you are connecting to an unverified site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _extract_tag_list(soup, label_pattern):
    """Return the tag names listed under the first label matching *label_pattern*.

    Finds the first text node matching the case-insensitive regex
    *label_pattern*, then the next tag container ``div`` after it, and
    collects the text of the ``<h6>`` inside each ``<a>`` of that container.
    Returns an empty list when the label or the container is missing.
    """
    label = soup.find(string=re.compile(label_pattern, re.I))
    if label is None:
        return []

    # Same exact class string for all three tag sections (sectors, regions, topics).
    container = label.find_next("div", class_="mt-0 mt-xl-2 d-flex flex-row flex-wrap")
    if container is None:
        return []

    headings = (a.find("h6") for a in container.find_all("a"))
    return [h6.get_text(strip=True) for h6 in headings if h6 is not None]


def scrape_renewables_article(url):
    """Download one article page and extract its main fields.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``Title``, ``Date_Time``, ``Author``, ``Content``,
        ``Sectors``, ``Regions``, ``Topics`` and ``URL``. Text fields that
        cannot be located are set to ``"N/A"``; tag lists default to ``[]``.
        Returns ``None`` (after printing the error) if the request or the
        parsing raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # It is kept because the script relies on it, but fixing the local
        # certificate store is the safer long-term option.
        # The timeout stops a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # 1. Title (look the <h1> up once instead of twice)
        title_element = soup.find('h1')
        title = title_element.get_text(strip=True) if title_element else "N/A"

        # 2 & 3. Date and time, read from the date <span>
        date_element = soup.find('span', class_="pe-0 pe-sm-2 border-end col-sm-6 text-right-center-sm text-14-sm")
        full_date = date_element.get_text(strip=True) if date_element else "N/A"
        # Example split: "Dec 29, 2025 | 5:24:20 PM" -> you can split if needed

        # 4. Author
        author_element = soup.find('span', class_="text-decoration-underline text-dark c-p ms-1")
        author = author_element.get_text(strip=True) if author_element else "N/A"

        # 5. Article text
        # A multi-word class_ string only matches when the element's class
        # attribute is exactly that string, so match on one stable class
        # instead. The CSS-module class carries a generated hash suffix,
        # hence the regex on its prefix only.
        # NOTE(review): if the body is injected by JavaScript after load it is
        # not present in the HTML that requests receives - confirm by checking
        # response.text for the article text.
        article_body = (
            soup.find('div', class_="paywall")
            or soup.find('div', class_=re.compile("styles_leadArticleText"))
        )
        if article_body is None:
            full_text = "N/A"
        else:
            paragraphs = [p.get_text(strip=True) for p in article_body.find_all('p')]
            # Fall back to the container's own text when it has no <p> children.
            full_text = (
                " ".join(paragraphs)
                or article_body.get_text(" ", strip=True)
                or "N/A"
            )

        # 6, 7, 8. Sectors, regions and topics share the same markup
        sectors = _extract_tag_list(soup, "Sector")
        print(f"Sectors: {sectors}")

        regions = _extract_tag_list(soup, "Region/Country")
        print(f"Regions: {regions}")

        topics = _extract_tag_list(soup, "Topics")
        print(f"Topics: {topics}")

        return {
            'Title': title,
            'Date_Time': full_date,
            'Author': author,
            'Content': full_text,
            'Sectors': sectors,
            'Regions': regions,
            'Topics': topics,
            'URL': url
        }

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"Error scraping {url}: {e}")
        return None

# --- Execution ---
# Full article URL; the shortened "india-bo ... 5-1287332/" form is not a valid address.
urls = [
    "https://renewablesnow.com/news/india-boosts-renewables-capacity-by-record-44-5-gw-in-2025-1287332/"
]

# Keep only the pages that were scraped successfully (failures return None).
data_list = []
for link in urls:
    result = scrape_renewables_article(link)
    if result:
        data_list.append(result)

# Save to CSV
if data_list:
    # Every result dict has the same keys, so the first one defines the header.
    keys = list(data_list[0])
    with open('renewables_data.csv', 'w', newline='', encoding='utf-8') as f:
        dict_writer = csv.DictWriter(f, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data_list)

    print("Data saved to renewables_data.csv")
else:
    # Without this guard data_list[0] raises IndexError when every scrape failed.
    print("No articles were scraped; renewables_data.csv was not written.")

Подробнее здесь: https://stackoverflow.com/questions/79860129/beautifulsoup-unable-to-extract-article-body-text-and-topics-text-despite-using

1767527480

Anonymous

Я пишу скрипт Python для сбора новостных статей с определенного веб-сайта. Хотя мне удалось извлечь теги заголовка, автора и категории (секторы, регионы, темы), я не могу захватить текст тела статьи. Он возвращает «Н/Д», хотя я определил классы в инспекторе браузера.
Что я пробовал:
Я пытался выбрать элемент div, используя class_="paywall" и class_="styles_leadArticleText__etnRf".
Я добавил User-Agent в свои заголовки, чтобы убедиться, что меня не заблокировали.
Я использовал verify=False, чтобы обойти проблемы с сертификатом SSL на моем локальном компьютере.
Проблема: переменная article_body постоянно возвращает None, из-за чего full_text получает значение «N/A». Я подозреваю, что на веб-сайте используются динамические классы или структура, по которой я неправильно перемещаюсь.
Ссылка на веб-сайт: https://renewablesnow.com/news/india-boosts-renewables-capacity-by-record-44-5-gw-in-2025-1287332/
Возникли проблемы с извлечением выделенного текста на прикрепленной фотографии.
[img]https://i.sstatic.net/xHADnXiI.png[/img]

Мой код:
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  4 12:25:27 2026

@author: hbasamh
"""

import requests
from bs4 import BeautifulSoup
import urllib3
import csv
import re

# This line hides the warning that you are connecting to an unverified site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _extract_tag_list(soup, label_pattern):
    """Return the tag names listed under the first label matching *label_pattern*.

    Finds the first text node matching the case-insensitive regex
    *label_pattern*, then the next tag container ``div`` after it, and
    collects the text of the ``<h6>`` inside each ``<a>`` of that container.
    Returns an empty list when the label or the container is missing.
    """
    label = soup.find(string=re.compile(label_pattern, re.I))
    if label is None:
        return []

    # Same exact class string for all three tag sections (sectors, regions, topics).
    container = label.find_next("div", class_="mt-0 mt-xl-2 d-flex flex-row flex-wrap")
    if container is None:
        return []

    headings = (a.find("h6") for a in container.find_all("a"))
    return [h6.get_text(strip=True) for h6 in headings if h6 is not None]


def scrape_renewables_article(url):
    """Download one article page and extract its main fields.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``Title``, ``Date_Time``, ``Author``, ``Content``,
        ``Sectors``, ``Regions``, ``Topics`` and ``URL``. Text fields that
        cannot be located are set to ``"N/A"``; tag lists default to ``[]``.
        Returns ``None`` (after printing the error) if the request or the
        parsing raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # It is kept because the script relies on it, but fixing the local
        # certificate store is the safer long-term option.
        # The timeout stops a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # 1. Title (look the <h1> up once instead of twice)
        title_element = soup.find('h1')
        title = title_element.get_text(strip=True) if title_element else "N/A"

        # 2 & 3. Date and time, read from the date <span>
        date_element = soup.find('span', class_="pe-0 pe-sm-2 border-end col-sm-6 text-right-center-sm text-14-sm")
        full_date = date_element.get_text(strip=True) if date_element else "N/A"
        # Example split: "Dec 29, 2025 | 5:24:20 PM" -> you can split if needed

        # 4. Author
        author_element = soup.find('span', class_="text-decoration-underline text-dark c-p ms-1")
        author = author_element.get_text(strip=True) if author_element else "N/A"

        # 5. Article text
        # A multi-word class_ string only matches when the element's class
        # attribute is exactly that string, so match on one stable class
        # instead. The CSS-module class carries a generated hash suffix,
        # hence the regex on its prefix only.
        # NOTE(review): if the body is injected by JavaScript after load it is
        # not present in the HTML that requests receives - confirm by checking
        # response.text for the article text.
        article_body = (
            soup.find('div', class_="paywall")
            or soup.find('div', class_=re.compile("styles_leadArticleText"))
        )
        if article_body is None:
            full_text = "N/A"
        else:
            paragraphs = [p.get_text(strip=True) for p in article_body.find_all('p')]
            # Fall back to the container's own text when it has no <p> children.
            full_text = (
                " ".join(paragraphs)
                or article_body.get_text(" ", strip=True)
                or "N/A"
            )

        # 6, 7, 8. Sectors, regions and topics share the same markup
        sectors = _extract_tag_list(soup, "Sector")
        print(f"Sectors: {sectors}")

        regions = _extract_tag_list(soup, "Region/Country")
        print(f"Regions: {regions}")

        topics = _extract_tag_list(soup, "Topics")
        print(f"Topics: {topics}")

        return {
            'Title': title,
            'Date_Time': full_date,
            'Author': author,
            'Content': full_text,
            'Sectors': sectors,
            'Regions': regions,
            'Topics': topics,
            'URL': url
        }

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"Error scraping {url}: {e}")
        return None

# --- Execution ---
urls = [
    "https://renewablesnow.com/news/india-boosts-renewables-capacity-by-record-44-5-gw-in-2025-1287332/"
]

# Keep only the pages that were scraped successfully (failures return None).
data_list = []
for link in urls:
    result = scrape_renewables_article(link)
    if result:
        data_list.append(result)

# Save to CSV
if data_list:
    # Every result dict has the same keys, so the first one defines the header.
    keys = list(data_list[0])
    with open('renewables_data.csv', 'w', newline='', encoding='utf-8') as f:
        dict_writer = csv.DictWriter(f, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data_list)

    print("Data saved to renewables_data.csv")
else:
    # Without this guard data_list[0] raises IndexError when every scrape failed.
    print("No articles were scraped; renewables_data.csv was not written.")

 

Подробнее здесь: [url]https://stackoverflow.com/questions/79860129/beautifulsoup-unable-to-extract-article-body-text-and-topics-text-despite-using[/url]