код здесь
Код: Выделить всё
import os
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime
# Function to create directories if needed
def create_directories(base_path='/home/housnimzeali'):
    """Create the scraper's working directories under ``base_path``.

    The sub-directories ``scraped_data``, ``logs`` and ``scraped_files`` are
    created if missing; a status line is printed for each one.

    Args:
        base_path: Parent directory for the working directories. Defaults to
            the location the script was originally written for.
    """
    directories = ['scraped_data', 'logs', 'scraped_files']
    for directory in directories:
        directory_path = os.path.join(base_path, directory)
        # Attempt the creation and react to the outcome instead of testing
        # for existence first: a check-then-create sequence can fail if
        # another process creates the directory in between.
        try:
            os.makedirs(directory_path)
        except FileExistsError:
            print(f"Directory already exists: {directory_path}")
        else:
            print(f"Directory created: {directory_path}")
# Function for introducing a random delay
def random_sleep(min_seconds=1, max_seconds=3):
    """Pause for a random duration drawn uniformly between the two bounds.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    delay = random.uniform(min_seconds, max_seconds)
    # time.sleep() raises ValueError for negative durations; clamp so that a
    # bad bound degrades to "no pause" instead of aborting the whole scrape.
    time.sleep(max(0.0, delay))
# Function to load scraped URLs
def load_scraped_urls(file_path):
    """Return the set of URLs stored in the ``URL`` column of an Excel file.

    Loading is best-effort: a missing file, an unreadable file or a missing
    ``URL`` column all yield an empty set (the error is printed), so a first
    run or a damaged registry never stops the scraper.

    Args:
        file_path: Path of the Excel file written by ``save_scraped_urls``.

    Returns:
        A ``set`` with the URLs found, or an empty set.
    """
    try:
        # os.path.isfile is used instead of pd.io.common.file_exists, which
        # is an internal pandas helper and not part of its public API.
        if not os.path.isfile(file_path):
            return set()
        df = pd.read_excel(file_path)
        # Blank cells are read back as NaN; they are not URLs.
        return set(df['URL'].dropna().tolist())
    except Exception as e:
        print(f"Error loading scraped URLs: {e}")
        return set()
# Function to save scraped URLs
def save_scraped_urls(scraped_urls, file_path):
    """Write the scraped URLs to an Excel file with a single ``URL`` column.

    Saving is best-effort: any failure is printed and swallowed so that it
    cannot mask the outcome of the scraping run itself.

    Args:
        scraped_urls: Iterable (normally a set) of URL strings.
        file_path: Destination ``.xlsx`` path; an existing file is replaced.
    """
    try:
        # Sets iterate in arbitrary order; sorting keeps the file content
        # stable from run to run, which makes it diffable and reproducible.
        df = pd.DataFrame(sorted(scraped_urls), columns=['URL'])
        df.to_excel(file_path, index=False)
    except Exception as e:
        print(f"Error saving scraped URLs: {e}")
# Function to extract content, tags, and date from an article
def scrap_contents_tags_date(article_url, headers, timeout=30):
    """Fetch one article page and extract its text, tags and date.

    Args:
        article_url: Absolute URL of the article.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would hang a scheduled run forever.

    Returns:
        A ``(content, tags, date)`` tuple. ``content`` and ``date`` are
        strings (``'No Content'`` / ``'No Date'`` when absent) and ``tags``
        is a list of strings. On any request failure, or when the page has
        no ``main#main`` element, ``('No Content', [], 'No Date')`` is
        returned.
    """
    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return 'No Content', [], 'No Date'

    soup = BeautifulSoup(response.text, 'html.parser')
    main_section = soup.select_one('main#main')
    if main_section is None:
        # Previously the tags lookup dereferenced None here and raised an
        # AttributeError that no handler caught.
        return 'No Content', [], 'No Date'

    content = 'No Content'
    date = 'No Date'
    content_div = main_section.select_one('.content')
    if content_div is not None:
        # Take the text of the container once. Joining the text of every
        # descendant element repeats nested text (a parent's text already
        # includes its children's) and glues adjacent words together.
        content = content_div.get_text(separator=' ', strip=True)
        date_elem = content_div.select_one('span.article-meta-date')
        if date_elem is not None:
            date = date_elem.get_text(strip=True)

    tags = []
    tags_section = main_section.select_one('.outside-content-zone.below-content')
    tags_div = tags_section.select_one('p.tags') if tags_section is not None else None
    if tags_div is not None:
        tags = [a.get_text(strip=True) for a in tags_div.find_all('a')]

    return content, tags, date
# Function to export articles to an Excel file
def export_to_excel(articles, filename):
    """Write a list of article records to an Excel file.

    The export is best-effort: any failure is printed and swallowed.

    Args:
        articles: List of dicts, one per article; keys become the columns.
        filename: Destination ``.xlsx`` path. Its parent directory is
            created when missing.
    """
    try:
        # Create the target directory on demand so that a fresh machine (or
        # a cleaned-up data folder) does not silently lose every export.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df = pd.DataFrame(articles)
        df.to_excel(filename, index=False)
    except Exception as e:
        print(f"Error writing to file {filename}: {e}")
# Function to log processed files
def log_files(file_log, log_dir='/home/housnimzeali/logs'):
    """Write the names of the files produced by this run to a dated log.

    The log is named ``scraped_files_log_<YYYY-MM-DD>.txt`` and holds one
    file name per line. An existing log for the same day is overwritten.

    Args:
        file_log: Iterable of file names (paths) to record.
        log_dir: Directory that receives the log file; created when missing.
            Defaults to the location the script was originally written for.
    """
    today = datetime.date.today()
    # The directory may not exist yet on a first run.
    os.makedirs(log_dir, exist_ok=True)
    log_filename = os.path.join(log_dir, f"scraped_files_log_{today}.txt")
    # Explicit encoding: the platform default differs between machines.
    with open(log_filename, 'w', encoding='utf-8') as log_file:
        log_file.writelines(f"{file_name}\n" for file_name in file_log)
# Main scraping function
def _find_next_page_href(soup):
    """Return the href of the first pager link whose text contains "Next", or None."""
    # Filter in Python rather than with the ':contains()' CSS pseudo-class,
    # which is deprecated in Soup Sieve in favour of ':-soup-contains()'.
    for anchor in soup.select('div.site-paging a'):
        if 'Next' in anchor.get_text():
            return anchor.get('href')
    return None


def scrape_article(url, file_log, scraped_urls, timeout=30):
    """Walk the paginated article listing and export each page to Excel.

    For every listing page, articles whose URL is not yet in
    ``scraped_urls`` are fetched with ``scrap_contents_tags_date`` and the
    page's new articles are written to one Excel file via
    ``export_to_excel``. The walk follows the pager's "Next" link until
    there is none.

    Args:
        url: URL of the first listing page.
        file_log: List that receives the name of every file written
            (mutated in place).
        scraped_urls: Set of already processed article URLs; newly scraped
            URLs are added to it (mutated in place).
        timeout: Seconds to wait for each listing-page request.

    Returns:
        The list of article dicts not yet exported. Because each page is
        exported and cleared as soon as it is processed, this is normally
        empty; it holds the partial page only when a request error
        interrupts the run.
    """
    article_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    # Stamp the output files with the run's start time. Without it, every
    # run reuses medical_news_1.xlsx, medical_news_2.xlsx, ... and silently
    # overwrites the articles exported by earlier runs.
    run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    # Remember visited listing pages so a pager that points back to a page
    # already processed cannot trap the scraper in an endless loop.
    visited_pages = set()
    page_count = 0
    try:
        while True:
            visited_pages.add(url)
            random_sleep(min_seconds=2, max_seconds=5)
            response = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP error statuses as failures instead of parsing the
            # error page as if it were a listing.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            container = soup.select_one('main#main')
            if not container:
                print(f"No content found for URL: {url}")
                return article_data
            article_section = container.select_one('div.posts.publishables-list-wrap.first-item-larger')
            if not article_section:
                print(f"No articles found in section for URL: {url}")
                return article_data

            for article in article_section.select('.row'):
                title_elem = article.select_one('h3 a')
                # .get() tolerates an anchor without href (indexing raised KeyError).
                article_link = title_elem.get('href') if title_elem else None
                if not article_link:
                    continue
                full_article_link = urljoin(url, article_link)
                if full_article_link in scraped_urls:
                    continue

                title = title_elem.get_text(strip=True)
                summary_div = article.select_one('p.hidden-xs.item-desc')
                summary = summary_div.get_text(strip=True) if summary_div else 'No Summary'
                content, tags, date = scrap_contents_tags_date(full_article_link, headers)
                article_data.append({
                    'Title': title,
                    'Date': date,
                    'Summary': summary,
                    'Content': content,
                    'Tags': ', '.join(tags) if tags else 'No Tags',
                    'Source': 'Medical_news',
                    'URL': full_article_link
                })
                scraped_urls.add(full_article_link)

            # Persist this page's new articles right away so that a later
            # failure does not lose them.
            if article_data:
                file_name = f'/home/housnimzeali/scraped_data/medical_news_{run_stamp}_{page_count + 1}.xlsx'
                export_to_excel(article_data, file_name)
                file_log.append(file_name)
                article_data.clear()

            # Follow the "Next" link of the pager ("Previous" is ignored).
            next_href = _find_next_page_href(soup)
            if not next_href:
                print("No next page, scraping finished.")
                break
            next_url = urljoin(url, next_href)
            if next_url in visited_pages:
                print(f"Next page already visited, scraping finished: {next_url}")
                break
            url = next_url
            page_count += 1
    except requests.exceptions.RequestException as e:
        print(f"Error during scraping: {e}")
        return article_data
    return article_data
# Main function
def main():
    """Run one scraping session and persist its bookkeeping.

    Creates the working directories, loads the registry of already scraped
    URLs, scrapes the listing, then writes the log of produced files and the
    updated URL registry.
    """
    scraped_urls_path = '/home/housnimzeali/scraped_urls.xlsx'
    file_log = []  # Names of the files written during this run

    # The export and log functions write into these directories, so they
    # must exist before scraping starts.
    create_directories()

    scraped_urls = load_scraped_urls(scraped_urls_path)  # Load already scraped URLs
    url = "https://www.news-medical.net/medical/news"  # Example URL to scrape
    try:
        scrape_article(url, file_log, scraped_urls)
        print("Scraping complete.")
    finally:
        # Always persist the bookkeeping, even when scraping raised: the
        # articles exported so far are on disk, and forgetting their URLs
        # would make the next run scrape them again as duplicates.
        log_files(file_log)
        save_scraped_urls(scraped_urls, scraped_urls_path)
# Call the main function
# Run the scraper only when executed as a script (e.g. by cron or from a
# notebook cell), not when the module is imported by other code.
if __name__ == "__main__":
    main()
**
Я написал скрипт для веб-скрейпинга, который собирает данные и экспортирует их в файлы Excel.
Я хочу запланировать его для ежедневного выполнения.
Мой скрипт использует такие библиотеки, как requests, BeautifulSoup и pandas.
Я также хочу, чтобы скрипт правильно обрабатывал повторяющиеся записи, проверяя и сохраняя уже обработанные URL-адреса.
**Мои вопросы:**
Как лучше всего автоматизировать ежедневное выполнение этого сценария в среде JupyterHub и VSCode?
Должен ли я использовать такой инструмент, как cron, на сервере, где размещён JupyterHub, или существуют специальные решения для Jupyter/VSCode?
Какие практики вы бы порекомендовали для обеспечения надёжного запуска скрипта по расписанию и обработки ошибок или прерываний?
Как оптимизировать обработку дубликатов при последовательных запусках скрейпинга, чтобы избежать ненужного повторения данных?
Спасибо за советы и помощь.
Подробнее здесь: https://stackoverflow.com/questions/791 ... rhub-and-v
Мобильная версия