Как автоматизировать ежедневное выполнение скрипта веб-скрапинга на JupyterHub и VSCode? - Цифровое Кемерово

Как автоматизировать ежедневное выполнение скрипта веб-скрапинга на JupyterHub и VSCode? ⇐ Python

Ответить

1 сообщение • Страница 1 из 1

Anonymous

Как автоматизировать ежедневное выполнение скрипта веб-скрапинга на JupyterHub и VSCode?

Цитата

Сообщение Anonymous » 11 ноя 2024, 06:03

Я относительно новичок в веб-скрапинге и в настоящее время работаю над сценарием Python, который запускаю в среде JupyterHub, интегрированной с VSCode. Моя цель — оптимизировать этот скрипт, чтобы он автоматически запускался каждый день для сбора данных, избегая при этом дублирования записей.
код здесь

Код: Выделить всё

import os
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime

# Function to create directories if needed
def create_directories(base_path='/home/housnimzeali',
                       directories=('scraped_data', 'logs', 'scraped_files')):
    """Create the working directories used by the scraper.

    Args:
        base_path: Directory under which the sub-directories are created.
        directories: Names of the sub-directories to create.

    Returns:
        The full paths of all requested directories, in the given order.
    """
    directory_paths = []
    for directory in directories:
        directory_path = os.path.join(base_path, directory)
        already_there = os.path.isdir(directory_path)
        # exist_ok=True avoids the check-then-create race of the
        # "if not exists: makedirs" pattern (e.g. two overlapping scheduled runs).
        os.makedirs(directory_path, exist_ok=True)
        if already_there:
            print(f"Directory already exists: {directory_path}")
        else:
            print(f"Directory created: {directory_path}")
        directory_paths.append(directory_path)
    return directory_paths

# Function for introducing a random delay
def random_sleep(min_seconds=1, max_seconds=3):
    """Pause for a random duration to avoid hammering the target site.

    Args:
        min_seconds: Lower bound of the delay, in seconds.
        max_seconds: Upper bound of the delay, in seconds.

    Returns:
        The number of seconds actually slept.
    """
    # time.sleep() raises ValueError for negative values, so clamp at zero.
    delay = max(0.0, random.uniform(min_seconds, max_seconds))
    time.sleep(delay)
    return delay

# Function to load scraped URLs
def load_scraped_urls(file_path):
    """Load the set of already-scraped URLs from an Excel file.

    Args:
        file_path: Path to a workbook with a 'URL' column.

    Returns:
        A set of URL strings; an empty set when the file does not exist
        or cannot be read.
    """
    # os.path.exists replaces pd.io.common.file_exists, which is not part of
    # the public pandas API.
    if not os.path.exists(file_path):
        return set()
    try:
        df = pd.read_excel(file_path)
        # Drop empty cells so NaN never ends up in the URL set.
        return set(df['URL'].dropna().astype(str))
    except Exception as e:
        # Best effort: an unreadable state file must not abort the whole run.
        print(f"Error loading scraped URLs: {e}")
        return set()

# Function to save scraped URLs
def save_scraped_urls(scraped_urls, file_path):
    """Persist the set of scraped URLs to an Excel file.

    The workbook is written to a temporary sibling file and then moved over
    the target, so an interrupted run cannot leave a truncated state file.

    Args:
        scraped_urls: Iterable of URL strings.
        file_path: Destination path of the workbook.
    """
    # Keep the real extension last so pandas can still pick the Excel engine.
    root, ext = os.path.splitext(file_path)
    tmp_path = f"{root}.tmp{ext}"
    try:
        # Sorted output keeps the file stable between runs.
        urls = sorted(str(u) for u in scraped_urls if pd.notna(u))
        pd.DataFrame(urls, columns=['URL']).to_excel(tmp_path, index=False)
        os.replace(tmp_path, file_path)
    except Exception as e:
        # Best effort: report the problem but do not crash the caller.
        print(f"Error saving scraped URLs: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or cleanup is impossible

# Function to extract content, tags, and date from an article
def scrap_contents_tags_date(article_url, headers, timeout=30):
    """Download one article page and extract its text, tags and date.

    Args:
        article_url: Absolute URL of the article.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(content, tags, date)``. When the request fails or the
        expected elements are missing, the placeholders 'No Content', ``[]``
        and 'No Date' are used.
    """
    try:
        # Without a timeout a stalled connection would hang a scheduled job forever.
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching article {article_url}: {e}")
        return 'No Content', [], 'No Date'

    soup = BeautifulSoup(response.text, 'html.parser')
    main_section = soup.select_one('main#main')
    if main_section is None:
        # Previously this case raised AttributeError when looking up the tags.
        return 'No Content', [], 'No Date'

    content = 'No Content'
    date = 'No Date'
    content_div = main_section.select_one('.content')
    if content_div is not None:
        # get_text on the container yields each text node once; joining the
        # text of every descendant repeated the text of nested elements.
        content = content_div.get_text(' ', strip=True)
        date_div = content_div.select_one('span.article-meta-date')
        if date_div is not None:
            date = date_div.get_text(strip=True)

    tags = []
    tags_section = main_section.select_one('.outside-content-zone.below-content')
    tags_div = tags_section.select_one('p.tags') if tags_section else None
    if tags_div is not None:
        tags = [a.get_text(strip=True) for a in tags_div.find_all('a')]

    return content, tags, date

# Function to export articles to an Excel file
def export_to_excel(articles, filename):
    """Write a list of article records to an Excel workbook.

    Args:
        articles: List of dicts, one per article.
        filename: Destination path of the workbook.
    """
    try:
        # Make sure the target directory exists; otherwise the write fails
        # and the scraped page is silently lost.
        if isinstance(filename, (str, os.PathLike)):
            parent = os.path.dirname(filename)
            if parent:
                os.makedirs(parent, exist_ok=True)
        pd.DataFrame(articles).to_excel(filename, index=False)
    except Exception as e:
        # Best effort: report the problem but keep scraping.
        print(f"Error writing to file {filename}: {e}")

# Function to log processed files
def log_files(file_log, log_dir='/home/housnimzeali/logs'):
    """Append the names of the files produced by this run to today's log.

    Args:
        file_log: Iterable of file names written during the run.
        log_dir: Directory that holds the daily log files.

    Returns:
        The path of the log file that was written.
    """
    today = datetime.date.today()
    os.makedirs(log_dir, exist_ok=True)
    log_filename = os.path.join(log_dir, f"scraped_files_log_{today}.txt")
    # Append instead of overwrite so a second run on the same day (for
    # example a retry after a failure) does not erase the earlier entries.
    with open(log_filename, 'a', encoding='utf-8') as log_file:
        log_file.writelines(f"{file_name}\n" for file_name in file_log)
    return log_filename

# Main scraping function
def scrape_article(url, file_log, scraped_urls,
                   output_dir='/home/housnimzeali/scraped_data',
                   timeout=30, max_pages=None):
    """Walk the paginated article listing and export every new article.

    Each listing page that yields new articles is written to its own Excel
    file. The file name carries a per-run timestamp, so successive (daily)
    runs no longer overwrite each other's output.

    Args:
        url: URL of the first listing page.
        file_log: List that receives the name of every file written.
        scraped_urls: Set of article URLs already scraped; updated in place.
        output_dir: Directory for the exported Excel files.
        timeout: Seconds to wait for each HTTP response.
        max_pages: Optional cap on the number of listing pages to visit.

    Returns:
        The list of articles collected but not yet exported. This is empty
        when every processed page was exported.
    """
    article_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64;  x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    visited_pages = set()  # guards against a "Next" link that loops back
    page_number = 0
    try:
        while True:
            visited_pages.add(url)
            page_number += 1
            random_sleep(min_seconds=2, max_seconds=5)
            response = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP errors as failures rather than parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            container = soup.select_one('main#main')
            if container is None:
                print(f"No content found for URL: {url}")
                return article_data

            article_section = container.select_one('div.posts.publishables-list-wrap.first-item-larger')
            if article_section is None:
                print(f"No articles found in section for URL: {url}")
                return article_data

            for article in article_section.select('.row'):
                title_elem = article.select_one('h3 a')
                article_link = title_elem.get('href') if title_elem else None
                if not article_link:
                    continue

                full_article_link = urljoin(url, article_link)
                if full_article_link in scraped_urls:
                    continue  # already collected by an earlier run

                summary_div = article.select_one('p.hidden-xs.item-desc')
                content, tags, date = scrap_contents_tags_date(full_article_link, headers)
                if content == 'No Content' and date == 'No Date' and not tags:
                    # Nothing could be extracted (typically a failed download).
                    # Leave the URL unmarked so the next run retries it.
                    continue

                article_data.append({
                    'Title': title_elem.get_text(strip=True),
                    'Date': date,
                    'Summary': summary_div.get_text(strip=True) if summary_div else 'No Summary',
                    'Content': content,
                    'Tags': ', '.join(tags) if tags else 'No Tags',
                    'Source': 'Medical_news',
                    'URL': full_article_link
                })
                scraped_urls.add(full_article_link)

            # Save extracted articles to a file
            if article_data:
                file_name = os.path.join(
                    output_dir, f'medical_news_{run_stamp}_{page_number}.xlsx')
                export_to_excel(article_data, file_name)
                file_log.append(file_name)
                article_data.clear()

            if max_pages is not None and page_number >= max_pages:
                print("Page limit reached, scraping finished.")
                break

            # Find the link to the next page (ignore "Previous"). The text is
            # matched in Python because the ':contains' selector is deprecated.
            next_page = next(
                (a for a in soup.select('div.site-paging a')
                 if a.has_attr('href') and 'Next' in a.get_text()),
                None,
            )
            next_url = urljoin(url, next_page['href']) if next_page else None
            if not next_url or next_url in visited_pages:
                print("No next page, scraping finished.")
                break
            url = next_url

    except requests.exceptions.RequestException as e:
        print(f"Error during scraping: {e}")
        return article_data

    return article_data

# Main function
def main():
    """Run one complete scraping pass and persist its state."""
    file_log = []  # List to track files
    urls_file = '/home/housnimzeali/scraped_urls.xlsx'

    # Make sure the output and log directories exist before anything is written.
    create_directories()
    scraped_urls = load_scraped_urls(urls_file)  # Load already scraped URLs
    url = "https://www.news-medical.net/medical/news"  # Example URL to scrape
    try:
        scrape_article(url, file_log, scraped_urls)
        print("Scraping complete.")
    finally:
        # Save the log and the scraped URLs even if scraping was interrupted;
        # otherwise the next run would collect the same articles again.
        log_files(file_log)
        save_scraped_urls(scraped_urls, urls_file)

# Call the main function
# Run only when executed as a script (e.g. by cron), not when imported.
if __name__ == "__main__":
    main()

**Контекст:**
Я настроил скрипт веб-скрапинга, который собирает данные и экспортирует их в файлы Excel.
Я хочу запланировать его ежедневное выполнение.
Мой скрипт использует такие библиотеки, как requests, BeautifulSoup и pandas.
Я также хочу, чтобы скрипт правильно обрабатывал повторяющиеся записи, проверяя и сохраняя уже обработанные URL-адреса.
**Мои вопросы:**
Как лучше всего автоматизировать ежедневное выполнение этого скрипта в среде JupyterHub и VSCode?
Должен ли я использовать такой инструмент, как cron, на сервере, где размещен JupyterHub, или существуют специальные решения для Jupyter/VSCode?
Какие практики вы бы порекомендовали для обеспечения надежного запуска скрипта по расписанию и обработки ошибок или прерываний?
Как оптимизировать управление дубликатами при последовательных запусках скрапинга, чтобы избежать ненужного повторения данных?
Спасибо за советы и помощь.

Подробнее здесь: https://stackoverflow.com/questions/791 ... rhub-and-v

1731294215

Anonymous

Я относительно новичок в веб-скрапинге и в настоящее время работаю над сценарием Python, который запускаю в среде JupyterHub, интегрированной с VSCode.  Моя цель — оптимизировать этот скрипт, чтобы он автоматически запускался каждый день для сбора данных, избегая при этом дублирования записей.
код здесь
[code]import os
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime

# Function to create directories if needed
def create_directories(base_path='/home/housnimzeali',
                       directories=('scraped_data', 'logs', 'scraped_files')):
    """Create the working directories used by the scraper.

    Args:
        base_path: Directory under which the sub-directories are created.
        directories: Names of the sub-directories to create.

    Returns:
        The full paths of all requested directories, in the given order.
    """
    directory_paths = []
    for directory in directories:
        directory_path = os.path.join(base_path, directory)
        already_there = os.path.isdir(directory_path)
        # exist_ok=True avoids the check-then-create race of the
        # "if not exists: makedirs" pattern (e.g. two overlapping scheduled runs).
        os.makedirs(directory_path, exist_ok=True)
        if already_there:
            print(f"Directory already exists: {directory_path}")
        else:
            print(f"Directory created: {directory_path}")
        directory_paths.append(directory_path)
    return directory_paths

# Function for introducing a random delay
def random_sleep(min_seconds=1, max_seconds=3):
    """Pause for a random duration to avoid hammering the target site.

    Args:
        min_seconds: Lower bound of the delay, in seconds.
        max_seconds: Upper bound of the delay, in seconds.

    Returns:
        The number of seconds actually slept.
    """
    # time.sleep() raises ValueError for negative values, so clamp at zero.
    delay = max(0.0, random.uniform(min_seconds, max_seconds))
    time.sleep(delay)
    return delay

# Function to load scraped URLs
def load_scraped_urls(file_path):
    """Load the set of already-scraped URLs from an Excel file.

    Args:
        file_path: Path to a workbook with a 'URL' column.

    Returns:
        A set of URL strings; an empty set when the file does not exist
        or cannot be read.
    """
    # os.path.exists replaces pd.io.common.file_exists, which is not part of
    # the public pandas API.
    if not os.path.exists(file_path):
        return set()
    try:
        df = pd.read_excel(file_path)
        # Drop empty cells so NaN never ends up in the URL set.
        return set(df['URL'].dropna().astype(str))
    except Exception as e:
        # Best effort: an unreadable state file must not abort the whole run.
        print(f"Error loading scraped URLs: {e}")
        return set()

# Function to save scraped URLs
def save_scraped_urls(scraped_urls, file_path):
    """Persist the set of scraped URLs to an Excel file.

    The workbook is written to a temporary sibling file and then moved over
    the target, so an interrupted run cannot leave a truncated state file.

    Args:
        scraped_urls: Iterable of URL strings.
        file_path: Destination path of the workbook.
    """
    # Keep the real extension last so pandas can still pick the Excel engine.
    root, ext = os.path.splitext(file_path)
    tmp_path = f"{root}.tmp{ext}"
    try:
        # Sorted output keeps the file stable between runs.
        urls = sorted(str(u) for u in scraped_urls if pd.notna(u))
        pd.DataFrame(urls, columns=['URL']).to_excel(tmp_path, index=False)
        os.replace(tmp_path, file_path)
    except Exception as e:
        # Best effort: report the problem but do not crash the caller.
        print(f"Error saving scraped URLs: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or cleanup is impossible

# Function to extract content, tags, and date from an article
def scrap_contents_tags_date(article_url, headers, timeout=30):
    """Download one article page and extract its text, tags and date.

    Args:
        article_url: Absolute URL of the article.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(content, tags, date)``. When the request fails or the
        expected elements are missing, the placeholders 'No Content', ``[]``
        and 'No Date' are used.
    """
    try:
        # Without a timeout a stalled connection would hang a scheduled job forever.
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching article {article_url}: {e}")
        return 'No Content', [], 'No Date'

    soup = BeautifulSoup(response.text, 'html.parser')
    main_section = soup.select_one('main#main')
    if main_section is None:
        # Previously this case raised AttributeError when looking up the tags.
        return 'No Content', [], 'No Date'

    content = 'No Content'
    date = 'No Date'
    content_div = main_section.select_one('.content')
    if content_div is not None:
        # get_text on the container yields each text node once; joining the
        # text of every descendant repeated the text of nested elements.
        content = content_div.get_text(' ', strip=True)
        date_div = content_div.select_one('span.article-meta-date')
        if date_div is not None:
            date = date_div.get_text(strip=True)

    tags = []
    tags_section = main_section.select_one('.outside-content-zone.below-content')
    tags_div = tags_section.select_one('p.tags') if tags_section else None
    if tags_div is not None:
        tags = [a.get_text(strip=True) for a in tags_div.find_all('a')]

    return content, tags, date

# Function to export articles to an Excel file
def export_to_excel(articles, filename):
    """Write a list of article records to an Excel workbook.

    Args:
        articles: List of dicts, one per article.
        filename: Destination path of the workbook.
    """
    try:
        # Make sure the target directory exists; otherwise the write fails
        # and the scraped page is silently lost.
        if isinstance(filename, (str, os.PathLike)):
            parent = os.path.dirname(filename)
            if parent:
                os.makedirs(parent, exist_ok=True)
        pd.DataFrame(articles).to_excel(filename, index=False)
    except Exception as e:
        # Best effort: report the problem but keep scraping.
        print(f"Error writing to file {filename}: {e}")

# Function to log processed files
def log_files(file_log, log_dir='/home/housnimzeali/logs'):
    """Append the names of the files produced by this run to today's log.

    Args:
        file_log: Iterable of file names written during the run.
        log_dir: Directory that holds the daily log files.

    Returns:
        The path of the log file that was written.
    """
    today = datetime.date.today()
    os.makedirs(log_dir, exist_ok=True)
    log_filename = os.path.join(log_dir, f"scraped_files_log_{today}.txt")
    # Append instead of overwrite so a second run on the same day (for
    # example a retry after a failure) does not erase the earlier entries.
    with open(log_filename, 'a', encoding='utf-8') as log_file:
        log_file.writelines(f"{file_name}\n" for file_name in file_log)
    return log_filename

# Main scraping function
def scrape_article(url, file_log, scraped_urls,
                   output_dir='/home/housnimzeali/scraped_data',
                   timeout=30, max_pages=None):
    """Walk the paginated article listing and export every new article.

    Each listing page that yields new articles is written to its own Excel
    file. The file name carries a per-run timestamp, so successive (daily)
    runs no longer overwrite each other's output.

    Args:
        url: URL of the first listing page.
        file_log: List that receives the name of every file written.
        scraped_urls: Set of article URLs already scraped; updated in place.
        output_dir: Directory for the exported Excel files.
        timeout: Seconds to wait for each HTTP response.
        max_pages: Optional cap on the number of listing pages to visit.

    Returns:
        The list of articles collected but not yet exported. This is empty
        when every processed page was exported.
    """
    article_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64;  x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    visited_pages = set()  # guards against a "Next" link that loops back
    page_number = 0
    try:
        while True:
            visited_pages.add(url)
            page_number += 1
            random_sleep(min_seconds=2, max_seconds=5)
            response = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP errors as failures rather than parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            container = soup.select_one('main#main')
            if container is None:
                print(f"No content found for URL: {url}")
                return article_data

            article_section = container.select_one('div.posts.publishables-list-wrap.first-item-larger')
            if article_section is None:
                print(f"No articles found in section for URL: {url}")
                return article_data

            for article in article_section.select('.row'):
                title_elem = article.select_one('h3 a')
                article_link = title_elem.get('href') if title_elem else None
                if not article_link:
                    continue

                full_article_link = urljoin(url, article_link)
                if full_article_link in scraped_urls:
                    continue  # already collected by an earlier run

                summary_div = article.select_one('p.hidden-xs.item-desc')
                content, tags, date = scrap_contents_tags_date(full_article_link, headers)
                if content == 'No Content' and date == 'No Date' and not tags:
                    # Nothing could be extracted (typically a failed download).
                    # Leave the URL unmarked so the next run retries it.
                    continue

                article_data.append({
                    'Title': title_elem.get_text(strip=True),
                    'Date': date,
                    'Summary': summary_div.get_text(strip=True) if summary_div else 'No Summary',
                    'Content': content,
                    'Tags': ', '.join(tags) if tags else 'No Tags',
                    'Source': 'Medical_news',
                    'URL': full_article_link
                })
                scraped_urls.add(full_article_link)

            # Save extracted articles to a file
            if article_data:
                file_name = os.path.join(
                    output_dir, f'medical_news_{run_stamp}_{page_number}.xlsx')
                export_to_excel(article_data, file_name)
                file_log.append(file_name)
                article_data.clear()

            if max_pages is not None and page_number >= max_pages:
                print("Page limit reached, scraping finished.")
                break

            # Find the link to the next page (ignore "Previous"). The text is
            # matched in Python because the ':contains' selector is deprecated.
            next_page = next(
                (a for a in soup.select('div.site-paging a')
                 if a.has_attr('href') and 'Next' in a.get_text()),
                None,
            )
            next_url = urljoin(url, next_page['href']) if next_page else None
            if not next_url or next_url in visited_pages:
                print("No next page, scraping finished.")
                break
            url = next_url

    except requests.exceptions.RequestException as e:
        print(f"Error during scraping: {e}")
        return article_data

    return article_data

# Main function
def main():
    """Run one complete scraping pass and persist its state."""
    file_log = []  # List to track files
    urls_file = '/home/housnimzeali/scraped_urls.xlsx'

    # Make sure the output and log directories exist before anything is written.
    create_directories()
    scraped_urls = load_scraped_urls(urls_file)  # Load already scraped URLs
    url = "https://www.news-medical.net/medical/news"  # Example URL to scrape
    try:
        scrape_article(url, file_log, scraped_urls)
        print("Scraping complete.")
    finally:
        # Save the log and the scraped URLs even if scraping was interrupted;
        # otherwise the next run would collect the same articles again.
        log_files(file_log)
        save_scraped_urls(scraped_urls, urls_file)

# Call the main function
# Run only when executed as a script (e.g. by cron), not when imported.
if __name__ == "__main__":
    main()

[/code]
**Контекст:**
Я настроил скрипт веб-скрапинга, который собирает данные и экспортирует их в файлы Excel.
Я хочу запланировать его ежедневное выполнение.
Мой скрипт использует такие библиотеки, как requests, BeautifulSoup и pandas.
Я также хочу, чтобы скрипт правильно обрабатывал повторяющиеся записи, проверяя и сохраняя уже обработанные URL-адреса.
**Мои вопросы:**
Как лучше всего автоматизировать ежедневное выполнение этого скрипта в среде JupyterHub и VSCode?
Должен ли я использовать такой инструмент, как cron, на сервере, где размещен JupyterHub, или существуют специальные решения для Jupyter/VSCode?
Какие практики вы бы порекомендовали для обеспечения надежного запуска скрипта по расписанию и обработки ошибок или прерываний?
Как оптимизировать управление дубликатами при последовательных запусках скрапинга, чтобы избежать ненужного повторения данных?
Спасибо за советы и помощь. 

Подробнее здесь: [url]https://stackoverflow.com/questions/79176273/how-to-automate-the-daily-execution-of-a-web-scraping-script-on-jupyterhub-and-v[/url]

Ответить

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Вернуться в «Python»