Сбор данных о динамической производительности веб-сайта

Сбор данных о динамической производительности веб-сайта ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Сбор данных о динамической производительности веб-сайта

Цитата

Сообщение Anonymous » 16 окт 2024, 17:38

Я хочу очистить веб-сайт газетного архива (genios.de) и столкнулся с проблемой, заключающейся в том, что оглавление версий веб-сайта отображается динамически после щелчка по нему. Откроется окно предварительного просмотра, и вы сможете получить доступ к содержанию.
Я использую Selenium и уже получаю нужные данные. Проблема, однако, в эффективности.
Это связано с тем, что как только я щелкнул одну плитку, содержимое визуализируется, а затем остается там. Это приводит к тому, что суп.find_all сначала находит именно то, что мне нужно, затем находит день 1 + день 2, затем 1+2+3 и так далее. Я написал код, который теперь проверяет определенный элемент на дату, а затем обращается к великому родителю элемента, чтобы получить только нужные данные. Похоже, это влияет на производительность, чем больше контента отображается на странице.
Есть ли простой и более эффективный способ убедиться, что «старый» контент больше не доступен как только я нажму на новую плитку?

Код: Выделить всё

import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

def get_tocs_one_month(url, current_date, newspaper, driver):

# Extract the year and month, add 1 to the month, and form yyyymm
year = current_date.year
month = current_date.month + 1
if month > 12:
month = 1
year += 1

date = int(f"{year}{month:02d}")

url = f"{url}?before={date}"

# Load the page
driver.get(url)

# Check for cookie consent window and decline cookies
try:
# Target the button with the class name "button ccm--decline-cookies ccm--ctrl-init"
cookie_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ccm--decline-cookies'))
)
cookie_button.click()
print("Cookie consent declined.")
except Exception as e:
print("No cookie consent window or failed to close it:", str(e))

# Wait until the tiles with the daily issues are loaded
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.media_tile.element')))

# Get the "partialSourceListArea" section
partial_source_list_area = driver.find_element(By.CLASS_NAME, 'partialSourceListArea')

# Get all tiles with the class "media_tile element" within the "partialSourceListArea" section
media_tiles = partial_source_list_area.find_elements(By.CSS_SELECTOR, 'div.media_tile.element')

print(f"Found {len(media_tiles)} media tiles for {date-1}.")

# DataFrame to store the data
df = pd.DataFrame(columns=['newspaper', 'date', 'headline'])

# Iterate through each tile and click on it
for tile in media_tiles:
try:

# Scroll the tile into view
driver.execute_script("arguments[0].scrollIntoView(true);", tile)

# Regular expression to find date in the format dd.mm.yyyy within the tile text
date_match = re.search(r"\b\d{2}\.\d{2}\.\d{4}\b", tile.text)
if date_match:
date = date_match.group(0)
else:
date = None

# Get the current content or some unique attribute before clicking the tile
try:
toc_headline = driver.find_element(By.CLASS_NAME, 'table_of_contents__headline__documents')
content_before = toc_headline.find_element(By.TAG_NAME, 'span').get_attribute("innerHTML")
except:
content_before = ""   # No content yet if this is the first tile

# Click on the tile
tile.click()

print(f"Successfully clicked on the tile for {date}.")

if content_before:
# Wait until the content changes after clicking the tile
WebDriverWait(driver, 10).until(
EC.invisibility_of_element_located((By.XPATH, f"//*[contains(.,'{content_before}')]"))
)

# Optionally wait for the new content to appear after the change
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'table_of_contents__header__documents'))
)

# sleep(1)  # Optional sleep to give the page time to load

print(f"New content for {date} successfully loaded.")

# Get the HTML code of the daily issue
html = driver.page_source

# Create a BeautifulSoup object for the daily issue
soup = BeautifulSoup(html, 'lxml')

# Find all elements with the class "table_of_contents__headline__documents"
toc_headlines = soup.find_all('div', class_='table_of_contents__header__documents')

print(f"{len(toc_headlines)} table of contents found in html.")

# Find the toc_headline that contains a span object with the date
toc_headline_with_date = None
toc_headline_with_date = find_toc_headline_with_date(toc_headlines, date)
if not toc_headline_with_date:
print(f"No table of contents found for {date}.")
print("Waiting 2 seconds before retrying.")
time.sleep(2)
toc_headline_with_date = find_toc_headline_with_date(toc_headlines, date)
if not toc_headline_with_date:
print(f"No table of contents found for {date}. Logging the error and skipping this date.")
df = pd.concat([df, pd.DataFrame([{'newspaper': newspaper, 'date': date, 'headline': 'error'}])], ignore_index=True)
continue

# Move up two levels to find the parent container of the table of contents
toc_container = toc_headline_with_date.find_parent().find_parent()

# Check if the toc_container is actually called "table_of_contents"
if toc_container.get('class') != ['table_of_contents']:
raise ValueError(f"The container for the table of contents does not have the expected class 'table_of_contents'. Actual classes: {toc_container.get('class', [])}")

# Find all "div" elements with the class "table_of_contents__body__document__text document_title_wrapper" within the container
toc_entries = toc_container.find_all('div', class_='table_of_contents__body__document__text document_title_wrapper')

# Extract and save the text of each link in the DataFrame
for entry in toc_entries:
headline_div = entry.find('div', class_='tooltip-title')
headline = headline_div.get_text(strip=True) if headline_div else None
headline = re.sub(r'\s+', ' ', headline, flags=re.IGNORECASE) if headline else None
headline = headline.replace('"""', '"').replace('""', '"')

# Skip entries with the text "kein Titel" (case insensitive)
if headline and "kein Titel".lower() not in headline.lower():
df = pd.concat([df, pd.DataFrame([{'newspaper': newspaper, 'date': date, 'headline': headline}])], ignore_index=True)

print(f"Toc entries for the {date} successfully extracted and appended to df.")

except Exception as e:
print(f"Error: {e}")

return df

def find_toc_headline_with_date(toc_headlines, date):
for headline in toc_headlines:
span = headline.find('span')
if span and date in span.text:
return headline
return None

Еще одна проблема, с которой я столкнулся, заключается в том, что иногда не удается найти содержание текущей даты. Я хочу убедиться, что такого больше не повторится. Для этого мне нужно будет убедиться, что новый материал загружен, прежде чем извлекать информацию. Я пробую это, но почти уверен, что подход с

Код: Выделить всё

# Wait for the new content to appear after the change
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'table_of_contents__header__documents')))

не работает, поскольку такой элемент явно существует, поскольку он был загружен ранее для других плиток, на которые был нажат.
Что бы может быть лучший вариант убедиться, что загружена именно эта информация?
Чтобы использовать функцию выше, вы можете запустить этот минимальный пример:

Код: Выделить всё

import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

# Import the function from the module
from get_tocs_one_month import get_tocs_one_month

# Set up the Selenium WebDriver for Firefox
firefox_options = Options()
firefox_options.add_argument("--headless")  # Run in headless mode

driver = webdriver.Firefox(options=firefox_options)

# Define the parameters
url = "https://genios.de/browse/Alle/Presse/Presse Deutschland/Aachener Zeitung"
date = pd.to_datetime("2014-01-01")
newspaper = "Aachener Zeitung"

# Run the function
df = get_tocs_one_month(url, date, newspaper, driver)

# Print the resulting DataFrame
print(df)

# Quit the driver
driver.quit()

Учитывая, что в настоящее время один месяц занимает около одной минуты, один год занимает 12, 10 лет 120 т.е. 2 часа. И это только одна газета. Мне нужно как минимум 30-40, что уже будет означать около 60-80ч. В идеале я хотел бы иметь 50-100 газет. Это означает, что мне придется радикально повысить эффективность, если я не хочу ждать несколько дней.
Я благодарен за любые предложения!
Изменить:
У меня есть добавлены таймеры, чтобы определить, что замедляет работу кода. В основном два аспекта:

Рендеринг другого фрагмента становится все медленнее. Примерно от 0,3/0,4 секунды для первых двух плиток до примерно 1 секунды для последних (~25 плиток).
Определение правильной части html-кода требует наибольших затрат времени. что имеет смысл, поскольку это всего лишь обходной путь проблемы, с которой я столкнулся. Оно увеличивается примерно с 0,2 секунды до почти 2 секунд.

Подробнее здесь: https://stackoverflow.com/questions/790 ... erformance

1729089537

Anonymous

Я хочу очистить веб-сайт газетного архива (genios.de) и столкнулся с проблемой, заключающейся в том, что оглавление версий веб-сайта отображается динамически после щелчка по нему. Откроется окно предварительного просмотра, и вы сможете получить доступ к содержанию.
Я использую Selenium и уже получаю нужные данные. Проблема, однако, в эффективности.
Это связано с тем, что как только я щелкнул одну плитку, содержимое визуализируется, а затем остается там. Это приводит к тому, что суп.find_all сначала находит именно то, что мне нужно, затем находит день 1 + день 2, затем 1+2+3 и так далее. Я написал код, который теперь проверяет определенный элемент на дату, а затем обращается к великому родителю элемента, чтобы получить только нужные данные.  Похоже, это влияет на производительность, чем больше контента отображается на странице.
Есть ли простой и более эффективный способ убедиться, что «старый» контент больше не доступен как только я нажму на новую плитку?
[code]import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

def get_tocs_one_month(url, current_date, newspaper, driver):

# Extract the year and month, add 1 to the month, and form yyyymm
year = current_date.year
month = current_date.month + 1
if month > 12:
month = 1
year += 1

date = int(f"{year}{month:02d}")

url = f"{url}?before={date}"

# Load the page
driver.get(url)

# Check for cookie consent window and decline cookies
try:
# Target the button with the class name "button ccm--decline-cookies ccm--ctrl-init"
cookie_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ccm--decline-cookies'))
)
cookie_button.click()
print("Cookie consent declined.")
except Exception as e:
print("No cookie consent window or failed to close it:", str(e))

# Wait until the tiles with the daily issues are loaded
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.media_tile.element')))

# Get the "partialSourceListArea" section
partial_source_list_area = driver.find_element(By.CLASS_NAME, 'partialSourceListArea')

# Get all tiles with the class "media_tile element" within the "partialSourceListArea" section
media_tiles = partial_source_list_area.find_elements(By.CSS_SELECTOR, 'div.media_tile.element')

print(f"Found {len(media_tiles)} media tiles for {date-1}.")

# DataFrame to store the data
df = pd.DataFrame(columns=['newspaper', 'date', 'headline'])

# Iterate through each tile and click on it
for tile in media_tiles:
try:

# Scroll the tile into view
driver.execute_script("arguments[0].scrollIntoView(true);", tile)

# Regular expression to find date in the format dd.mm.yyyy within the tile text
date_match = re.search(r"\b\d{2}\.\d{2}\.\d{4}\b", tile.text)
if date_match:
date = date_match.group(0)
else:
date = None

# Get the current content or some unique attribute before clicking the tile
try:
toc_headline = driver.find_element(By.CLASS_NAME, 'table_of_contents__headline__documents')
content_before = toc_headline.find_element(By.TAG_NAME, 'span').get_attribute("innerHTML")
except:
content_before = ""   # No content yet if this is the first tile

# Click on the tile
tile.click()

print(f"Successfully clicked on the tile for {date}.")

if content_before:
# Wait until the content changes after clicking the tile
WebDriverWait(driver, 10).until(
EC.invisibility_of_element_located((By.XPATH, f"//*[contains(.,'{content_before}')]"))
)

# Optionally wait for the new content to appear after the change
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'table_of_contents__header__documents'))
)

# sleep(1)  # Optional sleep to give the page time to load

print(f"New content for {date} successfully loaded.")

# Get the HTML code of the daily issue
html = driver.page_source

# Create a BeautifulSoup object for the daily issue
soup = BeautifulSoup(html, 'lxml')

# Find all elements with the class "table_of_contents__headline__documents"
toc_headlines = soup.find_all('div', class_='table_of_contents__header__documents')

print(f"{len(toc_headlines)} table of contents found in html.")

# Find the toc_headline that contains a span object with the date
toc_headline_with_date = None
toc_headline_with_date = find_toc_headline_with_date(toc_headlines, date)
if not toc_headline_with_date:
print(f"No table of contents found for {date}.")
print("Waiting 2 seconds before retrying.")
time.sleep(2)
toc_headline_with_date = find_toc_headline_with_date(toc_headlines, date)
if not toc_headline_with_date:
print(f"No table of contents found for {date}. Logging the error and skipping this date.")
df = pd.concat([df, pd.DataFrame([{'newspaper': newspaper, 'date': date, 'headline': 'error'}])], ignore_index=True)
continue

# Move up two levels to find the parent container of the table of contents
toc_container = toc_headline_with_date.find_parent().find_parent()

# Check if the toc_container is actually called "table_of_contents"
if toc_container.get('class') != ['table_of_contents']:
raise ValueError(f"The container for the table of contents does not have the expected class 'table_of_contents'. Actual classes: {toc_container.get('class', [])}")

# Find all "div" elements with the class "table_of_contents__body__document__text document_title_wrapper" within the container
toc_entries = toc_container.find_all('div', class_='table_of_contents__body__document__text document_title_wrapper')

# Extract and save the text of each link in the DataFrame
for entry in toc_entries:
headline_div = entry.find('div', class_='tooltip-title')
headline = headline_div.get_text(strip=True) if headline_div else None
headline = re.sub(r'\s+', ' ', headline, flags=re.IGNORECASE) if headline else None
headline = headline.replace('"""', '"').replace('""', '"')

# Skip entries with the text "kein Titel" (case insensitive)
if headline and "kein Titel".lower() not in headline.lower():
df = pd.concat([df, pd.DataFrame([{'newspaper': newspaper, 'date': date, 'headline': headline}])], ignore_index=True)

print(f"Toc entries for the {date} successfully extracted and appended to df.")

except Exception as e:
print(f"Error: {e}")

return df

def find_toc_headline_with_date(toc_headlines, date):
for headline in toc_headlines:
span = headline.find('span')
if span and date in span.text:
return headline
return None
[/code]
Еще одна проблема, с которой я столкнулся, заключается в том, что иногда не удается найти содержание текущей даты. Я хочу убедиться, что такого больше не повторится. Для этого мне нужно будет убедиться, что новый материал загружен, прежде чем извлекать информацию.  Я пробую это, но почти уверен, что подход с 
[code]# Wait for the new content to appear after the change
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'table_of_contents__header__documents')))
[/code]
не работает, поскольку такой элемент явно существует, поскольку он был загружен ранее для других плиток, на которые был нажат.
Что бы может быть лучший вариант убедиться, что загружена именно эта информация?
Чтобы использовать функцию выше, вы можете запустить этот минимальный пример:
[code]import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

# Import the function from the module
from get_tocs_one_month import get_tocs_one_month

# Set up the Selenium WebDriver for Firefox
firefox_options = Options()
firefox_options.add_argument("--headless")  # Run in headless mode

driver = webdriver.Firefox(options=firefox_options)

# Define the parameters
url = "https://genios.de/browse/Alle/Presse/Presse Deutschland/Aachener Zeitung"
date = pd.to_datetime("2014-01-01")
newspaper = "Aachener Zeitung"

# Run the function
df = get_tocs_one_month(url, date, newspaper, driver)

# Print the resulting DataFrame
print(df)

# Quit the driver
driver.quit()
[/code]
Учитывая, что в настоящее время один месяц занимает около одной минуты, один год занимает 12, 10 лет 120 т.е. 2 часа. И это только одна газета. Мне нужно как минимум 30-40, что уже будет означать около 60-80ч. В идеале я хотел бы иметь 50-100 газет. Это означает, что мне придется радикально повысить эффективность, если я не хочу ждать несколько дней.
Я благодарен за любые предложения!
Изменить:
У меня есть добавлены таймеры, чтобы определить, что замедляет работу кода. В основном два аспекта:
[list]
[*]Рендеринг другого фрагмента становится все медленнее. Примерно от 0,3/0,4 секунды для первых двух плиток до примерно 1 секунды для последних (~25 плиток).
[*]Определение правильной части html-кода требует наибольших затрат времени. что имеет смысл, поскольку это всего лишь обходной путь проблемы, с которой я столкнулся. Оно увеличивается примерно с 0,2 секунды до почти 2 секунд.
[/list] 

Подробнее здесь: [url]https://stackoverflow.com/questions/79094314/data-scraping-dynamic-website-performance[/url]