Извлечение имени пользователя владельца из вложенной страницы

Извлечение имени пользователя владельца из вложенной страницы ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Извлечение имени пользователя владельца из вложенной страницы

Цитата

Сообщение Anonymous » 28 дек 2024, 16:08

Я сканирую исследовательский форум HuggingFace (https://discuss.huggingface.co/c/research/7/l/latest) с помощью Selenium. Мне удалось успешно извлечь следующие атрибуты с главной страницы форума:

Дата активности
Количество просмотров
Количество ответов
Заголовок
URL

Однако я столкнулся с проблемой при попытке извлечь имя пользователя владельца из отдельных страниц тем. Имя пользователя владельца находится на вложенной странице, доступной по URL-адресу, указанному в ссылке на тему главной страницы.
Например, на главной странице у меня есть следующий фрагмент HTML для тема:

Код: Выделить всё

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

# Set up Chrome options to use headless mode (for Colab)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--incognito")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64;  x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Set the path to chromedriver explicitly (installed by apt)
chrome_path = "/usr/bin/chromedriver"

# Initialize the WebDriver with the updated path
driver = webdriver.Chrome(options=chrome_options)

# Open the HuggingFace page
url = "https://discuss.huggingface.co/c/research/7/l/latest"  # URL for HuggingFace Issues
driver.get(url)

# Wait for the page to load
time.sleep(6)

def scrape_huggingface_issues():
titles_and_links = []
seen_titles_and_links = set()
owner = []
replies = []
views = []
activity = []

while True:
try:
# Find all issue rows (elements in the table)
elements = driver.find_elements(By.CSS_SELECTOR, 'tr.topic-list-item')

# Extract and store the titles, links, and other data
for elem in elements:
topic_id = elem.get_attribute("data-topic-id")
if topic_id in seen_titles_and_links:
continue

seen_titles_and_links.add(topic_id)

# Extract title and link
selected_title = elem.find_element(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link')
title = selected_title.text.strip()
relative_link = selected_title.get_attribute('href')  # Get the relative URL from the href attribute
full_link = relative_link  # Construct the absolute URL (if needed)

# Extract replies count
try:
replies_elem = elem.find_element(By.CSS_SELECTOR, 'button.btn-link.posts-map.badge-posts')
replies_count = replies_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
replies_count = "0"

# Extract views count
try:
views_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.views.topic-list-data')
views_count = views_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
views_count = "0"

# Extract activity (last activity)
try:
activity_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.topic-list-data.age.activity')
activity_text = activity_elem.get_attribute('title').strip()
except:
activity_text = "N/A"

# Use the helper function to get the owner info from the topic page
owner_text = scrape_issue_details(relative_link)

# Store the extracted data in the lists
titles_and_links.append((title, full_link, owner_text, replies_count, views_count, activity_text))
seen_titles_and_links.add((title, full_link))  # Add to the seen set to avoid duplicates

# Scroll down to load more content (if the forum uses infinite scroll)
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
time.sleep(3)  # Adjust based on loading speed

# Check if the "Next" button is available and click it
try:
next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers')
next_button.click()
time.sleep(3)  # Wait for the next page to load
except:
# If there's no "Next" button, exit the loop
print("No more pages to scrape.")
break

except Exception as e:
print(f"Error occurred:  {e}")
continue

return titles_and_links

def scrape_issue_details(url):
"""
Navigate to the topic page and scrape additional details like the owner's username.
"""
# Go to the topic page
driver.get(url)
time.sleep(3)  # Wait for the page to load

# Extract the owner's username
try:
owner_elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.first.username.new-user')))
owner_username_fetch = owner_elem.find_element(By.CSS_SELECTOR, 'a').text.strip()
owner_username = owner_elem.text.strip()  # Extract the username from the link
except Exception as e:
owner_username = "N/A"  # Default value if no owner found

return owner_username

# Scrape the HuggingFace issues across all pages
issues = scrape_huggingface_issues()

# Print the titles, links, and additional data (owner, replies, views, activity)
print("Scraped Titles, Links, Owner, Replies, Views, Activity:")
for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1):
print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")

# Close the browser
driver.quit()

Проблема:
Я не могу получить имя пользователя владельца со страницы отдельной темы. Пройдя по URL-адресу, я не могу найти и извлечь имя пользователя владельца, хотя знаю его местоположение в HTML.

Код: Выделить всё

[url=/t/model-that-can-generate-both-text-and-image-as-output/132209]Model that can generate both text and image as output[/url]

Имя пользователя владельца находится на отдельной странице темы в следующем фрагменте HTML:

Код: Выделить всё

[url=/u/InsertOPUsername]InsertOPUsername[/url]

Что я пробовал:

Я использовал driver.get(url) для перехода к отдельным страницам тем.
Я попытался найти имя пользователя с помощью WebDriverWait и правильного селектора CSS (span.first.username.new-user a).
Я успешно собираю другие данные, такие как активность, просмотры и Ответы с главной страницы, но невозможно получить имя пользователя владельца со страницы темы.

Подробнее здесь: https://stackoverflow.com/questions/793 ... ested-page

1735391281

Anonymous

Я сканирую исследовательский форум HuggingFace (https://discuss.huggingface.co/c/research/7/l/latest) с помощью Selenium. Мне удалось успешно извлечь следующие атрибуты с главной страницы форума:
[list]
[*]Дата активности
[*]Количество просмотров
[*]Количество ответов
[*]Заголовок
[*]URL
[/list]Однако я столкнулся с проблемой при попытке извлечь имя пользователя владельца из отдельных страниц тем.  Имя пользователя владельца находится на вложенной странице, доступной по URL-адресу, указанному в ссылке на тему главной страницы.
Например, на главной странице у меня есть следующий фрагмент HTML для тема:


[code]from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

# Set up Chrome options to use headless mode (for Colab)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--incognito")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64;  x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Set the path to chromedriver explicitly (installed by apt)
chrome_path = "/usr/bin/chromedriver"

# Initialize the WebDriver with the updated path
driver = webdriver.Chrome(options=chrome_options)

# Open the HuggingFace page
url = "https://discuss.huggingface.co/c/research/7/l/latest"  # URL for HuggingFace Issues
driver.get(url)

# Wait for the page to load
time.sleep(6)

def scrape_huggingface_issues():
titles_and_links = []
seen_titles_and_links = set()
owner = []
replies = []
views = []
activity = []

while True:
try:
# Find all issue rows (elements in the table)
elements = driver.find_elements(By.CSS_SELECTOR, 'tr.topic-list-item')

# Extract and store the titles, links, and other data
for elem in elements:
topic_id = elem.get_attribute("data-topic-id")
if topic_id in seen_titles_and_links:
continue

seen_titles_and_links.add(topic_id)

# Extract title and link
selected_title = elem.find_element(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link')
title = selected_title.text.strip()
relative_link = selected_title.get_attribute('href')  # Get the relative URL from the href attribute
full_link = relative_link  # Construct the absolute URL (if needed)

# Extract replies count
try:
replies_elem = elem.find_element(By.CSS_SELECTOR, 'button.btn-link.posts-map.badge-posts')
replies_count = replies_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
replies_count = "0"

# Extract views count
try:
views_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.views.topic-list-data')
views_count = views_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
views_count = "0"

# Extract activity (last activity)
try:
activity_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.topic-list-data.age.activity')
activity_text = activity_elem.get_attribute('title').strip()
except:
activity_text = "N/A"

# Use the helper function to get the owner info from the topic page
owner_text = scrape_issue_details(relative_link)

# Store the extracted data in the lists
titles_and_links.append((title, full_link, owner_text, replies_count, views_count, activity_text))
seen_titles_and_links.add((title, full_link))  # Add to the seen set to avoid duplicates

# Scroll down to load more content (if the forum uses infinite scroll)
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
time.sleep(3)  # Adjust based on loading speed

# Check if the "Next" button is available and click it
try:
next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers')
next_button.click()
time.sleep(3)  # Wait for the next page to load
except:
# If there's no "Next" button, exit the loop
print("No more pages to scrape.")
break

except Exception as e:
print(f"Error occurred:  {e}")
continue

return titles_and_links

def scrape_issue_details(url):
"""
Navigate to the topic page and scrape additional details like the owner's username.
"""
# Go to the topic page
driver.get(url)
time.sleep(3)  # Wait for the page to load

# Extract the owner's username
try:
owner_elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.first.username.new-user')))
owner_username_fetch = owner_elem.find_element(By.CSS_SELECTOR, 'a').text.strip()
owner_username = owner_elem.text.strip()  # Extract the username from the link
except Exception as e:
owner_username = "N/A"  # Default value if no owner found

return owner_username

# Scrape the HuggingFace issues across all pages
issues = scrape_huggingface_issues()

# Print the titles, links, and additional data (owner, replies, views, activity)
print("Scraped Titles, Links, Owner, Replies, Views, Activity:")
for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1):
print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")

# Close the browser
driver.quit()[/code]



[b]Проблема:[/b]
Я не могу получить имя пользователя владельца со страницы отдельной темы. Пройдя по URL-адресу, я не могу найти и извлечь имя пользователя владельца, хотя знаю его местоположение в HTML.
[code][url=/t/model-that-can-generate-both-text-and-image-as-output/132209]Model that can generate both text and image as output[/url]
[/code]
Имя пользователя владельца находится на отдельной странице темы в следующем фрагменте HTML:
[code][url=/u/InsertOPUsername]InsertOPUsername[/url]
[/code]
[b]Что я пробовал:[/b]
[list]
[*]Я использовал driver.get(url) для перехода к отдельным страницам тем.
[*]Я попытался найти имя пользователя с помощью WebDriverWait и правильного селектора CSS (span.first.username.new-user a).[*]Я успешно собираю другие данные, такие как активность, просмотры и Ответы с главной страницы, но невозможно получить имя пользователя владельца со страницы темы.
[/list] 

Подробнее здесь: [url]https://stackoverflow.com/questions/79313502/extracting-owner-s-username-from-nested-page[/url]