Однако я столкнулся с проблемой при попытке извлечь имя пользователя владельца из отдельных страниц тем. Имя пользователя владельца находится на вложенной странице, доступной по URL-адресу, указанному в ссылке на тему главной страницы.
Например, на главной странице у меня есть следующий фрагмент HTML для темы:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Build the Chrome configuration for a headless environment (e.g. Colab).
chrome_options = Options()
for _chrome_flag in (
    "--headless",  # Run in headless mode
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-infobars",
    "--disable-popup-blocking",
    "--ignore-certificate-errors",
    "--incognito",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)

# Drop the switches that advertise an automated browser.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Location of the apt-installed chromedriver.
# NOTE(review): this value is not passed to webdriver.Chrome() below, so it
# currently has no effect on which driver binary is used.
chrome_path = "/usr/bin/chromedriver"

# Start the browser with the options assembled above.
driver = webdriver.Chrome(options=chrome_options)

# Load the HuggingFace forum listing that the scraper starts from.
url = "https://discuss.huggingface.co/c/research/7/l/latest"  # URL for HuggingFace Issues
driver.get(url)

# Fixed pause to give the page time to load.
time.sleep(6)
def _extract_topic_row(elem):
    """Read title, link, replies, views and activity from one topic-list row.

    Returns a tuple ``(title, link, replies_count, views_count, activity_text)``.
    The title link is required (its absence raises); the three counters are
    optional and fall back to "0" / "0" / "N/A" when they cannot be read.
    """
    title_link = elem.find_element(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link')
    title = title_link.text.strip()
    # Selenium returns the resolved (absolute) URL for href, so the value can
    # be passed to driver.get() as-is.
    link = title_link.get_attribute('href')

    try:
        replies_count = elem.find_element(
            By.CSS_SELECTOR, 'button.btn-link.posts-map.badge-posts span.number'
        ).text.strip()
    except Exception:
        replies_count = "0"

    try:
        views_count = elem.find_element(
            By.CSS_SELECTOR, 'td.num.views.topic-list-data span.number'
        ).text.strip()
    except Exception:
        views_count = "0"

    try:
        activity_text = elem.find_element(
            By.CSS_SELECTOR, 'td.num.topic-list-data.age.activity'
        ).get_attribute('title').strip()
    except Exception:
        # Also covers a missing title attribute (get_attribute returned None).
        activity_text = "N/A"

    return title, link, replies_count, views_count, activity_text


def scrape_huggingface_issues(max_topics=None):
    """Scrape the topic list currently open in ``driver``, then each topic's owner.

    The work is done in two phases. First every row of the listing is read
    (scrolling / paging until nothing new appears); only afterwards is each
    topic page visited to fetch the owner. Visiting a topic page while still
    iterating the listing rows navigates the browser away from the listing,
    which invalidates every remaining row element (stale element references)
    and was the reason owners could not be collected.

    Args:
        max_topics: optional upper bound on the number of topics to collect;
            ``None`` (the default) collects everything the listing offers.

    Returns:
        A list of ``(title, link, owner, replies, views, activity)`` tuples,
        all strings, in listing order.
    """
    row_selector = 'tr.topic-list-item'
    max_consecutive_errors = 3

    rows = []
    seen_topic_ids = set()
    consecutive_errors = 0

    def limit_reached():
        return max_topics is not None and len(rows) >= max_topics

    # Phase 1: collect everything that is available on the listing page.
    while not limit_reached():
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, row_selector)
            for elem in elements:
                if limit_reached():
                    break
                topic_id = elem.get_attribute("data-topic-id")
                if topic_id in seen_topic_ids:
                    continue
                seen_topic_ids.add(topic_id)
                rows.append(_extract_topic_row(elem))

            # Scroll down to load more content (if the forum uses infinite scroll)
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(3)  # Adjust based on loading speed

            # If scrolling added rows, go round again to pick them up.
            if len(driver.find_elements(By.CSS_SELECTOR, row_selector)) > len(elements):
                consecutive_errors = 0
                continue

            # Otherwise fall back to a classic "Next" pagination link.
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers')
                next_button.click()
                time.sleep(3)  # Wait for the next page to load
            except Exception:
                print("No more pages to scrape.")
                break
            consecutive_errors = 0
        except Exception as e:
            print(f"Error occurred: {e}")
            # Retry a few times, but never loop forever on a persistent error.
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                break

    # Phase 2: the listing is no longer needed, so navigating away is safe.
    titles_and_links = []
    for title, link, replies_count, views_count, activity_text in rows:
        owner_text = scrape_issue_details(link)
        titles_and_links.append(
            (title, link, owner_text, replies_count, views_count, activity_text)
        )
    return titles_and_links
def scrape_issue_details(url):
    """Open a topic page in ``driver`` and return the username of its author.

    Args:
        url: address of the topic page to load.

    Returns:
        The text of the first username link found on the page, or "N/A" when
        no such link appears within 10 seconds or its text is empty.
    """
    # The previous selector ('span.first.username.new-user') also required the
    # 'first' and 'new-user' classes, so it could only match authors whose
    # name span carries both.
    # NOTE(review): this selector assumes stock Discourse post markup
    # (.topic-post > ... .names > span.username > a) and that the first match
    # in document order belongs to the opening post - confirm on the live page.
    owner_link_selector = '.topic-post .names span.username a'

    driver.get(url)
    try:
        # The explicit wait replaces the fixed sleep: it returns as soon as
        # the link is present and gives up after 10 seconds.
        owner_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, owner_link_selector))
        )
        owner_username = owner_link.text.strip()
    except Exception:
        owner_username = ""
    # Default value if no owner found
    return owner_username or "N/A"
# Run the scraper and print the results; the browser is closed in `finally`
# so that it is shut down even when scraping or printing raises.
try:
    # Scrape the HuggingFace issues across all pages
    issues = scrape_huggingface_issues()
    # Print the titles, links, and additional data (owner, replies, views, activity)
    print("Scraped Titles, Links, Owner, Replies, Views, Activity:")
    for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1):
        print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")
finally:
    # Close the browser
    driver.quit()
Проблема:
Я не могу получить имя пользователя владельца со страницы отдельной темы. Пройдя по URL-адресу, я не могу найти и извлечь имя пользователя владельца, хотя знаю его местоположение в HTML.
Я использовал driver.get(url) для перехода к отдельным страницам тем.
Я попытался найти имя пользователя с помощью WebDriverWait и правильного селектора CSS (span.first.username.new-user a).
Я успешно собираю с главной страницы другие данные (активность, просмотры и ответы), но не могу получить имя пользователя владельца со страницы темы.
Я сканирую исследовательский форум HuggingFace (https://discuss.huggingface.co/c/research/7/l/latest) с помощью Selenium. Мне удалось успешно извлечь следующие атрибуты с главной страницы форума: [list] [*]Дата активности [*]Количество просмотров [*]Количество ответов [*]Заголовок [*]URL [/list] Однако я столкнулся с проблемой при попытке извлечь имя пользователя владельца из отдельных страниц тем. Имя пользователя владельца находится на вложенной странице, доступной по URL-адресу, указанному в ссылке на тему на главной странице. Например, на главной странице у меня есть следующий фрагмент HTML для темы:
[code]from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC
import time
# Set up Chrome options to use headless mode (for Colab) chrome_options = Options() chrome_options.add_argument("--headless") # Run in headless mode chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-gpu") chrome_options.add_argument("--window-size=1920,1080") chrome_options.add_argument("--disable-infobars") chrome_options.add_argument("--disable-popup-blocking") chrome_options.add_argument("--ignore-certificate-errors") chrome_options.add_argument("--incognito") chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36") chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) chrome_options.add_experimental_option("useAutomationExtension", False)
# Set the path to chromedriver explicitly (installed by apt) chrome_path = "/usr/bin/chromedriver"
# Initialize the WebDriver with the updated path driver = webdriver.Chrome(options=chrome_options)
# Open the HuggingFace page url = "https://discuss.huggingface.co/c/research/7/l/latest" # URL for HuggingFace Issues driver.get(url)
while True: try: # Find all issue rows (elements in the table) elements = driver.find_elements(By.CSS_SELECTOR, 'tr.topic-list-item')
# Extract and store the titles, links, and other data for elem in elements: topic_id = elem.get_attribute("data-topic-id") if topic_id in seen_titles_and_links: continue
seen_titles_and_links.add(topic_id)
# Extract title and link selected_title = elem.find_element(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link') title = selected_title.text.strip() relative_link = selected_title.get_attribute('href') # Get the relative URL from the href attribute full_link = relative_link # Construct the absolute URL (if needed)
# Use the helper function to get the owner info from the topic page owner_text = scrape_issue_details(relative_link)
# Store the extracted data in the lists titles_and_links.append((title, full_link, owner_text, replies_count, views_count, activity_text)) seen_titles_and_links.add((title, full_link)) # Add to the seen set to avoid duplicates
# Scroll down to load more content (if the forum uses infinite scroll) driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END) time.sleep(3) # Adjust based on loading speed
# Check if the "Next" button is available and click it try: next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers') next_button.click() time.sleep(3) # Wait for the next page to load except: # If there's no "Next" button, exit the loop print("No more pages to scrape.") break
except Exception as e: print(f"Error occurred: {e}") continue
return titles_and_links
def scrape_issue_details(url): """ Navigate to the topic page and scrape additional details like the owner's username. """ # Go to the topic page driver.get(url) time.sleep(3) # Wait for the page to load
# Extract the owner's username try: owner_elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.first.username.new-user'))) owner_username_fetch = owner_elem.find_element(By.CSS_SELECTOR, 'a').text.strip() owner_username = owner_elem.text.strip() # Extract the username from the link except Exception as e: owner_username = "N/A" # Default value if no owner found
return owner_username
# Scrape the HuggingFace issues across all pages issues = scrape_huggingface_issues()
# Print the titles, links, and additional data (owner, replies, views, activity) print("Scraped Titles, Links, Owner, Replies, Views, Activity:") for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1): print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")
# Close the browser driver.quit()[/code]
[b]Проблема:[/b] Я не могу получить имя пользователя владельца со страницы отдельной темы. Пройдя по URL-адресу, я не могу найти и извлечь имя пользователя владельца, хотя знаю его местоположение в HTML. [code][url=/t/model-that-can-generate-both-text-and-image-as-output/132209]Model that can generate both text and image as output[/url] [/code] Имя пользователя владельца находится на отдельной странице темы в следующем фрагменте HTML: [code][url=/u/InsertOPUsername]InsertOPUsername[/url] [/code] [b]Что я пробовал:[/b] [list] [*]Я использовал driver.get(url) для перехода к отдельным страницам тем. [*]Я попытался найти имя пользователя с помощью WebDriverWait и правильного селектора CSS (span.first.username.new-user a). [*]Я успешно собираю с главной страницы другие данные (активность, просмотры и ответы), но не могу получить имя пользователя владельца со страницы темы. [/list]