Очистка/сканирование веб-сайта с несколькими вкладками с использованием Python

Очистка/сканирование веб-сайта с несколькими вкладками с использованием Python ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Очистка/сканирование веб-сайта с несколькими вкладками с использованием Python

Цитата

Сообщение Anonymous » 27 окт 2024, 11:05

Мне нужна помощь в извлечении данных с веб-сайта с несколькими вкладками и сохранении их в формате .csv с помощью Python и Selenium. Веб-сайт, о котором идет речь: https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details.
На странице есть пять разных вкладок, но я сосредоточен на извлечении данных из первых трех вкладок.
1-я вкладка:
[img]https://i.sstatic.net/CbjX93Hr.png[/img]

2-я вкладка:
[img]https://i.sstatic.net/zgGHFp5n.png[/img]

3-я вкладка:
[img]https://i.sstatic.net/UxWbrQED.png[/img]

Кроме того, есть еще две вкладки: одна представляет «ВСЕ», а другая — «дату». Мне нужно получить данные для всех комбинаций первых трех вкладок, оставив выбранной вкладку «ВСЕ» и установив текущую дату.
Я пытался выполнить эту операцию с помощью Selenium, но из-за моего ограниченного опыта работы с этим инструментом мне не удалось достичь желаемого результата. Поэтому мне нужен совет, как действовать дальше.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random

def wait_for_element(driver, by, value, timeout=10):
    """Wait for the element located by ``(by, value)`` and return it.

    Args:
        driver: WebDriver instance handed to ``WebDriverWait``.
        by: Locator strategy (the callers in this script pass ``By.ID``).
        value: Locator value looked up under ``by``.
        timeout: Wait limit passed to ``WebDriverWait`` (default 10).

    Returns:
        The result of ``WebDriverWait.until`` for the
        ``presence_of_element_located`` condition.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, value)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    """Pick one option in each dropdown, submit, and save the result table as CSV.

    Args:
        driver: WebDriver instance with the target page already loaded.
        end_type: Option value selected in the ``end-type`` dropdown.
        equity_type: Option value selected in the ``equity-type`` dropdown.
        cap_type: Option value selected in the ``cap-type`` dropdown.
        all_type: Option value selected in the ``all-type`` dropdown.
        filename: Path of the CSV file to write.

    Raises:
        Whatever Selenium or pandas raise when an element is not found in
        time, an option value does not exist, or no table can be parsed.

    NOTE(review): the element IDs used here look like placeholders (the
    calling script marks its option values "Replace with actual values");
    confirm them against the real page markup.
    """
    # Local import keeps this function self-contained.
    from io import StringIO

    # Select options from dropdowns, pausing briefly after each selection.
    dropdown_choices = (
        ("end-type", end_type),
        ("equity-type", equity_type),
        ("cap-type", cap_type),
        ("all-type", all_type),
    )
    for dropdown_id, option_value in dropdown_choices:
        dropdown = Select(wait_for_element(driver, By.ID, dropdown_id))
        dropdown.select_by_value(option_value)
        time.sleep(random.uniform(1, 2))

    # Click "Go" button. Presence in the DOM does not guarantee the button
    # can be clicked yet, so wait for it to become clickable.
    go_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "go-button"))
    )
    go_button.click()

    # Wait for table to load.
    # NOTE(review): if a table from the previous combination is still in the
    # DOM, this wait may return before the new results arrive — verify how
    # the page refreshes the table and add an explicit wait if needed.
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)

    # Extract table data. Passing literal HTML to read_html is deprecated,
    # so wrap the markup in a file-like object.
    table_html = table.get_attribute('outerHTML')
    df = pd.read_html(StringIO(table_html))[0]

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Scrape every dropdown combination into its own CSV file.
# Imported here so this script section is self-contained.
from itertools import product

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Make sure you have chromedriver installed and in PATH
try:
    driver.get(
        "https://www.amfiindia.com/research-information/other-data/"
        "mf-scheme-performance-details"
    )  # Replace with actual URL

    # Wait for initial page load
    wait_for_element(driver, By.ID, "end-type", timeout=30)
    print("Page loaded successfully")

    # Define options for each dropdown
    end_types = ["1", "2"]  # Open-ended, Closed-end
    equity_types = ["1", "2", "3", "4", "5", "6"]  # Replace with actual values
    cap_types = ["1", "2", "3", "4"]  # Replace with actual values
    all_types = ["1", "2", "3", "4"]  # Replace with actual values

    # Iterate through combinations; product() yields them in the same order
    # as four nested loops would.
    for end, equity, cap, all_type in product(
        end_types, equity_types, cap_types, all_types
    ):
        filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
        try:
            scrape_and_save(driver, end, equity, cap, all_type, filename)
        except Exception as e:
            # Best-effort: report the failed combination and keep going.
            print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")
        else:
            time.sleep(random.uniform(3, 5))  # Random wait between 3 to 5 seconds
finally:
    # Always close the browser, even if the initial page load fails.
    driver.quit()

Подробнее здесь: https://stackoverflow.com/questions/79089644/scraping-crawling-a-website-with-multiple-tabs-using-python

1730016304

Anonymous

Мне нужна помощь в извлечении данных с веб-сайта с несколькими вкладками и сохранении их в формате .csv с помощью Python и Selenium. Веб-сайт, о котором идет речь: https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details.
На странице есть пять разных вкладок, но я сосредоточен на извлечении данных из первых трех вкладок.
1-я вкладка:
[img]https://i.sstatic.net/CbjX93Hr.png[/img]

2-я вкладка:
[img]https://i.sstatic.net/zgGHFp5n.png[/img]

3-я вкладка:
[img]https://i.sstatic.net/UxWbrQED.png[/img]

Кроме того, есть еще две вкладки: одна представляет «ВСЕ», а другая — «дату». Мне нужно получить данные для всех комбинаций первых трех вкладок, оставив выбранной вкладку «ВСЕ» и установив текущую дату.
Я пытался выполнить эту операцию с помощью Selenium, но из-за моего ограниченного опыта работы с этим инструментом мне не удалось достичь желаемого результата. Поэтому мне нужен совет, как действовать дальше.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random

def wait_for_element(driver, by, value, timeout=10):
    """Wait for the element located by ``(by, value)`` and return it.

    Args:
        driver: WebDriver instance handed to ``WebDriverWait``.
        by: Locator strategy (the callers in this script pass ``By.ID``).
        value: Locator value looked up under ``by``.
        timeout: Wait limit passed to ``WebDriverWait`` (default 10).

    Returns:
        The result of ``WebDriverWait.until`` for the
        ``presence_of_element_located`` condition.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, value)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    """Pick one option in each dropdown, submit, and save the result table as CSV.

    Args:
        driver: WebDriver instance with the target page already loaded.
        end_type: Option value selected in the ``end-type`` dropdown.
        equity_type: Option value selected in the ``equity-type`` dropdown.
        cap_type: Option value selected in the ``cap-type`` dropdown.
        all_type: Option value selected in the ``all-type`` dropdown.
        filename: Path of the CSV file to write.

    Raises:
        Whatever Selenium or pandas raise when an element is not found in
        time, an option value does not exist, or no table can be parsed.

    NOTE(review): the element IDs used here look like placeholders (the
    calling script marks its option values "Replace with actual values");
    confirm them against the real page markup.
    """
    # Local import keeps this function self-contained.
    from io import StringIO

    # Select options from dropdowns, pausing briefly after each selection.
    dropdown_choices = (
        ("end-type", end_type),
        ("equity-type", equity_type),
        ("cap-type", cap_type),
        ("all-type", all_type),
    )
    for dropdown_id, option_value in dropdown_choices:
        dropdown = Select(wait_for_element(driver, By.ID, dropdown_id))
        dropdown.select_by_value(option_value)
        time.sleep(random.uniform(1, 2))

    # Click "Go" button. Presence in the DOM does not guarantee the button
    # can be clicked yet, so wait for it to become clickable.
    go_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "go-button"))
    )
    go_button.click()

    # Wait for table to load.
    # NOTE(review): if a table from the previous combination is still in the
    # DOM, this wait may return before the new results arrive — verify how
    # the page refreshes the table and add an explicit wait if needed.
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)

    # Extract table data. Passing literal HTML to read_html is deprecated,
    # so wrap the markup in a file-like object.
    table_html = table.get_attribute('outerHTML')
    df = pd.read_html(StringIO(table_html))[0]

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Scrape every dropdown combination into its own CSV file.
# Imported here so this script section is self-contained.
from itertools import product

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Make sure you have chromedriver installed and in PATH
try:
    driver.get("https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details")  # Replace with actual URL

    # Wait for initial page load
    wait_for_element(driver, By.ID, "end-type", timeout=30)
    print("Page loaded successfully")

    # Define options for each dropdown
    end_types = ["1", "2"]  # Open-ended, Closed-end
    equity_types = ["1", "2", "3", "4", "5", "6"]  # Replace with actual values
    cap_types = ["1", "2", "3", "4"]  # Replace with actual values
    all_types = ["1", "2", "3", "4"]  # Replace with actual values

    # Iterate through combinations; product() yields them in the same order
    # as four nested loops would.
    for end, equity, cap, all_type in product(
        end_types, equity_types, cap_types, all_types
    ):
        filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
        try:
            scrape_and_save(driver, end, equity, cap, all_type, filename)
        except Exception as e:
            # Best-effort: report the failed combination and keep going.
            print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")
        else:
            time.sleep(random.uniform(3, 5))  # Random wait between 3 to 5 seconds
finally:
    # Always close the browser, even if the initial page load fails.
    driver.quit()
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79089644/scraping-crawling-a-website-with-multiple-tabs-using-python[/url]