с этим кодом работает хорошо, извлекает именно то, что я вижу на сайте
Код: Выделить всё
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pickle
import os
def setup_driver():
chrome_options = Options()
#chrome_options.add_argument("--headless") # headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("enable-automation")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.maximize_window()
return driver
def load_cookies(driver, url):
# Create the "cookies" directory if it does not exist
cookies_dir = "cookies"
if not os.path.exists(cookies_dir):
os.makedirs(cookies_dir)
# Specify the full path to the cookie file
cookies_file = os.path.join(cookies_dir, "bwin.pkl")
driver.get(url)
if os.path.exists(cookies_file):
with open(cookies_file, "rb") as cookiesfile:
cookies = pickle.load(cookiesfile)
for cookie in cookies:
driver.add_cookie(cookie)
driver.refresh()
else:
close_popups(driver)
with open(cookies_file, "wb") as cookiesfile:
pickle.dump(driver.get_cookies(), cookiesfile)
def close_popups(driver):
try:
print("Attempting to close promotion popup")
popup_promotion = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#messages-with-overlay > div > vn-content-message > div > span')))
popup_promotion.click()
except Exception as e:
print(f"No promotion popup found: {e}")
try:
print("Attempting to close cookie banner")
banner_cookie = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#onetrust-accept-btn-handler')))
banner_cookie.click()
except Exception as e:
print(f"No cookie banner found: {e}")
def get_page_source(url):
driver = setup_driver()
load_cookies(driver, url)
try:
print("Waiting for ms-event-group element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-event-group')))
except Exception as e:
print(f"Loading timeout or error: {e}")
try:
print("Waiting for ms-league-header element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-league-header')))
except Exception as e:
print(f"Alternative loading timeout or error: {e}")
page_source = driver.page_source
driver.quit()
return page_source
def remove_duplicate_characters(list):
return [elem[0] if len(elem) == 2 and elem[0] == elem[1] else elem for elem in list]
def scrape_bwin():
url = "https://sports.bwin.it/it/sports/live/tennis-5"
print("Fetching page source")
page_source = get_page_source(url)
soup = BeautifulSoup(page_source, 'html.parser')
tournaments_list = []
players_list = []
odds_player1_list = []
odds_player2_list = []
sets_list = []
games_list = []
points_player1_list = []
points_player2_list = []
event_status_list = []
current_set_list = []
tournaments = soup.select('ms-event-group')
for tournament in tournaments:
tournament_name = tournament.select_one('ms-league-header .title span')
if tournament_name:
tournament_name = tournament_name.text.strip()
else:
continue
events = tournament.select('ms-event')
for event in events:
event_status_elem = event.select_one('ms-event-detail > div > ms-event-info > i.live-icon')
event_status = "Live" if event_status_elem else "Non live"
if event_status != "Live":
continue
players = [elem.text.strip() for elem in event.select('ms-event-name ms-inline-tooltip div div div div')]
odds_elements = event.select('ms-option-group:nth-child(1) ms-option')
odds = [float(elem.text.strip()) for elem in odds_elements[:2] if
elem.text.strip().replace('.', '', 1).isdigit()]
if len(odds) < 2:
continue
sets = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.sets > div')])
games = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.games.divider > div')])
points_elements = event.select('ms-set-game-scoreboard div.column.points > div:nth-child(1) > div > div')
points = [elem.text.strip() for elem in points_elements]
if len(points) == 2:
points_player1 = points[0]
points_player2 = points[1]
else:
points_player1 = "N/A"
points_player2 = "N/A"
current_set_elem = event.select_one(
'ms-event-detail > div > ms-event-info > div > ms-event-timer > ms-live-timer')
current_set = current_set_elem.text.strip() if current_set_elem else "N/A"
tournaments_list.append(tournament_name)
players_list.append(players)
odds_player1_list.append(odds[0])
odds_player2_list.append(odds[1])
sets_list.append(sets)
games_list.append(games)
points_player1_list.append(points_player1)
points_player2_list.append(points_player2)
event_status_list.append(event_status)
current_set_list.append(current_set)
df = pd.DataFrame({
'Tournaments': tournaments_list,
'Players': players_list,
'Odds Player 1': odds_player1_list,
'Odds Player 2': odds_player2_list,
'Sets': sets_list,
'Games': games_list,
'Points Player 1': points_player1_list,
'Points Player 2': points_player2_list,
'Event Status': event_status_list,
'Current Set': current_set_list
})
df.to_csv('live tennis matches.csv', index=False)
print(df)
scrape_bwin()
Код: Выделить всё
#chrome_options.add_argument("--headless") # headless mode
Код: Выделить всё
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pickle
import os
def setup_driver():
chrome_options = Options()
chrome_options.add_argument("--headless") # headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("enable-automation")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.maximize_window()
return driver
def load_cookies(driver, url):
# Create the "cookies" directory if it does not exist
cookies_dir = "cookies"
if not os.path.exists(cookies_dir):
os.makedirs(cookies_dir)
# Specify the full path to the cookie file
cookies_file = os.path.join(cookies_dir, "bwin.pkl")
driver.get(url)
if os.path.exists(cookies_file):
with open(cookies_file, "rb") as cookiesfile:
cookies = pickle.load(cookiesfile)
for cookie in cookies:
driver.add_cookie(cookie)
driver.refresh()
else:
close_popups(driver)
with open(cookies_file, "wb") as cookiesfile:
pickle.dump(driver.get_cookies(), cookiesfile)
def close_popups(driver):
try:
print("Attempting to close promotion popup")
popup_promotion = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#messages-with-overlay > div > vn-content-message > div > span')))
popup_promotion.click()
except Exception as e:
print(f"No promotion popup found: {e}")
try:
print("Attempting to close cookie banner")
banner_cookie = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#onetrust-accept-btn-handler')))
banner_cookie.click()
except Exception as e:
print(f"No cookie banner found: {e}")
def get_page_source(url):
driver = setup_driver()
load_cookies(driver, url)
try:
print("Waiting for ms-event-group element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-event-group')))
except Exception as e:
print(f"Loading timeout or error: {e}")
try:
print("Waiting for ms-league-header element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-league-header')))
except Exception as e:
print(f"Alternative loading timeout or error: {e}")
page_source = driver.page_source
driver.quit()
return page_source
def remove_duplicate_characters(list):
return [elem[0] if len(elem) == 2 and elem[0] == elem[1] else elem for elem in list]
def scrape_bwin():
url = "https://sports.bwin.it/it/sports/live/tennis-5"
print("Fetching page source")
page_source = get_page_source(url)
soup = BeautifulSoup(page_source, 'html.parser')
tournaments_list = []
players_list = []
odds_player1_list = []
odds_player2_list = []
sets_list = []
games_list = []
points_player1_list = []
points_player2_list = []
event_status_list = []
current_set_list = []
tournaments = soup.select('ms-event-group')
for tournament in tournaments:
tournament_name = tournament.select_one('ms-league-header .title span')
if tournament_name:
tournament_name = tournament_name.text.strip()
else:
continue
events = tournament.select('ms-event')
for event in events:
event_status_elem = event.select_one('ms-event-detail > div > ms-event-info > i.live-icon')
event_status = "Live" if event_status_elem else "Non live"
if event_status != "Live":
continue
players = [elem.text.strip() for elem in event.select('ms-event-name ms-inline-tooltip div div div div')]
odds_elements = event.select('ms-option-group:nth-child(1) ms-option')
odds = [float(elem.text.strip()) for elem in odds_elements[:2] if
elem.text.strip().replace('.', '', 1).isdigit()]
if len(odds) < 2:
continue
sets = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.sets > div')])
games = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.games.divider > div')])
points_elements = event.select('ms-set-game-scoreboard div.column.points > div:nth-child(1) > div > div')
points = [elem.text.strip() for elem in points_elements]
if len(points) == 2:
points_player1 = points[0]
points_player2 = points[1]
else:
points_player1 = "N/A"
points_player2 = "N/A"
current_set_elem = event.select_one(
'ms-event-detail > div > ms-event-info > div > ms-event-timer > ms-live-timer')
current_set = current_set_elem.text.strip() if current_set_elem else "N/A"
tournaments_list.append(tournament_name)
players_list.append(players)
odds_player1_list.append(odds[0])
odds_player2_list.append(odds[1])
sets_list.append(sets)
games_list.append(games)
points_player1_list.append(points_player1)
points_player2_list.append(points_player2)
event_status_list.append(event_status)
current_set_list.append(current_set)
df = pd.DataFrame({
'Tournaments': tournaments_list,
'Players': players_list,
'Odds Player 1': odds_player1_list,
'Odds Player 2': odds_player2_list,
'Sets': sets_list,
'Games': games_list,
'Points Player 1': points_player1_list,
'Points Player 2': points_player2_list,
'Event Status': event_status_list,
'Current Set': current_set_list
})
df.to_csv('live tennis matches.csv', index=False)
print(df)
scrape_bwin()
Код: Выделить всё
Fetching page source
Attempting to close promotion popup
No promotion popup found: Message:
Stacktrace:
0 chromedriver 0x00000001033ef0e8 chromedriver + 5169384
1 chromedriver 0x00000001033e6fba chromedriver + 5136314
2 chromedriver 0x0000000102f6336c chromedriver + 402284
3 chromedriver 0x0000000102fb0740 chromedriver + 718656
4 chromedriver 0x0000000102fb0a01 chromedriver + 719361
5 chromedriver 0x0000000102ff5bc4 chromedriver + 1002436
6 chromedriver 0x0000000102fd3add chromedriver + 862941
7 chromedriver 0x0000000102ff2f57 chromedriver + 991063
8 chromedriver 0x0000000102fd3853 chromedriver + 862291
9 chromedriver 0x0000000102fa35c6 chromedriver + 665030
10 chromedriver 0x0000000102fa3e4e chromedriver + 667214
11 chromedriver 0x00000001033b1d00 chromedriver + 4918528
12 chromedriver 0x00000001033b6cfd chromedriver + 4939005
13 chromedriver 0x00000001033b73d5 chromedriver + 4940757
14 chromedriver 0x0000000103392de4 chromedriver + 4791780
15 chromedriver 0x00000001033b76c9 chromedriver + 4941513
16 chromedriver 0x00000001033845b4 chromedriver + 4732340
17 chromedriver 0x00000001033d7898 chromedriver + 5073048
18 chromedriver 0x00000001033d7a57 chromedriver + 5073495
19 chromedriver 0x00000001033e6b6e chromedriver + 5135214
20 libsystem_pthread.dylib 0x00007ff81532618b _pthread_start + 99
21 libsystem_pthread.dylib 0x00007ff815321ae3 thread_start + 15
Attempting to close cookie banner
No cookie banner found: Message:
Stacktrace:
0 chromedriver 0x00000001033ef0e8 chromedriver + 5169384
1 chromedriver 0x00000001033e6fba chromedriver + 5136314
2 chromedriver 0x0000000102f6336c chromedriver + 402284
3 chromedriver 0x0000000102fb0740 chromedriver + 718656
4 chromedriver 0x0000000102fb0a01 chromedriver + 719361
5 chromedriver 0x0000000102ff5bc4 chromedriver + 1002436
6 chromedriver 0x0000000102fd3add chromedriver + 862941
7 chromedriver 0x0000000102ff2f57 chromedriver + 991063
8 chromedriver 0x0000000102fd3853 chromedriver + 862291
9 chromedriver 0x0000000102fa35c6 chromedriver + 665030
10 chromedriver 0x0000000102fa3e4e chromedriver + 667214
11 chromedriver 0x00000001033b1d00 chromedriver + 4918528
12 chromedriver 0x00000001033b6cfd chromedriver + 4939005
13 chromedriver 0x00000001033b73d5 chromedriver + 4940757
14 chromedriver 0x0000000103392de4 chromedriver + 4791780
15 chromedriver 0x00000001033b76c9 chromedriver + 4941513
16 chromedriver 0x00000001033845b4 chromedriver + 4732340
17 chromedriver 0x00000001033d7898 chromedriver + 5073048
18 chromedriver 0x00000001033d7a57 chromedriver + 5073495
19 chromedriver 0x00000001033e6b6e chromedriver + 5135214
20 libsystem_pthread.dylib 0x00007ff81532618b _pthread_start + 99
21 libsystem_pthread.dylib 0x00007ff815321ae3 thread_start + 15
Waiting for ms-event-group element
Loading timeout or error: Message:
Stacktrace:
0 chromedriver 0x00000001033ef0e8 chromedriver + 5169384
1 chromedriver 0x00000001033e6fba chromedriver + 5136314
2 chromedriver 0x0000000102f6336c chromedriver + 402284
3 chromedriver 0x0000000102fb0740 chromedriver + 718656
4 chromedriver 0x0000000102fb0a01 chromedriver + 719361
5 chromedriver 0x0000000102ff5bc4 chromedriver + 1002436
6 chromedriver 0x0000000102fd3add chromedriver + 862941
7 chromedriver 0x0000000102ff2f57 chromedriver + 991063
8 chromedriver 0x0000000102fd3853 chromedriver + 862291
9 chromedriver 0x0000000102fa35c6 chromedriver + 665030
10 chromedriver 0x0000000102fa3e4e chromedriver + 667214
11 chromedriver 0x00000001033b1d00 chromedriver + 4918528
12 chromedriver 0x00000001033b6cfd chromedriver + 4939005
13 chromedriver 0x00000001033b73d5 chromedriver + 4940757
14 chromedriver 0x0000000103392de4 chromedriver + 4791780
15 chromedriver 0x00000001033b76c9 chromedriver + 4941513
16 chromedriver 0x00000001033845b4 chromedriver + 4732340
17 chromedriver 0x00000001033d7898 chromedriver + 5073048
18 chromedriver 0x00000001033d7a57 chromedriver + 5073495
19 chromedriver 0x00000001033e6b6e chromedriver + 5135214
20 libsystem_pthread.dylib 0x00007ff81532618b _pthread_start + 99
21 libsystem_pthread.dylib 0x00007ff815321ae3 thread_start + 15
Waiting for ms-league-header element
Alternative loading timeout or error: Message:
Stacktrace:
0 chromedriver 0x00000001033ef0e8 chromedriver + 5169384
1 chromedriver 0x00000001033e6fba chromedriver + 5136314
2 chromedriver 0x0000000102f6336c chromedriver + 402284
3 chromedriver 0x0000000102fb0740 chromedriver + 718656
4 chromedriver 0x0000000102fb0a01 chromedriver + 719361
5 chromedriver 0x0000000102ff5bc4 chromedriver + 1002436
6 chromedriver 0x0000000102fd3add chromedriver + 862941
7 chromedriver 0x0000000102ff2f57 chromedriver + 991063
8 chromedriver 0x0000000102fd3853 chromedriver + 862291
9 chromedriver 0x0000000102fa35c6 chromedriver + 665030
10 chromedriver 0x0000000102fa3e4e chromedriver + 667214
11 chromedriver 0x00000001033b1d00 chromedriver + 4918528
12 chromedriver 0x00000001033b6cfd chromedriver + 4939005
13 chromedriver 0x00000001033b73d5 chromedriver + 4940757
14 chromedriver 0x0000000103392de4 chromedriver + 4791780
15 chromedriver 0x00000001033b76c9 chromedriver + 4941513
16 chromedriver 0x00000001033845b4 chromedriver + 4732340
17 chromedriver 0x00000001033d7898 chromedriver + 5073048
18 chromedriver 0x00000001033d7a57 chromedriver + 5073495
19 chromedriver 0x00000001033e6b6e chromedriver + 5135214
20 libsystem_pthread.dylib 0x00007ff81532618b _pthread_start + 99
21 libsystem_pthread.dylib 0x00007ff815321ae3 thread_start + 15
Empty DataFrame
Columns: [Tournaments, Players, Odds Player 1, Odds Player 2, Sets, Games, Points Player 1, Points Player 2, Event Status, Current Set]
Index: []
Process finished with exit code 0
Подробнее здесь: https://stackoverflow.com/questions/786 ... bsite-with