Ошибка при извлечении информации о теннисном матче в реальном времени с веб-сайта bwin.it с помощью Python Selenium и BePython

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Ошибка при извлечении информации о теннисном матче в реальном времени с веб-сайта bwin.it с помощью Python Selenium и Be

Сообщение Anonymous »

Согласно заголовку, я хотел бы извлечь информацию о теннисных матчах в прямом эфире с этой веб-страницы https://sports.bwin.it/it/sports/live/tennis-5 с помощью Python Selenium и BeautifulSoup
с этим кодом работает хорошо, извлекает именно то, что я вижу на сайте

Код: Выделить всё

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pickle
import os
def setup_driver():
chrome_options = Options()
#chrome_options.add_argument("--headless")  # headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("enable-automation")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.maximize_window()
return driver
def load_cookies(driver, url):
# Create the "cookies" directory if it does not exist
cookies_dir = "cookies"
if not os.path.exists(cookies_dir):
os.makedirs(cookies_dir)

# Specify the full path to the cookie file
cookies_file = os.path.join(cookies_dir, "bwin.pkl")

driver.get(url)
if os.path.exists(cookies_file):
with open(cookies_file, "rb") as cookiesfile:
cookies = pickle.load(cookiesfile)
for cookie in cookies:
driver.add_cookie(cookie)
driver.refresh()
else:
close_popups(driver)
with open(cookies_file, "wb") as cookiesfile:
pickle.dump(driver.get_cookies(), cookiesfile)
def close_popups(driver):
try:
print("Attempting to close promotion popup")
popup_promotion = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#messages-with-overlay > div > vn-content-message > div >  span')))
popup_promotion.click()
except Exception as e:
print(f"No promotion popup found: {e}")

try:
print("Attempting to close cookie banner")
banner_cookie = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#onetrust-accept-btn-handler')))
banner_cookie.click()
except Exception as e:
print(f"No cookie banner found: {e}")
def get_page_source(url):
driver = setup_driver()
load_cookies(driver, url)

try:
print("Waiting for ms-event-group element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-event-group')))
except Exception as e:
print(f"Loading timeout or error: {e}")

try:
print("Waiting for ms-league-header element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-league-header')))
except Exception as e:
print(f"Alternative loading timeout or error: {e}")

page_source = driver.page_source
driver.quit()
return page_source

def remove_duplicate_characters(list):
return [elem[0] if len(elem) == 2 and elem[0] == elem[1] else elem for elem in list]

def scrape_bwin():
url = "https://sports.bwin.it/it/sports/live/tennis-5"
print("Fetching page source")
page_source = get_page_source(url)
soup = BeautifulSoup(page_source, 'html.parser')

tournaments_list = []
players_list = []
odds_player1_list = []
odds_player2_list = []
sets_list = []
games_list = []
points_player1_list = []
points_player2_list = []
event_status_list = []
current_set_list = []

tournaments = soup.select('ms-event-group')

for tournament in tournaments:
tournament_name = tournament.select_one('ms-league-header .title span')
if tournament_name:
tournament_name = tournament_name.text.strip()
else:
continue

events = tournament.select('ms-event')
for event in events:
event_status_elem = event.select_one('ms-event-detail > div > ms-event-info > i.live-icon')
event_status = "Live" if event_status_elem else "Non live"

if event_status != "Live":
continue

players = [elem.text.strip() for elem in event.select('ms-event-name ms-inline-tooltip div div div div')]

odds_elements = event.select('ms-option-group:nth-child(1) ms-option')
odds = [float(elem.text.strip()) for elem in odds_elements[:2] if
elem.text.strip().replace('.', '', 1).isdigit()]

if len(odds) < 2:
continue

sets = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.sets > div')])
games = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.games.divider > div')])
points_elements = event.select('ms-set-game-scoreboard div.column.points > div:nth-child(1) > div > div')
points = [elem.text.strip() for elem in points_elements]

if len(points) == 2:
points_player1 = points[0]
points_player2 = points[1]
else:
points_player1 = "N/A"
points_player2 = "N/A"

current_set_elem = event.select_one(
'ms-event-detail > div > ms-event-info > div > ms-event-timer >  ms-live-timer')
current_set = current_set_elem.text.strip() if current_set_elem else "N/A"

tournaments_list.append(tournament_name)
players_list.append(players)
odds_player1_list.append(odds[0])
odds_player2_list.append(odds[1])
sets_list.append(sets)
games_list.append(games)
points_player1_list.append(points_player1)
points_player2_list.append(points_player2)
event_status_list.append(event_status)
current_set_list.append(current_set)

df = pd.DataFrame({
'Tournaments': tournaments_list,
'Players': players_list,
'Odds Player 1': odds_player1_list,
'Odds Player 2': odds_player2_list,
'Sets': sets_list,
'Games': games_list,
'Points Player 1': points_player1_list,
'Points Player 2': points_player2_list,
'Event Status': event_status_list,
'Current Set': current_set_list
})

df.to_csv('live tennis matches.csv', index=False)
print(df)

scrape_bwin()
но если я раскомментирую эту строку, чтобы использовать ее в безголовом режиме

Код: Выделить всё

#chrome_options.add_argument("--headless")  # headless mode
и я запускаю этот код

Код: Выделить всё

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pickle
import os
def setup_driver():
chrome_options = Options()
chrome_options.add_argument("--headless")  # headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("enable-automation")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.maximize_window()
return driver
def load_cookies(driver, url):
# Create the "cookies" directory if it does not exist
cookies_dir = "cookies"
if not os.path.exists(cookies_dir):
os.makedirs(cookies_dir)

# Specify the full path to the cookie file
cookies_file = os.path.join(cookies_dir, "bwin.pkl")

driver.get(url)
if os.path.exists(cookies_file):
with open(cookies_file, "rb") as cookiesfile:
cookies = pickle.load(cookiesfile)
for cookie in cookies:
driver.add_cookie(cookie)
driver.refresh()
else:
close_popups(driver)
with open(cookies_file, "wb") as cookiesfile:
pickle.dump(driver.get_cookies(), cookiesfile)
def close_popups(driver):
try:
print("Attempting to close promotion popup")
popup_promotion = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#messages-with-overlay > div > vn-content-message > div >  span')))
popup_promotion.click()
except Exception as e:
print(f"No promotion popup found: {e}")

try:
print("Attempting to close cookie banner")
banner_cookie = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#onetrust-accept-btn-handler')))
banner_cookie.click()
except Exception as e:
print(f"No cookie banner found: {e}")
def get_page_source(url):
driver = setup_driver()
load_cookies(driver, url)

try:
print("Waiting for ms-event-group element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-event-group')))
except Exception as e:
print(f"Loading timeout or error: {e}")

try:
print("Waiting for ms-league-header element")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'ms-league-header')))
except Exception as e:
print(f"Alternative loading timeout or error: {e}")

page_source = driver.page_source
driver.quit()
return page_source

def remove_duplicate_characters(list):
return [elem[0] if len(elem) == 2 and elem[0] == elem[1] else elem for elem in list]

def scrape_bwin():
url = "https://sports.bwin.it/it/sports/live/tennis-5"
print("Fetching page source")
page_source = get_page_source(url)
soup = BeautifulSoup(page_source, 'html.parser')

tournaments_list = []
players_list = []
odds_player1_list = []
odds_player2_list = []
sets_list = []
games_list = []
points_player1_list = []
points_player2_list = []
event_status_list = []
current_set_list = []

tournaments = soup.select('ms-event-group')

for tournament in tournaments:
tournament_name = tournament.select_one('ms-league-header .title span')
if tournament_name:
tournament_name = tournament_name.text.strip()
else:
continue

events = tournament.select('ms-event')
for event in events:
event_status_elem = event.select_one('ms-event-detail > div > ms-event-info > i.live-icon')
event_status = "Live" if event_status_elem else "Non live"

if event_status != "Live":
continue

players = [elem.text.strip() for elem in event.select('ms-event-name ms-inline-tooltip div div div div')]

odds_elements = event.select('ms-option-group:nth-child(1) ms-option')
odds = [float(elem.text.strip()) for elem in odds_elements[:2] if
elem.text.strip().replace('.', '', 1).isdigit()]

if len(odds) < 2:
continue

sets = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.sets > div')])
games = remove_duplicate_characters(
[elem.text.strip() for elem in event.select('ms-set-game-scoreboard div.column.games.divider > div')])
points_elements = event.select('ms-set-game-scoreboard div.column.points > div:nth-child(1) > div > div')
points = [elem.text.strip() for elem in points_elements]

if len(points) == 2:
points_player1 = points[0]
points_player2 = points[1]
else:
points_player1 = "N/A"
points_player2 = "N/A"

current_set_elem = event.select_one(
'ms-event-detail > div > ms-event-info > div > ms-event-timer >  ms-live-timer')
current_set = current_set_elem.text.strip() if current_set_elem else "N/A"

tournaments_list.append(tournament_name)
players_list.append(players)
odds_player1_list.append(odds[0])
odds_player2_list.append(odds[1])
sets_list.append(sets)
games_list.append(games)
points_player1_list.append(points_player1)
points_player2_list.append(points_player2)
event_status_list.append(event_status)
current_set_list.append(current_set)

df = pd.DataFrame({
'Tournaments': tournaments_list,
'Players': players_list,
'Odds Player 1': odds_player1_list,
'Odds Player 2': odds_player2_list,
'Sets': sets_list,
'Games': games_list,
'Points Player 1': points_player1_list,
'Points Player 2': points_player2_list,
'Event Status': event_status_list,
'Current Set': current_set_list
})

df.to_csv('live tennis matches.csv', index=False)
print(df)

scrape_bwin()
возвращает этот ввод

Код: Выделить всё

Fetching page source
Attempting to close promotion popup
No promotion popup found: Message:
Stacktrace:
0   chromedriver                        0x00000001033ef0e8 chromedriver + 5169384
1   chromedriver                        0x00000001033e6fba chromedriver + 5136314
2   chromedriver                        0x0000000102f6336c chromedriver + 402284
3   chromedriver                        0x0000000102fb0740 chromedriver + 718656
4   chromedriver                        0x0000000102fb0a01 chromedriver + 719361
5   chromedriver                        0x0000000102ff5bc4 chromedriver + 1002436
6   chromedriver                        0x0000000102fd3add chromedriver + 862941
7   chromedriver                        0x0000000102ff2f57 chromedriver + 991063
8   chromedriver                        0x0000000102fd3853 chromedriver + 862291
9   chromedriver                        0x0000000102fa35c6 chromedriver + 665030
10  chromedriver                        0x0000000102fa3e4e chromedriver + 667214
11  chromedriver                        0x00000001033b1d00 chromedriver + 4918528
12  chromedriver                        0x00000001033b6cfd chromedriver + 4939005
13  chromedriver                        0x00000001033b73d5 chromedriver + 4940757
14  chromedriver                        0x0000000103392de4 chromedriver + 4791780
15  chromedriver                        0x00000001033b76c9 chromedriver + 4941513
16  chromedriver                        0x00000001033845b4 chromedriver + 4732340
17  chromedriver                        0x00000001033d7898 chromedriver + 5073048
18  chromedriver                        0x00000001033d7a57 chromedriver + 5073495
19  chromedriver                        0x00000001033e6b6e chromedriver + 5135214
20  libsystem_pthread.dylib             0x00007ff81532618b _pthread_start + 99
21  libsystem_pthread.dylib             0x00007ff815321ae3 thread_start + 15

Attempting to close cookie banner
No cookie banner found: Message:
Stacktrace:
0   chromedriver                        0x00000001033ef0e8 chromedriver + 5169384
1   chromedriver                        0x00000001033e6fba chromedriver + 5136314
2   chromedriver                        0x0000000102f6336c chromedriver + 402284
3   chromedriver                        0x0000000102fb0740 chromedriver + 718656
4   chromedriver                        0x0000000102fb0a01 chromedriver + 719361
5   chromedriver                        0x0000000102ff5bc4 chromedriver + 1002436
6   chromedriver                        0x0000000102fd3add chromedriver + 862941
7   chromedriver                        0x0000000102ff2f57 chromedriver + 991063
8   chromedriver                        0x0000000102fd3853 chromedriver + 862291
9   chromedriver                        0x0000000102fa35c6 chromedriver + 665030
10  chromedriver                        0x0000000102fa3e4e chromedriver + 667214
11  chromedriver                        0x00000001033b1d00 chromedriver + 4918528
12  chromedriver                        0x00000001033b6cfd chromedriver + 4939005
13  chromedriver                        0x00000001033b73d5 chromedriver + 4940757
14  chromedriver                        0x0000000103392de4 chromedriver + 4791780
15  chromedriver                        0x00000001033b76c9 chromedriver + 4941513
16  chromedriver                        0x00000001033845b4 chromedriver + 4732340
17  chromedriver                        0x00000001033d7898 chromedriver + 5073048
18  chromedriver                        0x00000001033d7a57 chromedriver + 5073495
19  chromedriver                        0x00000001033e6b6e chromedriver + 5135214
20  libsystem_pthread.dylib             0x00007ff81532618b _pthread_start + 99
21  libsystem_pthread.dylib             0x00007ff815321ae3 thread_start + 15

Waiting for ms-event-group element
Loading timeout or error: Message:
Stacktrace:
0   chromedriver                        0x00000001033ef0e8 chromedriver + 5169384
1   chromedriver                        0x00000001033e6fba chromedriver + 5136314
2   chromedriver                        0x0000000102f6336c chromedriver + 402284
3   chromedriver                        0x0000000102fb0740 chromedriver + 718656
4   chromedriver                        0x0000000102fb0a01 chromedriver + 719361
5   chromedriver                        0x0000000102ff5bc4 chromedriver + 1002436
6   chromedriver                        0x0000000102fd3add chromedriver + 862941
7   chromedriver                        0x0000000102ff2f57 chromedriver + 991063
8   chromedriver                        0x0000000102fd3853 chromedriver + 862291
9   chromedriver                        0x0000000102fa35c6 chromedriver + 665030
10  chromedriver                        0x0000000102fa3e4e chromedriver + 667214
11  chromedriver                        0x00000001033b1d00 chromedriver + 4918528
12  chromedriver                        0x00000001033b6cfd chromedriver + 4939005
13  chromedriver                        0x00000001033b73d5 chromedriver + 4940757
14  chromedriver                        0x0000000103392de4 chromedriver + 4791780
15  chromedriver                        0x00000001033b76c9 chromedriver + 4941513
16  chromedriver                        0x00000001033845b4 chromedriver + 4732340
17  chromedriver                        0x00000001033d7898 chromedriver + 5073048
18  chromedriver                        0x00000001033d7a57 chromedriver + 5073495
19  chromedriver                        0x00000001033e6b6e chromedriver + 5135214
20  libsystem_pthread.dylib             0x00007ff81532618b _pthread_start + 99
21  libsystem_pthread.dylib             0x00007ff815321ae3 thread_start + 15

Waiting for ms-league-header element
Alternative loading timeout or error: Message:
Stacktrace:
0   chromedriver                        0x00000001033ef0e8 chromedriver + 5169384
1   chromedriver                        0x00000001033e6fba chromedriver + 5136314
2   chromedriver                        0x0000000102f6336c chromedriver + 402284
3   chromedriver                        0x0000000102fb0740 chromedriver + 718656
4   chromedriver                        0x0000000102fb0a01 chromedriver + 719361
5   chromedriver                        0x0000000102ff5bc4 chromedriver + 1002436
6   chromedriver                        0x0000000102fd3add chromedriver + 862941
7   chromedriver                        0x0000000102ff2f57 chromedriver + 991063
8   chromedriver                        0x0000000102fd3853 chromedriver + 862291
9   chromedriver                        0x0000000102fa35c6 chromedriver + 665030
10  chromedriver                        0x0000000102fa3e4e chromedriver + 667214
11  chromedriver                        0x00000001033b1d00 chromedriver + 4918528
12  chromedriver                        0x00000001033b6cfd chromedriver + 4939005
13  chromedriver                        0x00000001033b73d5 chromedriver + 4940757
14  chromedriver                        0x0000000103392de4 chromedriver + 4791780
15  chromedriver                        0x00000001033b76c9 chromedriver + 4941513
16  chromedriver                        0x00000001033845b4 chromedriver + 4732340
17  chromedriver                        0x00000001033d7898 chromedriver + 5073048
18  chromedriver                        0x00000001033d7a57 chromedriver + 5073495
19  chromedriver                        0x00000001033e6b6e chromedriver + 5135214
20  libsystem_pthread.dylib             0x00007ff81532618b _pthread_start + 99
21  libsystem_pthread.dylib             0x00007ff815321ae3 thread_start + 15

Empty DataFrame
Columns: [Tournaments, Players, Odds Player 1, Odds Player 2, Sets, Games, Points Player 1, Points Player 2, Event Status, Current Set]
Index: []

Process finished with exit code 0

Почему я не могу получить тот же результат в безголовом режиме? как мне это решить?

Подробнее здесь: https://stackoverflow.com/questions/786 ... bsite-with
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение
  • Ключ подписи кода Xamarin IOS не найден в ключевой матче
    Anonymous » » в форуме IOS
    0 Ответы
    11 Просмотры
    Последнее сообщение Anonymous
  • Как проверить контент на сертификат. P12 без установки на ключевой матче или доступа к Apple Portal
    Anonymous » » в форуме IOS
    0 Ответы
    12 Просмотры
    Последнее сообщение Anonymous
  • Используя str_replace, чтобы он действовал только на первом матче?
    Anonymous » » в форуме Php
    0 Ответы
    2 Просмотры
    Последнее сообщение Anonymous
  • Веб-API для извлечения информации с веб-сайта [закрыто]
    Anonymous » » в форуме Python
    0 Ответы
    16 Просмотры
    Последнее сообщение Anonymous
  • Веб-API для извлечения информации с веб-сайта [закрыто]
    Anonymous » » в форуме Jquery
    0 Ответы
    19 Просмотры
    Последнее сообщение Anonymous

Вернуться в «Python»