Dockerized скрипт Selenium зависает в месте расположения элемента

Dockerized скрипт Selenium зависает в месте расположения элемента ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Dockerized скрипт Selenium зависает в месте расположения элемента

Цитата

Сообщение Anonymous » 06 ноя 2024, 12:40

Я пытаюсь разместить свой парсер в облаке, используя файл докеров, так как не могу установить среду рабочего стола на свой vps. Я не могу найти примеров для seleniumbase, и для работы обхода капчи Cloudflare он должен работать в режиме headful.
У меня все работает, но скрипт случайно зависает, даже при выключении обработчик сигнала не работает, когда я нажимаю Ctrl+C, и я заметил, что он зависает на всем, что имеет какое-то отношение к элементу, расположенному на странице.
Мой файл dockerfile:
# Use the official Python image
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV DISPLAY=:99

# Install system dependencies for Chrome and Xvfb
RUN apt-get update && apt-get install -y \
wget \
gnupg2 \
curl \
unzip \
libxi6 \
libgconf-2-4 \
libnss3 \
libxss1 \
fonts-liberation \
libappindicator3-1 \
x11-utils \
xvfb \
python3-tk \
python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install Google Chrome Stable and fonts
RUN apt-get update && apt-get install curl gnupg -y \
&& curl --location --silent https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install google-chrome-stable -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app
RUN mkdir /app/misc

# Copy the requirements file and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application code
COPY . .

# Start Xvfb and then run the script
CMD ["sh", "-c", "Xvfb :99 -screen 0 1920x1080x24 & python private/main.py"]

Функция парсинга (URL-адрес я удалил из соображений конфиденциальности):
async def fetch_token_hrefs():
"""
A generator to fetch and yield the first three tokens from front page.
This function will loop indefinitely to allow refreshing the page for new tokens,
and restart the driver every 10 minutes.
"""
driver = None
last_restart_time = time.time()
try:
while True: # Loop to refresh the page for new tokens
# Restart the driver every 10 minutes
current_time = time.time()
if driver is None or (current_time - last_restart_time) > 300:
if driver:
driver.quit()
driver = Driver(
binary_location="/usr/bin/google-chrome",
undetected=True, browser='chrome', no_sandbox=True,
agent=random.choice(user_agents),
do_not_track=True, headless=False,
#disable_gpu=True
)
driver.set_window_size(600, 600)
last_restart_time = current_time

start_time = time.time()
try:
# Refresh the page and click the captcha
driver.uc_open_with_reconnect()
# Check for Cloudflare captcha and click it if detected
if "Cloudflare" in driver.page_source:
logging.info("Cloudflare captcha detected. Attempting to click the captcha.")
driver.uc_gui_click_captcha()
logging.info("Passed the cloudflare captcha")
# Fetch the first three tokens from the page
tokens = []
WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.custom-p8lifi'))
)
for i in range(1, 4):
try:
token_link = driver.find_element(By.CSS_SELECTOR, f'a.custom-p8lifi:nth-child({i})')
if token_link is None:
raise ValueError(f"Token link for token {i} is None")

token_name, amount = parse_token_info(token_link.text)
if token_name is None or amount is None:
raise ValueError(f"Token name or amount for token {i} is None")

href = token_link.get_attribute('href')
if href is None:
raise ValueError(f"Href for token {i} is None")

tokens.append({
'name': sanitize_token_name(token_name),
'amount': amount,
'url': href,
})
logging.info("Fetched token %s: %s", i, token_name)

except Exception as e:
logging.warning("Could not fetch token %s: %s", i, str(e))
#await bot.send_message(CHANNEL_ID, f"

Token Fetching Error at {current_time}: \n{str(e)}")

end_time = time.time()
elapsed_time = end_time - start_time
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logging.info("Elapsed time: %s seconds", elapsed_time)

# Save the elapsed time to a file for later visualization
with open("token_fetch_times.txt", "a", encoding="utf-8") as f:
f.write(f"{timestamp},{elapsed_time}\n")

yield tokens, elapsed_time

# Refresh the page instead of quitting the driver
driver.refresh()

# Wait a bit before the next fetch to control the frequency of fetching
await asyncio.sleep(random.randint(1, 2))

except Exception as e:
logging.error("Error fetching tokens: %s", e)
driver.refresh()
#await bot.send_message(CHANNEL_ID, f"

Token Fetching Error at {current_time}: \n{str(e)}")

except Exception as e:
logging.error("Failed to initialize driver: %s", str(e))
await bot.send_message(CHANNEL_ID, f"

Driver Initialization Error: {str(e)}")
if driver:
driver.quit()

Подробнее здесь: https://stackoverflow.com/questions/791 ... t-location

1730886002

Anonymous

Я пытаюсь разместить свой парсер в облаке, используя файл докеров, так как не могу установить среду рабочего стола на свой vps. Я не могу найти примеров для seleniumbase, и для работы обхода капчи Cloudflare он должен работать в режиме headful.
У меня все работает, но скрипт случайно зависает, даже при выключении обработчик сигнала не работает, когда я нажимаю Ctrl+C, и я заметил, что он зависает на всем, что имеет какое-то отношение к элементу, расположенному на странице.
Мой файл dockerfile:
# Use the official Python image
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV DISPLAY=:99

# Install system dependencies for Chrome and Xvfb
RUN apt-get update && apt-get install -y \
wget \
gnupg2 \
curl \
unzip \
libxi6 \
libgconf-2-4 \
libnss3 \
libxss1 \
fonts-liberation \
libappindicator3-1 \
x11-utils \
xvfb \
python3-tk \
python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install Google Chrome Stable and fonts
RUN apt-get update && apt-get install curl gnupg -y \
&& curl --location --silent https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install google-chrome-stable -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app
RUN mkdir /app/misc

# Copy the requirements file and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application code
COPY . .

# Start Xvfb and then run the script
CMD ["sh", "-c", "Xvfb :99 -screen 0 1920x1080x24 & python private/main.py"]

Функция парсинга (URL-адрес я удалил из соображений конфиденциальности):
async def fetch_token_hrefs():
"""
A generator to fetch and yield the first three tokens from front page.
This function will loop indefinitely to allow refreshing the page for new tokens,
and restart the driver every 10 minutes.
"""
driver = None
last_restart_time = time.time()
try:
while True:  # Loop to refresh the page for new tokens
# Restart the driver every 10 minutes
current_time = time.time()
if driver is None or (current_time - last_restart_time) > 300:
if driver:
driver.quit()
driver = Driver(
binary_location="/usr/bin/google-chrome",
undetected=True, browser='chrome', no_sandbox=True,
agent=random.choice(user_agents),
do_not_track=True, headless=False,
#disable_gpu=True
)
driver.set_window_size(600, 600)
last_restart_time = current_time

start_time = time.time()
try:
# Refresh the page and click the captcha
driver.uc_open_with_reconnect()
# Check for Cloudflare captcha and click it if detected
if "Cloudflare" in driver.page_source:
logging.info("Cloudflare captcha detected.  Attempting to click the captcha.")
driver.uc_gui_click_captcha()
logging.info("Passed the cloudflare captcha")
# Fetch the first three tokens from the page
tokens = []
WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.custom-p8lifi'))
)
for i in range(1, 4):
try:
token_link = driver.find_element(By.CSS_SELECTOR, f'a.custom-p8lifi:nth-child({i})')
if token_link is None:
raise ValueError(f"Token link for token {i} is None")

token_name, amount = parse_token_info(token_link.text)
if token_name is None or amount is None:
raise ValueError(f"Token name or amount for token {i} is None")

href = token_link.get_attribute('href')
if href is None:
raise ValueError(f"Href for token {i} is None")

tokens.append({
'name': sanitize_token_name(token_name),
'amount': amount,
'url': href,
})
logging.info("Fetched token %s: %s", i, token_name)

except Exception as e:
logging.warning("Could not fetch token %s: %s", i, str(e))
#await bot.send_message(CHANNEL_ID, f"🚨Token Fetching Error at {current_time}: \n{str(e)}")

end_time = time.time()
elapsed_time = end_time - start_time
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logging.info("Elapsed time: %s seconds", elapsed_time)

# Save the elapsed time to a file for later visualization
with open("token_fetch_times.txt", "a", encoding="utf-8") as f:
f.write(f"{timestamp},{elapsed_time}\n")

yield tokens, elapsed_time

# Refresh the page instead of quitting the driver
driver.refresh()

# Wait a bit before the next fetch to control the frequency of fetching
await asyncio.sleep(random.randint(1, 2))

except Exception as e:
logging.error("Error fetching tokens: %s", e)
driver.refresh()
#await bot.send_message(CHANNEL_ID, f"🚨Token Fetching Error at {current_time}: \n{str(e)}")

except Exception as e:
logging.error("Failed to initialize driver: %s", str(e))
await bot.send_message(CHANNEL_ID, f"🚨 Driver Initialization Error: {str(e)}")
if driver:
driver.quit()
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79161156/dockerized-selenium-script-freezes-on-element-location[/url]