Перейдите на следующую страницу, очистите данные и вернитесь на первую страницу.Python

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Переход на следующую страницу, сбор данных (парсинг) и возврат на первую страницу.

Сообщение Anonymous »

При сборе данных (парсинге) я перехожу с первой страницы на вторую и нажимаю на первую ссылку. После возврата со страницы с подробностями парсер оказывается на первой странице вместо того, чтобы остаться на второй. Почему текущая страница не сохраняется? Как остаться на текущей странице?
import os
import time

from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from twocaptcha import TwoCaptcha
from webdriver_manager.chrome import ChromeDriverManager

# Storage: every scraped record goes into the ``report`` collection of the
# ``report_db`` database on the local MongoDB server.
client = MongoClient('localhost', 27017)
db = client['report_db']
collection = db['report']

# Browser: Chrome command-line switches applied before the driver starts.
chrome_options = Options()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--disable-infobars",
    "--disable-popup-blocking",
):
    chrome_options.add_argument(chrome_flag)

# ChromeDriverManager().install() returns the driver path handed to Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Load the search page.
# NOTE(review): the URL below contains " ... " and looks shortened by the
# forum software; restore the full address before running.
driver.get('https://www.abogacia.es/servicios-aboga ... -letrados/')
print('Website opened')

wait = WebDriverWait(driver, 10)

# Dismiss the cookie banner when its "allow all" button becomes clickable;
# any failure here is reported and otherwise ignored.
try:
    wait.until(
        EC.element_to_be_clickable(
            (By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll")
        )
    ).click()
except Exception:
    print("Cookies button not found or already accepted.")

# The search form is inside an iframe, so all later lookups must run
# within that frame.
search_frame = wait.until(
    EC.presence_of_element_located(
        (By.XPATH, '//iframe[contains(@src, "censo.abogacia.es")]')
    )
)
driver.switch_to.frame(search_frame)

# Type the search term into the query field.
search_box = wait.until(
    EC.presence_of_element_located((By.XPATH, '//input[@id="j_id23:j_id33"]'))
)
search_box.send_keys('a')

# Fixed pause before submitting.
# NOTE(review): presumably leaves time for the page (or a manual CAPTCHA
# step) to settle - confirm.
time.sleep(10)

# Submit the form; if the regular click fails for any reason, scroll the
# link into view and click it through JavaScript instead.
search_button = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "Buscar")]'))
)
try:
    search_button.click()
except Exception as click_error:
    print(f"Initial click failed: {click_error}")
    for fallback_script in (
        "arguments[0].scrollIntoView(true);",
        "arguments[0].click();",
    ):
        driver.execute_script(fallback_script, search_button)

print('Website form submitted within iframe')

def get_existing_names():
    """Return the set of ``first_name`` values already stored in MongoDB.

    Only documents that actually have a ``first_name`` field are queried, so
    a stray document without it cannot raise ``KeyError`` here. A set is
    returned for fast membership tests.
    """
    cursor = collection.find(
        {"first_name": {"$exists": True}},
        {"first_name": 1, "_id": 0},
    )
    return {entry["first_name"] for entry in cursor}

# Names already present in the database, loaded once at startup; the main
# loop consults this set to skip rows whose name is already stored.
existing_names = get_existing_names()

def insert_data_mongo(first_name, nombre, colegio, alta_colegiacion, n_colegiado, ejerciente, residente, direccion_profesional, telefono, fax):
    """Store one scraped record in the module-level MongoDB ``collection``.

    Each argument is written under its own document key. ``first_name`` is
    the name as shown in the results list and is the key the script uses to
    detect already-scraped entries.

    Insertion is best-effort: any error is printed, never raised, and the
    function always returns ``None``.
    """
    data = {
        "first_name": first_name,
        "Nombre": nombre,
        "Colegio": colegio,
        "Alta_Colegiacion": alta_colegiacion,
        "N_Colegiado": n_colegiado,
        "Ejerciente": ejerciente,
        "Residente": residente,
        "Direccion_Profesional": direccion_profesional,
        "Telefono": telefono,
        # Was "Fax:" (stray colon), unlike every other key. Documents
        # written before this fix still carry the old key name.
        "Fax": fax,
    }
    try:
        collection.insert_one(data)
    except Exception as e:
        print(f"Error inserting data: {e}")
    else:
        print("Inserted data into MongoDB successfully")

def extract_information(driver):
    """Read the labelled fields of the page currently shown by ``driver``.

    For every known label text, the ``<label>`` with exactly that text is
    located and the stripped text of its following sibling ``<span>`` is
    stored under the matching result key.

    Args:
        driver: object exposing Selenium's ``find_element`` API.

    Returns:
        dict mapping result keys (e.g. ``"Nombre"``, ``"Teléfono"``) to the
        extracted text. Fields that are not on the page are simply absent.
    """
    # Label text as rendered on the page -> key used in the result.
    labels = {
        "Nombre:": "Nombre",
        "Colegio:": "Colegio",
        "Alta Colegiación:": "Alta Colegiación",
        "N. Colegiado:": "N. Colegiado",
        "Ejerciente": "Ejerciente",
        "Residente": "Residente",
        "Dirección Profesional:": "Dirección Profesional",
        "Teléfono:": "Teléfono",
        "Fax:": "Fax",
    }

    result = {}
    for label, key in labels.items():
        try:
            label_element = driver.find_element(By.XPATH, f"//label[text()='{label}']")
            sibling_span = label_element.find_element(By.XPATH, "following-sibling::span")
            result[key] = sibling_span.text.strip()
        except NoSuchElementException:
            # The field is not on this page: expected, skip quietly.
            continue
        except WebDriverException as exc:
            # Any other driver problem is still non-fatal for the record,
            # but is reported instead of being silently dropped.
            print(f"Could not read field {key!r}: {exc}")
    return result

# Main extraction loop: walk every results page, open each not-yet-stored
# name, save its details, and come back to the same results page.

# Locators shared by the helpers and the loop below.
RESULTS_TBODY_LOCATOR = (By.XPATH, ".//table[@class='iceDatTbl tablaElementos']//tbody")
NAME_CELLS_XPATH = ".//td[contains(@style, 'width:140px')]"
# NOTE(review): presumably the "back to results" link of the detail page -
# confirm against the site.
BACK_LINK_LOCATOR = (By.XPATH, "//a[@id='j_id23:j_id50']")
NEXT_PAGE_XPATH = "//img[@id='j_id23:j_id79']"


def _find_name_cells(driver):
    """Wait for the results table body and return its name cells."""
    tbody = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(RESULTS_TBODY_LOCATOR)
    )
    return tbody.find_elements(By.XPATH, NAME_CELLS_XPATH)


def _go_to_results_page(driver, page_index):
    """Starting from the first results page, click "next" ``page_index`` times."""
    for _ in range(page_index):
        driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
        time.sleep(5)


page_index = 0  # number of "next page" clicks needed to reach the current page
while True:
    row_count = len(_find_name_cells(driver))

    # Rows are addressed by index because every visit to a detail page
    # invalidates the previously located elements.
    for row_index in range(row_count):
        time.sleep(5)

        try:
            rows = _find_name_cells(driver)
            if row_index >= len(rows):
                continue

            name_element = rows[row_index].find_element(By.XPATH, ".//span")
            name_text = name_element.text.strip()
            print(name_text)

            if name_text in existing_names:
                print(f"{name_text} already exists in the database, skipping...")
                continue

            # Click the name link to go to the detail page
            name_element.click()
            time.sleep(5)

            # Extract information on the detail page
            info = extract_information(driver)

            # Insert the data into MongoDB; empty phone/fax become None
            insert_data_mongo(
                name_text,
                info.get("Nombre", ""),
                info.get("Colegio", ""),
                info.get("Alta Colegiación", ""),
                info.get("N. Colegiado", ""),
                info.get("Ejerciente", ""),
                info.get("Residente", ""),
                info.get("Dirección Profesional", ""),
                info.get("Teléfono") or None,
                info.get("Fax") or None,
            )
            # Remember the name so a repeat later in this run is skipped too.
            existing_names.add(name_text)

            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(BACK_LINK_LOCATOR)
            ).click()
            time.sleep(5)

            # Going back lands on the FIRST results page (the behaviour
            # reported for this site), so walk forward again to the page
            # being processed. No-op while still on the first page.
            # NOTE(review): if the site ever keeps the page after going
            # back, this would overshoot - confirm against the live site.
            _go_to_results_page(driver, page_index)

            # TODO: CAPTCHA handling is not implemented.

        except Exception as e:
            print(f"Error processing row {row_index}: {e}")

    # Pagination to the next page; a missing control means there is no
    # further page, so stop instead of crashing.
    try:
        next_page = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
    except NoSuchElementException:
        print("No next-page control found, stopping.")
        break
    next_page.click()
    page_index += 1
    print(f"page_number-{page_index}")
    time.sleep(5)

# Release the browser once every page has been processed.
driver.quit()


Подробнее здесь: https://stackoverflow.com/questions/790 ... first-page
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «Python»