Перейдите на следующую страницу, соберите данные и вернитесь на первую страницу.

Перейдите на следующую страницу, очистите данные и вернитесь на первую страницу. ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Перейдите на следующую страницу, очистите данные и вернитесь на первую страницу.

Цитата

Сообщение Anonymous » 01 окт 2024, 17:58

При сборе (парсинге) данных с первой страницы я перехожу на вторую страницу и нажимаю на первую ссылку. После возврата парсер возвращается на первую страницу вместо того, чтобы оставаться на второй. Почему текущая страница не сохраняется? Как остаться на текущей странице?
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from twocaptcha import TwoCaptcha
from pymongo import MongoClient
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# MongoDB connection: scraped records are written to report_db.report.
client = MongoClient('localhost', 27017)
db = client['report_db']
collection = db['report']

# Chrome options
chrome_options = Options()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--disable-infobars",
    "--disable-popup-blocking",
):
    chrome_options.add_argument(chrome_flag)

# Initialize the WebDriver using ChromeDriverManager
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Open target website.
# The URL in this copy of the script had been truncated to
# 'servicios-aboga ... -letrados/'; the complete address is restored here.
driver.get('https://www.abogacia.es/servicios-abogacia/censo-de-letrados/')
print('Website opened')

# Shared explicit wait (10 s timeout) used for the rest of the setup.
wait = WebDriverWait(driver, 10)

# Accept cookies if the button appears; any failure is treated as
# "no banner" and the script carries on.
try:
    cookies_button = wait.until(
        EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))
    )
    cookies_button.click()
except Exception:
    print("Cookies button not found or already accepted.")

# Switch to iframe for interaction: the search form lives inside it.
iframe_element = wait.until(
    EC.presence_of_element_located((By.XPATH, '//iframe[contains(@src, "censo.abogacia.es")]'))
)
driver.switch_to.frame(iframe_element)

# Enter search query and submit
input_value = wait.until(
    EC.presence_of_element_located((By.XPATH, '//input[@id="j_id23:j_id33"]'))
)
input_value.send_keys('a')

# NOTE(review): fixed 10 s pause before submitting; presumably leaves time
# for the form (or a manual CAPTCHA step) to become ready - confirm.
time.sleep(10)

# Handle ElementClickInterceptedException: try a normal click first and
# fall back to a JavaScript click if anything goes wrong.
submit_element = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "Buscar")]'))
)
try:
    submit_element.click()
except Exception as e:
    print(f"Initial click failed: {e}")
    # Scroll into view and click using JavaScript as a workaround
    driver.execute_script("arguments[0].scrollIntoView(true);", submit_element)
    driver.execute_script("arguments[0].click();", submit_element)

print('Website form submitted within iframe')

def get_existing_names():
    """Return the set of ``first_name`` values already stored in MongoDB.

    Only the ``first_name`` field is projected. Documents that do not have
    that field come back from the projection without the key, so they are
    skipped instead of raising ``KeyError``.

    Returns:
        set: every ``first_name`` found in ``collection``.
    """
    projection = {"first_name": 1, "_id": 0}
    return {
        entry["first_name"]
        for entry in collection.find({}, projection)
        if "first_name" in entry
    }


# Snapshot of the names already in the database, taken once at start-up.
existing_names = get_existing_names()

def insert_data_mongo(first_name, nombre, colegio, alta_colegiacion, n_colegiado, ejerciente, residente, direccion_profesional, telefono, fax):
    """Insert one scraped record into the MongoDB ``collection``.

    Each argument is stored under the document key of the same meaning
    (``first_name``, ``Nombre``, ``Colegio``, ``Alta_Colegiacion``,
    ``N_Colegiado``, ``Ejerciente``, ``Residente``,
    ``Direccion_Profesional``, ``Telefono``, ``Fax``).

    Returns:
        None. A failed insert is reported with ``print`` and not re-raised,
        so the caller keeps running.
    """
    data = {
        "first_name": first_name,
        "Nombre": nombre,
        "Colegio": colegio,
        "Alta_Colegiacion": alta_colegiacion,
        "N_Colegiado": n_colegiado,
        "Ejerciente": ejerciente,
        "Residente": residente,
        "Direccion_Profesional": direccion_profesional,
        "Telefono": telefono,
        # Was "Fax:" (stray colon), unlike every other key in the document.
        "Fax": fax,
    }
    try:
        # Insert the data into MongoDB
        collection.insert_one(data)
        print("Inserted data into MongoDB successfully")
    except Exception as e:
        # Best-effort boundary: log and continue with the next record.
        print(f"Error inserting data: {e}")

def extract_information(driver):
    """Read the labelled fields from the page currently shown in ``driver``.

    For every known label text the matching ``<label>`` element is located
    and the text of its first following-sibling ``<span>`` is taken as the
    value.

    Args:
        driver: object exposing Selenium's ``find_element(by, value)``.

    Returns:
        dict: result key -> stripped span text. A field whose label or span
        cannot be read is left out of the dict.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import WebDriverException

    # Label text as rendered on the page -> key used in the result dict.
    labels = {
        "Nombre:": "Nombre",
        "Colegio:": "Colegio",
        "Alta Colegiación:": "Alta Colegiación",
        "N. Colegiado:": "N. Colegiado",
        "Ejerciente": "Ejerciente",
        "Residente": "Residente",
        "Dirección Profesional:": "Dirección Profesional",
        "Teléfono:": "Teléfono",
        "Fax:": "Fax",
    }

    result = {}
    for label, key in labels.items():
        try:
            label_element = driver.find_element(By.XPATH, f"//label[text()='{label}']")
            sibling_span = label_element.find_element(By.XPATH, "following-sibling::span")
            result[key] = sibling_span.text.strip()
        except WebDriverException:
            # Skip missing elements gracefully: absent fields are simply
            # not reported. Non-Selenium errors are no longer hidden.
            continue
    return result

# Main extraction loop: walk the result pages, open each row's detail view,
# store the record, then go back to the list.
# Script-local import for the end-of-pagination check (selenium is already
# a dependency of this script).
from selenium.common.exceptions import NoSuchElementException

RESULTS_TBODY_LOCATOR = (By.XPATH, ".//table[@class='iceDatTbl tablaElementos']//tbody")
NAME_CELLS_XPATH = ".//td[contains(@style, 'width:140px')]"


def _find_name_cells(driver):
    """Wait (up to 10 s) for the results table and return its name cells."""
    tbody = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(RESULTS_TBODY_LOCATOR)
    )
    return tbody.find_elements(By.XPATH, NAME_CELLS_XPATH)


i = 0  # number of "next page" clicks performed so far
try:
    while True:
        # Handling table details: this first lookup only fixes how many
        # rows the page has.
        rows = _find_name_cells(driver)

        for row_index in range(len(rows)):
            time.sleep(5)

            try:
                # Re-locate the cells on every pass: elements found before
                # visiting a detail page are no longer usable afterwards.
                rows = _find_name_cells(driver)
                if row_index >= len(rows):
                    continue

                name_element = rows[row_index].find_element(By.XPATH, ".//span")
                name_text = name_element.text.strip()
                print(name_text)

                if name_text in existing_names:
                    print(f"{name_text} already exists in the database, skipping...")
                    continue

                # Click the name link to go to the detail page
                name_element.click()
                time.sleep(5)

                # Extract information on the detail page
                info = extract_information(driver)

                # Insert the data into MongoDB (empty phone/fax become None)
                insert_data_mongo(
                    name_text,
                    info.get("Nombre", ""),
                    info.get("Colegio", ""),
                    info.get("Alta Colegiación", ""),
                    info.get("N. Colegiado", ""),
                    info.get("Ejerciente", ""),
                    info.get("Residente", ""),
                    info.get("Dirección Profesional", ""),
                    info.get("Teléfono") or None,
                    info.get("Fax") or None,
                )
                # Remember the name for the rest of this run so a row that
                # is shown again is skipped instead of inserted twice.
                existing_names.add(name_text)

                # Return from the detail view to the result list.
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@id='j_id23:j_id50']"))
                ).click()

                time.sleep(5)

                # Ensure you're still on the same page and handle the CAPTCHA
                # NOTE(review): the author reports that after this "back"
                # click the site shows result page 1 again. This loop does
                # not re-advance to page `i`; it relies on `existing_names`
                # to skip rows it has already stored. Confirm against the
                # live site before adding re-navigation.

            except Exception as e:
                print(f"Error processing row {row_index}: {e}")
                continue

        # Pagination to the next page
        try:
            next_page = driver.find_element(By.XPATH, "//img[@id='j_id23:j_id79']")
        except NoSuchElementException:
            # No next-page control: stop cleanly instead of crashing.
            # NOTE(review): if the last page keeps a disabled control in
            # the DOM this will not trigger - verify on the site.
            print("No next-page control found, stopping.")
            break
        next_page.click()
        i += 1
        print(f"page_number-{i}")
        time.sleep(5)
finally:
    # Always release the browser, including when an error escapes the loop.
    driver.quit()

Подробнее здесь: https://stackoverflow.com/questions/790 ... first-page

1727794719

Anonymous

При сборе (парсинге) данных с первой страницы я перехожу на вторую страницу и нажимаю на первую ссылку. После возврата парсер возвращается на первую страницу вместо того, чтобы оставаться на второй. Почему текущая страница не сохраняется? Как остаться на текущей странице?
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from twocaptcha import TwoCaptcha
from pymongo import MongoClient
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# MongoDB connection: scraped records are written to report_db.report.
client = MongoClient('localhost', 27017)
db = client['report_db']
collection = db['report']

# Chrome options
chrome_options = Options()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--disable-infobars",
    "--disable-popup-blocking",
):
    chrome_options.add_argument(chrome_flag)

# Initialize the WebDriver using ChromeDriverManager
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Open target website
driver.get('https://www.abogacia.es/servicios-abogacia/censo-de-letrados/')
print('Website opened')

# Shared explicit wait (10 s timeout) used for the rest of the setup.
wait = WebDriverWait(driver, 10)

# Accept cookies if the button appears; any failure is treated as
# "no banner" and the script carries on.
try:
    cookies_button = wait.until(
        EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))
    )
    cookies_button.click()
except Exception:
    print("Cookies button not found or already accepted.")

# Switch to iframe for interaction: the search form lives inside it.
iframe_element = wait.until(
    EC.presence_of_element_located((By.XPATH, '//iframe[contains(@src, "censo.abogacia.es")]'))
)
driver.switch_to.frame(iframe_element)

# Enter search query and submit
input_value = wait.until(
    EC.presence_of_element_located((By.XPATH, '//input[@id="j_id23:j_id33"]'))
)
input_value.send_keys('a')

# NOTE(review): fixed 10 s pause before submitting; presumably leaves time
# for the form (or a manual CAPTCHA step) to become ready - confirm.
time.sleep(10)

# Handle ElementClickInterceptedException: try a normal click first and
# fall back to a JavaScript click if anything goes wrong.
submit_element = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "Buscar")]'))
)
try:
    submit_element.click()
except Exception as e:
    print(f"Initial click failed: {e}")
    # Scroll into view and click using JavaScript as a workaround
    driver.execute_script("arguments[0].scrollIntoView(true);", submit_element)
    driver.execute_script("arguments[0].click();", submit_element)

print('Website form submitted within iframe')

def get_existing_names():
    """Return the set of ``first_name`` values already stored in MongoDB.

    Only the ``first_name`` field is projected. Documents that do not have
    that field come back from the projection without the key, so they are
    skipped instead of raising ``KeyError``.

    Returns:
        set: every ``first_name`` found in ``collection``.
    """
    projection = {"first_name": 1, "_id": 0}
    return {
        entry["first_name"]
        for entry in collection.find({}, projection)
        if "first_name" in entry
    }


# Snapshot of the names already in the database, taken once at start-up.
existing_names = get_existing_names()

def insert_data_mongo(first_name, nombre, colegio, alta_colegiacion, n_colegiado, ejerciente, residente, direccion_profesional, telefono, fax):
    """Insert one scraped record into the MongoDB ``collection``.

    Each argument is stored under the document key of the same meaning
    (``first_name``, ``Nombre``, ``Colegio``, ``Alta_Colegiacion``,
    ``N_Colegiado``, ``Ejerciente``, ``Residente``,
    ``Direccion_Profesional``, ``Telefono``, ``Fax``).

    Returns:
        None. A failed insert is reported with ``print`` and not re-raised,
        so the caller keeps running.
    """
    data = {
        "first_name": first_name,
        "Nombre": nombre,
        "Colegio": colegio,
        "Alta_Colegiacion": alta_colegiacion,
        "N_Colegiado": n_colegiado,
        "Ejerciente": ejerciente,
        "Residente": residente,
        "Direccion_Profesional": direccion_profesional,
        "Telefono": telefono,
        # Was "Fax:" (stray colon), unlike every other key in the document.
        "Fax": fax,
    }
    try:
        # Insert the data into MongoDB
        collection.insert_one(data)
        print("Inserted data into MongoDB successfully")
    except Exception as e:
        # Best-effort boundary: log and continue with the next record.
        print(f"Error inserting data: {e}")

def extract_information(driver):
    """Read the labelled fields from the page currently shown in ``driver``.

    For every known label text the matching ``<label>`` element is located
    and the text of its first following-sibling ``<span>`` is taken as the
    value.

    Args:
        driver: object exposing Selenium's ``find_element(by, value)``.

    Returns:
        dict: result key -> stripped span text. A field whose label or span
        cannot be read is left out of the dict.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import WebDriverException

    # Label text as rendered on the page -> key used in the result dict.
    labels = {
        "Nombre:": "Nombre",
        "Colegio:": "Colegio",
        "Alta Colegiación:": "Alta Colegiación",
        # Single space: must match the "N. Colegiado" key the main loop
        # reads (the previous value had two spaces and never matched).
        "N. Colegiado:": "N. Colegiado",
        "Ejerciente": "Ejerciente",
        "Residente": "Residente",
        "Dirección Profesional:": "Dirección Profesional",
        "Teléfono:": "Teléfono",
        "Fax:": "Fax",
    }

    result = {}
    for label, key in labels.items():
        try:
            label_element = driver.find_element(By.XPATH, f"//label[text()='{label}']")
            sibling_span = label_element.find_element(By.XPATH, "following-sibling::span")
            result[key] = sibling_span.text.strip()
        except WebDriverException:
            # Skip missing elements gracefully: absent fields are simply
            # not reported. Non-Selenium errors are no longer hidden.
            continue
    return result

# Main extraction loop: walk the result pages, open each row's detail view,
# store the record, then go back to the list.
# Script-local import for the end-of-pagination check (selenium is already
# a dependency of this script).
from selenium.common.exceptions import NoSuchElementException

RESULTS_TBODY_LOCATOR = (By.XPATH, ".//table[@class='iceDatTbl tablaElementos']//tbody")
NAME_CELLS_XPATH = ".//td[contains(@style, 'width:140px')]"


def _find_name_cells(driver):
    """Wait (up to 10 s) for the results table and return its name cells."""
    tbody = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(RESULTS_TBODY_LOCATOR)
    )
    return tbody.find_elements(By.XPATH, NAME_CELLS_XPATH)


i = 0  # number of "next page" clicks performed so far
try:
    while True:
        # Handling table details: this first lookup only fixes how many
        # rows the page has.
        rows = _find_name_cells(driver)

        for row_index in range(len(rows)):
            time.sleep(5)

            try:
                # Re-locate the cells on every pass: elements found before
                # visiting a detail page are no longer usable afterwards.
                rows = _find_name_cells(driver)
                if row_index >= len(rows):
                    continue

                name_element = rows[row_index].find_element(By.XPATH, ".//span")
                name_text = name_element.text.strip()
                print(name_text)

                if name_text in existing_names:
                    print(f"{name_text} already exists in the database, skipping...")
                    continue

                # Click the name link to go to the detail page
                name_element.click()
                time.sleep(5)

                # Extract information on the detail page
                info = extract_information(driver)

                # Insert the data into MongoDB (empty phone/fax become None)
                insert_data_mongo(
                    name_text,
                    info.get("Nombre", ""),
                    info.get("Colegio", ""),
                    info.get("Alta Colegiación", ""),
                    info.get("N. Colegiado", ""),
                    info.get("Ejerciente", ""),
                    info.get("Residente", ""),
                    info.get("Dirección Profesional", ""),
                    info.get("Teléfono") or None,
                    info.get("Fax") or None,
                )
                # Remember the name for the rest of this run so a row that
                # is shown again is skipped instead of inserted twice.
                existing_names.add(name_text)

                # Return from the detail view to the result list.
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@id='j_id23:j_id50']"))
                ).click()

                time.sleep(5)

                # Ensure you're still on the same page and handle the CAPTCHA
                # NOTE(review): the author reports that after this "back"
                # click the site shows result page 1 again. This loop does
                # not re-advance to page `i`; it relies on `existing_names`
                # to skip rows it has already stored. Confirm against the
                # live site before adding re-navigation.

            except Exception as e:
                print(f"Error processing row {row_index}: {e}")
                continue

        # Pagination to the next page
        try:
            next_page = driver.find_element(By.XPATH, "//img[@id='j_id23:j_id79']")
        except NoSuchElementException:
            # No next-page control: stop cleanly instead of crashing.
            # NOTE(review): if the last page keeps a disabled control in
            # the DOM this will not trigger - verify on the site.
            print("No next-page control found, stopping.")
            break
        next_page.click()
        i += 1
        print(f"page_number-{i}")
        time.sleep(5)
finally:
    # Always release the browser, including when an error escapes the loop.
    driver.quit()
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79042388/navigate-to-the-next-page-scrape-data-and-return-to-the-first-page[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

Перейдите на следующую страницу, очистите данные и вернитесь на первую страницу.

Последнее сообщение Anonymous « 01 окт 2024, 12:00
Добавлено в форуме Python

Anonymous » 01 окт 2024, 12:00 » в форуме Python

При очистке данных с первой страницы я перехожу на вторую страницу и нажимаю на первую ссылку. После возврата парсер возвращается к первой странице вместо того, чтобы оставаться на второй странице. Почему не сохраняется текущая страница. Как нам...

0 Ответы

25 Просмотры

Последнее сообщение Anonymous
01 окт 2024, 12:00
Приложение UWP. Перейдите на главную страницу и очистите рамки.

Последнее сообщение Anonymous « 26 янв 2025, 14:08
Добавлено в форуме C#

Anonymous » 26 янв 2025, 14:08 » в форуме C#

У меня есть киоск-приложение UWP, из-за которого происходит утечка памяти. Когда я просматриваю его работающим в диспетчере задач, я вижу, что потребление памяти увеличивается по мере навигации по страницам приложения. Эта навигация осуществляется в...

0 Ответы

22 Просмотры

Последнее сообщение Anonymous
26 янв 2025, 14:08
Очистите число, удалив первую цифру, а затем все последовательные нули.

Последнее сообщение Anonymous « 19 янв 2025, 13:53
Добавлено в форуме Php

Anonymous » 19 янв 2025, 13:53 » в форуме Php

У меня есть массив чисел, например:
10001234

10002345
Теперь у меня есть число, которое должно сопоставляться со всеми числами внутри массива. Число может быть либо 10001234 (что будет легко сопоставить), но также может быть, например, 100001234...

0 Ответы

11 Просмотры

Последнее сообщение Anonymous
19 янв 2025, 13:53
Вернитесь на предыдущую страницу, когда пользователь нажимает кнопку

Последнее сообщение Anonymous « 13 мар 2025, 12:47
Добавлено в форуме Android

Anonymous » 13 мар 2025, 12:47 » в форуме Android

Я использую пакет inappwebView в моём приложении Flutter.
... где он показывает метод _webviewcontroller.goback ()

Подробнее здесь:

0 Ответы

6 Просмотры

Последнее сообщение Anonymous
13 мар 2025, 12:47
При загрузке страницы перейдите на конкретную страницу, к которой принадлежит элемент listview, с помощью listview и dat

Последнее сообщение Anonymous « 18 сен 2024, 11:15
Добавлено в форуме C#

Anonymous » 18 сен 2024, 11:15 » в форуме C#

У меня есть выбранный DataKey в сеансе из ListView.

Я могу вернуть выбор, когда вернусь на эту страницу aspx, содержащую представление списка.

Но когда выбранный элемент в списке принадлежит какой-либо другой странице (не первой странице списка),...

0 Ответы

28 Просмотры

Последнее сообщение Anonymous
18 сен 2024, 11:15

Вернуться в «Python»