Anonymous
Бесконечный цикл при пагинации в Selenium
Сообщение
Anonymous » 08 ноя 2024, 03:29
У меня есть простой парсер. В целом он работает нормально, но при переходе по страницам, дойдя до последней страницы, он уходит в бесконечный цикл:
Код: Выделить всё
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 21:22:10 2024
@author: user
"""
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from selenium import webdriver
from urllib.parse import urlparse
import undetected_chromedriver as uc
from selenium import webdriver
# Scraper for VivaReal listings: walks every result page, collects price and
# area of each card and writes the averages to an Excel sheet.

URL_BASE = "https://www.vivareal.com.br/venda/ceara/caucaia/bairros/caucaia/#onde=,Cear%C3%A1,Caucaia,Bairros,Caucaia,,,,BR%3ECeara%3ENULL%3ECaucaia%3EBarrios%3ECaucaia,,,/"

RESULTS_XPATH = "//section[@class = 'results__main']"
TOTAL_XPATH = "//strong[@class = 'results-summary__count js-total-records']"
CARDS_XPATH = "//div[@class = 'results-list js-results-list']/div"
AREA_XPATH = ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']"
PRICE_XPATH = ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p"
NEXT_XPATH = "//button[@title = 'Próxima página']"

PAGE_LOAD_TIMEOUT = 20      # seconds to wait for the results section / counter
CARDS_TIMEOUT = 200         # seconds to wait for the result cards to be visible
CARD_FIELD_TIMEOUT = 20     # seconds to wait for a field inside one card
NEXT_BUTTON_TIMEOUT = 20    # seconds to wait for the "next page" button
PAGE_CHANGE_TIMEOUT = 30    # seconds to wait for the list to re-render
SCROLL_PAUSE_SECONDS = 3    # pause after scrolling a card into view

DIGITS_RE = re.compile(r"\d+")


def parse_int(text):
    """Return the integer built from every digit in *text*, or None if none.

    Used for values such as "R$ 350.000" or "1.234" where separators and
    currency symbols must be ignored.
    """
    digits = "".join(DIGITS_RE.findall(text or ""))
    return int(digits) if digits else None


def parse_area(text):
    """Return the area in *text* as an int, or None when it is not one number.

    Dots are treated as thousands separators ("1.200" -> 1200). Anything that
    is not a single plain number (empty text, ranges like "50-70") is rejected
    instead of being guessed at.
    """
    cleaned = (text or "").strip().replace(".", "")
    return int(cleaned) if cleaned.isdecimal() else None


def mean(values):
    """Return the arithmetic mean of *values*, or None for an empty sequence."""
    return sum(values) / len(values) if values else None


def is_disabled_state(disabled, data_disabled, aria_disabled=None, data_page=None):
    """Decide from raw attribute values whether a pager button is disabled.

    Each argument is the attribute value as a string, or None when the
    attribute is absent. A boolean-style attribute counts as set when it is
    present with any value other than "false".
    """
    def flag_set(value):
        return value is not None and value.strip().lower() != "false"

    if flag_set(disabled) or flag_set(data_disabled) or flag_set(aria_disabled):
        return True
    # NOTE(review): on this site the last-page button reportedly carries an
    # empty data-page attribute; confirm against the live markup.
    return data_page == ""


def next_button_disabled(button):
    """Return True when the "next page" WebElement must not be clicked.

    ``is_enabled()`` alone is not enough: it only reflects the native
    ``disabled`` attribute, while the site is reported to mark the last page
    with a ``data-disabled`` attribute instead.
    """
    if not button.is_enabled():
        return True
    return is_disabled_state(
        button.get_attribute("disabled"),
        button.get_attribute("data-disabled"),
        button.get_attribute("aria-disabled"),
        button.get_attribute("data-page"),
    )


def build_report_row(url, total_listings, precos, tamanhos, precos_m2):
    """Build the single spreadsheet row summarising the scraped samples.

    Averages are taken over the samples actually collected, not over the
    advertised total, so skipped cards do not skew the result.
    """
    return {
        'URL': url,
        'Número de anúncios': total_listings,
        'Preço Médio': mean(precos),
        'Tamanho Médio': mean(tamanhos),
        'Preço Médio por m2': mean(precos_m2),
    }


def build_driver(*, headless=False):
    """Create the undetected-chromedriver instance.

    Every option must be set before the driver is constructed; arguments
    added to the options object afterwards are never applied.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    if headless:
        for flag in ("--headless", "--disable-gpu", "--no-sandbox",
                     "--disable-dev-shm-usage"):
            options.add_argument(flag)
    return uc.Chrome(options=options)


def wait_for_cards(driver):
    """Return the result cards of the current page once all are visible."""
    return WebDriverWait(driver, CARDS_TIMEOUT).until(
        EC.visibility_of_all_elements_located((By.XPATH, CARDS_XPATH))
    )


def read_card_text(card, xpath):
    """Return the text of the element at *xpath* inside *card*.

    Waits explicitly instead of relying on an implicit wait, since the
    Selenium documentation advises against mixing the two.
    """
    element = WebDriverWait(card, CARD_FIELD_TIMEOUT).until(
        lambda scope: scope.find_element(By.XPATH, xpath)
    )
    return element.text


def scrape_page(driver, precos, tamanhos, precos_m2):
    """Append price, area and price/m2 of every card on the current page.

    Cards without a usable price or area are skipped. Returns the first card
    of the page (or None) so the caller can detect when the list re-renders.
    """
    cards = wait_for_cards(driver)
    first_card = cards[0] if cards else None
    for index in range(len(cards)):
        # Re-locate on every pass: earlier references may have gone stale.
        cards = wait_for_cards(driver)
        if index >= len(cards):
            break
        card = cards[index]
        try:
            driver.execute_script("arguments[0].scrollIntoView(true);", card)
            time.sleep(SCROLL_PAUSE_SECONDS)
            size = parse_area(read_card_text(card, AREA_XPATH))
            price = parse_int(read_card_text(card, PRICE_XPATH))
        except (NoSuchElementException, StaleElementReferenceException,
                TimeoutException):
            continue
        if not size or price is None:
            # Missing price / area, or a zero area that would divide by zero.
            continue
        preco_m2 = price / size
        print(size, price, preco_m2)
        precos.append(price)
        tamanhos.append(size)
        precos_m2.append(preco_m2)
    return first_card


def go_to_next_page(driver, old_card):
    """Click "next page" and wait for the list to change.

    Returns False when there is no further page: the button is missing, it is
    marked as disabled, or the list did not re-render after the click.
    """
    try:
        # WebDriverWait raises TimeoutException, never NoSuchElementException.
        button = WebDriverWait(driver, NEXT_BUTTON_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, NEXT_XPATH))
        )
    except TimeoutException:
        print("No more pages available")
        return False
    if next_button_disabled(button):
        print("No more pages available")
        return False
    driver.execute_script('arguments[0].click()', button)
    if old_card is None:
        return True
    try:
        # Safety net against looping forever on an unchanged page.
        # NOTE(review): assumes the site replaces the card nodes when paging.
        WebDriverWait(driver, PAGE_CHANGE_TIMEOUT).until(EC.staleness_of(old_card))
    except TimeoutException:
        print("Page did not change after clicking next; stopping")
        return False
    return True


def main():
    """Scrape every result page and write the summary spreadsheet."""
    precos = []
    tamanhos = []
    precos_m2 = []
    driver = build_driver()
    try:
        driver.get(URL_BASE)
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, RESULTS_XPATH))
        )
        total_text = WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, TOTAL_XPATH))
        ).text
        quantidade_imoveis = parse_int(total_text)
        print(quantidade_imoveis)
        while True:
            time.sleep(1)
            first_card = scrape_page(driver, precos, tamanhos, precos_m2)
            driver.save_screenshot("VIVAREAL.png")
            if not go_to_next_page(driver, first_card):
                break
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    if quantidade_imoveis is None:
        quantidade_imoveis = len(precos)
    row = build_report_row(URL_BASE, quantidade_imoveis, precos, tamanhos, precos_m2)
    pd.DataFrame([row]).to_excel("catalogo_vivareal.xlsx")


if __name__ == "__main__":
    main()
на этом веб-сайте кнопка последней страницы выглядит следующим образом:
Атрибут data-page — пустая строка, и присутствует атрибут data-disabled, которого нет на кнопке ни на одной странице, кроме последней.
Подробнее здесь:
https://stackoverflow.com/questions/791 ... h-selenium
1731025753
Anonymous
У меня есть простой парсинг. Он работает нормально и все такое, но когда доходит до разбиения на страницы, когда он переходит к последней разбивке на страницы, он переходит в бесконечный цикл. На последней странице происходит бесконечный цикл: [code]#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Nov 4 21:22:10 2024 @author: user """ from selenium.common.exceptions import NoSuchElementException from selenium.common.exceptions import StaleElementReferenceException, TimeoutException from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By import time from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import json import requests import pandas as pd from bs4 import BeautifulSoup from urllib.parse import urlparse import re from selenium import webdriver from urllib.parse import urlparse import undetected_chromedriver as uc from selenium import webdriver quantidade_2024 = 0 quantidade_2023 = 0 quantidade_2022 = 0 quantidade_2021 = 0 precos = [] tamanhos = [] options = webdriver.ChromeOptions() options.add_argument("start-maximized") driver = uc.Chrome(options=options) #options.add_argument(f"--proxy-server={proxy}") #options.add_argument("start-maximized") #options.add_argument("disable-infobars")options.add_argument("--disable-extensions") actions = ActionChains(driver) options.add_argument("--disable-dev-shm-usage") options.add_argument("--no-sandbox") options.add_argument("--verbose") #options.add_argument("--remote-debugging-port=9222") options.add_argument('--headless') options.add_argument("--disable-gpu") data_planilha = [] precos_m2 = [] url_base = "https://www.vivareal.com.br/venda/ceara/caucaia/bairros/caucaia/#onde=,Cear%C3%A1,Caucaia,Bairros,Caucaia,,,,BR%3ECeara%3ENULL%3ECaucaia%3EBarrios%3ECaucaia,,,/" driver.get(url_base) #driver.find_element(By.XPATH, 
"//ul[@class = 'header__nav-links']/li[2]/a").click() #driver.implicitly_wait(30) resultados = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//section[@class = 'results__main']" ))) quantidade_imoveis = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//strong[@class = 'results-summary__count js-total-records']"))).text quantidade_imoveis = int(quantidade_imoveis) print(quantidade_imoveis) while True: time.sleep(1) imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div"))) print("entrou") for index, value in enumerate(imoveis): driver.implicitly_wait(20) imoveis = WebDriverWait(driver, 200).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'results-list js-results-list']/div"))) driver.execute_script("arguments[0].scrollIntoView(true);", imoveis[index]) time.sleep(3) tamanho = imoveis[index].find_element(By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']").text #driver.execute_script("arguments[0].scrollIntoView(true);", tamanho) # tamanho = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//span[@class = 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area']"))).text print(tamanho) driver.implicitly_wait(20) preco = imoveis[index].find_element(By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p").text preco_imovel = "".join(re.findall('\d+', preco)) price = int(preco_imovel) # driver.execute_script("arguments[0].scrollIntoView(true);", preco) #preco = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, ".//div[@class = 'property-card__price js-property-card-prices js-property-card__price-small']/p"))).text size = int(tamanho) print(preco_imovel) 
precos.append(int(price)) tamanhos.append(size) preco_m2 = price/size print(preco_m2) precos_m2.append(preco_m2) driver.save_screenshot("VIVAREAL.png") try: botao_proximo = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//button[@title = 'Próxima página']"))) if botao_proximo.is_enabled(): driver.execute_script('arguments[0].click()', botao_proximo) else: break except NoSuchElementException: print("No more pages available") break tamanho_medio = sum(tamanhos)/quantidade_imoveis preco_medio_total = sum(precos)/quantidade_imoveis precos_m2_total = sum(precos_m2)/quantidade_imoveis data_planilha.append({ 'URL': url_base, 'Número de anúncios': quantidade_imoveis, 'Preço Médio': preco_medio_total, 'Tamanho Médio': tamanho_medio, 'Preço Médio por m2': precos_m2_total, }) df_planilha = pd.DataFrame(data_planilha) df_planilha.to_excel("catalogo_vivareal.xlsx") driver.quit() [/code] на этом веб-сайте кнопка последней страницы выглядит следующим образом: [code] Próxima página > [/code] страница данных — это пустая строка, и есть атрибут отключенных данных, который не отображается на других страницах, кроме последней страницы. Подробнее здесь: [url]https://stackoverflow.com/questions/79168467/infinite-loop-on-pagination-with-selenium[/url]