Я пытаюсь извлечь рейтинг EPC из каждой списки. Вы можете получить рейтинг EPC только при нажатии на список. Каждый раз, когда я запускаю свой сценарий, это
это не так, в чем может быть проблема? Хотя я попытался увеличить время ожидания основного контента, но я все еще сталкиваюсь с той же проблемой. Может ли это быть безголовым браузером, который не мог загрузить? < /p>
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from typing import Iterator
import pandas as pd
# Constants
URL = "https://www.zoopla.co.uk/house-prices/e ... =list&pn=1"
TIMEOUT = 5
# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
if e:
if t := e.text.strip():
return t
if (p := e.get_property("textContent")) and isinstance(p, str):
return p.strip()
return ""
# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
ActionChains(driver).click(e).perform()
# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
wait = WebDriverWait(driver, TIMEOUT)
sel = (By.CSS_SELECTOR, css)
try:
yield from wait.until(EC.presence_of_all_elements_located(sel))
except TimeoutException:
pass # Return empty if elements are not found
# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
if etext(a) == "Next":
click(driver, a)
break
# Handle cookie consent popup
def click_through(driver: WebDriver) -> None:
try:
wait = WebDriverWait(driver, TIMEOUT)
shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
button = wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]")
))
click(driver, button)
except Exception:
pass # Ignore if cookie popup isn't present
# Scrape EPC Rating from individual listing
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
driver.get(listing_url) # Open property details page
try:
epc_element = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div'))
)
return etext(epc_element) # Extract EPC rating text
except TimeoutException:
return "N/A" # Return "N/A" if EPC Rating is missing
# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
result = []
for house in get_all(driver, "div[data-testid=result-item]"):
try:
listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))
# Get EPC Rating from listing page
epc_rating = get_epc_rating(driver, listing_url)
result.append({
"Address": address,
"Date Last Sold": date_sold,
"Property Type": house_type,
"Number of Rooms": num_rooms,
"Tenure": tenure,
"Square Foot": square_foot,
"EPC Rating": epc_rating,
"Listing URL": listing_url
})
except NoSuchElementException:
continue # Skip missing elements
return result
# Main script execution
if __name__ == "__main__":
with Chrome() as driver:
driver.get(URL)
click_through(driver) # Handle cookies
all_results = []
prev_url = ""
npages = 0
while prev_url != driver.current_url: # Stop if pagination stops working (e.g., Cloudflare blocks)
prev_url = driver.current_url
all_results.extend(scrape_page(driver))
click_next(driver)
npages += 1
# Convert results to DataFrame
df = pd.DataFrame(all_results)
# Display results
print(df)
print(f"Processed {npages} pages")
# Save to CSV
df.to_csv("zoopla_data.csv", index=False)
Подробнее здесь: https://stackoverflow.com/questions/794 ... nformation
Почему я не могу извлечь информацию ⇐ Html
Программисты Html
1739012443
Anonymous
Я пытаюсь извлечь рейтинг EPC из каждой списки. Вы можете получить рейтинг EPC только при нажатии на список. Каждый раз, когда я запускаю свой сценарий, это
это не так, в чем может быть проблема? Хотя я попытался увеличить время ожидания основного контента, но я все еще сталкиваюсь с той же проблемой. Может ли это быть безголовым браузером, который не мог загрузить? < /p>
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from typing import Iterator
import pandas as pd
# Constants
URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5
# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
if e:
if t := e.text.strip():
return t
if (p := e.get_property("textContent")) and isinstance(p, str):
return p.strip()
return ""
# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
ActionChains(driver).click(e).perform()
# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
wait = WebDriverWait(driver, TIMEOUT)
sel = (By.CSS_SELECTOR, css)
try:
yield from wait.until(EC.presence_of_all_elements_located(sel))
except TimeoutException:
pass # Return empty if elements are not found
# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
if etext(a) == "Next":
click(driver, a)
break
# Handle cookie consent popup
def click_through(driver: WebDriver) -> None:
try:
wait = WebDriverWait(driver, TIMEOUT)
shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
button = wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]")
))
click(driver, button)
except Exception:
pass # Ignore if cookie popup isn't present
# Scrape EPC Rating from individual listing
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
driver.get(listing_url) # Open property details page
try:
epc_element = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div'))
)
return etext(epc_element) # Extract EPC rating text
except TimeoutException:
return "N/A" # Return "N/A" if EPC Rating is missing
# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
result = []
for house in get_all(driver, "div[data-testid=result-item]"):
try:
listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))
# Get EPC Rating from listing page
epc_rating = get_epc_rating(driver, listing_url)
result.append({
"Address": address,
"Date Last Sold": date_sold,
"Property Type": house_type,
"Number of Rooms": num_rooms,
"Tenure": tenure,
"Square Foot": square_foot,
"EPC Rating": epc_rating,
"Listing URL": listing_url
})
except NoSuchElementException:
continue # Skip missing elements
return result
# Main script execution
if __name__ == "__main__":
with Chrome() as driver:
driver.get(URL)
click_through(driver) # Handle cookies
all_results = []
prev_url = ""
npages = 0
while prev_url != driver.current_url: # Stop if pagination stops working (e.g., Cloudflare blocks)
prev_url = driver.current_url
all_results.extend(scrape_page(driver))
click_next(driver)
npages += 1
# Convert results to DataFrame
df = pd.DataFrame(all_results)
# Display results
print(df)
print(f"Processed {npages} pages")
# Save to CSV
df.to_csv("zoopla_data.csv", index=False)
Подробнее здесь: [url]https://stackoverflow.com/questions/79417659/why-cant-i-extract-listings-information[/url]
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, плашеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Cантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия