Проблема с нумерацией страниц в моем скрипте PyCharm

Проблема с нумерацией страниц в моем скрипте PyCharm ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Проблема с нумерацией страниц в моем скрипте PyCharm

Цитата

Сообщение Anonymous » 01 дек 2024, 12:19

Я пытался выполнить очистку этого веб-сайта (https://www.datacenters.com/locations/u ... s/virginia), но нумерация страниц не работает. Я перепробовал все что угодно, но не смог заставить это работать. он очистит только первую страницу. Кто-нибудь может помочь? Я использую pycharm.
Конечно, вот точный код, который вы предоставили:

Код: Выделить всё

import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the website
base_url = 'https://www.datacenters.com'
# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'

def get_data_from_page(url, writer):
response = requests.get(url)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
location_tiles = soup.find_all('div', class_='LocationTile__details__sXkB0')

if not location_tiles:
print(f"No location tiles found on {url}")

for tile in location_tiles:
name = tile.find('div', class_='LocationTile__name__NrDKr').text
address = tile.find('div', class_='LocationTile__address__Utj30').text

parent_anchor = tile.find_parent('a', href=True)
if parent_anchor:
relative_link = parent_anchor['href']
link = f'{base_url}{relative_link}'

detail_response = requests.get(link)
if detail_response.status_code == 200:
detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

power_div = detail_soup.find('div', id='power')
power = power_div.find('strong').text.strip() if power_div else 'N/A'

sqf_div = detail_soup.find('div', id='statInfo')
sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'

# Write the data to the CSV file
writer.writerow([name, address, power, sqf, link])
print(f"Scraped data for {name}")
else:
print(f"Failed to retrieve details from {link}. Status code: {detail_response.status_code}")
else:
print(f"No link found for {name}")
else:
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
# Write the header row
writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

# Start with the first page
page_url = main_url
while True:
print(f"Scraping page: {page_url}")
get_data_from_page(page_url, writer)

# Check if there is a next page link
response = requests.get(page_url)
soup = BeautifulSoup(response.content, 'html.parser')
next_page_button = soup.find('button', class_='Control__control__ijHLR Pagination__pageItem__NsQSw Pagination__symbol__KHv6r')
if next_page_button and not 'Pagination__disabled__FbUC6' in next_page_button['class']:
next_page_button.click()
next_page_link_tag = soup.find('a', href=True)
if next_page_link_tag:
next_page_link = next_page_link_tag['href']
page_url = f"{base_url}{next_page_link}"
print(f"Next page link found: {page_url}")
else:
print("Next page link not found.")
break
else:
print("No more pages.")
break

Не стесняйтесь спрашивать, нужна ли вам дополнительная помощь или изменения!
Я пробовал несколько конфигураций, но постоянно получаю ошибки, указывающие на отсутствие кнопка «Далее».

Подробнее здесь: https://stackoverflow.com/questions/792 ... arm-script

1733044755

Anonymous

Я пытался выполнить очистку этого веб-сайта (https://www.datacenters.com/locations/united-states/virginia), но нумерация страниц не работает. Я перепробовал все что угодно, но не смог заставить это работать. он очистит только первую страницу. Кто-нибудь может помочь? Я использую pycharm.
Конечно, вот точный код, который вы предоставили:
[code]import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the website
base_url = 'https://www.datacenters.com'
# URL of the main webpage to scrape
main_url = f'{base_url}/locations/united-states/virginia'

def get_data_from_page(url, writer):
response = requests.get(url)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
location_tiles = soup.find_all('div', class_='LocationTile__details__sXkB0')

if not location_tiles:
print(f"No location tiles found on {url}")

for tile in location_tiles:
name = tile.find('div', class_='LocationTile__name__NrDKr').text
address = tile.find('div', class_='LocationTile__address__Utj30').text

parent_anchor = tile.find_parent('a', href=True)
if parent_anchor:
relative_link = parent_anchor['href']
link = f'{base_url}{relative_link}'

detail_response = requests.get(link)
if detail_response.status_code == 200:
detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

power_div = detail_soup.find('div', id='power')
power = power_div.find('strong').text.strip() if power_div else 'N/A'

sqf_div = detail_soup.find('div', id='statInfo')
sqf = sqf_div.find('strong').text.strip() if sqf_div else 'N/A'

# Write the data to the CSV file
writer.writerow([name, address, power, sqf, link])
print(f"Scraped data for {name}")
else:
print(f"Failed to retrieve details from {link}. Status code: {detail_response.status_code}")
else:
print(f"No link found for {name}")
else:
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

# Open a CSV file to write the data
with open('datacenters.csv', mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
# Write the header row
writer.writerow(['Name', 'Address', 'Power', 'SQF', 'Link'])

# Start with the first page
page_url = main_url
while True:
print(f"Scraping page: {page_url}")
get_data_from_page(page_url, writer)

# Check if there is a next page link
response = requests.get(page_url)
soup = BeautifulSoup(response.content, 'html.parser')
next_page_button = soup.find('button', class_='Control__control__ijHLR Pagination__pageItem__NsQSw Pagination__symbol__KHv6r')
if next_page_button and not 'Pagination__disabled__FbUC6' in next_page_button['class']:
next_page_button.click()
next_page_link_tag = soup.find('a', href=True)
if next_page_link_tag:
next_page_link = next_page_link_tag['href']
page_url = f"{base_url}{next_page_link}"
print(f"Next page link found: {page_url}")
else:
print("Next page link not found.")
break
else:
print("No more pages.")
break
[/code]
Не стесняйтесь спрашивать, нужна ли вам дополнительная помощь или изменения!
Я пробовал несколько конфигураций, но постоянно получаю ошибки, указывающие на отсутствие кнопка «Далее». 

Подробнее здесь: [url]https://stackoverflow.com/questions/79227787/issue-with-pagination-on-my-pycharm-script[/url]