Но я не могу найти тег при запуске кода. Я распечатаю родительский тег, и я вижу тег H2 , который я хочу, а также когда я ввожу отладку, я могу получить то, что хочу.
Код: Выделить всё
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
with sync_playwright() as this_playwright:
browser = this_playwright.chromium.launch()
page = browser.new_page()
page.goto(url)
try:
page.wait_for_load_state("load")
if wait_after_page_load:
time.sleep(wait_after_page_load)
except:
pass
if selector_to_wait_for:
page.wait_for_selector(selector_to_wait_for, timeout=15000)
soup = BeautifulSoup(page.content(), "html.parser")
browser.close()
send_readable_formatted_html_dump(soup,
return soup
def parse_product_detail_page(soup):
parent_block = soup.find("div", class_="primary_block")
name_and_id_box = parent_block.find("div", class_="item-box")
print(name_and_id_box) # the h2 tag is visible here
name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")
# import ipdb; ipdb.set_trace() # the h2 tag is also visible here
id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out
def scrape_product_detail_page(product_detail_url):
try:
soup = playwright_url_to_soup(product_detail_url, selector_to_wait_for=".item-box")
except:
return None
parsed_data = parse_product_detail_page(soup)
return parsed_data
result = scrape_product_detail_page("https://www.innovation-line.com/four-color-photoimage-products/ventoux-210d-polyester-drawstring-cinch-pack-backpack-907.html")
Подробнее здесь: https://stackoverflow.com/questions/794 ... in-a-print