Как проанализировать страницу в Python, если вход перенаправляет [закрыто]Python

Программы на Python
Anonymous
 Как проанализировать страницу в Python, если вход перенаправляет [закрыто]

Сообщение Anonymous »

Я пытаюсь проанализировать страницу, которая защищена логином и паролем. Когда я захожу на страницу example.com/login/, после заполнения полей меня автоматически перенаправляет на страницу example.com/listing/. В то же время я могу анализировать пример.
import requests
from bs4 import BeautifulSoup

# Log in with a form POST, then download a protected page in the same session.

# URL the login form submits to (the form's "action" attribute).
POST_LOGIN_URL = 'https://example.com/login/'

# The page you actually want to pull down once the session is authenticated.
REQUEST_URL = 'https://example.com/client/?id=R_111'

# Form fields sent with the login POST.
# NOTE(review): the login page returned by the server labels its inputs
# "Email address" / "Password" -- confirm that these keys match the `name`
# attributes of the form inputs, otherwise the login is silently rejected.
payload = {
    'username': 'username',
    'pass': 'pass',
}

# Real HTTP request headers only.
_headers = {
    "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
}

# These are cookie names (analytics trackers), not header names, so they are
# sent through the cookie jar instead of being mixed into the headers.
_cookies = {
    "_ga": "GA1.1.2090868915.1717658789",
    "_ym_uid": "1717658790316065116",
    "_ym_d": "1717658790",
    "_ga_1DG8M1CXDE": "GS1.1.1717658789.1.0.1717658795.0.0.0",
}

with requests.Session() as session:
    # Set headers and cookies on the session so that BOTH requests carry
    # them; previously the follow-up GET went out without the browser headers.
    session.headers.update(_headers)
    session.cookies.update(_cookies)

    post = session.post(POST_LOGIN_URL, data=payload)
    r = session.get(REQUEST_URL)
    print(r.text)  # or whatever else you want to do with the request data!

Я получаю содержимое страницы example.com/login/ в ответ:













Email address


Password



Keep Me Logged In

Log In  ›







По какой-то причине использование GET-запроса приводит к повторной авторизации.
My working solution:
%%time

# Log in, read the clients table from the listing page, then visit every
# client page to collect the phone number and the name.
from io import StringIO  # stdlib; lets read_html parse markup from a buffer

login_url = 'https://example.com/listing/'  # URL for authorization and client listing page
clients_page_url = 'https://example.com/listing/'  # Page where clients are located

# Access credentials
payload = {
    'email': creds.email,
    'password': creds.password,
}

# Headers to simulate a browser
_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}


def _parse_listing_date(date_str):
    """Convert a cell such as '20250123Jan, 23 12:11' to a datetime.

    The first eight characters are the YYYYMMDD date and the last
    whitespace-separated token is the HH:MM time; the text in between is
    ignored.
    """
    return datetime.strptime(f"{date_str[:8]} {date_str.split()[-1]}", '%Y%m%d %H:%M')


def _load_clients_table(html):
    """Return the first <table> of *html* as a normalized DataFrame.

    Returns None when the page contains no table.
    """
    table = BeautifulSoup(html, 'html.parser').find('table')
    if table is None:
        return None

    # read_html is given a buffer because passing literal markup as a plain
    # string is deprecated in recent pandas releases.
    frame = pd.read_html(StringIO(str(table)), displayed_only=False)[0]

    # Lowercase the column names and replace spaces with underscores
    frame = frame.rename(columns=lambda x: x.lower().replace(' ', '_'))

    # Rename columns
    frame = frame.rename(columns={'id': 'client_id', 'date': 'dt'})

    frame['dt'] = frame['dt'].apply(_parse_listing_date)

    # Change data type for comments to str
    frame['comment'] = frame['comment'].astype(str)
    return frame


def _fetch_client_details(session, client_id):
    """Return the extra columns scraped from one client page.

    The result has 'phone' and 'nom' keys; when the page cannot be
    retrieved only 'phone' is set (to an error marker).
    """
    client_response = session.get(f'https://example.com/client/?id={client_id}', headers=_headers)
    if not client_response.ok:
        return {'phone': "Error retrieving page"}

    client_soup = BeautifulSoup(client_response.content, 'html.parser')

    # Find element with href attribute that starts with 'tel:'
    phone_link = client_soup.find('a', href=lambda x: x and x.startswith('tel:'))
    # Extract name from element with class 'h3 d-inline fw-bold'
    name_element = client_soup.find('div', class_='h3 d-inline fw-bold')

    return {
        # Remove 'tel:+' from the link
        'phone': phone_link['href'].replace('tel:+', '') if phone_link else "Phone number not found",
        'nom': name_element.get_text(strip=True) if name_element else "Name not found",
    }


def _scrape_clients(session):
    """Log in and build the clients table enriched with phone and name.

    Prints a progress or error message at each step and returns the final
    DataFrame, or None when a step fails.
    """
    # Perform POST request for authorization
    r = session.post(login_url, data=payload, headers=_headers)
    if not r.ok:
        print(f"Authorization error: {r.status_code}")
        return None
    # NOTE(review): `ok` only checks the HTTP status; a rejected login that
    # re-renders the form with status 200 would also get here -- confirm.
    print("Authorization successful!")

    # Get content from the clients page
    clients_response = session.get(clients_page_url, headers=_headers)
    if not clients_response.ok:
        print(f"Failed to retrieve clients page: {clients_response.status_code}")
        return None

    result_raw = _load_clients_table(clients_response.content)
    if result_raw is None:
        print("Table not found on the page.")
        return None

    # One request per client. The new columns are collected as plain dicts
    # and attached afterwards, instead of mutating the row copies that
    # iterrows() yields (which does not preserve column dtypes).
    details = [_fetch_client_details(session, client_id) for client_id in result_raw['client_id']]
    extra = pd.DataFrame(details, index=result_raw.index)

    result = result_raw.copy()
    for column in extra.columns:
        result[column] = extra[column]
    return result


# Create a session so the login cookies are reused by the later requests
with requests.Session() as s:
    webcrm_df = _scrape_clients(s)

if webcrm_df is not None:
    print("\nFinal table:")
    print(webcrm_df)


Подробнее здесь: https://stackoverflow.com/questions/793 ... -redirects

Вернуться в «Python»