Код: Выделить всё
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
def fetch_scholar_links_from_url(url: str) -> pd.DataFrame:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
r = s.post(url, data={'json': '1'})
soup = bs(r.json()['B'], 'html.parser')
works = [
('https://scholar.google.com' + x.get('href'))
for x in soup.select('a')
if 'javascript:void(0)' not in x.get('href') and len(x.get_text()) > 7
]
df = pd.DataFrame(works, columns=['Link'])
return df
url = 'https://scholar.google.ca/citations?user=iYN86KEAAAAJ&hl=en'
df_links = fetch_scholar_links_from_url(url)
print(df_links)
Однако, если я выполню ту же функцию для извлечения ссылок на статьи (это пример с первый ссылку в извлеченном списке ссылок),
Код: Выделить всё
url2 = df_links.iloc[0]['Link']
df_links_2 = fetch_scholar_links_from_url(url2)
print(df_links_2)
Код: Выделить всё
traceback (most recent call last):
File "/opt/anaconda3/lib/python3.12/site-packages/requests/models.py", line 974, in json
return complexjson.loads(self.text, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/anaconda3/lib/python3.12/json/__init__.py", line 346, in loads
return _default_decoder.decode(s)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/anaconda3/lib/python3.12/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/anaconda3/lib/python3.12/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/my_username/my_folder/collect_urls_papers_per_author.py", line 73, in
df_links_2 = fetch_scholar_links_from_url(url2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/my_username/my_folder/collect_urls_papers_per_author.py", line 55, in fetch_scholar_links_from_url
soup = bs(r.json()['B'], 'html.parser')
^^^^^^^^
File "/opt/anaconda3/lib/python3.12/site-packages/requests/models.py", line 978, in json
raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)
requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Просто для ясности, в моем примере:
Код: Выделить всё
url2 = df_links.iloc[0]['Link']
Код: Выделить всё
https://scholar.google.com/citations?view_op=view_citation&hl=en&user=iYN86KEAAAAJ&citation_for_view=iYN86KEAAAAJ:kNdYIx-mwKoC
Код: Выделить всё
df_links_2 = fetch_scholar_links_from_url(url2)
print(df_links_2)
Код: Выделить всё
https://proceedings.neurips.cc/paper_files/paper/2014/hash/5ca3e9b122f61f8f06494c97b1afccf3-Abstract.html
Код: Выделить всё
https://scholar.google.com/citations?view_op=view_citation&hl=en&user=iYN86KEAAAAJ&citation_for_view=iYN86KEAAAAJ:kNdYIx-mwKoC
Подробнее здесь: https://stackoverflow.com/questions/792 ... r-profiles
Мобильная версия