Script stops after some time (WebScraping)

Post by Anonymous »
I am trying to scrape the site https://jamabandi.nic.in/land%20records/NakalRecord

It works fine for some districts, but for this one it stops.

Enter the district value as 21,
the village check value as 1,
Tehsil = 8,
and village number 1.

Can someone take a look at it?

Error:
Traceback (most recent call last):
File "/Users/jatin/opt/anaconda3/lib/python3.9/site-packages/urllib3/connection.py", line 174, in _new_conn
conn = connection.create_connection(
File "/Users/jatin/opt/anaconda3/lib/python3.9/site-packages/urllib3/util/connection.py", line 72, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "/Users/jatin/opt/anaconda3/lib/python3.9/socket.py", line 954, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:
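(The second traceback is cut off above.) The underlying error, socket.gaierror [Errno 8] "nodename nor servname provided, or not known", is a DNS lookup failure inside socket.getaddrinfo, so the script is dying on name resolution, not on the page itself. On a long-running scrape this is usually a transient network or resolver hiccup, and one common mitigation is a shared requests.Session with automatic retries. A minimal sketch, not part of the original script; the retry counts and backoff values are illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session: reuses TCP connections and retries transient failures,
# including the connection-level errors a failed DNS lookup surfaces as.
session = requests.Session()
retry = Retry(
    total=5,                  # up to 5 retries per request
    backoff_factor=2,         # sleep 0s, 2s, 4s, 8s, ... between retries
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "POST"],   # urllib3 >= 1.26; older versions use method_whitelist
)
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))

# Then replace every requests.get(URL) / requests.post(URL, data=form) in the
# script below with session.get(URL, timeout=60) / session.post(URL, data=form, timeout=60),
# so a hung connection also cannot stall the scrape forever.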

#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import time

# In[2]:

import math  # needed by clean_func below; missing from the original paste
import requests
import os
from io import StringIO
from bs4 import BeautifulSoup as BS

# In[3]:

def clean_links(link):
    return link.split(",")[1].split(')')[0].replace("'", "")

def links_all_func(soup):
    # Parse the results table and collect the per-row record links.
    table = soup.find_all('table')
    df_start = pd.read_html(StringIO(str(table)))[1]
    div_col_lg_12 = soup.find('div', class_='col-lg-12')

    # Find links within the selected div
    links_within_div = div_col_lg_12.find_all('td')

    all_links = []
    for link in links_within_div:
        k = link.find_all('a')
        if len(k) > 0:
            new_link = k[0]['href']
            all_links.append(new_link)

    return all_links, df_start

def clean_func(khatoni):
    khatoni = [str(element) for element in khatoni]
    cleanedkhatoni = [str(value) for value in khatoni
                      if str(value) != 'null'
                      and not (isinstance(value, float) and math.isnan(value))]
    if len(cleanedkhatoni) == 0:
        return "NA"
    return cleanedkhatoni[0]

def data_avail(df_owner):
    # Collapse each column into a single comma-separated row of its unique values.
    new_row = {}
    for col in df_owner.columns:
        unique_values = {val for val in df_owner[col] if pd.notna(val)}
        new_row[col] = ', '.join(str(val) for val in unique_values)

    new_df = pd.DataFrame([new_row])
    return new_df

def change_village(district_id, tehsil_id):
    # Re-fetch the page and replay the district/tehsil postbacks so the
    # ASP.NET form state matches the next village iteration.
    print("Village_Changed")
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")

    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")

    print("Scraping from district %s:" % district_name)
    form["__EVENTTARGET"] = district_event_target
    form[district_event_target] = district_id
    soup = BS(requests.post(URL, data=form).content, "html.parser")

    form = get_aspnet_form(soup)
    tehsils, tehsil_event_target = get_options(soup, "tehsil")
    print("Scraping from tehsil %s:" % tehsil_name)
    form["__EVENTTARGET"] = tehsil_event_target
    form[district_event_target] = district_id
    form[tehsil_event_target] = tehsil_id
    soup = BS(requests.post(URL, data=form).content, "html.parser")

    form = get_aspnet_form(soup)
    return soup, form

# In[4]:

def change_tehsil(district_id):
    # Re-fetch the page and replay the district postback so the form state
    # matches the next tehsil iteration.
    print("Tehsil Changed")
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")

    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")

    print("Scraping from district %s:" % district_name)
    form["__EVENTTARGET"] = district_event_target
    form[district_event_target] = district_id
    soup = BS(requests.post(URL, data=form).content, "html.parser")

    form = get_aspnet_form(soup)
    return soup, form

# In[5]:

def change_district():
    # Start over from a fresh copy of the page for the next district.
    print("District Changed")
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")

    form = get_aspnet_form(soup)
    return soup, form

# In[6]:

def name_func(soup, district_id, village_id, tehsil_id, period_id, name_of_owner):
    # Relies on globals set in the main block: the *_event_target names,
    # district_name, tehsil_name, village_name and the current `name`.
    len_new = 0
    links_all, df_start = links_all_func(soup)
    new_df = pd.DataFrame()
    dfs = []
    j = 0
    for i in links_all:
        df_owner = 0
        time.sleep(1)
        arg = clean_links(i)
        form = get_aspnet_form_new(soup, arg)
        form[district_event_target] = district_id
        form[tehsil_event_target] = tehsil_id
        form[period_event_target] = period_id
        form[village_event_target] = village_id
        form[owner_event_target] = "1"
        search_new = 'ctl00$ContentPlaceHolder1$txtsearch:'
        form[search_new] = " "
        placeholder_new = 'ctl00$ContentPlaceHolder1$b'
        form[placeholder_new] = 'RdbtnOwner'
        form[name_event_target] = name
        k = requests.post(URL, data=form)
        cookie = k.cookies.get('jamabandiID')
        page_source = get_new_request(cookie)
        df_new1 = df_start.iloc[j:j + 1]
        df = pd.read_html(page_source, attrs={'id': 'GridView1'})[0]
        df2 = pd.read_html(page_source)
        new_df = func_dict(df2)
        result_df = pd.concat([df_new1, new_df, df], axis=1)
        df_owner = result_df
        df_owner['district'] = district_name
        df_owner['tehsil'] = tehsil_name
        df_owner['village_name'] = village_name
        df_owner['year'] = period_id
        kahtoni = clean_func(list(df_owner['Khatoni'].unique()))
        khewat = clean_func(list(df_owner['Khewat'].unique()))
        hissa = clean_func(list(df_owner['Hissa'].unique()))
        villagenew = clean_func(list(df_owner['village_name'].unique())).strip()
        hissa_new = hissa.replace("/", "-")
        # The next line was truncated to `kahtoni.replace` in the paste;
        # the ("/", "-") arguments are assumed by analogy with hissa_new.
        khatoni_new = kahtoni.replace("/", "-")
        name_new = name.replace(" ", "_")
        name1 = name_new.split("_")[0]
        key = str(name_of_owner) + "-" + str(hissa)
        df_owner['key'] = key
        df_owner = data_avail(df_owner)
        df_owner['new_col'] = j
        len_new += 1
        dfs.append(df_owner)
        print("Length of new_df is " + str(len(new_df)))
        j += 1
    new_df = pd.concat(dfs, axis=0, ignore_index=True)
    return new_df

# In[7]:

def get_aspnet_form_new(soup, argument):
    # Build the postback payload for clicking a GridView link, carrying over
    # the hidden ASP.NET state inputs from the current page.
    form = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$GridView1",
        "__EVENTARGUMENT": argument,
        "__LASTFOCUS": "",
        "ctl00$ContentPlaceHolder1$a": "RdobtnOwner",
    }

    forms = soup.find("form", attrs={"id": "aspnetForm"})
    for i in forms.find_all("input", recursive=False):
        form.update({i.attrs["name"]: i.attrs["value"]})
    return form

def get_new_request(cookie):
    # GET the Nakal_khewat result page, reusing the ASP.NET session cookie
    # obtained from the preceding POST.
    url = "https://jamabandi.nic.in/land%20records/Nakal_khewat"

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'jamabandiID=' + str(cookie),
        'referer': 'https://jamabandi.nic.in/land%20records/NakalRecord',
        'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    return response.text
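# Hypothetical safeguard, not in the original script: requests' cookies.get()
# returns None when the server never sets jamabandiID, and get_new_request
# would then silently send "jamabandiID=None". Failing fast makes that visible.
def require_cookie(resp, cookie_name="jamabandiID"):
    value = resp.cookies.get(cookie_name)
    if value is None:
        raise RuntimeError("%s cookie missing; session was not established" % cookie_name)
    return value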

def func_dict(df2):
    # The loop body was eaten when the code was pasted; the reconstruction
    # below assumes the first table holds alternating label/value cells in
    # its first row.
    dict1 = {}
    row = df2[0].iloc[0]
    i = 0
    while i < len(row) - 1:
        dict1[row[i]] = row[i + 1]
        i += 2
    return pd.DataFrame([dict1])

URL = "https://jamabandi.nic.in/land%20records/NakalRecord"

def get_aspnet_form(soup: BS) -> dict:
    # Collect the hidden ASP.NET state inputs (__VIEWSTATE and friends).
    # This function is called throughout but its definition was lost in the
    # paste; reconstructed from get_aspnet_form_new above.
    form = {}
    aspnet_form = soup.find("form", attrs={"id": "aspnetForm"})
    for i in aspnet_form.find_all("input", recursive=False):
        form.update({i.attrs["name"]: i.attrs["value"]})
    return form

def get_options(soup: BS, type_: str) -> list:
    types = {
        "district": "ddldname",
        "tehsil": "ddltname",
        "village": "ddlvname",
        "period": "ddlPeriod",
        "owner": "ddlOwner",
        "record": "ListBox1",
        "tablenew": "GridView1",
    }
    ID = "ctl00_ContentPlaceHolder1_%s" % types[type_]
    select = soup.find("select", attrs={"id": ID})

    result = []
    for option in select.find_all("option"):
        value = option.attrs["value"]
        text = option.text
        if value != "-1":
            result.append((value, text))

    return [result, select.attrs["name"]]

def get_records(soup: BS):
    ID = "ctl00_ContentPlaceHolder1_ListBox1"
    records = soup.find("select", attrs={"id": ID})

    result = []
    for record in records.find_all("option"):
        name = record.attrs["value"]
        if "?" not in name:
            result.append(name)

    return result

if __name__ == "__main__":
    x = int(input("Please enter district value"))
    village_check = int(input("Please enter village check value 1 or 0"))
    tehsil_check = int(input("Please enter tehsil value"))
    print(x)
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")
    # The original mkdir-in-except never changed into the new directory;
    # create it if needed, then always chdir.
    if not os.path.isdir("Haryana_Check_v1"):
        os.mkdir("Haryana_Check_v1")
    os.chdir("Haryana_Check_v1")
    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")
    for district_id, district_name in districts[x:]:
        print("Scraping from district %s:" % district_name)
        form["__EVENTTARGET"] = district_event_target
        form[district_event_target] = district_id

        soup = BS(requests.post(URL, data=form).content, "html.parser")

        form = get_aspnet_form(soup)
        tehsils, tehsil_event_target = get_options(soup, "tehsil")
        for tehsil_id, tehsil_name in tehsils[tehsil_check:]:
            print("Scraping from tehsil %s:" % tehsil_name)
            form["__EVENTTARGET"] = tehsil_event_target
            form[district_event_target] = district_id
            form[tehsil_event_target] = tehsil_id

            soup = BS(requests.post(URL, data=form).content, "html.parser")
            if village_check == 1:
                i = int(input("Please enter village_value")) * 5
                village_check = 0
            elif village_check == 0:
                i = 0

            form = get_aspnet_form(soup)

            villages, village_event_target = get_options(soup, "village")
            for village_id, village_name in villages[i:]:
                if i % 5 == 0:
                    print("Scraping from village %s:" % village_name)
                    form["__EVENTTARGET"] = village_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    form[village_event_target] = village_id

                    soup = BS(requests.post(URL, data=form).content, "html.parser")

                    form = get_aspnet_form(soup)
                    periods, period_event_target = get_options(soup, "period")
                    for period_id, period_name in periods:
                        total_df = pd.DataFrame()
                        print("DF IS CREATED")
                        print("Scraping from period %s:" % period_name)
                        form["__EVENTTARGET"] = period_event_target
                        form[district_event_target] = district_id
                        form[tehsil_event_target] = tehsil_id
                        form[period_event_target] = period_id

                        soup = BS(requests.post(URL, data=form).content, "html.parser")

                        form = get_aspnet_form(soup)
                        owners, owner_event_target = get_options(soup, "owner")
                        form["__EVENTTARGET"] = owner_event_target
                        form[district_event_target] = district_id
                        form[tehsil_event_target] = tehsil_id
                        form[period_event_target] = period_id
                        form[owner_event_target] = "1"
                        soup = BS(requests.post(URL, data=form).content, "html.parser")
                        form = get_aspnet_form(soup)
                        names, name_event_target = get_options(soup, "record")
                        form["__EVENTTARGET"] = name_event_target
                        namecount = 0
                        name_not_done = []
                        for name_id, name in names:
                            try:
                                print(namecount)
                                if '?' not in name:
                                    print("Scraping name " + str(name))
                                    form[district_event_target] = district_id
                                    form[tehsil_event_target] = tehsil_id
                                    form[period_event_target] = period_id
                                    form[village_event_target] = village_id
                                    form[owner_event_target] = "1"
                                    search_new = 'ctl00$ContentPlaceHolder1$txtsearch:'
                                    form[search_new] = " "
                                    placeholder_new = 'ctl00$ContentPlaceHolder1$b'
                                    form[placeholder_new] = 'RdbtnOwner'
                                    form[name_event_target] = name
                                    soup = BS(requests.post(URL, data=form).content, "html.parser")

                                    df = name_func(soup, district_id, village_id, tehsil_id, period_id, name)
                                    total_df = pd.concat([total_df, df])
                                    total_df.to_csv('total_df' + '_' + str(village_name) + "_" + str(period_name) + ".csv")
                                    # The original concatenated str + int here, which raised a
                                    # TypeError that the bare except below silently swallowed.
                                    print("total_df length is " + str(len(total_df)))
                                namecount += 1
                            except Exception:
                                namecount += 1
                                name_not_done.append(namecount)

                        df_not_name = pd.DataFrame(columns=['nameids'])
                        df_not_name['nameids'] = name_not_done
                        df_not_name[district_event_target] = district_id
                        df_not_name[tehsil_event_target] = tehsil_id
                        df_not_name[period_event_target] = period_id
                        df_not_name[village_event_target] = village_id
                        df_not_name.to_csv('name_not' + str(district_id) + str(tehsil_id) + str(period_id) + str(village_id) + '.csv')
                i += 1

                form = 0
                soup = 0
                soup, form = change_village(district_id, tehsil_id)

            form = 0
            soup = 0
            soup, form = change_tehsil(district_id)

        form = 0
        soup = 0
        soup, form = change_district()
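
To confirm the stall really is DNS rather than the scraping logic, you can probe name resolution for the host directly; if this also fails intermittently while the scraper is stuck, the problem is the network or resolver, not the form handling. A standalone check, not part of the script above:

import socket
import time

for attempt in range(3):
    try:
        # Same call the traceback fails in: resolve the host requests connects to.
        print(socket.getaddrinfo("jamabandi.nic.in", 443, proto=socket.IPPROTO_TCP))
        break
    except socket.gaierror as exc:
        print("DNS lookup failed:", exc)
        time.sleep(5)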


More details here: https://stackoverflow.com/questions/791 ... ebscraping