代码学习
这是爬取学信网高校信息的代码(Python)。写得不好,请大家多多包涵并指教;最近比较忙,注释和代码的优化以后有时间再做。
import requests #爬虫库
from bs4 import BeautifulSoup #html文本解析库
import bs4
def fileclear(file):
    """Discard everything in an open file object.

    Args:
        file: A seekable, writable file object. It is rewound to offset 0
            and cut off there, so it ends up empty.
    """
    # seek() reports the new offset (0); truncating at that offset empties
    # the file regardless of where the cursor was before the call.
    start = file.seek(0)
    file.truncate(start)
def gethtml(url):
    """Download *url* and return its body as a parsed HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built from the response text with the
        "html.parser" backend. The HTTP status code is not checked.
    """
    response = requests.get(url, timeout=30)  # give up after 30 seconds
    return BeautifulSoup(response.text, "html.parser")
def get_trs(table):
    """Return all <tr> tags found under *table*.

    Anything that is not a bs4 Tag (for example the None that find()
    gives back when nothing matched) yields an empty list.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    # Calling a Tag directly is shorthand for find_all(); spell it out.
    return table.find_all('tr')
def get_tds(table):
    """Return all <td> tags found under *table*.

    Anything that is not a bs4 Tag (for example None) yields an empty
    list instead of raising.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    # Calling a Tag directly is shorthand for find_all(); spell it out.
    return table.find_all('td')
def gettime(name):
    """Look up when *name* was founded, according to its Baidu Baike entry.

    The entry page is fetched, the text of its
    ``<div class="basic-info cmn-clearfix">`` box is flattened onto one line,
    and the four characters that follow the "创办时间" label are returned.

    Args:
        name: Entry title, appended to ``https://baike.baidu.com/item/``.

    Returns:
        The four characters after the label, or "" when the page has no
        basic-info box or the box has no "创办时间" label.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    response = requests.get(
        "https://baike.baidu.com/item/" + name,
        timeout=30,
        headers={'user-agent': 'Mozilla/5.0'},
    )
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    info_box = soup.find('div', "basic-info cmn-clearfix")
    if info_box is None:
        # find() returns None when the box is absent; calling .text on it
        # used to raise AttributeError.
        return ""

    context = info_box.text.replace('\n', '')
    label = "创办时间"
    start = context.find(label)
    if start == -1:
        # str.find() signals "not found" with -1; slicing from that offset
        # used to return four unrelated characters from the start of the box.
        return ""

    value_start = start + len(label)
    return context[value_start:value_start + 4]
def nametolocation1(name, city):
    """Find the coordinates of a school with the AMap place-search API.

    Queries ``/v3/place/text`` for *name* restricted to type "高等院校" and
    to *city*, asking for a single result in XML.

    Args:
        name: Search keywords (the school name).
        city: City used to narrow the search.

    Returns:
        The text of the first ``<location>`` element in the response, or
        "0,0" when the request fails or no usable location is present.
    """
    # Let requests build the query string so that characters such as "&",
    # "#" or "+" inside a value are escaped instead of corrupting the URL.
    query = {
        "keywords": name,
        "types": "高等院校",
        "city": city,
        "offset": 1,
        "page": 1,
        "extensions": "base",
        "output": "XML",
        "key": "自己的key",
    }
    try:
        response = requests.get(
            "https://restapi.amap.com/v3/place/text",
            params=query,
            timeout=30,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        tag = soup.find('location')
        location = tag.text.strip() if tag is not None else ""
    except Exception:
        # Best-effort lookup: any failure degrades to the placeholder, but
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # by the bare "except:".
        location = ""
    # The crawl loop appends this value as two comma-separated CSV columns,
    # so never hand back an empty string.
    return location or "0,0"
def nametolocation2(name, city):
    """Find the coordinates of an address with the AMap geocoding API.

    Queries ``/v3/geocode/geo`` for *name* within *city*, asking for XML.

    Args:
        name: Address text to geocode.
        city: City used to narrow the lookup.

    Returns:
        The text of the first ``<location>`` element in the response, or
        "0,0" when the request fails or no usable location is present.
    """
    # Let requests build the query string so that characters such as "&",
    # "#" or "+" inside an address are escaped instead of corrupting the URL.
    query = {
        "address": name,
        "city": city,
        "output": "XML",
        "key": "自己的key",
    }
    try:
        response = requests.get(
            "https://restapi.amap.com/v3/geocode/geo",
            params=query,
            timeout=30,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        tag = soup.find('location')
        location = tag.text.strip() if tag is not None else ""
    except Exception:
        # Best-effort lookup: any failure degrades to the placeholder, but
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # by the bare "except:".
        location = ""
    # The crawl loop appends this value as two comma-separated CSV columns,
    # so never hand back an empty string.
    return location or "0,0"
# Crawl the school listing on gaokao.chsi.com.cn page by page, visit each
# school's detail page for its website / phone / address, look up two sets of
# coordinates through the AMap helpers, and write one CSV row per school.


def _squash(text):
    """Strip spaces and newlines from *text* and neutralise ASCII commas."""
    # Rows are assembled by joining values with ASCII commas, so a comma
    # inside a value would shift every later column. The original
    # replace(',', ',') was a no-op; the full-width comma is substituted
    # instead.
    return text.replace(' ', '').replace('\n', '').replace(',', ',')


def _table_cell(text):
    """Clean one listing-table cell, turning icon glyphs into "yes"."""
    # NOTE(review): the original called replace('', 'yes'), which inserts
    # "yes" between every character of every cell. The search string was
    # presumably an icon-font tick (a Unicode Private Use Area character)
    # lost when the code was pasted, so any such character is mapped to
    # "yes" here. Confirm against the live page markup.
    return ''.join(
        'yes' if '\ue000' <= ch <= '\uf8ff' else ch for ch in _squash(text)
    )


# Mode "w" empties the file on open (the original opened in "a" mode and
# truncated by hand); the with-block closes it even if the crawl raises.
with open('F:/毕业设计/数据处理/数据/学信网高校数据.csv', "w", encoding="utf-8") as textfile1:
    textfile1.write("编号,名称,城市,主管单位,院校类型,层次,一流大学,一流学科,研究生院,满意度,网址,电话,地址,dl经度,dl纬度,poi经度,poi纬度\n")
    num = 1  # running row number written to the first column
    for i in range(64):  # 64 listing pages are requested
        urlnum = i * 20  # the "start" offset advances by 20 per page
        results = gethtml('https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start=' + str(urlnum))
        table = results.find('table')
        for tritem in get_trs(table):
            if tritem.td is None:
                # Rows without any <td> carry no school data.
                continue

            try:
                # The link lookup sits inside the try so that a row without
                # an <a href> falls back to the placeholders instead of
                # aborting the crawl.
                schurl = tritem.td.a['href']
                schoolinfo = gethtml('https://gaokao.chsi.com.cn' + schurl)
                schoolinfolist = schoolinfo.find('div', "mid")('div')
                web_telelist = schoolinfolist[0]('span')
                weburl = _squash(web_telelist[0].text)
                telenum = _squash(web_telelist[1].text)
                place = _squash(schoolinfolist[1].text)
            except Exception:
                # Detail page missing or laid out differently: all three
                # fields get the placeholder, as in the original.
                weburl = "无"
                telenum = "无"
                place = "无"

            fields = [str(num)]
            fields.extend(_table_cell(tditem.text) for tditem in get_tds(tritem))
            fields.extend([weburl, telenum, place])
            num = num + 1

            # Read name and city straight from the field list rather than
            # re-splitting the joined row.
            schname = fields[1]
            city = fields[2]
            print(city)

            # Each helper returns "longitude,latitude", filling two columns.
            fields.append(nametolocation2(place, city))
            fields.append(nametolocation1(schname, city))
            trtext = ','.join(fields)
            print(trtext)
            textfile1.write(trtext.replace('\r', '') + "\n")