学信网高校信息爬取

代码学习

这是学信网高校信息爬取的代码(Python),写得不好,请大家多多包涵并指教。最近比较忙,注释和代码的优化以后有时间再搞。

import requests #爬虫库
from bs4 import BeautifulSoup #html文本解析库
import bs4

def fileclear(file):
    """Discard the entire contents of an open, seekable file object.

    The stream position is moved back to the start and the file is cut
    down to zero length, so the next write begins on an empty file.
    """
    file.seek(0)      # rewind to the beginning
    file.truncate(0)  # drop everything from offset 0 onward

def gethtml(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The GET request uses a 30-second timeout and the response text is
    parsed with the "html.parser" backend.
    """
    response = requests.get(url, timeout=30)
    return BeautifulSoup(response.text, "html.parser")

def get_trs(table):
    """Return all ``<tr>`` elements found under *table*.

    If *table* is not a ``bs4`` ``Tag`` (for example ``None`` from a
    failed ``find``), an empty list is returned instead.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    # Calling a Tag directly is shorthand for find_all(); spelled out here.
    return table.find_all('tr')

def get_tds(table):
    """Return all ``<td>`` elements found under *table*.

    If *table* is not a ``bs4`` ``Tag`` (for example ``None`` from a
    failed ``find``), an empty list is returned instead.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    # Calling a Tag directly is shorthand for find_all(); spelled out here.
    return table.find_all('td')

def gettime(name):
    """Return the text that follows the "创办时间" label on a Baidu Baike page.

    Fetches ``https://baike.baidu.com/item/<name>``, takes the text of the
    ``div`` whose class is "basic-info cmn-clearfix" (newlines removed) and
    returns the four characters immediately after the label "创办时间"
    (presumably the founding year - confirm against live pages).

    Args:
        name: Entry name appended to the Baike item URL.

    Returns:
        The four-character string after the label, or "" when the info box
        or the label is missing from the page.
    """
    html = requests.get(
        "https://baike.baidu.com/item/" + name,
        timeout=30,
        headers={'user-agent': 'Mozilla/5.0'},
    )
    html.encoding = 'utf-8'
    results = BeautifulSoup(html.text, "html.parser")

    infobox = results.find('div', "basic-info cmn-clearfix")
    if infobox is None:
        # No info box on the page; previously this raised AttributeError.
        return ""
    context = infobox.text.replace('\n', '')

    label = "创办时间"
    start = context.find(label)
    if start == -1:
        # Label absent; previously the -1 offset produced an unrelated slice.
        return ""
    begin = start + len(label)
    return context[begin:begin + 4]

def nametolocation1(name, city, key="自己的key"):
    """Look up coordinates for *name* via the Amap place text-search endpoint.

    Sends a GET request to ``https://restapi.amap.com/v3/place/text`` asking
    for a single result in XML and returns the text of the first
    ``<location>`` element in the response.

    Args:
        name: Search keywords (the school name in this script).
        city: City passed as the ``city`` query parameter.
        key: Amap API key. The default is the placeholder from the original
            code and must be replaced with a real key.

    Returns:
        The ``<location>`` text, or "0,0" if the request fails or the
        response contains no ``<location>`` element.
    """
    # Let requests build the query string so that characters such as "&"
    # or "#" inside name/city are escaped instead of corrupting the URL.
    params = {
        'keywords': name,
        'types': '高等院校',
        'city': city,
        'offset': '1',
        'page': '1',
        'extensions': 'base',
        'output': 'XML',
        'key': key,
    }
    try:
        html = requests.get("https://restapi.amap.com/v3/place/text", params=params, timeout=30)
        results = BeautifulSoup(html.text, "html.parser")
        node = results.find('location')
        location = node.text if node is not None else "0,0"
    except Exception:
        # Best-effort lookup: any failure maps to the "0,0" sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        location = "0,0"
    return location

def nametolocation2(name, city, key="自己的key"):
    """Look up coordinates for an address via the Amap geocoding endpoint.

    Sends a GET request to ``https://restapi.amap.com/v3/geocode/geo``
    asking for XML output and returns the text of the first ``<location>``
    element in the response.

    Args:
        name: Address string sent as the ``address`` query parameter.
        city: City passed as the ``city`` query parameter.
        key: Amap API key. The default is the placeholder from the original
            code and must be replaced with a real key.

    Returns:
        The ``<location>`` text, or "0,0" if the request fails or the
        response contains no ``<location>`` element.
    """
    # Let requests build the query string: street addresses often contain
    # "#" or "&", which would otherwise truncate or corrupt the URL.
    params = {
        'address': name,
        'city': city,
        'output': 'XML',
        'key': key,
    }
    try:
        html = requests.get("https://restapi.amap.com/v3/geocode/geo", params=params, timeout=30)
        results = BeautifulSoup(html.text, "html.parser")
        node = results.find('location')
        location = node.text if node is not None else "0,0"
    except Exception:
        # Best-effort lookup: any failure maps to the "0,0" sentinel, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        location = "0,0"
    return location

# ---------------------------------------------------------------------------
# Script body: walk the CHSI undergraduate school listing page by page, open
# each school's detail page for website / phone / address, geocode the school
# with both Amap helpers, and write one CSV row per school.
# ---------------------------------------------------------------------------
import csv  # only this script section needs it, so it is imported here

PAGE_COUNT = 64  # number of listing pages to fetch (hard-coded in the original)
PAGE_SIZE = 20   # step of the "start" query parameter between pages


def _clean(text):
    """Remove spaces, newlines and carriage returns from scraped text."""
    return text.replace(' ', '').replace('\n', '').replace('\r', '')


def _mark_icons(text):
    """Replace Unicode private-use characters in *text* with 'yes'.

    NOTE(review): the original code called ``.replace('', 'yes')`` with an
    empty search string, which inserts 'yes' between every character. The
    literal was presumably an icon-font glyph (private-use code point) used
    as a tick mark and lost when the code was copied - confirm against the
    live page.
    """
    return ''.join('yes' if '\ue000' <= ch <= '\uf8ff' else ch for ch in text)


def _lng_lat(location):
    """Split a "lng,lat" string into two fields, or ['0', '0'] if malformed."""
    parts = location.split(',')
    return parts if len(parts) == 2 else ['0', '0']


# Mode "w" truncates the file on open (the original opened with "a" and then
# truncated by hand); the with-block closes the file even if the crawl fails.
# newline='' is what the csv module expects for files it writes.
with open('F:/毕业设计/数据处理/数据/学信网高校数据.csv', "w", encoding="utf-8", newline='') as textfile1:
    # csv.writer quotes fields containing commas, so scraped text can no
    # longer shift columns.
    writer = csv.writer(textfile1)
    writer.writerow("编号,名称,城市,主管单位,院校类型,层次,一流大学,一流学科,研究生院,满意度,网址,电话,地址,dl经度,dl纬度,poi经度,poi纬度".split(','))

    num = 1
    for i in range(PAGE_COUNT):
        urlnum = i * PAGE_SIZE
        results = gethtml('https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start=' + str(urlnum))
        table = results.find('table')
        for tritem in get_trs(table):
            if tritem.td is None:
                # Rows without any <td> carry no school data; skip them.
                continue

            # Detail page: website, phone and address. Any failure falls
            # back to "无" for all three fields.
            weburl = telenum = place = "无"
            link = tritem.td.a
            if link is not None and link.get('href'):
                try:
                    schoolinfo = gethtml('https://gaokao.chsi.com.cn' + link['href'])
                    schoolinfolist = schoolinfo.find('div', "mid")('div')
                    web_telelist = schoolinfolist[0]('span')
                    weburl = _clean(web_telelist[0].text)
                    telenum = _clean(web_telelist[1].text)
                    place = _clean(schoolinfolist[1].text)
                except Exception:
                    weburl = telenum = place = "无"

            # One cleaned value per <td> of the listing row.
            cells = [_mark_icons(_clean(tditem.text)) for tditem in get_tds(tritem)]
            # Read name and city straight from the cells instead of
            # re-splitting a joined line, so commas inside a cell are safe.
            schname = cells[0]
            city = cells[1] if len(cells) > 1 else ''
            print(city)

            row = [str(num)] + cells + [weburl, telenum, place]
            num = num + 1
            row += _lng_lat(nametolocation2(place, city))    # geocode by address
            row += _lng_lat(nametolocation1(schname, city))  # POI search by name
            print(','.join(row))
            writer.writerow(row)
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值