学信网高校信息爬取

最新推荐文章于 2024-02-23 11:56:33 发布

天元GIS

最新推荐文章于 2024-02-23 11:56:33 发布

阅读量1.8k

点赞数

分类专栏：爬虫　文章标签：python

本文链接：https://blog.csdn.net/qq_41527968/article/details/106052096

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

代码学习

这是学信网高校信息爬取的代码（Python）。写得不好，请大家多多包涵并指教；最近比较忙，注释和代码的优化以后有时间再补充。

import requests #爬虫库
from bs4 import BeautifulSoup #html文本解析库
import bs4

def fileclear(file):
    """Empty an open file object in place.

    The file is rewound to offset 0 and cut off at that position, so
    everything it previously held is discarded.
    """
    # seek() returns the new absolute position (0), which is exactly the
    # size the file is truncated to.
    file.truncate(file.seek(0))

def gethtml(url):
    """Download ``url`` and return the page as a parsed BeautifulSoup tree.

    The page is fetched with a plain GET (30-second timeout) and its text is
    parsed with the ``html.parser`` backend.
    """
    response = requests.get(url, timeout=30)
    return BeautifulSoup(response.text, "html.parser")

def get_trs(table):
    """Return all ``<tr>`` elements found under ``table``.

    Anything that is not a BeautifulSoup ``Tag`` (for instance ``None``)
    produces an empty list, so callers can iterate the result directly.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    return table.find_all('tr')

def get_tds(table):
    """Return all ``<td>`` elements found under ``table``.

    Anything that is not a BeautifulSoup ``Tag`` (for instance ``None``)
    produces an empty list, so callers can iterate the result directly.
    """
    if not isinstance(table, bs4.element.Tag):
        return []
    return table.find_all('td')

def gettime(name):
    """Look up the founding year of ``name`` on Baidu Baike.

    Fetches ``https://baike.baidu.com/item/<name>``, takes the text of the
    ``div`` whose class is ``"basic-info cmn-clearfix"`` and returns the four
    characters that directly follow the "创办时间" label.

    Args:
        name: Entry title appended to the Baike item URL.

    Returns:
        The four characters after the label (expected to be the year), or an
        empty string when the info box or the label is not present on the
        page.
    """
    label = "创办时间"
    response = requests.get(
        "https://baike.baidu.com/item/" + name,
        timeout=30,
        headers={'user-agent': 'Mozilla/5.0'},
    )
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    info_box = soup.find('div', "basic-info cmn-clearfix")
    if info_box is None:
        # No info box on the page: nothing to extract.
        return ""

    context = info_box.text.replace('\n', '')
    start = context.find(label)
    if start == -1:
        # str.find reports "not found" as -1; slicing relative to that would
        # silently return unrelated characters from the start of the text.
        return ""

    year_start = start + len(label)
    return context[year_start:year_start + 4]

def nametolocation1(name,city):
    """Resolve a school name to coordinates via the AMap place-text search.

    Queries ``/v3/place/text`` for ``name`` restricted to ``city`` and the
    "高等院校" type, asking for a single result in XML, and returns the text of
    the first ``<location>`` element in the response.

    Args:
        name: Keyword to search for (the school name).
        city: City used to restrict the search.

    Returns:
        The ``<location>`` text (presumably "longitude,latitude" -- confirm
        against the AMap docs), or ``"0,0"`` when the request fails or the
        response contains no ``<location>`` element.
    """
    # Passing the query as ``params`` lets requests percent-encode each value,
    # so characters such as '&' or '#' in a name cannot corrupt the query.
    query = {
        'keywords': name,
        'types': '高等院校',
        'city': city,
        'offset': '1',
        'page': '1',
        'extensions': 'base',
        'output': 'XML',
        'key': '自己的key',  # placeholder: replace with your own AMap key
    }
    try:
        response = requests.get("https://restapi.amap.com/v3/place/text", params=query, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        location = soup.find('location').text
    except Exception:
        # Best-effort lookup: any request/parse failure maps to the sentinel.
        # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit working.
        location = "0,0"
    return location

def nametolocation2(name,city):
    """Resolve an address to coordinates via the AMap geocoding endpoint.

    Queries ``/v3/geocode/geo`` with ``name`` as the address and ``city`` as
    the city, asking for XML, and returns the text of the first
    ``<location>`` element in the response.

    Args:
        name: Address string to geocode.
        city: City used to disambiguate the address.

    Returns:
        The ``<location>`` text (presumably "longitude,latitude" -- confirm
        against the AMap docs), or ``"0,0"`` when the request fails or the
        response contains no ``<location>`` element.
    """
    # Passing the query as ``params`` lets requests percent-encode each value,
    # so characters such as '&' or '#' in an address cannot corrupt the query.
    query = {
        'address': name,
        'city': city,
        'output': 'XML',
        'key': '自己的key',  # placeholder: replace with your own AMap key
    }
    try:
        response = requests.get("https://restapi.amap.com/v3/geocode/geo", params=query, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        location = soup.find('location').text
    except Exception:
        # Best-effort lookup: any request/parse failure maps to the sentinel.
        # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit working.
        location = "0,0"
    return location

def _csv_safe(text):
    """Drop spaces/newlines and swap ASCII commas for full-width ones so the value stays in one CSV cell."""
    return text.replace(' ','').replace('\n','').replace(',','，')


# Crawl the CHSI school search listing page by page. For every table row that
# carries a school link, fetch the detail page for website/phone/address, look
# the school up twice on AMap, and append one comma-separated line to the CSV.
#
# Mode "w" discards any previous output, and the with-block guarantees the
# file is closed even if a request or parse step raises mid-crawl.
with open('F:/毕业设计/数据处理/数据/学信网高校数据.csv',"w", encoding="utf-8") as textfile1:
    textfile1.write("编号,名称,城市,主管单位,院校类型,层次,一流大学,一流学科,研究生院,满意度,网址,电话,地址,dl经度,dl纬度,poi经度,poi纬度\n")

    num=1  # running row number written to the first CSV column
    for i in range(64):
        # The listing is paged through its ``start`` offset in steps of 20.
        urlnum=i*20
        results=gethtml('https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start='+str(urlnum))
        table=results.find('table')
        for tritem in get_trs(table):
            # Rows without any <td> carry no school data.
            if tritem.td is None:
                continue
            # A data row without a usable link cannot be enriched; skip it
            # instead of aborting the whole crawl.
            link=tritem.td.a
            schurl=link.get('href') if link is not None else None
            if not schurl:
                continue

            trtext=str(num)
            try:
                schoolinfo=gethtml('https://gaokao.chsi.com.cn'+schurl)
                schoolinfolist=schoolinfo.find('div',"mid")('div')
                web_telelist=(schoolinfolist[0])('span')
                weburl=_csv_safe(web_telelist[0].text)
                telenum=_csv_safe(web_telelist[1].text)
                place=_csv_safe(schoolinfolist[1].text)
            except Exception:
                # Detail page missing or laid out differently: use "无"
                # placeholders. ``Exception`` (not a bare except) keeps Ctrl-C
                # working.
                weburl="无"
                telenum="无"
                place="无"
            num=num+1

            for tditem in get_tds(tritem):
                # NOTE(review): the search string of the last replace() is
                # empty as published, which makes str.replace insert 'yes'
                # between every character. It most likely held an icon glyph
                # (the tick used for the 一流大学/一流学科/研究生院 columns) that
                # was lost when the article was rendered -- restore the
                # original character before running.
                trtext=trtext+','+tditem.text.replace(' ','').replace('\n','').replace('','yes')
            trtext=trtext+','+weburl+','+telenum+','+place

            # Column 1 is the school name and column 2 the city (see header).
            columns=trtext.split(',')
            schname=columns[1]
            city=columns[2]
            print(city)
            # Each lookup returns one comma-containing string, which fills two
            # CSV columns: address geocode first (dl*), then POI search (poi*).
            trtext=trtext+','+nametolocation2(place,city)+','+nametolocation1(schname,city)
            print(trtext)
            textfile1.write(trtext.replace('\r','')+"\n")

天元GIS

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
学信网高校信息爬取

代码学习这是学信网高校信息爬取的代码（python）,写的不好，请大家多多包涵并指教，最近比较忙，注释和代码的优化以后有时间再搞。import requests #爬虫库from bs4 import BeautifulSoup #html文本解析库import bs4def fileclear(file): file.seek(0) file.truncate()def gethtml(url): html=requests.get(url,timeout=30)
复制链接

扫一扫