利用requests库爬取大学导师信息网站并写入Excel文件
1. 利用requests库获取导师网站的页面HTML信息
利用导师URL编码的规律,通过requests库的Session会话成功获取每一个页面的HTML。
from pyquery import PyQuery as pq
import requests
import re
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def getPage(userId, headers):
    """Fetch one tutor's profile page from daoshi.eol.cn and extract its fields.

    Tutor pages live at https://daoshi.eol.cn/tutor/<id>, so crawling is a
    matter of iterating numeric ids.  The HTML is parsed with pyquery; the
    CSS selectors below were taken from the browser devtools (F12), each one
    pinning down a single block of the profile.

    Args:
        userId: numeric tutor id (int or str) appended to the tutor URL.
        headers: dict of HTTP headers (Host / User-Agent) for the request.

    Returns:
        A 13-element list [name, sex, university, school, level, kind, field,
        researchArea, phone, email, address, statement, achievement] with
        'Null' standing in for missing fields, or None when the response
        body is empty.
    """
    url_tutor_specific = 'https://daoshi.eol.cn/tutor/' + str(userId)
    time.sleep(0.2)  # throttle: be polite to the server
    # Retry transient connection failures up to 5 times with backoff.
    session = requests.Session()
    retry = Retry(connect=5, backoff_factor=1)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.keep_alive = False
    info = session.get(url_tutor_specific, headers=headers)
    # BUG FIX: the original tested `pq(info.text) == ''`, which compares a
    # PyQuery object to a string and is never True, so the empty-page guard
    # was dead code.  Test the raw response body instead.
    if info.text == '':
        return None
    doc = pq(info.text)

    # Shared selector roots (every devtools path starts from one of these).
    _HDR = ('body > div.container.clearfix > div.main-lf.clearfix > '
            'div.main-lf-header.clearfix > div.teacher-jieshao > ')
    _CON = ('body > div.container.clearfix > div.main-lf.clearfix > '
            'div.main-lf-con > ')

    def _field(selector, skip=0):
        # Text of the selected node with the first `skip` characters (a
        # fixed-width Chinese label such as '导师姓名:') removed; 'Null'
        # when missing.  Label widths match the site's current layout —
        # TODO confirm if the site markup changes.
        text = doc(selector).text()[skip:]
        return text if text != '' else 'Null'

    def _second_line(selector, skip):
        # Some blocks stack two labelled lines; take the second one.
        # BUG FIX: the original compared the PyQuery object to '' (always
        # False) and then indexed split('\n')[1] unconditionally, raising
        # IndexError on single-line blocks, which silently dropped the
        # whole tutor via the caller's `except IndexError`.
        lines = doc(selector).text().split('\n')
        if len(lines) < 2:
            return 'Null'
        return lines[1][skip:] or 'Null'

    nameOfTutor = _field(_HDR + 'div:nth-child(1) > div:nth-child(1)', 5)
    sexOfTutor = _field(_HDR + 'div:nth-child(1) > div:nth-child(2) > span')
    universityOfTutor = _field(_HDR + 'div:nth-child(2) > div > a')
    schoolOfTutor = _second_line(_HDR + 'div:nth-child(2)', 5)
    print(nameOfTutor, sexOfTutor, universityOfTutor, schoolOfTutor)
    levelOfTutor = _field(_HDR + 'div:nth-child(3) > div', 3)
    kindOfTutor = _second_line(_HDR + 'div:nth-child(3)', 5)
    fieldOfTutor = _field(_HDR + 'div:nth-child(4)', 5)
    researchAreaOfTutor = _field(_HDR + 'div.teacher-td.xinxi-less > p')
    phoneOfTutor = _field(_CON + 'div:nth-child(1) > div.lf-item-con > div:nth-child(1)', 5)
    emailOfTutor = _field(_CON + 'div:nth-child(1) > div.lf-item-con > div:nth-child(2)', 5)
    addressOfTutor = _field(_CON + 'div:nth-child(1) > div.lf-item-con > div:nth-child(3)', 5)
    statementOfTutor = _field(_CON + 'div:nth-child(2) > div.lf-item-con > p:nth-child(1)')
    achievementOfTutor = _field(_CON + 'div:nth-child(3) > div.lf-item-con')
    print(levelOfTutor, kindOfTutor, fieldOfTutor, researchAreaOfTutor,
          phoneOfTutor, emailOfTutor, addressOfTutor)
    print(statementOfTutor)
    print(achievementOfTutor)
    return [nameOfTutor, sexOfTutor, universityOfTutor, schoolOfTutor,
            levelOfTutor, kindOfTutor, fieldOfTutor, researchAreaOfTutor,
            phoneOfTutor, emailOfTutor, addressOfTutor, statementOfTutor,
            achievementOfTutor]
3. 利用pandas库存储全部导师信息到Excel表格
通过for循环和try/except的方式,分批将每个网页提取的导师信息存入DataFrame中,先追加写入CSV文件,最后将CSV文件转化成xlsx格式。
def toExcel(headers):
    """Crawl tutor ids 1..49999 and persist the results.

    Rows are buffered in a DataFrame and appended to ./TutorInfo.csv every
    200 ids (the first flush writes the CSV header); every 800 ids a
    snapshot xlsx is written, and a final TutorInfo_total.xlsx is produced
    at the end.

    Args:
        headers: HTTP headers forwarded to getPage for every request.
    """
    index = 0  # running row label inside the buffer DataFrame
    df_info = pd.DataFrame(columns=['nameOfTutor', 'sexOfTutor', 'universityOfTutor',
                                    'schoolOfTutor', 'levelOfTutor', 'kindOfTutor',
                                    'fieldOfTutor', 'researchAreaOfTutor', 'phoneOfTutor',
                                    'emailOfTutor', 'addressOfTutor', 'statementOfTutor',
                                    'achievementOfTutor'])
    for each in range(1, 50000):  # 调整爬取人数的数量 (adjust crawl size here)
        try:
            info = getPage(str(each), headers)
        except IndexError:
            # Unexpected page layout: skip this tutor.
            continue
        # BUG FIX: getPage returns None (never '') when the page is empty;
        # the original `info == ''` check could not catch that and
        # `df_info.loc[index] = None` would have raised.
        if info is None:
            continue
        df_info.loc[index] = info
        index += 1
        if each == 200:
            # First flush carries the CSV header row.
            df_info.to_csv('./TutorInfo.csv', mode='a', header=True, index=False)
            df_info.drop(df_info.index, inplace=True)
            print(each)
        elif each % 200 == 0:
            df_info.to_csv('./TutorInfo.csv', mode='a', header=False, index=False)
            df_info.drop(df_info.index, inplace=True)
            print(each)
        if each % 800 == 0:
            # Periodic snapshot so partial results survive a crash.
            df_read = pd.read_csv('./TutorInfo.csv', header=0)
            df_read.to_excel('./TutorInfo_' + str(each) + '.xlsx', header=True, index=False)
    # BUG FIX: rows accumulated since the last multiple of 200 were never
    # written, so the final workbook silently dropped the last batch.
    if not df_info.empty:
        df_info.to_csv('./TutorInfo.csv', mode='a', header=False, index=False)
    df_read = pd.read_csv('./TutorInfo.csv', header=0)
    df_read.to_excel('./TutorInfo_total.xlsx', header=True, index=False)
if __name__ == '__main__':
    # Host plus a desktop Chrome User-Agent so the site serves the normal page.
    request_headers = {
        'Host': 'daoshi.eol.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    toExcel(request_headers)
进一步了解详情请联系modas_lee@foxmail.com