建议使用单进程爬虫。爬取研招网信息的瓶颈在于数据处理耗时长、数据量大,而不是抓取数据本身;如果使用多进程,不同进程必须把数据存放在不同的文件中,之后再把数据汇总也要花费大量时间。经过实际测试,多进程需要5~7天完成,单进程只需要1~2天完成。
代码可以直接运行,但需要一个包含学科代码的txt文件。该文件的内容可以从网页 https://yz.chsi.com.cn/zsml/pages/getZy.jsp 获取,
直接把它保存到一个txt文件中(命名为subject.txt),放在项目文件夹内即可。
程序会创建E:/yanzhaowang2.xlsx文件,存储运行结果,项目运行后得到的是研招网上所有的数据信息。
其结果示例如下
然后可以利用excel文件的筛选功能进行选择
# coding:utf-8
import ast
import multiprocessing
from multiprocessing import Pool

import requests
import xlsxwriter
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import ConnectionError
# Design note (no database is used):
# With multiple processes, different sheets of one Excel file cannot be written at the same time, because a single Excel file can only be handled by one process. The multi-process approach therefore collects the fetched data into lists and writes the separate lists into the Excel file afterwards.
num = 1 # Index of the next worksheet row to write (row 0 holds the column titles); advanced by getsubject()
def getschool(ssdm, yjxkdm):
    """Collect the names of every school listed for a province and discipline.

    The school list is requested through getpost() with an empty school name.
    The first page is parsed, then every further page announced by the pager
    is requested and parsed the same way.

    Args:
        ssdm: Province/municipality code.
        yjxkdm: First-level discipline code.

    Returns:
        A list of school-name strings in page order. Pages that could not be
        fetched are reported on stdout and contribute no entries.
    """

    def school_names(page):
        # One school per <td> that contains a link. The first 7 characters of
        # the link text are dropped -- presumably a "(code)" prefix in front
        # of the school name; confirm against the live page.
        links = (cell.find('a') for cell in page.find_all("td"))
        return [link.get_text()[7:] for link in links if link]

    def is_page(candidate):
        # getpost() returns -1 instead of a parsed document when it fails.
        return hasattr(candidate, 'find_all')

    soup = getpost(ssdm, '', yjxkdm, '')
    if not is_page(soup):
        print('获取院校列表失败,已跳过: ssdm=%s yjxkdm=%s' % (ssdm, yjxkdm))
        return []

    listschool = school_names(soup)

    pager_links = soup.find_all(href='#')
    if len(pager_links) > 1:
        # The text of the second-to-last pager link is read as the page count.
        pagenum = int(pager_links[-2].get_text())
        for pageno in range(2, pagenum + 1):
            soup = getpost(ssdm, '', yjxkdm, str(pageno))
            if not is_page(soup):
                print('获取院校列表失败,已跳过: ssdm=%s yjxkdm=%s pageno=%s'
                      % (ssdm, yjxkdm, pageno))
                continue
            listschool.extend(school_names(soup))
    return listschool
def getmajor(ssdm, dwmc, yjxkdm):
    """Collect the detail-page URLs of every major one school offers.

    The major list is requested through getpost() for the given school. The
    first page is parsed, then every further page announced by the pager is
    requested and parsed the same way.

    Args:
        ssdm: Province/municipality code.
        dwmc: School name.
        yjxkdm: First-level discipline code.

    Returns:
        A list of absolute URL strings, one per "查看" (view) link, in page
        order. Pages that could not be fetched are reported on stdout and
        contribute no entries.
    """

    def detail_urls(page):
        # Only the centred table cells are inspected; a cell counts when its
        # link is labelled "查看", and that link's href is the detail page.
        urls = []
        for cell in page.find_all("td", class_="ch-table-center"):
            link = cell.find('a')
            if link and link.get_text() == '查看':
                urls.append('https://yz.chsi.com.cn/' + link.get('href'))
        return urls

    def is_page(candidate):
        # getpost() returns -1 instead of a parsed document when it fails.
        return hasattr(candidate, 'find_all')

    soup = getpost(ssdm, dwmc, yjxkdm, '')
    if not is_page(soup):
        print('获取专业列表失败,已跳过: ssdm=%s dwmc=%s yjxkdm=%s'
              % (ssdm, dwmc, yjxkdm))
        return []

    listmajor = detail_urls(soup)

    # On this page the pager links use href="####" (the school list uses "#").
    pager_links = soup.find_all(href='####')
    if len(pager_links) > 1:
        # The text of the second-to-last pager link is read as the page count.
        pagenum = int(pager_links[-2].get_text())
        for pageno in range(2, pagenum + 1):
            soup = getpost(ssdm, dwmc, yjxkdm, str(pageno))
            if not is_page(soup):
                print('获取专业列表失败,已跳过: ssdm=%s dwmc=%s yjxkdm=%s pageno=%s'
                      % (ssdm, dwmc, yjxkdm, pageno))
                continue
            listmajor.extend(detail_urls(soup))
    return listmajor
def getsubject(url, province):
    """Fetch one major's detail page and append its rows to the worksheet.

    One worksheet row is written per exam-scope entry (class
    'zsml-res-items') found on the page, starting at the module-level row
    counter ``num``, which is advanced by one for every row written.

    Column layout of each row (matches the title row written by the script):
        0      province name passed in by the caller
        1-9    the page's 'zsml-summary' fields in document order: school,
               exam type, department, cross-major, major, study mode,
               research direction, supervisor, planned enrolment
        10+    the <td> cells of the exam-scope entry in document order:
               politics, foreign language, subject one, subject two

    Args:
        url: Absolute URL of the major's detail page.
        province: Province name stored in column 0 of every row.

    Returns:
        None. Output goes to the module-level ``worksheet``; a page that
        cannot be fetched is reported on stdout and skipped.
    """
    global num

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.post(url, timeout=10)
    except (requests.exceptions.Timeout,
            requests.exceptions.ConnectionError):
        print('获取专业详情失败,已跳过: %s' % url)
        return

    soup = BeautifulSoup(r.content, "lxml")
    # Admission conditions, shared by every exam-scope row of this major.
    condition = [field.get_text()
                 for field in soup.find_all(class_='zsml-summary')]
    # Exam scopes: each entry becomes one worksheet row.
    results = soup.find_all(class_='zsml-res-items')

    for result in results:
        for column, text in enumerate(condition, start=1):
            worksheet.write(num, column, text)

        for column, cell in enumerate(result.find_all("td"), start=10):
            text = cell.get_text()
            # Strip layout whitespace and the "see admission brochure" note.
            for noise in (' ', '见招生简章', '\n', '\r'):
                text = text.replace(noise, '')
            worksheet.write(num, column, text)

        # Progress line: department + major + research direction. Guarded so
        # that a page with fewer summary fields does not raise IndexError.
        if len(condition) > 6:
            print(condition[2] + condition[4] + condition[6])

        worksheet.write(num, 0, province)
        num = num + 1
def getpost(ssdm, dwmc, yjxkdm, pageno):
    """POST a catalogue query to yz.chsi.com.cn and return the parsed page.

    Pages after the first have no direct link, so paging is done by posting
    the ``pageno`` form field.

    Args:
        ssdm: Province/municipality code.
        dwmc: School name. An empty string selects the school-list endpoint
            (used by getschool); anything else selects the major-list
            endpoint for that school (used by getmajor).
        yjxkdm: First-level discipline code.
        pageno: Page number as a string; '' requests the first page.

    Returns:
        A BeautifulSoup document when the server answers with HTTP 200.
        -1 when the first answer has another status code, or when the first
        attempt and all 9 retries fail -- at that point a network problem or
        a blocked IP is very likely.
    """
    global NETWORK_STATUS

    if dwmc == '':
        url = 'https://yz.chsi.com.cn/zsml/queryAction.do'
    else:
        url = 'https://yz.chsi.com.cn/zsml/querySchAction.do'

    # Browser-like headers. Content-Length is left for requests to compute
    # from the encoded form body, and 'br' is not advertised because requests
    # can only decode Brotli when an optional package is installed.
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'yz.chsi.com.cn',
        'Origin': 'https://yz.chsi.com.cn',
        'Referer': 'https://yz.chsi.com.cn/zsml/queryAction.do?ssdm=11&dwmc=&mldm=&mlmc=&yjxkdm=0812&zymc=&xxfs=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'X-Requested': 'XMLHttpRequest',
    }
    data = {'ssdm': ssdm, 'dwmc': dwmc, 'yjxkdm': yjxkdm, 'pageno': pageno}

    # Failures that are worth retrying.
    transient = (requests.exceptions.Timeout,
                 requests.exceptions.ConnectionError)

    def parse(response):
        return BeautifulSoup(response.content.decode('utf-8'), 'lxml')

    # First attempt. The timeout is what lets a stalled request reach the
    # retry path instead of blocking forever.
    try:
        r = requests.post(url, headers=header, data=data, timeout=5)
    except transient:
        NETWORK_STATUS = False  # remember that the network misbehaved
    else:
        if r.status_code == 200:
            NETWORK_STATUS = True
            return parse(r)
        return -1

    # Retry path: up to 9 more attempts, each guarded so that a repeated
    # timeout moves on to the next attempt instead of escaping the function.
    for i in range(1, 10):
        print('请求超时,第%s次重复请求' % i)
        try:
            r = requests.post(url, headers=header, data=data, timeout=5)
        except transient:
            continue
        if r.status_code == 200:
            NETWORK_STATUS = True
            return parse(r)

    return -1
# Network status flag; getpost() sets it to False when a request times out.
NETWORK_STATUS = True

# Provinces/municipalities to crawl: "mc" is the name, "dm" the code sent to
# the site. Trim this list to crawl only part of the country, e.g. to split a
# long run into several shorter ones.
province = [
    {"mc": "北京市", "dm": "11"},
    {"mc": "天津市", "dm": "12"},
    {"mc": "河北省", "dm": "13"},
    {"mc": "山西省", "dm": "14"},
    {"mc": "内蒙古自治区", "dm": "15"},
    {"mc": "辽宁省", "dm": "21"},
    {"mc": "吉林省", "dm": "22"},
    {"mc": "黑龙江省", "dm": "23"},
    {"mc": "上海市", "dm": "31"},
    {"mc": "江苏省", "dm": "32"},
    {"mc": "浙江省", "dm": "33"},
    {"mc": "安徽省", "dm": "34"},
    {"mc": "福建省", "dm": "35"},
    {"mc": "江西省", "dm": "36"},
    {"mc": "山东省", "dm": "37"},
    {"mc": "河南省", "dm": "41"},
    {"mc": "湖北省", "dm": "42"},
    {"mc": "湖南省", "dm": "43"},
    {"mc": "广东省", "dm": "44"},
    {"mc": "广西壮族自治区", "dm": "45"},
    {"mc": "海南省", "dm": "46"},
    {"mc": "重庆市", "dm": "50"},
    {"mc": "四川省", "dm": "51"},
    {"mc": "贵州省", "dm": "52"},
    {"mc": "云南省", "dm": "53"},
    {"mc": "西藏自治区", "dm": "54"},
    {"mc": "陕西省", "dm": "61"},
    {"mc": "甘肃省", "dm": "62"},
    {"mc": "青海省", "dm": "63"},
    {"mc": "宁夏回族自治区", "dm": "64"},
    {"mc": "新疆维吾尔自治区", "dm": "65"},
]

# Discipline list: subject.txt holds a list of {"mc": name, "dm": code} dicts.
with open("subject.txt", "r", encoding="utf-8") as f:
    content = f.read()
# literal_eval accepts the same list-of-dicts literal that eval() did, but it
# cannot execute code that happens to be in the file.
subject = ast.literal_eval(content.strip())

baseurl = "https://yz.chsi.com.cn/zsml/queryAction.do?"
baseurl1 = "https://yz.chsi.com.cn/zsml/querySchAction.do?"

# Initialise the Excel workbook and write the title row (row 0).
workbook = xlsxwriter.Workbook('E:/yanzhaowang2.xlsx')
worksheet = workbook.add_worksheet('yanzhao')
column_titles = [
    '所在省市',
    '招生单位',
    '考试方式',
    '院系所',
    '跨专业',
    '专业',
    '学习方式',
    '研究方向',
    '指导老师',
    '拟招人数',
    '政治',
    '外语',
    '业务课一',
    '业务课二',
]
for column, title in enumerate(column_titles):
    worksheet.write(0, column, title)

# For a quick trial run, override the inputs with a single pair:
# province = [{"mc": "北京市", "dm": "11"}]
# subject = [{"mc": "哲学", "dm": "0101"}]

# Crawl order: province -> discipline -> school -> major detail page.
# The workbook is only written to disk by close(), so close() sits in a
# finally clause: an error or Ctrl+C part-way through a long crawl still
# saves the rows collected so far.
try:
    for i, current_province in enumerate(province):
        for j, current_subject in enumerate(subject):
            # Progress: province count:index+name, discipline count:index.
            print(str(len(province)) + ':' + str(i) + str(current_province['mc'])
                  + str(len(subject)) + ':' + str(j))
            # Schools in this province that offer this discipline.
            listschool = getschool(str(current_province['dm']),
                                   str(current_subject['dm']))
            for school in listschool:
                # Detail-page URLs of the majors this school offers.
                listmajor = getmajor(str(current_province['dm']), str(school),
                                     str(current_subject['dm']))
                for major_url in listmajor:
                    getsubject(major_url, current_province['mc'])
finally:
    workbook.close()