爬取考研信息
前言
随着每年考研人数的剧增,手上不握点考研学校的信息怎么行!!!
一、我对爬虫的理解
-
总结
-
所见即所得
尽量不要爬需要登录的网站,麻烦
多个页面爬取加个进度条吧!
反爬手段越来越高端,爬取信息的同时注意信息安全
二、我喜欢的爬虫工具库说明
1.requests
用于向网站发出请求并获取网页的html代码
相关代码如下:1.引入库2.说明要爬取的链接3.声明头head4.发出requests请求
import requests
getschool_url = 'https://yz.chsi.com.cn/sch/search.do?ssdm=&yxls='
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
r = requests.get(getschool_url,headers = head)
2.lxml
用于将requests获取到的html代码转换成树的结构便于后续结点的获取
from lxml import etree
html = etree.HTML(r.content.decode())
#获取ul class=yxk-fliter-list标签下的li
all_li = html.xpath('//ul[@class="yxk-fliter-list"]/li')
3.BeautifulSoup
完全兼容lxml,是一个比lxml更好的工具,也是用于获取某一标签的。
from bs4 import BeautifulSoup
soup=BeautifulSoup(r.content,'lxml')
#找到一个标签为table class为ch-table的结点
table = soup.find('table',class_="ch-table")
三、写爬虫程序的步骤
1.前期工作(这一定是爬虫的重中之重)
对已有的网页进行分析,查找
我的准备工作,首先对我要爬取的网站进行分析
以高考网为例:因为我要获取对应的学校是否是985或者211
当我搜索完学校看到链接地址后,就知道这是个easygame了
通过小小的一行代码就能根据我的学校获取到查询学校的链接
url = 'http://college.gaokao.com/schlist/n{}/'.format(name)
然后打开控制台(F12)找到你需要的信息的标签
如此一来:所见即所得
分析完了就可以开始写代码了
爬虫代码写起来是很简单的,根据标签得到相应的文本信息
import requests
from lxml import etree
import time
from bs4 import BeautifulSoup
import numpy as np
from urllib import parse
import urllib.request
from tqdm import tqdm
import json
getschool_url = 'https://yz.chsi.com.cn/sch/search.do?ssdm=&yxls='
#爬取院校地址目录
#ssdm对应
#['11','12','13','14','15','21','22','23','31','32','33','34','35','36','37','41','42','43','44','45','46','50','51','52','53','54','61','62','63','64','65','81','91','71','11','']
def get_ALL_Position():
    """Scrape the province/region names from the chsi.com.cn school-search page.

    Returns:
        tuple[list[str], list[str]]: the province display names scraped from
        the page's province filter, and the hard-coded ``ssdm`` province codes
        that pair with them by position (used as URL query parameters).
    """
    getschool_url = 'https://yz.chsi.com.cn/sch/search.do?ssdm=&yxls='
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    # Province codes; the trailing '' selects the nationwide (unfiltered) list.
    ssdm = ['11','12','13','14','15','21','22','23','31','32','33','34','35','36','37','41','42','43','44','45','46','50','51','52','53','54','61','62','63','64','65','81','91','71','']
    r = requests.get(getschool_url, headers=head)
    html = etree.HTML(r.content.decode())
    # Grab the <li> entries of the first filter list (the province filter).
    all_li = html.xpath('//ul[@class="yxk-fliter-list"]/li')
    li = all_li[0]
    option = li.xpath('//select[@class="ch-hide"]/option')
    # Skip the leading "all" option and the last three non-province entries.
    school_position = [position.text.strip() for position in option[1:-3]]
    return school_position, ssdm
#https://yz.chsi.com.cn/sch/search.do?ssdm=11&start=0
#最终爬取网站,每页20个
#ch-page clearfix
#爬取页码
def get_position_page(position_value):
    """Return the available page numbers (1..N) for one province code.

    Args:
        position_value: ``ssdm`` province code; coerced to ``str``.

    Returns:
        numpy.ndarray: page numbers 1 through the last page, inclusive.
    """
    code = str(position_value)
    url = 'https://yz.chsi.com.cn/sch/search.do?ssdm={}&yxls='.format(code)
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    response = requests.get(url, headers=head)
    tree = etree.HTML(response.content.decode())
    # Pagination anchors; the second-to-last one carries the last page number.
    links = tree.xpath('//ul[@class="ch-page clearfix"]/li/a')
    last_page = int(links[-2].text)
    return np.array(range(1, last_page + 1))
#爬取地址下对应的所有学校的所有信息
#table ch-table
def get_school(position_value):
    """Scrape every school name listed for one province code.

    Args:
        position_value: ``ssdm`` province code, or '' for the nationwide list.

    Returns:
        list[str]: school names in page order.
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    pages = get_position_page(position_value)
    school = []
    # Results are paginated 20 per page; ``start`` is the record offset.
    for page_index, _page in tqdm(enumerate(pages)):
        start = str(page_index * 20)
        if position_value == '':
            url = 'https://yz.chsi.com.cn/sch/search.do?start={}'.format(start)
        else:
            url = 'https://yz.chsi.com.cn/sch/search.do?ssdm={}&start={}'.format(position_value, start)
        r = requests.get(url, headers=head)
        soup = BeautifulSoup(r.content, 'lxml')
        table = soup.find('table', class_="ch-table")
        tbody = table.find('tbody')
        # The school name is the anchor text in the first cell of each row;
        # no need to scan the remaining cells.
        for tr in tbody.find_all('tr'):
            first_td = tr.find('td')
            school.append(first_td.find('a').text.strip())
    return school
#获取院校是否是211 985 双一流 本科
#http://college.gaokao.com/schlist/
#/n中国人民大学/
def get_school_information(name_list):
    """Look up each school on college.gaokao.com and collect its attributes.

    Args:
        name_list: iterable of school names (Chinese, used in the search URL).

    Returns:
        tuple[list[dict], int, int]: per-school info dicts, count of schools
        not found on the site, and count successfully scraped.
    """
    nature = []
    no_find = 0
    finded = 0
    # Request headers are loop-invariant; build them once.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"}
    # Maps each <li> position in the info list to (dict key, label-prefix
    # length to strip). Index 3 is intentionally skipped, matching the
    # original scrape.
    fields = {
        0: ("school_position", 6),
        1: ("school_characteristics", 5),
        2: ("school_types", 5),
        4: ("school_nature", 5),
        5: ("school_network", 5),
    }
    for name in tqdm(name_list):
        url = 'http://college.gaokao.com/schlist/n{}/'.format(name)
        r = requests.get(url, headers=head)
        soup = BeautifulSoup(r.content, 'lxml')
        div = soup.find('div', class_="scores_List")
        try:
            lis = div.find('dl').find('dd').find('ul').find_all('li')
        except AttributeError:
            # ``div`` (or a nested tag) is None when the school has no page
            # on the site; count the miss and move on instead of silently
            # swallowing every exception type.
            no_find = no_find + 1
            continue
        school_information = {"school_name": name}
        for index, li in enumerate(lis):
            if index in fields:
                key, prefix_len = fields[index]
                school_information[key] = li.text[prefix_len:].strip()
        finded = finded + 1
        nature.append(school_information)
    return nature, no_find, finded
if __name__ == '__main__':
    # Scrape every school name ('' = nationwide, no province filter).
    school_list = get_school('')
    # Fetch each school's category/attribute information.
    nature, no_find, finded = get_school_information(school_list)
    # ``nature`` is already a list of dicts, so dump it to JSON directly.
    # The old str()/quote-replacement round-trip broke on any apostrophe or
    # double quote inside the scraped text.
    print("正在转换为json格式")
    with open('./schoolinformation' + '.json', 'w', encoding='UTF-8') as f:
        json.dump(nature, f, ensure_ascii=False)
    print("转换完成")
以上就是我的最终代码,爬虫的分析高于代码的书写,最后的数据存储使用json,我本来是准备不用json存储直接使用到服务器端,但是我高估了爬取的速度,因为爬取的速度实在是不可观,以致于我不得不加了tqdm模块用于显示爬取的进度,不然一直看不到还以为内核又挂了呢!
tqdm:一个可以直接作用于数组列表用于可视化的进度条模块
直接两个括号套在for 循环中的循环对象上就行了
from tqdm import tqdm
for name in tqdm(name_list):
最后效果
我前端页面写得不好看,所以渲染之后也不是特别好看!
代码文件已上传到github
https://github.com/loveorheat/WEBenglishsentence