前言
发现一个记录有关2015~2019年各高校历届分数线的网站,先来做爬虫练习
网站链接
分析网站
利用浏览器分析网站
- 分析学校页面
- 得到获取各学校的url
https://api.eol.cn/gkcx/api/?page=1&request_type=1&school_type=6000&size=20&sort=view_total&uri=apigkcx/api/school/hotlists
- 学校json数据
- 分析分数页面
- 得到获取各学校录取分数的url
# %s填入学校Id
https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json
- 学校录取分数json数据
爬虫编写
获取学校
my_url = 'https://api.eol.cn/gkcx/api/?page=%s&request_type=1&school_type=6000&size=20&sort=view_total&uri=apigkcx/api/school/hotlists'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
import json
import requests
import os
list = []
for i in range(1,58):
print(my_url%(i))
response = requests.get(my_url%(i),headers=headers)
print(response.status_code)
json_data = json.loads(response.text)
my_json= json_data["data"]["item"]
for my in my_json:
li = {"school_id":my["school_id"],
"belong":my["belong"],
"city_name":my["city_name"],
"name":my["name"],
"nature_name":my["nature_name"],
"dual_class_name":my["dual_class_name"]}
list.append(li)
print(li)
json_data = json.dumps(list, ensure_ascii=False)
# 将得到的数据存入文件school.json中
with open('./school.json', 'w', encoding='UTF-8') as f:
# print(json_data)
json.dump(json_data, f, ensure_ascii=False)
获取分数
score_url = 'https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json'
scores = []
for my in list:
# print(score_url%(my["school_id"]))
# print(my)
response = requests.get(score_url%(my["school_id"]),headers=headers)
print(response.status_code)
json_data = json.loads(response.text)
my_json= json_data["data"]["item"]
print(my_json)
存入mysql数据库
建立数据库
-- school表
DROP TABLE IF EXISTS `school`;
CREATE TABLE `school` (
`school_id` int(11) NULL DEFAULT NULL,
`belong` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`city_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`nature_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`dual_class_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- score表
DROP TABLE IF EXISTS `score`;
CREATE TABLE `score` (
`school_id` int(11) NULL DEFAULT NULL,
`max` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`min` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`min_section` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`local_batch_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`year` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`zslx_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`average` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`xclevel_name` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
import pymysql.cursors
# 连接数据库
conn = pymysql.connect(host='192.168.153.131',
user='root',
password='123456',
db='score',
charset='utf8')
# 创建一个游标
cursor = conn.cursor()
# 插入数据
# 数据直接写在sql后面
sql = "insert into score_test(school_id,belong,city_name,name,nature_name,dual_class_name) values(%s, %s, %s, %s,%s, %s)"
i = 0
for my in list:
cursor.execute(sql, [my["school_id"],my["belong"], my["city_name"], my["name"], my["nature_name"], my["dual_class_name"]])
i+= 1
print(i)
print('------------------------------------------')
conn.commit() # 提交,不然无法保存插入或者修改的数据(这个一定不要忘记加上)
print('???????????????????????????????????????????????????')
cursor.close() # 关闭游标
print('++++++++++++++++++++++++++++++++++++++++++++')
conn.close() # 关闭连接
import pymysql.cursors
# 连接数据库
conn = pymysql.connect(host='192.168.153.131',
user='root',
password='123456',
db='score',
charset='utf8')
# 创建一个游标
cursor = conn.cursor()
# 插入数据
# 数据直接写在sql后面
sql = "INSERT INTO `score`.`score`(`school_id`, `max`, `min`, `min_section`, `local_batch_name`, `year`, `zslx_name`, `average`, `xclevel_name`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
i = 0
score_url = 'https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json'
# response = requests.get(score_url%(286),headers=headers)
# print(response.status_code)
# json_data = json.loads(response.text)
# print(json_data=="")
# my_1= json_data["data"]
# print(my_1)
# my_2 = my_1["item"]
# print(my_2)
for my in list:
response = requests.get(score_url%(my["school_id"]),headers=headers)
print(response.status_code)
json_data = json.loads(response.text)
try:
my_1= json_data["data"]
# print(my_1)
my_2 = my_1["item"]
# print(my_2)
for my_score in my_2:
cursor.execute(sql, [my_score["school_id"],my_score["max"], my_score["min"],
my_score["min_section"], my_score["local_batch_name"], my_score["year"],
my_score["zslx_name"], my_score["average"], my_score["xclevel_name"]])
i+= 1
print(i)
conn.commit() # 提交,不然无法保存插入或者修改的数据(这个一定不要忘记加上)
except Exception :
print(my)
print('------------------------------------------')
print('???????????????????????????????????????????????????')
cursor.close() # 关闭游标
print('++++++++++++++++++++++++++++++++++++++++++++')
conn.close() # 关闭连接
简单分析
查询2015~2019年分数录取线包含有370~390之间的包含有等级B的学校以及各年录取线
select
d.*,
(select min from score where local_batch_name like '本科一批' and year like '2019' and school_id = d.school_id and zslx_name = '普通类') '2019',
(select min from score where local_batch_name like '本科一批' and year like '2018' and school_id = d.school_id and zslx_name = '普通类') '2018',
(select min from score where local_batch_name like '本科一批' and year like '2017' and school_id = d.school_id and zslx_name = '普通类') '2017',
(select min from score where local_batch_name like '本科一批' and year like '2016' and school_id = d.school_id and zslx_name = '普通类') '2016',
(select min from score where local_batch_name like '本科一批' and year like '2015' and school_id = d.school_id and zslx_name = '普通类') '2015',
(select xclevel_name from score where local_batch_name like '本科一批' and year like '2019' and school_id = d.school_id and zslx_name = '普通类') '等级'
from (
select DISTINCT t.school_id, t.`name`,t.city_name from (
SELECT sc.school_id,sc.`name`,sc.city_name,sc.dual_class_name,so.min,so.local_batch_name FROM `score` so
left join school sc on sc.school_id = so.school_id
where so.local_batch_name like '本科一批' and so.xclevel_name like '%B%'
and sc.dual_class_name = '双一流'
and so.min > '370'
and so.min < '390'
) t
) d order by d.city_name;