python爬虫练习(2015-2019年各高校历届分数线爬取)

前言

发现一个记录有关2015~2019年各高校历届分数线的网站,先来做爬虫练习
网站链接

分析网站

利用浏览器分析网站

  • 分析学校页面学校界面
  • 得到获取各学校的url
https://api.eol.cn/gkcx/api/?page=1&request_type=1&school_type=6000&size=20&sort=view_total&uri=apigkcx/api/school/hotlists
  • 学校json数据
    学校数据
  • 分析分数页面分数页面
  • 得到获取各学校录取分数的url
# %s填入学校Id
https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json
  • 学校录取分数json数据
    在这里插入图片描述

爬虫编写

获取学校

my_url = 'https://api.eol.cn/gkcx/api/?page=%s&request_type=1&school_type=6000&size=20&sort=view_total&uri=apigkcx/api/school/hotlists'
headers = { 
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
import json
import requests
import os
list = []
for i in range(1,58):  
    print(my_url%(i))
    response = requests.get(my_url%(i),headers=headers)
    print(response.status_code)
    json_data = json.loads(response.text)  
    my_json= json_data["data"]["item"]
    for my in my_json:
        li = {"school_id":my["school_id"],
              "belong":my["belong"],
              "city_name":my["city_name"],
              "name":my["name"],
              "nature_name":my["nature_name"],
              "dual_class_name":my["dual_class_name"]}
        list.append(li)
        print(li)
json_data = json.dumps(list, ensure_ascii=False)
# 将得到的数据存入文件school.json中
with open('./school.json', 'w', encoding='UTF-8') as f:
    # print(json_data)
    json.dump(json_data, f, ensure_ascii=False)

获取分数

score_url = 'https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json'
scores = []
for my in list:
#     print(score_url%(my["school_id"]))
#     print(my)
    response = requests.get(score_url%(my["school_id"]),headers=headers)
    print(response.status_code)
    json_data = json.loads(response.text)
    my_json= json_data["data"]["item"]
    print(my_json)

存入mysql数据库

建立数据库
-- school表
DROP TABLE IF EXISTS `school`;
CREATE TABLE `school`  (
  `school_id` int(11) NULL DEFAULT NULL,
  `belong` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `city_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `nature_name` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `dual_class_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

-- score表
DROP TABLE IF EXISTS `score`;
CREATE TABLE `score`  (
  `school_id` int(11) NULL DEFAULT NULL,
  `max` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `min` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `min_section` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `local_batch_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `year` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `zslx_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `average` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `xclevel_name` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
import pymysql.cursors

# 连接数据库
conn = pymysql.connect(host='192.168.153.131',
                       user='root',
                       password='123456',
                       db='score',
                       charset='utf8')
# 创建一个游标
cursor = conn.cursor()
# 插入数据
# 数据直接写在sql后面
sql = "insert into score_test(school_id,belong,city_name,name,nature_name,dual_class_name) values(%s, %s, %s, %s,%s, %s)" 
i = 0
for my in list:
    cursor.execute(sql, [my["school_id"],my["belong"], my["city_name"], my["name"], my["nature_name"], my["dual_class_name"]])
    i+= 1
    print(i)
print('------------------------------------------')
conn.commit()   # 提交,不然无法保存插入或者修改的数据(这个一定不要忘记加上)
print('???????????????????????????????????????????????????')
cursor.close()  # 关闭游标
print('++++++++++++++++++++++++++++++++++++++++++++')
conn.close()  # 关闭连接
import pymysql.cursors

# 连接数据库
conn = pymysql.connect(host='192.168.153.131',
                       user='root',
                       password='123456',
                       db='score',
                       charset='utf8')
# 创建一个游标
cursor = conn.cursor()
# 插入数据
# 数据直接写在sql后面
sql = "INSERT INTO `score`.`score`(`school_id`, `max`, `min`, `min_section`, `local_batch_name`, `year`, `zslx_name`, `average`, `xclevel_name`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)" 
i = 0
score_url = 'https://static-data.eol.cn/www/2.0/schoolprovinceindex/detial/%s/32/1/1.json'
# response = requests.get(score_url%(286),headers=headers)
# print(response.status_code)
# json_data = json.loads(response.text)
# print(json_data=="")
# my_1= json_data["data"]
# print(my_1)
# my_2 = my_1["item"]
# print(my_2)

for my in list:
    response = requests.get(score_url%(my["school_id"]),headers=headers)
    print(response.status_code)
    json_data = json.loads(response.text)
    try:
        my_1= json_data["data"]
#         print(my_1)
        my_2 = my_1["item"]
#         print(my_2)

        for my_score in my_2:
            cursor.execute(sql, [my_score["school_id"],my_score["max"], my_score["min"],
                                 my_score["min_section"], my_score["local_batch_name"], my_score["year"],
                                 my_score["zslx_name"], my_score["average"], my_score["xclevel_name"]])
            i+= 1
            print(i)
        conn.commit()   # 提交,不然无法保存插入或者修改的数据(这个一定不要忘记加上)
    except Exception :
        print(my)
        
print('------------------------------------------')
print('???????????????????????????????????????????????????')
cursor.close()  # 关闭游标
print('++++++++++++++++++++++++++++++++++++++++++++')
conn.close()  # 关闭连接

简单分析

查询2015~2019年分数录取线包含有370~390之间的包含有等级B的学校以及各年录取线

select 
d.*,
(select min from score where local_batch_name like '本科一批' and year like '2019' and school_id = d.school_id and zslx_name = '普通类') '2019',
(select min from score where local_batch_name like '本科一批' and year like '2018' and school_id = d.school_id and zslx_name = '普通类') '2018',
(select min from score where local_batch_name like '本科一批' and year like '2017' and school_id = d.school_id and zslx_name = '普通类') '2017',
(select min from score where local_batch_name like '本科一批' and year like '2016' and school_id = d.school_id and zslx_name = '普通类') '2016',
(select min from score where local_batch_name like '本科一批' and year like '2015' and school_id = d.school_id and zslx_name = '普通类') '2015',
(select xclevel_name from score where local_batch_name like '本科一批' and year like '2019' and school_id = d.school_id and zslx_name = '普通类') '等级'
from (
select DISTINCT t.school_id, t.`name`,t.city_name from (
SELECT  sc.school_id,sc.`name`,sc.city_name,sc.dual_class_name,so.min,so.local_batch_name FROM `score` so
left join school sc on sc.school_id = so.school_id

where so.local_batch_name like '本科一批'  and so.xclevel_name like '%B%'
and sc.dual_class_name = '双一流' 
and so.min > '370'
and so.min < '390'
) t
) d order by d.city_name;

查询结果

评论 11
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值