爬取网页url:https://gkcx.eol.cn/school/search?schoolflag=&fromcoop=bdkp&is_recruitment=1
1.分析网页:
2.获得数据:
import requests
import sqlite3
import json
url = 'https://api.eol.cn/gkcx/api/'  # gkcx (EOL college search) JSON API endpoint
provinces = []  # province names whose table has already been created in this run
head = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}
def get_data(page):
    """Build the POST payload for one page of the gkcx school-list API.

    Only ``page`` varies between requests; every other field is a fixed
    filter (20 schools per page, sorted by total view count, recruitment
    schools only).
    """
    payload = dict(
        access_token="",
        admissions="",
        central="",
        department="",
        dual_class="",
        f211="",
        f985="",
        is_doublehigh="",
        is_dual_class="",
        is_recruitment="1",
        keyword="",
        nature="",
        page=page,
        province_id="",
        ranktype="",
        request_type=1,
        school_type="",
        size=20,
        sort="view_total",
        type="",
        uri="apidata/api/gk/school/lists",
    )
    return payload
# Fetch a single demo page (range(1, 2) yields only page 1).
for page in range(1, 2):
    data = get_data(page=page)
    # FIX: original read `resquest.post`, a typo that raises NameError;
    # the imported module is `requests`.
    response = requests.post(url=url, headers=head, data=data)
3.存入数据库,这里用sqlite3:
import requests
import sqlite3
import json
url = 'https://api.eol.cn/gkcx/api/'  # gkcx (EOL college search) JSON API endpoint
provinces = []  # in-memory only: provinces whose table was created during this run
head = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}
def get_data(page):
    """Assemble the request body for page *page* of the school listing.

    All filter fields are left blank except ``is_recruitment``; the API
    returns 20 records per page ordered by total view count.
    """
    base = {
        'access_token': "",
        'admissions': "",
        'central': "",
        'department': "",
        'dual_class': "",
        'f211': "",
        'f985': "",
        'is_doublehigh': "",
        'is_dual_class': "",
        'is_recruitment': "1",
        'keyword': "",
        'nature': "",
        'province_id': "",
        'ranktype': "",
        'request_type': 1,
        'school_type': "",
        'size': 20,
        'sort': "view_total",
        'type': "",
        'uri': "apidata/api/gk/school/lists",
    }
    base['page'] = page  # the only per-request field
    return base
conn = sqlite3.connect('university.db')  # single connection shared by the helpers below
c = conn.cursor()

def create_table(province):
    """Create the per-province table if it does not exist yet.

    FIX: added IF NOT EXISTS and quoted the identifier, so a second crawl
    run (the problem noted at the end of this post) or a province name
    that is not a bare SQL identifier no longer raises
    sqlite3.OperationalError.
    """
    sql = (
        f'create table if not exists "{province}" '
        '(address TEXT, city_name TEXT, county_name TEXT, dual_class_name TEXT, '
        'school_name TEXT, nature_name TEXT, level_name TEXT, type_name TEXT)'
    )
    c.execute(sql)
    conn.commit()

def insert_into(province, address, city_name, county_name, dual_class_name, school_name, nature_name, level_name, type_name):
    """Insert one school row into the given province's table.

    FIX: values are bound with ? placeholders instead of being
    f-string-interpolated into the SQL text, which broke on quote
    characters inside the data and was an SQL-injection hazard. The
    table name cannot be a bound parameter, so it is quoted as an
    identifier instead.
    """
    sql = f'insert into "{province}" values (?, ?, ?, ?, ?, ?, ?, ?)'
    c.execute(sql, (address, city_name, county_name, dual_class_name,
                    school_name, nature_name, level_name, type_name))
    conn.commit()
# Crawl pages 1-100 and store every school in its province's table.
for page in range(1, 101):
    data = get_data(page=page)
    response = requests.post(url=url, headers=head, data=data)
    # Parse the JSON once per page instead of once per extracted field.
    items = response.json()['data']['item']
    # FIX: the API is asked for size=20 records per page, but range(19)
    # silently dropped the last record of every page. Iterating over the
    # returned list also tolerates a short final page.
    for item in items:
        province = item['province_name']
        address = item['address']
        city_name = item['city_name']
        county_name = item['county_name']
        # FIX: this field previously re-read 'county_name' by copy-paste
        # mistake. 'dual_class_name' matches the table column — TODO
        # confirm the exact API key against a live response.
        dual_class_name = item['dual_class_name']
        school_name = item['name']
        nature_name = item['nature_name']
        level_name = item['level_name']
        type_name = item['type_name']
        if province not in provinces:  # first school seen for this province
            provinces.append(province)
            create_table(province)
        # Both branches inserted identically, so the else was redundant.
        insert_into(province, address, city_name, county_name, dual_class_name,
                    school_name, nature_name, level_name, type_name)
conn.close()
得到的数据:
问题:
1,用一个空列表来判断该省的表是否已创建,这样无法进行第二次爬取。因为第二次启动列表是空的,会去创建表,但原本的表已经在了,这时会报错。可以把列表改成一个文本文件,存储在本地。
2, 爬取速度慢,可以改成多线程。
3, 无法实现增量爬取。可以在存储数据时多加一个字段,用来判断该学校是否已经在表中。