# Crawl school information and introductions from the kaoyan.cn website.
import json
import requests
import random
import time
import pymysql
#创建连接MYSQL的类
class TestMysql:
    """Thin pymysql helper for the kaoyan crawler.

    Opens a fresh connection per operation (stored on ``self.conn``) and
    provides insert/update helpers for the SCHOOL and SPECIALLIST tables.
    All SQL uses driver-side parameter binding instead of ``%``-string
    interpolation, so values containing quotes (school names, addresses,
    intros) no longer break the statement or allow injection.
    """

    # Column order for the SCHOOL insert; values are pulled from the
    # crawled JSON dict by these same keys.
    _SCHOOL_COLS = (
        'school_name', 'school_id', 'clicks', 'is_211', 'is_985', 'is_ads',
        'is_zihuaxian', 'province_area', 'province_name', 'rk_rank', 'syl',
        'type_name', 'type_school_name',
    )
    # Column order for the SPECIALLIST insert.
    _SPECIAL_COLS = (
        'special_name', 'code', 'degree_name', 'degree_type', 'level1',
        'level1_name', 'level2', 'level2_name', 'spe_id',
    )

    def __init__(self, username, host, passwd, database):
        """Store the MySQL credentials; no connection is opened yet.

        :param username: MySQL user name.
        :param host: MySQL server host.
        :param passwd: MySQL password.
        :param database: schema/database name to use.
        """
        self.username = username
        self.host = host
        self.passwd = passwd
        self.database = database

    def conn_mysql(self):
        """Open and return a new connection (rows come back as dicts)."""
        return pymysql.connect(
            user=self.username,
            host=self.host,
            password=self.passwd,
            db=self.database,
            cursorclass=pymysql.cursors.DictCursor,
        )

    def close_mysql(self):
        """Close the most recently opened connection (if any) and report it.

        The original implementation only printed; now it actually releases
        the connection so long-running crawls do not leak sockets.
        """
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
            self.conn = None
        print("MySQL is Closed")

    def get_data(self):
        """Return every school_id row from the school table.

        :returns: list of dicts, each like ``{'school_id': ...}``.
        """
        self.conn = self.conn_mysql()
        with self.conn.cursor() as cur:
            cur.execute("select school_id from school")
            return cur.fetchall()

    def add_data(self, data):
        """Insert one crawled school record into SCHOOL.

        :param data: dict containing at least the keys in ``_SCHOOL_COLS``.
        :raises KeyError: if a required key is missing from ``data``.
        """
        self.conn = self.conn_mysql()
        placeholders = ', '.join(['%s'] * len(self._SCHOOL_COLS))
        sql = "INSERT INTO SCHOOL(%s) VALUES (%s)" % (
            ', '.join(self._SCHOOL_COLS), placeholders)
        with self.conn.cursor() as cur:
            # Parameter binding: pymysql escapes each value safely.
            cur.execute(sql, [data[col] for col in self._SCHOOL_COLS])
        self.conn.commit()

    def add_data2(self, data):
        """Insert one crawled speciality record into SPECIALLIST.

        :param data: dict containing at least the keys in ``_SPECIAL_COLS``.
        :raises KeyError: if a required key is missing from ``data``.
        """
        self.conn = self.conn_mysql()
        placeholders = ', '.join(['%s'] * len(self._SPECIAL_COLS))
        sql = "INSERT INTO SPECIALLIST(%s) VALUES (%s)" % (
            ', '.join(self._SPECIAL_COLS), placeholders)
        with self.conn.cursor() as cur:
            cur.execute(sql, [data[col] for col in self._SPECIAL_COLS])
        self.conn.commit()

    def update_schoolData(self, data, id):
        """Fetch a school's intro article, save it to disk, and update the row.

        The intro text is too long for the column, so it is written to
        ``<school_id>intro.txt`` and only that file name is stored in the
        ``intro`` column.

        :param data: crawled school-detail dict (phone/site/stats/ids).
        :param id: kept for backward compatibility with existing callers;
            the row is matched on ``data['school_id']`` as before.
        """
        self.conn = self.conn_mysql()
        school_id = data['school_id']
        content_id = data['content_id']
        emails = data['school_email']
        # Some schools publish no e-mail address; store a placeholder.
        school_email = emails[0] if emails else "暂无"

        # Download the intro article JSON through the local proxy.
        url = f'https://static.kaoyan.cn/json/article/{content_id}.json'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Connection': 'close'}
        proxy = {'http': '33.33.33.10:8118'}
        requests.adapters.DEFAULT_RETRIES = 5
        response = requests.get(url, headers=headers, proxies=proxy).json()

        intro_file = f'{school_id}intro.txt'
        # ``with`` guarantees the file is closed even if the write fails.
        with open(intro_file, 'w', encoding='utf-8') as f:
            f.write(response['data']['content'])

        sql = ("UPDATE SCHOOL SET intro=%s, num_doctor=%s, num_lab=%s, "
               "num_master=%s, num_subject=%s, school_phone=%s, "
               "zhaoban_phone=%s, school_address=%s, school_email=%s, "
               "school_site=%s, zhaoban_site=%s, create_date=%s "
               "WHERE school_id = %s")
        params = (
            intro_file,
            data['num_doctor'],
            data['num_lab'],
            data['num_master'],
            data['num_subject'],
            data['phone']['school_phone'],
            data['phone']['zhaoban_phone'],
            data['school_address'],
            school_email,
            data['site']['school_site'],
            data['site']['zhaoban_site'],
            data['create_date'],
            school_id,
        )
        with self.conn.cursor() as cur:
            cur.execute(sql, params)
        self.conn.commit()
if __name__ == "__main__":
    # Local crawler database credentials.
    username = 'root'
    host = 'localhost'
    passwd = '123456'
    database = 'kaoyan'
    mysql = TestMysql(username, host, passwd, database)
    results = mysql.get_data()

    # Loop-invariant request settings, hoisted out of the crawl loop.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Connection': 'close'}
    proxy = {'http': '33.33.33.10:8118'}
    requests.adapters.DEFAULT_RETRIES = 5

    # Resume at row 934 and crawl through row 1000 inclusive (earlier rows
    # were presumably handled by previous runs — confirm before re-running).
    # Clamp to len(results) so a short result set cannot raise IndexError.
    for i in range(934, min(1001, len(results))):
        school_id = str(results[i]['school_id'])
        print(school_id)
        url = f'https://static.kaoyan.cn/json/school/{school_id}/info.json'
        response = requests.get(url, headers=headers, proxies=proxy).json()
        res = response['data']
        print(res)
        mysql.update_schoolData(res, school_id)
        # Random delay between requests to avoid hammering the server.
        time.sleep(random.uniform(0.5, 5.5))

    mysql.close_mysql()
    print("爬取完成")