爬虫练习2

爬取网站上各学校的各类信息和简介

import json
import requests
import random
import time
import pymysql

# Helper class that talks to the MySQL database used by the crawler.
class TestMysql:
    """Hold MySQL credentials and run the crawler's queries.

    A single connection is opened lazily on first use, cached on
    ``self.conn`` and reused by every query method. Call
    :meth:`close_mysql` when finished to release it.

    All statements are parameterized, so values are escaped by the driver
    (``None`` is stored as SQL ``NULL``) instead of being pasted into the
    SQL text.
    """

    # Seconds to wait for the intro-article download before giving up.
    REQUEST_TIMEOUT = 10

    # Column order shared by the INSERT statement in add_data().
    _SCHOOL_COLUMNS = (
        'school_name', 'school_id', 'clicks', 'is_211', 'is_985', 'is_ads',
        'is_zihuaxian', 'province_area', 'province_name', 'rk_rank', 'syl',
        'type_name', 'type_school_name',
    )

    # Column order shared by the INSERT statement in add_data2().
    _SPECIAL_COLUMNS = (
        'special_name', 'code', 'degree_name', 'degree_type', 'level1',
        'level1_name', 'level2', 'level2_name', 'spe_id',
    )

    def __init__(self, username, host, passwd, database):
        """Remember the connection settings; no connection is opened here."""
        self.username = username
        self.host = host
        self.passwd = passwd
        self.database = database
        self.conn = None

    def conn_mysql(self):
        """Open and return a new pymysql connection whose cursors yield dicts."""
        conn = pymysql.connect(
            user=self.username,
            host=self.host,
            password=self.passwd,
            db=self.database,
            cursorclass=pymysql.cursors.DictCursor,
        )
        return conn

    def close_mysql(self):
        """Close the cached connection (if one is open) and print a notice."""
        conn = getattr(self, 'conn', None)
        if conn is not None and conn.open:
            conn.close()
        self.conn = None
        print("MySQL is Closed")

    def _connection(self):
        """Return the cached connection, opening a new one if none is usable."""
        conn = getattr(self, 'conn', None)
        if conn is None or not conn.open:
            conn = self.conn_mysql()
            self.conn = conn
        return conn

    def _execute_write(self, sql, params):
        """Run one parameterized write statement and commit it.

        On failure the transaction is rolled back and the exception is
        re-raised, so the reused connection is not left mid-transaction.
        """
        conn = self._connection()
        try:
            with conn.cursor() as cur:
                cur.execute(sql, params)
            conn.commit()
        except Exception:
            conn.rollback()
            raise

    def get_data(self):
        """Return every ``school_id`` row of the ``school`` table.

        Returns:
            Whatever ``cursor.fetchall()`` yields for the query; with the
            DictCursor configured in conn_mysql() each row is a dict with a
            ``school_id`` key.
        """
        conn = self._connection()
        with conn.cursor() as cur:
            # NOTE(review): this query says `school` while the write
            # statements say `SCHOOL`; on a case-sensitive MySQL setup those
            # are different tables -- confirm which spelling is intended.
            cur.execute("select school_id from school")
            results = cur.fetchall()
        # End the read transaction so a later call on the same connection
        # does not keep reading from an old snapshot.
        conn.commit()
        return results

    def add_data(self, data):
        """Insert one school row.

        Args:
            data: mapping that must contain every key listed in
                ``_SCHOOL_COLUMNS``.

        Raises:
            KeyError: if a required key is missing from ``data``.
        """
        columns = self._SCHOOL_COLUMNS
        sql = "INSERT INTO SCHOOL(%s) VALUES (%s)" % (
            ','.join(columns),
            ','.join(['%s'] * len(columns)),
        )
        self._execute_write(sql, tuple(data[name] for name in columns))

    def add_data2(self, data):
        """Insert one row into the ``SPECIALLIST`` table.

        Args:
            data: mapping that must contain every key listed in
                ``_SPECIAL_COLUMNS``.

        Raises:
            KeyError: if a required key is missing from ``data``.
        """
        columns = self._SPECIAL_COLUMNS
        sql = "INSERT INTO SPECIALLIST(%s) VALUES (%s)" % (
            ','.join(columns),
            ','.join(['%s'] * len(columns)),
        )
        self._execute_write(sql, tuple(data[name] for name in columns))

    def _fetch_intro(self, content_id):
        """Download the intro article for ``content_id`` and return its text."""
        url = f'https://static.kaoyan.cn/json/article/{content_id}.json'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Connection': 'close'}
        # NOTE(review): only an 'http' proxy is configured while the URL is
        # https, so this proxy is presumably not used -- confirm intent.
        proxy = {'http': '33.33.33.10:8118'}
        # NOTE(review): kept from the original; this assignment likely does
        # not make requests.get() retry. Mount an HTTPAdapter(max_retries=...)
        # on a Session if retries are actually required -- confirm.
        requests.adapters.DEFAULT_RETRIES = 5
        # A timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, headers=headers, proxies=proxy,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()['data']['content']

    def update_schoolData(self, data, id):
        """Store a school's detail fields and save its intro text to a file.

        The intro article is downloaded and written to
        ``<school_id>intro.txt`` in the current directory; only that file
        name is stored in the ``intro`` column.

        Args:
            data: the ``data`` object of the school's info JSON. Must contain
                ``phone``, ``site``, ``school_email``, ``intro``,
                ``num_master``, ``num_doctor``, ``num_subject``, ``num_lab``,
                ``create_date``, ``school_address``, ``content_id`` and
                ``school_id``.
            id: unused; kept so existing callers keep working. The row is
                selected by ``data['school_id']``.

        Raises:
            KeyError: if a required key is missing from ``data`` or from the
                intro response.
        """
        school_phone = data['phone']['school_phone']
        zhaoban_phone = data['phone']['zhaoban_phone']
        school_site = data['site']['school_site']
        zhaoban_site = data['site']['zhaoban_site']
        # Truthiness also covers a missing (None) e-mail list.
        emails = data['school_email']
        school_email = emails[0] if emails else "暂无"
        intro = data['intro']
        num_master = data['num_master']
        content_id = data['content_id']
        num_doctor = data['num_doctor']
        num_subject = data['num_subject']
        num_lab = data['num_lab']
        create_date = data['create_date']
        school_address = data['school_address']
        school_id = data['school_id']

        # Progress output, one value per line, in the original order.
        for value in (school_phone, zhaoban_phone, school_site, zhaoban_site,
                      school_email, intro, num_master, num_doctor,
                      num_subject, num_lab, create_date, school_address):
            print(value)

        # The intro text is long, so it is written to a text file and only
        # the file name is stored in the database. Fetch first so a failed
        # download does not leave an empty file behind.
        content = self._fetch_intro(content_id)
        intro_file = f'{school_id}intro.txt'
        with open(intro_file, 'w', encoding='utf-8') as f:
            f.write(content)

        sql = (
            "UPDATE SCHOOL SET intro=%s,num_doctor=%s,num_lab=%s,"
            "num_master=%s,num_subject=%s,school_phone=%s,zhaoban_phone=%s,"
            "school_address=%s,school_email=%s,school_site=%s,"
            "zhaoban_site=%s,create_date=%s WHERE school_id = %s"
        )
        params = (intro_file, num_doctor, num_lab, num_master, num_subject,
                  school_phone, zhaoban_phone, school_address, school_email,
                  school_site, zhaoban_site, create_date, school_id)
        self._execute_write(sql, params)


if __name__ == "__main__":
    # Database settings.
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment or a config file.
    username = 'root'
    host = 'localhost'
    passwd = '123456'
    database = 'kaoyan'
    mysql = TestMysql(username, host, passwd, database)
    # get_data() opens its own connection, so no separate conn_mysql() call
    # is made here (the extra connection was opened and never used).
    results = mysql.get_data()

    # Window of result rows handled by this run, inclusive on both ends.
    # Slicing (instead of indexing up to a fixed bound) simply stops early
    # when the table holds fewer rows rather than raising IndexError.
    first_index = 934
    last_index = 1000

    # Request settings are identical for every school, so build them once.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36','Connection': 'close'}
    # NOTE(review): only an 'http' proxy is configured while the URLs are
    # https, so this proxy is presumably not used -- confirm intent.
    proxy = {'http': '33.33.33.10:8118'}
    # NOTE(review): kept from the original; this assignment likely does not
    # make requests.get() retry -- confirm.
    requests.adapters.DEFAULT_RETRIES = 5

    for row in results[first_index:last_index + 1]:
        school_id = str(row['school_id'])
        print(school_id)
        # Info endpoint for this school.
        url = f'https://static.kaoyan.cn/json/school/{school_id}/info.json'
        # A timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        response.raise_for_status()
        # The payload of interest sits under the "data" key.
        res = response.json()['data']
        print(res)
        mysql.update_schoolData(res, school_id)
        # Random pause between schools to avoid hammering the site.
        time.sleep(random.uniform(0.5, 5.5))
    print("爬取完成")




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值