Python项目开发之爬取百度热搜榜总结

一:需要的模块

import csv
import json
import re
import time

import matplotlib.pyplot as plt
import pymysql.cursors
import requests
import stylecloud
二:数据爬取
使用requests库爬取网站源代码

def spider(type):
    """Download one Baidu hot-search board page and pass its HTML to ``write``.

    Args:
        type: Board tab name substituted into the ``tab`` query parameter of
            the board URL; it is also forwarded unchanged to ``write``.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails
            at the network level, or the server answers with an HTTP error
            status.
    """
    url = "https://top.baidu.com/board?tab={}".format(type)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # Without an explicit timeout requests will wait indefinitely on a stalled
    # connection, which would hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # Surface 4xx/5xx responses instead of feeding an error page to the parser.
    response.raise_for_status()
    write(response.text, type)  # hand the page source to the persistence step

三:保存数据
保存数据到 txt、json、csv 文件中

def write(text, type):
    """Extract hot-search entries from board HTML and persist each of them.

    For every regex match the entry is appended to ``data_list`` (a name
    defined outside this function), appended to ``data_json.json``,
    ``data.txt`` and ``data_csv.csv``, and finally handed to ``get_con``.

    Args:
        text: HTML source of the board page.
        type: Board tab name; forwarded unchanged to ``get_con``.
    """
    # Capture the entry title, then lazily skip ahead to the hot index that
    # sits inside the following trend block.
    pattern = re.compile(
        r'<div class="c-single-text-ellipsis">(?P<names>.*?)</div>.*?'
        r'<div class="trend_2RttY">.*?<div class="hot-index_1Bl1a">(?P<ranks>.*?)</div>.*?<div class="text_1lUwZ">',
        re.S)

    for match in pattern.finditer(text):
        title, rank = match.group("names", "ranks")
        entry = {'热搜名字': title, '热搜指数': rank}
        data_list.append(entry)

        # JSON: one pretty-printed object per entry.
        # NOTE(review): objects are appended back to back with no separator,
        # so the file as a whole is not a single valid JSON document.
        with open('data_json.json', 'a', encoding='utf-8') as json_file:
            json_file.write(json.dumps(entry, indent=4, ensure_ascii=False))

        # Plain text: titles only, each terminated by a carriage return.
        with open('data.txt', 'a', encoding='utf-8') as txt_file:
            txt_file.write(title + '\r')

        # CSV: one (title, hot index) row per entry.
        with open('data_csv.csv', 'a', encoding='utf-8', newline='') as csv_file:
            csv.writer(csv_file).writerow((title, rank))

        get_con(title, rank, type)  # store the entry in the database as well

    print("-------数据写入完成-------")
    print("-------数据保存成功-------")

四:把爬取的数据保存到数据库中

def get_con(title, rank, type):
    """Store one hot-search entry in the ``hotsearch`` MySQL database.

    Reads the MySQL user name and password from the first two lines of
    ``zhanghao.txt``, connects to the local server, makes sure the database
    and the per-board tables exist, and inserts ``(title, rank, now())`` into
    the table that corresponds to ``type``.

    Args:
        title: Hot-search title, stored in the ``name`` column.
        rank: Hot-search index, stored in the ``rank`` column.
        type: Board tab name. The target table is the longest known table
            name contained in it, so ``cartoon`` is not mistaken for ``car``.

    Raises:
        OSError: If ``zhanghao.txt`` cannot be opened.
        pymysql.MySQLError: If connecting or selecting the database fails.
    """
    print("开始写入数据库")
    with open('zhanghao.txt') as f:  # line 1: user name, line 2: password
        name = f.readline().rstrip()
        pwd = f.readline().rstrip()
    connect = pymysql.connect(host="localhost", port=3306, user=name, passwd=pwd,
                              charset='utf8')
    cursor = connect.cursor()

    # DDL is executed synchronously, so no sleep is needed before using the
    # database. Failures stay non-fatal (the schema may already exist and the
    # account may lack CREATE privileges) but are reported instead of hidden.
    try:
        cursor.execute('CREATE DATABASE IF NOT EXISTS hotsearch')
    except pymysql.MySQLError as e:
        print("创建数据库失败:", e)
    connect.select_db('hotsearch')

    # One table per board. `realtime` is included because rows are inserted
    # into it below; `cartoon` now also has a matching insert target.
    board_tables = ('car', 'cartoon', 'documentary', 'game', 'movie',
                    'novel', 'realtime', 'teleplay', 'variety')
    create_table = (
        "CREATE TABLE IF NOT EXISTS `{}` ("
        "`name` varchar(255) NOT NULL,"
        "`time` datetime NOT NULL,"
        "`rank` int(255) NOT NULL"
        ") ENGINE=MyISAM DEFAULT CHARSET=utf8"
    )
    for table in board_tables:
        try:
            cursor.execute(create_table.format(table))
        except pymysql.MySQLError as e:
            print("创建数据表失败:", table, e)

    # Pick the target table. Longest names are tried first so that a type
    # containing "cartoon" does not match the shorter "car".
    target = next(
        (t for t in sorted(board_tables, key=len, reverse=True) if t in type),
        None,
    )
    if target is None:
        print("插入失败")
    else:
        # `rank` must be backtick-quoted: RANK is a reserved word in MySQL 8.0+.
        sql = "INSERT INTO `{}`(`name`,`rank`,`time`) VALUES (%s,%s,now())".format(target)
        try:
            cursor.execute(sql, (title, rank))
            connect.commit()
        except pymysql.MySQLError as e:
            print("插入失败", e)
            connect.rollback()
        else:
            # Only report success when the insert actually went through.
            print('成功插入', cursor.rowcount, '条数据')
    # The connection and cursor are intentionally left open: the statements
    # that follow query the stored rows back through the same cursor.

五:将数据从数据库中查询出来进行分析

select = "select name,rank from {}".format(type)
    cursor.execute(select)
    select_res = cursor.fetchall()  # 结果为元组
    nm = []
    rk = []
    for x in select_res:
        nm.append(x[0])
        rk.append(x[1])
    # 开始分析,设置图表的尺寸样式
    plt.rcParams['figure.figsize'] = (19.0, 12.0)  # 设置尺寸
    plt.rcParams['image.interpolation'] = 'nearest'  # 设置style
    plt.rcParams['image.cmap'] = 'gray'  # 设置 颜色 style
    plt.rcParams['font.family'] = ['sans-serif']  # 设置字体样式
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # X,Y轴的数据
    plt.bar(nm, rk)
    plt.xticks(rotation=-18)  # 标签旋转18度
    plt.xlabel("百度热点", fontsize="5")  # 设置字体大小
    plt.ylabel("热度指数", fontsize="5")
    # plt.xticks(fontsize=20,)#设置刻度大小
    # plt.yticks(fontsize=20)
    plt.show()
    print("分析完成")

六:生成一个词云

def ciyun():
    """Render a word cloud from ``data.txt`` and save it as ``CIYUN.png``.

    The lines of ``data.txt`` are also passed to stylecloud as custom stop
    words.
    """
    # NOTE(review): the stop words are the lines of the same file that is
    # being rendered - confirm this is intended.
    with open('data.txt', encoding='utf-8') as f:  # close the handle deterministically
        stopwords = f.read().split('\n')
    stylecloud.gen_stylecloud(file_path='data.txt',
                              # Raw string: "\W", "\F" and "\m" are invalid escape
                              # sequences in a normal literal; the value is unchanged.
                              font_path=r'C:\Windows\Fonts\msyhbd.ttc',
                              output_name='CIYUN.png',
                              icon_name='fas fa-question-circle',
                              size=500,
                              custom_stopwords=stopwords)
    print("-------词云制作完成-------")

七:运行结果
在这里插入图片描述
在这里插入图片描述

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值