Python爬取热门微博,并存储到MySQL中

目标网站:m.weibo.cn
请求的 URL 可以在浏览器开发者工具(F12)的 Network 面板中,筛选 XHR 请求后找到。

weibo_demo.py:

import requests
import json
from w3lib.html import remove_tags
from mysqlhelper import MySQLHelper
import time

# Request headers sent with every call; the User-Agent identifies as desktop Firefox.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) '
        'Gecko/20100101 Firefox/75.0'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': (
        'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,'
        'en-US;q=0.3,en;q=0.2'
    ),
}

# Number of result pages requested by the script entry point.
max_page = 50

# Shared database helper used by get_one_page_info to insert rows.
helper = MySQLHelper()

def get_one_page_info(url):
    """Fetch one page of search results and store every post in MySQL.

    For each card that carries an ``mblog`` entry, one row is inserted into
    ``weibo_test`` with the tag-stripped text, the comment / like / repost
    counts, the creation time and the source.

    Args:
        url: Full URL of one page of the JSON search API.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If an ``mblog`` entry lacks one of the expected fields.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    payload = response.json()
    # A page without results may lack 'data' or 'cards'; treat it as empty.
    cards_list = (payload.get('data') or {}).get('cards') or []

    # Parameterised statement, built once instead of once per card.
    insert_sql = 'INSERT INTO weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count)VALUES (%s, %s, %s, %s, %s, %s)'

    for card in cards_list:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without a post body carry nothing to store.
            continue

        data = (
            mblog['source'],
            mblog['created_at'],
            remove_tags(mblog['text']),  # post text arrives as HTML
            mblog['comments_count'],
            mblog['attitudes_count'],
            mblog['reposts_count'],
        )
        helper.execute_insert_sql(insert_sql, data)

            # time.sleep(1)
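# Table schema (utf8mb4 so that 4-byte characters such as emoji can be stored):
# CREATE TABLE weibo_test(id int primary key auto_increment,source_a varchar(50),created_at varchar(40),`text` text,comments_count int,attitudes_count int,reposts_count int) default charset=utf8mb4;
# To empty the table: TRUNCATE TABLE weibo_test;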

if __name__ =='__main__':
    # The containerid value is the URL-encoded search query. The page number
    # must be its own query argument, so it is joined with '&' (the original
    # '%page=' glued it onto page_type as a malformed escape sequence).
    url_template = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D60%26q%3D%E6%96%B0%E5%86%A0%E7%96%AB%E6%83%85%26t%3D0&page_type=searchall&page={}'

    # Pages are numbered from 1.
    for page in range(1, max_page + 1):
        get_one_page_info(url_template.format(page))
        # Report progress only after the page has been fetched and stored.
        print('page ' + str(page) + ' has done!')

mysqlhelper.py:

import pymysql

class MySQLHelper(object):
    """Small wrapper around one PyMySQL connection and one cursor.

    The connection is opened in ``__init__`` and released by ``close()``,
    which is also invoked from ``__del__`` as a fallback.
    """

    def __init__(self, host='localhost', port=3306, database='wb',
                 user='root', password='123456', charset='utf8mb4'):
        """Open the connection and create a cursor.

        The defaults reproduce the previously hard-coded settings, except
        for ``charset``: ``utf8mb4`` is used so that 4-byte characters
        (e.g. emoji) can be sent without a 1366 "Incorrect string value"
        error. NOTE(review): the target column must also use utf8mb4.

        Args:
            host: MySQL server host name.
            port: MySQL server port.
            database: Name of the database to use.
            user: User name to log in as.
            password: Password for ``user``.
            charset: Connection character set.
        """
        # Pre-set both attributes so close()/__del__ stay safe if connect() raises.
        self.conn = None
        self.cursor = None
        self.conn = pymysql.connect(
            host=host,
            port=port,
            database=database,
            user=user,
            password=password,
            charset=charset,
        )
        # Cursor used to execute statements on this connection.
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        """Execute a parameterised statement and commit it.

        Args:
            sql: SQL text with ``%s`` placeholders.
            data: Sequence of values bound to the placeholders.

        Raises:
            Exception: Whatever the driver raises; the transaction is rolled
                back before the error is re-raised.
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction open on the connection.
            self.conn.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        cursor = getattr(self, 'cursor', None)
        conn = getattr(self, 'conn', None)
        # Clear the references first so a second call becomes a no-op.
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()

    def __del__(self):
        # Finalizers may run on partly built objects or during interpreter
        # shutdown; cleanup here is best-effort, so errors are ignored.
        try:
            self.close()
        except Exception:
            pass

if __name__ =='__main__':
    # Manual smoke test: insert one hand-written sample row into weibo_test.
    sample_row = ('mi', '2020-4-22', '今天天气好', 2, 3, 5)
    statement = (
        'INSERT INTO weibo_test '
        '(source_a, created_at, `text`, comments_count, attitudes_count, reposts_count)'
        'VALUES (%s, %s, %s, %s, %s, %s)'
    )
    db_helper = MySQLHelper()
    db_helper.execute_insert_sql(statement, sample_row)

运行结果:
在这里插入图片描述

2020.4.30报错:
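后来在运行中发现会报 pymysql.err.InternalError: (1366, "Incorrect string value: '\\xF0\\x9F\\x98\\xB7 ' for column 'text' at row 1") 错误。这个错误发生在把 emoji 等 4 字节的 UTF-8 字符写入 MySQL 时:MySQL 的 utf8 字符集每个字符最多只支持 3 字节,存不下这类字符。解决办法是在 mysqlhelper.py 的 __init__ 中创建游标之后加上 self.cursor.execute("SET NAMES utf8mb4;")(或者直接在 pymysql.connect 中使用 charset='utf8mb4'),同时确保表的 text 列也使用 utf8mb4 字符集。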

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值