微博爬取长津湖博文及评论

最新推荐文章于 2024-04-01 16:13:04 发布

呵呵样

最新推荐文章于 2024-04-01 16:13:04 发布

阅读量1k

点赞数 4

分类专栏：笔记　文章标签：爬虫 python

本文链接：https://blog.csdn.net/heheyangxyy/article/details/120628824

版权

笔记专栏收录该内容

8 篇文章 0 订阅

订阅专栏

微博爬取长津湖博文及评论

# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang

import requests
import json
import re
import pprint
import pandas as pd

def comments_singlePage_crawl(url,headers,comments_info, id):
    """
    Fetch one page of comments and append them to ``comments_info``.

    :param url: full URL of the comment page to request
    :param headers: HTTP request headers (cookie, user-agent, ...)
    :param comments_info: dict holding the lists "id", "date" and "text";
        one entry per comment is appended to each list in place
    :param id: id of the post the comments belong to; stored next to every
        comment so comments can be joined back to their post
    :return: None
    :raises requests.RequestException: if the request fails or times out
    :raises ValueError: if the response body is not valid JSON
    :raises KeyError: if the payload lacks the expected "data" / comment fields
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would put the cookie into the query
    # string instead of sending it as an HTTP header.
    # The timeout keeps a stalled connection from hanging the crawl forever.
    html = requests.get(url, headers=headers, timeout=10).text
    # The endpoint answers with JSON
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]

    # Parse the whole page before touching comments_info, so a malformed
    # comment cannot leave the three lists with different lengths.
    rows = []
    for comment in comments_data:
        # Drop <span>...</span> and <a>...</a> fragments, keep the plain text
        text = re.sub(r"<span(.*?)</span>", "", comment["text"])
        text = re.sub(r"<a(.*?)</a>", "", text)
        rows.append((comment["created_at"], text))

    for created_at, text in rows:
        comments_info["id"].append(id)
        comments_info["date"].append(created_at)
        comments_info["text"].append(text)

def weibo_bowen_singelPage_crawl(url,headers,mblog_info,comments_info):
    """
    Crawl one page of posts and, for every post, all of its comment pages.

    :param url: URL of the post-list page to request
    :param headers: HTTP request headers (cookie, user-agent, ...)
    :param mblog_info: dict holding the lists "id", "date" and "text";
        one entry per post is appended in place
    :param comments_info: dict holding the lists "id", "date" and "text";
        filled in place by comments_singlePage_crawl
    :return: None
    :raises requests.RequestException: if the post-list request fails or times out
    :raises ValueError: if the post-list response is not valid JSON
    :raises KeyError: if the payload lacks the expected "data" / "cards" fields
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, not ``headers``.
    html = requests.get(url, headers=headers, timeout=10).text
    # The endpoint answers with JSON
    html_dict = json.loads(html)
    cards = html_dict["data"]["cards"]
    for card in cards:
        # A card without an "mblog" entry would raise KeyError and abort the
        # whole crawl; skip it instead.
        mblog = card.get("mblog")
        if mblog is None:
            continue
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Drop <span>...</span> and <a>...</a> fragments, keep the plain text
        text = re.sub(r"<span(.*?)</span>", "", mblog["text"])
        text = re.sub(r"<a(.*?)</a>", "", text)
        mblog_info["text"].append(text)
        # Base URL of the comment endpoint for this post; the counter below is
        # appended as the value of max_id_type.
        # NOTE(review): max_id_type is used here as a page counter - confirm
        # that the endpoint really paginates on it.
        comments_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # Request comment pages until one fails to load or parse
        i = 0
        while True:
            try:
                comments_singlePage_crawl(comments_url + str(i), headers, comments_info, mblog["id"])
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network error, non-JSON body or missing fields: treat as the
                # end of this post's comments. A bare ``except`` would also
                # swallow KeyboardInterrupt and hide real bugs.
                break
            i += 1
        pprint.pprint(comments_info)

def weibo_bowen_data_crawl(url,headers,max_pages=10):
    """
    Crawl several pages of posts together with their comments.

    :param url: URL prefix of the post list; the page number is appended to it
    :param headers: HTTP request headers (cookie, user-agent, ...)
    :param max_pages: number of post pages to crawl, starting at page 1
        (default 10)
    :return: tuple ``(mblog_info, comments_info)``; each is a dict holding the
        lists "id", "date" and "text"
    """
    # Storage for the posts
    mblog_info = {
        "id": [],
        "date": [],
        "text": [],
    }
    # Storage for the comments
    comments_info = {
        "id": [],
        "date": [],
        "text": [],
    }
    # Pages are numbered from 1. The upper bound is inclusive: the previous
    # range(1, 10) stopped after page 9 although 10 pages were intended.
    for page in range(1, max_pages + 1):
        weibo_bowen_singelPage_crawl(url + str(page), headers, mblog_info, comments_info)
    return mblog_info,comments_info

def bowen_data_store(mblog_info,comments_info):
    """
    Write the crawled posts and comments to two Excel workbooks.

    :param mblog_info: dict holding the lists "id", "date" and "text" of the posts
    :param comments_info: dict holding the lists "id", "date" and "text" of the comments
    :return: None; "bowen_data.xlsx" and "bowen_comments_data.xlsx" are
        written to the current working directory
    """
    # Workbook 1: posts, numbered from 1 and tagged with the search keyword
    posts = pd.DataFrame(mblog_info)
    posts["num"] = posts.index + 1
    posts["keyword"] = ["Film Changjin Lake" for _ in range(len(posts["num"]))]
    post_columns = ["num", "keyword", "id", "date", "text"]
    posts.loc[:, post_columns].to_excel("bowen_data.xlsx", sheet_name="Sheet1")

    # Workbook 2: comments, numbered from 1
    comments = pd.DataFrame(comments_info)
    comments["num"] = comments.index + 1
    comment_columns = ["num", "id", "date", "text"]
    comments.loc[:, comment_columns].to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")


if __name__ == '__main__':
    # Request headers: replace both placeholder values with your own
    headers = {
        "cookie":"自行添加",
        "user-agent":"自行添加"
    }
    # Weibo URL of the Changjin Lake account; the crawler appends the page number
    url =  "https://m.weibo.cn/api/container/getIndex?uid=7377392724&luicode=10000011&lfid=100103type%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Crawl posts and comments, then hand both result dicts to the Excel writer
    bowen_data_store(*weibo_bowen_data_crawl(url,headers))

注意：运行前需要把请求头中的 cookie 和 user-agent 替换为自己的值。运行结果会保存为两个 Excel 文件：bowen_data.xlsx（博文）和 bowen_comments_data.xlsx（评论）。
在这里插入图片描述

在这里插入图片描述
写了一下午，欢迎交流，需要数据文件的可以私聊。

呵呵样

关注

4
点赞
踩
8

收藏

觉得还不错? 一键收藏
5
评论
微博爬取长津湖博文及评论

微博爬取长津湖博文及评论# Changjin Lake film data from Weibo# @Time: 20211006# @Author: heheyangimport requestsimport jsonimport reimport pprintimport pandas as pddef comments_singlePage_crawl(url,headers,comments_info, id): """ 评论单页爬取 :param ur
复制链接

扫一扫