1. Crawling all posts of a given Weibo blogger
import datetime
import json
import os
import re
import sys
import traceback
import random
from time import sleep
import get_ck
import requests
from lxml import etree
class Weibo():
    def __init__(self, user_id, filter=0):
        self.user_id = user_id  # numeric user id to crawl, e.g. 1669879400 for the account "Dear-迪丽热巴"
        self.filter = filter  # 0 (default) crawls all of the user's posts, 1 crawls original posts only
        self.username = ''  # screen name, e.g. "Dear-迪丽热巴"
        self.weibo_num = 0  # total number of posts the user has published
        self.weibo_num2 = 0  # number of posts actually crawled
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibo_content = []  # text of each post
        self.publish_time = []  # publish time of each post
        self.publish_place = []  # post source (the "来自" field) of each post
        self.up_num = []  # like count of each post
        self.retweet_num = []  # repost count of each post
        self.comment_num = []  # comment count of each post
    # Read the cookie file and assemble the Cookie header value
    def get_cookie(self):
        cookies = ''  # accumulated "name=value;" pairs
        with open('cookies.txt', 'r') as f:
            cookies_list = json.load(f)
        for cookie in cookies_list:
            cookies += cookie['name'] + "=" + cookie['value'] + ";"
        self.cookie = {
            "Cookie": cookies
        }
        print(self.cookie)
    # Get the blogger's screen name
    def get_username(self):
        try:
            url = "https://weibo.cn/%d/info" % self.user_id  # profile page, e.g. https://weibo.cn/3084826290/info
            html = requests.get(url, cookies=self.cookie).content  # raw HTML of the profile page
            selector = etree.HTML(html)  # parse the HTML into an element tree
            # XPath is used to locate nodes; here we take the <title> text, which reads "XX的微博"
            username = selector.xpath("//title/text()")[0]
            self.username = username
            print(u"用户名: " + self.username)
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()  # print the full traceback
    # Get the blogger's post count, following count, and follower count
    def get_user_info(self):
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content  # raw HTML of the first list page
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"  # matches an integer or decimal, e.g. '1', '34', '9999'
            # Post count: matches <div class="tip2"><span class="tc">微博[15]</span>
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
            guid = re.findall(pattern, str_wb, re.S | re.M)
            self.weibo_num = int(guid[0])
            print(u"微博数: " + str(self.weibo_num))
            # Following count
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print(u"关注数: " + str(self.following))
            # Follower count
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print(u"粉丝数: " + str(self.followers))
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Get post text and, for each post, its publish time, source, like count, repost count, and comment count
    def get_weibo_info(self):
        try:
            url = "https://weibo.cn/u/%d?filter=%d&page=1" % (self.user_id, self.filter)
            html = requests.get(url, cookies=self.cookie).content
            selector = etree.HTML(html)
            # Work out how many list pages there are: no <input name='mp'> means there is only one page
            if selector.xpath("//input[@name='mp']") == []:
                page_num = 1
            else:
                page_num = int(selector.xpath("//input[@name='mp']")[0].attrib["value"])
            pattern = r"\d+\.?\d*"
            for page in range(1, page_num + 1):
                print(u"正在爬取第" + str(page) + "页,请耐心等待..........")
                if page % 10 == 0:  # every 10 pages, pause for a random 6-12 seconds to ease rate limiting
                    t = random.randint(6, 12)
                    print("睡眠" + str(t) + "秒,请稍等.....")
                    sleep(t)
                url2 = "https://weibo.cn/u/%d?filter=%d&page=%d" % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=self.cookie).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:  # the trailing div.c blocks are footer/navigation elements, not posts
                    for i in range(0, len(info) - 2):  # skip the trailing non-post blocks
                        # Post text; drop any characters the console encoding cannot represent
                        str_t = info[i].xpath("div/span[@class='ctt']")
                        weibo_content = str_t[0].xpath("string(.)").encode(sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
                        self.weibo_content.append(weibo_content)
                        # print(u"微博内容:" + weibo_content)
                        str_time = info[i].xpath("div/span[@class='ct']")
                        str_time = str_time[0].xpath("string(.)").encode(sys.stdout.encoding, "ignore").decode(
                            sys.stdout.encoding)
                        # Post source: the "来自 ..." part of the timestamp line
                        if len(str_time.split(u'来自')) > 1:
                            publish_place = str_time.split(u'来自')[1]
                        else:
                            publish_place = u'无'
                        self.publish_place.append(publish_place)
                        # print(u"微博发布工具:" + publish_place)
                        # Publish time: normalize "刚刚" / "x分钟" / "今天" / "x月x日" to "YYYY-MM-DD HH:MM"
                        publish_time = str_time.split(u'来自')[0]
                        if u"刚刚" in publish_time:
                            publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
                        elif u"分钟" in publish_time:
                            minute = publish_time[:publish_time.find(u"分钟")]
                            minute = datetime.timedelta(minutes=int(minute))
                            publish_time = (datetime.datetime.now() - minute).strftime("%Y-%m-%d %H:%M")
                        elif u"今天" in publish_time:
                            today = datetime.datetime.now().strftime("%Y-%m-%d")
                            time = publish_time[3:]
                            publish_time = today + " " + time
                        elif u"月" in publish_time:
                            year = datetime.datetime.now().strftime("%Y")
                            month = publish_time[0:2]
                            day = publish_time[3:5]
                            time = publish_time[7:12]
                            publish_time = year + "-" + month + "-" + day + " " + time
                        else:
                            publish_time = publish_time[:16]
                        self.publish_time.append(publish_time)
                        # print(u"微博发布时间:" + publish_time)
                        # Like count
                        str_zan = info[i].xpath("div/a/text()")[-4]
                        guid = re.findall(pattern, str_zan, re.M)
                        up_num = int(guid[0])
                        self.up_num.append(up_num)
                        # print(u"点赞数: " + str(up_num))
                        # Repost count
                        retweet = info[i].xpath("div/a/text()")[-3]
                        guid = re.findall(pattern, retweet, re.M)
                        retweet_num = int(guid[0])
                        self.retweet_num.append(retweet_num)
                        # print(u"转发数: " + str(retweet_num))
                        # Comment count
                        comment = info[i].xpath("div/a/text()")[-2]
                        guid = re.findall(pattern, comment, re.M)
                        comment_num = int(guid[0])
                        self.comment_num.append(comment_num)
                        # print(u"评论数: " + str(comment_num))
                        self.weibo_num2 += 1  # count of posts actually crawled
            if not self.filter:  # filter == 0: everything on the pages was crawled
                print(u"共" + str(self.weibo_num2) + u"条微博")
            else:
                print(u"共" + str(self.weibo_num) + u"条微博,其中" + str(self.weibo_num2) + u"条为原创微博")
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Write the crawled information to a text file
    def write_txt(self):
        try:
            if self.filter:  # filter == 1: only original posts were crawled
                result_header = u"\n\n原创微博内容:\n"
            else:
                result_header = u"\n\n微博内容:\n"
            result = (u"用户信息\n用户昵称:" + self.username +
                      u"\n用户id:" + str(self.user_id) +
                      u"\n微博数:" + str(self.weibo_num) +
                      u"\n关注数:" + str(self.following) +
                      u"\n粉丝数:" + str(self.followers) +
                      result_header)
            for i in range(1, self.weibo_num2 + 1):
                text = (str(i) + "\n" +
                        u"微博内容:" + self.weibo_content[i - 1] + "\n" +
                        u"发布时间:" + str(self.publish_time[i - 1]) + "\n" +
                        u"来自:" + str(self.publish_place[i - 1]) + "\n" +
                        u"点赞数:" + str(self.up_num[i - 1]) + "\n" +
                        u"转发数:" + str(self.retweet_num[i - 1]) + "\n" +
                        u"评论数:" + str(self.comment_num[i - 1]) + "\n\n"
                        )
                result = result + text
            # os.path.split() splits a path into (directory, filename); save under a "weibo" folder next to this script
            file_dir = os.path.split(os.path.realpath(__file__))[0] + os.sep + "weibo"
            if not os.path.isdir(file_dir):
                os.mkdir(file_dir)
            file_path = file_dir + os.sep + "北京生态环境1.txt"
            # "wb" opens the file for binary writing: it is created if missing and truncated if it already exists
            f = open(file_path, "wb")
            f.write(result.encode(sys.stdout.encoding))
            f.close()
            print(u"微博写入文件完毕,保存路径:" + file_path)
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
    # Run the crawler
    def start(self):
        try:
            self.get_cookie()
            self.get_username()
            self.get_user_info()
            print("正在爬取数据请耐心等待............")
            self.get_weibo_info()
            self.write_txt()
            print(u"信息抓取完毕")
        except Exception as e:
            print("Error: ", e)
def main():
    try:
        user_id = 1669879400  # replace with any valid numeric user id (other than the id of the crawling account itself); 1669879400 is the example id mentioned above
        filter = 0  # 0 crawls all posts (original + reposts), 1 crawls original posts only
        wb = Weibo(user_id, filter)  # create a Weibo instance
        wb.start()  # crawl the blogger's posts
        print(u"用户名:" + wb.username)
        print(u"全部微博数:" + str(wb.weibo_num))
        print(u"关注数:" + str(wb.following))
        print(u"粉丝数:" + str(wb.followers))
    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()
if __name__ == "__main__":
    get_ck.get_cookies()
    sleep(10)
    main()
2. Scan a QR code to log in and save the cookie values (get_ck.py)
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

browser_options = Options()
browser = webdriver.Chrome(options=browser_options)
headers = {'user-agent': 'your user agent'}
print("浏览器已成功创建。")

def get_cookies(url='https://passport.weibo.com/sso/signin?entry=wapsso&source=wapsso&url=https%3A%2F%2Fm.weibo.cn%2F'):
    browser.get(url)
    print('请在25秒内,使用微博APP扫码登录你的账号...')
    time.sleep(25)
    # Save the cookies of the logged-in session to cookies.txt
    with open('cookies.txt', 'w') as f:
        f.write(json.dumps(browser.get_cookies()))
    print('已成功保存cookie信息。')
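For reference, cookies.txt ends up holding the JSON list returned by Selenium's browser.get_cookies(), which is exactly what Weibo.get_cookie() in part 1 expects: it reads only the name and value keys and joins them into a single Cookie header. A sketch of that structure, with placeholder names and values:

# Illustrative contents of cookies.txt (placeholder names/values, abridged fields);
# real Selenium cookie dicts also carry keys such as "secure" that get_cookie() ignores.
example_cookies = [
    {"domain": ".weibo.cn", "name": "example_name_1", "value": "example_value_1", "path": "/"},
    {"domain": ".weibo.cn", "name": "example_name_2", "value": "example_value_2", "path": "/"},
]
# get_cookie() joins these into: "example_name_1=example_value_1;example_name_2=example_value_2;"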
3. Note: if a single run does not fetch all the data, that is Weibo's anti-crawling mechanism cutting the crawl short. The workaround is to crawl in several runs: record the page number where the previous run stopped, and start the next run from that page, adjusting the page range and the output file name in the code. A minimal sketch of this resume logic is shown below.
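The original script always starts at page 1, so resuming has to be wired in by hand. The sketch below shows one way to do it, assuming a small checkpoint file named last_page.txt; the helper names and the file name are illustrative additions, not part of the original code.

# Hypothetical resume helpers: remember the last page crawled in a checkpoint file
# so the next run can continue where the previous one stopped.
import os

CHECKPOINT = "last_page.txt"  # illustrative file name

def load_start_page():
    # Resume from the page after the one recorded in the checkpoint, or from page 1.
    if os.path.isfile(CHECKPOINT):
        with open(CHECKPOINT, "r") as f:
            return int(f.read().strip()) + 1
    return 1

def save_last_page(page):
    # Record the last page that was fetched successfully.
    with open(CHECKPOINT, "w") as f:
        f.write(str(page))

# Inside get_weibo_info(), the page loop could then read, for example:
#     for page in range(load_start_page(), page_num + 1):
#         ...  # parse the page as before
#         save_last_page(page)
# and write_txt() could include the page range in the output file name so that
# successive runs do not overwrite each other, e.g. "北京生态环境_11-20.txt".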