Scraping Weibo Article Content

Project scenario:

Scrape article content from Weibo.
Start by importing the required modules:
import time
import requests
import csv
import os
from datetime import datetime


Problem description

Scrape 14 fields for each article:

id
likeNum
commentsLen
reports_count
region
content
contentLen
created_at
type
detailUrl
authorAvatar
authorName
authorDetail
isVip

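For orientation, here is a minimal sketch (not part of the original script) of how one scraped record maps onto these fields; the Article class is a hypothetical helper added purely for illustration, with attribute names mirroring the CSV header used below:

from typing import NamedTuple


class Article(NamedTuple):
    # One row of articleData.csv; the field order matches the CSV header.
    id: str
    likeNum: int           # attitudes_count
    commentsLen: int       # comments_count
    reports_count: int     # reposts_count
    region: str            # region_name with the '发布于' prefix stripped
    content: str           # text_raw
    contentLen: int        # textLength
    created_at: str        # formatted as YYYY-MM-DD
    type: str              # category name taken from navData.csv
    detailUrl: str
    authorAvatar: str
    authorName: str
    authorDetail: str
    isVip: int             # user['v_plus']
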
Save the results to a CSV file:

def init():
    # Create articleData.csv with the header row on the first run only.
    if not os.path.exists('./articleData.csv'):
        with open('./articleData.csv', 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow([
                'id',
                'likeNum',
                'commentsLen',
                'reports_count',
                'region',
                'content',
                'contentLen',
                'created_at',
                'type',
                'detailUrl',
                'authorAvatar',
                'authorName',
                'authorDetail',
                'isVip'
            ])


def writerRow(row):
    # Append a single record to articleData.csv.
    with open('./articleData.csv', 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)

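A short usage sketch, assuming the script is run from the directory that should hold articleData.csv; every value in the example row is a placeholder rather than real scraped data:

init()  # creates articleData.csv with the header row if it does not exist yet
writerRow([
    'example_id', 0, 0, 0, 'example_region',
    'example content', 15, '2024-01-01', 'example_type',
    'https://www.weibo.com/example', 'https://example.com/avatar.jpg',
    'example_author', 'https://www.weibo.com/u/0', 0
])  # appends one 14-field record; repeated calls keep appending to the same file
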

Full code:

import time
import requests
import csv
import os
from datetime import datetime


def init():
    # Create articleData.csv with the header row on the first run only.
    if not os.path.exists('./articleData.csv'):
        with open('./articleData.csv', 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow([
                'id',
                'likeNum',
                'commentsLen',
                'reports_count',
                'region',
                'content',
                'contentLen',
                'created_at',
                'type',
                'detailUrl',
                'authorAvatar',
                'authorName',
                'authorDetail',
                'isVip'
            ])


def writerRow(row):
    # Append a single record to articleData.csv.
    with open('./articleData.csv', 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)


def get_data(url, params):
    # Fetch one page of articles from the hot-timeline API; the Cookie must come from a logged-in session.
    headers = {
        'Cookie': 'XSRF-TOKEN=zASpYIx0oUosfBlB0MsTSRdi; SSOLoginState=1704083302; SUB=_2A25Ilk82DeThGeBI71US9yzKzzuIHXVr6s7-rDV8PUJbkNB-LWXlkW1NRpId-Znw75c-wagHUOjJucjoob6tHv3U; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZM5jTZLaMAANadOdO6n405NHD95QcSoBNe0MESoBNWs4DqcjPi--Xi-i2iK.4i--NiK.XiKLsS0e4eo-t; WBPSESS=Ii9Wh36g6mj5Z4ggI26vDWjCIui3_Ugbw4SWQGD-3thTaFTWO4WfBvG6bThO4kGKymgzVpGAtZV7ECafvFIdUVzuArqnCejbOvzVVpt49LX2IF7cmIN2gYRZz9Z8CMGcwbkBpKHIXseyKeK-4ee9gw==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        # The list of articles sits under the 'statuses' key of the JSON payload.
        return response.json()['statuses']
    else:
        return None


def getAllTypeList():
    # Read the category list (name, group_id, containerid) from navData.csv.
    typeList = []
    with open('./navData.csv', 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        next(readerCsv)  # skip the header row
        for nav in readerCsv:
            typeList.append(nav)
    return typeList


def parse_json(response, type):
    # Extract the 14 fields from every article and append each record to articleData.csv.
    if not response:
        return
    for article in response:
        id = article['id']
        likeNum = article['attitudes_count']
        commentsLen = article['comments_count']
        reports_count = article['reposts_count']
        try:
            # region_name carries a '发布于' prefix; strip it and keep only the place name.
            region = article['region_name'].replace('发布于', '')
        except (KeyError, AttributeError):
            region = '无'
        content = article['text_raw']
        contentLen = article['textLength']
        # created_at uses the '%a %b %d %H:%M:%S %z %Y' format; keep only the date part.
        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
        try:
            detailUrl = 'https://www.weibo.com/' + str(article['id']) + '/' + str(article['mblogid'])
        except KeyError:
            detailUrl = '无'
        authorAvatar = article['user']['avatar_large']
        authorName = article['user']['screen_name']
        authorDetail = 'https://www.weibo.com/u/' + str(article['user']['id'])
        isVip = article['user']['v_plus']
        writerRow([
            id,
            likeNum,
            commentsLen,
            reports_count,
            region,
            content,
            contentLen,
            created_at,
            type,
            detailUrl,
            authorAvatar,
            authorName,
            authorDetail,
            isVip
        ])


def start(typeNum=3, pageNum=2):
    # Crawl pageNum pages of articles for each category read from navData.csv.
    articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    typeList = getAllTypeList()
    typeNumCount = 0
    for type in typeList:
        if typeNumCount > typeNum:
            return
        time.sleep(1)
        for page in range(0, pageNum):
            print('Scraping category "%s", page %s' % (type[0], page + 1))
            time.sleep(1)  # throttle requests a little to avoid being blocked
            params = {
                'group_id': type[1],
                'containerid': type[2],
                'max_id': page,
                'count': 10,
                'extparam': 'discover|new_feed'
            }
            response = get_data(articleUrl, params)
            parse_json(response, type[0])
        typeNumCount += 1


if __name__ == '__main__':
    start()

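Weibo throttles or blocks clients that request too aggressively, so a failed call to get_data() (which returns None on any non-200 response) can simply be retried after a pause. The wrapper below is an optional sketch, not part of the original script; it reuses get_data() unchanged and backs off a little longer on each attempt:

def get_data_with_retry(url, params, retries=3, delay=2):
    # Hypothetical helper: retry get_data() a few times with a growing delay.
    for attempt in range(retries):
        data = get_data(url, params)
        if data is not None:
            return data
        time.sleep(delay * (attempt + 1))  # back off: 2s, 4s, 6s with the defaults
    return None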


Results:

(Screenshot of the resulting articleData.csv.)

To scrape Weibo content you can also use the requests and BeautifulSoup libraries from the Python crawling ecosystem, together with a session that simulates a user login. Below is a basic example of scraping Weibo content (assuming the target is the posts published by the Weibo user "Python"):

```python
import requests
from bs4 import BeautifulSoup

# Simulate a user login to obtain cookies
login_url = 'https://passport.weibo.cn/signin/login'
data = {
    'username': 'your_username',
    'password': 'your_password',
    'savestate': '1',
    'entry': 'mweibo',
    'mainpageflag': '1'
}
session = requests.Session()
session.post(login_url, data=data)
cookies = session.cookies.get_dict()

# Fetch the Weibo content
url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2145291155'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
response = requests.get(url, headers=headers, cookies=cookies)
json_data = response.json()
cards = json_data['data']['cards']
for card in cards:
    mblog = card.get('mblog')
    if mblog:
        text = mblog.get('text')
        if text:
            soup = BeautifulSoup(text, 'html.parser')
            print(soup.get_text())
```

The code first simulates a login to obtain cookies, then uses requests to send a GET request and fetch the post data as JSON. Finally, BeautifulSoup parses the HTML in each post so that the plain text can be printed.

Note that Weibo is quite sensitive to crawlers: scraping too frequently can get your account or IP banned, so keep the request rate low. You also need to comply with the laws and regulations that apply to web crawling and must not use it for anything illegal.
