Crawling Weibo with the Scrapy framework: the spider file
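The spider below drives the m.weibo.cn container JSON API in two steps: parse() reads a seed user's followers list (containerid 231051_-_followers_-_<uid>) and requests each follower's weibo feed (containerid 230413<uid>_-_WEIBO_SECOND_PROFILE_WEIBO); parse_list() extracts every post on that feed and then queues the poster's own followers list back into parse(), so the crawl keeps spreading outward from the seed user.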

# -*- coding: utf-8 -*-
import json
from datetime import datetime, timedelta

import scrapy
from w3lib.html import remove_tags

from ..items import WeiboItem

class WeiboSpider(scrapy.Spider):
    name = 'weibo'
    allowed_domains = ['weibo.cn']
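    # seed request: the followers list of one user (uid 2027356850),
    # returned as JSON by the m.weibo.cn container API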
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_2027356850']

    def parse(self, response):
        user_str = json.loads(response.text)
        # each card wraps a group of users from the followers list
        user_list = user_str['data']['cards']
        for i in user_list:
            name_str = i['card_group']
            for j in name_str:
                if 'user' in j:
                    user_id = j['user']['id']
                    # request this follower's own weibo feed
                    user_url = 'https://m.weibo.cn/api/container/getIndex?containerid=230413%d_-_WEIBO_SECOND_PROFILE_WEIBO'
                    url = user_url % user_id
                    yield scrapy.Request(url, callback=self.parse_list)


    def parse_list(self, response):
        user_dict = json.loads(response.text)
        for i in user_dict['data']['cards']:
            if 'mblog' in i:
                # poster's screen name
                name = i['mblog']['user']['screen_name']
                # post body, with HTML tags stripped
                info = remove_tags(i['mblog']['text'])
                # creation time; relative values are normalized to a month-day string
                time = i['mblog']['created_at']
                if '前' in time:      # e.g. "5分钟前": posted today
                    time = datetime.now().strftime('%m-%d')
                elif '昨天' in time:  # posted yesterday
                    time = (datetime.now() - timedelta(days=1)).strftime('%m-%d')
                # repost, comment and like counts
                zhuanfa = i['mblog']['reposts_count']
                pinglun = i['mblog']['comments_count']
                zan = i['mblog']['attitudes_count']
                item = WeiboItem()
                item['name'] = name
                item['info'] = info
                item['time'] = time
                item['zhuanfa'] = str(zhuanfa)
                item['pinglun'] = str(pinglun)
                item['zan'] = str(zan)
                yield item
                # queue this poster's followers list back into parse()
                user_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%d'
                yield scrapy.Request(user_url % i['mblog']['user']['id'], callback=self.parse)
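For reference, here is a minimal items.py sketch that matches the six fields the spider fills; the field set is read off the assignments in parse_list() above, so treat it as an assumption rather than the original file:

import scrapy

class WeiboItem(scrapy.Item):
    # field names match the keys assigned in parse_list()
    name = scrapy.Field()      # poster's screen name
    info = scrapy.Field()      # post text with HTML tags removed
    time = scrapy.Field()      # month-day string
    zhuanfa = scrapy.Field()   # repost count
    pinglun = scrapy.Field()   # comment count
    zan = scrapy.Field()       # like count

Run the spider with scrapy crawl weibo (add -o weibo.json to dump the items). Note that m.weibo.cn may reject bare requests: if the API returns empty card lists, set a mobile User-Agent and a logged-in Cookie via DEFAULT_REQUEST_HEADERS, and set ROBOTSTXT_OBEY = False in settings.py.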



