# -*- coding: utf-8 -*-
import datetime
import json
import re

import scrapy
from scrapy.settings import default_settings
from w3lib.html import remove_tags

from ..items import WeiboItem
class WeiboSpider(scrapy.Spider):
    """Crawl m.weibo.cn by alternating between follower lists and post feeds.

    The crawl starts from the followers container of a single seed user.
    ``parse`` turns every user found in such a list into a request for that
    user's post feed; ``parse_list`` emits one ``WeiboItem`` per post in the
    feed and then requests the followers list of each post author, which is
    handled by ``parse`` again.
    """

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_2027356850']

    # API URL templates; ``%s`` is replaced by a Weibo user id.
    _FOLLOWERS_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s'
    _PROFILE_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=230413%s_-_WEIBO_SECOND_PROFILE_WEIBO'

    # Matches relative timestamps such as "5分钟前" ("5 minutes ago").
    _RELATIVE_TIME_RE = re.compile(r'(\d+)\s*(秒|分钟|小时)前')
    # Maps the unit captured above to the matching ``timedelta`` keyword.
    _RELATIVE_UNITS = {'秒': 'seconds', '分钟': 'minutes', '小时': 'hours'}

    def parse(self, response):
        """Yield one post-feed request per user found in a followers list.

        :param response: response of a followers-container API call.
        :return: generator of ``scrapy.Request`` handled by ``parse_list``.
        """
        for card in self._load_cards(response):
            # Each card may carry a ``card_group`` list of entries; only the
            # entries that contain a ``user`` object are of interest here.
            for entry in card.get('card_group') or []:
                user = entry.get('user') if isinstance(entry, dict) else None
                if not isinstance(user, dict) or 'id' not in user:
                    continue
                yield scrapy.Request(
                    self._PROFILE_URL % user['id'],
                    callback=self.parse_list,
                )

    def parse_list(self, response):
        """Yield an item per post, then a followers request per post author.

        All items are yielded before any follow-up request, so the items of
        a feed are never held back by the requests.

        :param response: response of a post-feed API call.
        :return: generator of ``WeiboItem`` objects followed by
            ``scrapy.Request`` objects handled by ``parse``.
        """
        author_ids = []
        for card in self._load_cards(response):
            mblog = card.get('mblog')
            if not mblog:
                continue
            try:
                item, author_id = self._build_item(mblog)
            except (KeyError, TypeError) as exc:
                # Skip a single malformed post instead of aborting the whole
                # page (and losing the follow-up requests with it).
                self.logger.warning(
                    'Skipping malformed post in %s: %r', response.url, exc)
                continue
            if author_id not in author_ids:
                author_ids.append(author_id)
            yield item

        # One request per distinct author is enough: identical URLs would be
        # discarded by Scrapy's duplicate filter anyway.
        for author_id in author_ids:
            yield scrapy.Request(
                self._FOLLOWERS_URL % author_id,
                callback=self.parse,
            )

    def _load_cards(self, response):
        """Return the ``data.cards`` list of an API response, or ``[]``.

        Anything that is not a JSON object with a ``data.cards`` list is
        treated as an empty page; entries that are not dicts are dropped.
        """
        try:
            payload = json.loads(response.text)
        except ValueError:
            self.logger.warning('Non-JSON response from %s', response.url)
            return []
        data = payload.get('data') if isinstance(payload, dict) else None
        cards = data.get('cards') if isinstance(data, dict) else None
        if not isinstance(cards, list):
            return []
        return [card for card in cards if isinstance(card, dict)]

    def _build_item(self, mblog):
        """Build a ``WeiboItem`` from one ``mblog`` dict.

        :param mblog: the ``mblog`` object of a feed card.
        :return: ``(item, author_id)`` tuple.
        :raises KeyError: if an expected field is missing.
        :raises TypeError: if a field has an unexpected type (for example a
            null ``user``).
        """
        author = mblog['user']
        created = self._normalize_created_at(mblog['created_at'])
        self.logger.debug('post time: %s', created)

        item = WeiboItem()
        item['name'] = author['screen_name']
        # The post text is HTML; keep only the plain text.
        item['info'] = remove_tags(mblog['text'])
        item['time'] = created
        item['zhuanfa'] = str(mblog['reposts_count'])    # reposts
        item['pinglun'] = str(mblog['comments_count'])   # comments
        item['zan'] = str(mblog['attitudes_count'])      # likes
        return item, author['id']

    @classmethod
    def _normalize_created_at(cls, created_at, now=None):
        """Convert a relative timestamp into a ``month-day`` string.

        Strings containing "昨天" (yesterday), "刚刚" (just now) or ending in
        "...前" (... ago) are resolved against ``now``; any other value is
        returned unchanged.

        :param created_at: the ``created_at`` string of a post.
        :param now: reference ``datetime``; defaults to the local clock.
            NOTE(review): Weibo presumably reports China time, so run the
            crawler in that timezone or pass ``now`` explicitly -- confirm.
        :return: ``'M-D'`` without zero padding for relative inputs,
            otherwise ``created_at`` itself.
        """
        if now is None:
            now = datetime.datetime.now()

        if '昨天' in created_at:
            moment = now - datetime.timedelta(days=1)
        elif '刚刚' in created_at:
            moment = now
        elif '前' in created_at:
            moment = now
            match = cls._RELATIVE_TIME_RE.search(created_at)
            if match:
                # "23小时前" shortly after midnight belongs to the previous
                # day, so subtract the stated amount instead of assuming today.
                unit = cls._RELATIVE_UNITS[match.group(2)]
                moment = now - datetime.timedelta(
                    **{unit: int(match.group(1))})
        else:
            return created_at
        return '%d-%d' % (moment.month, moment.day)
# scrapy框架爬取微博之spider文件
# 最新推荐文章于 2024-04-24 09:57:07 发布