import scrapy
import json
import re
import datetime
import time
from w3lib.html import remove_tags
import math
from my_project.items import WeiboItem
class WeiboSpider(scrapy.Spider):
    """Crawl the Weibo mobile JSON API.

    Starting from one user's timeline, yields a ``WeiboItem`` per post,
    paginates through all of that user's posts, then walks the user's
    "following" list to discover further users and repeats the process
    on each of them.
    """

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    start_urls = [
        'https://m.weibo.cn/api/container/getIndex'
        '?type=uid&value=1793285524&containerid=1076031793285524'
    ]

    @staticmethod
    def _normalize_created_at(created_at):
        """Normalize a post timestamp to a uniform date string.

        Weibo returns recent post times as a relative string like
        ``'3小时前'`` ("3 hours ago").  If the number of hours ago exceeds
        the current hour of the day, the post was made yesterday;
        otherwise today.  Any other timestamp format is returned
        unchanged.
        """
        if '小时' not in created_at:
            return created_at
        hours_ago = int(re.match(r'\d+', created_at)[0])
        now = datetime.datetime.now()
        if hours_ago > now.hour:
            # Posted more hours ago than have elapsed today -> yesterday.
            return (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        return now.strftime('%Y-%m-%d')

    def parse(self, response):
        """Parse one timeline page.

        Yields one ``WeiboItem`` per post card, then schedules requests
        for every remaining timeline page and for the user's "following"
        list (handled by :meth:`parse_info`).
        """
        res_dict = json.loads(response.text)

        # uid of the timeline's owner; filled in from the first usable card.
        uid = None
        for card in res_dict['data']['cards']:
            try:
                mblog = card['mblog']
                item = WeiboItem()
                item['name'] = mblog['user']['screen_name']
                item['time'] = self._normalize_created_at(mblog['created_at'])
                item['content'] = mblog['text']
                item['passnum'] = mblog['reposts_count']
                item['comments_count'] = mblog['comments_count']
                item['attitudes_count'] = mblog['attitudes_count']
                uid = mblog['user']['id']
                yield item
            except Exception as e:
                # Cards without an 'mblog' payload (headers, ads, ...) are
                # skipped; best-effort scraping, so log and continue.
                print(e)

        if uid is None:
            # No post card parsed on this page: nothing to paginate or follow.
            return

        try:
            # Total number of posts -> number of pages (~10 posts per page).
            total = res_dict['data']['cardlistInfo']['total']
            page_count = math.ceil(total / 10)
        except Exception as e:
            print(e)
            page_count = 0

        # Schedule every timeline page of this user.
        for page in range(1, page_count + 1):
            page_url = (
                'https://m.weibo.cn/api/container/getIndex?type=uid'
                f'&value={uid}&containerid=107603{uid}&page={page}'
            )
            yield scrapy.Request(url=page_url, callback=self.parse)

        # Follow the AJAX endpoint for the people this user follows,
        # carrying the user's own id along in the request meta.
        uid_url = (
            'https://m.weibo.cn/api/container/getIndex'
            f'?containerid=231051_-_followers_-_{uid}'
            '_-_1042015:tagCategory_050&luicode=10000011'
            f'&lfid=107603{uid}&type=uid&value={uid}'
        )
        yield scrapy.Request(url=uid_url, callback=self.parse_info,
                             meta={'gid': uid})

    def parse_info(self, response):
        """Parse one page of a user's "following" list.

        Paginates the list, then schedules each followed user's timeline
        for :meth:`parse`.
        """
        res_dict = json.loads(response.text)

        try:
            # Total number of followed users -> pages (~20 users per page).
            total = res_dict['data']['cardlistInfo']['total']
            page_count = math.ceil(total / 20)
        except (KeyError, TypeError):
            # Total unavailable: fall back to a fixed page count.  One seed
            # user is enough to keep discovering others, so this is fine.
            page_count = 5

        # uid carried over from parse() via request meta.
        uid = response.meta['gid']

        # Schedule the remaining pages of the following list (page 1 is the
        # current response).
        for page in range(2, page_count + 1):
            list_url = (
                'https://m.weibo.cn/api/container/getIndex'
                f'?containerid=231051_-_followers_-_{uid}'
                '_-_1042015:tagCategory_050&luicode=10000011'
                f'&lfid=107603{uid}&type=uid&value={uid}&page={page}'
            )
            yield scrapy.Request(url=list_url, callback=self.parse_info,
                                 meta={'gid': uid})

        # The card group holding follower entries sits at index 3 on most
        # pages, but at index 0 on some response layouts.
        try:
            card_group = res_dict['data']['cards'][3]['card_group']
        except (IndexError, KeyError):
            card_group = res_dict['data']['cards'][0]['card_group']

        for entry in card_group:
            try:
                followed_uid = entry['buttons'][0]['actionlog']['oid']
            except Exception as e:
                # Entry without a follow button / actionlog: skip it rather
                # than re-requesting the previous uid.
                print(e)
                continue
            # Hand the followed user's timeline back to parse().
            uid_url = (
                'https://m.weibo.cn/api/container/getIndex?type=uid'
                f'&value={followed_uid}&containerid=107603{followed_uid}'
            )
            yield scrapy.Request(url=uid_url, callback=self.parse)