Analyzing the pages
Inspecting the JSON returned by Weibo's mobile site shows that:
the user profile request URL is https://m.weibo.cn/profile/info?uid={uid}
the follows-list request URL is https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}
the fans-list request URL is https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={id}
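To double-check the structure before writing the spider, the profile endpoint can be probed by hand. A minimal sketch with requests, assuming the mobile API still answers anonymous requests given a browser-like User-Agent (it may require cookies today); the uid is the first account from the start list used below:

import requests

# Probe the profile endpoint; the envelope is assumed to be
# {'ok': 1, 'data': {'user': {...}}}, which is what the spider below relies on.
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/605.1.15'}
resp = requests.get('https://m.weibo.cn/profile/info?uid=1655890975', headers=headers)
result = resp.json()
print(result['data']['user']['screen_name'])
print(result['data']['user']['followers_count'])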
Creating the Scrapy project
Open a command prompt, cd to where the project should live, and run scrapy startproject weibo.
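startproject generates the usual skeleton (layout of recent Scrapy versions); the files edited below are items.py, pipelines.py, and a new spider module under spiders/:

weibo/
    scrapy.cfg
    weibo/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py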
items.py
Two Item classes are defined here: userItem for a user's profile information, and userRelationItem for the follows/fans lists (a third, weiboItem, is left as an empty placeholder for post data).
import scrapy


class userItem(scrapy.Item):
    id = scrapy.Field()
    avatar_hd = scrapy.Field()
    description = scrapy.Field()
    follow_count = scrapy.Field()
    followers_count = scrapy.Field()
    gender = scrapy.Field()
    profile_image_url = scrapy.Field()
    profile_url = scrapy.Field()
    name = scrapy.Field()
    statuses_count = scrapy.Field()
    verified = scrapy.Field()
    verified_reason = scrapy.Field()


class userRelationItem(scrapy.Item):
    id = scrapy.Field()
    follows = scrapy.Field()
    fans = scrapy.Field()


class weiboItem(scrapy.Item):
    pass  # empty placeholder for post data, not used yet
Spider.py
Because the crawl starts from user profile pages, start_requests is overridden here for convenience; it submits the initial URLs to the engine.
from scrapy import Request, Spider

from ..items import userItem, userRelationItem


class weiboSpider(Spider):
    name = 'weibospider'
    allowed_domains = ['m.weibo.cn']
    user_url = 'https://m.weibo.cn/profile/info?uid={uid}'
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fans_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={id}'
    weibo_url = ''  # placeholder for a posts API, not used yet
    start_users = ['1655890975', '5832321505', '1645677662']

    def start_requests(self):
        for uid in self.start_users:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        result = response.json()
        useritem = userItem()
        if result.get('data', {}).get('user'):
            user_info = result['data']['user']
            # Map each item key to the corresponding key in the JSON data
            user_map = {
                'id': 'id',
                'avatar_hd': 'avatar_hd',
                'description': 'description',
                'follow_count': 'follow_count',
                'followers_count': 'followers_count',
                'gender': 'gender',
                'profile_image_url': 'profile_image_url',
                'profile_url': 'profile_url',
                'name': 'screen_name',
                'statuses_count': 'statuses_count',
                'verified': 'verified',
                'verified_reason': 'verified_reason'
            }
            for i, w in user_map.items():
                useritem[i] = user_info[w]
            yield useritem
            uid = user_info['id']
            yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                          meta={'page': 1, 'uid': uid})
            yield Request(self.fans_url.format(uid=uid, id=1), callback=self.parse_fans,
                          meta={'id': 1, 'uid': uid})

    def parse_follows(self, response):
        result = response.json()
        if result['ok'] and result['data']['cards'][0]['card_group']:
            follows = result['data']['cards'][0]['card_group']
            # Schedule each followed user's own profile page
            for follow in follows:
                uid = follow['user']['id']
                yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta['uid']
            relationitem = userRelationItem()
            follows = [{'id': follow['user']['id'], 'name': follow['user']['screen_name']}
                       for follow in follows]
            relationitem['id'] = uid
            relationitem['follows'] = follows
            yield relationitem
            # Next page of follows
            page = response.meta['page'] + 1
            yield Request(self.follow_url.format(uid=uid, page=page), callback=self.parse_follows,
                          meta={'page': page, 'uid': uid})

    def parse_fans(self, response):
        result = response.json()
        if result['ok'] and result['data']['cards'][0]['card_group']:
            fans = result['data']['cards'][0]['card_group']
            # Schedule each fan's own profile page
            for fan in fans:
                uid = fan['user']['id']
                yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta['uid']
            relationitem = userRelationItem()
            fans = [{'id': fan['user']['id'], 'name': fan['user']['screen_name']}
                    for fan in fans]
            relationitem['id'] = uid
            relationitem['fans'] = fans
            yield relationitem
            # Next page of fans
            id = response.meta['id'] + 1
            yield Request(self.fans_url.format(uid=uid, id=id), callback=self.parse_fans,
                          meta={'id': id, 'uid': uid})
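At this point the spider can already be smoke-tested from the project root; weibospider is the name attribute defined above, and until a pipeline is enabled the scraped items only show up in the log:

scrapy crawl weibospider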
pipelines.py
To store the scraped data in MySQL, we implement the WeiboPipeline class:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql

from .items import userItem, userRelationItem


class WeiboPipeline(object):
    def __init__(self):
        self.mysqlcli = pymysql.connect(host='192.168.37.128', user='root', passwd='ww901020',
                                        port=3306, db='sina')
        # Create both tables once, instead of on every item
        cur = self.mysqlcli.cursor()
        cur.execute('create table if not exists weibo('
                    'id varchar(200) primary key, name varchar(200),'
                    'avatar_hd varchar(200), description varchar(200),'
                    'follow_count varchar(200), followers_count varchar(200),'
                    'gender varchar(200), profile_image_url varchar(200),'
                    'profile_url varchar(200), statuses_count varchar(200),'
                    'verified varchar(200), verified_reason varchar(200))')
        cur.execute('create table if not exists liebiao(_id varchar(200), '
                    '_follows varchar(5000) default "", fans varchar(5000) default "")')
        self.mysqlcli.commit()

    def process_item(self, item, spider):
        cur = self.mysqlcli.cursor()
        if isinstance(item, userItem):  # a user profile item
            # Build the INSERT dynamically from the item's keys
            keys = ','.join(item.keys())
            vals = ','.join(['%s'] * len(item))
            vvs = tuple(str(item[k]) for k in item)
            # INSERT IGNORE: the same user is usually reached through several
            # follows/fans lists, so duplicate primary keys are expected
            sql = 'INSERT IGNORE INTO weibo({}) VALUES({})'.format(keys, vals)
            cur.execute(sql, vvs)
            self.mysqlcli.commit()
        elif isinstance(item, userRelationItem):  # a follows/fans list item
            uid = item['id']
            # Besides 'id', the item carries either 'follows' or 'fans'
            relation_key = [k for k in item if k != 'id'][0]
            relation_json = json.dumps(item[relation_key])
            # Create the user's row on first sight, then append one page per update
            cur.execute('SELECT 1 FROM liebiao WHERE _id=%s', (uid,))
            if not cur.fetchone():
                cur.execute('INSERT INTO liebiao(_id) VALUES(%s)', (uid,))
            if relation_key == 'follows':
                sql2 = 'UPDATE liebiao SET _follows = CONCAT(_follows,%s) WHERE _id=%s'
            else:
                sql2 = 'UPDATE liebiao SET fans = CONCAT(fans,%s) WHERE _id=%s'
            cur.execute(sql2, (relation_json, uid))
            self.mysqlcli.commit()
        return item
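Finally, as the template comment above says, the pipeline must be registered in settings.py. A minimal sketch: the priority 300 is arbitrary, and the User-Agent and robots overrides are assumptions (m.weibo.cn tends to reject Scrapy's default agent):

ITEM_PIPELINES = {
    'weibo.pipelines.WeiboPipeline': 300,
}

# Assumed overrides so the mobile API answers; adjust as needed
USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/605.1.15'
ROBOTSTXT_OBEY = False

With these settings in place, running the spider again fills the weibo and liebiao tables.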