As the figure above shows, Weibo now treats follower distribution as a commercial secret, so crawling it is getting harder and harder; both the web version and the mobile version are restricted.
There are two ways to get at the data: crawling by hand (simulated login) and crawling through the Weibo API.
This post demonstrates the manual approach, taking Li Yifeng's follower distribution as the example; only 20 pages of fans can be fetched. The source code is as follows:
# encoding=utf-8
import random
import json
import base64
import requests
from lxml import etree
import re
import pymongo
"""
输入你的微博账号和密码,可去淘宝买,一元七个。
建议买几十个,微博限制的严,太频繁了会出现302转移。
或者你也可以把时间间隔调大点。
"""
myWeiBo = [
    # {'no': '314061410@qq.com', 'psw': '123456789'},
    {'no': '835163102@qq.com', 'psw': '987654321'},
    # {'no': 'shudieful3618@163.com', 'psw': 'a123456'},
]
host = "http://weibo.cn"
scrawl_ID = set()
cookies = []
def getCookies(weibo):
    """ Log in to every account in `weibo` and collect its cookies. """
    loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
    for elem in weibo:
        account = elem['no']
        password = elem['psw']
        # The SSO endpoint expects the username base64-encoded.
        username = base64.b64encode(account.encode('utf-8')).decode('utf-8')
        postData = {
            "entry": "sso",
            "gateway": "1",
            "from": "null",
            "savestate": "30",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": username,
            "service": "sso",
            "sp": password,
            "sr": "1440*900",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            "prelt": "0",
            "returntype": "TEXT",
        }
        session = requests.Session()
        r = session.post(loginURL, data=postData)
        # The response body is GBK-encoded JSON.
        jsonStr = r.content.decode('gbk')
        # print 'jsonStr=', jsonStr
        info = json.loads(jsonStr)
        # print 'info=', info
        if info["retcode"] == "0":
            print "Get Cookie Success! ( Account:%s )" % account
            cookie = session.cookies.get_dict()
            cookies.append(cookie)
        else:
            print "Failed! ( Reason:%s )" % info['reason']
    return cookies
def weiboLogin(ID):
    cookies = getCookies(myWeiBo)
    print "Get Cookies Finish! ( Num:%d )" % len(cookies)
    cookie = random.choice(cookies)
    rich_url = "http://weibo.cn/%s/fans" % ID
    r = requests.post(rich_url, cookies=cookie)
    return r.content
def url_to_page(url):
    """ Fetch `url` with a randomly chosen logged-in cookie. """
    cookie = random.choice(cookies)
    r = requests.post(url, cookies=cookie)
    if r.status_code == requests.codes.ok:
        return r.content
    # Callers assume success; a non-200 status code is returned as-is.
    return r.status_code
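# --- Illustrative sketch (not part of the original script) --------------------
# The note at the top recommends widening the interval between requests to
# avoid 302 redirects. One simple option is a wrapper around url_to_page that
# sleeps for a random interval first; the helper name and the 2-5 second range
# below are assumptions, tune them to your own rate limits.
import time

def url_to_page_throttled(url, delay_range=(2, 5)):
    """ Like url_to_page, but pause a random few seconds before the request. """
    time.sleep(random.uniform(*delay_range))
    return url_to_page(url)
# ------------------------------------------------------------------------------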
def MongoInit():
    """ Connect to a local MongoDB and return the collection holding the fans. """
    client = pymongo.MongoClient("localhost", 27017)
    db = client["Sina_by_fw"]
    Fans_db = db["Fans"]
    print "MongoInit() finish ****"
    return Fans_db
# Accumulators shared with parse_for_fans().
fans_cities = []
page_count = 0
fans_count = 0
Fans_db = MongoInit()
def parse_for_fans(page, IDhost):
    """ Parse one fans page, look up each fan's region, then recurse to the next page. """
    global Fans_db, fans_cities
    global page_count
    global fans_count
    page_count += 1
    print "page_count=", page_count
    IDs = set(re.findall(r'uid=(\d+)', page))
    for ID in IDs:
        if ID != str(IDhost):  # ID is a str, IDhost an int
            fans_count += 1
            info_page = url_to_page("http://weibo.cn/%s/info" % ID)
            # u'\u6635\u79f0' is "昵称" (nickname); the pattern must be UTF-8
            # encoded because info_page is raw bytes.
            expression_nick = u'\u6635\u79f0[:|\uff1a](.*?)<'
            nick = re.findall(expression_nick.encode('UTF-8'), info_page)[0]
            # u'\u5730\u533a' is "地区" (region).
            expression_city = u'\u5730\u533a[:|\uff1a](.*?)<'
            city = re.findall(expression_city.encode('UTF-8'), info_page)[0]
            print nick, city, fans_count
            fans_cities.append(city)
            '''
            # Disabled alternative: flush to MongoDB every 50 fans instead of all at once.
            if len(fans_cities) == 50:
                fans_cities_dict = dict()
                for i in range(len(fans_cities)):
                    fans_cities_dict[str(i + 1)] = fans_cities[i]
                Fans_db.insert(fans_cities_dict)
                del fans_cities[:]
            '''
    e_page = etree.HTML(page)
    # u'\u4e0b\u9875' is "下页" (the "next page" link).
    url_next = e_page.xpath(u'//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href')
    if url_next:
        next_page = url_to_page(host + url_next[0])
        parse_for_fans(next_page, IDhost)
    else:
        # Last page: store the collected cities as one document keyed "1", "2", ...
        # (MongoDB only accepts dicts, see the note at the bottom).
        fans_cities_dict = dict()
        for i in range(len(fans_cities)):
            fans_cities_dict[str(i + 1)] = fans_cities[i]
        Fans_db.insert(fans_cities_dict)
        del fans_cities[:]
# Note: the fans page may say that, "to avoid harassment", Weibo's smart
# anti-spam system has filtered out some advertising users, so the list is
# not complete.
page = weiboLogin(ID=1291477752)           # uid whose fans are crawled (Li Yifeng, per the intro)
parse_for_fans(page, IDhost=1291477752)
2. The data written to MongoDB must be a Python dict (see the sketch below).
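As a minimal illustration of that point, the sketch below (assuming pymongo 3+, a MongoDB instance on localhost:27017, and the same Sina_by_fw.Fans collection as in the script above) folds a list of cities into a single city-to-count dict with collections.Counter before inserting it; the sample city list and the field name are made-up placeholders.
# -*- coding: utf-8 -*-
# Minimal sketch; sample data and field name are hypothetical.
from collections import Counter
import pymongo

fans_cities = [u'北京', u'上海', u'北京']          # hypothetical sample data

client = pymongo.MongoClient("localhost", 27017)
Fans_db = client["Sina_by_fw"]["Fans"]

# A bare list is rejected: pymongo's insert_one() (like the older insert()
# used above) only accepts dict-like documents.
distribution = dict(Counter(fans_cities))        # {u'北京': 2, u'上海': 1}
Fans_db.insert_one({"fans_city_distribution": distribution})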