Sina Weibo Fan Crawler (the WAP site only allows crawling 20 pages)

As the figure above shows, Weibo now treats fan distribution as a trade secret, so crawling it keeps getting harder. Both the web site and the mobile (WAP) site are restricted.

There are two approaches: crawling by hand, and crawling via the Weibo API.
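
For reference, the API route looks roughly like the sketch below. It is not used in the rest of this post, and it assumes you have registered an app on Weibo's open platform and obtained an OAuth2 access_token; friendships/followers.json is the open-platform endpoint for follower lists:

# encoding=utf-8
import requests

ACCESS_TOKEN = "your_oauth2_access_token"  # assumption: obtained via OAuth2 at open.weibo.com

resp = requests.get(
    "https://api.weibo.com/2/friendships/followers.json",
    params={
        "access_token": ACCESS_TOKEN,
        "uid": 1291477752,  # the same account the script below crawls
        "count": 200,       # followers per page
        "cursor": 0,        # paging cursor; the response's next_cursor continues the list
    },
)
for user in resp.json().get("users", []):
    print(user.get("screen_name"), user.get("location"))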

This post demonstrates the hand-rolled approach, using Li Yifeng's fan distribution as the example. Only 20 pages can be crawled; the source code follows:

# encoding=utf-8
import random
import json
import base64
import re

import requests
from lxml import etree
import pymongo

"""
Fill in your own Weibo account(s) and password(s); they can be bought on
Taobao for about one yuan per seven. Buy a few dozen if you can: Weibo
rate-limits strictly, and crawling too fast triggers 302 redirects.
Alternatively, increase the interval between requests.
"""
myWeiBo = [
    # {'no': '314061410@qq.com', 'psw': '123456789'},
    {'no': '835163102@qq.com', 'psw': '987654321'},
    # {'no': 'shudieful3618@163.com', 'psw': 'a123456'},
]

host = "http://weibo.cn"
cookies = []

def getCookies(weibo):
    """Log in with each account via the SSO endpoint and collect its cookies."""
    loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
    for elem in weibo:
        account = elem['no']
        password = elem['psw']
        # The SSO endpoint expects the account name base64-encoded in the 'su' field.
        username = base64.b64encode(account.encode('utf-8')).decode('utf-8')
        postData = {
            "entry": "sso",
            "gateway": "1",
            "from": "null",
            "savestate": "30",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": username,
            "service": "sso",
            "sp": password,
            "sr": "1440*900",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            "prelt": "0",
            "returntype": "TEXT",
        }
        session = requests.Session()
        r = session.post(loginURL, data=postData)
        info = json.loads(r.content.decode('gbk'))  # the SSO response is GBK-encoded
        if info["retcode"] == "0":
            print("Get Cookie Success! (Account: %s)" % account)
            cookie = session.cookies.get_dict()
            cookies.append(cookie)
        else:
            print("Failed! (Reason: %s)" % info['reason'])
    return cookies


def weiboLogin(ID):
    """Log in every account, then fetch the first page of ID's fan list."""
    getCookies(myWeiBo)  # populates the global cookies list
    print("Get Cookies Finish! (Num: %d)" % len(cookies))
    cookie = random.choice(cookies)  # rotate accounts to spread the load
    fans_url = "http://weibo.cn/%s/fans" % ID
    r = requests.get(fans_url, cookies=cookie)
    return r.content.decode('utf-8')

def url_to_page(url):
    """Fetch a weibo.cn page with a randomly chosen account's cookies."""
    cookie = random.choice(cookies)
    r = requests.get(url, cookies=cookie)
    if r.status_code == requests.codes.ok:
        return r.content.decode('utf-8')  # weibo.cn serves UTF-8; decode before parsing
    raise RuntimeError("Request failed (%d): %s" % (r.status_code, url))

def MongoInit():
    """Open the MongoDB collection that will store the fans' cities."""
    client = pymongo.MongoClient("localhost", 27017)
    db = client["Sina_by_fw"]
    Fans_db = db["Fans"]
    print("MongoDB init finished")
    return Fans_db

fans_cities = []
page_count = 0
fans_count = 0
Fans_db = MongoInit()

def parse_for_fans(page, IDhost):
    """Extract fan IDs from a fan-list page, record each fan's city, and
    recurse into the next page until the "下页" (next page) link disappears."""
    global fans_cities, page_count, fans_count
    page_count += 1
    print("page_count =", page_count)
    IDs = set(re.findall(r'uid=(\d+)', page))
    for ID in IDs:
        if ID != str(IDhost):  # IDs from the page are str, IDhost is an int
            fans_count += 1
            info_page = url_to_page("http://weibo.cn/%s/info" % ID)
            nick = re.findall(u'昵称[:|：](.*?)<', info_page)
            city = re.findall(u'地区[:|：](.*?)<', info_page)
            if not (nick and city):
                continue  # the profile hides these fields, or the layout changed
            print(nick[0], city[0], fans_count)
            fans_cities.append(city[0])
            # Optionally flush to MongoDB every 50 fans instead of all at once:
            # if len(fans_cities) == 50:
            #     Fans_db.insert_one({str(i + 1): c for i, c in enumerate(fans_cities)})
            #     del fans_cities[:]
    e_page = etree.HTML(page)
    url_next = e_page.xpath(u'//div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href')
    if url_next:
        next_page = url_to_page(host + url_next[0])
        parse_for_fans(next_page, IDhost)
    else:
        # No more pages: store every collected city as one MongoDB document.
        fans_cities_dict = {str(i + 1): c for i, c in enumerate(fans_cities)}
        Fans_db.insert_one(fans_cities_dict)
        del fans_cities[:]

# System notice shown on the site: to reduce spam, Weibo's anti-spam system
# filters some advertising accounts out of the fan list.
page = weiboLogin(ID=1291477752)
parse_for_fans(page, IDhost=1291477752)
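
Once the crawl has run, the city distribution can be tallied straight from MongoDB. A minimal sketch, assuming the Sina_by_fw/Fans layout written by the script above (each document maps an index string to a city name):

# encoding=utf-8
from collections import Counter
import pymongo

Fans_db = pymongo.MongoClient("localhost", 27017)["Sina_by_fw"]["Fans"]

counter = Counter()
for doc in Fans_db.find():
    # Every key except MongoDB's own _id field holds a city name.
    counter.update(v for k, v in doc.items() if k != "_id")

for city, n in counter.most_common(10):
    print(city, n)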

Notes:

1. Pages fetched from the site must be decoded as UTF-8 before parsing.

2. Documents written to MongoDB must be Python dicts.
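
To illustrate both notes, a minimal sketch (the database and collection names match the script above; fetching a profile page without login cookies will normally just redirect, so the URL here is purely illustrative):

# encoding=utf-8
import requests
import pymongo

r = requests.get("http://weibo.cn/1291477752/info")
page = r.content.decode('utf-8')  # note 1: decode the raw bytes as UTF-8

Fans_db = pymongo.MongoClient("localhost", 27017)["Sina_by_fw"]["Fans"]
Fans_db.insert_one({"1": u"北京"})  # note 2: insert_one() takes a dict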
