1: I had long been frustrated by how restrictive the open API of Sina Weibo is. The only realistic option is to fetch the data with a crawler, so I spent a long time looking for simulated-login code, but none of it actually worked. In the end I took the code from an existing article, made a few small changes, and finally got it working.
For reference, see http://blog.csdn.net/ta790799213/article/details/44205351
2: During the simulated login I ran into retcode=4049. The fix: log in at http://login.sina.com.cn/?r=%2Fmember%2Fsecurity%2Fprotect.php and configure selected regions to log in without a verification code. If that does not work, log in to Weibo itself and make the same setting on the login-protection page (allow logins from selected regions without a verification code).
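Before fiddling with those settings, it helps to confirm that retcode=4049 is really what is blocking the login. Here is a minimal sketch (my own addition, not part of the original code): it assumes resp_content is resp.content from the login POST in the code below, whose body embeds a redirect URL carrying a retcode parameter.

# Hypothetical helper: pull the retcode out of the login response body.
# Assumes resp_content is resp.content from the login POST in the code below.
import re

def check_retcode(resp_content):
    m = re.search(r'retcode=(\d+)', resp_content)
    if m is None:
        return None
    retcode = m.group(1)
    if retcode == '4049':
        print 'retcode=4049: Weibo is asking for a verification code; adjust login protection'
    return retcode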
3: The code:
# -*- coding: utf-8 -*-
import requests
import base64
import re
import urllib
import rsa
import json
import binascii
import string
from weibo import Client
import random
import time

# OAuth2 app credentials (the authorization code itself is not used below)
code = "5f9f84b2aa3198032416963c84c2d182"
app_key = "1110261163"
app_secret = "0de95a319a66c755c008b6332d7dd063"
redirect_uri = "https://api.weibo.com/oauth2/default.html"


class SinaCrawler:
    def __init__(self, max_page):
        self.session = None
        self.MAX_PAGE = max_page
        token = {u'access_token': u'2.00pE39sBn1UT7E61e7174d95TdYVED', u'remind_in': u'157679999', u'uid': u'1720813027', u'expires_at': 1575304674}
        self.client = Client(app_key, app_secret, redirect_uri, token)
        self.f = open("data", "w")

    def __del__(self):
        self.f.close()

    def userlogin(self, username, password):
        session = requests.Session()
        url_prelogin = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.18)&_=1430736851146'
        url_login = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.8)'
        # get servertime, nonce, pubkey, rsakv from the prelogin response
        resp = session.get(url_prelogin)
        print resp.content
        p = re.compile('{.*}')
        json_data = re.search(p, resp.content).group()
        print json_data
        data = json.loads(json_data)
        servertime = data['servertime']
        print 'servertime:', servertime
        nonce = data['nonce']
        pubkey = data['pubkey']
        rsakv = data['rsakv']
        # calculate su: base64 of the URL-quoted username
        su = base64.b64encode(urllib.quote(username))
        # calculate sp: RSA-encrypt "servertime\tnonce\npassword" with Sina's public key
        rsaPublickey = int(pubkey, 16)
        key = rsa.PublicKey(rsaPublickey, 65537)
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
        sp = binascii.b2a_hex(rsa.encrypt(message, key))
        postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'userticket': '1',
            'ssosimplelogin': '1',
            'vsnf': '1',
            'vsnval': '',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'sp': sp,
            'encoding': 'UTF-8',
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'rsakv': rsakv,
        }
        resp = session.post(url_login, data=postdata)
        # print resp.headers
        print resp.content
        # follow the redirect URL embedded in the login response to finish the SSO handshake
        login_url = re.findall(r"{location\.replace\('(.+?)'\);}", resp.content)
        print 'login_url', login_url, type(login_url)
        respo = session.get(login_url[0])
        print respo.content
        self.session = session

    def do_search(self, query):
        """ do search

        Args:
            query : str indicating the query

        Return:
            None
        """
        self.f.write('screen_name\tgender\trelated_msg\tregister_time\tlocation\n')
        for page in range(1, self.MAX_PAGE + 1):
            time.sleep(random.random())
            self.do_search_page(page, query)

    def do_search_page(self, page, query):
        """ get search result of the page in the search html page

        Args:
            page : int indicating the number of the page

        Return:
            None
        """
        search_url = "http://s.weibo.com/wb/%s&page=%d" % (query, page)
        html_page = self.session.get(search_url)
        print html_page.content
        # print all_results
        # res_cnt = 1
        # for res in all_results:
        #     print 'page %d result %d done' % (page, res_cnt)
        #     res_cnt += 1
        #     information = self.get_person_info(res)

    # def get_search_result(self, html_content):
    #     """ get search result from the html content
    #
    #     Args:
    #         html_content: str for storing html content of the search page
    #
    #     Return:
    #         None
    #     """
    #     #content = re.findall(r"\"pid\":\"pl_user_feedList\"(?P<tips>[\w\W]*?)", html_content)
    #     html_content = html_content.strip()
    #     content = re.findall(r"\"pid\":\"pl_wb_feedlist\"(?P<tips>[\w\W]*?)</script>", html_content)[0]
    #     clean_content = string.replace(content, "\\\\", "\\")
    #     search_result = re.findall(r"<div class=\\\"WB_cardwrap S_bg2 clearfix\\\" >(?P<tips>[\w\W]*?)<\\/div>\\n<\\/div>", clean_content)
    #     return search_result


if __name__ == '__main__':
    sina_crawler = SinaCrawler(2)
    # replace with your own Weibo account
    sina_crawler.userlogin('18650306405', 'lkz881199')
    query = 'iphone'
    # print type(query)
    # escape '%' so the keyword reaches s.weibo.com still URL-encoded (double-encoded query)
    q = string.replace(str(urllib.quote(query)), "%", "%25")
    print q
    sina_crawler.do_search(q)
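The commented-out get_search_result above hints at how the results are embedded in the search page: the script block whose pid is pl_wb_feedlist carries the result HTML in escaped form. Below is a hypothetical sketch of pulling the raw result blocks out of html_page.content, under the assumption that the markup still matches those commented regexes (it may well have changed):

# Sketch only: extract escaped result blocks from the search page, mirroring
# the commented-out get_search_result above. The markup is assumed, not verified.
import re

def extract_result_blocks(html_content):
    # find the script block that holds the pl_wb_feedlist payload
    m = re.search(r'"pid":"pl_wb_feedlist"([\w\W]*?)</script>', html_content)
    if m is None:
        return []
    content = m.group(1).replace("\\\\", "\\")
    # each result card sits in an escaped WB_cardwrap div
    return re.findall(r'<div class=\\"WB_cardwrap S_bg2 clearfix\\" >([\w\W]*?)<\\/div>\\n<\\/div>', content)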
4: Results: once the simulated login succeeds, every page you fetch carries your own profile information, such as the Weibo nickname (这个程序员不太冷2 in my case) and the uid.
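To check this in code rather than by eye, one option is to look for the uid that weibo.com embeds in each logged-in page. A rough sketch, assuming the page still carries a $CONFIG['uid'] entry in its inline JavaScript (an assumption about the page layout, not something the code above guarantees):

# Rough login check: look for the uid in the page's inline $CONFIG JavaScript.
# The $CONFIG['uid'] pattern is an assumption about weibo.com's page layout.
import re

def logged_in_uid(html):
    m = re.search(r"\$CONFIG\['uid'\]\s*=\s*'(\d+)'", html)
    return m.group(1) if m else None

# usage after userlogin():
# home = sina_crawler.session.get('http://weibo.com').content
# print logged_in_uid(home)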