因为前几天写了个用Python登录教务网的爬虫,这几天又突然想到用Python登录淘宝试试,于是二话不说,代码走起。本来以为很简单,但一写我就傻眼了:登录淘宝涉及很多东西,比如验证码、加密算法,用HttpFox查看POST数据时又有很多字段看不懂,所以去百度上参考了大神们的类似代码。经过几天痛苦的尝试,终于搞定了。登录淘宝最重要的有三点:一,验证码,我是用正则提取验证码地址,然后用webbrowser把验证码图片直接在浏览器中显示,让用户输入;二,获取token,处理好验证码并再次向login_url提交登录请求后,自然就能在返回的页面中找到token,然后重定向到另一地址;三,用token拼出token_url并请求,获取st,再利用st重定向到淘宝用户的主页地址。实现了这些就成功了!总的来说,就像《奔跑吧兄弟》里的游戏那样,每拿到一个值并完成一次请求,就会得到下一个值的线索,依次做下去就能成功。这是我的一孔之见,如果说错了什么,请大神指点!欢迎一起讨论!
参考地址:
http://my.oschina.net/u/811744/blog/191165
http://www.sufeinet.com/thread-4585-1-1.html
代码如下:
<pre name="code" class="python">#-*-coding:gbk-*-
import urllib
import urllib2
import cookielib
import re
import webbrowser
# Login endpoint; the same URL is also sent as the Referer header below.
login_url = "https://login.taobao.com/member/login.jhtml"
# Proxy address that is later handed to urllib2.ProxyHandler.
proxy_url = 'http://120.193.146.95:843'

# Headers attached to the POST requests against the login endpoint.
headers = {
    'Host': 'login.taobao.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Referer': login_url,
    'Content-Type': 'application/x-www-form-urlencoded',
    'Connection': 'Keep-Alive',
}
# Account name typed in by the user.  No plain-text password is requested:
# according to the original author, the `ua` blob below already carries the
# account and password after Taobao's client-side encryption, which is also
# why every user has a different `ua`.
# NOTE(review): the author's claim that this makes the script "safe" is not
# something this code can demonstrate -- treat the blobs as credentials.
username = raw_input('请输入账号: ')

# Captured request values.  The author notes that the site generates all
# three dynamically, but that one captured set keeps working as long as the
# user name and password stay the same.
# The literals are intentionally truncated (they are very long); replace
# them with values captured from a real browser login before running.
ua = '074UW5Tcy...uUm07'
gr = '687474703A.....343433'
password2 = '100eba2....e674077e'
# Form fields POSTed to the login endpoint.
# 'TPL_checkcode' starts empty and is overwritten with the captcha text
# before the second POST; 'ua', 'gvfdcre' and 'TPL_password_2' carry the
# captured blobs, and 'TPL_username' the name typed in by the user.
# The literal used to list 'CtrlVersion' twice with the same value; the
# redundant second entry has been dropped (the resulting dict is identical).
post = {
    'ua': ua,
    'TPL_checkcode': '',
    'CtrlVersion': '1,0,0,7',
    'TPL_password': '',
    'TPL_redirect_url': 'http://i.taobao.com/my_taobao.htm?nekot=udm8087E1424147022443',
    'TPL_username': username,
    'loginsite': '0',
    'newlogin': '0',
    'from': 'tb',
    'fc': 'default',
    'style': 'default',
    'css_style': '',
    'tid': 'XOR_1_000000000000000000000000000000_625C4720470A0A050976770A',
    'support': '000001',
    'loginType': '4',
    'minititle': '',
    'minipara': '',
    'umto': 'NaN',
    'pstrong': '3',
    'llnick': '',
    'sign': '',
    'need_sign': '',
    'isIgnore': '',
    'full_redirect': '',
    'popid': '',
    'callback': '',
    'guf': '',
    'not_duplite_str': '',
    'need_user_id': '',
    'poy': '',
    'gvfdcname': '10',
    'gvfdcre': gr,
    # NOTE(review): this key ends with a space, exactly as in the original
    # script -- confirm whether the server expects 'from_encoding' instead.
    'from_encoding ': '',
    'sub': '',
    'TPL_password_2': password2,
    'loginASR': '1',
    'loginASRSuc': '1',
    'allp': '',
    'oslanguage': 'zh-CN',
    'sr': '1366*768',
    'osVer': 'windows|6.1',
    'naviVer': 'firefox|35',
}
# URL-encode the form fields into the body of the first login POST.
postData = urllib.urlencode(post)

# Proxy handler built from the configured proxy address.
# NOTE(review): only the 'http' scheme is mapped, so urllib2 does not use
# this proxy for https:// URLs opened through the opener -- confirm whether
# an 'https' entry is wanted as well.
proxy = urllib2.ProxyHandler({'http': proxy_url})

# Cookie jar shared by every request made through the opener, so cookies
# set by one response are sent along with the following requests.
cookieJar = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(cookieJar)

# Install the opener globally so plain urllib2.urlopen() calls go through it.
opener = urllib2.build_opener(cookie, proxy, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# First POST, sent with an empty captcha field; the body of the response is
# kept in `read` for the captcha-extraction step.
req = urllib2.Request(login_url, postData, headers)
taobao = urllib2.urlopen(req)
read = taobao.read()
staus = taobao.getcode()
if staus == 200:
    # The body of this `if` had lost its indentation, which made the script
    # fail to parse; a single-argument print(...) behaves the same as the
    # Python 2 print statement.
    print('获取服务器请求成功!')
# Captcha step: the page returned by the first POST carries the captcha
# image URL in the data-src attribute of the J_StandardCode_m <img> tag.
pattern = re.compile(r'(?<=<img id="J_StandardCode_m" src="https://s.tbcdn.cn/apps/login/static/img/blank.gif" data-src=").[^<]*?(?=")')
checkCodeUrlList = pattern.findall(read)
if not checkCodeUrlList:
    # Without this check an empty result fails with a bare IndexError on
    # [0]; stop with a readable message instead.
    raise SystemExit('Login flow stopped: no captcha image URL found in the login page.')

# Show the captcha image in the default browser and let the user type it in.
webbrowser.open_new_tab(checkCodeUrlList[0])
print('到浏览器看验证码图片')
checkcode = raw_input('请输入验证码:')

# Second POST: the same form, now carrying the captcha text.  The body of
# the response is kept in `read_token` for the token-extraction step.
post['TPL_checkcode'] = checkcode
postData = urllib.urlencode(post)
req = urllib2.Request(login_url, postData, headers)
taobao = urllib2.urlopen(req)
read_token = taobao.read()
def _first_or_exit(matches, what):
    """Return matches[0], or stop the script with a readable message if it is empty.

    matches -- list returned by a regex findall() call.
    what    -- short description of the value being looked for, used in the
               error message.
    """
    if not matches:
        raise SystemExit('Login flow stopped: could not find %s in the server response.' % what)
    return matches[0]


# Step 1: read the hidden J_HToken field from the page returned by the
# second login POST and use it to request the st value.
pattern_token = re.compile(r'(?<=<input type="hidden" id="J_HToken" value=").[^<]*?(?=")')
token = pattern_token.findall(read_token)
token_url = 'https://passport.alipay.com/mini_apply_st.js?site=0&token=%s&callback=stCallback6' % _first_or_exit(token, 'the login token (J_HToken)')
req_token = urllib2.Request(token_url)
response_token = urllib2.urlopen(req_token).read()

# Step 2: extract the "st" value from that response and exchange it for the
# page that carries the final redirect.
pattern_st = re.compile(r'(?<="st":").[^<]*?(?=")')
st = pattern_st.findall(response_token)
# The user name is percent-encoded so that spaces, '&', '#' or non-ASCII
# bytes cannot break the query string.
st_url = 'https://login.taobao.com/member/vst.htm?st=%s&TPL_username=%s' % (
    _first_or_exit(st, 'the st value'),
    urllib.quote(username),
)
req_st = urllib2.Request(st_url)
response_st = urllib2.urlopen(req_st).read()
print(response_st)

# Step 3: the response assigns the user's home-page URL to top.location;
# fetch that URL and print the resulting page.
pattern_end = re.compile(r'(?<=top.location = ").[^<]*?(?=";)')
end = pattern_end.findall(response_st)
end_url = _first_or_exit(end, 'the final redirect URL (top.location)')
req_end = urllib2.Request(end_url)
response_end = urllib2.urlopen(req_end).read()
print(response_end)