这次是按照 www.yiibai.com/python/python3-webbug-series4.html(路径中的 python 为小写)的教程来写的,在原有代码的基础上稍微做了改动,体验了一把装逼的感觉。因为我测试用的知乎帐号是用手机号注册的,所以代码和教程相比会有些出入。
# encoding: utf-8
__author__ = 'qiao'
import gzip
import http.cookiejar
import json
import re
import urllib.parse
import urllib.request
import zlib
def get_opener(head):
    """Build a urllib opener that keeps cookies and sends the given headers.

    Args:
        head: Mapping of header name to header value. Every pair is sent
            with each request made through the returned opener.

    Returns:
        An ``OpenerDirector`` whose ``HTTPCookieProcessor`` is backed by a
        fresh in-memory ``CookieJar``, so cookies received on one request
        are sent back on later requests made through the same opener.
    """
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar)
    )
    # addheaders is a list of (name, value) tuples; assigning it replaces
    # the opener's default header list with exactly the caller's headers.
    opener.addheaders = list(head.items())
    return opener
def ungzip(data):
    """Return *data* gunzipped, or unchanged if it is not a valid gzip stream.

    Progress messages are printed to stdout either way.

    Args:
        data: Raw response body as bytes.

    Returns:
        The decompressed bytes when *data* is a complete gzip stream;
        otherwise *data* itself, untouched.
    """
    try:
        print('Decompressing')
        data = gzip.decompress(data)
        print('Data decompression finished')
    except (OSError, EOFError, zlib.error):
        # Only "this is not usable gzip" errors are treated as a plain body:
        # OSError (which includes gzip.BadGzipFile) for a bad header,
        # EOFError for a truncated stream, zlib.error for a corrupt payload.
        # Anything else (e.g. KeyboardInterrupt, a wrong argument type)
        # propagates instead of being silently swallowed.
        print('It doesn\'t need decompress')
    return data
def get_xsrf(data):
    """Extract the ``_xsrf`` form token from a page's HTML text.

    Args:
        data: Decoded HTML text of the page.

    Returns:
        The ``value`` attribute of the first ``name="_xsrf"`` field.

    Raises:
        IndexError: If *data* contains no such field.
    """
    # The group is non-greedy so it stops at the token's own closing quote;
    # a greedy ``.*`` would run on to the last quote on the same line and
    # drag trailing markup into the token.
    match = re.search(r'name="_xsrf" value="(.*?)"', data)
    if match is None:
        raise IndexError('no _xsrf field found in page')
    return match.group(1)
# test code: log in to Zhihu with an account registered by phone number.
header = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    # NOTE(review): 'deflate' is advertised here, but ungzip() only handles
    # gzip; a deflate-encoded body would reach decode() still compressed.
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.zhihu.com',
    'DNT': '1',
}

url = 'http://www.zhihu.com/'
opener = get_opener(header)

# Fetch the home page first: the login form needs the _xsrf token embedded in
# it, and any cookies the response sets are kept by the opener for the POST.
with opener.open(url) as op:
    data = op.read()
data = ungzip(data)
_xsrf = get_xsrf(data.decode())

# urljoin avoids the double slash that plain concatenation onto a base URL
# ending in '/' produced ('http://www.zhihu.com//login/phone_num').
login_url = urllib.parse.urljoin(url, 'login/phone_num')

# Named 'account' rather than 'id' so the builtin id() is not shadowed.
account = '你的帐号'
password = '你的密码'
post_dict = {
    '_xsrf': _xsrf,
    'phone_num': account,
    'password': password,
    'rememberme': 'true',
    'captcha': '',
}
post_data = urllib.parse.urlencode(post_dict).encode()

# Passing a data argument makes this request a POST.
with opener.open(login_url, post_data) as op:
    data = op.read()
data = ungzip(data)
reply = data.decode()
print(reply)

# Report success only when the server says so instead of unconditionally.
# NOTE(review): assumes the reply is a JSON object in which "r" == 0 means
# the login succeeded, as in the sample output -- confirm against the API.
try:
    login_ok = json.loads(reply).get('r') == 0
except (ValueError, AttributeError):
    # Not JSON, or JSON that is not an object: treat as a failed login.
    login_ok = False
print('Login Success' if login_ok else 'Login Failed')
程序运行后的输出是:
Decompressing
It doesn't need decompress
Decompressing
It doesn't need decompress
{"r":0,
"msg": "\u767b\u5f55\u6210\u529f"
}
Login Success
其中 \u767b\u5f55\u6210\u529f 是 JSON 中的 Unicode 转义序列,解码成中文就是“登录成功”。
感觉还不错,接下来要在原理的层面上再深入理解爬虫,装高级一点的逼