代码在python3环境下测试通过:
from bs4 import BeautifulSoup
import requests
# Endpoints used by the login flow.
url = 'http://www.zhihu.com'
login_url = url + '/login/email'
captcha_url = 'http://www.zhihu.com/captcha.gif'

# Browser-like request headers sent with the login POST.
# - No 'Content-Length' entry: requests derives it from the encoded form body,
#   so a hard-coded number would only go stale when the form data changes.
# - Values carry no leading whitespace: recent requests releases reject header
#   values that start with a space (InvalidHeader).
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.zhihu.com/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Host': 'www.zhihu.com',
    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Connection': 'keep-alive',
}

# Form fields posted to login_url; '_xsrf' and 'captcha' are added at runtime.
login_data = {
    'email': 'xxxx',  # replace with the account e-mail
    'password': 'xxxx',  # replace with the account password
    'remember_me': 'true',
    # NOTE(review): 'Referer' is an HTTP header name but is sent here as a
    # form field; kept as in the original -- confirm whether it is needed.
    'Referer': 'http://www.zhihu.com/',
}
def add_xsrf():
    """Fetch the ``_xsrf`` token and store it in ``login_data['_xsrf']``.

    Requests the home page while still logged out, parses the HTML and
    reads the ``value`` attribute of the ``<input name="_xsrf">`` element.

    The page is fetched through the module-level ``session`` (the same one
    used for the login POST) so that any cookies the server sets together
    with the token are sent back when logging in.

    Raises:
        RuntimeError: if the page has no ``_xsrf`` input carrying a value.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    token_input = soup.find('input', attrs={'name': '_xsrf'})
    if token_input is None or token_input.get('value') is None:
        raise RuntimeError('no _xsrf input found in the page at ' + url)
    # Stored as str; requests encodes form values itself.
    login_data['_xsrf'] = token_input['value']
def add_captcha():
    """Download the captcha image and record the text the user types for it.

    The image is fetched with the module-level ``session`` and written to
    ``captcha.gif`` in the working directory; the typed answer is stored in
    ``login_data['captcha']``.
    """
    response = session.get(captcha_url, stream=True)
    with open('captcha.gif', 'wb') as image_file:
        # Stream the body to disk in 10-byte chunks.
        image_file.writelines(response.iter_content(10))
    login_data['captcha'] = input('请输入验证码:')
if __name__ == '__main__':
    # ``session`` is bound at module level because add_captcha() reads it as
    # a global. The with-block closes the session when the script finishes.
    with requests.session() as session:
        add_xsrf()
        add_captcha()
        responds = session.post(login_url, headers=headers, data=login_data)
        # Stop on an HTTP error status instead of silently saving the
        # logged-out home page below.
        responds.raise_for_status()
        # Save the home page as seen by the session after the login request.
        # errors='ignore' drops characters that cannot be encoded.
        with open('zhihu.txt', 'wt', encoding="utf8", errors='ignore') as f:
            print(session.get(url).text, file=f)
说明:
1.用到两个第三方库:用requests代替urllib,用BeautifulSoup代替re。下载方式:命令行键入 pip install requests、pip install beautifulsoup4(代码中导入的 bs4 模块由 beautifulsoup4 包提供)。
2.验证码暂时无法做到自动识别,需要手动填写。
3.对于两种登录方式,url分别为'http://www.zhihu.com/login/email'、'http://www.zhihu.com/login/phone_num'。推荐使用邮箱登录;使用手机登录时,由于知乎对密码进行了加密,会出现
密码报错的现象(可以通过抓包获取加密后的密码)。
4.打开文件时一定注明:encoding="utf8",errors='ignore',否则会出现UnicodeEncodeError。
如果你想从二进制模式的文件中读取或写入文本数据,必须确保要进行解码和编码操作。比如:
# Reading: binary mode returns bytes, which must be decoded to get text.
with open('somefile.bin', 'rb') as f:
    data = f.read(16)
    text = data.decode('utf-8')

# Writing: text must be encoded to bytes before writing in binary mode.
with open('somefile.bin', 'wb') as f:
    text = 'Hello World'
    f.write(text.encode('utf-8'))
5.两个库的官方文档:http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
http://cn.python-requests.org/zh_CN/latest/