Python模拟登录知乎,获取收藏夹内容

最近在研究模拟登录,以模拟知乎登录为例,参考了知乎大神xchaoinfo的代码。

PS. 知乎可以通过手机号和邮箱号两种方式登录,方法相同,这里模拟的是通过手机号的登录。

主要用到了requests库等,用requests处理cookies更方便一点。requests.Session能够跨请求地保持cookies:同一个session实例发送的请求都共享同一份cookies,requests模块会自动处理。

PC端的验证码是选择倒立的汉字,处理起来比较麻烦,这里模拟了手机端登录的过程,输入验证码即可。



import re
import requests
from bs4 import BeautifulSoup
from PIL import Image
import http.cookiejar
import time
import json

# Shared session: all requests in this script reuse one cookie jar, so the
# login state established by login() is visible to every later request.
session = requests.Session()
# Back the jar with a file so a later run can reuse the saved login cookies.
session.cookies = http.cookiejar.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except (FileNotFoundError, http.cookiejar.LoadError):
    # Narrowed from a bare `except:` — only "no saved cookies yet" or a
    # corrupt cookie file should be tolerated, not arbitrary errors.
    print("Cookie 未能加载")


home_url = 'https://www.zhihu.com/'
# Mobile User-Agent: the mobile site serves a plain image captcha instead of
# the desktop "click the inverted characters" challenge.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Referer': 'https://www.zhihu.com/',
    'Host': 'www.zhihu.com',
}

def islogin():
	"""Return True if the saved session is already logged in.

	Requests the settings page with redirects disabled: a logged-in
	session gets HTTP 200, while an anonymous one is redirected away
	(3xx), so the status code alone distinguishes the two.
	"""
	url = 'https://www.zhihu.com/settings/profile'
	status = session.get(url, headers=headers, allow_redirects=False).status_code
	# Idiomatic: return the comparison directly instead of if/else True/False.
	return status == 200

def get_xsrf(url='https://www.zhihu.com/explore'):
	"""Fetch the anti-CSRF token (_xsrf) embedded in a Zhihu page.

	The token is required as a form field by the login endpoint.
	Raises ValueError with a clear message if the token cannot be
	found (previously this crashed with an opaque AttributeError on
	``None.group``).
	"""
	html = session.get(url, headers=headers)
	match = re.search('_xsrf" value="(.*?)"/>', html.text, re.S)
	if match is None:
		raise ValueError('_xsrf token not found in page: ' + url)
	return match.group(1)

def get_captcha():
	"""Download the login captcha, show it, and return the user's answer.

	The timestamp query parameter busts any caching so a fresh captcha
	image is served each time. The image is saved to ``captcha.jpg``,
	displayed via PIL, and the text the user types is returned.
	"""
	t = str(int(time.time() * 1000))
	captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
	r = session.get(captcha_url, headers=headers)
	# The with-statement closes the file automatically; the original's
	# explicit f.close() inside the block was redundant and is removed.
	with open('captcha.jpg', 'wb') as f:
		f.write(r.content)
	im = Image.open('captcha.jpg')
	im.show()
	im.close()
	captcha = input('请输入验证码:')
	return captcha

def login(user_name, password):
	"""Log in to Zhihu with a phone number and password.

	First attempts the login without a captcha; Zhihu only demands one
	after rejecting an attempt (``r == 1`` in the JSON reply). On
	rejection, a captcha is fetched from the user and the login is
	retried once, printing the server's message. Cookies are saved on
	the way out so the next run can skip login entirely.
	"""
	post_url = 'https://www.zhihu.com/login/phone_num'
	payload = {
		'_xsrf': get_xsrf(),
		'password': password,
		'phone_num': user_name,
	}
	# Attempt 1: no captcha.
	first_try = session.post(post_url, data=payload, headers=headers)
	if json.loads(first_try.text)['r'] == 1:
		# Rejected — a captcha is now required; retry with it.
		payload['captcha'] = get_captcha()
		retry = session.post(post_url, data=payload, headers=headers)
		print(json.loads(retry.text)['msg'])
	session.cookies.save()  # persist cookies for future runs

def show_mine():
	"""Locate the user's second collection and walk through it.

	Scrapes the "my collections" page, pulls the link out of the second
	``<h2 class="zm-item-title">`` entry, and hands the absolute URL to
	``show_collection``.
	"""
	mine_url = 'https://www.zhihu.com/collections/mine'
	page = session.get(mine_url, headers=headers)
	soup = BeautifulSoup(page.text, 'html5lib')
	titles = soup.find_all('h2', class_='zm-item-title')
	link = titles[1].find('a')['href']
	show_collection('https://www.zhihu.com' + str(link))

def down_answer(url):
	"""Fetch a single answer page and print its ``<title>`` text."""
	resp = session.get(url, headers=headers)
	soup = BeautifulSoup(resp.text, 'html5lib')
	print(soup.find('title').get_text())

def get_collection(url):
	"""Print every answer title found on one page of a collection.

	Each ``<a class="toggle-expand">`` link is either an absolute
	zhuanlan (column) URL or a site-relative answer path; the latter is
	prefixed with the Zhihu origin before being fetched.
	"""
	print(url)
	content = session.get(url, headers=headers)
	all_answer = BeautifulSoup(content.text, 'html5lib').find_all('a', class_="toggle-expand")
	for answer in all_answer:
		answer_href = answer['href']
		# str.startswith is the idiomatic (and faster) form of the
		# original anchored-regex prefix test.
		if answer_href.startswith('https://zhuanlan'):
			answer_url = answer_href
		else:
			answer_url = 'https://www.zhihu.com' + answer_href
		down_answer(answer_url)

def show_collection(url):
	"""Walk every page of a collection, printing the answers on each.

	Reads the last page number from the second-to-last ``<span>`` of
	the pager widget, then visits pages 1..page_num inclusive.
	"""
	coll_html = session.get(url, headers=headers)
	pager = BeautifulSoup(coll_html.text, 'html5lib').find('div', class_='zm-invite-pager')
	page_num = pager.find_all('span')[-2].get_text()
	print(page_num)
	# BUG FIX: range's upper bound is exclusive, so the original
	# range(1, int(page_num)) silently skipped the final page.
	for page in range(1, int(page_num) + 1):
		page_url = url + '?page=' + str(page)
		get_collection(page_url)


if __name__ == '__main__':
	# Reuse saved cookies when possible; otherwise prompt for credentials
	# and perform a fresh login before browsing the collection.
	if not islogin():
		account = input('请输入用户名:')
		pwd = input('请输入密码:')
		login(account, pwd)
	else:
		print('已登录')
	show_mine()


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值