# Part 1: simulate a Zhihu login with requests
# -*- coding: utf-8 -*-
# author : seven
# time : 2017/7/21
import requests
import re
from bs4 import BeautifulSoup

try:
    import http.cookiejar as cookielib  # python3
except ImportError:
    import cookielib  # python2

session = requests.session()
# Persist cookies to disk so a later run can skip the login step entirely.
session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print('cookie 未能加载')

userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
header = {
    'HOST': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com',
    'User-Agent': userAgent
}


def is_login():
    """Check login state by visiting a page that requires authentication.

    Returns True when the server answers 200 (logged in), False on any
    other status (302 redirect to the login page means not logged in).
    """
    need_login_visit_url = 'https://www.zhihu.com/settings/profile'
    # allow_redirects=False is essential: if we followed the 302 to the
    # login page, the final status would be 200 and we would always
    # (wrongly) conclude we are logged in.
    response = session.get(need_login_visit_url, headers=header, allow_redirects=False)
    return response.status_code == 200


def get_xsrf():
    """Fetch the sign-in page and extract the anti-CSRF ``_xsrf`` token.

    The User-Agent header must be set: some sites reject requests
    without one to block crawlers. Returns '' when the token is absent.
    """
    response = session.get('https://www.zhihu.com/#signin', headers=header)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # NOTE: a plain re.match('.*name="_xsrf" value="(.*?)"', ...) fails
    # here because the pattern must span newlines — it needs re.DOTALL
    # (see ZhihuSpider.login below); BeautifulSoup avoids the issue.
    xsrf_nodes = soup.select('input[name="_xsrf"]')
    if not xsrf_nodes:
        return ''
    return xsrf_nodes[0].get('value', '')


def get_index():
    """Download the Zhihu front page and save it to index_page.html."""
    response = session.get('https://www.zhihu.com', headers=header)
    with open('index_page.html', 'wb') as f:
        # The file is opened in binary mode, so encode the text as UTF-8.
        f.write(response.text.encode('utf-8'))
    print('ok')


def zhihu_login(account, password):
    """Log in to Zhihu with either a phone number or an email address.

    ``account`` strings of exactly 11 digits starting with 1 are treated
    as Chinese mobile numbers; anything else is treated as an email.
    On success the session cookies are saved to cookies.txt.
    """
    # The trailing $ anchors the match so an email that merely STARTS
    # with 11 digits (e.g. '12345678901@x.com') is not misclassified.
    if re.match(r'1\d{10}$', account):
        # phone login
        post_url = 'http://www.zhihu.com/login/phone_num'
        post_data = {
            '_xsrf': get_xsrf(),
            'phone_num': account,
            'password': password
        }
    else:
        # email login — the form field must be 'email' here, not
        # 'phone_num' (the original code reused the wrong key).
        post_url = 'http://www.zhihu.com/login/email'
        post_data = {
            '_xsrf': get_xsrf(),
            'email': account,
            'password': password
        }
    session.post(post_url, data=post_data, headers=header)
    session.cookies.save()  # persist the cookie for later runs


if __name__ == '__main__':
    if not is_login():
        zhihu_login('xxxxx', 'xxxxx.')
    else:
        # Already logged in: the saved cookie lets us visit any page.
        get_index()


# Part 2: simulate the login with scrapy
# 1. create the project: scrapy genspider zhihu www.zhihu.com
# 2. code under the spider folder:
# -*- coding: utf-8 -*-
import scrapy
import json
import time
from PIL import Image


class ZhihuSpider(scrapy.Spider):
    """Scrapy spider that logs in to Zhihu (xsrf token + captcha flow)."""

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    headers = {
        'HOST': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': userAgent
    }

    def parse(self, response):
        """Default callback for pages visited after a successful login."""
        print(response.body)

    def parse_detail(self, response):
        """Placeholder for per-page detail extraction."""
        pass

    def start_requests(self):
        """Entry point (overridden): fetch the sign-in page first.

        Every scrapy Request needs an explicit callback because all
        requests are asynchronous.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers,
                               callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request a fresh captcha image."""
        # re.DOTALL makes '.' match newlines too; without it the pattern
        # only matches within the first line of the response.
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
            # Millisecond timestamp busts any caching of the captcha URL.
            t = str(int(time.time() * 1000))
            captcha_url = "http://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url,
                                 headers=self.headers,
                                 meta={'xsrf': xsrf},  # pass the token along
                                 callback=self.login_after_captcha,
                                 dont_filter=True)

    def login_after_captcha(self, response):
        """Save/show the captcha, read it from stdin, then POST the login form."""
        captcha_name = 'captcha.jpg'
        with open(captcha_name, 'wb') as f:
            f.write(response.body)
        try:
            im = Image.open(captcha_name)
            im.show()
            im.close()
        except Exception:
            print('打开图片出错')
        captcha = input("请输入验证码:")
        xsrf = response.meta.get('xsrf', '')
        # FormRequest performs the form submission. For simplicity only
        # the phone login is implemented here.
        return [scrapy.FormRequest(
            url='http://www.zhihu.com/login/phone_num',
            formdata={
                '_xsrf': xsrf,
                'phone_num': 'sdsd',
                'password': 'asdasads.',
                'captcha': captcha
            },
            headers=self.headers,
            callback=self.check_login,  # every async scrapy Request needs a callback
            # By default scrapy filters out URLs not in allowed_domains;
            # dont_filter=True disables that filtering for this request.
            dont_filter=True
        )]

    def check_login(self, response):
        """Post-login callback.

        On success, re-issue the start URLs; scrapy carries the session
        cookie automatically, so no manual cookie handling is needed.
        """
        response_text = response.text
        # The server answers with a JSON string; parse it.
        text_json = json.loads(response_text)
        if 'msg' in text_json and text_json['msg'] == '登录成功':
            for url in self.start_urls:
                # No callback given: the default self.parse is used.
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
        else:
            print('error %s' % response_text)
# Article title: simulated login with scrapy's basic template, and simulated login with requests
# Latest recommended article published 2023-07-23 22:25:04