# [Forum page residue, not part of the program] 该楼层疑似违规已被系统折叠 隐藏此楼查看此楼
# Default HTTP headers attached to every request issued by this spider.
headers = {
    'Connection': 'keep-alive',
    # The Host header carries only the authority (host[:port]); a URL scheme
    # such as "http://" is not valid here.
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/signup?next=%2F',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

# Fields of the sign-in form; grant_type, client_id, source and timestamp are
# also the inputs of get_signature().
grant_type = 'password'
client_id = "c3cef7c66a1843f8b3a9e6a1e3160e20"
# NOTE(review): the embedded spaces look like copy/paste damage; the value is
# not referenced by the visible code, so it is left untouched -- confirm.
x_UDID = 'ANCuT28Zjg2PTn2VG48gf99U - sbL76I8EN4 ='
source = 'com.zhihu.web'
# Millisecond timestamp as an integer string, evaluated once when this
# statement runs (not per request).
timestamp = str(int(time.time() * 1000))
# Same instant as a float string; not referenced by the visible code.
timestamp2 = str(time.time() * 1000)
# Endpoint used by the visible methods to query (GET), fetch (PUT) and
# submit (POST) the captcha.
captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
def start_requests(self):
    """Return the initial request list: one GET to the captcha endpoint.

    The response is routed to ``login``, which reads whether a captcha is
    required before signing in.
    """
    captcha_probe = scrapy.Request(
        self.captcha_url,
        headers=self.headers,
        callback=self.login,
    )
    return [captcha_probe]
def login(self, response):
    """Handle the captcha probe response and start the sign-in flow.

    Reads the ``show_captcha`` flag from the JSON body. When it is truthy, a
    PUT request to the captcha endpoint is yielded (handled by ``captcha``);
    otherwise the sign-in form is posted directly and the result is handled
    by ``check_login``.
    """
    need_cap = json.loads(response.body)['show_captcha']
    if need_cap:
        print("需要验证码")
        yield scrapy.Request(url=self.captcha_url, headers=self.headers,
                             callback=self.captcha, method='PUT')
    else:
        print("不需要验证码")
        post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
        post_data = {
            "client_id": self.client_id,
            "username": "******",
            "password": "******",
            # The signature below is computed over grant_type, and the
            # captcha_login form sends it too; it was missing from this form.
            "grant_type": self.grant_type,
            "source": self.source,
            "timestamp": self.timestamp,
            "signature": self.get_signature(self.grant_type, self.client_id, self.source, self.timestamp),
            "lang": "en",
            "ref_source": "homepage",
            "captcha": '',
        }
        yield scrapy.FormRequest(url=post_url, formdata=post_data, headers=self.headers, callback=self.check_login)
def get_signature(self, grant_type, client_id, source, timestamp):
    """Compute the request signature.

    Feeds ``grant_type``, ``client_id``, ``source`` and ``timestamp`` (in that
    order) into an HMAC-SHA1 keyed with a fixed secret and returns the
    hexadecimal digest as a string.
    """
    signer = hmac.new(b'd1b964811afb40118a12068ff74a12f4', None, sha1)
    # Order matters: the digest depends on the concatenation order.
    for field in (grant_type, client_id, source, timestamp):
        signer.update(str.encode(field))
    return signer.hexdigest()
def captcha(self, response):
    """Save the captcha image, ask the user to solve it, and submit the answer.

    The JSON body is expected to carry the image as base64 text under
    ``img_base64``. The decoded image is written to ``captcha.jpg`` in the
    current working directory, the user is prompted on stdin, and the answer
    is POSTed to the captcha endpoint (handled by ``captcha_login``).

    If the body is not valid JSON or has no ``img_base64`` key, a message is
    printed and no request is yielded.
    """
    try:
        img = json.loads(response.text)['img_base64']
    except (ValueError, KeyError):
        # ValueError: body is not JSON; KeyError: JSON without the image.
        print('获取img_base64失败')
        return

    img_data = base64.b64decode(img.encode('utf8'))
    # The context manager closes the file; no explicit close() is needed.
    with open('captcha.jpg', 'wb') as f:
        f.write(img_data)

    # NOTE: input() blocks until the user has typed the captcha.
    captcha_text = input('请输入验证码:')
    post_data = {'input_text': captcha_text}
    yield scrapy.FormRequest(url=self.captcha_url, formdata=post_data, callback=self.captcha_login,
                             headers=self.headers)
def captcha_login(self, response):
    """Post the sign-in form once the captcha answer has been accepted.

    Reads the ``success`` flag from the JSON body. When it is truthy, the
    sign-in form is posted with extra request headers and the result is
    handled by ``check_login``. When the body is not valid JSON or lacks the
    ``success`` key, a message is printed and nothing is yielded; a falsy
    flag also yields nothing.
    """
    try:
        cap_result = json.loads(response.body)['success']
    except (ValueError, KeyError):
        # ValueError: body is not JSON; KeyError: JSON without "success".
        print('关于验证码的POST请求响应失败!')
        return

    if not cap_result:
        return

    print('验证成功')
    post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    post_data = {
        "client_id": self.client_id,
        "username": "******",
        "password": "******",
        "grant_type": self.grant_type,
        "source": self.source,
        "timestamp": self.timestamp,
        "signature": self.get_signature(self.grant_type, self.client_id, self.source, self.timestamp),
        "lang": "en",
        "ref_source": "homepage",
        "captcha": '',
    }
    # Work on a copy: updating self.headers in place would leak these
    # sign-in-only headers into every later request that uses self.headers.
    headers = dict(self.headers)
    headers.update({
        'Origin': 'https://www.zhihu.com',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        # NOTE(review): hard-coded token; presumably it has to match the
        # session's XSRF cookie -- confirm.
        'x-xsrftoken': 'UiFIIz9fMjuytEYZ7VViRIBKZugpWsEK',
        'X-Zse-83': '3_1.1',
        'x-requested-with': 'fetch',
    })
    yield scrapy.FormRequest(url=post_url, formdata=post_data, headers=headers, callback=self.check_login)
def check_login(self, response):
    """Inspect the sign-in response and start crawling on success.

    The response body is parsed as JSON; the presence of a ``uid`` key is
    treated as a successful login, in which case one unfiltered request per
    entry of ``self.start_urls`` is yielded. Otherwise nothing is yielded.
    """
    payload = json.loads(response.text)
    if "uid" not in payload:
        return
    for start_url in self.start_urls:
        yield scrapy.Request(start_url, dont_filter=True, headers=self.headers)