之前一直没搞明白模拟登录是怎么做的;崔大神的书值得一看,看了就懂。
在谷歌浏览器中按 F12 打开开发者工具,在 https://github.com/login 随便输入账户、密码登录,然后找到其中的那个 POST 请求:在 Headers 信息中可以获取登录链接,再查看提交的表单数据(post_data)。cookie 可以用 session 解决,其他数据都是固定的,只差 authenticity_token 这一个数据。
这个数据在请求登录页面时就能得到:查看登录页面源码,按 Ctrl+F 查找 authenticity_token,对应的 XPath 是 //input[@name="authenticity_token"]/@value。
思路:(1)用 session 请求登录页面以维持 cookie,并在页面源码中用 XPath 查找得到 authenticity_token 的值;(2)带上这个值构建登录的 post_data,向登录链接 https://github.com/session 发送 POST 请求,登录进去;(3)请求获取数据的链接 https://github.com/settings/profile,在页面中查找需要的数据。
import requests
from lxml import etree
class Login:
    """Log in to GitHub through its HTML form and read a logged-in page.

    The flow is: request the login page and extract the hidden
    ``authenticity_token`` form field, POST that token together with the
    credentials to the session URL, then request the profile settings page.
    One ``requests`` session is shared by all three requests so that the
    cookies set along the way are kept.
    """

    # Headers sent with every request. The original code spelled the first
    # key 'Rerfer', so no Referer header was actually transmitted, and the
    # User-Agent value was missing its closing parenthesis.
    DEFAULT_HEADERS = {
        'Referer': 'http://github.com',
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64)',
        'Host': 'github.com',
    }

    def __init__(self, timeout=10):
        """Set up the URLs, headers and the cookie-keeping session.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection would block forever.
        """
        # Copy so that mutating one instance's headers never affects another.
        self.headers = dict(self.DEFAULT_HEADERS)
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        self.logined_url = 'https://github.com/settings/profile'
        self.timeout = timeout
        self.session = requests.session()

    @staticmethod
    def _first_match(matches, description):
        """Return the first XPath match, or raise a descriptive IndexError.

        IndexError is kept as the exception type because that is what the
        plain ``[0]`` lookup raised before; only the message is improved.
        """
        if not matches:
            raise IndexError('could not find {}'.format(description))
        return matches[0]

    # Pre-login: obtain the data the login form requires.
    def prelog(self):
        """Request the login page and return its ``authenticity_token``.

        Returns:
            The ``value`` attribute of the first
            ``<input name="authenticity_token">`` element on the page.

        Raises:
            IndexError: If the page contains no such input element.
        """
        res = self.session.get(
            self.login_url, headers=self.headers, timeout=self.timeout
        )
        html = etree.HTML(res.text)
        tokens = html.xpath('//input[@name="authenticity_token"]/@value')
        return self._first_match(
            tokens, 'the authenticity_token input on the login page'
        )

    # Log in with the token, then scrape the logged-in page.
    def realog(self, email, password):
        """Submit the login form and print the name found on the profile page.

        Args:
            email: Value sent as the ``login`` form field.
            password: Value sent as the ``password`` form field.

        Returns:
            The ``alt`` text of the first ``<img class="avatar">`` on the
            profile settings page (used here as the user name), or ``None``
            when that page does not answer with status 200.

        Raises:
            IndexError: If the login page has no token or the profile page
                has no avatar image.
        """
        post_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.prelog(),
            'login': email,
            'password': password,
            'webauthn-support': 'supported',
        }
        res = self.session.post(
            self.post_url,
            data=post_data,
            headers=self.headers,
            timeout=self.timeout,
        )
        if res.status_code == 200:
            print(res.text)

        # Request the page to scrape and locate the data with XPath.
        res = self.session.get(
            self.logined_url, headers=self.headers, timeout=self.timeout
        )
        if res.status_code != 200:
            return None
        html = etree.HTML(res.text)
        # The avatar's alt text is what gets printed as the user name.
        names = html.xpath('//img[@class="avatar"]/@alt')
        name = self._first_match(
            names,
            'the avatar image on the profile page (is the session logged in?)',
        )
        print(name)
        return name
if __name__ == '__main__':
    # Script entry point: replace both placeholder values with real GitHub
    # credentials before running.
    credentials = {'email': '自己填', 'password': '自己填'}
    Login().realog(**credentials)