如果想要从 GitHub 上爬取一些内容并且不受未登录状态的限制,首先要做的就是能够模拟登录 GitHub。虽然 GitHub 提供了功能强大的 API,但这个 API 有访问频率限制,并不适合所有的爬取场景。
模拟登录无非是重现在浏览器上登录的过程,其中最关键的是获取 token(authenticity_token)和 cookie。
# 1. Fetch the login page
def get_github_html(url):
    """Fetch ``url`` and return its body together with the cookies it set.

    Args:
        url: Address of the page to download (the login page in this flow).

    Returns:
        A ``(html, cookies)`` tuple: the response body as text and a dict
        of the cookies carried by that response.
    """
    response = requests.get(url)
    first_cookie = response.cookies.get_dict()
    return response.text, first_cookie
# 2. Extract the token
def get_token(html):
    """Return the ``authenticity_token`` value embedded in the login form.

    Args:
        html: Markup of the login page.

    Returns:
        The ``value`` attribute of the ``<input name="authenticity_token">``
        element.

    Raises:
        ValueError: If the page contains no such input element.
    """
    # The login page is HTML, so parse it with an HTML parser; the "xml"
    # parser expects well-formed XML and additionally requires lxml.
    soup = BeautifulSoup(html, 'html.parser')
    res = soup.find("input", attrs={"name": "authenticity_token"})
    if res is None:
        # Fail with a clear message instead of "NoneType is not subscriptable".
        raise ValueError("authenticity_token input not found in the page")
    return res['value']
# 3. Log in
def login_github(url, token, cookie):
    """Submit the login form and return the resulting cookies.

    Args:
        url: Address the login form is posted to.
        token: The ``authenticity_token`` taken from the login page.
        cookie: Dict of cookies obtained when the login page was fetched.

    Returns:
        A dict containing the cookies passed in, updated with every cookie
        set by the responses of the login request (redirects included).
    """
    data = {
        "commit": "Sign in",
        "utf8": "√",
        "authenticity_token": token,
        "login": "****@****.com",    # github username
        "password": "**********",    # github password
    }
    response = requests.post(url, data=data, cookies=cookie)
    # NOTE(review): the original comment treats 200 as success; a 200 alone
    # presumably does not prove the credentials were accepted -- verify.
    print(response.status_code)

    # requests follows redirects, and ``response.cookies`` only holds the
    # cookies of the final response. Cookies set by intermediate redirect
    # responses live on ``response.history``, so merge them all in order.
    merged = dict(cookie)
    for step in list(response.history) + [response]:
        merged.update(step.cookies.get_dict())
    return merged
# Fetch the page we actually want
def get_html(url_want):
    """Log in to GitHub and fetch ``url_want`` with the logged-in cookies.

    Args:
        url_want: Address of the page to download after logging in.

    Returns:
        The ``requests`` response for ``url_want``; parsing it is left to
        the caller.
    """
    base_url = 'https://github.com/login'  # was misspelled "gtihub.com"
    login_url = 'https://github.com/session'
    html, cookie = get_github_html(base_url)
    token = get_token(html)
    cookie = login_github(login_url, token, cookie)
    response = requests.get(url_want, cookies=cookie)
    # response is what we want; the next step is to parse it
    return response