以下内容主要实现微博登录的过程
创作时间:2020/11/13
亲测有效
1、实现登录的心路历程
自从2020年9月份起,微博登录不仅需要输入账号、密码、验证码,同时还增加了扫码的方式才能登录。在经历过多次挫败后,终于无意间在B站找到了希望之光,另辟蹊径,实现免扫码就能进行登录的过程。
实现方式:因为微博是新浪的子产品,而登录新浪网不需要进行扫码登录,因此只要将登录微博的url地址换成登录新浪的url地址,就可以拿到登录所必须的cookie,再将获取到的cookie写入header中,可以进行数据的爬取。
在进行数据抓包的时候,建议打开浏览器无痕窗口,输入新浪网址:http://my.sina.com.cn,打开控制台进行登录
2、实现登录的代码过程
注:爬虫登录的接口需要进行抓包,Chrome提供了自动抓包工具,F12或者Ctrl+Shift+I打开控制台——>点击 Network——>勾选 Preserve log
2.1、模块的导入
import requests
import urllib
import base64
import time
import re
import json
import rsa
import binascii
from bs4 import BeautifulSoup
from requests.packages.urllib3.connectionpool import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
2.2、获取账号
微博的账号就是su字段对应的值,进行了一系列的加密过程。user_name表示用户输入的账号,实现代码如下:
def get_username(self):
    """Build the value of the ``su`` form field.

    The user name is URL-quoted first, then base64-encoded,
    mirroring what Sina's ssologin.js does in the browser.
    """
    quoted = urllib.parse.quote(self.user_name).encode("utf-8")
    return base64.b64encode(quoted).decode("utf-8")
2.3、欲登录实现
该接口在输入账号后会被调用,目的是返回对密码进行加密时所需的一些字段,如:pubkey、servertime等,实现代码如下:
def get_pre_login(self):
    """POST to the Sina SSO prelogin endpoint and return its payload.

    The JSONP response carries the fields needed later for password
    encryption and captcha handling: ``pubkey``, ``servertime``,
    ``nonce``, ``rsakv``, ``showpin``, ``pcid``.

    Returns:
        dict: the parsed prelogin payload, or 0 on any failure
        (kept as 0 for backward compatibility with existing callers).
    """
    header = {
        "Referer": "http://my.sina.com.cn/profile/unlogin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
    }
    params = {
        "entry": "account",
        "callback": "pluginSSOController.preloginCallBack",
        "su": self.get_username(),
        "rsakt": "mod",
        "checkpin": "1",
        "client": "ssologin.js(v1.4.19)",
        "_": int(time.time() * 1000)  # cache-busting timestamp in ms
    }
    try:
        # self.session is the requests.Session shared by the class,
        # so cookies set here are reused by the later login request.
        response = self.session.post("https://login.sina.com.cn/sso/prelogin.php",
                                     params=params, headers=header, verify=False)
        # Strip the JSONP wrapper `callback(...)` and parse the JSON inside.
        return json.loads(re.search(r"\((?P<data>.*)\)", response.text).group("data"))
    except Exception:  # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        print("获取公钥失败")
        return 0
2.4、获取密码
登录所需要的sp字段的值就是加密后形成的密码,实现过程如下:
def get_password(self, pre_login=None):
    """Encrypt the password into the ``sp`` form field (hex string).

    Args:
        pre_login: optional dict from ``get_pre_login()``.  When omitted
            it is fetched once here.  The original code called
            ``get_pre_login()`` three separate times, so pubkey,
            servertime and nonce could come from *different* server
            responses and the encrypted blob would not validate.

    Returns:
        str: hex-encoded RSA ciphertext of "servertime\\tnonce\\npassword".
    """
    if pre_login is None:
        pre_login = self.get_pre_login()
    # RSA public key: modulus supplied by the server, exponent 0x10001.
    public_key = rsa.PublicKey(int(pre_login["pubkey"], 16), int("10001", 16))
    # The server expects "servertime<TAB>nonce<LF>password" as plaintext.
    password_string = str(pre_login["servertime"]) + '\t' + str(pre_login["nonce"]) + '\n' + self.pass_word
    password = binascii.b2a_hex(rsa.encrypt(password_string.encode("utf-8"), public_key)).decode('utf-8')
    return password
2.5、登录返回cookie信息
输入账号和密码后,可能需要进行验证码的输入,
如果需要,则获取登录接口所需要的door参数字段后才能成功的获取cookie
不需要,则直接会进行登录操作,获取到cookie
def login(self):
    """Perform the SSO login and return the SUB cookie value.

    Flow: build the login form, handle the captcha when the server asks
    for one (``showpin == 1``), POST to ssologin.php, then follow the
    first cross-domain ticket URL to receive the SUB cookie that
    weibo.com accepts.

    Returns:
        str: the value of the ``SUB`` cookie.
    """
    header = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "http://my.sina.com.cn",
        "Referer": "http://my.sina.com.cn/profile/unlogin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
    }
    # Fetch the prelogin payload ONCE.  The original code called
    # get_pre_login() four separate times (servertime, nonce, rsakv and
    # the showpin check) -- each a fresh network request that could
    # return different values and break the login.
    pre_login_data = self.get_pre_login()
    post_data = {
        "entry": "account",
        "gateway": "1",
        "from": "",
        "savestate": "30",
        "qrcode_flag": "true",
        "useticket": "0",
        "pagerefer": "",
        "vsnf": "1",
        "su": self.get_username(),
        "service": "sso",
        "servertime": pre_login_data["servertime"],
        "nonce": pre_login_data["nonce"],
        "pwencode": "rsa2",
        "rsakv": pre_login_data["rsakv"],
        # NOTE(review): get_password() fetches its own prelogin data, so
        # its servertime/nonce may differ from the values posted above --
        # consider passing pre_login_data into get_password().
        "sp": self.get_password(),
        "sr": "1366*768",
        "encoding": "UTF-8",
        "cdult": "3",
        "domain": "sina.com.cn",
        # "prelt": "34",  # tricky to obtain; login works without it
        "returntype": "TEXT"
    }
    if pre_login_data['showpin'] == 1:
        # Captcha required: download the image, show it to the user and
        # send their answer back in the `door` field.
        url = "https://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), pre_login_data["pcid"])
        with open("captcha.jpeg", "wb") as file_out:
            file_out.write(self.session.get(url).content)
        code = input("请输入验证码:")
        post_data["door"] = code
    login_data = self.session.post("https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)&_=%d" % int(time.time() * 1000), data=post_data, headers=header, verify=False)
    # Follow the first cross-domain ticket URL; its response sets SUB.
    ticket_url = json.loads(login_data.content)['crossDomainUrlList'][0]
    t_header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
    }
    resp = requests.get(ticket_url, headers=t_header)
    sub = resp.cookies['SUB']
    print(sub)
    return sub
3、验证用户是否成功登录
这里的url我选择的是疫情话题,我发现用户登录后,会有 高级搜索文字,而未登录则没有,判断代码如下:
def check_cookie(self):
    """Verify the login by fetching a Weibo search page.

    Logged-in users see the "高级搜索" (advanced search) widget, which
    lives in a ``<div class="action">``; anonymous users do not.

    Returns:
        list: the matching ``<div class="action">`` nodes
        (empty when the cookie is not valid).
    """
    # Percent-encoded search page for the topic "疫情".
    url = "https://s.weibo.com/weibo/%25E7%2596%25AB%25E6%2583%2585?topnav=1&wvr=6&b=1"
    headers = {
        # Fixed: the original string contained "×cope" -- an HTML-entity
        # mangling of "&timescope" introduced by the blog rendering.
        "Origin": "https://s.weibo.com/weibo?q=%E7%96%AB%E6%83%85&xsort=hot&suball=1&timescope=custom:2020-11-09-0:2020-11-09-23&Refer=g",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
        "cookie": 'SUB={}'.format(self.login()),
        # "cookie": 'SUB=你的cookie'  # paste a saved SUB value here to skip a fresh login
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    return soup.find_all('div', class_='action')
当你获取到cookie之后,因为cookie设置的时长有12小时左右,所以可以直接将获取的cookie值直接赋值到headers的cookie字段中,也就是注释cookie的那个地方。
若输出的内容仍然有高级搜索字样,则证明登录成功
4、源代码实现
注:user_name以及pass_word 需要修改为你的账号密码
import requests
import urllib
import base64
import time
import re
import json
import rsa
import binascii
from bs4 import BeautifulSoup
from requests.packages.urllib3.connectionpool import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Login(object):
    """Log in to Sina Weibo through the shared Sina SSO endpoint.

    Weibo's own login page forces QR-code confirmation, but the Sina
    portal login (login.sina.com.cn) does not, and the SUB cookie it
    issues is also accepted by weibo.com / s.weibo.com.
    """

    # One shared Session so cookies set during the SSO handshake are
    # automatically reused by every later request.
    session = requests.session()
    user_name = "你的账号"  # replace with your account
    pass_word = "你的密码"  # replace with your password

    def get_username(self):
        """Return the ``su`` form field: URL-quoted user name, base64-encoded."""
        return base64.b64encode(urllib.parse.quote(self.user_name).encode("utf-8")).decode("utf-8")

    def get_pre_login(self):
        """POST to the SSO prelogin endpoint and return its payload dict.

        The JSONP response carries ``pubkey``, ``servertime``, ``nonce``,
        ``rsakv``, ``showpin`` and ``pcid``.  Returns 0 on failure
        (kept for backward compatibility with existing callers).
        """
        header = {
            "Referer": "http://my.sina.com.cn/profile/unlogin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
        }
        params = {
            "entry": "account",
            "callback": "pluginSSOController.preloginCallBack",
            "su": self.get_username(),
            "rsakt": "mod",
            "checkpin": "1",
            "client": "ssologin.js(v1.4.19)",
            "_": int(time.time() * 1000)  # cache-busting timestamp in ms
        }
        try:
            response = self.session.post("https://login.sina.com.cn/sso/prelogin.php",
                                         params=params, headers=header, verify=False)
            # Strip the JSONP wrapper `callback(...)` and parse the JSON.
            return json.loads(re.search(r"\((?P<data>.*)\)", response.text).group("data"))
        except Exception:  # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
            print("获取公钥失败")
            return 0

    def get_password(self, pre_login=None):
        """Encrypt the password into the ``sp`` form field (hex string).

        pre_login: optional dict from get_pre_login(); fetched once here
        when omitted, so pubkey/servertime/nonce all come from the SAME
        server response (the original fetched them in three separate
        requests, which could yield mismatched values).
        """
        if pre_login is None:
            pre_login = self.get_pre_login()
        # RSA public key: modulus supplied by the server, exponent 0x10001.
        public_key = rsa.PublicKey(int(pre_login["pubkey"], 16), int("10001", 16))
        # The server expects "servertime<TAB>nonce<LF>password" as plaintext.
        password_string = str(pre_login["servertime"]) + '\t' + str(pre_login["nonce"]) + '\n' + self.pass_word
        password = binascii.b2a_hex(rsa.encrypt(password_string.encode("utf-8"), public_key)).decode('utf-8')
        return password

    def login(self):
        """Perform the SSO login and return the SUB cookie value.

        Handles the optional captcha (``door`` field) when the prelogin
        response sets ``showpin``, then follows the first cross-domain
        ticket URL to obtain the SUB cookie usable on weibo.com.
        """
        header = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "http://my.sina.com.cn",
            "Referer": "http://my.sina.com.cn/profile/unlogin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
        }
        # Fetch the prelogin payload ONCE and reuse it everywhere below.
        # The original called get_pre_login() four separate times, each a
        # fresh network request that could return different servertime/
        # nonce values -- and the password was encrypted with yet another
        # response's values, so the server could reject the login.
        pre_login_data = self.get_pre_login()
        post_data = {
            "entry": "account",
            "gateway": "1",
            "from": "",
            "savestate": "30",
            "qrcode_flag": "true",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": self.get_username(),
            "service": "sso",
            "servertime": pre_login_data["servertime"],
            "nonce": pre_login_data["nonce"],
            "pwencode": "rsa2",
            "rsakv": pre_login_data["rsakv"],
            # Encrypt with the SAME servertime/nonce we post above.
            "sp": self.get_password(pre_login_data),
            "sr": "1366*768",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            # "prelt": "34",  # tricky to obtain; login works without it
            "returntype": "TEXT"
        }
        if pre_login_data['showpin'] == 1:
            # Captcha required: download the image, show it to the user
            # and send their answer back in the `door` field.
            url = "https://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), pre_login_data["pcid"])
            with open("captcha.jpeg", "wb") as file_out:
                file_out.write(self.session.get(url).content)
            code = input("请输入验证码:")
            post_data["door"] = code
        login_data = self.session.post("https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)&_=%d" % int(time.time() * 1000), data=post_data, headers=header, verify=False)
        # Follow the first cross-domain ticket URL; its response sets SUB.
        ticket_url = json.loads(login_data.content)['crossDomainUrlList'][0]
        t_header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
        }
        resp = requests.get(ticket_url, headers=t_header)
        sub = resp.cookies['SUB']
        print(sub)
        return sub

    def check_cookie(self):
        """Verify the login: logged-in users see the "高级搜索" widget
        (a ``<div class="action">``) on the Weibo search page; anonymous
        users do not.  Returns the matching nodes (empty if not logged in).
        """
        # Percent-encoded search page for the topic "疫情".
        url = "https://s.weibo.com/weibo/%25E7%2596%25AB%25E6%2583%2585?topnav=1&wvr=6&b=1"
        headers = {
            # Fixed: the original string contained "×cope" -- an HTML-entity
            # mangling of "&timescope" introduced by the blog rendering.
            "Origin": "https://s.weibo.com/weibo?q=%E7%96%AB%E6%83%85&xsort=hot&suball=1&timescope=custom:2020-11-09-0:2020-11-09-23&Refer=g",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
            "cookie": 'SUB={}'.format(self.login()),
            # "cookie": 'SUB=你的cookie'  # paste a saved SUB value here to skip a fresh login (valid ~12h)
        }
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, "html.parser")
        return soup.find_all('div', class_='action')
# Guarded entry point: importing this module no longer triggers a login.
if __name__ == "__main__":
    login = Login()
    print(login.check_cookie())