第一步:获取 Cookie 和 bkn 参数(此脚本即下文爬虫导入的 qq/getBkn.py)
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
import pyautogui
import redis
# Launch a Chrome session on the QQ group-member page and simulate the
# quick-login click so the logged-in account's cookies land in the browser.
url = 'https://qun.qq.com/member.html#gid=453987149'

opts = ChromeOptions()
# Strip the "controlled by automated software" banner so the page treats
# this session like a normal browser.
opts.add_experimental_option('excludeSwitches', ['enable-automation'])
# Force a Chinese locale.
opts.add_argument('lang=zh_CN.UTF-8')
# Present a stock desktop Chrome user agent.
opts.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"')

browser = webdriver.Chrome(options=opts)
browser.get(url)  # open the preset URL
browser.maximize_window()
time.sleep(5)  # give the login iframe time to render

# Click the saved-account avatar in the login iframe by screen coordinates.
# NOTE(review): the coordinates assume a maximized window at one specific
# screen resolution - confirm on the target machine.
pyautogui.click(1016, 595)
time.sleep(2)
# Harvest the session cookies and the bkn CSRF token that the crawler
# script imports (``from qq.getBkn import bkn, cookies``).

# Collect every session cookie into a name -> value map. This replaces a
# ten-branch elif chain and, unlike the individual variables it assigned,
# cannot raise NameError when a cookie is missing - absent names simply
# become '' in the joined string below.
cookie_map = {item['name']: item['value'] for item in browser.get_cookies()}

# Cookie names the qun.qq.com API expects, in the same order the original
# hand-built string used (pgv_pvid was collected but never sent, so it is
# deliberately left out here too).
wanted = ('RK', 'ptcz', '_qpsvr_localtk', 'uin', 'skey',
          'p_uin', 'pt4_token', 'p_skey', 'traceid')
# Trailing '; ' after the last pair matches the original string exactly.
cookies = ''.join(name + '=' + cookie_map.get(name, '') + '; ' for name in wanted)
print(cookies)

# bkn is QQ's CSRF token; the page exposes it to scripts as $.getCSRFToken().
# Run the JS once and print the captured value (the original ran it twice).
bkn = browser.execute_script('return $.getCSRFToken()')
print(bkn)
第二步:编写提取群号和爬虫逻辑(爬虫主脚本)
import requests
import json
import time
from qq.getBkn import bkn,cookies
import xlwt
import random
def getqun():
    """Fetch the list of QQ groups the logged-in account has joined.

    Posts the bkn CSRF token to the qun.qq.com group-manager API using the
    cookies captured by getBkn, then stores the results in module-level
    globals that the crawler loop in ``__main__`` reads afterwards:

    * ``user_agent`` - pool of User-Agent strings reused by later requests
    * ``qunid``     - group numbers (``gc`` field of each entry)
    * ``qunname``   - group names (``gn`` field of each entry)
    """
    global user_agent
    # Pool of user agents; one is picked at random per request so traffic
    # looks less uniform.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"]
    headers = {
        # random.choice is the idiomatic replacement for indexing with
        # random.randint(0, len(...)-1).
        'User-Agent': random.choice(user_agent),
        'Cookie': str(cookies)
    }
    global qunid
    qunid = []
    global qunname
    qunname = []
    url = 'https://qun.qq.com/cgi-bin/qun_mgr/get_group_list'
    # The endpoint takes form-encoded data (not a JSON body); bkn is the
    # CSRF token captured by getBkn.
    form_data = {
        'bkn': str(bkn)
    }
    # A timeout keeps an unresponsive server from hanging the script forever.
    res = requests.post(url, headers=headers, data=form_data, timeout=30)
    html_str = res.text
    print(html_str)
    a = json.loads(html_str)
    print(a)
    # "join" lists the groups this account is a member of.
    for quid in a["join"]:
        qunid.append(quid['gc'])
        qunname.append(quid['gn'])
    print(qunid)
    print(qunname)
if __name__ == '__main__':
    # NOTE(review): indentation in this script was reconstructed from a
    # whitespace-mangled paste - confirm the nesting against the original.
    # Populate the qunid/qunname/user_agent globals first.
    getqun()
    # print(qunid)
    # print(qunname)
    # print(len(qunid))
    # print(len(qunname))
    # print(qunid[0])
    # print(qunname[2])
    length = len(qunid)
    # One pass per group: page through every member and save one .xls file.
    for q in range(0,length):
        time.sleep(5)  # throttle between groups
        print(qunname[q])
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('群成员信息', cell_overwrite_ok=True)
        # Header row: account, join time, last speak time, group nickname,
        # QQ nickname, QQ age.
        col = ('账号', '加入时间', '最后发言时间', '群昵称', 'qq昵称','q龄')
        for inser in range(0, 6):
            sheet.write(0, inser, col[inser])
        # flag =1
        # st/end window of the first member page (inclusive indices 0..20).
        i=0
        j=20
        headers = {
            'User-Agent':user_agent[random.randint(0, len(user_agent)-1)],
            'Cookie':str(cookies)
        }
        url = 'https://qun.qq.com/cgi-bin/qun_mgr/search_group_members'
        # Request once first to obtain and compute the paging parameters,
        # collecting the first page of data at the same time.
        json_data = {
            'gc':str(qunid[q]),
            'st':str(i),
            'end':str(j),
            'sort':'0',
            'bkn':str(bkn)
        }
        i = i+21
        j= j+21
        res = requests.post(url, headers=headers,data = json_data)
        html_str = res.text
        # print(html_str)
        a = json.loads(html_str)  # parse the JSON response into a dict
        # Total number of members in the group
        person = a["count"]
        # person = person+11
        # Work out the page count and the size of the last page
        c=int(person/21)  # number of pages excluding the last one
        if person>21 and person<42:
            c=int(person/21)+1
        # print(c)
        d=person%21  # number of members on the last page
        print(c,d)
        down = 1  # next empty worksheet row (row 0 holds the header)
        for lala in a["mems"]:
            joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
            last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
            # sheet.write(lala['uin'],joinTime,last_speak_time,lala['card'],lala['nick'])
            data = [lala['uin'],joinTime,last_speak_time,lala['card'],lala['nick'],lala['qage']]
            for inser in range(0,6):
                sheet.write(down, inser, data[inser])
            down += 1
            print("账号:",str(lala['uin']),"加入时间:",joinTime,"最后发言时间:",last_speak_time,"群昵称",lala['card'],"qq昵称:",lala['nick'],lala['qage'])
        # With the totals known, loop over the middle pages.
        if c>=2:
            code = 1
            for flag in range(1,c):
                # code = 1
                code+=1
                json_data = {
                    'gc':str(qunid[q]),
                    'st': str(i),
                    'end': str(j),
                    'sort': '0',
                    'bkn': str(bkn)
                }
                i = i + 21
                j = j + 21
                # if code==c-1:
                #     i = i + 21
                #     j = j + d
                res = requests.post(url, headers=headers, data=json_data)
                html_str = res.text
                # print(html_str)
                a = json.loads(html_str)  # parse the JSON response into a dict
                # print(a)
                # person = a["count"]
                # c = int(person / 21)
                # d = person % 21
                for lala in a["mems"]:
                    joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                    last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                    # sheet.write(lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'])
                    data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                    for inser in range(0, 6):
                        sheet.write(down, inser, data[inser])
                    down += 1
                    print("账号:", str(lala['uin']), "加入时间:", joinTime, "最后发言时间:", last_speak_time, "群昵称", lala['card'],
                          "qq昵称:", lala['nick'], lala['qage'])
            # Fetch the final (partial) page of members.
            if c !=2:
                if d!=0:  # make sure the last page actually exists
                    j=j-20+d
                    # print(i,j)
                    json_data = {
                        'gc':str(qunid[q]),
                        'st': str(i),
                        'end': str(j),
                        'sort': '0',
                        'bkn': str(bkn)
                    }
                    res = requests.post(url, headers=headers, data=json_data)
                    html_str = res.text
                    a = json.loads(html_str)  # parse the JSON response into a dict
                    for lala in a["mems"]:
                        joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                        last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                        # sheet.write(lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'])
                        data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                        for inser in range(0, 6):
                            sheet.write(down, inser, data[inser])
                        down += 1
                        print("账号:", str(lala['uin']), "加入时间:", joinTime, "最后发言时间:", last_speak_time, "群昵称", lala['card'],
                              "qq昵称:", lala['nick'], lala['qage'])
        # Save the Excel workbook, named after the group.
        savepath = 'C:/Users/Administrator/Desktop/shuju/'+str(qunname[q])+'.xls'
        book.save(savepath)
        print(a["mems"])
        print(html_str)
        print(c,d)
        # ss=12
        # c=ss/2
        # print(c)
        # print(type(person))
        # print(html_str)
最后用 pyinstaller 把 py 文件打包成 exe(原文此处引用的打包教程博客链接在转贴时丢失,可参考 pyinstaller 官方文档)
最后要记得将浏览器驱动(chromedriver)放在 exe 所在的目录下