爬取qq群信息

获取cookie和bkn参数

from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
import pyautogui
import redis

# Target page: the member list of one QQ group (gid=... is the group number).
url='https://qun.qq.com/member.html#gid=453987149'

# Plan: from this page obtain the Cookie, the bkn token and all group numbers.
# The group numbers are stored so the crawler can consume them one by one,
# while Cookie and bkn are passed along directly.
# (An earlier attempt hard-coded a "cookie" header dict and fed it to
# browser.add_cookie(); that was superseded by reading browser.get_cookies()
# after an interactive login, so the commented-out code was removed.)

# Launch Chrome configured to look like a normal, human-driven session.
option = ChromeOptions()
# Hide the "Chrome is being controlled by automated software" info bar.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Use a Chinese locale so the page renders as expected.
option.add_argument('lang=zh_CN.UTF-8')
# Replace the default User-Agent header.
option.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"')
browser = webdriver.Chrome(options=option)
browser.get(url)# open the preset group-member page
browser.maximize_window()
time.sleep(5)
# Simulate a click on the avatar to perform the quick login.
# NOTE(review): (1016, 595) is a hard-coded *screen* coordinate clicked via
# pyautogui, so this only works at the original author's resolution with the
# window maximized — confirm on the target machine.  The Selenium-driven
# alternative the author tried instead was:
#   browser.switch_to.frame('ptlogin_iframe')  # must enter the iframe first
#   browser.find_element_by_xpath('//*[@id="bottom_qlogin"]').click()
pyautogui.click(1016, 595)
time.sleep(2)

# ---------------------------------------------------------------------------
# Collect the login cookies and the bkn (CSRF) token after the login above.
# ---------------------------------------------------------------------------

# Cookie names that make up the Cookie header expected by the qun.qq.com
# management API, in the order the header is assembled.  (pgv_pvid was read
# by an earlier version but never included in the header, so it is omitted.)
_COOKIE_NAMES = ('RK', 'ptcz', '_qpsvr_localtk', 'uin', 'skey',
                 'p_uin', 'pt4_token', 'p_skey', 'traceid')

def _build_cookie_header(cookie_list):
    """Build the 'name=value; ' Cookie header from get_cookies() output.

    cookie_list is the list of dicts returned by Selenium's
    browser.get_cookies() (each with at least 'name' and 'value' keys).
    Cookies listed in _COOKIE_NAMES that are absent from the browser are
    simply skipped — the previous if/elif chain left their variables
    undefined and crashed with NameError during string concatenation.
    The result keeps the original format, including the trailing '; '.
    """
    values = {c['name']: c['value'] for c in cookie_list}
    return ''.join(name + '=' + values[name] + '; '
                   for name in _COOKIE_NAMES if name in values)

# Assembled Cookie header; imported by the crawler script as `cookies`.
cookies = _build_cookie_header(browser.get_cookies())
print(cookies)

# bkn is a CSRF token derived from skey; the page's own JavaScript exposes it
# via $.getCSRFToken(), so evaluate that once in the page context.
# Imported by the crawler script as `bkn`.
bkn = browser.execute_script('return $.getCSRFToken()')
print(bkn)

下面编写第二个脚本:提取群号,并逐个群爬取成员信息

import requests
import json
import time
from qq.getBkn import bkn,cookies
import xlwt
import random

def getqun():
    """Fetch the list of QQ groups the logged-in account has joined.

    Posts to the qun.qq.com group-management endpoint using the Cookie
    header and bkn token produced by the login script, then fills three
    module-level globals consumed by the main crawl loop:

      user_agent -- pool of User-Agent strings (one is picked per request)
      qunid      -- group numbers ('gc' field of each joined group)
      qunname    -- group names   ('gn' field of each joined group)

    Raises requests.RequestException on network failure and ValueError if
    the response body is not JSON (e.g. when the login has expired).
    """
    global user_agent
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"]

    headers = {
        'User-Agent': random.choice(user_agent),
        'Cookie': str(cookies),
    }

    global qunid, qunname
    qunid = []
    qunname = []

    url = 'https://qun.qq.com/cgi-bin/qun_mgr/get_group_list'
    json_data = {
        'bkn': str(bkn)
    }

    # Timeout so a hung request cannot stall the whole crawl indefinitely.
    res = requests.post(url, headers=headers, data=json_data, timeout=30)
    a = res.json()
    print(a)
    # 'join' lists the groups this account has joined; the key is absent
    # when there are none, so default to [] instead of raising KeyError.
    # NOTE(review): the response may also carry 'create'/'manage' groups,
    # which are intentionally not crawled here — confirm that is desired.
    for group in a.get("join", []):
        qunid.append(group['gc'])
        qunname.append(group['gn'])
    print(qunid)
    print(qunname)


if __name__ == '__main__':

    # getqun() fills the module globals qunid / qunname / user_agent.
    getqun()
    length = len(qunid)
    # Crawl each group in turn and save its member list to its own .xls file.
    for q in range(0,length):
        time.sleep(5)  # throttle between groups to reduce the ban risk
        print(qunname[q])
        # One workbook per group with a single "group member info" sheet.
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('群成员信息', cell_overwrite_ok=True)
        # Header row: account, join time, last-speak time, group card name,
        # QQ nickname, QQ age.  (Runtime strings kept as-is.)
        col = ('账号', '加入时间', '最后发言时间', '群昵称', 'qq昵称','q龄')
        for inser in range(0, 6):
            sheet.write(0, inser, col[inser])
        # st/end window of the first page: the API serves members [st, end]
        # in pages of 21, so the first request asks for members 0..20.
        i=0
        j=20
        headers = {
            'User-Agent':user_agent[random.randint(0, len(user_agent)-1)],
            'Cookie':str(cookies)
        }

        url = 'https://qun.qq.com/cgi-bin/qun_mgr/search_group_members'
        # First request: fetch page 1 and, from its response, derive the
        # total member count and hence the pagination layout.
        json_data = {
            'gc':str(qunid[q]),
            'st':str(i),
            'end':str(j),
            'sort':'0',
            'bkn':str(bkn)
        }
        # Advance the window to the next page before the paging loop below.
        i = i+21
        j= j+21
        res = requests.post(url, headers=headers,data = json_data)
        html_str = res.text
        a = json.loads(html_str)# parse the JSON response into a dict
        # Total number of members in this group.
        person = a["count"]
        # Pagination layout: c = number of pages before the final partial
        # page, d = number of members on that final page.
        # NOTE(review): this arithmetic (and the 21<person<42 special case)
        # is fragile and order-dependent; verify against groups whose sizes
        # are exact multiples of 21.
        c=int(person/21) # pages other than the last
        if person>21 and person<42:
            c=int(person/21)+1
        d=person%21      # members on the last page
        print(c,d)
        down = 1  # next spreadsheet row to write (row 0 is the header)
        for lala in a["mems"]:

            # Convert the epoch timestamps to human-readable local time.
            joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
            last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
            data = [lala['uin'],joinTime,last_speak_time,lala['card'],lala['nick'],lala['qage']]
            for inser in range(0,6):
                sheet.write(down, inser, data[inser])
            down += 1
            print("账号:",str(lala['uin']),"加入时间:",joinTime,"最后发言时间:",last_speak_time,"群昵称",lala['card'],"qq昵称:",lala['nick'],lala['qage'])

        # With the totals known, loop over the remaining full pages.
        if c>=2:
            code = 1  # page counter (kept from the original; only incremented)
            for flag in range(1,c):
                code+=1
                json_data = {
                    'gc':str(qunid[q]),
                    'st': str(i),
                    'end': str(j),
                    'sort': '0',
                    'bkn': str(bkn)
                }
                # Slide the st/end window forward by one page.
                i = i + 21
                j = j + 21
                res = requests.post(url, headers=headers, data=json_data)
                html_str = res.text
                a = json.loads(html_str)  # parse JSON into a dict
                for lala in a["mems"]:
                    joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                    last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                    data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                    for inser in range(0, 6):
                        sheet.write(down, inser, data[inser])
                    down += 1
                    print("账号:", str(lala['uin']), "加入时间:", joinTime, "最后发言时间:", last_speak_time, "群昵称", lala['card'],
                          "qq昵称:", lala['nick'], lala['qage'])



            # Fetch the final, partial page.
            if c !=2:
                if d!=0:# only when a partial last page actually exists
                    # Shrink the window end to cover just the d leftover members.
                    j=j-20+d
                    json_data = {
                        'gc':str(qunid[q]),
                        'st': str(i),
                        'end': str(j),
                        'sort': '0',
                        'bkn': str(bkn)
                    }
                    res = requests.post(url, headers=headers, data=json_data)
                    html_str = res.text
                    a = json.loads(html_str)  # parse JSON into a dict
                    for lala in a["mems"]:
                        joinTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['join_time']))
                        last_speak_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(lala['last_speak_time']))
                        data = [lala['uin'], joinTime, last_speak_time, lala['card'], lala['nick'], lala['qage']]
                        for inser in range(0, 6):
                            sheet.write(down, inser, data[inser])
                        down += 1
                        print("账号:", str(lala['uin']), "加入时间:", joinTime, "最后发言时间:", last_speak_time, "群昵称", lala['card'],
                              "qq昵称:", lala['nick'], lala['qage'])

        # Save the workbook, named after the group.
        # NOTE(review): the output directory is hard-coded and must already
        # exist, and a group name containing characters illegal in Windows
        # file names will make book.save() fail — confirm before deployment.
        savepath = 'C:/Users/Administrator/Desktop/shuju/'+str(qunname[q])+'.xls'
        book.save(savepath)

        # Debug output: last page fetched for this group.
        print(a["mems"])
        print(html_str)
        print(c,d)

最后,可参考 pyinstaller 的打包教程,把上述 py 文件打包成 exe(pyinstaller 打包 py 文件生成 exe)。

注意:打包完成后,要记得把浏览器驱动(chromedriver)放在 exe 所在的目录下,程序才能正常启动浏览器。

  • 1
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
要使用Python爬取QQ群信息,可以采用以下步骤: 1. 安装必要的库:首先,您需要安装Selenium库,用于模拟登录QQ网页版。您可以使用以下命令在Python中安装Selenium库: ``` pip install selenium ``` 2. 下载并配置浏览器驱动程序:Selenium需要使用浏览器驱动程序来控制浏览器。您可以根据您所使用的浏览器下载相应的驱动程序,如Chrome驱动程序或Firefox驱动程序。下载后,将驱动程序添加到您的系统路径中。 3. 导入必要的库:在Python中,导入Selenium库和WebDriver对象,如下所示: ```python from selenium import webdriver ``` 4. 创建WebDriver对象:使用适当的驱动程序创建WebDriver对象。例如,如果您使用Chrome浏览器,可以使用以下代码创建一个ChromeWebDriver对象: ```python driver = webdriver.Chrome() ``` 5. 打开QQ群页面:使用`get()`方法打开要爬取QQ群链接。链接形式为“https://qun.qq.com/member.html#gid=”加上号码。例如,您可以使用以下代码打开链接: ```python url = "https://qun.qq.com/member.html#gid=号码" driver.get(url) ``` 6. 登录QQ网页版:由于QQ网页版是异步加载的,需要模拟登录才能获取到网页的源码。使用Selenium的自动化操作,您可以模拟点击二维码登录。扫描二维码后,您将登录到QQ网页版。 7. 解析源码并提取所需信息:登录后,您可以使用Selenium的各种方法和技巧来解析页面的源码并提取您想要的QQ群信息。例如,您可以使用XPath或CSS选择器来定位和提取成员的网名、名片、QQ号、性别、Q龄、入时间等信息。 请注意,爬取QQ群信息可能有一些法律和隐私方面的限制,请遵守相关法律法规并尊重用户隐私。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值