Python基础之tkinter界面爬虫采集(补充完善添加翻页功能)

最新推荐文章于 2024-09-28 17:40:14 发布

寂寞的孤独者

最新推荐文章于 2024-09-28 17:40:14 发布

阅读量938

点赞数 17

分类专栏： tk 文章标签：爬虫

本文链接：https://blog.csdn.net/qq_69920603/article/details/141371505

版权

tk 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

接上次的代码继续完善。添加了爬虫翻页的功能，所以采集的数据显示在同一个Treeview表格中。后面还会继续完善这个框架。等完善了可能会开源。

先看结果图。

下面是窗口代码。其中 self.tree = Treeview(root,show="headings",) 是表格控件，创建的时候不需要传入 columns=('serial','id','singer', 'name', 'url') 这个列标题参数；列标题放到后面分离出来的各个函数里，和 Treeview 的处理逻辑一起写。

import tkinter as tk

from tkinter import  PhotoImage,Entry,Button
from tkinter.ttk import LabelFrame,Label,Treeview,Style,Scrollbar

class Slider_tk:
    """Main window of the song-search tool: creates every widget, places none.

    The constructor only builds the widgets and stores them as attributes;
    geometry management (``place``) and command/event wiring are not done
    here and are left to whoever uses or subclasses this class.

    Attributes (all widgets are children of ``root``):
        Labe_lFram3, Labe_lFrame, Labe_lFram2: titled group boxes.
        Labe_lFram4, Labe_lFram5, Labe_lFram6: untitled frames.
        label1: the usage notice.
        label, entry, submit_button: the "zk" search row.
        label2, entry2, submit_button2: the NetEase search row.
        label_dj, entry2_dj, submit_button_dj: the DJ-track search row.
        label_pa, entry2_pa, submit_button_pe: the page-number row.
        tree: results table, created without columns so that each search
            can install its own column set later.
    """

    # The window may never be smaller than this, so it is also the size the
    # window is centred for.
    WINDOW_WIDTH = 1000
    WINDOW_HEIGHT = 490
    # Resolved against the current working directory.
    ICON_PATH = './Image/ico_o.ico'

    def __init__(self, root=None):
        """Create all widgets inside *root*.

        Args:
            root: the Tk root (or toplevel) window to populate. When omitted
                a new ``tk.Tk()`` is created so the default is usable.
        """
        if root is None:
            root = tk.Tk()
        self.root = root
        self.root.title("slider_kz歌曲搜索")
        self.root.minsize(self.WINDOW_WIDTH, self.WINDOW_HEIGHT)
        self._load_icon()

        bold = ('Arial 10 bold')

        # Group boxes and separator-like frames.
        self.Labe_lFram3 = LabelFrame(self.root, text='告示')
        self.Labe_lFrame = LabelFrame(self.root, text='操作界面')
        self.Labe_lFram2 = LabelFrame(self.root, text='数据界面')
        self.Labe_lFram4 = LabelFrame(self.root)
        self.Labe_lFram5 = LabelFrame(self.root)
        self.Labe_lFram6 = LabelFrame(self.root)

        self.label1 = Label(self.root, text='双击为下载，下载完毕有提示！请勿填写数字和空，歌曲下载慢，是服务器的原因，要是没有返回结果，需要等一段时间！DJ舞曲搜索请舞曲编号或关键字')

        # "zk" search row.
        self.label = Label(self.root, text='zk查找:')
        self.entry = Entry(self.root, font=bold, width=33)
        self.submit_button = Button(self.root, text="搜索", width=6, height=1, font=bold)

        # NetEase search row.
        self.label2 = Label(self.root, text='网易查找:')
        self.entry2 = Entry(self.root, font=bold, width=33)
        self.submit_button2 = Button(self.root, text="搜索", width=6, height=1, font=bold)

        # DJ-track search row and page-number row.
        self.label_dj = Label(self.root, text='DJ舞曲查找:')
        self.label_pa = Label(self.root, text='输入页数:')
        self.entry2_dj = Entry(self.root, font=bold, width=25)
        self.entry2_pa = Entry(self.root, font=bold, width=25)
        self.submit_button_dj = Button(self.root, text="搜索", width=6, height=1, font=bold)
        self.submit_button_pe = Button(self.root, text="翻页", width=6, height=1, font=bold)

        # No ``columns=`` here on purpose: every search sets its own columns.
        self.tree = Treeview(self.root, show="headings")

        style = Style()
        style.theme_use("default")

        # Centre for the size the window really gets (minsize), not a smaller
        # requested size that the minimum would override.
        self.center_window(self.WINDOW_WIDTH, self.WINDOW_HEIGHT)

    def _load_icon(self):
        """Set the window icon; a missing or unreadable icon file is not fatal."""
        try:
            # Stored on self because a Tk image is deleted as soon as its
            # Python wrapper object is garbage-collected.
            self._icon = PhotoImage(file=self.ICON_PATH)
        except tk.TclError:
            # The icon is purely cosmetic: keep the default one and carry on.
            self._icon = None
            return
        self.root.iconphoto(True, self._icon)

    def center_window(self, width, height):
        """Resize the root window to *width* x *height* and centre it on screen.

        Args:
            width: requested window width in pixels.
            height: requested window height in pixels.

        Offsets are clamped to 0 so that a window larger than the screen is
        anchored at the top-left corner instead of being pushed off-screen.
        """
        x = max((self.root.winfo_screenwidth() - width) // 2, 0)
        y = max((self.root.winfo_screenheight() - height) // 2, 0)
        self.root.geometry(f'{width}x{height}+{x}+{y}')

这里是窗口布局和最后处理爬虫的逻辑。

"""
# @当前时间 :2024/8/19 13:08
# @Author  : TS
# @Email   : TS@gmail.com
# @File    : ts2.py
# @Software: PyCharm
"""
import math
from concurrent.futures import ThreadPoolExecutor
from tkinter import filedialog, messagebox
from slider_tk import Slider_tk
from slider_kz import Slide_kz
from slider_kz import Fart_pi
from dj_zk import Yj_youyou
class Slider_subassembly(Slider_tk):
    """The search window with layout and crawler wiring added.

    ``Slider_tk`` only creates the widgets. This subclass places them, hooks
    the buttons up to the three crawlers (``Slide_kz``, ``Fart_pi`` and
    ``Yj_youyou``) and fills the shared results table with what they return.
    """

    # Column identifiers of the results table; they double as heading texts.
    SONG_COLUMNS = ('serial', 'id', 'singer', 'name', 'url')
    DJ_COLUMNS = ('serial', 'id', 'name', 'time', '页数', 'url')
    # Divisor that turns the DJ record count into the value of the '页数'
    # (page count) column — presumably the site lists 20 results per page.
    DJ_PAGE_SIZE = 20

    def __init__(self, root=None):
        """Build the widgets, wire up the callbacks and lay everything out.

        Args:
            root: the Tk root window, passed through to ``Slider_tk``.
        """
        super().__init__(root=root)
        self.kz = Slide_kz()
        self.fp = Fart_pi()
        self.dj = Yj_youyou()
        self.submit_button.config(command=self.on_submit)
        self.submit_button2.config(command=self.on_submt)
        # Go through dj_submt so a new DJ search starts from an empty table,
        # exactly like the other two search buttons.
        self.submit_button_dj.config(command=self.dj_submt)
        self.submit_button_pe.config(command=self.Page)
        # The on-screen notice promises "double-click to download".
        self.tree.bind("<Double-Button-1>", self.on_tree_click)
        self.show()

    def show(self):
        """Place every widget at its fixed pixel position."""
        self.label1.place(x=10, y=20)
        self.Labe_lFram6.place(x=2, y=190, width=400, height=5)
        self.Labe_lFram5.place(x=2, y=110, width=400, height=5)
        self.Labe_lFram4.place(x=2, y=150, width=400, height=5)
        self.Labe_lFram3.place(x=2, y=1, width=997, height=50)
        self.Labe_lFrame.place(x=2, y=50, width=400, height=448)
        self.Labe_lFram2.place(x=400, y=50, width=600, height=440)
        self.label.place(x=4, y=80)
        self.entry.place(x=100, y=80)
        self.submit_button.place(x=340, y=78)
        self.label2.place(x=4, y=125)
        self.entry2.place(x=100, y=125)
        self.submit_button2.place(x=340, y=120)
        self.tree.place(x=404, y=68, height=418, width=593)
        self.label_dj.place(x=4, y=165)
        self.label_pa.place(x=4, y=210)
        self.entry2_dj.place(x=80, y=165)
        self.entry2_pa.place(x=80, y=210)
        self.submit_button_dj.place(x=270, y=160)
        self.submit_button_pe.place(x=340, y=210)

    def _set_columns(self, columns):
        """Install *columns* on the table, using each identifier as heading."""
        self.tree['columns'] = columns
        for col in columns:
            self.tree.heading(col, text=col)
            self.tree.column(col, width=2)

    def _clear_table(self):
        """Remove every row from the results table."""
        self.tree.delete(*self.tree.get_children())

    def search_(self):
        """Run the "zk" search for the text in ``entry`` and list the hits."""
        self._set_columns(self.SONG_COLUMNS)
        keyword = self.entry.get()
        songs = self.kz.pase(self.kz.requests(keyword))
        for serial, song in enumerate(songs):
            self.tree.insert("", "end", values=(
                serial, song['id'], song['singer'], song['song_name'], song['url']))

    def Page(self):
        """Show the DJ result page whose number is typed into ``entry2_pa``.

        Anything that is not a positive whole number is rejected with a
        warning instead of being sent to the site.
        """
        page = self.entry2_pa.get().strip()
        if not page.isdecimal() or int(page) < 1:
            messagebox.showwarning("提示", "请输入有效的页数")
            return
        self._clear_table()
        self.dj_searcj(page)

    def dj_searcj(self, page=None):
        """Run the DJ search for the text in ``entry2_dj`` and append the hits.

        Args:
            page: result page to request; ``None`` leaves the choice to the
                crawler. Existing rows are NOT removed here — callers clear
                the table first when they want a fresh list.
        """
        self._set_columns(self.DJ_COLUMNS)
        keyword = self.entry2_dj.get()
        tracks = self.dj.url_dq(self.dj.requests(keyword, page))
        for serial, track in enumerate(tracks):
            # 'Recording' carries the total hit count as text.
            page_count = math.ceil(int(track['Recording']) / self.DJ_PAGE_SIZE)
            self.tree.insert("", "end", values=(
                serial, track['id_st'], track['name'], track['time_s'],
                page_count, track['li_url']))

    def on_tree_cli(self):
        """Run the NetEase search for the text in ``entry2`` and list the hits."""
        self._set_columns(self.SONG_COLUMNS)
        keyword = self.entry2.get()
        songs = self.fp.pase(self.fp.res(keyword))
        for serial, song in enumerate(songs):
            # NOTE(review): 'title' is shown under the 'singer' column and
            # 'author' under 'name', which looks swapped. Kept as is because
            # on_tree_click names the file after the third column; confirm
            # against Fart_pi.pase before changing either side.
            self.tree.insert("", "end", values=(
                serial, song['songid'], song['title'], song['author'], song['Location']))

    def on_tree_click(self, event):
        """Offer to download the row under the mouse pointer.

        Args:
            event: the Tk mouse event delivered by the table binding.
        """
        if event.widget != self.tree:
            return
        item = self.tree.identify_row(event.y)
        if not item:
            # Click landed on the heading or on empty space.
            return
        values = self.tree.item(item, "values")
        url = values[-1]          # the url is the last column in every layout
        song_name = values[2]     # third column is used as the file name
        if not messagebox.askyesno("Download", "是否下载"):
            return
        self.fp.save_mp3(url, song_name)
        # A plain notice: there is nothing to answer once the file is saved.
        messagebox.showinfo("下载完毕", song_name)

    def on_submit(self):
        """"zk" search button: clear the table, then search."""
        self._clear_table()
        self.search_()

    def on_submt(self):
        """NetEase search button: clear the table, then search."""
        self._clear_table()
        self.on_tree_cli()

    def dj_submt(self):
        """DJ search button: clear the table, then search."""
        self._clear_table()
        self.dj_searcj()

这里是布局代码

事件初始化调用

Treeview 列表的标题在这里处理。想展示哪个爬虫的数据，就自己定义对应的函数，可以写多个这样的函数来调用。

其中一个爬虫代码。里面的url脱敏处理了。

import os
import re

import requests

from slider_tk import Slider_tk

class Yj_youyou:
    """Crawler for the DJ-track search site.

    The site URLs are kept base64-encoded, as in the published listing, and
    are decoded only at the moment a request is built.
    """

    # Base64-encoded URLs: search endpoint, site root, and the Referer header.
    _SEARCH_URL_B64 = 'aHR0cHM6Ly93d3cuZGp1dS5jb20vc2VhcmNo'
    _SITE_ROOT_B64 = 'aHR0cHM6Ly93d3cuZGp1dS5jb20v'
    _REFERER_B64 = 'aHR0cHM6Ly93d3cuZGp1dS5jb20vc2VhcmNoP211c2ljbmFtZT0lRTQlQkMlQTQlRTYlODQlOUYmbGlzdD0yJmNpZD0wJnBhZ2U9Mw=='

    # Seconds to wait for the server before giving up, so a dead connection
    # cannot block the caller forever.
    _TIMEOUT = 15
    _DOWNLOAD_DIR = 'Song_saving'

    # Search-result page: link of every hit, total hit count, track duration.
    _RE_TRACK_LINK = re.compile(r'\<a href\=\"(.*?)" target\=\"_Pt\" title\=\".*?"\>\<img ')
    _RE_TOTAL = re.compile(r'\<div style\=\"line\-height\:24px\; text\-indent\: 10px\; color\:\#c7c7c7 \"\> 共搜索到 \<span class\=\"fbold\" style\=\"color\: \#de651d\"\>(.*?)\<\/span\> 个关于\‘\<span id\=\".*?"\>\<em\>\<\/em\>\<\/span\>\’的记录\. \<\/div\>')
    _RE_DURATION = re.compile(r'\<\/a\>\<\/span\>\<span class\=\"sc_2\"\>TIME (.*?)\<\/span\>')
    # Track page: headline, and the player object (group 1 = id, group 2 = file).
    _RE_TITLE = re.compile(r'\<h1\>(.*?)<\/h1\>')
    _RE_PLAYER = re.compile(r"var music \= \{id\: (.*?)\, type\: \'.*?\'\, name\: \'.*?'\, file\: \'(.*?)\'\, .*?}\,")

    def __init__(self):
        pass

    @staticmethod
    def _decode(encoded):
        """Return the plain text of one of the base64-encoded URL constants."""
        import base64  # local: not part of this file's import block
        return base64.b64decode(encoded).decode('ascii')

    def cookies(self):
        """Return a fresh dict with the cookies sent on every request."""
        return {
            'Hm_lvt_93e672d9487d1b71d59dffcaaca8cf4d': '1724142971',
            'HMACCOUNT': 'E543AEA66C42B296',
            'Hm_lvt_1602614188207057874070b514c435ac': '1724142971',
            'musicls': '%7C247911%7C%2C',
            'djuu_mlog': '%7C247911%7C%2C',
            'PLAYSTYLE': '0',
            'bf': '1',
            'PHPSESSID': '47q8h5jpk33r19fuop0dq59sik',
            'search_log': '%2C%u4F24%u611F',
            'Hm_lpvt_93e672d9487d1b71d59dffcaaca8cf4d': '1724154335',
            'Hm_lpvt_1602614188207057874070b514c435ac': '1724154335',
        }

    def headers(self):
        """Return a fresh dict with the HTTP headers sent on every request."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Referer': self._decode(self._REFERER_B64),
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            # Placeholder from the published listing: replace it with a real
            # User-Agent string before running.
            'User-Agent': '你自己的U-a',
            'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

    def params(self):
        """Return a fresh dict with the default search query parameters."""
        return {
            'musicname': '伤感',
            'list': '2',
            'cid': '0',
            'page': '1',
        }

    def requests(self, musicname, page=None):
        """Fetch one search-result page and pull the raw fields out of it.

        Args:
            musicname: the search keyword.
            page: page number to request; ``None`` or an empty value keeps
                the default first page.

        Returns:
            A one-element list holding a dict with
            ``'res_url'`` (list of track-page links),
            ``'Recording'`` (total hit count as text, ``'0'`` when the page
            shows no count) and ``'time_s'`` (list of track durations).
        """
        query = self.params()
        query['musicname'] = musicname
        if page:
            query['page'] = page
        html = requests.get(
            self._decode(self._SEARCH_URL_B64),
            params=query,
            cookies=self.cookies(),
            headers=self.headers(),
            timeout=self._TIMEOUT,
        ).text
        total = self._RE_TOTAL.search(html)
        return [{
            'res_url': self._RE_TRACK_LINK.findall(html),
            'Recording': total.group(1) if total else '0',
            'time_s': self._RE_DURATION.findall(html),
        }]

    def url_dq(self, musicname):
        """Visit every track page of a search result and yield its details.

        Args:
            musicname: the list returned by ``requests``.

        Yields:
            One new dict per track with the keys ``'Recording'``, ``'name'``,
            ``'li_url'``, ``'id_st'`` and ``'time_s'``. Track pages on which
            the headline or the player data cannot be found are skipped.
        """
        site_root = self._decode(self._SITE_ROOT_B64)
        for result_page in musicname:
            for href, duration in zip(result_page['res_url'], result_page['time_s']):
                html = requests.get(
                    url=site_root + href,
                    cookies=self.cookies(),
                    headers=self.headers(),
                    timeout=self._TIMEOUT,
                ).text
                title = self._RE_TITLE.search(html)
                player = self._RE_PLAYER.search(html)
                if title is None or player is None:
                    continue
                track_id, file_key = player.groups()
                # A new dict for every track: a consumer that collects the
                # results must not end up with N references to one object.
                yield {
                    'Recording': result_page['Recording'],
                    'name': title.group(1),
                    'li_url': f'https://mp4.djuu.com/{file_key}.m4a',
                    'id_st': track_id,
                    'time_s': duration,
                }

    def save_mp3(self, url, song_name):
        """Download *url* into the ``Song_saving`` folder as ``<song_name>.mp3``.

        Args:
            url: address of the audio file.
            song_name: base name of the file; characters that are not allowed
                in file names are replaced by ``_``.
        """
        audio = requests.get(
            url=url,
            cookies=self.cookies(),
            headers=self.headers(),
            timeout=self._TIMEOUT,
        ).content
        os.makedirs(self._DOWNLOAD_DIR, exist_ok=True)
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)
        # os.path.join picks the right separator on every platform.
        target = os.path.join(self._DOWNLOAD_DIR, safe_name + '.mp3')
        with open(target, mode='wb') as f:
            f.write(audio)

爬虫写得不是很好，只是比较简单的实现。上一篇文章的链接：

https://blog.csdn.net/qq_69920603/article/details/141323031?

完整的代码可以私信联系我。

新手刚开始写这个，写得不是很好。原创不易，转载请注明出处！