# Required libraries: requests, beautifulsoup4, pandas, openpyxl
# Install them with: pip install requests beautifulsoup4 pandas openpyxl
#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas
import webbrowser
from tkinter import *
import tkinter as tk
from tkinter import ttk
from tkinter.filedialog import askdirectory
# ------ Main window and input widgets ------
window = tk.Tk()
window.title("全网热搜")
window.geometry("640x480+500+100")
# Lock the window to its initial size in both directions.
window.resizable(False, False)

# Captions for the two input rows.
lab1 = tk.Label(window, text="目标路径:")
lab2 = tk.Label(window, text="选择分类:")

# Holds the save directory; bound to the entry widget created below.
path = tk.StringVar()

# Drop-down listing the ranking boards that can be scraped.
menu = ttk.Combobox(window, width=10)
menu.configure(
    values=('知乎热榜', '微博热搜榜', '微信热文榜', 'bilibili日榜', '抖音视频榜'),
)

# Entry that displays the save directory.
# NOTE: this module-level name shadows the builtin ``input``; other parts of
# the file refer to it by this name.
input = tk.Entry(window, width=45, textvariable=path)

# List box in which the scraped rows are displayed.
thebox = tk.Listbox(window, height=19, width=89)
# ------ Choose the local save directory ------
def select_path():
    """Ask the user for a directory and store it in ``path``.

    Opens the directory chooser. If the user picks a directory it is written
    to the ``path`` variable. If the dialog is cancelled, the previously
    stored value is left untouched instead of being overwritten with an
    empty string.
    """
    chosen = askdirectory()
    # askdirectory() yields an empty value when the dialog is dismissed;
    # only update the stored path when something was actually selected.
    if chosen:
        path.set(chosen)
# ------- Clear the output list box, first row through last -------
def cle():
    """Remove every row currently shown in ``thebox``."""
    thebox.delete(0, tk.END)
# ------- Map the selected ranking board to its site path ------
# Display name of each ranking board -> path component appended to the base URL.
_CID_BY_CATEGORY = {
    '知乎热榜': '/n/mproPpoq6O',
    '微博热搜榜': '/n/KqndgxeLl9',
    '微信热文榜': '/n/WnBe01o371',
    'bilibili日榜': '/n/74KvxwokxM',
    '抖音视频榜': '/n/DpQvNABoNE',
}


def get_cid(category=None):
    """Return the URL path for a ranking board.

    Args:
        category: Display name of the board. When omitted, the current value
            of the ``menu`` combobox is used.

    Returns:
        The path string (for example ``'/n/mproPpoq6O'``) for the board.

    Raises:
        ValueError: If the name is not one of the known boards, e.g. when
            nothing has been selected in the combobox yet.
    """
    if category is None:
        category = menu.get()
    try:
        return _CID_BY_CATEGORY[category]
    except KeyError:
        # Fail with an explicit message rather than an unbound-variable error.
        raise ValueError(
            'unknown or missing category: {!r}'.format(category)
        ) from None
# ----------- Fetch the ranking page, export it and display it ---------
def _extract_row(node, base_url):
    """Extract one ranking entry from a table row.

    Returns an ``(order, heat, title, link)`` tuple, or ``None`` when the row
    lacks the expected cells or a usable anchor.
    """
    from urllib.parse import urljoin

    title_cell = node.find('td', class_='al')
    order_cell = node.find('td', align='center')
    cells = node.find_all('td')
    if title_cell is None or order_cell is None or len(cells) < 3:
        return None
    anchors = title_cell.find_all('a')
    if not anchors:
        return None
    # Exactly one anchor is used per row (the last one in the title cell) so
    # that the list of links stays aligned with the displayed rows.
    anchor = anchors[-1]
    href = anchor.get('href')
    if not href:
        return None
    order = order_cell.get_text()   # rank
    heat = cells[2].get_text()      # third <td> (index 2) holds the heat value
    title = anchor.get_text()
    link = urljoin(base_url, href)  # resolves relative hrefs against the site
    return order, heat, title, link


def download():
    """Scrape the selected ranking board, save it to Excel and show it.

    The page for the board returned by ``get_cid()`` is fetched and the rows
    of its first ``tbody`` are parsed. When at least one row is found, the
    rows are written to ``<directory>/<board name>.xlsx``, where the directory
    is read from the ``input`` entry and the board name from ``menu``. The
    list box ``thebox`` is then cleared and refilled, so row *i* of the list
    box always corresponds to element *i* of the returned list.

    Returns:
        A list with the link of every displayed row, in display order.
        Empty when the page contains no parsable rows.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        Any exception raised by ``get_cid()`` or by writing the Excel file
        propagates to the caller.
    """
    base_url = 'https://tophub.today'
    url = base_url + str(get_cid())  # final URL to parse
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    # A timeout keeps the GUI callback from blocking forever on a dead link.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Only the first <tbody> is used; tolerate pages without one.
    tbody = soup.find('tbody')
    nodes = tbody.find_all('tr') if tbody is not None else []

    rows = []
    for node in nodes:
        row = _extract_row(node, base_url)
        if row is not None:
            rows.append(row)

    if rows:
        # Build the table and write the workbook once, not once per row.
        file_name = input.get() + '/' + menu.get() + '.xlsx'
        df = pandas.DataFrame(rows, columns=['排名', '热度', '标题', '链接'])
        df.to_excel(file_name, index=False)

    # Replace, rather than append to, what is shown so that list-box indices
    # match the returned links even after repeated calls.
    thebox.delete(0, 'end')
    for order, heat, title, _link in rows:
        thebox.insert('end', (order, heat, title))

    return [row[3] for row in rows]
# ----------- Open the link of the selected row ----------
def open_url():
    """Open the link belonging to the selected list-box row in a browser.

    Does nothing when no row is selected, or when the selected index has no
    matching entry in the list of links.
    """
    selection = thebox.curselection()
    if not selection:
        # No row selected: nothing to open.
        return
    choice = selection[0]  # index of the first selected row
    # Links are not stored anywhere, so download() is called again to get them.
    links = download()
    if choice >= len(links):
        return
    # new=1 asks for a new browser window where possible.
    webbrowser.open(links[choice], new=1, autoraise=True)
# ---------- Button that opens the save-directory chooser -------
button0 = tk.Button(
    window,
    text='选择路径',
    command=select_path,
    relief=tk.RAISED,
    width=8,
    height=1,
)
# ---------- Button that starts the scrape ----------
button1 = tk.Button(
    window,
    text='爬取',
    command=download,
    relief=tk.RAISED,
    width=8,
    height=1,
)
# ---------- Button that clears the output list box --------
button2 = tk.Button(
    window,
    text='清空输出',
    command=cle,
    relief=tk.RAISED,
    width=8,
    height=1,
)
# ----------- Button that opens the selected link ----------
button3 = tk.Button(
    window,
    text='打开',
    command=open_url,
    width=8,
    height=1,
)

# ----------- Layout: every widget is placed at absolute coordinates ---------
for _widget, _x, _y in (
    (lab1, 10, 10),
    (lab2, 10, 60),
    (menu, 90, 60),
    (input, 90, 10),
    (thebox, 5, 110),
    (button0, 540, 5),
    (button1, 420, 55),
    (button2, 540, 55),
    (button3, 300, 55),
):
    _widget.place(x=_x, y=_y)

# Hand control to the Tk event loop.
tk.mainloop()