tkinter 界面（tkinter GUI frontend）
import os
import sys
import time
import schedule
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import threading
from tkinter.filedialog import askdirectory, askopenfilename
from utils import parse_url_form_txt, downloader
# Toggled by the "loop scraping" checkbox; True means scrape repeatedly on a schedule.
check_button = False
class main(tk.Tk):
    """Main window of the NMC (中央气象台) weather-image downloader GUI."""

    # Directory the program was started from; images are saved beneath it.
    cur_path = os.path.dirname(sys.argv[0])
    # True while a scrape thread is running (prevents double starts).
    is_start = False
    # Config lines of the form "page_url,subdirectory".
    urls = []

    def __init__(self):
        super().__init__()
        # Becomes True once a config file has been successfully loaded.
        self.url_file_config_is_true = False
        self.result_display = tk.StringVar()
        self.path = tk.StringVar()
        self.set_init_win()
        self.set_save_path()
        self.set_start_button()
        self.set_check_button()
        self.set_result()
        self.set_progress_bar()

    def set_progress_bar(self):
        """Create the progress-bar row."""
        tk.Label(self, text='进度条').grid(row=5, column=1)
        self.pb = ttk.Progressbar(self, length=200, value=0, mode="determinate")
        self.pb.grid(row=6, column=1)

    def set_start_button(self):
        """Create the start button."""
        tk.Button(self, text='开始', command=self.start_spider).grid(row=3, column=2)

    def getdirsize(self):
        """Return the total size, in bytes, of the download_img directory tree."""
        size = 0
        for root, dirs, files in os.walk(os.path.join(self.cur_path, 'download_img')):
            # `root` already contains the walk origin; re-joining cur_path
            # (as the original code did) produced invalid paths whenever
            # cur_path was non-empty.
            size += sum(os.path.getsize(os.path.join(root, name)) for name in files)
        return size

    @staticmethod
    def _split_config_line(line):
        """Split a 'url,subdirectory' config line into (url, subdirectory).

        Raises IndexError on a line with no comma, matching the original
        inline parsing behavior.
        """
        parts = line.strip().split(',')
        return parts[0], parts[1]

    def loop_download(self, save_base_path):
        """Download every configured page once (used as the scheduled job)."""
        for url in self.urls:
            if not url:
                continue
            download_url, path = self._split_config_line(url)
            downloader(download_url, os.path.join(save_base_path, path))

    def excute_scrapy(self):
        """Worker-thread body: run either a single pass or a scheduled loop."""
        self.pb['value'] = 1
        save_base_path = os.path.join(self.cur_path, 'download_img')
        if check_button:
            # Loop mode: kick off an indeterminate progress bar and let the
            # schedule library re-run the download every 10 minutes, forever.
            self.result_display.set('循环爬取中.........')
            self.pb['mode'] = 'indeterminate'
            print('循环爬取--')
            self.pb.start()
            schedule.every(10).minutes.do(self.loop_download, save_base_path)
            while True:
                schedule.run_pending()
                time.sleep(3)
        else:
            # Single-pass mode: one download per config line, with progress.
            print('不循环爬取')
            self.pb['maximum'] = len([i for i in self.urls if i])
            for index, url in enumerate(self.urls):
                if not url:
                    continue
                self.pb['value'] = index + 1
                self.pb.update()
                download_url, path = self._split_config_line(url)
                downloader(download_url, os.path.join(save_base_path, path))
            self.result_display.set('单次爬取完毕')
            self.is_start = False

    def start_spider(self):
        """Start the scraping thread unless one is running or no config is loaded."""
        if self.url_file_config_is_true and not self.is_start:
            # Mark as running *before* spawning the thread; the original code
            # never set this flag, so the double-start guard was dead code.
            self.is_start = True
            t = threading.Thread(target=self.excute_scrapy)
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
            self.result_display.set('爬取进行中')
        else:
            messagebox.showerror('警告', '请先添加配置文件,或者程序已经运行中')

    def set_init_win(self):
        """Configure the fixed-size main window."""
        self.title('中央气象台图片')
        self.resizable(0, 0)
        self.geometry("320x160")

    def set_check_button(self):
        """Create the 'loop scraping' checkbox."""
        tk.Checkbutton(self, text='是否循环爬取', command=self.check).grid(row=3, column=1)

    def check(self):
        """Toggle the module-level loop-scraping flag."""
        global check_button
        check_button = not check_button

    def set_result(self):
        """Create the result-message row."""
        tk.Label(self, text='爬取结果:').grid(row=4, column=0)
        tk.Label(self, textvariable=self.result_display).grid(row=4, column=1)

    def set_save_path(self):
        """Create the config-file selection row."""
        tk.Label(self, text='配置文件').grid(row=0, column=0)
        tk.Label(self, text='文件路径').grid(row=1, column=0)
        tk.Entry(self, textvariable=self.path).grid(row=1, column=1)
        tk.Button(self, text='路径选择', command=self.select_path).grid(row=1, column=2)

    def select_path(self):
        """Let the user pick a config file and load the URL list from it."""
        path_ = askopenfilename()
        if not path_:
            # Dialog was cancelled; keep any previously loaded configuration.
            # (The original crashed here trying to open an empty path.)
            return
        self.path.set(path_)
        with open(path_, 'r', encoding='utf-8') as f:
            file_content = f.read()
        self.urls = parse_url_form_txt(file_content)
        messagebox.showinfo('提示', '配置文件成功')
        self.url_file_config_is_true = True
# Launch the GUI event loop when executed as a script.
if __name__ == '__main__':
    main().mainloop()
下载工具（download utility module）
import os
import re
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Cookies sent with every request to www.nmc.cn.
# NOTE(review): these look like analytics/session values copied from a
# browser session and are likely stale -- confirm whether they are still
# required at all.
cookies = {
    'UM_distinctid': '17a126b2aac89-0c792247cd7f5-3373266-13c680-17a126b2aad534',
    'ray_leech_token': '1624499370',
    'CNZZDATA1254743953': '513171663-1623801415-^%^7C1624496980',
    '__utrace': '82eacfa080788380c407189aaea16c59',
}
# Browser-like request headers so the site serves the normal desktop page.
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
def parse_url_form_txt(content):
    """Split config-file text into a list of 'url,subdirectory' lines.

    Uses splitlines() instead of split('\\n') so that files with CRLF/CR
    line endings do not leave a stray '\\r' on each entry, and a trailing
    newline does not yield a bogus empty final entry.
    """
    return content.splitlines()
def get_html(url):
    """Fetch *url* and return the response body as text.

    A timeout is set so a stalled connection cannot hang the caller
    forever (the original request had none).
    NOTE(review): verify=False disables TLS certificate checking; the
    target site apparently requires it, but this is insecure -- confirm.
    """
    response = requests.get(url, headers=headers,
                            cookies=cookies, verify=False, timeout=30)
    return response.text
def parse_image_urls(html):
    """Collect the data-img attribute of every timeline <div> on the page."""
    tree = etree.HTML(html)
    time_divs = tree.xpath("//div[contains(@class,'col-xs-12 time')]")
    # Each div carries its image URL in a data-img attribute.
    return [img for div in time_divs for img in div.xpath('./@data-img')]
def get_name_form_url(url):
    """Return the file name portion of *url*, minus any query string.

    The original `re.search('(.*?)\\?', ...)` returned None for URLs with
    no '?' and crashed with AttributeError; now the plain basename is
    returned in that case. A raw string is used for the regex to avoid
    the invalid-escape warning on modern Python.
    """
    base = os.path.basename(url)
    match = re.search(r'(.*?)\?', base)
    return match.group(1) if match else base
def dwonloader_images(url, file_path):
    """Download one image into *file_path*, skipping files already on disk.

    The '/medium' segment is stripped from the URL to request the
    full-resolution image instead of the thumbnail.
    """
    name = get_name_form_url(url)
    target = os.path.join(file_path, name)
    already_downloaded = os.path.exists(target)
    full_url = url.replace('/medium', "")
    print(full_url)
    if already_downloaded:
        return
    response = requests.get(full_url, headers=headers,
                            cookies=cookies, verify=False)
    with open(target, 'wb') as f:
        f.write(response.content)
def downloader(url, file_path):
    """Scrape *url* for image links and download them all into *file_path*.

    Creates the target directory if needed and fans the downloads out over
    a 10-worker thread pool, blocking until every download has finished.
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    page = get_html(url)
    # The with-block calls shutdown(wait=True) on exit, exactly as the
    # original explicit shutdown did.
    with ThreadPoolExecutor(10) as pool:
        for image_url in parse_image_urls(page):
            pool.submit(dwonloader_images, image_url, file_path)
# Manual smoke test: download one satellite page into ./test next to this file.
if __name__ == '__main__':
    downloader('http://www.nmc.cn/publish/satellite/FY4A-true-color.htm',
               os.path.join(os.path.dirname(__file__), 'test'))
下载列表（download list: one "page URL, target subdirectory" pair per line）
http://www.nmc.cn/publish/observations/china/dm/weatherchart-h000.htm,中国天气000
http://www.nmc.cn/publish/observations/china/dm/weatherchart-h925.htm,中国天气925
http://www.nmc.cn/publish/satellite/FY4A-true-color.htm,卫星真彩色