A tkinter-based weather crawler

The tkinter GUI

The window lets the user pick a config file of page URLs, toggle looping crawls with a checkbox, and watch a progress bar while a background thread downloads the images.

import os
import sys
import time
import schedule
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import threading
from tkinter.filedialog import askopenfilename

from utils import parse_urls_from_txt, downloader

# whether to crawl repeatedly; toggled by the checkbox
check_button = False


class MainWindow(tk.Tk):
    # directory the program was launched from
    cur_path = os.path.dirname(sys.argv[0])
    # True while a crawl is running, to block double starts
    is_start = False
    urls = []

    def __init__(self):
        super().__init__()
        # whether a config file has been loaded successfully
        self.url_file_config_is_true = False
        # crawl status text
        self.result_display = tk.StringVar()
        # config file path
        self.path = tk.StringVar()
        # initialize the window
        self.set_init_win()
        # config-file picker
        self.set_save_path()
        # start button
        self.set_start_button()
        # loop checkbox
        self.set_check_button()
        # status label
        self.set_result()
        # progress bar
        self.set_progress_bar()

    def set_progress_bar(self):
        tk.Label(self, text='Progress').grid(row=5, column=1)
        self.pb = ttk.Progressbar(self, length=200, value=0, mode="determinate")
        self.pb.grid(row=6, column=1)

    def set_start_button(self):
        tk.Button(self, text='Start', command=self.start_spider).grid(row=3, column=2)

    def getdirsize(self):
        # total size in bytes of everything under download_img
        # (helper; not currently wired into the UI)
        size = 0
        for root, dirs, files in os.walk(os.path.join(self.cur_path, 'download_img')):
            size += sum(os.path.getsize(os.path.join(root, name)) for name in files)
        return size

    def loop_download(self, save_base_path):
        # one full pass over every configured page; invoked by schedule
        for url in self.urls:
            if not url.strip():
                continue
            parts = url.strip().split(',')
            downloader(parts[0], os.path.join(save_base_path, parts[1]))

    def execute_scrapy(self):
        # reset the progress bar
        self.pb['value'] = 1
        # base directory that images are saved under
        save_base_path = os.path.join(self.cur_path, 'download_img')
        # loop forever or run a single pass, depending on the checkbox
        if check_button:
            self.result_display.set('Looping crawl running...')
            self.pb['mode'] = 'indeterminate'
            print('looping crawl')
            self.pb.start()
            # schedule.every(...).do(...) only registers the job;
            # run_pending() must be called in a loop for it to fire
            schedule.every(10).minutes.do(self.loop_download, save_base_path)
            while True:
                schedule.run_pending()
                time.sleep(3)
        else:
            print('single crawl')
            valid_urls = [u for u in self.urls if u.strip()]
            self.pb['maximum'] = len(valid_urls)
            for index, url in enumerate(valid_urls):
                self.pb['value'] = index + 1
                self.pb.update()
                parts = url.strip().split(',')
                downloader(parts[0], os.path.join(save_base_path, parts[1]))
            self.result_display.set('Single crawl finished')
            self.is_start = False
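    # Note: when looping is enabled, execute_scrapy never returns; it relies
    # on the daemon flag below so the worker thread dies with the window.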

    def start_spider(self):
        if self.url_file_config_is_true and not self.is_start:
            self.is_start = True
            # run the crawl on a daemon thread so the UI stays responsive
            t = threading.Thread(target=self.execute_scrapy, daemon=True)
            t.start()
            self.result_display.set('Crawl in progress')
        else:
            messagebox.showerror('Warning', 'Add a config file first, or a crawl is already running')

    def set_init_win(self):
        # initialize the main window
        self.title('NMC weather images')
        # disable resizing
        self.resizable(False, False)
        # set the window size
        self.geometry("320x160")

    def set_check_button(self):
        # checkbox that toggles looping crawls
        tk.Checkbutton(self, text='Crawl in a loop', command=self.check).grid(row=3, column=1)

    def check(self):
        # flip the module-level flag whenever the checkbox changes
        global check_button
        check_button = not check_button
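    # Note: a tk.BooleanVar bound to the Checkbutton via variable= (and read
    # with .get() in the worker) would avoid the module-level flag; the
    # global is kept here to stay close to the original structure.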

    def set_result(self):
        # status label for the crawl
        tk.Label(self, text='Result:').grid(row=4, column=0)
        tk.Label(self, textvariable=self.result_display).grid(row=4, column=1)

    def set_save_path(self):
        # config-file picker (despite the method name, this selects the
        # URL list, not the image save path)
        tk.Label(self, text='Config file').grid(row=0, column=0)
        tk.Label(self, text='File path').grid(row=1, column=0)
        tk.Entry(self, textvariable=self.path).grid(row=1, column=1)
        tk.Button(self, text='Browse', command=self.select_path).grid(row=1, column=2)

    def select_path(self):
        # choose the config file listing the pages to crawl
        path_ = askopenfilename()
        if not path_:
            # dialog was cancelled
            return
        self.path.set(path_)
        # read the file
        with open(path_, 'r', encoding='utf-8') as f:
            file_content = f.read()
        # parse the URLs out of it
        self.urls = parse_urls_from_txt(file_content)
        messagebox.showinfo('Info', 'Config file loaded')
        self.url_file_config_is_true = True


if __name__ == '__main__':
    MainWindow().mainloop()
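
One caveat about the listing above: execute_scrapy updates the progress bar from the worker thread, and tkinter widgets are generally only safe to touch from the main thread. It often works in practice, but a more robust pattern is to let the worker report progress through a queue that the Tk main loop polls. A minimal sketch of that pattern, assuming a shared progress_queue (the names here are illustrative, not part of the original code):

import queue

progress_queue = queue.Queue()

def poll_progress(app):
    # runs on the Tk main thread; drain whatever the worker has queued
    try:
        while True:
            app.pb['value'] = progress_queue.get_nowait()
    except queue.Empty:
        pass
    # check again in 100 ms
    app.after(100, poll_progress, app)

# the worker then calls progress_queue.put(index + 1) instead of touching self.pb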

The download utility

utils.py holds the fetching, parsing, and download logic that the GUI imports.

# toolkit: fetch chart pages and download the images they reference
import os

import requests
import urllib3
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

# requests is called with verify=False below; silence the TLS warning it triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

cookies = {
    'UM_distinctid': '17a126b2aac89-0c792247cd7f5-3373266-13c680-17a126b2aad534',
    'ray_leech_token': '1624499370',
    'CNZZDATA1254743953': '513171663-1623801415-^%^7C1624496980',
    '__utrace': '82eacfa080788380c407189aaea16c59',
}
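
# Note: these cookie values are session-specific tokens captured from a
# browser; they may expire, so refresh them (or try the requests without
# cookies) if downloads start failing.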

headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}


def parse_urls_from_txt(content):
    # each non-empty line holds "<page url>,<subfolder name>"
    return content.split('\n')


def get_html(url):
    # fetch the page; certificate verification is deliberately skipped
    response = requests.get(url, headers=headers,
                            cookies=cookies, verify=False)
    return response.text


def parse_image_urls(html):
    # each chart sits in a <div class="col-xs-12 time" data-img="..."> element
    tree = etree.HTML(html)
    elements = tree.xpath("//div[contains(@class,'col-xs-12 time')]")
    images_urls = []
    for element in elements:
        images_urls.extend(element.xpath('./@data-img'))
    return images_urls


def get_name_from_url(url):
    # file name is the last path segment with any query string stripped,
    # e.g. 'http://host/a/b.jpg?v=1' -> 'b.jpg'
    return os.path.basename(url).split('?')[0]


def download_image(url, file_path):
    name = get_name_from_url(url)
    target = os.path.join(file_path, name)
    # '/medium' in the URL points at a thumbnail; strip it to get full size
    full_url = url.replace('/medium', '')
    print(full_url)
    # skip images that were already downloaded
    if not os.path.exists(target):
        response = requests.get(full_url, headers=headers,
                                cookies=cookies, verify=False)
        with open(target, 'wb') as f:
            f.write(response.content)


def downloader(url, file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    html = get_html(url)
    images_urls = parse_image_urls(html)
    # download up to 10 images concurrently; the with-block waits for all of them
    with ThreadPoolExecutor(10) as thread_pool:
        for image_url in images_urls:
            thread_pool.submit(download_image, image_url, file_path)


if __name__ == '__main__':
    # downloader('http://www.nmc.cn/publish/observations/china/dm/weatherchart-h000.htm')
    downloader('http://www.nmc.cn/publish/satellite/FY4A-true-color.htm',
               os.path.join(os.path.dirname(__file__), 'test'))
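
To see what parse_image_urls extracts, here is a quick check against a hand-written fragment mimicking the page structure the XPath targets (the image URLs are made up for illustration):

from utils import parse_image_urls

sample_html = '''
<div class="col-xs-12 time" data-img="http://image.nmc.cn/a.jpg"></div>
<div class="col-xs-12 time" data-img="http://image.nmc.cn/b.jpg"></div>
'''
print(parse_image_urls(sample_html))
# expected: ['http://image.nmc.cn/a.jpg', 'http://image.nmc.cn/b.jpg']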

The download list

Each line of the config file pairs a page URL with the subfolder (under download_img) that its images are saved into.

http://www.nmc.cn/publish/observations/china/dm/weatherchart-h000.htm,China-weather-000
http://www.nmc.cn/publish/observations/china/dm/weatherchart-h925.htm,China-weather-925
http://www.nmc.cn/publish/satellite/FY4A-true-color.htm,Satellite-true-color
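
For reference, this is how the GUI turns one of these lines into a downloader call (mirroring the split in loop_download; save_base_path is the download_img directory set by the GUI):

line = 'http://www.nmc.cn/publish/satellite/FY4A-true-color.htm,Satellite-true-color'
parts = line.strip().split(',')
# parts[0] -> the page to crawl, parts[1] -> subfolder under download_img
downloader(parts[0], os.path.join(save_base_path, parts[1]))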