使用 python, wget, curl 三者结合，制作一个爬虫（批量从镜像站点下载包）

shimly123456

于 2024-04-18 01:09:59 发布

阅读量139

点赞数 3

文章标签： python 爬虫 开发语言

本文链接：https://blog.csdn.net/shimly123456/article/details/137895576

版权

import re
import shlex
import subprocess

from timeout_decorator import timeout, TimeoutError

timeout_seconds = 5 # timeout, in seconds, passed to the @timeout decorator below

# Wrap the command runner with the timeout decorator.
@timeout(timeout_seconds)
def run_command(command):
    """Run *command* through the shell and return the finished process.

    The call is wrapped by ``timeout(timeout_seconds)`` from
    ``timeout_decorator``, so it is interrupted when it runs longer than
    the configured number of seconds.

    Args:
        command: Shell command line, passed as a single string.

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``;
        its ``returncode`` attribute holds the command's exit status.
    """
    completed = subprocess.run(command, shell=True)
    return completed

# Base URL that every matched package href is appended to.
head = "https://mirrors.ustc.edu.cn/CTAN/systems/win32/miktex/tm/packages/"

# Matches anchors that link to *.tar.lzma packages: group 1 is the href,
# group 2 the link text.  The dots are escaped so that only a literal
# ".tar.lzma" suffix matches (an unescaped "." matches any character).
target_pattern = r'<a href="([^"]+\.tar\.lzma)">([^<]+\.tar\.lzma)</a>'
target_regex = re.compile(target_pattern)

# 'theweb.html' is presumably a saved copy of the directory listing at `head`
# -- TODO confirm.  Decode explicitly so the result does not depend on the
# platform default encoding; undecodable bytes are replaced, not fatal.
with open('theweb.html', 'r', encoding='utf-8', errors='replace') as file:
    for line in file:
        match = target_regex.search(line.strip())
        if not match:
            continue

        filename = match.group(1)
        url = head + filename
        # Both operands come from the HTML and end up in a shell command
        # line (run_command uses shell=True), so quote them.
        command = "wget -O - " + shlex.quote(url) + " > " + shlex.quote(filename)

        # Retry until wget exits with status 0.
        # NOTE(review): there is no retry limit, so a permanently failing
        # download (e.g. a 404) loops forever, and every timeout restarts
        # the download from scratch -- confirm this is intended.
        ret = 1
        while ret != 0:
            try:
                ret = run_command(command).returncode
            except TimeoutError:
                # The attempt exceeded the timeout; ret is still non-zero,
                # so the loop tries again.
                print("after timeouterror")