用Java解析网页中的超链接,生成批量下载地址列表

1. 使用场景

迅雷的批量下载功能不好用,还会把网页中的超链接的原文件给重命名,很烦。
另外,经常使用UNIX/LINUX系统,wget -i 命令批量下载时也需要用到下载地址列表。
现写了一个从网页源代码中提取下载地址列表的小工具,希望对大家有帮助。

2. 找列表

打开要下载的文件所在的页面,比如我要下载centos的packages,打开的页面是清华镜像站:

https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.8.2003/os/x86_64/Packages/

鼠标右键,查看网页源代码:
在这里插入图片描述
将内容全选保存到本地文件。
在这里插入图片描述

3. 分析下载地址列表

在这里插入图片描述
分析出的结果如下:
在这里插入图片描述

4. 使用下载工具下载

在这里插入图片描述

5. 源代码

package org.ray.UrlTool;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;

/**
 * Command-line tool that extracts the {@code href} targets of {@code <a href="...">} anchors
 * from a saved HTML file and writes them, one per line, to an output file (for example as the
 * address list for {@code wget -i}).
 *
 * <p>Options, each followed by its value:
 * <ul>
 *   <li>{@code -i} HTML source file to scan (required)</li>
 *   <li>{@code -o} output file (required)</li>
 *   <li>{@code -p} prefix prepended to every extracted link (optional)</li>
 *   <li>{@code -e} suffix filter; only links whose href ends with it are kept (optional)</li>
 * </ul>
 *
 * @Classname UrlToolMain
 * @Date 2021/2/4 11:14
 * @Created by 236212005@qq.com
 */
public class UrlToolMain {
    /** Parsed command-line options, keyed by flag (e.g. {@code "-i"}). Filled by {@link #main}. */
    public static final Map<String, String> START_PARAMS = new HashMap<>();
    public static final String PARAM_INPUT_FILE = "-i";// e.g. "C:\\Users\\23621\\Desktop\\new 12.txt"
    public static final String PARAM_OUTPUT_FILE = "-o";// e.g. "C:\\Users\\23621\\Desktop\\result.txt"
    public static final String PARAM_APPEND_PREFIX = "-p";// e.g. https://mirror.tuna.tsinghua.edu.cn/centos-vault/7.8.2003/os/x86_64/Packages/
    public static final String PARAM_DOWN_SUFFIX = "-e";// e.g. .rpm

    // System.lineSeparator() is always non-null; reading the "OS" environment variable is not
    // (it is typically unset outside Windows) and used to crash class initialization.
    private static final String BREAK_LINE_SYMBOL = System.lineSeparator();
    private static final String SYMBOL_QUO = "\"";
    private static final String SYMBOL_MINUS = "-";
    private static final String EMPTY_STR = "";
    private static final String URL_TAG_START = "<a href=\"";
    private static final String URL_TAG_END = "</a>";
    private static final int INVALID = -1;
    /** Milliseconds per second, used to print the elapsed time in seconds. */
    public static final double EXCHANGE_UNIT = 1000.00;

    /**
     * Entry point: parses the options, extracts the links and writes the result file.
     *
     * <p>Exits with status -1 (after printing usage) when no arguments are given or when
     * {@code -i} / {@code -o} is missing, and with status 1 when the output file cannot be
     * written. On success the method simply returns.
     *
     * @param args command-line arguments as flag/value pairs
     */
    public static void main(String[] args) {
        if (args == null || args.length <= 0) {
            printUsage();
            System.exit(-1);
            return;
        }
        recordParam(args);
        if (getParam(PARAM_INPUT_FILE).isEmpty() || getParam(PARAM_OUTPUT_FILE).isEmpty()) {
            printUsage();
            System.exit(-1);
            return;
        }

        String context = readFile(new File(getParam(PARAM_INPUT_FILE)));

        long startTime = System.currentTimeMillis();
        String links = extractLinks(context, getParam(PARAM_APPEND_PREFIX), getParam(PARAM_DOWN_SUFFIX));
        File written = writeFile(links, new File(getParam(PARAM_OUTPUT_FILE)));
        if (written == null) {
            System.err.println("Failed to write output file: " + getParam(PARAM_OUTPUT_FILE));
            System.exit(1);
            return;
        }
        long endTime = System.currentTimeMillis();
        System.out.println("Mission complete in " + ((endTime - startTime) / EXCHANGE_UNIT) + " sec.");
    }

    /** Prints the usage text shown when the command line is invalid. */
    private static void printUsage() {
        System.out.println("There's an error according when application runs: Invalid parameters.");
        System.out.println("Description:");
        System.out.println("Use -i to specify the html source file which contains the urls, ");
        System.out.println("-o to locate an absolute path of the output file, ");
        System.out.println("-p can help you to add a prefix of each char sequence in tag <a href=\"\">, for some situation like site uses abstract path,");
        System.out.println("-e makes you can decide the file type witch you want to download.");
    }

    /**
     * Collects the href value of every {@code <a href="...">...</a>} anchor in {@code html}.
     *
     * @param html   text to scan
     * @param prefix text prepended to each kept href (may be empty)
     * @param suffix only hrefs ending with this are kept; empty keeps all
     * @return the kept links, each terminated by the platform line separator
     */
    private static String extractLinks(String html, String prefix, String suffix) {
        StringBuilder links = new StringBuilder();
        int searchFrom = 0;
        int tagStart;
        while ((tagStart = html.indexOf(URL_TAG_START, searchFrom)) != INVALID) {
            int tagEnd = html.indexOf(URL_TAG_END, tagStart);
            if (tagEnd == INVALID) {
                // Unterminated anchor: nothing well-formed can follow, stop scanning.
                break;
            }
            int hrefStart = tagStart + URL_TAG_START.length();
            int hrefEnd = html.indexOf(SYMBOL_QUO, hrefStart);
            // The closing quote must belong to this anchor; otherwise skip the anchor.
            if (hrefEnd != INVALID && hrefEnd < tagEnd) {
                String href = html.substring(hrefStart, hrefEnd);
                // Filter on the href itself; the full anchor text always ends with "</a>".
                if (href.endsWith(suffix)) {
                    links.append(prefix).append(href).append(BREAK_LINE_SYMBOL);
                }
            }
            searchFrom = tagEnd + URL_TAG_END.length();
        }
        return links.toString();
    }

    /**
     * Looks up a parsed option.
     *
     * @param key option flag, e.g. {@code "-i"}
     * @return the option value, or an empty string when the option was not supplied
     */
    private static String getParam(String key) {
        String value = START_PARAMS.get(key);
        return value == null ? EMPTY_STR : value;
    }

    /**
     * Stores every {@code -flag value} pair of {@code args} in {@link #START_PARAMS}.
     * A trailing flag without a value is ignored.
     *
     * @param args raw command-line arguments
     */
    private static void recordParam(String[] args) {
        for (int i = 0; i < args.length; i++) {
            if (args[i].startsWith(SYMBOL_MINUS) && i + 1 < args.length) {
                START_PARAMS.put(args[i], args[i + 1]);
                i++; // the value has been consumed; do not re-inspect it as a flag
            }
        }
    }

    /**
     * Reads a text file as UTF-8 and returns its lines concatenated without line breaks.
     *
     * @param file file to read
     * @return the concatenated content; an empty string when the file does not exist, or the
     *         part read so far when an I/O error occurs (the error is reported on stderr)
     */
    public static String readFile(File file) {
        if (!file.exists()) {
            return EMPTY_STR;
        }
        StringBuilder stringBuilder = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                stringBuilder.append(line);
            }
        } catch (IOException e) {
            System.err.println("Failed to read " + file + ": " + e);
        }
        return stringBuilder.toString();
    }

    /**
     * Writes {@code content} to {@code dstFile} as UTF-8, creating the file when needed and
     * replacing any previous content.
     *
     * @param content text to write
     * @param dstFile destination file
     * @return {@code dstFile} on success, or {@code null} when the file could not be created or
     *         written (an incompletely written file is deleted)
     */
    public static File writeFile(String content, File dstFile) {
        if (!dstFile.exists()) {
            try {
                if (!dstFile.createNewFile()) {
                    return null;
                }
            } catch (IOException e) {
                System.err.println("Failed to create " + dstFile + ": " + e);
                return null;
            }
        }
        byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
        try (FileOutputStream fileOutputStream = new FileOutputStream(dstFile)) {
            fileOutputStream.write(bytes);
            fileOutputStream.flush();
        } catch (IOException e) {
            System.err.println("Failed to write " + dstFile + ": " + e);
            return null;
        }
        // Compare against the encoded byte count, not the char count of the string.
        if (dstFile.length() != bytes.length) {
            dstFile.delete();
            return null;
        }
        return dstFile;
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值