Java爬虫-爬取页面源码并下载页面的指定格式文件

最新推荐文章于 2022-08-04 17:07:39 发布

lily-0622

最新推荐文章于 2022-08-04 17:07:39 发布

阅读量1.3k

点赞数

分类专栏：线上问题分析

本文链接：https://blog.csdn.net/yao940622/article/details/106495958

版权

线上问题分析专栏收录该内容

23 篇文章 1 订阅

订阅专栏

一、客户需求

获取页面音频资料和字幕文件，格式分别为MP3和lrc.

页面地址为：

https://www.51voa.com/VOA_Special_English/researchers-call-for-a-use-tax-to-clean-up-space-84650.html

二、分析页面源码

页面的部分源码如下：

分析结果：

href="xxx.mp3" 和 href="xxx.lrc" 这两个属性是我们可以用到的数据，分别是对应格式文件的下载链接；<title>xxx</title> 中的文本则用作下载文件的文件名。

三、实现过程

java代码部分，目录结构如下图：

HtmlRequest.java

用来通过页面 URL 获取该页面的源码，并从源码中提取标题；mp3、lrc 的下载链接则在 Main.java 中用正则表达式从源码里提取。

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



/**

 * 通过网站域名URL获取该网站的源码

 *

 * @author Administrator

 *

 */

/**
 * Fetches the HTML source of a web page over HTTP and extracts simple pieces
 * of text (the page title) from it.
 */
public class HtmlRequest {

    /** Maximum time to wait for the TCP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 5 * 1000;

    /** Maximum time to wait for data on an open connection, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 30 * 1000;

    /** Size of the copy buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 1024;

    /** Charset used when the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /** Matches a title element regardless of letter case, attributes or line breaks. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any markup tag, including one that spans several lines. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>The body is decoded with the charset named in the response's
     * Content-Type header, falling back to UTF-8, instead of the platform
     * default charset.
     *
     * @param url address of the page to fetch; must be an HTTP(S) URL
     * @return the decoded response body
     * @throws Exception if the connection fails or the body cannot be read
     */
    public String getURLSource(URL url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            // Without a read timeout a stalled server would block this call forever.
            conn.setReadTimeout(READ_TIMEOUT_MILLIS);
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, resolveCharset(conn.getContentType()));
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads a stream to its end and returns everything that was read.
     * The stream is always closed, even when reading fails.
     *
     * @param instream stream to drain and close
     * @return all bytes read from the stream
     * @throws Exception if reading from or closing the stream fails
     */
    public byte[] readInputStream(InputStream instream) throws Exception {
        try {
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = instream.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        } finally {
            instream.close();
        }
    }

    /**
     * Extracts the text of the title element(s) from HTML source.
     *
     * <p>If the document contains several title elements their texts are
     * concatenated in document order. Nested tags are removed and surrounding
     * whitespace is trimmed.
     *
     * @param htmlSource HTML source text; may be {@code null}
     * @return the title text, or an empty string when there is no title
     */
    public String getTitle(String htmlSource) {
        if (htmlSource == null) {
            return "";
        }
        StringBuilder title = new StringBuilder();
        Matcher matcher = TITLE_PATTERN.matcher(htmlSource);
        while (matcher.find()) {
            title.append(matcher.group(1));
        }
        return outTag(title.toString()).trim();
    }

    /**
     * Removes every markup tag from the given text.
     *
     * @param s text that may contain tags
     * @return the text with all tags removed
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Picks the charset named in a Content-Type value, or UTF-8 if none is usable. */
    private static Charset resolveCharset(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall back to the default below.
                }
            }
        }
        return DEFAULT_CHARSET;
    }

}

Main.java

import java.io.File;

import java.net.URL;

import java.util.Scanner;

import java.util.regex.Matcher;

import java.util.regex.Pattern;


public class Main {

    public static void main(String[] args) throws Exception {
        HtmlRequest httpRequest=new HtmlRequest();
        Scanner sc = new Scanner(System.in);
        String[] strs=sc.next().split(",");
        for(int i=0;i<strs.length;i++) {
            System.out.println("\n*****************第"+(i+1)+"条链接开始解析下载*****************");
            System.out.println("下载链接："+strs[i]);
            URL url = new URL(strs[i]);
            String urlsource = httpRequest.getURLSource(url);
//       System.out.println(urlsource);//输出页面源码

            //下载mp3地址
            Pattern p1 = Pattern.compile("href=\"(.*).mp3\"");
            Matcher m1 = p1.matcher(urlsource);
            //下载lrc地址
            Pattern p2 = Pattern.compile("href=\"(.*).lrc\"");
            Matcher m2 = p2.matcher(urlsource);

            String title= httpRequest.getTitle(urlsource);

            System.out.println("请输入保存路径：");
            Scanner scPath = new Scanner(System.in);
            String path =scPath.next().toString();
                    //判断是否存在目录. 不存在则创建
            isChartPathExist(path);


            while(m1.find()&m2.find()) {

                DownloadManager downloadManager = new DownloadManager(path+title+".mp3" , 2 , m1.group(1)+".mp3");
                DownloadManager downloadManager2 = new DownloadManager(path+title+".lrc" , 1 , "https://www.51voa.com"+m2.group(1)+".lrc");
                downloadManager.action();
                downloadManager2.action();

            }
        }
        System.out.println("*****************任务全部下载完成*****************");
    }

/**
 * 判断文件夹是否存在，如果不存在则新建
 *
 * @param dirPath 文件夹路径
 */
private static void isChartPathExist(String dirPath) {
    File file = new File(dirPath);
    if (!file.exists()) {
        file.mkdirs();
    }
}

 }

DownloadManager.java

实现多线程下载

其中 DownloadManager 构造方法的参数依次是：

1.保存路径，2.线程数，3.下载链接

import java.io.File; 

import java.io.FileInputStream; 

import java.io.FileOutputStream; 

import java.io.IOException; 

import java.io.InputStream; 

import java.io.OutputStream; 

import java.net.HttpURLConnection; 

import java.net.MalformedURLException; 

import java.net.URL; 

import java.util.ArrayList; 

import java.util.List; 

 

public class DownloadManager implements Runnable { 

    // 保存路径 

    private String savePath; 

    // 总的下载线程数 

    private int threadNum; 

    // 下载的链接地址 

    private String urlFile; 

    // 是否下载开始 

    private boolean isStarted; 

    // 用于监视何时合并文件存放Thread的list 

    private List<DownloadThread> downloadList = new ArrayList<DownloadThread>(); 

 

    public DownloadManager(String savePath, int threadNum, String urlFile) { 

        super(); 

        this.savePath = savePath; 

        this.threadNum = threadNum; 

        this.urlFile = urlFile; 

    } 

 

    // 最终调用线程下载。本线程中调用分线程。 

    public void action() { 

        new Thread(this).start(); 

    } 

 

    public void run() { 

        long t1 = System.currentTimeMillis(); 

        System.out.println(t1); 

        // 如果没有下载 ， 就开始 ， 并且将已经下载的变量值设为true 

        if (!isStarted) { 

            startDownload(); 

            isStarted = true; 

        } 

        while (true) { 

            // 初始化认为所有线程下载完成，逐个检查 

            boolean finish = true; 

            // 如果有任何一个没完成，说明下载没完成，不能合并文件 

            for (DownloadThread thread : downloadList) { 

                if (!thread.isFinish()) { 

                    finish = false; 

                    break; 

                } 

            } 

            // 全部下载完成才为真 

            if (finish) { 

                // 合并文件 

                mergeFiles(); 

                // 跳出循环 ， 下载结束 

                break; 

            } 

            // 休息一会 ， 减少cpu消耗 

            try { 

                Thread.sleep(1000); 

            } catch (InterruptedException e) { 

                e.printStackTrace(); 

            } 

        } 

        long t2 = System.currentTimeMillis(); 

        System.out.println(t2); 

         System.out.println("下载用时：" + (t2 -t1)); 

    } 

 

    public void startDownload() { 

        // 得到每个线程开始值 ， 下载字节数大小 

        int[][] posAndLength = getPosAndLength(); 

        // 根据下载信息创建每个下载线程，并且启动他们。 

        for (int i = 0; i < posAndLength.length; i++) { 

            int pos = posAndLength[i][0]; 

            int length = posAndLength[i][1]; 

            DownloadThread downloadThread = new DownloadThread(i + 1, length, 

                    pos, savePath, urlFile); 

            new Thread(downloadThread).start(); 

            downloadList.add(downloadThread); 

        } 

    } 

 

    /**

     * 获得文件大小

     * 

     * @return 文件大小

     */ 

    public long getFileLength() { 

        System.out.println("获得文件大小  start......"); 

        HttpURLConnection conn = null; 

        long result = 0; 

        try { 

            URL url = new URL(urlFile); 

            conn = (HttpURLConnection) url.openConnection(); 

            // 使用Content-Length头信息获得文件大小 

            result = Long.parseLong(conn.getHeaderField("Content-Length")); 

        } catch (MalformedURLException e) { 

            e.printStackTrace(); 

        } catch (IOException e) { 

            e.printStackTrace(); 

        } finally { 

            if (conn != null) { 

                conn.disconnect(); 

            } 

        } 

        System.out.println("获得文件大小  end......" + result); 

        return result; 

    } 

 

    // 具体细节求出每个线程的开始位置和文件下载大小 

    public int[][] getPosAndLength() { 

        int[][] result = new int[threadNum][2]; 

        int fileLength = (int) getFileLength(); 

        int every = fileLength % threadNum == 0 ? fileLength / threadNum 

                : fileLength / threadNum + 1; 

        for (int i = 0; i < result.length; i++) { 

            int length = 0; 

            if (i != result.length - 1) { 

                length = every; 

            } else { 

                length = fileLength - i * every; 

            } 

            result[i][0] = i * every; 

            result[i][1] = length; 

        } 

        return result; 

    } 

 

    // 合并文件 

    public void mergeFiles() { 

        System.out.println("合并文件  start......"); 

        OutputStream out = null; 

        try { 

            out = new FileOutputStream(savePath); 

            for (int i = 1; i <= threadNum; i++) { 

                InputStream in = new FileInputStream(savePath + i); 

                byte[] bytes = new byte[2048]; 

                int read = 0; 

                while ((read = in.read(bytes)) != -1) { 

                    out.write(bytes, 0, read); 

                    out.flush(); 

                } 

                if (in != null) {  

                    in.close(); 

                    new File(savePath + i).delete(); 

                } 

            } 

        } catch (Exception e) { 

            e.printStackTrace(); 

        } finally { 

            if (out != null) { 

                try { 

                    out.close(); 

                } catch (IOException e) { 

                    e.printStackTrace(); 

                } 

            } 

        } 

        System.out.println("合并文件  end......"); 

    } 

 

    public String getSavePath() { 

        return savePath; 

    } 

 

    public void setSavePath(String savePath) { 

        this.savePath = savePath; 

    } 

 

    public int getThreadNum() { 

        return threadNum; 

    } 

 

    public void setThreadNum(int threadNum) { 

        this.threadNum = threadNum; 

    } 

 

    public String getUrlFile() { 

        return urlFile; 

    } 

 

    public void setUrlFile(String urlFile) { 

        this.urlFile = urlFile; 

    }  

 

    public boolean isStarted() { 

        return isStarted; 

    } 

 

    public void setStarted(boolean isStarted) { 

        this.isStarted = isStarted; 

    } 

 

    public List<DownloadThread> getDownloadList() { 

        return downloadList; 

    } 

 

    public void setDownloadList(List<DownloadThread> downloadList) { 

        this.downloadList = downloadList; 

    } 

}

DownloadThread.java

import java.io.FileOutputStream; 

import java.io.IOException; 

import java.io.InputStream; 

import java.io.OutputStream; 

import java.net.HttpURLConnection; 

import java.net.URL; 



/**
 * Downloads one byte range of a remote file into its own part file.
 *
 * <p>The part file is named {@code savePath + whichThread}. The range starts
 * at {@code startPosition} and is {@code length} bytes long. {@link #isFinish()}
 * becomes {@code true} only after exactly {@code length} bytes have been
 * written and the part file has been closed.
 */
public class DownloadThread implements Runnable {

    /** Maximum time to wait for the TCP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MILLIS = 5 * 1000;

    /** Maximum time to wait for data on an open connection, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 30 * 1000;

    // Index of this worker; used as the suffix of the part file name.
    private int whichThread;

    // Set once this worker has completed; volatile because another thread reads it.
    private volatile boolean isFinish;

    // Number of bytes this worker has to download.
    private int length;

    // Offset in the remote file at which this worker starts.
    private int startPosition;

    // Base path; the part file is savePath + whichThread.
    private String savePath;

    // URL of the file to download.
    private String url;

    /**
     * Fetches the configured byte range and writes it to the part file.
     *
     * <p>Any failure is printed and leaves {@link #isFinish()} unchanged. This
     * includes an unexpected HTTP status, a stream that ends early, and an
     * I/O error. A zero-length range creates an empty part file without
     * contacting the server.
     */
    public void run() {
        HttpURLConnection conn = null;
        InputStream in = null;
        OutputStream out = null;
        boolean completed = false;
        try {
            System.out.println("正在执行的线程：" + whichThread);
            if (length < 0) {
                throw new IOException("negative chunk length: " + length);
            }
            if (length > 0) {
                URL fileUrl = new URL(url);
                conn = (HttpURLConnection) fileUrl.openConnection();
                conn.setRequestMethod("GET");
                conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
                conn.setReadTimeout(READ_TIMEOUT_MILLIS);
                // Present a browser User-Agent; some servers refuse unknown clients.
                conn.setRequestProperty(
                        "User-Agent",
                        "Firefox Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3");
                // Ask only for this worker's bytes; long arithmetic avoids int overflow.
                long lastByte = (long) startPosition + length - 1;
                conn.setRequestProperty("Range", "bytes=" + startPosition + "-" + lastByte);
                conn.connect();
                int status = conn.getResponseCode();
                // 200 means the server ignored the Range header and sends the
                // whole file, which is only usable for the range starting at 0.
                boolean usable = status == HttpURLConnection.HTTP_PARTIAL
                        || (status == HttpURLConnection.HTTP_OK && startPosition == 0);
                if (!usable) {
                    throw new IOException("unexpected HTTP status " + status
                            + " for range starting at " + startPosition);
                }
                in = conn.getInputStream();
            }
            // The part file is created even for an empty range so that the
            // merge step finds one file per worker.
            out = new FileOutputStream(savePath + whichThread);
            byte[] bytes = new byte[4096];
            int count = 0;
            while (count < length) {
                // Never request more than is still missing, and write exactly
                // the number of bytes that read() actually delivered.
                int read = in.read(bytes, 0, Math.min(bytes.length, length - count));
                if (read == -1) {
                    throw new IOException("stream ended after " + count + " of "
                            + length + " bytes");
                }
                out.write(bytes, 0, read);
                count += read;
            }
            completed = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    // The part file may be incomplete if closing failed.
                    completed = false;
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                conn.disconnect();
            }
            // Publish completion last, after the part file has been closed.
            if (completed) {
                isFinish = true;
            }
        }
    }

    public int getStartPosition() {
        return startPosition;
    }

    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getWhichThread() {
        return whichThread;
    }

    public void setWhichThread(int whichThread) {
        this.whichThread = whichThread;
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public String getSavePath() {
        return savePath;
    }

    public void setSavePath(String savePath) {
        this.savePath = savePath;
    }

    /**
     * Creates a worker for one byte range.
     *
     * @param whichThread   index of this worker, used as the part file suffix
     * @param length        number of bytes to download
     * @param startPosition offset of the first byte in the remote file
     * @param savePath      base path of the part file
     * @param url           URL of the file to download
     */
    public DownloadThread(int whichThread, int length, int startPosition,
            String savePath, String url) {
        super();
        this.whichThread = whichThread;
        this.length = length;
        this.startPosition = startPosition;
        this.savePath = savePath;
        this.url = url;
    }

    /** Creates an unconfigured worker; set its fields through the setters. */
    public DownloadThread() {
        super();
    }

    public boolean isFinish() {
        return isFinish;
    }

    public void setFinish(boolean isFinish) {
        this.isFinish = isFinish;
    }

}

运行 Main 后，在控制台输入页面链接（多个链接用英文逗号分隔）并按回车，再根据提示输入保存路径，就可以在该目录下看到下载的文件了。

需要下载其他页面的其他文件只需要修改页面链接、正则部分，以及文件格式后缀，正则部分可以根据需要增删，也就是下面圈出来的那部分：

本文有部分代码来自：

https://ljlleo.iteye.com/blog/1397765

lily-0622

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
1
评论
Java爬虫-爬取页面源码并下载页面的指定格式文件

一、客户需求获取页面音频资料和字幕文件，格式分别为MP3和lrc.页面地址为：https://www.51voa.com/VOA_Special_English/researchers-call-for-a-use-tax-to-clean-up-space-84650.html二、分析页面源码页面的部分源码如下：分析结果： href=”xxx.mp3”和href=”xxx.lrc”这两个是我们可以用到的数据，是对应格式文件的下载...
复制链接

扫一扫