Writing a Web Crawler in a Maven Project
Create New Project -> Maven -> Create from archetype -> maven-archetype-quickstart -> create the project -> click Enable Auto-Import
1. In pom.xml, change the 1.7 in the <maven.compiler.source> and <maven.compiler.target> properties to 1.8.
2. Add the HttpClient dependency under <dependencies> in pom.xml:

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
3. Open Project Structure, select Modules on the left, set the language level under the Sources tab, and click Apply.
4. File -> Settings -> Compiler -> Java Compiler, select 8 as the target bytecode version, and click Apply.
5. Delete the generated test directory, then create your package and classes under main.
The code is as follows:
**//Spider:**
package org.example.core;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Spider {

    private static CloseableHttpClient httpClient = HttpClients.createDefault();
    private static final int M = 1024 * 1024; // 1 MB buffer size

    // Close any number of resources, swallowing individual failures.
    private static void close(AutoCloseable... closes) {
        for (AutoCloseable close : closes) {
            if (null != close) {
                try {
                    close.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    // 1. Fetch the source of the list page.
    private static String getHtml(String url) {
        String html = null;
        HttpGet httpGet = new HttpGet(url);
        try {
            CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
            HttpEntity entity = httpResponse.getEntity();
            html = EntityUtils.toString(entity);
            // Strip newlines (and following tabs) so the single-line regexes
            // below can match across what used to be multiple lines.
            html = html.replaceAll("\r?\n\t*", "");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }
    // 2. Parse the source and extract the detail links.

    // Strip path separators from a name so it is safe to use as a file name.
    private static String replacePathSign(String name) {
        return name.replaceAll("/|\\\\", "");
    }

    // Collect every match of regex into a map of group(1) -> sanitized group(2).
    private static Map<String, String> parseHtml(String html, String regex) {
        Map<String, String> map = new HashMap<>();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(html);
        int index = 0;
        while (matcher.find(index)) {
            map.put(matcher.group(1), replacePathSign(matcher.group(2)));
            index = matcher.end();
        }
        return map;
    }
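    // With the regexUrl used in Test below, group(1) is the image URL (the
    // lazysrc value minus its .width.height.jpg thumbnail suffix) and group(2)
    // is the alt text, so the map pairs each image URL with the sanitized
    // file name it will be saved under.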
    // 3. Persist the data: download imgUrl and write it to path/name.jpg.
    private static void disk(String imgUrl, String path, String name) {
        BufferedInputStream bis = null;
        BufferedOutputStream bos = null;
        try {
            bis = new BufferedInputStream(
                    httpClient.execute(new HttpGet(imgUrl))
                            .getEntity().getContent(), M);
            bos = new BufferedOutputStream(
                    new FileOutputStream(path + "/" + name + ".jpg"), M);
            byte[] bs = new byte[M];
            int len = -1;
            while (-1 != (len = bis.read(bs))) {
                bos.write(bs, 0, len);
            }
            bos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(bos, bis);
        }
    }
    // Extract the capture group (the text between the first "(" and ")")
    // from the pagination regex.
    private static String getReplace(String regexPage) {
        return regexPage.substring(
                regexPage.indexOf("(") + 1, regexPage.indexOf(")"));
    }

    // Substitute the concrete page number for the \d+ placeholder.
    private static String regexFill(String replace, int page) {
        String[] split = replace.split("\\\\d\\+\\\\");
        return split[0] + page + split[1];
    }
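    // Worked example with the regexPage used in Test below, ".*?(/\d+\.html)":
    // getReplace returns the capture group "/\d+\.html", and
    // regexFill("/\d+\.html", 2) splits on the literal "\d+\" and returns
    // "/2.html", which the crawl loop swaps into the url via replaceFirst.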
    /**
     * Crawls page by page, submitting one download task per page.
     *
     * @param url        page url
     * @param regexTotal regex extracting the total item count
     * @param pageSize   number of items per page
     * @param regexPage  regex locating the page number inside the url
     * @param regxUrl    regex extracting the data items
     * @param path       output directory
     */
    public static void crawl(String url, String regexTotal, int pageSize,
                             String regexPage, String regxUrl, String path) {
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if (pageMatches) { // the url carries a pagination pattern
            replaceRegx = getReplace(regexPage);
            if (matcherTotal.matches()) { // the page reports a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : " + totalSize);
                total = (int) Math.ceil(Double.parseDouble(totalSize) / pageSize);
            }
        } else {
            System.err.println(regexPage + " matches no pagination pattern; parsing as a single page");
        }
        System.out.println("total page : " + total);
        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            final String HTML = html;
            final int PAGE_NO = page;
            // One task per page: parse the page and download all of its images.
            es.submit(() -> {
                Map<String, String> map = parseHtml(HTML, regxUrl);
                if (!map.isEmpty()) {
                    for (Map.Entry<String, String> e : map.entrySet()) {
                        disk(e.getKey(), path, e.getValue());
                    }
                }
                System.out.println("page " + PAGE_NO + " finished");
            });
            if (!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx, regexFill(replaceRegx, page));
            html = getHtml(url);
        } while (page <= total);
        es.shutdown();
    }
    /**
     * Same crawl loop, but submits one download task per image instead of
     * one per page, so individual downloads run in parallel.
     *
     * @param url        page url
     * @param regexTotal regex extracting the total item count
     * @param pageSize   number of items per page
     * @param regexPage  regex locating the page number inside the url
     * @param regxUrl    regex extracting the data items
     * @param path       output directory
     */
    public static void crawl2(String url, String regexTotal, int pageSize,
                              String regexPage, String regxUrl, String path) {
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if (pageMatches) { // the url carries a pagination pattern
            replaceRegx = getReplace(regexPage);
            if (matcherTotal.matches()) { // the page reports a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : " + totalSize);
                total = (int) Math.ceil(Double.parseDouble(totalSize) / pageSize);
            }
        } else {
            System.err.println(regexPage + " matches no pagination pattern; parsing as a single page");
        }
        System.out.println("total page : " + total);
        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            // Parse on the calling thread, then submit one task per image.
            Map<String, String> map = parseHtml(html, regxUrl);
            if (!map.isEmpty()) {
                for (Map.Entry<String, String> e : map.entrySet()) {
                    es.submit(() -> disk(e.getKey(), path, e.getValue()));
                }
            }
            System.out.println("page " + page + " finished");
            if (!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx, regexFill(replaceRegx, page));
            html = getHtml(url);
        } while (page <= total);
        es.shutdown();
    }
}
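The two variants differ only in task granularity: crawl hands each page to the thread pool as a whole (parse plus all downloads in one task), while crawl2 parses every page on the calling thread and submits each image download as its own task, so individual downloads run in parallel. The test below uses crawl2.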
**//Test:**
package org.example.core;

public class Test {
    public static void main(String[] args) {
        String url = "https://www.3gbizhi.com/tag/dongman/1.html";
        // captures the total item count from the pager link
        String regexTotal = ".*?<a class=\"a1\">(.*?)条</a>.*?";
        // captures the pagination fragment of the url, e.g. "/1.html"
        String regexPage = ".*?(/\\d+\\.html)";
        //String regexUrl = "<img src=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\">";
        // captures (image url, alt text) pairs from the lazy-loaded <img> tags
        String regexUrl = "<img lazysrc=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\".*?/>";
        String path = "E:\\123\\spider_path";
        Spider.crawl2(url, regexTotal, 20, regexPage, regexUrl, path);
    }
}
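One caveat the code does not handle: disk() opens the target file with FileOutputStream, which does not create missing directories, so the run fails with FileNotFoundException unless E:\123\spider_path already exists. A minimal guard (my addition, not in the original) at the top of main():

new java.io.File(path).mkdirs(); // create the output directory if absent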