2021-05-22

Writing a Crawler in a Maven Project

Create New Project → Maven → check "Create from archetype" → select the maven-archetype-quickstart archetype → create the project, then click Enable Auto-Import when prompted.
1. In pom.xml, change the 1.7 under <properties> to 1.8 (both maven.compiler.source and maven.compiler.target).
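
The relevant block after the edit looks like this (a minimal sketch; the quickstart archetype may generate additional properties such as the project encoding):

```xml
<properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>
```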

2. Add the HttpClient dependency to pom.xml under <dependencies>:

```xml
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
```

After adding it, the dependency appears under the project's external libraries.

3. Open Project Structure, select Modules on the left, and under Sources set the language level to 8, then click Apply.
4. File → Settings → Compiler → Java Compiler, select 8, and click Apply.
5. Delete the test directory generated under the project, then create the package and classes under main.
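
With the standard Maven layout, the classes used below end up at:

```
src/main/java/org/example/core/Spider.java
src/main/java/org/example/core/Test.java
```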

The code is as follows:

Spider.java:

```java
package org.example.core;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Spider {
    // Shared HTTP client (the default pooling client is safe for concurrent use)
    private static final CloseableHttpClient httpClient = HttpClients.createDefault();
    // 1 MiB buffer size for downloads
    private static final int M = 1024*1024;

    // Close any number of resources, ignoring nulls
    private static void close(AutoCloseable...closes){
        for (AutoCloseable close : closes) {
            if (null!=close) {
                try {
                    close.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    // 1. Fetch the source of a list page
    private static String getHtml(String url){
        String html = null;
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            HttpEntity entity = httpResponse.getEntity();
            html = EntityUtils.toString(entity);
            // Flatten the page onto one line so the regexes can match across line breaks
            html = html.replaceAll("\r?\n\t*","");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }

    // 2. Parse the source and extract the detail links
    // Strip path separators from names so they are safe as file names
    private static String replacePathSign(String name){
        return name.replaceAll("/|\\\\","");
    }

    // Collect every (url -> name) pair the regex matches in the page source
    private static Map<String,String> parseHtml(String html, String regex){
        Map<String,String> map = new HashMap<>();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(html);
        int index = 0;
        while (matcher.find(index)) {
            map.put(matcher.group(1),replacePathSign(matcher.group(2)));
            index = matcher.end();
        }
        return map;
    }

    // 3. Persist the data: download the image and write it to disk
    // (the output directory given by path must already exist)
    private static void disk(String imgUrl,String path,String name){
        BufferedInputStream bis = null;
        BufferedOutputStream bos = null;
        try {
            bis = new BufferedInputStream(
                    httpClient.execute(new HttpGet(imgUrl))
                            .getEntity().getContent(),M);
            bos = new BufferedOutputStream(
                    new FileOutputStream(path+"/"+name+".jpg"),M);
            byte[] bs = new byte[M];
            int len = -1;
            while (-1 != (len = bis.read(bs))) {
                bos.write(bs,0,len);
            }
            bos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(bos,bis);
        }
    }

    // Extract the capture-group body from regexPage, e.g. ".*?(/\d+\.html)" -> "/\d+\.html"
    private static String getReplace(String regexPage){
        return regexPage.substring(
                regexPage.indexOf("(")+1,regexPage.indexOf(")"));
    }

    // Splice a page number into the pagination pattern, e.g. "/\d+\.html" + 2 -> "/2.html"
    private static String regexFill(String replace,int page){
        String[] split = replace.split("\\\\d\\+\\\\");
        return split[0]+page+split[1];
    }

    /**
     * @param url           list page URL
     * @param regexTotal    regex capturing the total item count
     * @param pageSize      number of items per page
     * @param regexPage     regex whose capture group marks the page number in the URL
     * @param regexUrl      regex capturing each data item (image URL and name)
     * @param path          output directory
     */
    public static void crawl(String url,String regexTotal,int pageSize,String regexPage,String regexUrl,String path){
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if(pageMatches){ // the URL contains a pagination pattern
            replaceRegx = getReplace(regexPage);
            if(matcherTotal.matches()) { // the page exposes a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : "+totalSize);
                total = (int)(Math.ceil(Double.parseDouble(totalSize)/pageSize));
            }
        }else{
            System.err.println(regexPage+" does not match a pagination pattern; parsing as a single page");
        }

        System.out.println("total page : "+total);

        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            final String HTML = html;
            final int PAGE_NO = page;
            es.submit(()->{
                Map<String, String> map = parseHtml(HTML, regexUrl);
                if (!map.isEmpty()) {
                    for (Map.Entry<String, String> e : map.entrySet()) {
                        disk(e.getKey(), path, e.getValue());
                    }
                }
                System.out.println("page "+PAGE_NO+" finished");
            });
            if(!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx,regexFill(replaceRegx,page));
            html = getHtml(url);
        }while (page<=total);
        es.shutdown();
    }

    /**
     * @param url           list page URL
     * @param regexTotal    regex capturing the total item count
     * @param pageSize      number of items per page
     * @param regexPage     regex whose capture group marks the page number in the URL
     * @param regexUrl      regex capturing each data item (image URL and name)
     * @param path          output directory
     */
    public static void crawl2(String url,String regexTotal,int pageSize,String regexPage,String regexUrl,String path){
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if(pageMatches){ // the URL contains a pagination pattern
            replaceRegx = getReplace(regexPage);
            if(matcherTotal.matches()) { // the page exposes a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : "+totalSize);
                total = (int)(Math.ceil(Double.parseDouble(totalSize)/pageSize));
            }
        }else{
            System.err.println(regexPage+" does not match a pagination pattern; parsing as a single page");
        }

        System.out.println("total page : "+total);

        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            final String HTML = html;
            final int PAGE_NO = page;
            Map<String, String> map = parseHtml(HTML, regexUrl);
            if (!map.isEmpty()) {
                for (Map.Entry<String, String> e : map.entrySet()) {
                    es.submit(()->disk(e.getKey(), path, e.getValue()));
                }
            }
            System.out.println("page "+PAGE_NO+" finished");
            if(!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx,regexFill(replaceRegx,page));
            html = getHtml(url);
        }while (page<=total);
        es.shutdown();
    }
}
```
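
Note the difference between the two variants: crawl submits the parsing and downloading of an entire page as one task, while crawl2 parses each page on the calling thread and submits every image download as its own task, which spreads the downloads more evenly across the pool.

To sanity-check an extraction regex before running a full crawl, a snippet like this helps. The HTML line is a hypothetical sample shaped like the target markup (thumbnail URLs that embed the full-size URL plus a ".width.height.jpg" suffix); it is not captured from the site:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexCheck {
    public static void main(String[] args) {
        // Hypothetical list-page line, shaped like what regexUrl expects
        String html = "<img lazysrc=\"https://pic.example.com/abc.jpg.278.154.jpg\" alt=\"Sample Title\"/>";
        String regexUrl = "<img lazysrc=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\".*?/>";
        Matcher m = Pattern.compile(regexUrl).matcher(html);
        while (m.find()) {
            // group 1 is the full-size URL, group 2 the display name
            System.out.println(m.group(1) + " -> " + m.group(2));
        }
        // prints: https://pic.example.com/abc.jpg -> Sample Title
    }
}
```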

Test.java:

```java
package org.example.core;

public class Test {

    public static void main(String[] args) {
        String url = "https://www.3gbizhi.com/tag/dongman/1.html";
        // captures the total item count from markup like: <a class="a1">NNN条</a>
        String regexTotal = ".*?<a class=\"a1\">(.*?)条</a>.*?";
        // the capture group marks the page-number segment of the URL
        String regexPage = ".*?(/\\d+\\.html)";
        //String regexUrl = "<img src=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\">";
        // group 1: full-size image URL (thumbnail suffix stripped), group 2: title
        String regexUrl = "<img lazysrc=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\".*?/>";
        String path = "E:\\123\\spider_path";
    }
}
```
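
The pagination rewrite in crawl/crawl2 can be puzzling at first: getReplace pulls the capture-group body out of regexPage, and regexFill splices the next page number into it. Here is a minimal standalone sketch of that mechanism (the class name is mine; the logic mirrors the private helpers above):

```java
public class PaginationDemo {
    public static void main(String[] args) {
        String url = "https://www.3gbizhi.com/tag/dongman/1.html";
        String regexPage = ".*?(/\\d+\\.html)";
        // getReplace: take the capture-group body, here "/\d+\.html"
        String replace = regexPage.substring(
                regexPage.indexOf("(") + 1, regexPage.indexOf(")"));
        // regexFill: split on the literal "\d+\" and splice in the page number
        String[] split = replace.split("\\\\d\\+\\\\");
        String next = split[0] + 2 + split[1];  // "/2.html"
        // rewrite the URL to point at page 2
        System.out.println(url.replaceFirst(replace, next));
        // prints: https://www.3gbizhi.com/tag/dongman/2.html
    }
}
```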