Java爬虫


简介:之前玩过 Python 爬虫,但不太想再继续学 Python 了;爬虫本身的概念这里就不再概述了。

HttpClient

入门程序

  1. 创建工程,引入依赖
	<dependencies>
        <!-- Apache HttpClient 4.x: the HTTP client library used by the request examples below -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.13</version>
        </dependency>

        <!-- SLF4J binding for log4j 1.2; log output is configured through log4j.properties -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.6.6</version>
        </dependency>
    </dependencies>
  2. 在 resources 目录下添加日志配置文件 log4j.properties
# Root logger: DEBUG level, output sent to the appender named A1.
log4j.rootLogger=DEBUG,A1
# Loggers under the com.zhj package are set to DEBUG as well.
log4j.logger.com.zhj = DEBUG

# A1 writes to the console using a pattern layout.
log4j.appender.A1 = org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout = org.apache.log4j.PatternLayout
# Line format: timestamp [thread] [logger name]-[level] message, followed by a line break.
log4j.appender.A1.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [%t] [%c]-[%p] %m%n
  3. 程序
/**
 * Minimal HttpClient example: issues one GET request and prints the response body.
 */
public class CrawlerFirst {
    /**
     * Requests http://www.itcast.cn and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // The request to send (the "type the address" step).
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // try-with-resources closes the response first and then the client, so the
        // connection is released even if reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only read the body for a successful response.
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String content = EntityUtils.toString(entity, "utf8");
                System.out.println(content);
            }
        }
    }
}

HttpGet

/**
 * GET example with explicit error handling and resource cleanup.
 */
public class HttpGetTest {
    /**
     * Requests http://www.baidu.com and prints the body when the status code is 200.
     * I/O failures are reported with a stack trace instead of being propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpGet httpGet = new HttpGet("http://www.baidu.com");
        // try-with-resources replaces the manual finally block: the original called
        // response.close() unconditionally, which throws a NullPointerException when
        // execute() fails before a response has been assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

带参数Get

/**
 * GET example that builds the query string with {@code URIBuilder}.
 */
public class CrawlerFirst {
    /**
     * Requests http://www.baidu.com/s with the {@code wd} and {@code ie} query
     * parameters and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built or the request fails
     */
    public static void main(String[] args) throws Exception {
        // Let URIBuilder assemble the query parameters of the GET request.
        URIBuilder uriBuilder = new URIBuilder("http://www.baidu.com/s");
        uriBuilder.setParameter("wd","许龄月");
        uriBuilder.setParameter("ie","UTF-8");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // try-with-resources closes the response and then the client, so the
        // connection is released even if reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only read the body for a successful response.
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String content = EntityUtils.toString(entity, "utf8");
                System.out.println(content);
            }
        }
    }
}

HttpPost

// Build a POST request for the target URL; no request entity is attached here.
HttpPost httpPost = new HttpPost("http://www.baidu.com");

HttpPost 带参数

/**
 * POST example that submits form parameters.
 */
public class HttpPostTest {
    /**
     * Posts the form field {@code keys=java} to http://yun.itheima.com/search and
     * prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the form cannot be encoded or the request fails
     */
    public static void main(String[] args) throws Exception {
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Wrap the form fields of the POST request.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("keys","java"));
        // Encode the form explicitly as UTF-8 so non-ASCII values are sent intact
        // instead of relying on the library's default charset.
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(list, "UTF-8");
        httpPost.setEntity(urlEncodedFormEntity);

        // try-with-resources closes the response and then the client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        }
    }
}


连接池

/**
 * Connection-pool example: two requests share one {@code PoolingHttpClientConnectionManager}.
 */
public class HttpPostPoolTest {
    /**
     * Creates the pool, runs two requests through it and shuts the pool down.
     *
     * @param args unused
     * @throws Exception if a request fails
     */
    public static void main(String[] args) throws Exception {
        // Closing the manager at the end shuts the pool down and frees its connections.
        try (PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager()) {
            // Maximum number of connections in the whole pool.
            cm.setMaxTotal(100);
            // Maximum number of connections per host (route).
            cm.setDefaultMaxPerRoute(10);
            doGet(cm);
            doGet(cm);
        }
    }

    /**
     * Posts the form field {@code keys=java} using a client backed by the given pool
     * and prints the length of the response body when the status code is 200.
     *
     * @param cm the shared connection manager
     * @throws Exception if the form cannot be encoded or the request fails
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        // Do NOT close this client: it is backed by the shared pool, which main() owns.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        System.out.println(httpClient);
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Wrap the form fields of the POST request.
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("keys","java"));
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(list);
        httpPost.setEntity(urlEncodedFormEntity);

        // The response must always be closed; otherwise a response whose body is never
        // read (any non-200 status here) keeps its connection leased from the pool.
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        }
    }
}

请求参数

	// Per-request timeout settings (all values in milliseconds).
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(1000) // maximum time to establish the connection
                .setConnectionRequestTimeout(500) // maximum time to wait for a connection from the connection manager
                .setSocketTimeout(10 * 1000) // maximum time to wait for data during transfer
                .build();
        httpPost.setConfig(requestConfig);

JSoup

java的html解析工具
新增依赖

  <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.11</version>
        </dependency>
  1. 解析URL
  2. 解析String
  3. 解析文件
/**
 * Shows the three ways of obtaining a Jsoup {@code Document} used in this article:
 * from a URL, from an HTML string, and from a file.
 */
public class JsoupParse {

    /** Prints the text of the first {@code <title>} element of the given document. */
    private static void printTitle(Document page) {
        System.out.println(page.getElementsByTag("title").first().text());
    }

    /** Parses a page fetched from a URL; the second argument is the timeout in milliseconds. */
    @Test
    public void testUrl () throws Exception {
        URL address = new URL("http://www.itcast.cn");
        printTitle(Jsoup.parse(address, 1000));
    }

    /** Reads a local HTML file into a string and parses that string. */
    @Test
    public void testString () throws Exception {
        File source = new File("/Users/mac/IdeaProjects/spring-demos/czbk.html");
        String html = FileUtils.readFileToString(source, "utf8");
        printTitle(Jsoup.parse(html));
    }

    /** Parses a local HTML file directly, decoding it with the given charset. */
    @Test
    public void testFile () throws Exception {
        File source = new File("/Users/mac/IdeaProjects/spring-demos/czbk.html");
        printTitle(Jsoup.parse(source, "utf8"));
    }
}

通过标签获取

通过Css获取

document.select()
常用方法:
tagname:通过标签查找元素,比如:span
#id:通过ID查找元素,比如:#city_bj
.class: 通过class名称查找元素,比如 : .class_a
[attribute]:利用属性查找元素,比如:[abc]
[attr=value]:利用属性值查找元素,比如:[class=s_name]
组合标签的使用
el#id: 元素+id,比如h3#city_bj
el.class:元素+class
el[attr]:元素+属性名
任意组合:比如:span[abc].s_name
ancestor child:查找某个元素下的所有后代元素(不限层级)
parent > child: 查找某个父元素下的直接子元素
.city_con > ul > li: 查找city_con下第一级的ul,再找所有ul下第一级的li
parent > *:查找某个父元素下的所有直接子元素

案例工具类

HttpUtils

/**
 * HTTP helper backed by one pooled connection manager: downloads pages as text
 * and images to disk.
 */
@Component
public class HttpUtils {
    private final PoolingHttpClientConnectionManager cm;

    /** Creates the connection pool shared by every request made through this helper. */
    public HttpUtils(){
        this.cm=new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100); // maximum connections in the whole pool
        this.cm.setDefaultMaxPerRoute(10); // maximum connections per host (route)
    }

    /**
     * Downloads a page and returns its body as text.
     *
     * @param url address of the page
     * @return the body decoded as UTF-8, or an empty string when the status is not
     *         200, the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url){
        // The client is not closed here: it is backed by the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        // try-with-resources only closes a response that was actually obtained; the
        // original finally block dereferenced a null response when execute() failed.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    return EntityUtils.toString(entity, "utf8");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads an image and stores it under a randomly generated file name.
     *
     * @param url address of the image
     * @return the generated file name, or an empty string when the status is not
     *         200, the response has no body, or an I/O error occurs
     */
    public String doGetImage(String url){
        // The client is not closed here: it is backed by the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Take the extension only from the last path segment, so a URL whose
                    // last segment has no dot does not yield an "extension" containing '/'.
                    int dot = url.lastIndexOf(".");
                    int slash = url.lastIndexOf("/");
                    String extName = dot > slash ? url.substring(dot) : "";
                    String picName = UUID.randomUUID().toString() + extName;
                    // NOTE(review): the name is appended to ".../img" without a separator, so
                    // files land next to an "img" prefix rather than inside an img/ directory
                    // - confirm which one is intended.
                    try (FileOutputStream fileOutputStream = new FileOutputStream(new File("/Users/mac/Desktop/img" + picName))) {
                        entity.writeTo(fileOutputStream);
                    }
                    return picName;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Builds the timeout settings (in milliseconds) applied to every request. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000) // establishing the connection
                .setConnectionRequestTimeout(500) // obtaining a connection from the pool
                .setSocketTimeout(10 * 1000) // waiting for data
                .build();

    }
}

定时任务的使用

  1. 启动类上 加 @EnableScheduling
  2. 定时任务类 加上 @Component
  3. 定时方法
/**
 * Scheduled crawl job: downloads the first nine listing pages and prints the book
 * names found on each of them.
 */
@Component
public class ItemTask {

    @Autowired
    HttpUtils httpUtils;

    /**
     * Crawls pages 1 to 9 of the listing. Scheduled with a fixed delay of 10 seconds.
     */
    @Scheduled(fixedDelay = 10*1000)
    public void itemTask() {
//        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=6ee395300307469ab7f60f1c5a2a7365&s=116&click=0&page";
        for (int page = 1; page < 10; page++) {
            // Build the URL per page. The original built it once with page = 1 and then
            // appended the loop index, requesting "..._1.html1", "..._1.html2", ...
            String url = "http://www.shuhai.com/shuku/0_113_0_0_0_2_0_" + page + ".html";
            String html = httpUtils.doGetHtml(url);
            this.parse(html);
        }
    }

    /**
     * Prints the downloaded HTML and then the book name of every list entry in it.
     *
     * @param html page source
     */
    private void parse(String html) {
        Document document = Jsoup.parse(html);
        System.out.println(html);
        Elements elements = document.select("body > div.content > div > div:nth-child(1) > div.c8.shadow.bgfff > div.book-list-wrapper > div");
        for (int i = 0; i < elements.size(); i++) {
            Elements select = elements.get(i).select("div > div.flex > div.book-name > a");
            // Print this entry's name; the original printed the text of the whole
            // list on every iteration and never used the per-entry selection.
            System.out.println(select.text());
        }
    }

    // Other scheduling styles, kept as commented-out examples:
    //
    // // Runs again 5 seconds after the previous run has finished.
    // @Scheduled(fixedDelay = 5000)
    // public void fixedDelayJob() throws InterruptedException {
    //     System.out.println("fixedDelay 每隔5秒" + new Date());
    // }
    //
    // // Runs every 3 seconds.
    // @Scheduled(fixedRate = 3000)
    // public void fixedRateJob() {
    //     System.out.println("fixedRate 每隔3秒" + new Date());
    // }
    //
    // // Runs every day at 08:30:00.
    // @Scheduled(cron = "0 30 8 * * ?")
    // public void cronJob() {
    //     System.out.println(new Date() + " ...>>cron....");
    // }
}

解析之后正常保存到数据库即可

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值