Web Crawler Study Notes: HttpClient + Jsoup + WebMagic

Supplement: links to related crawler documentation notes

I. Getting Started

1. HttpClient for Network Requests

1. Add the dependency

<dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpclient</artifactId>
       <version>4.5.10</version>
</dependency>

2. A first look at a web crawler

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class testdemo {
    public static void main(String[] args) throws Exception {
        //Example 1: getting started — send a GET request and fetch the raw page
        /*CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
        URIBuilder uriBuilder = new URIBuilder("http://www.itcast.cn/search");
        uriBuilder.setParameter("keys","java").setParameter("","");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        //HttpPost httpPost = new HttpPost(uriBuilder.build());
        CloseableHttpResponse response = closeableHttpClient.execute(httpGet);
        if(response.getStatusLine().getStatusCode()==200){
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
            //close the connection
            response.close();
            closeableHttpClient.close();
        }*/

        //Example 2: simulate a form POST request
        /*CloseableHttpClient httpClient= HttpClients.createDefault();
        HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");
        ArrayList<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keys","java"));
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params,"utf-8");
        httpPost.setEntity(urlEncodedFormEntity);
        CloseableHttpResponse response = httpClient.execute(httpPost);
        if(response.getStatusLine().getStatusCode()==200){
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
        }*/

        //Example 3: pooled connection manager
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(10);
        doGet(pool);
    }

    private static void doGet(PoolingHttpClientConnectionManager pool) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if(response.getStatusLine().getStatusCode()==200){
                HttpEntity entity = response.getEntity();
                String s = EntityUtils.toString(entity);
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if (response!=null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            //note: the pooled httpClient is not closed here; closing it would shut down the shared connection manager
        }

    }
}

2. Jsoup for Page Parsing

1. Add the dependency

 <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.2</version>
</dependency>

2. A small demo

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URL;

public class JsoupDemo {
    public static void main(String[] args) throws Exception {
        //Jsoup.parse() can take a file, a string, or a URL and turn any of them into a Document
        Document document = Jsoup.parse(new URL("http://47.97.200.76/login"), 10000);
        String title = document.getElementsByTag("title").first().text();
        System.out.println("title=========>" + title);
    }
}
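As the comment above notes, Jsoup can also build a Document from an HTML string or a local file. A minimal sketch, assuming a hypothetical local file path:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.File;

public class JsoupParseDemo {
    public static void main(String[] args) throws Exception {
        //parse an in-memory HTML string
        Document fromString = Jsoup.parse("<html><head><title>demo</title></head><body><p>hello</p></body></html>");
        System.out.println("string title=========>" + fromString.title());

        //parse a local file (the path is only an example), specifying the charset
        Document fromFile = Jsoup.parse(new File("D:/test.html"), "utf-8");
        System.out.println("file title=========>" + fromFile.title());
    }
}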

//el#id: element + ID, e.g. h3#city_bj
String str = document.select("h3#city_bj").text();

//el.class: element + class, e.g. li.class_a
str = document.select("li.class_a").text();

//el[attr]: element + attribute name, e.g. span[abc]
str = document.select("span[abc]").text();

//any combination, e.g. span[abc].s_name
str = document.select("span[abc].s_name").text();

//ancestor child: find descendants of an element, e.g. .city_con li finds all li under "city_con"
str = document.select(".city_con li").text();

//parent > child: find the direct children of a parent element,
//e.g. .city_con > ul > li first finds the ul that is a direct child of city_con, then all li directly under that ul
str = document.select(".city_con > ul > li").text();

//parent > *: find all direct children of a parent, e.g. .city_con > *
str = document.select(".city_con > *").text();

II. Mini Project: Crawling JD Phone Data

1. Fetch the data with HttpClient
2. Parse the data with Jsoup
3. Store the data
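A rough end-to-end sketch of these three steps, assuming a hypothetical search URL and selectors (the real JD page structure may differ, and the full project stores results in a database instead of printing):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JdPhoneCrawlerSketch {
    public static void main(String[] args) throws Exception {
        //1. fetch the page with HttpClient (the URL is illustrative only)
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://search.jd.com/Search?keyword=phone&enc=utf-8");
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "utf-8");

                //2. parse the page with Jsoup (the selectors are guesses at the list markup)
                Document document = Jsoup.parse(html);
                for (Element item : document.select("ul.gl-warp > li")) {
                    String title = item.select("div.p-name em").text();
                    String price = item.select("div.p-price i").text();

                    //3. store the data (printed here; a real version would save to a database)
                    System.out.println(title + " => " + price);
                }
            }
        } finally {
            httpClient.close();
        }
    }
}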

III. Learning WebMagic

1. Basic Concepts

(WebMagic architecture diagrams omitted)

2. Examples

1. Crawling attribute values from page elements

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

public class WebMagicTest implements PageProcessor {
    private Site site = Site.me()
            .setTimeOut(1000*10)//request timeout, in ms
            .setCharset("utf8")//page charset
            .setRetrySleepTime(1000*3)//how long to wait before retrying a failed request
            .setSleepTime(3);//interval between requests, in ms
    public static void main(String[] args) {
        Spider.create(new WebMagicTest())
                //.addPipeline(new FilePipeline("E:\\STUDY\\study\\jsoup-crawlers\\src\\main\\resources\\static"))//saving results to a file needs an explicit pipeline; the default is printing to the console
                .addUrl("http://ace.piesat.cn/login.xhtml")
                .thread(3)//use three threads
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))//use a Bloom filter as the duplicate remover
                .run();
    }

    @Override
    public void process(Page page) {
        //1. CSS selector: get the <div> elements inside the div whose class is "ios_qrcode"
        page.putField("div1", page.getHtml().css("div.ios_qrcode div").all());

        //2. XPath: get the <div> whose id is "copyright"
        //page.putField("div2", page.getHtml().xpath("//div[@id=copyright]").toString());

        //3. Regular expression: inside div#copyright, keep only the parts matching ".*Team.*"
        page.putField("div3",page.getHtml().css("div#copyright").regex(".*Team.*").all());

        //4. Get the hyperlinks inside a given element
        page.putField("div4",page.getHtml().css("div#forget_pass").links().all());

        //5. Add follow-up URLs to the crawl queue
//        page.addTargetRequests( page.getHtml().css("div#forget_pass").links().all());
//        page.addTargetRequest("http://ace.piesat.cn/login.xhtml");
    }

    @Override
    public Site getSite() {
        return site;
    }
}

Note: there are three kinds of duplicate removers: a HashSet, a Redis set, and the Bloom filter.
A short note on Bloom filters. The URLs and data fetched while crawling may repeat, so duplicates have to be filtered out; the Bloom filter does this with roughly one tenth of the memory a HashSet needs. In outline:
(1) A Bloom filter is a bit array. Each URL is run through hash functions that produce a handful of positions, and those positions in the bit array are marked (this is also where it differs from a HashSet: a HashSet stores the whole string, where one character = one byte = 8 bits, while the Bloom filter's encoding of the URL takes far less space).
(2) If all of an element's positions are already marked 1, the element is treated as having been seen before.
(3) The price is that this can occasionally misjudge a new element as a duplicate (a false positive).
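A minimal sketch of the idea using a plain BitSet and two home-made hash functions (the sizes and hash choices are arbitrary and purely illustrative; WebMagic's BloomFilterDuplicateRemover builds on Guava's BloomFilter, which is why the guava dependency shows up in the pom below):

import java.util.BitSet;

public class SimpleBloomFilter {
    private final BitSet bits;
    private final int size;

    public SimpleBloomFilter(int size) {
        this.size = size;
        this.bits = new BitSet(size);
    }

    //map one string to a bit position, using a different seed per hash function
    private int hash(String value, int seed) {
        int h = 0;
        for (int i = 0; i < value.length(); i++) {
            h = seed * h + value.charAt(i);
        }
        return Math.abs(h % size);
    }

    //returns true if the url was (probably) seen before; otherwise records it
    public boolean isDuplicate(String url) {
        int p1 = hash(url, 31);
        int p2 = hash(url, 131);
        boolean seen = bits.get(p1) && bits.get(p2);
        bits.set(p1);
        bits.set(p2);
        return seen;
    }

    public static void main(String[] args) {
        SimpleBloomFilter filter = new SimpleBloomFilter(1 << 20);
        System.out.println(filter.isDuplicate("http://www.itcast.cn")); //false: first time seen
        System.out.println(filter.isDuplicate("http://www.itcast.cn")); //true: already recorded
    }
}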

2. Crawling computer-software-industry job listings from a job site into MySQL

1. Create the table

CREATE TABLE `job_info`  (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'primary key id',
  `company_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'company name',
  `company_addr` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'company contact/address',
  `company_info` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT 'company info',
  `job_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'job title',
  `job_addr` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'work location',
  `job_info` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT 'job description',
  `salary_min` int(10) NULL DEFAULT NULL COMMENT 'salary range, minimum',
  `salary_max` int(10) NULL DEFAULT NULL COMMENT 'salary range, maximum',
  `url` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'job posting detail page',
  `time` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'latest posting time',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 5826 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'job postings' ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

2. Set up a clean Spring Boot project and add the following dependencies to the pom

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.10</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.5</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.5</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.0.1-jre</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
            <version>2.2.4.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.25</version>
        </dependency>

3. Write the following into the configuration file:

spring.profiles.active=dev
logging.level.root=info
logging.config=classpath:logback-spring.xml
debug=true
logging.level.org.springframework.boot.autoconfigure=ERROR
#DB Configuration:
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&characterEncoding=UTF-8&serverTimezone=GMT%2B8&zeroDateTimeBehavior=convertToNull&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root
#JPA Configuration:
spring.jpa.database=MySQL
spring.jpa.show-sql=true

4. Add the @EnableScheduling annotation to the startup class to enable scheduled tasks
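For example (the class name is arbitrary; any Spring Boot main class works):

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling //enables @Scheduled methods, such as the crawl task created later
public class CrawlerApplication {
    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }
}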
5. Create the entity class, DAO, and service

import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
@Entity
public class JobInfo{
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    private String companyName;
    private String companyAddr;
    private String companyInfo;
    private String jobName;
    private String jobAddr;
    private String jobInfo;
    private Integer salaryMin;
    private Integer salaryMax;
    private String url;
    private String time;
    public JobInfo() {
    }
    public JobInfo(String url) {
        this.url = url;
    }
}//getters and setters omitted; add them yourself
import com.furenqaing.jsoupcrawlers.entity.JobInfo;
import org.springframework.data.jpa.repository.JpaRepository;
public interface JobInfoDao extends JpaRepository<JobInfo,Long> {
}
import com.furenqaing.jsoupcrawlers.entity.JobInfo;
import java.util.List;
public interface JobInfoService {
    public void save(JobInfo jobInfo);
    public List<JobInfo> findJobInfo(JobInfo jobInfo);
}
import com.alibaba.fastjson.JSON;
import com.furenqaing.jsoupcrawlers.dao.JobInfoDao;
import com.furenqaing.jsoupcrawlers.entity.JobInfo;
import com.furenqaing.jsoupcrawlers.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
@Service
public class JobInfoServiceImpl implements JobInfoService {
    @Autowired
    JobInfoDao jobInfoDao;
    @Override
    @Transactional
    public void save(JobInfo jobInfo) {
        Example<JobInfo> example = Example.of(new JobInfo(jobInfo.getUrl()));
        long count = jobInfoDao.count(example);
        if(count==0){
            jobInfoDao.saveAndFlush(jobInfo);
            System.out.println("mysql存入数据:"+ JSON.toJSONString(jobInfo));
        }
    }
    @Override
    public List<JobInfo> findJobInfo(JobInfo jobInfo) {
        Example<JobInfo> example = Example.of(jobInfo);
        List<JobInfo> jobInfoList = jobInfoDao.findAll(example);
        return jobInfoList;
    }
}

6. Create a utility class to handle salary strings in the "20-25k" format

public class SalaryUtil {
    //turns a salary string such as "20-25k" or "1-2w·13薪" into "min,max" in yuan
    public static String computeSalary(String salary){
        salary = salary.split("·")[0];                                 //drop any "·13薪"-style suffix
        String[] split = salary.split("-");
        if (split.length < 2){
            return "0,0";                                              //not a parseable range
        }
        String unit = split[1].substring(split[1].length() - 1);       //"k" or "w"
        String maxStr = split[1].substring(0, split[1].length() - 1);  //upper bound without the unit
        Integer min = Integer.valueOf(split[0]);
        Integer max = Integer.valueOf(maxStr);
        switch (unit){
            case "k":
                min=min*1000;
                max=max*1000;
                break;
            case "w":
                min=min*10000;
                max=max*10000;
                break;
        }
        return min+","+max;
    }
}
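A quick sanity check of the helper (the inputs mimic salary strings seen on the job site):

public class SalaryUtilTest {
    public static void main(String[] args) {
        System.out.println(SalaryUtil.computeSalary("20-25k"));     //20000,25000
        System.out.println(SalaryUtil.computeSalary("1-2w·13薪"));  //10000,20000
        System.out.println(SalaryUtil.computeSalary("面议"));       //0,0 (no parseable range)
    }
}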

7. Create the storage pipeline

import com.furenqaing.jsoupcrawlers.entity.JobInfo;
import com.furenqaing.jsoupcrawlers.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
@Component
public class SpringDataPipeline implements Pipeline {
    @Autowired
    JobInfoService jobInfoService;
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo==null){
            return;
        }
        this.jobInfoService.save(jobInfo);
    }
}

8. Create a task package and a JobInfoTask class to handle the WebMagic logic

import com.furenqaing.jsoupcrawlers.entity.JobInfo;
import com.furenqaing.jsoupcrawlers.utils.SalaryUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
@Component
public class JobInfoTask implements PageProcessor {
    @Autowired
    SpringDataPipeline springDataPipeline;
    private Site site = Site.me()
            .setTimeOut(1000*60)//request timeout, in ms
            .setCharset("utf8")//page charset
            .setRetrySleepTime(1000*3)//how long to wait before retrying a failed request
            .setSleepTime(3);//interval between requests, in ms
    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml().css("div.left-list-box ul li").nodes();
        if (nodes.size()==0){
            saveJobInfo(page);
        }else {
            for (Selectable node : nodes) {
                String url = node.links().toString();
                System.out.println("=========>"+url);
                //add this url to the crawl queue
                page.addTargetRequest(url);
            }
        }
    }
    private void saveJobInfo(Page page) {
        Html html = page.getHtml();
        String jobName = html.css("div.name-box span", "text").toString();
        String companyName = html.css("div.title-box span a", "text").toString();
        if (StringUtils.isEmpty(companyName)){
            companyName = Jsoup.parse(html.css("div.title-box span").all().toString()).getElementsByTag("span").get(1).text();
        }
        companyName = companyName.replace("·", "").replace(" ", "");
        String salary = html.css("div.name-box span.salary", "text").toString();
        String salaryStr = SalaryUtil.computeSalary(salary);
        String[] salarys = salaryStr.split(",");
        String url = page.getUrl().toString();
        JobInfo jobInfo = new JobInfo();
        jobInfo.setSalaryMin(Integer.valueOf(salarys[0]));
        jobInfo.setSalaryMax(Integer.valueOf(salarys[1]));
        jobInfo.setCompanyName(companyName);
        jobInfo.setJobName(jobName);
        jobInfo.setUrl(url);
        //put into WebMagic's ResultItems; the Pipeline reads it from there
        page.putField("jobInfo",jobInfo);
    }
    @Override
    public Site getSite() {
        return this.site;
    }
    //initialDelay: delay before the first run after startup; fixedDelay: interval between runs
    @Scheduled(initialDelay = 1000,fixedDelay = 1000*10)
    public void process(){
        Spider.create(new JobInfoTask())
                .addUrl("https://www.liepin.com/zhaopin/?key=java&pubTime=1&eduLevel=040")
                .thread(5)//use five threads
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))//use a Bloom filter as the duplicate remover
                .addPipeline(springDataPipeline)//add the output pipeline
                .run();
    }
}

9. Execution result
(screenshot of the console output omitted)

3. Crawling all POI data from a website into Elasticsearch

The core PageProcessor crawling code; the rest differs little from the MySQL example:

import com.alibaba.fastjson.JSON;
import com.furenqaing.jsoupcrawlers.entity.GeoCodeEntity;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.elasticsearch.core.geo.GeoPoint;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.List;

@Component
public class GeoPoiTask implements PageProcessor {

    @Autowired
    SpringDataPipeline springDataPipeline;

    private Site site = Site.me()
            .setTimeOut(1000 * 60)//request timeout, in ms
            .setCharset("utf8")//page charset
            .setRetrySleepTime(1000 * 3)//how long to wait before retrying a failed request
            .setSleepTime(3);//interval between requests, in ms

    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml().css("div.wrap-bg div.container div.row ul li").nodes();
        if (nodes.size() == 0) {
            saveGeoInfo(page);
        } else {
            for (Selectable node : nodes) {
                String url = node.links().toString();
                String cityName = node.css("a", "text").toString();
                System.out.println("爬取的城市:" + cityName+"-地址:"+url);
                PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
                pool.setMaxTotal(10);
                List<String> urls = getUrls(pool, url);
                for (String s : urls) {
                    page.addTargetRequest(s);//add this url to the crawl queue
                }
            }
        }
    }

    private void saveGeoInfo(Page page) {
        Html html = page.getHtml();
        List<Selectable> nodestr = html.css("div.data-wrap tbody tr").nodes();
        List<GeoCodeEntity> geoCodeEntitys = new ArrayList<>();
        for (Selectable nodetr : nodestr) {
            List<Selectable> nodestd = nodetr.css("td").nodes();
            String text = nodestd.get(1).css("td", "text").toString();
            String province = nodestd.get(2).css("td", "text").toString();
            String city = nodestd.get(3).css("td", "text").toString();
            String counties = nodestd.get(4).css("td", "text").toString();
            String areaCode = nodestd.get(5).css("td", "text").toString();
            String phone = nodestd.get(6).css("td", "text").toString();
            String area = nodestd.get(7).css("td", "text").toString();
            String adress = nodestd.get(8).css("td", "text").toString();
            String maxClass = nodestd.get(9).css("td", "text").toString();
            String minClass = nodestd.get(10).css("td", "text").toString();
            String lon = nodestd.get(11).css("td", "text").toString();
            String lat = nodestd.get(12).css("td", "text").toString();
            String latStr = new BigDecimal(lat).setScale(6, RoundingMode.HALF_UP).toString();
            String lonStr = new BigDecimal(lon).setScale(6, RoundingMode.HALF_UP).toString();
            String id = lonStr + latStr;
            GeoCodeEntity geoCodeEntity = new GeoCodeEntity();
            geoCodeEntity.setId(id.replace(".", ""));
            geoCodeEntity.setText(text);
            geoCodeEntity.setProvince(province);
            geoCodeEntity.setCity(city);
            geoCodeEntity.setCounties(counties);
            geoCodeEntity.setAreaCode(areaCode);
            geoCodeEntity.setPhone(phone);
            geoCodeEntity.setArea(area);
            geoCodeEntity.setAdress(adress);
            geoCodeEntity.setMaxClass(maxClass);
            geoCodeEntity.setMinClass(minClass);
            GeoPoint geoPoint = new GeoPoint(Double.valueOf(lat), Double.valueOf(lon));
            geoCodeEntity.setLocation(geoPoint);
            geoCodeEntitys.add(geoCodeEntity);
        }
        System.out.println("每页数据分别是:======>" + JSON.toJSONString(geoCodeEntitys));
        page.putField("geoCodeEntitys",geoCodeEntitys);
    }

    @Override
    public Site getSite() {
        return this.site;
    }

    //initialDelay: delay before the first run after startup; fixedDelay: interval between runs
    @Scheduled(initialDelay = 1000, fixedDelay = 1000 * 10)
    public void process() {
        Spider.create(new GeoPoiTask())
                .addUrl("http://www.poilist.cn/cities-list/%E6%99%AF%E7%82%B9")
                .thread(10)//use ten threads
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))//use a Bloom filter as the duplicate remover
                .addPipeline(springDataPipeline)//add the output pipeline
                .run();
    }

    private List<String> getUrls(PoolingHttpClientConnectionManager pool, String uri) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(uri);
        CloseableHttpResponse response = null;
        List<String> urlList = new ArrayList<String>();
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String htmlStr = EntityUtils.toString(entity);
                Document html = Jsoup.parse(htmlStr);
                Elements select = html.select("ul.pagination li");
                for (Element element : select) {
                    String url = element.select("a").first().attr("href");
                    System.out.println("爬取的二级地址有======>:" + url);
                    urlList.add(url);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return urlList;
    }
}

3. Data Deduplication Algorithms

Google's simhash algorithm
simhash was proposed by Charikar in 2002. To keep it intuitive and avoid formulas, it works in these steps:
1. Tokenize: split the text to be compared into the feature words of the document.
2. Hash: turn each word into a hash value, e.g. "美国" hashes to 100101 and "51区" hashes to 101011, so the strings become strings of bits.
3. Weight: using the hashes from step 2, build a weighted sequence per word (a 1 bit contributes +weight, a 0 bit contributes -weight). With weight 4, "美国" (100101) becomes "4 -4 -4 4 -4 4"; with weight 5, "51区" becomes "5 -5 5 -5 5 5".
4. Merge: add the per-word sequences position by position into a single sequence: "4+5, -4+-5, -4+5, 4+-5, -4+5, 4+5" gives "9 -9 1 -1 1 9".
5. Reduce: turn "9 -9 1 -1 1 9" into a 0/1 string (positive → 1, otherwise 0); this is the final simhash signature.
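A minimal Java sketch of the idea; tokenization is reduced to whitespace splitting and every word weight is fixed at 1, just to show the weight/merge/reduce steps and the Hamming-distance comparison used to decide whether two texts are near-duplicates:

public class SimHashSketch {
    private static final int BITS = 64;

    //compute a 64-bit simhash of a whitespace-tokenized text
    public static long simhash(String text) {
        int[] weights = new int[BITS];
        for (String word : text.split("\\s+")) {
            long h = hash(word);
            for (int i = 0; i < BITS; i++) {
                //bit i set -> +1, otherwise -1 (all word weights fixed at 1 here)
                weights[i] += ((h >>> i) & 1L) == 1L ? 1 : -1;
            }
        }
        long signature = 0L;
        for (int i = 0; i < BITS; i++) {
            if (weights[i] > 0) {
                signature |= (1L << i); //reduce: positive -> 1, otherwise 0
            }
        }
        return signature;
    }

    //simple 64-bit string hash (FNV-1a style), standing in for a real hash function
    private static long hash(String word) {
        long h = 0xcbf29ce484222325L;
        for (char c : word.toCharArray()) {
            h ^= c;
            h *= 0x100000001b3L;
        }
        return h;
    }

    //texts are near-duplicates when the Hamming distance of their signatures is small (e.g. <= 3)
    public static int hammingDistance(long a, long b) {
        return Long.bitCount(a ^ b);
    }

    public static void main(String[] args) {
        long s1 = simhash("java web crawler httpclient jsoup webmagic notes");
        long s2 = simhash("java web crawler httpclient jsoup webmagic note");
        System.out.println("hamming distance = " + hammingDistance(s1, s2));
    }
}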

4. Using a Proxy to Work Around Anti-Crawling

Link: a free proxy website
The code to configure the proxy server is as follows:

//initialDelay: delay before the first run after startup; fixedDelay: interval between runs
    @Scheduled(initialDelay = 1000,fixedDelay = 1000*10)
    public void process(){
        //create the Downloader
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        //give the downloader the proxy server details (this ip can be found on the free proxy site above)
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("60.191.11.249", 3128)));

        Spider.create(new TestSimHash())
                .addUrl("http://ip.chinaz.com/getip.aspx")
                //use the proxy-enabled downloader
                .setDownloader(httpClientDownloader)
                .run();
    }