Java爬虫
导入依赖
Apache-HttpClient
SLF4J LOG4J12 Binding(slf4j-log4j12,SLF4J 到 Log4j 1.2 的绑定)
入门程序
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// 2. Create the GET request object
HttpGet httpGet = new HttpGet("http://www.baidu.com");
// 3. Execute the GET request with the client
CloseableHttpResponse response = httpClient.execute(httpGet);
try {
    // 4. Read the response body only on HTTP 200
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original snippet was missing the closing brace of the if-block
    // and never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
带参数的get请求
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// Build the request URI and append a query parameter (?keys=Java)
URIBuilder uriBuilder = new URIBuilder("http://www.itheima.com/");
uriBuilder.setParameter("keys", "Java");
// 2. Create the GET request object from the built URI
HttpGet httpGet = new HttpGet(uriBuilder.build());
// 3. Execute the GET request with the client
CloseableHttpResponse response = httpClient.execute(httpGet);
try {
    // 4. Read the response body only on HTTP 200
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
Post请求
将httpGet改成httpPost
带参数的Post请求
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// 2. Create the POST request object
HttpPost httpPost = new HttpPost("http://www.baidu.com");
// 3. Build the form parameters (keys=Java)
List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("keys", "Java"));
// 4. Wrap the parameters in a URL-encoded form entity
//    (charset normalized to "utf-8" to match the decoding side below;
//    "utf8" is an accepted alias but inconsistent).
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
// 5. Attach the form entity to the POST request
httpPost.setEntity(formEntity);
// 6. Execute the request and read the response
CloseableHttpResponse response = httpClient.execute(httpPost);
try {
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
连接池管理器
public static void main(String[] args) throws IOException, URISyntaxException {
    // Create a pooled connection manager shared by every request below.
    PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
    // Cap the total number of pooled connections.
    connectionManager.setMaxTotal(100);
    // Cap the connections per route, i.e. the maximum per target host.
    connectionManager.setDefaultMaxPerRoute(10);
    // Issue two requests that draw connections from the same pool.
    doGet(connectionManager);
    doGet(connectionManager);
}
/**
 * Performs one GET request against http://www.itheima.com/ using a client
 * backed by the shared connection pool, printing the body on HTTP 200.
 *
 * Note: the client built here must NOT be closed — closing it would shut
 * down the shared pool passed in by the caller.
 */
private static void doGet(PoolingHttpClientConnectionManager cm) throws IOException {
    // 1. Create an HttpClient that borrows connections from the pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // 2. Create the GET request object
    HttpGet httpGet = new HttpGet("http://www.itheima.com/");
    // 3. Execute the GET request
    CloseableHttpResponse response = httpClient.execute(httpGet);
    try {
        // 4. Read the response body only on HTTP 200
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = response.getEntity();
            String content = EntityUtils.toString(httpEntity, "utf-8");
            System.out.println(content);
        }
    } finally {
        // Fix: closing the response returns its connection to the pool.
        // The original never closed it, leaking one pooled connection per call.
        response.close();
    }
}
请求配置
HttpGet httpGet = new HttpGet(uriBuilder.build());
// Fix: the original chained setConnectionRequestTimeout twice; per its own
// comment the first call was meant to be setConnectTimeout (the max time to
// establish the TCP connection).
RequestConfig config = RequestConfig.custom()
        .setConnectTimeout(1000)          // max time to establish the connection (ms)
        .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool (ms)
        .setSocketTimeout(10 * 1000)      // max time between data packets (ms)
        .build();
httpGet.setConfig(config); // attach the configuration to this request
Jsoup
前期准备
导入依赖jsoup org.jsoup
commons io //对文件操作
junit
commons lang3
解析html
// Download and parse the page at the given URL; the second argument is the
// request timeout in milliseconds.
Document doc = Jsoup.parse(new URL("http://www.itheima.com"), 1000);
// Locate the first <title> element via the tag selector and print its text.
Element titleElement = doc.getElementsByTag("title").first();
String title = titleElement.text();
System.out.println(title);
解析文件(解析 HTML 字符串可直接用 Jsoup.parse(String html))
// Parse an HTML file from disk with the given charset.
Document doc = Jsoup.parse(new File("文件地址"), "utf-8");
// Fix: the Jsoup API is getElementsByTag (plural) and first() is a method call;
// the original wrote getElementByTag(...).first, which does not compile.
String title = doc.getElementsByTag("title").first().text();
使用 DOM 方式遍历文档
// Fix: except for getElementById, Jsoup's lookup methods are plural
// ("getElementsBy..."); the original used non-existent singular names.
doc.getElementById("id");                                  // single Element (or null)
doc.getElementsByTag("title");                             // Elements
doc.getElementsByClass("class_a class_b").first();         // first Element — NOTE(review): this takes ONE class name; a space-separated pair likely matches nothing — verify intent
doc.getElementsByAttribute("abc");                         // elements having attribute "abc" (semicolon was missing)
doc.getElementsByAttributeValue("href", "www.baidu.com");  // elements whose href equals the value
获取元素中的数据
element 是一个节点元素
String str = element.id();   // the element's id attribute, as a String
element.className();         // the "class" attribute value, as a String
element.classNames();        // all class names, as a Set<String>
element.attr("class");       // value of the named attribute (fix: semicolon was missing)
element.attributes();        // all attributes, as an Attributes collection
element.text();              // the element's combined text content, as a String
使用选择器查询元素
doc.select("span"); // by tag name; returns Elements
doc.select("#id"); // by id; returns Elements
doc.select("[abc]"); // elements that have attribute "abc"
doc.select("[class=className]"); // elements whose attribute equals the given value
任意组合
doc.select("span[abc]"); // tag combined with an attribute selector
h3#id //元素+id
h3.className //元素+类名
WebMagic
导入依赖
WebMagic core
WebMagic Extension
入门程序
public class Demo01 implements PageProcessor {
@Override
public void process(Page page) {
//解析页面
//Css
//也有xpath,也有正则表达式
page.putField("div",page.getHtml().css("title").regex(".*s")all());
}
private Site site = Site.me();
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new Demo01())
.addUrl("https://www.itheima.com/") //设置爬取页面
.run();
}
}
page.getHtml().css("title").regex(".*s").all() // all matching results, as a List
page.getHtml().css("title").regex(".*s").get() // only the first result
page.getHtml().css("title").regex(".*s").toString() // same as get()
获取链接
// Queue every link found on the page as a new crawl target.
// Fix: Selectable.css() requires a selector argument; to take all links on
// the whole page, call links() directly.
page.addTargetRequests(page.getHtml().links().all());
// Fix: the method is putField (the original had the typo "putFiled");
// it stores an extraction result under the given key.
page.putField("key", "value");
多线程
Spider.create(new Demo01())
.addUrl("https://www.biqugeg.com/20_20368/15354666.html") // seed URL to crawl
.addPipeline(new FilePipeline("C:\\Users\\WX\\Desktop\\新建文件夹")) // persist results to files in this directory
.thread(5) // run the spider with 5 worker threads
.run();
爬虫的配置、启动和终止
// Crawler configuration: charset, timeout, retry interval and retry count.
private Site site = Site.me()
        .setCharset("utf8")      // page charset used for decoding
        .setTimeOut(10000)       // download timeout, in ms
        .setRetrySleepTime(1000) // pause between retries, in ms
        // Fix: the comment said "retry count" but the original called
        // setSleepTime(3) (sleep between pages); the retry count is
        // configured via setRetryTimes.
        .setRetryTimes(3)
        ;

@Override
public Site getSite() {
    return site;
}
定时任务
可使用定时调度周期性启动爬虫,例如 Quartz 的 Cron 表达式或 Spring 的 @Scheduled 注解。