Java爬虫

最新推荐文章于 2023-06-29 09:25:41 发布

LifeBackwards

最新推荐文章于 2023-06-29 09:25:41 发布

阅读量487

点赞数 3

分类专栏： Java

Java 专栏收录该内容

58 篇文章 2 订阅

订阅专栏

转载自：http://blog.csdn.net/lmj623565791/article/details/23272657

思想很简单：通过Java访问目标链接，拿到返回的html字符串，然后从中解析出链接等需要的数据。

技术上使用Jsoup来解析页面。Jsoup用起来很方便，也很简单，看下面这段代码就能知道怎么用了：

[java]view plaincopy 
   
 // Configure a jsoup connection step by step, then send it; the result of post() is kept as a Document.
 Document doc = Jsoup.connect("http://www.oschina.net/")   
  .data("query", "Java")   // request parameter
  .userAgent("I ’ m jsoup") // set the User-Agent
  .cookie("auth", "token") // set a cookie
  .timeout(3000)           // set the connection timeout
  .post();                 // access the URL using the POST method

下面介绍整个实现过程：

1、分析需要解析的页面：

网址：http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery

页面：

先在这个页面上做一次查询：观察下请求的url，参数，method等。

这里我们使用chrome内置的开发者工具（快捷键F12），下面是查询的结果：

我们可以看到url、method以及参数。知道了如何获得查询的URL，下面就开始写代码了。为了重用与扩展，我定义了几个类：

1、Rule.java用于指定查询url,method,params等

[java]view plaincopy 
   
 package com.zhy.spider.rule;  
   
 /**
  * Rule class: bundles everything one query needs, namely the target URL, the
  * request parameters with their values, how to locate the result container in
  * the returned HTML, and which HTTP method to use.
  *
  * @author zhy
  */
 public class Rule
 {
     /** Request method constant: GET. */
     public final static int GET = 0;

     /** Request method constant: POST. */
     public final static int POST = 1;

     /** Lookup kind: {@code resultTagName} is a CSS class name. */
     public final static int CLASS = 0;

     /** Lookup kind: {@code resultTagName} is an element id. */
     public final static int ID = 1;

     /** Lookup kind: {@code resultTagName} is a selector expression. */
     public final static int SELECTION = 2;

     /** The link to query. */
     private String url;

     /** Names of the request parameters. */
     private String[] params;

     /** Values matching {@link #params}, position by position. */
     private String[] values;

     /**
      * Tag used for the first filtering pass over the returned HTML; set
      * {@link #type} first so it is interpreted correctly.
      */
     private String resultTagName;

     /** How {@link #resultTagName} is interpreted: CLASS / ID / SELECTION. */
     private int type;

     /** Request method: GET / POST. */
     private int requestMoethod;

     /**
      * Creates an empty rule whose lookup kind is {@link #ID} and whose request
      * method is {@link #GET}; every other property is {@code null}.
      */
     public Rule()
     {
         this(null, null, null, null, ID, GET);
     }

     /**
      * Creates a fully specified rule.
      *
      * @param url            the link to query
      * @param params         request parameter names
      * @param values         request parameter values, parallel to {@code params}
      * @param resultTagName  class name, id or selector of the result container
      * @param type           one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
      * @param requestMoethod one of {@link #GET}, {@link #POST}
      */
     public Rule(String url, String[] params, String[] values,
             String resultTagName, int type, int requestMoethod)
     {
         this.url = url;
         this.params = params;
         this.values = values;
         this.resultTagName = resultTagName;
         this.type = type;
         this.requestMoethod = requestMoethod;
     }

     /** @return the link to query */
     public String getUrl()
     {
         return this.url;
     }

     /** @param url the link to query */
     public void setUrl(String url)
     {
         this.url = url;
     }

     /** @return the request parameter names (the stored array itself, not a copy) */
     public String[] getParams()
     {
         return this.params;
     }

     /** @param params the request parameter names */
     public void setParams(String[] params)
     {
         this.params = params;
     }

     /** @return the request parameter values (the stored array itself, not a copy) */
     public String[] getValues()
     {
         return this.values;
     }

     /** @param values the request parameter values */
     public void setValues(String[] values)
     {
         this.values = values;
     }

     /** @return the class name, id or selector of the result container */
     public String getResultTagName()
     {
         return this.resultTagName;
     }

     /** @param resultTagName the class name, id or selector of the result container */
     public void setResultTagName(String resultTagName)
     {
         this.resultTagName = resultTagName;
     }

     /** @return the lookup kind */
     public int getType()
     {
         return this.type;
     }

     /** @param type the lookup kind */
     public void setType(int type)
     {
         this.type = type;
     }

     /** @return the request method */
     public int getRequestMoethod()
     {
         return this.requestMoethod;
     }

     /** @param requestMoethod the request method */
     public void setRequestMoethod(int requestMoethod)
     {
         this.requestMoethod = requestMoethod;
     }

 }

简单说一下：这个规则类定义了我们查询过程中需要的所有信息，方便我们的扩展，以及代码的重用，我们不可能针对每个需要抓取的网站写一套代码。

2、需要的数据对象，目前只需要链接，LinkTypeData.java

[java]view plaincopy 
   
 package com.zhy.spider.bean;  
   
 /**
  * Data object describing one extracted link: an id, the link address, the
  * link title, a summary and the content.
  */
 public class LinkTypeData
 {
     /** Numeric identifier of this entry. */
     private int id;

     /** Address of the link. */
     private String linkHref;

     /** Title of the link. */
     private String linkText;

     /** Summary. */
     private String summary;

     /** Content. */
     private String content;

     /** @return the identifier */
     public int getId()
     {
         return this.id;
     }

     /** @param id the identifier */
     public void setId(int id)
     {
         this.id = id;
     }

     /** @return the address of the link */
     public String getLinkHref()
     {
         return this.linkHref;
     }

     /** @param linkHref the address of the link */
     public void setLinkHref(String linkHref)
     {
         this.linkHref = linkHref;
     }

     /** @return the title of the link */
     public String getLinkText()
     {
         return this.linkText;
     }

     /** @param linkText the title of the link */
     public void setLinkText(String linkText)
     {
         this.linkText = linkText;
     }

     /** @return the summary */
     public String getSummary()
     {
         return this.summary;
     }

     /** @param summary the summary */
     public void setSummary(String summary)
     {
         this.summary = summary;
     }

     /** @return the content */
     public String getContent()
     {
         return this.content;
     }

     /** @param content the content */
     public void setContent(String content)
     {
         this.content = content;
     }
 }

3、核心的查询类：ExtractService.java

[java]view plaincopy 
   
 package com.zhy.spider.core;  
   
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.List;  
 import java.util.Map;  
   
 import javax.swing.plaf.TextUI;  
   
 import org.jsoup.Connection;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 import com.zhy.spider.bean.LinkTypeData;  
 import com.zhy.spider.rule.Rule;  
 import com.zhy.spider.rule.RuleException;  
 import com.zhy.spider.util.TextUtil;  
   
 /** 
  *  
  * @author zhy 
  *  
  */  
 public class ExtractService  
 {  
     /** 
      * @param rule 
      * @return 
      */  
     public static List<LinkTypeData> extract(Rule rule)  
     {  
   
         // 进行对rule的必要校验  
         validateRule(rule);  
   
         List<LinkTypeData> datas = new ArrayList<LinkTypeData>();  
         LinkTypeData data = null;  
         try  
         {  
             /** 
              * 解析rule 
              */  
             String url = rule.getUrl();  
             String[] params = rule.getParams();  
             String[] values = rule.getValues();  
             String resultTagName = rule.getResultTagName();  
             int type = rule.getType();  
             int requestType = rule.getRequestMoethod();  
   
             Connection conn = Jsoup.connect(url);  
             // 设置查询参数  
   
             if (params != null)  
             {  
                 for (int i = 0; i < params.length; i++)  
                 {  
                     conn.data(params[i], values[i]);  
                 }  
             }  
   
             // 设置请求类型  
             Document doc = null;  
             switch (requestType)  
             {  
             case Rule.GET:  
                 doc = conn.timeout(100000).get();  
                 break;  
             case Rule.POST:  
                 doc = conn.timeout(100000).post();  
                 break;  
             }  
   
             //处理返回数据  
             Elements results = new Elements();  
             switch (type)  
             {  
             case Rule.CLASS:  
                 results = doc.getElementsByClass(resultTagName);  
                 break;  
             case Rule.ID:  
                 Element result = doc.getElementById(resultTagName);  
                 results.add(result);  
                 break;  
             case Rule.SELECTION:  
                 results = doc.select(resultTagName);  
                 break;  
             default:  
                 //当resultTagName为空时默认去body标签  
                 if (TextUtil.isEmpty(resultTagName))  
                 {  
                     results = doc.getElementsByTag("body");  
                 }  
             }  
   
             for (Element result : results)  
             {  
                 Elements links = result.getElementsByTag("a");  
   
                 for (Element link : links)  
                 {  
                     //必要的筛选  
                     String linkHref = link.attr("href");  
                     String linkText = link.text();  
   
                     data = new LinkTypeData();  
                     data.setLinkHref(linkHref);  
                     data.setLinkText(linkText);  
   
                     datas.add(data);  
                 }  
             }  
   
         } catch (IOException e)  
         {  
             e.printStackTrace();  
         }  
   
         return datas;  
     }  
   
     /** 
      * 对传入的参数进行必要的校验 
      */  
     private static void validateRule(Rule rule)  
     {  
         String url = rule.getUrl();  
         if (TextUtil.isEmpty(url))  
         {  
             throw new RuleException("url不能为空！");  
         }  
         if (!url.startsWith("http://"))  
         {  
             throw new RuleException("url的格式不正确！");  
         }  
   
         if (rule.getParams() != null && rule.getValues() != null)  
         {  
             if (rule.getParams().length != rule.getValues().length)  
             {  
                 throw new RuleException("参数的键值对个数不匹配！");  
             }  
         }  
   
     }  
   
   
 }  

4、里面用了一个异常类：RuleException.java

[java]view plaincopy 
   
 package com.zhy.spider.rule;  
   
 /**
  * Unchecked exception used to report problems with a rule. It adds nothing
  * to {@link RuntimeException} beyond its own type; every constructor simply
  * forwards to the matching superclass constructor.
  */
 public class RuleException extends RuntimeException
 {

     /** Creates an exception without message or cause. */
     public RuleException()
     {
         super();
     }

     /**
      * Creates an exception with a message.
      *
      * @param message description of the problem
      */
     public RuleException(String message)
     {
         super(message);
     }

     /**
      * Creates an exception that wraps another one.
      *
      * @param cause the underlying failure
      */
     public RuleException(Throwable cause)
     {
         super(cause);
     }

     /**
      * Creates an exception with a message and an underlying cause.
      *
      * @param message description of the problem
      * @param cause   the underlying failure
      */
     public RuleException(String message, Throwable cause)
     {
         super(message, cause);
     }

 }

5、最后是测试了：这里使用了两个网站进行测试，采用了不同的规则，具体看代码吧

[java]view plaincopy 
   
 package com.zhy.spider.test;  
   
 import java.util.List;  
   
 import com.zhy.spider.bean.LinkTypeData;  
 import com.zhy.spider.core.ExtractService;  
 import com.zhy.spider.rule.Rule;  
   
 public class Test  
 {  
     @org.junit.Test  
     public void getDatasByClass()  
     {  
         Rule rule = new Rule(  
                 "http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery",  
         new String[] { "query.enterprisename","query.registationnumber" }, new String[] { "兴网","" },  
                 "cont_right", Rule.CLASS, Rule.POST);  
         List<LinkTypeData> extracts = ExtractService.extract(rule);  
         printf(extracts);  
     }  
   
     @org.junit.Test  
     public void getDatasByCssQuery()  
     {  
         Rule rule = new Rule("http://www.11315.com/search",  
                 new String[] { "name" }, new String[] { "兴网" },  
                 "div.g-mn div.con-model", Rule.SELECTION, Rule.GET);  
         List<LinkTypeData> extracts = ExtractService.extract(rule);  
         printf(extracts);  
     }  
   
     public void printf(List<LinkTypeData> datas)  
     {  
         for (LinkTypeData data : datas)  
         {  
             System.out.println(data.getLinkText());  
             System.out.println(data.getLinkHref());  
             System.out.println("***********************************");  
         }  
   
     }  
 }  

输出结果：

[java]view plaincopy 
   
 深圳市网兴科技有限公司  
 http://14603257.11315.com  
 ***********************************  
 荆州市兴网公路物资有限公司  
 http://05155980.11315.com  
 ***********************************  
 西安市全兴网吧  
 #  
 ***********************************  
 子长县新兴网城  
 #  
 ***********************************  
 陕西同兴网络信息有限责任公司第三分公司  
 #  
 ***********************************  
 西安高兴网络科技有限公司  
 #  
 ***********************************  
 陕西同兴网络信息有限责任公司西安分公司  
 #  
 ***********************************  

最后使用一个Baidu新闻来测试我们的代码：说明我们的代码是通用的。

[java]view plaincopy 
   
         /** 
  * 使用百度新闻，只设置url和关键字与返回类型 
  */  
 @org.junit.Test  
 public void getDatasByCssQueryUserBaidu()  
 {  
     Rule rule = new Rule("http://news.baidu.com/ns",  
             new String[] { "word" }, new String[] { "支付宝" },  
             null, -1, Rule.GET);  
     List<LinkTypeData> extracts = ExtractService.extract(rule);  
     printf(extracts);  
 }  

我们只设置了链接、关键字、和请求类型，不设置具体的筛选条件。

结果：有一定的垃圾数据是肯定的，但是需要的数据肯定也抓取出来了。我们可以设置Rule.SELECTION，以及筛选条件做进一步的限制。

[html]view plaincopy 
   
 按时间排序  
 /ns?word=支付宝&ie=utf-8&bs=支付宝&sr=0&cl=2&rn=20&tn=news&ct=0&clk=sortbytime  
 ***********************************  
 x  
 javascript:void(0)  
 ***********************************  
 支付宝将联合多方共建安全基金 首批投入4000万  
 http://finance.ifeng.com/a/20140409/12081871_0.shtml  
 ***********************************  
 7条相同新闻  
 /ns?word=%E6%94%AF%E4%BB%98%E5%AE%9D+cont:2465146414%7C697779368%7C3832159921&same=7&cl=1&tn=news&rn=30&fm=sd  
 ***********************************  
 百度快照  
 http://cache.baidu.com/c?m=9d78d513d9d437ab4f9e91697d1cc0161d4381132ba7d3020cd0870fd33a541b0120a1ac26510d19879e20345dfe1e4bea876d26605f75a09bbfd91782a6c1352f8a2432721a844a0fd019adc1452fc423875d9dad0ee7cdb168d5f18c&p=c96ec64ad48b2def49bd9b780b64&newp=c4769a4790934ea95ea28e281c4092695912c10e3dd796&user=baidu&fm=sc&query=%D6%A7%B8%B6%B1%A6&qid=a400f3660007a6c5&p1=1  
 ***********************************  
 OpenSSL漏洞涉及众多网站 支付宝称暂无数据泄露  
 http://tech.ifeng.com/internet/detail_2014_04/09/35590390_0.shtml  
 ***********************************  
 26条相同新闻  
 /ns?word=%E6%94%AF%E4%BB%98%E5%AE%9D+cont:3869124100&same=26&cl=1&tn=news&rn=30&fm=sd  
 ***********************************  
 百度快照  
 http://cache.baidu.com/c?m=9f65cb4a8c8507ed4fece7631050803743438014678387492ac3933fc239045c1c3aa5ec677e4742ce932b2152f4174bed843670340537b0efca8e57dfb08f29288f2c367117845615a71bb8cb31649b66cf04fdea44a7ecff25e5aac5a0da4323c044757e97f1fb4d7017dd1cf4&p=8b2a970d95df11a05aa4c32013&newp=9e39c64ad4dd50fa40bd9b7c5253d8304503c52251d5ce042acc&user=baidu&fm=sc&query=%D6%A7%B8%B6%B1%A6&qid=a400f3660007a6c5&p1=2  
 ***********************************  
 雅虎日本6月起开始支持支付宝付款  
 http://www.techweb.com.cn/ucweb/news/id/2025843  
 ***********************************  
 
   
 
 
  
  
 

LifeBackwards

关注

3
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
Java爬虫

转载自：http://blog.csdn.net/lmj623565791/article/details/23272657思想很简单：就是通过Java访问的链接，然后拿到html字符串，然后就是解析链接等需要的数据。技术上使用Jsoup方便页面的解析，当然Jsoup很方便，也很简单，一行代码就能知道怎么用了：[java] view plain copy
复制链接

扫一扫