Java 爬虫学习

1.新建java项目,导入需要用到的jar包:
jsoup-1.8.1.jar
junit-4.8.2.jar
2.新建Rule.java


public class Rule {

    /** Request-method constant: GET. */
    public final static int GET = 0;
    /** Request-method constant: POST. */
    public final static int POST = 1;

    /** Result-type constant: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;
    /** Result-type constant: {@code resultTagName} is an element id. */
    public final static int ID = 1;
    /** Result-type constant: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /** The link (URL) to request. */
    private String url;

    /** Names of the request parameters. */
    private String[] params;

    /** Values matching {@link #params}, position by position. */
    private String[] values;

    /**
     * Tag used for the first filtering pass over the returned HTML.
     * Set {@link #type} first so it is known how this name is interpreted.
     */
    private String resultTagName;

    /** How {@link #resultTagName} is interpreted: CLASS / ID / SELECTION. Defaults to ID. */
    private int type = ID;

    /** Request type: GET / POST. Defaults to GET. */
    private int requestMoethod = GET;

    /** Creates a rule with the default type ({@link #ID}) and request method ({@link #GET}). */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            the link to request
     * @param params         request parameter names
     * @param values         request parameter values, matching {@code params}
     * @param resultTagName  tag used for the first filtering pass over the returned HTML
     * @param type           one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
     * @param requestMoethod one of {@link #GET}, {@link #POST}
     */
    public Rule(String url, String[] params, String[] values,
            String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return the link to request */
    public String getUrl() {
        return url;
    }

    /** @param url the link to request */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the request parameter names (the stored array itself, not a copy) */
    public String[] getParams() {
        return params;
    }

    /** @param params the request parameter names */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the request parameter values (the stored array itself, not a copy) */
    public String[] getValues() {
        return values;
    }

    /** @param values the request parameter values */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the tag used for the first filtering pass */
    public String getResultTagName() {
        return resultTagName;
    }

    /** @param resultTagName the tag used for the first filtering pass */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the result type: CLASS / ID / SELECTION */
    public int getType() {
        return type;
    }

    /** @param type the result type: CLASS / ID / SELECTION */
    public void setType(int type) {
        this.type = type;
    }

    /** @return the request type: GET / POST */
    public int getRequestMoethod() {
        return requestMoethod;
    }

    /** @param requestMoethod the request type: GET / POST */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}

3.新建LinkTypeData.java


public class LinkTypeData {

    /** Numeric identifier of this entry. */
    private int id;

    /** Address (href) of the link. */
    private String linkHref;

    /** Title text of the link. */
    private String linkText;

    /** Summary. */
    private String summary;

    /** Content. */
    private String content;

    /** @return the identifier */
    public int getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the link address */
    public String getLinkHref() {
        return this.linkHref;
    }

    /** @param linkHref the link address */
    public void setLinkHref(String linkHref) {
        this.linkHref = linkHref;
    }

    /** @return the link title */
    public String getLinkText() {
        return this.linkText;
    }

    /** @param linkText the link title */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /** @return the summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the content */
    public String getContent() {
        return this.content;
    }

    /** @param content the content */
    public void setContent(String content) {
        this.content = content;
    }
}

4.新建TextUtil.java工具类


public class TextUtil {
	/**
	 * Tells whether a string should be treated as empty.
	 *
	 * A string counts as empty when it is {@code null}, has zero length, or
	 * spells the word "null" in any letter case.
	 *
	 * @param url the string to test; may be {@code null}
	 * @return {@code true} if the string is considered empty, {@code false} otherwise
	 */
	public static boolean isEmpty(String url){
		return url == null
				|| url.isEmpty()
				|| "null".equals(url.toLowerCase());
	}
}

5.新建RuleException.java异常类


public class RuleException extends RuntimeException {

	/** Creates an exception with neither a message nor a cause. */
	public RuleException() {
	}

	/**
	 * Creates an exception carrying only a message.
	 *
	 * @param message description of the problem
	 */
	public RuleException(String message) {
		super(message);
	}

	/**
	 * Creates an exception that wraps another throwable.
	 *
	 * @param cause the underlying failure
	 */
	public RuleException(Throwable cause) {
		super(cause);
	}

	/**
	 * Creates an exception with both a message and an underlying cause.
	 *
	 * @param message description of the problem
	 * @param cause   the underlying failure
	 */
	public RuleException(String message, Throwable cause) {
		super(message, cause);
	}

}

6.新建ExtractService.java

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;  
import org.jsoup.nodes.Document;  
import org.jsoup.nodes.Element;  
import org.jsoup.select.Elements;  


public class ExtractService {

    /** Timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Requests the page described by {@code rule}, narrows the response down
     * to the elements selected by the rule's type / result tag name, and
     * turns the matches into {@link LinkTypeData} entries.
     *
     * If the request fails with an {@link IOException}, the stack trace is
     * printed and an empty list is returned.
     *
     * @param rule the extraction rule; must not be {@code null}
     * @return the extracted entries, never {@code null}
     * @throws RuleException if the rule fails validation
     */
    public static List<LinkTypeData> extract(Rule rule)
    {
        // Fail fast on rules that cannot be executed.
        validateRule(rule);

        List<LinkTypeData> datas = new ArrayList<LinkTypeData>();
        try
        {
            Document doc = fetch(rule);
            Elements results = selectResults(doc, rule.getType(), rule.getResultTagName());
            for (Element result : results)
            {
                collect(result, datas);
            }
        } catch (IOException e)
        {
            // Best-effort contract kept from the original: report the failure
            // and hand back whatever was collected (an empty list).
            e.printStackTrace();
        }
        return datas;
    }

    /** Builds the connection from the rule, attaches the query parameters and performs the request. */
    private static Document fetch(Rule rule) throws IOException
    {
        // ignoreContentType(true) lets jsoup accept responses that are not HTML/XML.
        Connection conn = Jsoup.connect(rule.getUrl()).ignoreContentType(true);

        String[] params = rule.getParams();
        String[] values = rule.getValues();
        if (params != null)
        {
            // validateRule guarantees values has the same length as params here.
            for (int i = 0; i < params.length; i++)
            {
                conn.data(params[i], values[i]);
            }
        }

        conn.timeout(TIMEOUT_MILLIS);
        switch (rule.getRequestMoethod())
        {
        case Rule.GET:
            return conn.get();
        case Rule.POST:
            return conn.post();
        default:
            // Unreachable after validateRule; kept so the request type can never be silently ignored.
            throw new RuleException("不支持的请求类型: " + rule.getRequestMoethod());
        }
    }

    /** Selects the elements of {@code doc} addressed by {@code type} and {@code resultTagName}. */
    private static Elements selectResults(Document doc, int type, String resultTagName)
    {
        switch (type)
        {
        case Rule.CLASS:
            return doc.getElementsByClass(resultTagName);
        case Rule.ID:
            Elements byId = new Elements();
            Element match = doc.getElementById(resultTagName);
            // No element with that id: return no results instead of a null entry.
            if (match != null)
            {
                byId.add(match);
            }
            return byId;
        case Rule.SELECTION:
            return doc.select(resultTagName);
        default:
            // With an unknown type and no result tag name, fall back to the body tag.
            if (TextUtil.isEmpty(resultTagName))
            {
                return doc.getElementsByTag("body");
            }
            return new Elements();
        }
    }

    /**
     * Appends one entry to {@code datas} for each "body" element found under {@code result}.
     *
     * NOTE(review): this reads "body" elements rather than "a" (anchor)
     * elements, so it effectively captures the text of the whole response and
     * an "href" attribute that a body tag presumably does not carry. That is
     * the behaviour of the original code and is preserved here; switch the tag
     * to "a" if real link extraction is wanted - confirm with the author.
     */
    private static void collect(Element result, List<LinkTypeData> datas)
    {
        Elements links = result.getElementsByTag("body");
        for (Element link : links)
        {
            LinkTypeData data = new LinkTypeData();
            data.setLinkHref(link.attr("href"));
            data.setLinkText(link.text());
            datas.add(data);
        }
    }

    /**
     * Performs the necessary checks on the rule before any request is made.
     *
     * @throws RuleException if the rule is {@code null}, the url is empty or
     *         is neither http nor https, the parameter names and values do not
     *         pair up, or the request type is neither GET nor POST
     */
    private static void validateRule(Rule rule)
    {
        if (rule == null)
        {
            throw new RuleException("rule不能为空!");
        }

        String url = rule.getUrl();
        if (TextUtil.isEmpty(url))
        {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://"))
        {
            throw new RuleException("url的格式不正确!");
        }

        String[] params = rule.getParams();
        String[] values = rule.getValues();
        if (params != null)
        {
            // A missing values array counts as zero values, so a non-empty
            // params array without values is reported here instead of
            // failing later with a NullPointerException.
            int valueCount = (values == null) ? 0 : values.length;
            if (params.length != valueCount)
            {
                throw new RuleException("参数的键值对个数不匹配!");
            }
        }

        int requestType = rule.getRequestMoethod();
        if (requestType != Rule.GET && requestType != Rule.POST)
        {
            throw new RuleException("不支持的请求类型: " + requestType);
        }
    }
}

7.测试类(junit单元测试)

import java.util.List;

public class Test {

	// Alternative rule from the original write-up, kept for reference:
	//   new Rule(
	//       "http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery",
	//       new String[] { "query.enterprisename", "query.registationnumber" },
	//       new String[] { "兴网", "" },
	//       "cont_right", Rule.CLASS, Rule.POST);

	/**
	 * Posts {@code page=1, rows=12} to the sample endpoint with no result tag
	 * name and an unknown type (-1), then prints every extracted entry.
	 * The annotation is fully qualified because this class is itself named
	 * {@code Test}.
	 */
	@org.junit.Test
	public void getDatasByClass() {
		String[] paramNames = new String[]{"page","rows"};
		String[] paramValues = new String[]{"1","12"};
		Rule rule = new Rule("http://hiweshare.com/topicaction/gettopics.do",
				paramNames, paramValues, null, -1, Rule.POST);

		printf(ExtractService.extract(rule));
	}

	/**
	 * Prints the link text and the link address of each entry to standard
	 * output, followed by a separator line.
	 *
	 * @param datas the entries to print
	 */
	public void printf(List<LinkTypeData> datas) {
		for (int i = 0; i < datas.size(); i++) {
			LinkTypeData entry = datas.get(i);
			System.out.println(entry.getLinkText());
			System.out.println(entry.getLinkHref());
			System.out.println("***********************************");
		}
	}
}

8.可能会出现的报错信息:
如果像下面这样获取连接,可能会报错:

Connection conn = Jsoup.connect(url)

报错信息:

Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml

修改为:

Connection conn = Jsoup.connect(url).ignoreContentType(true);

hiweshare.com是我的一个正在建设的网站,我先在上面测试了下爬虫

http://hiweshare.com/

感谢博主:http://blog.csdn.net/lmj623565791/article/details/23272657

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值