Android Jsoup抓取网络数据

1.将 jsoup 的 jar 包放入 libs 目录,并在 build.gradle 中添加依赖:

compile files('libs/jsoup-1.7.2.jar')

2.规则类:用于指定查询的 url、请求方式(method)和参数(params),各字段需提供对应的 getter 和 setter

/**
 * Describes one scraping request: the page to fetch, the query parameters to
 * send, the HTTP method to use, and how the result elements are located in
 * the returned HTML.
 */
public class Rule {

    /** Request method constant: HTTP GET. */
    public final static int GET = 0;
    /** Request method constant: HTTP POST. */
    public final static int POST = 1;

    /** Result type constant: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;
    /** Result type constant: {@code resultTagName} is an element id. */
    public final static int ID = 1;
    /** Result type constant: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /**
     * Link (URL) of the page to query.
     */
    private String url;
    /**
     * Names of the query parameters.
     */
    private String[] params;
    /**
     * Values matching {@link #params}, index by index.
     */
    private String[] values;

    /**
     * Tag used for the first filtering pass over the returned HTML;
     * set {@link #type} first so it is interpreted correctly.
     */
    private String resultTagName;
    /**
     * How {@link #resultTagName} is interpreted: {@link #CLASS}, {@link #ID}
     * or {@link #SELECTION}. Defaults to {@link #ID}.
     */
    private int type = ID;
    /**
     * Request method, {@link #GET} or {@link #POST}. Defaults to {@link #GET}.
     * The misspelled name is kept because callers already depend on it.
     */
    private int requestMoethod = GET;

    /** Creates a rule with the defaults ({@link #ID} lookup, {@link #GET} request). */
    public Rule() {
    }

    /**
     * Creates a fully specified rule.
     *
     * @param url            page to query
     * @param params         query parameter names, may be {@code null}
     * @param values         query parameter values, parallel to {@code params}
     * @param resultTagName  tag / class / id / selector used for the first filter
     * @param type           {@link #CLASS}, {@link #ID} or {@link #SELECTION}
     * @param requestMoethod {@link #GET} or {@link #POST}
     */
    public Rule(String url, String[] params, String[] values,
                String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return the page URL */
    public String getUrl() {
        return url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the query parameter names, possibly {@code null} */
    public String[] getParams() {
        return params;
    }

    /** @param params the query parameter names */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the query parameter values, possibly {@code null} */
    public String[] getValues() {
        return values;
    }

    /** @param values the query parameter values, parallel to the names */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the tag / class / id / selector used for the first filter */
    public String getResultTagName() {
        return resultTagName;
    }

    /** @param resultTagName the tag / class / id / selector used for the first filter */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
    public int getType() {
        return type;
    }

    /** @param type {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
    public void setType(int type) {
        this.type = type;
    }

    /** @return {@link #GET} or {@link #POST} */
    public int getRequestMoethod() {
        return requestMoethod;
    }

    /** @param requestMoethod {@link #GET} or {@link #POST} */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}

3.数据对象

/**
 * Plain data holder for one link extracted from a page.
 */
public class LinkTypeData {

    private int id;
    /**
     * Address (href) of the link.
     */
    private String linkHref;
    /**
     * Title (visible text) of the link.
     */
    private String linkText;
    /**
     * Summary.
     */
    private String summary;
    /**
     * Content.
     */
    private String content;

    /** @return the id */
    public int getId() {
        return id;
    }

    /** @param id the id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the link address */
    public String getLinkHref() {
        return linkHref;
    }

    /** @param linkHref the link address */
    public void setLinkHref(String linkHref) {
        this.linkHref = linkHref;
    }

    /** @return the link title */
    public String getLinkText() {
        return linkText;
    }

    /** @param linkText the link title */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /** @return the summary */
    public String getSummary() {
        return summary;
    }

    /** @param summary the summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the content */
    public String getContent() {
        return content;
    }

    /** @param content the content */
    public void setContent(String content) {
        this.content = content;
    }
}

4.查询类

/**
 * Fetches the page described by a {@link Rule} with jsoup and collects every
 * {@code <a>} link found inside the elements the rule selects.
 */
public class ExtractService {

    /** Log tag used for all output of this class. */
    private static final String TAG = "CJY";

    /** Timeout handed to the jsoup connection, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Result of the most recently finished {@link #getJsoup(Rule)} run.
     * Kept for callers that read it directly; the result that gets printed is
     * passed to the handler explicitly, so overlapping runs cannot mix up
     * their output.
     */
    public static volatile List<LinkTypeData> e;

    /**
     * Runs {@link #extract(Rule)} on a background thread and, once it has
     * finished, prints the result via a {@code Handler} created on the
     * calling thread.
     *
     * @param rule the request to execute
     */
    public static void getJsoup(final Rule rule) {
        // Created on the caller's thread so the result is posted back to it.
        final Handler handler = new Handler();

        // Network access must not happen on the caller's thread.
        new Thread() {
            @Override
            public void run() {
                final List<LinkTypeData> result = extract(rule);
                e = result;

                // Hand this run's own result over instead of re-reading the
                // shared static field, which a concurrent run may overwrite.
                handler.post(new Runnable() {
                    @Override
                    public void run() {
                        printf(result);
                    }
                });
            }
        }.start();
    }

    /**
     * Logs every field of every entry.
     *
     * @param datas the entries to log; {@code null} is treated as empty
     */
    public static void printf(List<LinkTypeData> datas) {
        if (datas == null) {
            return;
        }
        for (LinkTypeData data : datas) {
            Log.e(TAG, "A=" + data.getLinkText());
            Log.e(TAG, "B=" + data.getLinkHref());
            Log.e(TAG, "C=" + data.getId());
            Log.e(TAG, "D=" + data.getContent());
            Log.e(TAG, "E=" + data.getSummary());
            Log.e(TAG, "F=CJYCJYCJYCJYCJYCJY");
        }
    }

    /**
     * Executes the request described by {@code rule} and returns the links
     * found inside the selected elements.
     *
     * @param rule the request to execute
     * @return the extracted links; empty when nothing matched or the request
     *         failed with an {@link IOException}
     * @throws RuleException if the rule is invalid
     */
    public static List<LinkTypeData> extract(Rule rule) {

        // Reject invalid rules before touching the network.
        validateRule(rule);

        List<LinkTypeData> datas = new ArrayList<LinkTypeData>();
        String url = rule.getUrl();
        try {
            String[] params = rule.getParams();
            String[] values = rule.getValues();
            int requestType = rule.getRequestMoethod();

            Connection conn = Jsoup.connect(url).timeout(TIMEOUT_MILLIS);

            // Query parameters; validateRule guarantees values matches params.
            if (params != null) {
                for (int i = 0; i < params.length; i++) {
                    conn.data(params[i], values[i]);
                }
            }

            // Issue the request.
            Document doc;
            switch (requestType) {
                case Rule.GET:
                    doc = conn.get();
                    break;
                case Rule.POST:
                    doc = conn.post();
                    break;
                default:
                    // Previously fell through with doc == null and crashed later.
                    throw new RuleException("不支持的请求类型: " + requestType);
            }

            // Process the returned data.
            Elements results = selectResults(doc, rule.getType(), rule.getResultTagName());

            for (Element result : results) {
                for (Element link : result.getElementsByTag("a")) {
                    LinkTypeData data = new LinkTypeData();
                    data.setLinkHref(link.attr("href"));
                    data.setLinkText(link.text());
                    datas.add(data);
                }
            }
        } catch (IOException ex) {
            // Best effort: report the failure and return what we have.
            Log.e(TAG, "extract failed: " + url, ex);
        }
        return datas;
    }

    /**
     * Picks the elements to scan for links.
     * An empty {@code resultTagName} always falls back to the {@code body}
     * tag; an unknown {@code type} yields no elements.
     */
    private static Elements selectResults(Document doc, int type, String resultTagName) {
        if (TextUtils.isEmpty(resultTagName)) {
            return doc.getElementsByTag("body");
        }
        switch (type) {
            case Rule.CLASS:
                return doc.getElementsByClass(resultTagName);
            case Rule.ID: {
                Elements byId = new Elements();
                Element match = doc.getElementById(resultTagName);
                // No element with that id: return an empty set, not a null entry.
                if (match != null) {
                    byId.add(match);
                }
                return byId;
            }
            case Rule.SELECTION:
                return doc.select(resultTagName);
            default:
                return new Elements();
        }
    }

    /**
     * Performs the necessary checks on the rule.
     *
     * @throws RuleException if the rule is {@code null}, the URL is empty or
     *                       not http(s), or params and values do not pair up
     */
    private static void validateRule(Rule rule) {
        if (rule == null) {
            throw new RuleException("rule不能为空!");
        }
        String url = rule.getUrl();
        if (TextUtils.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            throw new RuleException("url的格式不正确!");
        }

        String[] params = rule.getParams();
        String[] values = rule.getValues();
        // extract() indexes values for every param, so values must exist too.
        if (params != null && (values == null || params.length != values.length)) {
            throw new RuleException("参数的键值对个数不匹配!");
        }
    }
}

5.异常类

/**
 * Unchecked exception signalling that a {@code Rule} is invalid.
 * Mirrors the four standard {@link RuntimeException} constructors.
 */
public class RuleException extends RuntimeException {

    /** Creates an exception without message or cause. */
    public RuleException() {
    }

    /**
     * Creates an exception with a message only.
     *
     * @param message description of the problem
     */
    public RuleException(String message) {
        super(message);
    }

    /**
     * Creates an exception wrapping another one.
     *
     * @param cause the underlying failure
     */
    public RuleException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with both message and cause.
     *
     * @param message description of the problem
     * @param cause   the underlying failure
     */
    public RuleException(String message, Throwable cause) {
        super(message, cause);
    }

}

6.使用方法:

// Example: collect the links inside every "ul.ColumnList li" element of the
// page, using a GET request without query parameters.
String[] paramNames = new String[]{};
String[] paramValues = new String[]{};
Rule rule = new Rule(
        "http://gold.cnfol.com/mingjiadianjin/#",
        paramNames,
        paramValues,
        "ul.ColumnList li",
        Rule.SELECTION,
        Rule.GET);
ExtractService.getJsoup(rule);

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值