Java 简单爬虫小 Demo（以抓取信息为主）

最新推荐文章于 2022-07-18 20:25:14 发布

可爱的黄油手

最新推荐文章于 2022-07-18 20:25:14 发布

阅读量499

点赞数

本文链接：https://blog.csdn.net/gengzhi1293443962/article/details/79309368

版权

/**
 * Basic helper for scraping fields out of a web page with Jsoup.
 *
 * <p>Typical use is {@link #getProvince(String, String, String)}, which fetches a
 * page, selects elements by CSS class and extracts text/href pairs from them.
 *
 * Created by 耿直 on 2017/11/29.
 */
public class Html {

    /** Timeout, in milliseconds, of the plain request that is tried first. */
    private static final int PRIMARY_TIMEOUT_MILLIS = 500000;

    /** Timeout, in milliseconds, of the parameterised fallback request. */
    private static final int FALLBACK_TIMEOUT_MILLIS = 300000;

    /**
     * Fetches and parses the page at {@code url} using HTTP POST.
     *
     * <p>A plain POST is attempted first. Only when that fails with an
     * {@link IOException} is a second POST sent, this time carrying the form field
     * {@code query=Java}, the user agent {@code Mozilla} and the cookie
     * {@code auth=token}. The returned document is therefore the plain response
     * whenever it is available and the parameterised response otherwise.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when both requests fail; each
     *         failure's stack trace is printed to standard error
     */
    public Document getHtmlTextByUrl(String url) {
        try {
            return Jsoup.connect(url).timeout(PRIMARY_TIMEOUT_MILLIS).post();
        } catch (IOException e) {
            // Not fatal yet: the parameterised request below is still worth trying.
            e.printStackTrace();
        }
        try {
            return Jsoup.connect(url)
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(FALLBACK_TIMEOUT_MILLIS)
                    .post();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }


    /**
     * Selects the tree nodes (for example {@code <tr><td></td></tr>}) of a document
     * that match a CSS query.
     *
     * <p>Despite its name, {@code className} is handed to {@code select} unchanged,
     * so it must be a complete selector such as {@code ".someClass"} (leading dot
     * included), not a bare class name.
     *
     * @param doc       document to search; must not be {@code null}
     * @param className CSS selector to evaluate against {@code doc}
     * @return the matching elements
     */
    public Elements getElementByClass(Document doc, String className) {
        return doc.select(className);
    }

    /**
     * Fetches {@code url}, selects every element with CSS class {@code type} and
     * collects one entry per child of those elements, taken from that child's
     * first child element.
     *
     * <p>Each entry is a {@code String[4]}:
     * <ul>
     *   <li>index 0 - always {@code null}; nothing is stored in this slot</li>
     *   <li>index 1 - the own text of the child's first child element</li>
     *   <li>index 2 - that element's {@code href} attribute, exactly as written in
     *       the page (not resolved against {@code url})</li>
     *   <li>index 3 - {@code type}</li>
     * </ul>
     * Children that have no child element of their own are skipped.
     *
     * @param name not used by this method
     * @param url  address of the page to fetch
     * @param type CSS class name without the leading dot (the original note gives
     *             "provincertr" as an example)
     * @return the collected entries; empty when the page could not be fetched or
     *         nothing matched, never {@code null}
     */
    public ArrayList<String[]> getProvince(String name, String url, String type) {
        ArrayList<String[]> result = new ArrayList<String[]>();
        Document doc = this.getHtmlTextByUrl(url);
        if (doc == null) {
            return result;
        }

        Elements matches = this.getElementByClass(doc, "." + type);
        for (Element match : matches) {
            if (match == null) {
                continue;
            }
            for (Element child : match.children()) {
                // Look the first grandchild up once; every children() call builds a new list.
                Element target = child.children().first();
                if (target == null) {
                    continue;
                }
                String[] entry = new String[4];
                // entry[0] is deliberately left null (it used to be reserved for the page url).
                entry[1] = target.ownText();
                entry[2] = target.attr("href");
                entry[3] = type;
                result.add(entry);
            }
        }
        return result;
    }

}