抓取一些乱的URL

最新推荐文章于 2022-09-12 12:42:36 发布

smilelearner

最新推荐文章于 2022-09-12 12:42:36 发布

阅读量387

点赞数

分类专栏：网页抓取　文章标签：java 网络 url json

本文链接：https://blog.csdn.net/smileks/article/details/73331055

版权

网页抓取专栏收录该内容

1 篇文章 0 订阅

订阅专栏

概述

抓取一些杂乱的 URL：这些 URL 可能指向图片，可能返回 JSON，也可能是一个下载链接。

获取网络请求的 header（响应头）

/**
 * Sends a GET request for {@code url} and collects the response body and headers.
 *
 * <p>The returned map stores the body text under the key {@code "pageContent"} and the
 * response {@code Header[]} under the key {@code "headers"}. The map is left empty when
 * the client returns {@code null} or when reading the view throws
 * {@code JQHttpClientException} (the exception is printed, not rethrown).
 *
 * @param url  address to request
 * @param regx text passed to the page-check callback; it is tested with a plain
 *             {@code String.contains}, not as a regular expression
 * @return the map described above; never {@code null}
 */
public static Map<String, Object> getPageContent(String url, String regx) {
        Map<String, Object> result = new HashMap<>();
        AnimalHttpClient httpClient = AnimalExt.assistant(AnimalHttpClient.class);

        AnimalRquest req = new AnimalRquest();
        req.Get(url);
        req.commondHeader();
        //req.setAutoProxy(false);
        req.setTimeout(6000);
        req.setReqTryNum(2);
        req.setIoErrorDealyTime(6000);
        // Redirect settings: both circular redirects and redirects in general are switched off.
        req.setCircularRedirectsAllowed(false);
        req.setRedirectsEnabled(false);
        req.setAutoLoaclAddress(true);

        // The callback reports whether the page text contains regx.
        // NOTE(review): how the client reacts to a false result is not visible here — confirm.
        ResultView view = httpClient.request(req, new ParseView() {
            @Override
            public Boolean call(String page) {
                return page.contains(regx);
            }
        });
        if (view == null) {
            return result;
        }

        try {
            // Read both values before storing either, so a failure leaves the map empty.
            String body = view.getContent().asString();
            Header[] responseHeaders = view.getAllHeaders();
            result.put("pageContent", body);
            result.put("headers", responseHeaders);
        } catch (JQHttpClientException e) {
            e.printStackTrace();
        } finally {
            view.close();
        }
        return result;
    }

/**
 * Fetches the URL stored in {@code seed}, extracts basic metadata from the response and
 * appends one {@code Scd} record describing it to {@code Scds}.
 *
 * <p>The record carries the URL, the media type taken from the {@code Content-Type}
 * response header, the HTML title, the meta keywords/description and a state string
 * ({@code "1"} by default, {@code "404"} or {@code "403"} when the title contains that
 * number). The body is stored in {@code <content>} only when the media type contains
 * {@code "json"}; otherwise that field is empty. Nothing is appended when the fetched
 * body is blank. Failures in any step are printed and the remaining steps still run.
 *
 * @param Scds list that receives the newly built record
 * @param seed record whose {@code <url>} field names the page to fetch
 */
public void getdetail(List<Scd> Scds, Scd seed) {
        String url = "";
        String contenttype = "";
        String title = "";
        String metakeywords = "";
        String metadescription = "";
        String state = "1";
        String content = "";
        Header[] headers = null;
        try {
            url = seed.getField("<url>");
            Map<String, Object> contentmap = getPageContent(url, "");
            content = (String) contentmap.get("pageContent");
            headers = (Header[]) contentmap.get("headers");
            readnum++;
            System.out.println("读到" + readnum + "个种子" + ">>>" + url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (StringUtils.isBlank(content)) {
            // NOTE(review): the original assigned a "page is empty" state here and then
            // returned without recording it anywhere. The dead store was removed; confirm
            // whether blank pages were meant to produce an SCD with that state instead.
            return;
        }
        // Response headers: pick out the media type.
        try {
            if (headers != null) {
                for (Header h : headers) {
                    // HTTP field names are case-insensitive, and the match must be exact:
                    // a substring test also hits "X-Content-Type-Options" and would store
                    // its value ("nosniff") as the content type.
                    if (h == null || !"Content-Type".equalsIgnoreCase(h.getName())) {
                        continue;
                    }
                    String value = h.getValue();
                    if (value == null) {
                        // Keep contenttype non-null so the SCD step below cannot fail on it.
                        continue;
                    }
                    // Drop parameters such as "; charset=utf-8". The full-width semicolon
                    // is handled as well, as in the original code.
                    value = StringUtils.substringBefore(value, ";");
                    value = StringUtils.substringBefore(value, "；");
                    contenttype = value.trim();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        Document doc = Jsoup.parse(content);
        // Title: also used as a rough detector for error pages.
        try {
            title = doc.title();
            if (title.contains("404")) {
                state = "404";
            }
            if (title.contains("403")) {
                state = "403";
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Meta keywords and meta description.
        try {
            Element htmlheads = doc.head();
            Elements metaeles = htmlheads.select("meta");
            for (Element ele : metaeles) {
                // Compare in lower case so that name="Keywords" / name="Description" match too.
                String metaname = ele.attr("name").toLowerCase(java.util.Locale.ROOT);
                // "keyword" also covers "keywords".
                if (metaname.contains("keyword")) {
                    metakeywords = ele.attr("content");
                }
                // "description" also covers "descriptions".
                if (metaname.contains("description")) {
                    metadescription = ele.attr("content");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Build and append the SCD record.
        try {
            Scd scd = new Scd();
            scd.addField("<DOCID>", UidGenerator.md5(url));
            scd.addField("<date>", c_data);
            scd.addField("<url>", url);
            scd.addField("<contenttype>", contenttype);
            scd.addField("<title>", title);
            scd.addField("<metakeywords>", metakeywords);
            scd.addField("<metadescription>", metadescription);
            // Only JSON responses keep their body; everything else stores an empty content field.
            if (contenttype.contains("json")) {
                scd.addField("<content>", content);
            } else {
                scd.addField("<content>", "");
            }
            scd.addField("<state>", state);
            Scds.add(scd);
            creatnum++;
            System.out.println(scd.toStringScd());
            System.out.println("=====产生SCD======" + creatnum);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }