抓取一些杂乱的URL

概述

抓一些杂乱的URL,URL可能是图片,可能是json,可能是一个下载链接

获得网络请求的header

/**
 * Issues a GET request for {@code url} and returns the response body together with
 * the response headers.
 *
 * <p>On success the returned map holds the body text under {@code "pageContent"} and
 * the {@code Header[]} under {@code "headers"}. The map is empty when the client
 * returns no result, or when reading the response throws
 * {@code JQHttpClientException} (the stack trace is printed and the error is not
 * propagated).
 *
 * @param url  address to request
 * @param regx literal text handed to {@link String#contains} inside the page-check
 *             callback; despite the name it is NOT evaluated as a regular expression,
 *             and an empty string matches every page
 * @return a map containing both entries, or an empty map; never {@code null}
 */
public static Map<String, Object> getPageContent(String url, String regx) {
    Map<String, Object> contentmap = new HashMap<String, Object>();
    AnimalHttpClient client = AnimalExt.assistant(AnimalHttpClient.class);

    AnimalRquest request = new AnimalRquest();
    request.Get(url);
    request.commondHeader();
    //request.setAutoProxy(false);
    request.setTimeout(6000);
    request.setReqTryNum(2);
    request.setIoErrorDealyTime(6000);
    // Redirects are switched off (both circular redirects and redirects in general).
    request.setCircularRedirectsAllowed(false);
    request.setRedirectsEnabled(false);
    request.setAutoLoaclAddress(true);

    // The callback reports whether the fetched page contains the expected text.
    // NOTE(review): presumably the client uses this result to accept or retry a page - confirm.
    ResultView view = client.request(request, new ParseView() {
        @Override
        public Boolean call(String page) {
            return page.contains(regx);
        }
    });

    if (view == null) {
        return contentmap;
    }

    try {
        // Read both parts before storing either one, so the map never ends up half-filled.
        String body = view.getContent().asString();
        Header[] responseHeaders = view.getAllHeaders();
        contentmap.put("pageContent", body);
        contentmap.put("headers", responseHeaders);
    } catch (JQHttpClientException e) {
        e.printStackTrace();
    } finally {
        view.close();
    }
    return contentmap;
}

/**
 * Fetches the URL stored in {@code seed}, extracts basic metadata from the response
 * and appends one SCD record describing it to {@code Scds}.
 *
 * <p>The record carries the URL, the response content type (without parameters such
 * as {@code charset}), the HTML title, the {@code keywords} and {@code description}
 * meta values and a state field. The state starts as {@code "1"} and becomes
 * {@code "404"} or {@code "403"} when the title contains that text. The raw body is
 * stored in {@code <content>} only when the content type contains {@code "json"};
 * otherwise that field is empty.
 *
 * <p>Every step is best-effort: failures are printed and processing continues with
 * the defaults. A blank or missing body produces no record at all.
 *
 * @param Scds output list that receives the generated record
 * @param seed record whose {@code <url>} field names the page to fetch
 */
public void getdetail(List<Scd> Scds, Scd seed) {
    String url = "";
    String content = "";
    Header[] headers = null;
    try {
        url = seed.getField("<url>");
        Map<String, Object> contentmap = getPageContent(url, "");
        content = (String) contentmap.get("pageContent");
        headers = (Header[]) contentmap.get("headers");
        readnum++;
        System.out.println("读到" + readnum + "个种子" + ">>>" + url);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Blank pages are skipped entirely; no SCD is produced for them.
    if (StringUtils.isBlank(content)) {
        return;
    }

    // Content type from the response headers.
    String contenttype = "";
    try {
        if (headers != null) {
            for (Header h : headers) {
                // Match the header name exactly (case-insensitively). A substring match
                // would also accept "X-Content-Type-Options" and let its value
                // (e.g. "nosniff") replace the real content type.
                if ("Content-Type".equalsIgnoreCase(h.getName())) {
                    String value = h.getValue();
                    if (value != null) {
                        // Drop parameters such as "; charset=utf-8"; substringBefore
                        // returns the input unchanged when there is no ';'.
                        contenttype = StringUtils.substringBefore(value, ";");
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    Document doc = Jsoup.parse(content);

    // Title, which also drives the state: a later match overrides an earlier one,
    // so a title containing both "404" and "403" ends up as "403".
    String title = "";
    String state = "1";
    try {
        title = doc.title();
        if (title.contains("404")) {
            state = "404";
        }
        if (title.contains("403")) {
            state = "403";
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    // meta keywords / meta description; when several tags match, the last one wins.
    String metakeywords = "";
    String metadescription = "";
    try {
        Elements metaeles = doc.head().select("meta");
        for (Element ele : metaeles) {
            String metaname = ele.attr("name");
            // "keyword" also covers "keywords"; "description" also covers "descriptions".
            if (metaname.contains("keyword")) {
                metakeywords = ele.attr("content");
            }
            if (metaname.contains("description")) {
                metadescription = ele.attr("content");
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Build and emit the SCD record.
    try {
        Scd scd = new Scd();
        scd.addField("<DOCID>", UidGenerator.md5(url));
        scd.addField("<date>", c_data);
        scd.addField("<url>", url);
        scd.addField("<contenttype>", contenttype);
        scd.addField("<title>", title);
        scd.addField("<metakeywords>", metakeywords);
        scd.addField("<metadescription>", metadescription);
        // The body is kept only for JSON responses.
        scd.addField("<content>", contenttype.contains("json") ? content : "");
        scd.addField("<state>", state);
        Scds.add(scd);
        creatnum++;
        System.out.println(scd.toStringScd());
        System.out.println("=====产生SCD======" + creatnum);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

需要注意

需要判断 GET 到的页面大小:如果页面太大(超过 2M),就直接舍弃

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值