概述
抓一些杂乱的URL,URL可能是图片,可能是json,可能是一个下载链接
获得网络请求的header
/**
 * Issues a GET request for {@code url} and returns the response body and headers.
 *
 * <p>The request is sent with a 6000 timeout, 2 tries, a 6000 I/O-error delay and with
 * redirects disabled. The {@code ParseView} callback handed to the client reports whether
 * the page text contains {@code regx} as a literal substring (it is not evaluated as a
 * regular expression). NOTE(review): presumably the client uses that result to accept or
 * retry the response -- confirm against {@code AnimalHttpClient}.
 *
 * @param url  address to fetch
 * @param regx literal text the callback looks for in the page
 * @return a map holding the body under {@code "pageContent"} (a {@code String}) and the
 *         response headers under {@code "headers"} (a {@code Header[]}); the map is empty
 *         when the client returns no view or when reading the view fails
 */
public static Map<String, Object> getPageContent(String url, String regx) {
    AnimalHttpClient client = AnimalExt.assistant(AnimalHttpClient.class);

    AnimalRquest request = new AnimalRquest();
    request.Get(url);
    request.commondHeader();
    //request.setAutoProxy(false);
    request.setTimeout(6000);
    request.setReqTryNum(2);
    request.setIoErrorDealyTime(6000);
    // Redirects are switched off entirely.
    request.setCircularRedirectsAllowed(false);
    request.setRedirectsEnabled(false);
    request.setAutoLoaclAddress(true);

    ResultView view = client.request(request, new ParseView() {
        @Override
        public Boolean call(String page) {
            return page.contains(regx);
        }
    });

    Map<String, Object> result = new HashMap<String, Object>();
    if (view == null) {
        return result;
    }

    try {
        // Read both parts before storing either, so a failure leaves the map empty.
        String body = view.getContent().asString();
        Header[] responseHeaders = view.getAllHeaders();
        result.put("pageContent", body);
        result.put("headers", responseHeaders);
    } catch (JQHttpClientException e) {
        e.printStackTrace();
    } finally {
        view.close();
    }
    return result;
}
/**
 * Fetches the page referenced by the seed's {@code <url>} field, extracts basic metadata
 * from it and appends one {@code Scd} record describing the page to {@code Scds}.
 *
 * <p>The record carries {@code <DOCID>} (md5 of the URL), {@code <date>}, {@code <url>},
 * {@code <contenttype>}, {@code <title>}, {@code <metakeywords>}, {@code <metadescription>},
 * {@code <content>} and {@code <state>}. The body is stored in {@code <content>} only when
 * the content type contains {@code "json"}; otherwise that field is empty. {@code <state>}
 * is {@code "1"} unless the HTML title contains {@code "404"} or {@code "403"}.
 *
 * <p>Nothing is appended when the fetched body is blank. Each extraction step is
 * best-effort: a failure is printed and the corresponding field keeps its empty default.
 * The method also increments the {@code readnum} and {@code creatnum} counters and reads
 * {@code c_data}, all of which are declared outside this block.
 *
 * @param Scds output list that receives the generated record
 * @param seed seed record whose {@code <url>} field names the page to fetch
 */
public void getdetail(List<Scd> Scds, Scd seed) {
    String url = "";
    String content = "";
    Header[] headers = null;
    try {
        url = seed.getField("<url>");
        Map<String, Object> contentmap = getPageContent(url, "");
        content = (String) contentmap.get("pageContent");
        headers = (Header[]) contentmap.get("headers");
        readnum++;
        System.out.println("读到" + readnum + "个种子" + ">>>" + url);
    } catch (Exception e) {
        e.printStackTrace();
    }

    // NOTE(review): the original assigned state = "页面为空" here and then returned without
    // emitting a record, so that state was never visible anywhere. The early return is kept;
    // confirm whether blank pages should produce a record instead.
    if (StringUtils.isBlank(content)) {
        return;
    }
    // NOTE(review): the notes accompanying this code ask for pages larger than 2M to be
    // discarded; no size check is performed here.

    // Content type taken from the response headers, with any parameters (";charset=...") dropped.
    String contenttype = "";
    try {
        if (headers != null) {
            for (Header h : headers) {
                // Exact, case-insensitive comparison: a substring match would also accept
                // headers such as "X-Content-Type-Options" and overwrite the real value.
                if (!"Content-Type".equalsIgnoreCase(h.getName())) {
                    continue;
                }
                String value = h.getValue();
                if (value != null) {
                    // substringBefore returns the input unchanged when there is no ';'.
                    contenttype = StringUtils.substringBefore(value, ";");
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    Document doc = Jsoup.parse(content);

    // Title; a title mentioning 404/403 is treated as an error page ("403" wins if both appear).
    String title = "";
    String state = "1";
    try {
        title = doc.title();
        if (title.contains("404")) {
            state = "404";
        }
        if (title.contains("403")) {
            state = "403";
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    // meta keywords / meta description; when several tags match, the last one wins.
    String metakeywords = "";
    String metadescription = "";
    try {
        Elements metaeles = doc.head().select("meta");
        for (Element ele : metaeles) {
            String metaname = ele.attr("name");
            // "keyword" also covers "keywords", "description" also covers "descriptions".
            if (metaname.contains("keyword")) {
                metakeywords = ele.attr("content");
            }
            if (metaname.contains("description")) {
                metadescription = ele.attr("content");
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Assemble and emit the SCD record.
    try {
        Scd scd = new Scd();
        scd.addField("<DOCID>", UidGenerator.md5(url));
        scd.addField("<date>", c_data);
        scd.addField("<url>", url);
        scd.addField("<contenttype>", contenttype);
        scd.addField("<title>", title);
        scd.addField("<metakeywords>", metakeywords);
        scd.addField("<metadescription>", metadescription);
        // Only JSON bodies are stored; every other content type gets an empty content field.
        scd.addField("<content>", contenttype.contains("json") ? content : "");
        scd.addField("<state>", state);
        Scds.add(scd);
        creatnum++;
        System.out.println(scd.toStringScd());
        System.out.println("=====产生SCD======" + creatnum);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
需要注意
需要判断get到的页面的大小,如果太大,超过2M,就舍弃