修改 org.apache.nutch.fetcher 包中的 Fetcher.java,使其在抓取过程中把图片保存到本地。
一、追加以下两个私有方法(writeImgToLocal 和 getDomain)
/**
 * Writes fetched content to a local file when the URL ends with a known image extension.
 *
 * <p>The target file is {@code <img.output.path>/<domain>/<md5(url)>.<ext>}, where
 * {@code img.output.path} is read from the configuration and {@code <domain>} is derived
 * from the URL host via {@link #getDomain(String)}. This is a best-effort side task:
 * any failure is swallowed so that it never interrupts the caller.
 *
 * @param url     the URL the content was fetched from; ignored when {@code null}
 * @param content the fetched content; ignored when {@code null}
 */
private void writeImgToLocal(String url, Content content) {
  // Nothing to save; returning here avoids a NullPointerException in the caller's thread.
  if (url == null || content == null) {
    return;
  }

  String fileType = "";
  int index = url.lastIndexOf(".");
  if (index > 0) {
    // Locale.ROOT keeps the comparison stable (e.g. "GIF" would become "gıf" under a
    // Turkish default locale and never match).
    fileType = url.substring(index + 1).toLowerCase(java.util.Locale.ROOT);
  }

  boolean isImage = "jpg".equals(fileType) || "png".equals(fileType)
      || "gif".equals(fileType) || "ico".equals(fileType)
      || "jpeg".equals(fileType) || "bmp".equals(fileType);
  if (!isImage) {
    return;
  }

  String outPath = getConf().get("img.output.path");
  if (outPath == null) {
    // Output directory not configured: skip rather than write under a literal "null" path.
    return;
  }

  byte[] contentByte = content.getContent();
  if (contentByte == null) {
    return;
  }

  String fileName = DigestUtils.md5Hex(url) + "." + fileType;
  try {
    String domain = getDomain(new URL(url).getHost());
    // File(parent, child) inserts the separator itself, so the configured path works
    // with or without a trailing slash.
    File dir = new File(outPath, domain);
    // mkdirs() also creates missing parents; the second isDirectory() check tolerates
    // another thread creating the directory concurrently.
    if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
      return;
    }
    FileOutputStream fos = new FileOutputStream(new File(dir, fileName));
    try {
      fos.write(contentByte);
    } finally {
      // Always release the file handle, even when write() fails.
      fos.close();
    }
  } catch (IOException ignored) {
    // Covers MalformedURLException and FileNotFoundException as well. Saving the image
    // is optional, so a failure here is deliberately not propagated.
  }
}
/**
 * Derives a folder-friendly domain name from a host by dropping its leading label.
 *
 * <p>The leading label is removed only when what remains still contains a dot, so
 * {@code "www.example.com"} becomes {@code "example.com"} while {@code "example.com"}
 * and {@code "localhost"} are returned unchanged (instead of collapsing to {@code "com"}).
 * NOTE(review): multi-part public suffixes (e.g. {@code "example.co.uk"}) and numeric IP
 * hosts are not treated specially.
 *
 * @param host the host name to shorten; must not be {@code null}
 * @return the host without its first label, or the host itself when it has fewer than
 *         three labels
 */
private String getDomain(String host) {
  int firstDot = host.indexOf('.');
  if (firstDot < 0) {
    return host;
  }
  String remainder = host.substring(firstDot + 1);
  // Keep the full host when stripping would leave only a bare top-level label.
  if (remainder.indexOf('.') < 0) {
    return host;
  }
  return remainder;
}
二、在 Fetcher.java 的 run 方法中,取得 content 之后加入对 writeImgToLocal 的调用
// Obtain the protocol output for this fetch item, then read its status and content.
ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
// Added: write the content to a local file when the URL is an image.
// NOTE(review): in this excerpt the call happens before the status is examined —
// confirm that content cannot be null at this point.
writeImgToLocal(fit.url.toString(), content);
ParseStatus pstatus = null;
// unblock queue
fetchQueues.finishFetchItem(fit);
String urlString = fit.url.toString();
// Count this fetch under the "FetcherStatus" counter group, keyed by the status name.
reporter.incrCounter("FetcherStatus", status.getName(), 1);