Java 自动下载网页图片源码 —— OSChina 源码:自动下载 HTML 中的非本地图片,并替换内容

package net.oschina.utils;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.URL;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.HashMap;

import org.apache.commons.io.FilenameUtils;

import org.apache.commons.io.IOUtils;

import org.apache.commons.lang.RandomStringUtils;

import org.apache.commons.lang.StringUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import my.mvc.BadWord;

import my.mvc.BlockIP;

import my.mvc.RequestContext;

import my.util.Multimedia;

/**

* Action类常用工具

* @author Winter Lau @ OSChina

*/

public class HTMLImageFetcher{

public static void main(String[] args) {

String html = "这张图片很漂亮啊!"+

""+

" ,太帅了!oschina";

System.out.println(fetchHTML_Images(html));

}

/**

* 下载HTML文档中的所有图片

* @param html

* @return

*/

protected static String fetchHTML_Images(String html) {

if(StringUtils.isBlank(html))

return html;

HashMap img_urls = new HashMap();

Document doc = Jsoup.parse(html);

Elements imgs = doc.select("img");

for(int i=0;i

Element img = imgs.get(i);

String src = img.attr("src");

if(!src.startsWith("/"))

try {

URL imgUrl = new URL(src);

String imgHost = imgUrl.getHost().toLowerCase();

if(!imgHost.endsWith(".oschina.net")){

String new_src = img_urls.get(src);

if(new_src == null){

new_src = fetchImageViaHttp(imgUrl);

img_urls.put(src, new_src);

}

img.attr("src", new_src);

}

} catch (MalformedURLException e) {

img.remove();

} catch (Exception e){

e.printStackTrace();

img.remove();

}

}

return doc.body().html();

}

private static String fetchImageViaHttp(URL imgUrl) throws IOException {

String sURL = imgUrl.toString();

String imgFile = imgUrl.getPath();

HttpURLConnection cnx = (HttpURLConnection)imgUrl.openConnection();

String uri = null;

try{

cnx.setAllowUserInteraction(false);

cnx.setDoOutput(true);

cnx.addRequestProperty("Cache-Control", "no-cache");

RequestContext ctx = RequestContext.get();

if(ctx != null)

cnx.addRequestProperty("User-Agent", ctx.header("user-agent"));

else

cnx.addRequestProperty("User-Agent", user_agent);

cnx.addRequestProperty("Referer", sURL.substring(0, sURL.indexOf('/', sURL.indexOf('.'))+1));

cnx.connect();

if(cnx.getResponseCode() != HttpURLConnection.HTTP_OK)

return null;

InputStream imgData = cnx.getInputStream();

String ext = FilenameUtils.getExtension(imgFile).toLowerCase();

if(!Multimedia.isImageFile("aa."+ext))

ext = "jpg";

uri = FMT_FN.format(new Date()) + RandomStringUtils.randomAlphanumeric(4) + '.' + ext;

File fileDest = new File(img_path + uri);

if(!fileDest.getParentFile().exists())

fileDest.getParentFile().mkdirs();

FileOutputStream fos = new FileOutputStream(fileDest);

try{

IOUtils.copy(imgData, fos);

}finally{

IOUtils.closeQuietly(imgData);

IOUtils.closeQuietly(fos);

}

}finally{

cnx.disconnect();

}

return RequestContext.get().contextPath() + "/uploads/img/" + uri;

}

protected final static String img_path = RequestContext.root() + "uploads" +

File.separator + "img" + File.separator;

protected final static SimpleDateFormat FMT_FN = new SimpleDateFormat("yyyyMM/ddHHmmss_");

private final static String user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)";

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值