package com.googosoft.until;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

/**
 * HTML helpers: strip markup from an HTML string, extract the {@code src}
 * values of {@code <img>} tags, and download an image to a local directory.
 *
 * <p>NOTE(review): this file was recovered from a copy in which HTML-like text
 * had been stripped out of the string literals. The script/style/img regular
 * expressions and the sample HTML in the test are reconstructions — confirm
 * them against the original source if it is available.
 */
public class HtmlUtil {

    /** Matches a whole {@code <script>...</script>} element, including its content. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>...</style>} element, including its content. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);

    /** Matches any single HTML tag. */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /** Matches a single {@code <img ...>} tag. */
    private static final Pattern IMG_PATTERN =
            Pattern.compile("<img\\b[^>]*?>", Pattern.CASE_INSENSITIVE);

    /**
     * Captures the value of a {@code src} attribute inside an img tag. A leading
     * double quote is consumed by the pattern; single quotes are left in the
     * captured group and removed afterwards by {@link #handleSrc(String)}.
     */
    private static final Pattern SRC_PATTERN =
            Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)");

    /**
     * Removes script elements, style elements and all remaining HTML tags from
     * the given string.
     *
     * @param htmlStr the HTML text to clean; may be {@code null}
     * @return the trimmed plain text, or an empty string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Script and style elements go first so their content is removed together
        // with the tags; the generic tag pattern would leave that content behind.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Downloads the image at the given URL into the given directory. The file is
     * named after the current time in milliseconds with a {@code .jpg} extension.
     *
     * @param urlHttp network URL of the image
     * @param path    directory in which the new image file is created
     * @return the path of the downloaded file, or an empty string if the download failed
     */
    private static String getPicture2(String urlHttp, String path) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(urlHttp);
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            String newPath = path + "/" + System.currentTimeMillis() + ".jpg";
            // try-with-resources closes both streams even when one of them fails
            // to open or close, and never dereferences a null stream.
            try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream());
                 FileOutputStream out = new FileOutputStream(newPath)) {
                byte[] buf = new byte[1024];
                int len;
                while ((len = in.read(buf)) != -1) {
                    out.write(buf, 0, len);
                }
                out.flush();
            }
            return newPath;
        } catch (Exception e) {
            // Best-effort download: report the problem and signal failure with an
            // empty path rather than returning the name of a missing or partial file.
            e.printStackTrace();
            return "";
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Extracts the {@code src} values of all img tags in an HTML string.
     *
     * @param htmlStr the HTML string to scan
     * @return the list of image sources, in document order; empty if none are found
     */
    private static List<String> getImgStrList(String htmlStr) {
        List<String> list = new ArrayList<>();
        if (htmlStr == null) {
            return list;
        }
        Matcher imgMatcher = IMG_PATTERN.matcher(htmlStr);
        while (imgMatcher.find()) {
            Matcher srcMatcher = SRC_PATTERN.matcher(imgMatcher.group());
            while (srcMatcher.find()) {
                list.add(handleSrc(srcMatcher.group(1)));
            }
        }
        return list;
    }

    /**
     * Removes a leading and/or trailing single quote from an image src value.
     *
     * @param src the src value of an image; may be {@code null}
     * @return the src without surrounding single quotes, or {@code null} if {@code src} is {@code null}
     */
    private static String handleSrc(String src) {
        if (src == null) {
            return null;
        }
        int start = src.startsWith("'") ? 1 : 0;
        int end = src.length();
        // The "end > start" guard keeps a lone "'" from being counted as both
        // the leading and the trailing quote.
        if (end > start && src.endsWith("'")) {
            end--;
        }
        return src.substring(start, end);
    }

    /**
     * Manual test: extracts the image sources from a sample HTML string,
     * downloads each one and prints the resulting local path.
     */
    @Test
    public void testTransSrc() throws Exception {
        // NOTE(review): the original sample markup was lost (only the text "标题"
        // survived); this is placeholder HTML — substitute a real image URL.
        String str = "<p><img src=\"http://example.com/sample.jpg\"/></p><p>标题</p>";
        List<String> imgList = getImgStrList(str);
        for (String img : imgList) {
            System.out.println(getPicture2(img, "D://uploadFiles"));
        }
    }
}