package com.myhitron.jlw.forum.util;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.myhitron.jlw.core.util.DataUtil;
public class CatchHtmlUtil {
// img标签
private static final String IMGURL_REGEX = "(".*?"|<img.*src=(.*?)[^>]*?>)";
// src路径
private static final String IMGSRC_REGEX = "http:\"?(.*?)(\"|>|\\s+)";
/**
* 获取富文本信息的的文字
* @author xuye
* Date: 2017年8月22日 上午10:04:10
* @param html
* @return
*/
public static String catchWord(String HTMLSource) {
if (!DataUtil.isNotEmpty(HTMLSource)) {
return null;
}
String regMatchEnter = "\\s*|\t|\r|\n";
Pattern p = Pattern.compile(regMatchEnter);
Matcher m = p.matcher(HTMLSource);
HTMLSource = m.replaceAll("");
String regMatchTag = "<[^>]*>";
Pattern p1 = Pattern.compile(regMatchTag);
Matcher m1 = p1.matcher(HTMLSource);
HTMLSource = m1.replaceAll("");
HTMLSource = HTMLSource.replaceAll(regMatchTag, "");
return HTMLSource;
}
/**
* 获取img标签
* @author xuye
* Date: 2017年8月22日 上午10:07:58
* @param HTML
* @return
*/
public static List<String> getImageUrl(String html) {
Matcher matcher = Pattern.compile(IMGURL_REGEX).matcher(html);
List<String> listImgUrl = new ArrayList<String>();
while (matcher.find()) {
listImgUrl.add(matcher.group().replaceAll(""", "\""));
}
return listImgUrl;
}
/**
* 获取ImgSrc路径
* @author xuye
* Date: 2017年8月22日 上午10:08:04
* @param listImageUrl
* @return
*/
public static List<String> getImageSrc(List<String> listImageUrl) {
List<String> listImgSrc = new ArrayList<String>();
for (String image : listImageUrl) {
Matcher matcher = Pattern.compile(IMGSRC_REGEX).matcher(image);
while (matcher.find()) {
listImgSrc.add(matcher.group().substring(0, matcher.group().length() - 1));
}
}
return listImgSrc;
}
public static void main(String[] args) {
String html = "<p style=\"text-indent:0em;margin:4px auto 0px auto;\"><br></p><img src=\"\" width=\"100%\"><p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p><div style=\"margin: 30px 0px; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 800px; background-image: url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523656437.jpg");\" class=\"image\"></div><div style=\"display: flex\"><div style=\"flex: 2;padding-left:100px;padding-right: 40px;\"><h1 style=\"margin-bottom: 20px;font-size: 30px;height:45px\" class=\"text\">将来网论坛</h1><h3 style=\"font-size: 16px;min-height: 180px;\" class=\"text\">好棒的论坛!</h3></div><p style=\"flex: 1;font-size: 16px;min-height:200px;margin: 0 40px;\" class=\"text\">赞赞赞</p></div><div style=\"display: flex;margin-top: 30px;\"><div style=\"flex: 1 1 0%; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 800px; background-image: url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523686912.jpg");\" class=\"image\"></div><div style=\"flex: 2;margin-left: 30px\"><div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 390px; margin-bottom: 20px; background-image: url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523688911.jpg");\" class=\"image\"></div><div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 390px; background-image: url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523693965.jpg");\" class=\"image\"></div></div></div><div style=\"display: flex;margin-top: 30px;\"><p style=\"flex:1;font-size: 14px;margin:0 40px;min-height: 400px;\" class=\"text\">哇塞</p><p>哇塞</p><p>哇塞</p></div><div style=\"display: flex;margin: 30px 0;\"><div style=\"flex: 3 1 0%; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 1200px; background-image: 
url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523713183.jpg");\" class=\"image\"></div><div style=\"flex: 1;padding:0 40px\"><p>哇塞</p><div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 300px; margin: 60px 0px; background-image: url("http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523721282.jpg");\" class=\"image\"></div><p>哇塞</p></div></div>";
//String html = "<p style=\"text-indent:0em;margin:4px auto 0px auto;\"><font style=\"font-size:20.000000;color:#000000\">yu</font></p><img src=\"http://jlw.myhitron.com/jlw-forum/headimg/thumb-87-1503759242787_750_485.jpg\" width=\"100%\"/><p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p><img src=\"http://jlw.myhitron.com/jlw-forum/headimg/thumb-87-1503759251229_1280_992.jpg\" width=\"100%\"/><p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p>";
//获取文字
System.out.println(catchWord(html));
System.out.println("###############################################");
List<String> imgUrl = getImageUrl(html);
System.out.println("许晔抓图片" + imgUrl.toString());
//获取图片src地址
List<String> imgSrc = getImageSrc(imgUrl);
System.out.println("许晔抓的图片" + imgSrc.toString());
}
}
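// Source article title: "html页面数据抓取" (HTML page data scraping).
// Blog page footer: latest recommended article published 2024-02-27 23:14:33.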