java 正则解析HTML代码段

本文使用正则表达式解析HTML代码段中的图片地址、视频地址和文字描述,具体代码如下:

package zhoushitong;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates pulling image URLs, video URLs and tag contents out of an HTML
 * fragment with regular expressions.
 *
 * <p>The patterns below are meant for small fragments such as the bundled
 * sample. They do not understand nesting (a {@code td} inside a {@code td}),
 * comments or scripts.
 */
public class test {

    /**
     * Captures the {@code src} value of every {@code img} tag.
     *
     * <p>The value is captured with a negated character class instead of a lazy
     * {@code .*?}, so it always stops at the closing quote, at whitespace or at
     * the end of the tag. A self-closing tag such as {@code <img src="a.jpg"/>}
     * therefore yields {@code a.jpg} and not {@code a.jpg"/}.
     */
    public static final String IMG_SRC_REGEX =
            "<img\\b[^<>]*?\\ssrc\\s*=\\s*['\"]?([^'\"\\s<>]*)";

    /** Captures the {@code src} value of every {@code video} tag; same rules as {@link #IMG_SRC_REGEX}. */
    public static final String VIDEO_SRC_REGEX =
            "<video\\b[^<>]*?\\ssrc\\s*=\\s*['\"]?([^'\"\\s<>]*)";

    /**
     * Captures the inner HTML of every {@code span} element.
     *
     * <p>{@code \b} keeps the pattern from matching longer tag names that merely
     * start with the same letters, and {@code (?s)} lets the content span
     * several lines.
     */
    public static final String SPAN_REGEX = "(?s)<span\\b[^<>]*>(.*?)</span>";

    /** Captures the inner HTML of every {@code em} element (and not, e.g., {@code embed}). */
    public static final String EM_REGEX = "(?s)<em\\b[^<>]*>(.*?)</em>";

    /** Captures the inner HTML of every {@code td} element. */
    public static final String TD_REGEX = "(?s)<td\\b[^<>]*>(.*?)</td>";

    /**
     * Sample product-description fragment used by {@link #main(String[])}.
     *
     * <p>It is assembled from several literals because a Java string literal
     * cannot contain a raw line break; the pieces join into one single-line
     * string.
     */
    public static final String SAMPLE_HTML =
            "<table style=\"float: left; font-size: 12px;margin-bottom:10px;border:1px solid #CCC;width:100%;border-collapse:inherit;padding:8px;border-top:none\"><tbody><tr class=\"firstRow\">"
            + "<td width=\"220\" valign=\"top\" style=\"float: left; display: inline; width: 220px; height: 18px; overflow: hidden; margin: 10px 15px 0px 0px; line-height: 18px; vertical-align: top; white-space: nowrap; text-overflow: ellipsis; color: rgb(102, 102, 102); word-break: break-all;\"><em style=\"font-style: normal;\">品牌:</em> 巴巴腾</td>"
            + "<td width=\"220\" valign=\"top\" style=\"float: left; display: inline; width: 220px; height: 18px; overflow: hidden; margin: 10px 15px 0px 0px; line-height: 18px; vertical-align: top; white-space: nowrap; text-overflow: ellipsis; color: rgb(102, 102, 102); word-break: break-all;\"><em style=\"font-style: normal;\">货号:A1</em></td>"
            + "<td width=\"220\" valign=\"top\" style=\"list-style: none; float: left; white-space: nowrap; text-overflow: ellipsis; display: inline; height: 18px; overflow: hidden; line-height: 18px; vertical-align: top; color: rgb(102, 102, 102); margin: 10px 15px 0px 0px; word-break: break-all;\"><em style=\"font-style: normal;\">颜色分类:白色</em></td></tr><tr>"
            + "<td width=\"220\" valign=\"top\" style=\"float: left; display: inline; width: 220px; height: 18px; overflow: hidden; margin: 10px 15px 0px 0px; line-height: 18px; vertical-align: top; white-space: nowrap; text-overflow: ellipsis; color: rgb(102, 102, 102); word-break: break-all;\"><em style=\"font-style: normal;\">材质:</em>PVC</td>"
            + "<td width=\"220\" valign=\"top\" style=\"float: left; display: inline; width: 220px; height: 18px; overflow: hidden; margin: 10px 15px 0px 0px; line-height: 18px; vertical-align: top; white-space: nowrap; text-overflow: ellipsis; color: rgb(102, 102, 102); word-break: break-all;\"></td>"
            + "<td width=\"220\" valign=\"top\" style=\"float: left; display: inline; width: 220px; height: 18px; overflow: hidden; margin: 10px 15px 0px 0px; line-height: 18px; "
            + "vertical-align: top; white-space: nowrap; text-overflow: ellipsis; color: rgb(102, 102, 102); word-break: break-all;display:none\"></td></tr></tbody></table>"
            + "<p><video src=\"./data/files/video/20171130/1512044000315028.mp4\" autoplay=\"true\" class=\"edui-upload-video video-js vjs-default-skin video-js\" controls=\"\" preload=\"none\" width=\"70%\" height=\"70%\"></video>"
            + "<img src=\"./data/files/store_2/goods_49/201711301644098301.jpg\" alt=\"7_01-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_49/201711301644094874.jpg\" alt=\"7_02-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_49/201711301644099889.jpg\" alt=\"7_03-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_49/201711301644093468.jpg\" alt=\"7_04-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_50/201711301644102839.jpg\" alt=\"7_05-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_50/201711301644104379.jpg\" alt=\"7_06-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_50/201711301644106072.jpg\" alt=\"7_07-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_51/201711301644111540.jpg\" alt=\"7_08-jpg.jpg\"/>"
            + "<img src=\"./data/files/store_2/goods_147/201808021039077422.png\" alt=\"7.png\"/></p><p></p>";

    /**
     * Runs every extraction against {@link #SAMPLE_HTML} and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String string = SAMPLE_HTML;

        // Image URLs found in the fragment.
        List<String> imgList = match(string, IMG_SRC_REGEX);
        System.out.println("解析html图片为:"+ imgList);

        // Video URLs found in the fragment.
        List<String> videoList = match(string, VIDEO_SRC_REGEX);
        System.out.println("解析html视频为:"+ videoList);

        // Text wrapped in span tags. The sample contains no span element,
        // so this list is empty for it.
        List<String> spanList = match(string, SPAN_REGEX);
        System.out.println("解析html文字为:"+ spanList);

        // Contents of the td and em tags, printed together on one line.
        List<String> tdList = match(string, TD_REGEX);
        List<String> emList = match(string, EM_REGEX);
        System.out.println("解析html里td标签为:" + tdList + "解析html里em标签为:" + emList);

    }

    /**
     * Collects one string per match of {@code reg} in {@code source}.
     *
     * <p>For each match the first capturing group is returned. If the pattern
     * declares no capturing group at all, the whole match is returned instead
     * of failing with an {@link IndexOutOfBoundsException}.
     *
     * @param source text to search
     * @param reg    regular expression; flags can be embedded, e.g. {@code (?s)}
     * @return the captured strings in order of appearance, empty if nothing
     *         matches; an entry is {@code null} when group 1 exists but did not
     *         take part in that match
     * @throws java.util.regex.PatternSyntaxException if {@code reg} is not a valid pattern
     * @throws NullPointerException if {@code source} or {@code reg} is {@code null}
     */
    public static List<String> match(String source,String reg){
        List<String> result = new ArrayList<>();
        Matcher m = Pattern.compile(reg).matcher(source);
        // group(1) only exists when the pattern declares a capturing group.
        int group = m.groupCount() > 0 ? 1 : 0;
        while (m.find()){
            result.add(m.group(group));
        }
        return result;
    }

}

执行返回结果为:

解析html图片为:[./data/files/store_2/goods_49/201711301644098301.jpg, ./data/files/store_2/goods_49/201711301644094874.jpg, ./data/files/store_2/goods_49/201711301644099889.jpg, ./data/files/store_2/goods_49/201711301644093468.jpg, ./data/files/store_2/goods_50/201711301644102839.jpg, ./data/files/store_2/goods_50/201711301644104379.jpg, ./data/files/store_2/goods_50/201711301644106072.jpg, ./data/files/store_2/goods_51/201711301644111540.jpg, ./data/files/store_2/goods_147/201808021039077422.png]
解析html视频为:[./data/files/video/20171130/1512044000315028.mp4]
解析html文字为:[]
解析html里td标签为:[<em style="font-style: normal;">品牌:</em> 巴巴腾, <em style="font-style: normal;">货号:A1</em>, <em style="font-style: normal;">颜色分类:白色</em>, <em style="font-style: normal;">材质:</em>PVC, , ]解析html里em标签为:[品牌:, 货号:A1, 颜色分类:白色, 材质:]

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值