hive 自定义UDF函数解析HTML

4 篇文章 0 订阅
1 篇文章 0 订阅

解析HTML 函数实现方式:

      1,引入JSOUP包,解析HTML。

      2,引入JEXL包,实现自动化解析JSOUP参数。

      3,解析HTML,并将解析结果以字符串形式通过UDF函数返回。

      4,通过HIVE创建函数并测试使用。

JSOUP包

maven依赖如下:

    <dependency>
      <!-- jsoup HTML parser library @ https://jsoup.org/ -->
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.13.1</version>
    </dependency>

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

API文件如下:https://tool.oschina.net/apidocs/apidoc?api=jsoup-1.6.3

引入JEXL包,实现自动化解析JSOUP参数

maven依赖如下:

    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-jexl</artifactId>
      <version>2.1.1</version>
    </dependency>

jexl包:可在JVM运行时动态解析并执行以字符串形式给出的表达式(语法类似Java代码)。

解析HTML

import java.util.*;

import com.alibaba.fastjson.JSON;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.MapContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Jsoup解析html标签时类似于JQuery的一些符号
 *
 * @author chixh
 */
public class JsoupUtils {
    /**
     * JEXL engine reused by every {@link #convertToCode} call so that a new
     * engine is not constructed for each evaluated expression.
     */
    private static final JexlEngine JEXL_ENGINE = new JexlEngine();

    /**
     * Evaluates a JEXL expression against a set of named variables.
     *
     * @param jexlExp source text of the JEXL expression to execute
     * @param map     variables visible to the expression, keyed by variable name;
     *                {@code null} is treated as "no variables"
     * @return the value produced by the expression, or an empty string when the
     *         expression evaluates to {@code null}
     */
    public static Object convertToCode(String jexlExp, Map<String, Object> map) {
        Expression expression = JEXL_ENGINE.createExpression(jexlExp);

        // Copy the variables into a fresh context so the caller's map is only read.
        JexlContext context = new MapContext();
        if (map != null) {
            for (Map.Entry<String, Object> variable : map.entrySet()) {
                context.set(variable.getKey(), variable.getValue());
            }
        }

        // Evaluate exactly once: a second evaluation would repeat the work and
        // any side effects of the expression.
        Object result = expression.evaluate(context);
        return result == null ? "" : result;
    }

    /**
     * Parses an HTML document, selects a set of elements with a JEXL expression and
     * extracts one record per element, returning all records as a JSON array string.
     *
     * <p>{@code subString} is a {@code |}-separated list of rules, each of the form
     * {@code key:expression}. Only the first {@code :} separates the key from the
     * expression, so the expression itself may contain colons. Every expression is
     * evaluated with the variables {@code doc} (the parsed document) and
     * {@code element} (the element currently being processed). Rules without a
     * {@code :} or with an empty expression are skipped, and a rule whose evaluation
     * throws is left out of that element's record.
     *
     * @param htmlText       HTML source to parse
     * @param expressionText JEXL expression evaluated with {@code doc} in scope; it must
     *                       yield a jsoup {@code Elements} collection or a single {@code Element}
     * @param subString      extraction rules applied to every selected element
     * @return JSON array with one object per selected element, mapping rule keys to
     *         the string form of the evaluated expressions
     * @throws IllegalArgumentException if {@code expressionText} does not evaluate to
     *                                  {@code Elements} or {@code Element}
     */
    public static String getHtmlAttributeVaul(String htmlText, String expressionText, String subString) {
        Document doc = Jsoup.parse(htmlText);
        Map<String, Object> variables = new HashMap<>();
        variables.put("doc", doc);

        Object selected = convertToCode(expressionText, variables);
        Elements elements;
        if (selected instanceof Elements) {
            elements = (Elements) selected;
        } else if (selected instanceof Element) {
            // Allow expressions such as doc.select(...).first() that yield one element.
            elements = new Elements((Element) selected);
        } else {
            String actualType = selected == null ? "null" : selected.getClass().getName();
            throw new IllegalArgumentException(
                    "expressionText must evaluate to org.jsoup.select.Elements or org.jsoup.nodes.Element, but \""
                            + expressionText + "\" produced " + actualType);
        }

        // Parse the rules once up front instead of re-splitting them for every element.
        List<String[]> rules = new ArrayList<>();
        for (String rule : subString.split("\\|")) {
            // Split on the FIRST colon only; the expression part may contain colons
            // (for example CSS pseudo-selectors or URLs).
            int separator = rule.indexOf(':');
            if (separator < 0 || separator == rule.length() - 1) {
                continue; // not a usable "key:expression" rule
            }
            rules.add(new String[] {rule.substring(0, separator), rule.substring(separator + 1)});
        }

        List<Map<String, String>> records = new ArrayList<>(elements.size());
        for (Element element : elements) {
            variables.put("element", element);
            // LinkedHashMap keeps the keys in the order the rules were written.
            Map<String, String> record = new LinkedHashMap<>();
            for (String[] rule : rules) {
                try {
                    record.put(rule[0], convertToCode(rule[1], variables).toString());
                } catch (RuntimeException ignored) {
                    // Best effort by design: an element that lacks the piece of markup a
                    // rule expects simply gets no value for that key, and the remaining
                    // rules and elements are still processed.
                }
            }
            records.add(record);
        }

        return JSON.toJSONString(records);
    }

    /**
     *
     * @param args man
     */
    public static void main(String[] args) {
        String html = "<!DOCTYPE html><html><head>    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">    <meta name=\"viewport\" content=\"initial-scale=1,maximum-scale=1,user-scalable=no\">    <meta name=\"format-detection\" content=\"telephone=no\">    <title>微博搜索</title>    <link href=\"//img.t.sinajs.cn/t4/appstyle/searchpc/css/h5/css/global.css?version=202106231431\" rel=\"stylesheet\"/>    <link href=\"//img.t.sinajs.cn/t4/appstyle/searchpc/css/h5/css/module.css?version=202106231431\" rel=\"stylesheet\"/>    <link href=\"//img.t.sinajs.cn/t4/appstyle/searchpc/css/h5/css/page.css?version=202106231431\" rel=\"stylesheet\"/></head><body><div class=\"wrap hotrank\">    <a href=\"/\"><img src=\"//simg.s.weibo.com/20180201141807_searchtopimg750-350.jpg\" class=\"banner\" /></a>    <ul class=\"m-nav\">        <li><a href=\"/top/summary?cate=realtimehot\" class=\"cur\">热搜榜</a></li>        <li><a href=\"/top/summary?cate=topicband\" >话题榜</a></li>        <li><a href=\"/top/summary?cate=socialevent\" >新时代</a></li>        <li><a href=\"/top/summary?cate=localband\" >同城榜</a></li>        <li><a href=\"/top/summary?cate=total&key=friends\" >好友搜</a></li>    </ul>        <!--实时榜-->    <section class=\"list\">        <h1 class=\"title\">实时热点,每分钟更新一次</h1>        <ul class=\"list_a\">                        <li>                <a href=\"/weibo?q=%23%E6%88%91%E5%9B%BD%E6%95%B0%E5%AD%97%E7%BB%8F%E6%B5%8E%E8%A7%84%E6%A8%A1%E8%BE%BE5.4%E4%B8%87%E4%BA%BF%E7%BE%8E%E5%85%83%23&Refer=new_time\">                    <i class=\"icon icon_pinned\"></i>                    <span>我国数字经济规模达5.4万亿美元</span>                    <i class=\"icon icon_hot\"></i>                </a>            </li>                                                <li>                <a href=\"/weibo?q=%23EXO%E5%8F%82%E4%B8%8E%E5%BD%95%E5%88%B6%E7%9A%84%E5%9B%9B%E6%9C%9F%E5%BF%AB%E6%9C%AC%E8%A2%AB%E4%B8%8B%E6%9E%B6%23&Refer=top\">                                        <strong 
class=\"hot\">1</strong>                                        <span>EXO参与录制的四期快本被下架<em>2082358</em> </span>                                        <i class=\"icon icon_boil\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%AD%A6%E6%B1%89%E5%86%9B%E8%BF%90%E4%BC%9A%E5%BC%80%E5%B9%95%E5%BD%93%E5%A4%A9%E7%BE%8E%E6%9C%BA%E6%9E%84%E6%BC%94%E7%BB%83%E7%97%85%E6%AF%92%E6%9A%B4%E5%8F%91%23&Refer=top\">                                        <strong class=\"hot\">2</strong>                                        <span>武汉军运会开幕当天美机构演练病毒暴发<em>949798</em> </span>                                        <i class=\"icon icon_boil\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%88%98%E7%BF%94%E8%AF%B4%E5%A4%A7%E5%AE%B6%E6%B2%A1%E5%BF%85%E8%A6%81%E5%90%91%E4%BB%96%E9%81%93%E6%AD%89%23&Refer=top\">                                        <strong class=\"hot\">3</strong>                                        <span>刘翔说大家没必要向他道歉<em>839428</em> </span>                                        <i class=\"icon icon_boil\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E9%BB%84%E6%99%93%E6%98%8E%E5%9B%9E%E5%BA%94%E8%B9%AD%E8%A1%A8%E5%A6%B9%E9%99%88%E6%A2%A6%E7%83%AD%E5%BA%A6%23&Refer=top\">                                        <strong class=\"hot\">4</strong>                                        <span>黄晓明回应蹭表妹陈梦热度<em>812323</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a 
href=\"/weibo?q=%23%E4%B8%AD%E5%9B%BD%E9%80%89%E6%89%8B%E7%93%A6%E9%87%8C%E6%B1%97%E8%B5%9B%E9%87%8C%E5%85%8B%E5%8F%A4%E5%85%B8%E5%BC%8F%E6%91%94%E8%B7%A4%E6%91%98%E9%93%9C%23&Refer=top\">                                        <strong class=\"hot\">5</strong>                                        <span>中国选手瓦里汗赛里克古典式摔跤摘铜<em>720779</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E4%BA%8E%E9%80%94%E4%BA%BA%E8%AE%BE%23&Refer=top\">                                        <strong class=\"hot\">6</strong>                                        <span>于途人设<em>690732</em> </span>                                        <i class=\"icon icon_boil\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%88%91%E6%9D%A5%E5%9B%9E%E7%AD%94%E5%91%A8%E5%86%AC%E9%9B%A8%23&topic_ad=1&Refer=top\">                                        <strong class=\"hot\"  style=\"font-style: normal;font-size: 16px\">•</strong>                                        <span>我来回答周冬雨<em></em> </span>                                                                        <i class=\"icon icon_recommend\"></i>                                                            </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%A5%88%E9%9B%AA%E7%9A%84%E8%8C%B6%E9%80%9A%E6%8A%A5%E5%90%84%E5%9C%B0%E7%9B%91%E7%AE%A1%E7%AA%81%E6%9F%A5%E7%BB%93%E6%9E%9C%23&Refer=top\">                                        <strong class=\"hot\">7</strong>                                        <span>奈雪的茶通报各地监管突查结果<em>624149</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                     
           <li>                <a href=\"/weibo?q=%23%E4%B8%A5%E6%A0%BC%E6%8E%A7%E5%88%B6%E5%81%B6%E5%83%8F%E5%85%BB%E6%88%90%E7%B1%BB%E8%8A%82%E7%9B%AE%23&Refer=top\">                                        <strong class=\"hot\">8</strong>                                        <span>严格控制偶像养成类节目<em>560656</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E7%8E%8B%E7%8F%9E%E4%B8%B9%20%E5%B2%81%E6%9C%88%E9%9D%A2%E5%89%8D%E6%88%91%E4%BB%AC%E9%83%BD%E4%B8%8D%E6%98%AF%E5%AF%B9%E6%89%8B&Refer=top\">                                        <strong class=\"hot\">9</strong>                                        <span>王珞丹 岁月面前我们都不是对手<em>429763</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%8E%8B%E6%98%A5%E9%9B%A8%E5%A5%B3%E5%AD%90800%E7%B1%B3%E7%AC%AC5%23&Refer=top\">                                        <strong class=\"hot\">10</strong>                                        <span>王春雨女子800米第5<em>420497</em> </span>                                        <i class=\"icon icon_boil\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%8E%8B%E5%B3%A5%E5%A5%B3%E5%AD%90%E9%93%BE%E7%90%83%E9%93%B6%E7%89%8C%23&Refer=top\">                                        <strong class=\"hot\">11</strong>                                        <span>王峥女子链球银牌<em>356985</em> <img src=\"//img.t.sinajs.cn/t4/appstyle/expression/ext/normal/a6/2021_silvermedal_org.png\" title=\"[银牌]\" alt=\"[银牌]\" class=\"face\" /></span>                                    </a>            </li>                                                <li>         
       <a href=\"/weibo?q=%23%E6%89%AC%E5%B7%9E%E5%85%AC%E5%B8%831%E5%8F%B7%E7%97%85%E4%BE%8B%E6%AF%9B%E6%9F%90%E6%B5%81%E8%B0%83%E6%83%85%E5%86%B5%23&Refer=top\">                                        <strong class=\"hot\">12</strong>                                        <span>扬州公布1号病例毛某流调情况<em>326071</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%B0%A2%E9%9C%87%E4%B8%9A20%E7%A7%9245%23&Refer=top\">                                        <strong class=\"hot\">13</strong>                                        <span>谢震业20秒45<em>298751</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E9%9C%8D%E5%90%AF%E5%88%9A%20%E7%9C%8B%E8%BF%99%E6%98%AF%E6%88%91%E8%80%81%E5%A9%86&Refer=top\">                                        <strong class=\"hot\">14</strong>                                        <span>霍启刚 看这是我老婆<em>296698</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%94%B7%E5%AD%90%E5%9B%9E%E5%BA%94%E9%83%91%E5%B7%9E%E6%8A%97%E7%81%BE%E5%90%8E%E8%BF%94%E7%B2%A4%E8%A2%AB%E9%AA%82%23&Refer=top\">                                        <strong class=\"hot\">15</strong>                                        <span>男子回应郑州抗灾后返粤被骂<em>295081</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%92%92%E8%B4%9D%E5%AE%81%E8%B7%9F%E5%A8%83%E8%AF%B4%E6%88%91%E6%B0%B8%E8%BF%9C%E6%98%AF%E4%BD%A0%E4%BB%AC%E7%9A%84%E5%90%8C%E9%BE%84%E4%BA%BA%23&Refer=top\">                                        <strong 
class=\"hot\">16</strong>                                        <span>撒贝宁跟娃说我永远是你们的同龄人<em>292689</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%B2%99%E6%BA%A2xxj%E6%81%8B%E7%88%B1%E8%A1%8C%E4%B8%BA%23&Refer=top\">                                        <strong class=\"hot\">17</strong>                                        <span>沙溢xxj恋爱行为<em>289729</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%8B%8F%E7%82%B3%E6%B7%BB%E6%AF%8F%E6%99%9A%E5%8D%81%E7%82%B9%E5%87%86%E6%97%B6%E5%85%B3%E6%89%8B%E6%9C%BA%E7%9D%A1%E8%A7%89%23&Refer=top\">                                        <strong class=\"hot\">18</strong>                                        <span>苏炳添每晚十点准时关手机睡觉<em>288669</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E4%B9%94%E6%99%B6%E6%99%B6%E8%BF%87%E7%94%9F%E6%97%A5%E6%B2%A1%E6%9C%89%E7%A4%BC%E7%89%A9%23&Refer=top\">                                        <strong class=\"hot\">19</strong>                                        <span>乔晶晶过生日没有礼物<em>285971</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%88%98%E6%B4%8B%E5%92%8B%E7%9C%8B%E7%9D%80%E6%9C%89%E7%82%B9%E7%9C%BC%E7%86%9F%23&Refer=top\">                                        <strong class=\"hot\">20</strong>                                        <span>刘洋咋看着有点眼熟<em>284135</em> </span>                                    </a>            </li>    
                                            <li>                <a href=\"/weibo?q=%23%E6%96%B0%E5%8F%91%E7%8E%B0%E4%B8%AD%E8%8D%AF%E8%BF%9E%E8%8A%B1%E6%B8%85%E7%98%9F%E5%AF%B9%E5%BE%B7%E5%B0%94%E5%A1%94%E7%97%85%E6%AF%92%E6%9C%89%E6%95%88%23&Refer=top\">                                        <strong class=\"hot\">21</strong>                                        <span>新发现中药连花清瘟对德尔塔病毒有效<em>280169</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E4%BC%8A%E8%97%A4%E7%BE%8E%E8%AF%9A&Refer=top\">                                        <strong class=\"hot\">22</strong>                                        <span>伊藤美诚<em>278819</em> </span>                                        <i class=\"icon icon_hot\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4%E5%93%AD%E5%BE%97%E8%AE%A9%E4%BA%BA%E5%BF%83%E7%96%BC%23&Refer=top\">                                        <strong class=\"hot\">23</strong>                                        <span>迪丽热巴哭得让人心疼<em>274190</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E8%B0%A2%E9%9C%86%E9%94%8B%20%E5%90%AC%E8%AF%B4%E4%BD%A0%E4%BB%AC%E8%A6%81%E7%82%B8%E6%88%91%E5%8E%A8%E6%88%BF&Refer=top\">                                        <strong class=\"hot\">24</strong>                                        <span>谢霆锋 听说你们要炸我厨房<em>273540</em> </span>                                    </a>            </li>                                                <li>                <a 
href=\"/weibo?q=%23%E4%B8%AD%E5%9B%BD%E8%BF%90%E5%8A%A8%E5%91%98%E9%98%B2%E7%96%AB%E6%84%8F%E8%AF%86%E6%9C%89%E5%A4%9A%E5%BC%BA%23&Refer=top\">                                        <strong class=\"hot\">25</strong>                                        <span>中国运动员防疫意识有多强<em>270812</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%B2%89%E7%BA%A2%E5%B0%8F%E7%8C%AA%23&Refer=top\">                                        <strong class=\"hot\">26</strong>                                        <span>粉红小猪<em>265783</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%B7%B3%E6%B0%B4%E8%BF%90%E5%8A%A8%E5%91%98%E6%94%BE%E6%89%8B%E6%9C%BA%E7%9A%84%E6%96%B9%E5%BC%8F%23&Refer=top\">                                        <strong class=\"hot\">27</strong>                                        <span>跳水运动员放手机的方式<em>265416</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%89%BE%E7%A6%8F%E6%9D%B0%E5%B0%BC%E6%81%8B%E6%83%85%23&Refer=top\">                                        <strong class=\"hot\">28</strong>                                        <span>艾福杰尼恋情<em>262920</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%BF%99%E5%B1%8A%E5%A5%A5%E8%BF%90%E4%BC%9A%E4%B8%8E%E5%BE%80%E5%B1%8A%E7%9A%84%E4%B8%8D%E5%90%8C%E4%B9%8B%E5%A4%84%23&Refer=top\">                                        <strong class=\"hot\">29</strong>                                        <span>这届奥运会与往届的不同之处<em>258551</em> </span>                                    </a>            </li>                                                <li>                <a 
href=\"/weibo?q=%23%E4%B9%94%E5%A4%95%E8%BE%B0%E7%AE%80%E4%BA%A6%E7%B9%81%E7%BB%88%E4%BA%8E%E4%BA%B2%E4%BA%86%23&Refer=top\">                                        <strong class=\"hot\">30</strong>                                        <span>乔夕辰简亦繁终于亲了<em>256435</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E5%B0%8FS%20%E6%80%81%E5%BA%A6%E5%8F%8A%E8%A8%80%E8%AE%BA%E8%A7%86%E9%A2%91%E5%90%88%E9%9B%86&Refer=top\">                                        <strong class=\"hot\">31</strong>                                        <span>小S 态度及言论视频合集<em>256414</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%90%B4%E6%98%A0%E6%B4%81ins%23&Refer=top\">                                        <strong class=\"hot\">32</strong>                                        <span>吴映洁ins<em>255180</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%88%98%E6%99%93%E8%89%B3%E5%BE%AE%E5%8D%9A%E8%A2%AB%E5%B0%81%23&Refer=top\">                                        <strong class=\"hot\">33</strong>                                        <span>刘晓艳微博被封<em>254810</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%89%AC%E5%B7%9E%E6%8B%9B300%E5%90%8D%E5%BF%97%E6%84%BF%E8%80%85%E6%8A%A5%E5%90%8D%E7%94%B5%E8%AF%9D%E8%A2%AB%E6%89%93%E7%88%86%23&Refer=top\">                                        <strong class=\"hot\">34</strong>                                        <span>扬州招300名志愿者报名电话被打爆<em>253337</em> </span>                                        <i class=\"icon icon_new\"></i>         
                           </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%AE%A1%E6%99%A8%E8%BE%B0%E8%86%9D%E7%9B%96%E4%B8%8A%E5%86%99%E4%B8%AD%E5%9B%BD%E5%8A%A0%E6%B2%B9%23&Refer=top\">                                        <strong class=\"hot\">35</strong>                                        <span>管晨辰膝盖上写中国加油<em>218553</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4%E5%91%98%E5%B7%A5%E8%A7%84%E6%A8%A1%E8%B6%8525%E4%B8%87%E4%BA%BA%23&Refer=top\">                                        <strong class=\"hot\">36</strong>                                        <span>阿里巴巴员工规模超25万人<em>212898</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%B0%91%E8%88%AA%E5%B1%80%E6%98%8E%E7%A1%AE%E8%BF%91%E6%9C%9F%E5%9B%BD%E5%86%85%E6%9C%BA%E7%A5%A8%E5%85%8D%E8%B4%B9%E9%80%80%E7%A5%A8%E8%A6%81%E6%B1%82%23&Refer=top\">                                        <strong class=\"hot\">37</strong>                                        <span>民航局明确近期国内机票免费退票要求<em>206818</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%AE%A1%E6%99%A8%E8%BE%B0%E5%A5%B3%E5%AD%90%E5%B9%B3%E8%A1%A1%E6%9C%A8%E9%87%91%E7%89%8C%23&Refer=top\">                                        <strong class=\"hot\">38</strong>                                        <span>管晨辰女子平衡木金牌<em>195993</em> <img src=\"//img.t.sinajs.cn/t4/appstyle/expression/ext/normal/6d/2021_goldmedal_org.png\" title=\"[金牌]\" alt=\"[金牌]\" class=\"face\" /></span>                                    </a>            </li>                                                <li>                <a 
href=\"/weibo?q=%23%E7%BE%8E%E5%9B%BD%E5%A5%96%E7%89%8C%E6%A6%9C%E6%8E%92%E5%90%8D%23&Refer=top\">                                        <strong class=\"hot\">39</strong>                                        <span>美国奖牌榜排名<em>187002</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%B4%BE%E4%B8%80%E5%87%A1%E4%B8%80%E5%AE%B6%E6%98%AF%E5%9C%A8%E8%AF%B4%E8%84%B1%E5%8F%A3%E7%A7%80%E5%90%97%23&Refer=top\">                                        <strong class=\"hot\">40</strong>                                        <span>贾一凡一家是在说脱口秀吗<em>182132</em> </span>                                        <i class=\"icon icon_new\"></i>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E7%A7%A6%E7%89%9B%E6%AD%A3%E5%A8%81%E7%94%A8%E8%A1%A8%E6%83%85%E5%8C%85%E5%9B%9E%E6%80%BC%E6%81%B6%E8%AF%84%23&Refer=top\">                                        <strong class=\"hot\">41</strong>                                        <span>秦牛正威用表情包回怼恶评<em>181888</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%8A%B1%E5%84%BF%E8%B6%85%E7%BA%A7%E4%B9%96%E7%A7%B0%E4%B8%8E%E9%BB%84%E4%BF%8A%E6%8D%B7%E6%98%AF%E6%83%85%E4%BE%A3%23&Refer=top\">                                        <strong class=\"hot\">42</strong>                                        <span>花儿超级乖称与黄俊捷是情侣<em>181097</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E8%A5%BF%E5%AE%89%E6%9A%B4%E9%9B%A8%23&Refer=top\">                                        <strong class=\"hot\">43</strong>                                        <span>西安暴雨<em>176920</em> </span>                                    </a>            
</li>                                                <li>                <a href=\"/weibo?q=%23%E6%BE%B3%E9%97%A8%E5%AE%A3%E5%B8%83%E8%BF%9B%E5%85%A5%E5%8D%B3%E6%97%B6%E9%A2%84%E9%98%B2%E7%8A%B6%E6%80%81%23&Refer=top\">                                        <strong class=\"hot\">44</strong>                                        <span>澳门宣布进入即时预防状态<em>163698</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E4%BD%A0%E6%98%AF%E6%88%91%E7%9A%84%E8%8D%A3%E8%80%80%E9%A2%84%E5%91%8A%23&Refer=top\">                                        <strong class=\"hot\">45</strong>                                        <span>你是我的荣耀预告<em>156782</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%9C%B1%E4%B8%80%E9%BE%99%E6%B2%A1%E6%BC%94%E6%88%90%E9%87%91%E6%99%A8%E5%8F%94%E5%8F%94%23&Refer=top\">                                        <strong class=\"hot\">46</strong>                                        <span>朱一龙没演成金晨叔叔<em>155318</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%85%B3%E6%99%93%E5%BD%A4%E5%B8%8C%E6%9C%9B%E5%92%8C%E9%82%B9%E6%95%AC%E5%9B%AD%E6%9D%A5%E4%B8%AA%E7%9C%9F%E5%90%88%E5%BD%B1%23&Refer=top\">                                        <strong class=\"hot\">47</strong>                                        <span>关晓彤希望和邹敬园来个真合影<em>146907</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E6%97%A5%E6%9C%AC%E5%A5%B3%E4%B9%92%E6%99%8B%E7%BA%A7%E5%9B%A2%E4%BD%93%E5%86%B3%E8%B5%9B%23&Refer=top\">                                        <strong class=\"hot\">48</strong>                                        
<span>日本女乒晋级团体决赛<em>124538</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%E4%B8%8A%E6%B8%B8&Refer=top\">                                        <strong class=\"hot\">49</strong>                                        <span>上游<em>124178</em> </span>                                    </a>            </li>                                                <li>                <a href=\"/weibo?q=%23%E5%BC%A0%E6%96%87%E5%AE%8F%E8%AF%B4%E7%99%BE%E5%88%86%E7%99%BE%E9%A2%84%E9%98%B2%E6%84%9F%E6%9F%93%E7%9A%84%E7%96%AB%E8%8B%97%E6%98%AF%E4%B8%8D%E5%AD%98%E5%9C%A8%E7%9A%84%23&Refer=top\">                                        <strong class=\"hot\">50</strong>                                        <span>张文宏说百分百预防感染的疫苗是不存在的<em>121983</em> </span>                                    </a>            </li>                                </ul>    </section>    <!--/实时榜-->                            <!--<i class=\"icon icon_top\"></i>--></div></body></html>";
        String expression = "doc.select(\"ul.list_a li\");";
           String rule = getHtmlAttributeVaul(html, expression,
                 "ranktop:element.select(\"strong.hot \").first().ownText()|" +
                         "list_litle:element.select(\"span\").first().ownText()|" +
                         "list_num:element.select(\"span em\").first().ownText()|" +
                         "table_flage:element.select(\"i\").attr(\"class\")|" +
                         "list_addr:element.attr(\"href\")" );
        System.out.println(rule);

    }

}

UDF代码如下


import com.xxx.platback.utils.JsoupUtils;
import org.apache.hadoop.hive.ql.exec.UDF;


/**
 * Hive UDF that extracts data from an HTML string by delegating to
 * {@code JsoupUtils.getHtmlAttributeVaul} and returns the result as a JSON string.
 */
public class JsoupParseHtml extends UDF {
    /**
     * Extracts one JSON record per selected HTML element.
     *
     * @param inParm       HTML text to parse
     * @param inParmOfRule JEXL expression selecting the elements to process,
     *                     e.g. {@code doc.select("ul.list_a li")}
     * @param subString    {@code |}-separated {@code key:expression} rules evaluated
     *                     for every selected element
     * @return the JSON array produced by the parser; {@code null} when
     *         {@code inParm} is {@code null}; the literal {@code "解析错误"} when
     *         parsing or rule evaluation fails
     */
    public String evaluate(String inParm, String inParmOfRule, String subString) {
        // A NULL column value is not a parse failure: propagate NULL instead of
        // reporting an error and dumping a stack trace for every such row.
        if (inParm == null) {
            return null;
        }
        try {
            return JsoupUtils.getHtmlAttributeVaul(inParm, inParmOfRule, subString);
        } catch (Exception ex) {
            // Boundary catch: one bad row must not fail the whole query.
            ex.printStackTrace();
            return "解析错误";
        }
    }
}

以上代码得到HTML解析后的JSON数据。通过JSON解析即可获得数据。

UDF函数创建

   1,maven打JAR包。【JsoupParseHtml-1.0-SNAPSHOT.jar】

   2,JAR包从通道机上传至HADOOP集群机器

   3,JAR包从HADOOP机器,上传至HDFS路径。

   4,创建函数   create function jsoupParseHtml AS 'com.xxx.platback.appointment.JsoupParseHtml' using jar 'hdfs路径' ;

   5,按照使用说明测试并输出结果。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值