####离线数据处理,Hive分析(保留汉字、字母、数字;去除以数字开头的关键词;过滤垃圾数据)
-- Count how often each cleaned search keyword occurs in commodlog.
--
-- Inner query: keeps rows whose eventname starts with '搜索关键词', takes the
-- second '_'-separated field of eventname, and deletes every character matched
-- by the regexp (the listed separators plus anything outside 0-9, a-z, A-Z and
-- the \u4e00-\u9fa5 range).
-- Outer query: discards keywords that start with a digit, are empty, or are the
-- literal text 'null', then counts the rows for each remaining keyword.
-- Rows whose eventname has no second field yield a NULL content and are dropped
-- by the WHERE predicates as well.
--
-- ORDER BY is used instead of SORT BY: SORT BY only orders rows within each
-- reducer, so with more than one reducer the output is not globally sorted.
-- NOTE(review): clusters that enforce strict ORDER BY checks may require a
-- LIMIT on this statement -- confirm against the target configuration.
SELECT
    content,
    COUNT(1) AS wordsnumber
FROM (
    SELECT
        regexp_replace(
            split(eventname, '_')[1],
            ' |,|:|/|-|\r|\n|[[^0-9a-zA-Z\\\u4e00-\\\u9fa5]]',
            ''
        ) AS content
    FROM commodlog
    WHERE eventname LIKE '搜索关键词%'
) AS keywords
WHERE content NOT REGEXP '^[0-9]'
  AND content != ''
  AND content != 'null'
GROUP BY content
ORDER BY wordsnumber ASC;