spark sql + 算法批量预测

风路丞

已于 2022-11-15 10:52:30 修改

阅读量943

点赞数

分类专栏：大数据组件hadoop、flink等学习　文章标签：spark sql 大数据

于 2022-11-15 10:18:33 首次发布

本文链接：https://blog.csdn.net/qq_19072921/article/details/127860851

版权

大数据组件hadoop、flink等学习专栏收录该内容

8 篇文章 0 订阅

订阅专栏

数据集的获取测试结果可看：hive、spark 窗口滑动获取数据集_风路丞的博客-CSDN博客_hive sql 滑动窗口

1、近7天的数据集

思想：自身和自身做left join, 并通过on 的条件限制，把每个id和其近7天内的id关联起来。

-- 1、建表 （15200条）
	-- Work inside the `cf` database and rebuild the raw event table from scratch.
	USE cf;
	DROP TABLE IF EXISTS taizhou;

	-- Every column is declared as STRING; the source is a tab-delimited text file.
	CREATE TABLE taizhou (
		id         STRING,
		content    STRING,
		ends       STRING,
		one        STRING,
		two        STRING,
		createdate STRING,
		level      STRING
	)
	COMMENT '台州'
	ROW FORMAT DELIMITED
		FIELDS TERMINATED BY '\t';

	-- Load the text file from HDFS into the freshly created table.
	LOAD DATA INPATH 'hdfs://tqHadoopCluster/cf/taiz.txt'
		INTO TABLE taizhou;


-- 2、给每周数据做标记
-- 2.1 找出每条数据对应的近7天数据,且当天事件发生时间 不晚于 当前事件时间
	-- Rebuild the table that pairs each event (t1) with the *other* events (t2)
	-- that happened within the 7 calendar days up to and including its own day,
	-- and not later than the event itself.
	DROP TABLE IF EXISTS taizhou_zhou;
	CREATE TABLE taizhou_zhou AS
	WITH temp AS (
		-- Re-format createdate from 'yyyy/MM/dd HH:mm' to 'yyyy-MM-dd HH:mm'.
		-- Only the columns used below are projected (the original used SELECT *).
		SELECT
			id,
			content,
			from_unixtime(unix_timestamp(createdate, 'yyyy/MM/dd HH:mm'), 'yyyy-MM-dd HH:mm') AS time
		FROM taizhou
	)
	-- Fix: the hint was misspelled "repartiton", so it was not recognized as a
	-- partitioning hint and the requested 40 partitions were never applied.
	SELECT /*+ repartition(40) */
		t1.id      AS id,
		t1.time    AS time,
		t1.content,
		t2.id      AS done_id,
		t2.time    AS done_time,
		t2.content AS done_content
	FROM temp t1
	-- LEFT JOIN keeps events that have no earlier neighbour; their done_* columns are NULL.
	LEFT JOIN temp t2
		ON  t1.id != t2.id                              -- never pair an event with itself
		AND datediff(t1.time, t2.time) BETWEEN 0 AND 6  -- same day up to 6 days earlier
		AND t1.time >= t2.time;                         -- t2 must not be later than t1

	-- select count(1) from (select id, time,count(1) as c from taizhou_zhou group by id,time) a where a.c=1;


-- 3、分割最近一周的数据(Time taken: 485.678 seconds)
-- 3.1 算法模型预测重复和相似，并获取重复、相似的“id:相似度”列表
	-- 3.1.1 获取每条数据的比较对象
		-- 【思路】比较对象以map<string,string>类型存储在hive表，其中key-事件唯一标识，value-事件内容
		-- 【实现】str_to_map函数、collect_list函数、窗口函数
			-- 【注意点】str_to_map函数是根据英文逗号来识别一个key-value对的，所以要先处理掉文本中可能存在的英文逗号，否则会出现null。
			-- 【函数】collect_list(xx) 把xx组合成list。

			-- Collapse the pair table into one row per event, carrying a map of its
			-- comparison candidates: key = candidate id, value = candidate text.
			DROP TABLE IF EXISTS taizhou_all_map_3d;
			CREATE TABLE taizhou_all_map_3d AS
			SELECT /*+ repartition(240) */
				id,
				content,
				time,
				str_to_map(
					concat_ws(
						",",
						collect_list(
							-- One "id:text" entry per candidate. ASCII commas are stripped
							-- from the text because str_to_map splits entries on commas.
							concat_ws(':', CAST(done_id AS STRING), regexp_replace(done_content, ',', ''))
						)
					)
				) AS contentmap
			FROM taizhou_zhou
			GROUP BY
				id,
				content,
				time;


-------------------------------------以下是分组模型预测-----------------------------------
	
-- 3.1.2 算法判断是否重复/相似，并输出重复/相似列表
		-- step1、【自定义的udtf函数说明】函数中定义了异常值返回2，且java 类中用hashmap接收hive的map<String,String>类型数据
		-- 因为是hashmap非线程安全，并行计算会报java.util.HashMap$Node cannot be cast to java.util.HashMap$TreeNode
			USE cf;

			-- Disabling vectorized query execution (these two settings are left commented out here):
			-- set hive.vectorized.execution.enabled=false;
			-- set hive.vectorized.execution.reduce.enabled=false;

			-- Register the custom UDTF shipped in the jar below.
			ADD JAR hdfs://tqHadoopCluster/cf/validate-map-result20220920-0.0.1-bin.jar;
			CREATE TEMPORARY FUNCTION checkudf_hashmap AS 'com.tianque.IssueCheckHashMapUDTF';

			-- Run the UDTF once per event against its candidate map and store the outcome.
			-- The LATERAL VIEW has no AS clause, so check_result / check_ids are expected to be
			-- the output field names declared by the UDTF itself.
			-- NOTE(review): 0.95 is presumably the similarity threshold — confirm against the UDTF.
			DROP TABLE IF EXISTS taizhou_all_check_hashmap_3d;
			CREATE TABLE taizhou_all_check_hashmap_3d AS
			SELECT
				a.id            AS id,
				a.content       AS content,
				a.time          AS time,
				t1.check_result AS issame,
				t1.check_ids    AS sameids
			FROM taizhou_all_map_3d a
			LATERAL VIEW checkudf_hashmap(content, contentmap, 0.95) t1;

		-- 启动
			nohup spark-sql --master yarn --num-executors 4 --executor-memory 2G --executor-cores 2 --driver-memory 4G -f check.hql &


	-- 【测试】结果检查
	-- select issame, count(1) from taizhou_all_check_hashmap_3d group by issame;

	
-- 4 导出数据
	spark-sql -e 'select * from cf.taizhou_model' | sed 's/,/，/g' > /home/admin/cf_temp/taizhou/taizhou_model.csv

2、指定分组数量的数据集：ntile(group num)

由于指定的参数是group num，所以每个group里的数据量需要自己提前估算下，避免因每个group里数量太多导致spark内存溢出。每个group的数据量=该表总数据量 / group-num。

-- 1、建表
-- Rebuild the table holding the texts to be scored by the model.
USE cf;
DROP TABLE IF EXISTS train;

CREATE TABLE train (
	id      STRING,
	content STRING
)
COMMENT '模型预测文本'
ROW FORMAT DELIMITED
	FIELDS TERMINATED BY '\t';

-- Load the tab-delimited text file from HDFS into the table.
LOAD DATA INPATH 'hdfs://tqHadoopCluster/cf/train.txt'
	INTO TABLE train;


-- 2、分组: 使用ntile(xx)函数分组，这里指定分成30组
-- Assign every row to one of 30 buckets, numbered in id order.
-- id is declared STRING in `train`, so the ordering is a string ordering.
DROP TABLE IF EXISTS test_train_group;
CREATE TABLE test_train_group AS
SELECT
	id,
	content,
	ntile(30) OVER (ORDER BY id) AS group_num
FROM train;



--3、构建map
-- Produce one row per bucket holding a map of id -> text for every member of the bucket.
DROP TABLE IF EXISTS test_train_group_map;
CREATE TABLE test_train_group_map AS
SELECT /*+ repartition(30) */
	group_num,
	str_to_map(
		concat_ws(
			',',
			collect_set(
				-- One "id:text" entry per row; ASCII commas are removed from the text
				-- so that they are not mistaken for entry separators.
				concat_ws(':', CAST(a.id AS STRING), regexp_replace(a.content, ',', ''))
			)
		)
	) AS content_map
FROM test_train_group a
GROUP BY group_num;


-------------------------------------以下是分组模型预测-----------------------------------
--4、分组预测
spark-sql --master yarn --num-executors 15 --executor-memory 4G --executor-cores 2 --driver-memory 5G

USE cf;

-- Register the batch-prediction UDF shipped in the jar below.
ADD JAR hdfs://tqHadoopCluster/cf/model-batch-eval-0.0.1-bin.jar;
CREATE TEMPORARY FUNCTION batchmap AS 'com.tianque.udf.model.BatchMapUdf';

-- Call the UDF once per bucket, passing that bucket's whole id -> text map.
DROP TABLE IF EXISTS test_train_group_result;
CREATE TABLE test_train_group_result AS
SELECT /*+ repartition(30) */
	group_num,
	batchmap(content_map) AS result_map
FROM test_train_group_map;


-- 5、列转行
-- Flatten each bucket's result map into one row per (id, result) entry.
DROP TABLE IF EXISTS test_train_group_model_result;
CREATE TABLE test_train_group_model_result AS
SELECT
	b.group_num,
	t1.id,
	t1.result
FROM test_train_group_result b
LATERAL VIEW explode(result_map) t1 AS id, result;

-- select count(1),result from test_train_group_model_result group by result;


-- 6、关联模型结果和样本
-- Attach the original sample text to every model prediction.
-- Fix: the samples of this section live in `train` (id, content). The original joined
-- against `test`, whose definition in this file has no `content` column, so b.content
-- could not be resolved.
DROP TABLE IF EXISTS test_train_result;
CREATE TABLE test_train_result AS
SELECT
	a.id,
	b.content,
	a.result
FROM test_train_group_model_result a
INNER JOIN train b
	ON a.id = b.id;

-- 7、输出表
spark-sql -e 'select * from cf.test_train_result' | sed 's/,/，/g' > /home/admin/cf_temp/train/test_train_result.csv

3、指定行范围

涉及：时间窗口函数over(partition by ... order by ... rows between .. and ..)

-- 1、excel 数据处理
-- 把\t（制表符）替换成空格
-- sed -i "s/\t/ /g" test.xlsx
	step1：把excel中的换行符替换成空格(excel表格ctrl+F  --> 在弹出框的右下角“特殊内容”中选“换行符”即可。或者直接输入excel中的换行符标识"^l")
	step2：把替换后的excel另存为txt
	step3：打开txt,再次另存为txt,但另存时的编码选择utf8(无BOM头)
	step4：把utf 无BOM头的txt文件上传到大数据集群


-- 2、hive建表
USE cf;
		-- Fix: drop any previous copy first, like every other table in this script,
		-- so the script can be re-run without failing on "table already exists".
		DROP TABLE IF EXISTS test;

		-- Raw event table; every column is declared as STRING and the source is tab-delimited.
		CREATE TABLE test (
			id           STRING,
			serialnumber STRING,
			subject      STRING,
			issuecontent STRING,
			createorg    STRING,
			createdate   STRING
		)
		COMMENT '邯郸'
		ROW FORMAT DELIMITED
			FIELDS TERMINATED BY '\t';

		-- Load the text file from HDFS into the table.
		LOAD DATA INPATH 'hdfs://tqHadoopCluster/txt/test.txt'
			INTO TABLE test;


-- 3、算法模型预测重复和相似，并获取重复、相似的“id:相似度”列表
	-- 3.1 获取每条数据的比较对象
		-- 【重点】比较对象限定在：同一发生区域
		-- 【思路】比较对象以map<string,string>类型存储在hive表，其中key-事件唯一标识，value-事件内容
		-- 【实现】str_to_map函数、collect_list函数、窗口函数
			-- 【注意点】str_to_map函数是根据英文逗号来识别一个key-value对的，所以要先处理掉文本中可能存在的英文逗号，否则会出现null。
			-- 【函数】collect_list(xx) 把xx组合成list。如果后接时间窗口函数over(partition by ... order by ... rows between .. and ..)，则组合的xx来源时间窗口内的数据；否则，xx取全表数据。

			-- For every event, build the map of comparison candidates (id -> text) drawn
			-- from the same `place`.
			-- NOTE(review): `place` is not among the columns of `test` as declared in this file
			-- (id, serialnumber, subject, issuecontent, createorg, createdate) — confirm where
			-- this column is supposed to come from.
			DROP TABLE IF EXISTS test_map_3d;
			CREATE TABLE test_map_3d AS
			SELECT
				a.id,
				a.issuecontent,
				a.place,
				str_to_map(
					concat_ws(
						",",
						collect_list(
							-- ASCII commas and colons in the text are replaced with a full-width
							-- comma so they cannot be read as map delimiters.
							concat_ws(':', CAST(id AS STRING), regexp_replace(issuecontent, ',|:', '，'))
						) OVER (
							-- The frame holds only the rows that follow the current one
							-- (in id order) within the same place.
							PARTITION BY a.place
							ORDER BY a.id
							ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING
						)
					)
				) AS contentmap
			FROM test a;


	-- 3.2 算法判断是否重复/相似，并输出重复/相似列表
		-- 【自定义的udtf函数说明】函数中定义了异常值返回2，且java 类中用LazyMap接收hive的map<String,String>类型数据
		-- step1、编写sql，内容如下：
			USE cf;

			-- Turn off vectorized query execution for this session.
			SET hive.vectorized.execution.enabled=false;
			SET hive.vectorized.execution.reduce.enabled=false;

			-- Register the custom UDTF shipped in the jar below.
			ADD JAR hdfs://tqHadoopCluster/cf/validate-lazymap-result-0.0.1-bin.jar;
			CREATE TEMPORARY FUNCTION checkudf_lazymap AS 'com.tianque.IssueCheckLazyMapUDTF';

			-- Run the UDTF twice per event, once with 0.80 and once with 0.60, and keep
			-- both outcomes side by side (issame/sameids and issimilarity/similarityids).
			-- Neither LATERAL VIEW has an AS clause, so check_result / check_ids are expected
			-- to be the output field names declared by the UDTF itself.
			-- NOTE(review): the numeric argument is presumably a similarity threshold — confirm.
			DROP TABLE IF EXISTS test_check_lazymap_3d;
			CREATE TABLE test_check_lazymap_3d AS
			SELECT
				a.id            AS id,
				a.issuecontent  AS issuecontent,
				a.place         AS place,
				t1.check_result AS issame,
				t1.check_ids    AS sameids,
				t2.check_result AS issimilarity,
				t2.check_ids    AS similarityids
			FROM test_map_3d a
			LATERAL VIEW checkudf_lazymap(issuecontent, contentmap, 0.80) t1
			LATERAL VIEW checkudf_lazymap(issuecontent, contentmap, 0.60) t2;

		-- step2、hive后台执行命令
			[admin@hadoop1 cf_temp]$ ll
			-rw-rw-r--. 1 admin admin       626 5月  19 14:11 check.sql
			[admin@hadoop1 cf_temp]$ nohup hive -f check.sql &