1.
hdfs dfs -mkdir -p /app/data/exam
hdfs dfs -put /root/data/answer_question.log /app/data/exam
2.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("exam").setMaster("local[*]")
val sc = new SparkContext(conf)
val rdd = sc.textFile("hdfs://192.168.56.100:9000/app/data/exam/answer_question.log")
rdd.map(line => {
var datas = line.split(" ")
var arr = datas(9).split("_")
(arr(1),arr(2),arr(3).replace("r",""),data(10).split(",")(0)).productIterator.mkString("\t")
}).saveAsTextFile("hdfs://192168.56.100/app/data/result")
}
3.
create_namespace 'exam'
create 'exam:analysis','accuracy','question' # 两个列簇accuracy、question
4.
drop database ex_exam cascade; # 删表加删库,谨慎
create database exam;
use exam;
create external table ex_exam_record (
topic_id string,
student_id string,
question_id string,
score float
)
row format delimited fields terminated by '\t'
location 'hdfs://192.168.56.100:9000/app/data/result';
create external table ex_exam_analysis (
student_id string,
total_score string,
question_count int,
accuracy float
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties("hbase.columns.mapping"=":key,accuracy:total_score,accuracy:question_count,accuracy:accuracy")
tblproperties("hbase.table.name"="exam:analysis");
create external table ex_exam_question (
student_id string,
right string,
half string,
error float
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties("hbase.columns.mapping"=":key,question:right,question:half,question:error")
tblproperties("hbase.table.name"="exam:analysis");
5.
insert into ex_exam_analysis
select student_id,
sum(score) as total_score,
count(topic_id) as question_count,
sum(score)/count(topic_id) as accuracy
from ex_exam_record er
group by student_id;
6.
with
t1 as (select er.student_id,er.score,concat_wa(',',collect_set(question_id)) as question from ex_exam_record er group by er.student_id,er.score)
insert into table ex_exam_question
select student_id,
max(case when score=1.0 then question else '' end) as right,
max(case when score=0.5 then question else '' end) as half,
max(case when score=0 then question else '' end) as error
from t1
group by student_id
scan 'exam:analysis',{COLUMN=>'question'}