目录
4.启动Hive的Remote metastore server - hive远程元数据服务
业务背景
业务需求:
精英教育是一家辅导小学生作文的培训机构,里面课程有作文课,珠心算,神奇的书法课,英语单词背诵课,现在需要计算每天来学习的人数
整个精英教育开通VIP人员:
整体算
VIP开通总数量=正式VIP学员开通数量+试听VIP学员开通数量 +线上转VIP(只记新增,不计流失)
按课程算:
VIP开通总数量=正式VIP学员开通数量+试听VIP学员开通数量 +脱产转VIP + VIP转VIP转入(只记新增,不计流失)
**正式VIP学员开通数量:已报课程学员
**VIP转线上人数:这家培训机构在我们县的各个乡镇开了好几家分店,对于偏远地区不想来培训班学习的学员,可以找老师购买课程号,给他开通在家进行学习。
**vip转vip:本来学习珠心算的同学现在不想学了,想学习作文课,进行换班学习
生产测试数据
导包
模板文件: jyjy.ftl
<#list stus as stu>
{"student_email":"${stu.student_email}", "student_name":"${stu.student_name}", "student_gender":"${stu.student_gender}", "student_phone":"${stu.student_phone}", "student_education_background":"${stu.student_education_background}", "student_order_code":"${stu.student_order_code}", "class_code":"${stu.class_code}", "series_class_code":"${stu.series_class_code}", "center_code":"${stu.center_code}","course_id":"${stu.course_id}", "study_mode":"${stu.study_mode}", "account_state":"${stu.account_state}", "create_time":"${stu.create_time}", "operation_mode":"${stu.operation_mode}"<#if stu.operation_mode == "update">, "modified_fields":[ {"student_email":"${stu.student_email}"}, {"student_phone":"${stu.student_phone}"} ], "before_class_study_mode":"${stu.before_class_study_mode}", "after_class_study_mode":"${stu.after_class_study_mode}", "before_course_id":"${stu.before_course_id}", "after_course_id":"${stu.after_course_id}"</#if>}
</#list>
生成bean类
package com.lj.domain;
import java.util.List;
import java.util.Map;
/**
 * Plain JavaBean mirroring one student JSON record emitted by the
 * jyjy.ftl FreeMarker template. All scalar fields are kept as String
 * to match the JSON payload; modified_fields holds the optional
 * "update" payload as a list of single-entry maps.
 */
public class Stu {
	// --- profile fields ---
	private String student_email;
	private String student_name;
	private String student_gender;
	private String student_phone;
	private String student_education_background;
	private String student_order_code;
	// --- class / course assignment ---
	private String class_code;
	private String series_class_code;
	private String center_code;
	private String course_id;
	private String study_mode;
	private String account_state;
	// --- record metadata ---
	private String create_time;
	private String operation_mode;
	// Present only for operation_mode == "update" records.
	private List<Map<String,String>> modified_fields;
	private String before_class_study_mode;
	private String after_class_study_mode;
	private String before_course_id;
	private String after_course_id;

	/** No-arg constructor required by bean-style frameworks. */
	public Stu() {
	}

	/** All-args constructor used when generating test data. */
	public Stu(String student_email, String student_name, String student_gender, String student_phone,
			String student_education_background, String student_order_code, String class_code, String series_class_code,
			String center_code, String course_id, String study_mode, String account_state, String create_time,
			String operation_mode, List<Map<String, String>> modified_fields, String before_class_study_mode,
			String after_class_study_mode, String before_course_id, String after_course_id) {
		this.student_email = student_email;
		this.student_name = student_name;
		this.student_gender = student_gender;
		this.student_phone = student_phone;
		this.student_education_background = student_education_background;
		this.student_order_code = student_order_code;
		this.class_code = class_code;
		this.series_class_code = series_class_code;
		this.center_code = center_code;
		this.course_id = course_id;
		this.study_mode = study_mode;
		this.account_state = account_state;
		this.create_time = create_time;
		this.operation_mode = operation_mode;
		this.modified_fields = modified_fields;
		this.before_class_study_mode = before_class_study_mode;
		this.after_class_study_mode = after_class_study_mode;
		this.before_course_id = before_course_id;
		this.after_course_id = after_course_id;
	}

	// --- accessors (names follow the JSON keys, required by the template) ---

	public String getStudent_email() {
		return student_email;
	}
	public void setStudent_email(String student_email) {
		this.student_email = student_email;
	}

	public String getStudent_name() {
		return student_name;
	}
	public void setStudent_name(String student_name) {
		this.student_name = student_name;
	}

	public String getStudent_gender() {
		return student_gender;
	}
	public void setStudent_gender(String student_gender) {
		this.student_gender = student_gender;
	}

	public String getStudent_phone() {
		return student_phone;
	}
	public void setStudent_phone(String student_phone) {
		this.student_phone = student_phone;
	}

	public String getStudent_education_background() {
		return student_education_background;
	}
	public void setStudent_education_background(String student_education_background) {
		this.student_education_background = student_education_background;
	}

	public String getStudent_order_code() {
		return student_order_code;
	}
	public void setStudent_order_code(String student_order_code) {
		this.student_order_code = student_order_code;
	}

	public String getClass_code() {
		return class_code;
	}
	public void setClass_code(String class_code) {
		this.class_code = class_code;
	}

	public String getSeries_class_code() {
		return series_class_code;
	}
	public void setSeries_class_code(String series_class_code) {
		this.series_class_code = series_class_code;
	}

	public String getCenter_code() {
		return center_code;
	}
	public void setCenter_code(String center_code) {
		this.center_code = center_code;
	}

	public String getCourse_id() {
		return course_id;
	}
	public void setCourse_id(String course_id) {
		this.course_id = course_id;
	}

	public String getStudy_mode() {
		return study_mode;
	}
	public void setStudy_mode(String study_mode) {
		this.study_mode = study_mode;
	}

	public String getAccount_state() {
		return account_state;
	}
	public void setAccount_state(String account_state) {
		this.account_state = account_state;
	}

	public String getCreate_time() {
		return create_time;
	}
	public void setCreate_time(String create_time) {
		this.create_time = create_time;
	}

	public String getOperation_mode() {
		return operation_mode;
	}
	public void setOperation_mode(String operation_mode) {
		this.operation_mode = operation_mode;
	}

	public List<Map<String, String>> getModified_fields() {
		return modified_fields;
	}
	public void setModified_fields(List<Map<String, String>> modified_fields) {
		this.modified_fields = modified_fields;
	}

	public String getBefore_class_study_mode() {
		return before_class_study_mode;
	}
	public void setBefore_class_study_mode(String before_class_study_mode) {
		this.before_class_study_mode = before_class_study_mode;
	}

	public String getAfter_class_study_mode() {
		return after_class_study_mode;
	}
	public void setAfter_class_study_mode(String after_class_study_mode) {
		this.after_class_study_mode = after_class_study_mode;
	}

	public String getBefore_course_id() {
		return before_course_id;
	}
	public void setBefore_course_id(String before_course_id) {
		this.before_course_id = before_course_id;
	}

	public String getAfter_course_id() {
		return after_course_id;
	}
	public void setAfter_course_id(String after_course_id) {
		this.after_course_id = after_course_id;
	}

	/** Debug representation; field order matches declaration order. */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder("Stu [");
		sb.append("student_email=").append(student_email);
		sb.append(", student_name=").append(student_name);
		sb.append(", student_gender=").append(student_gender);
		sb.append(", student_phone=").append(student_phone);
		sb.append(", student_education_background=").append(student_education_background);
		sb.append(", student_order_code=").append(student_order_code);
		sb.append(", class_code=").append(class_code);
		sb.append(", series_class_code=").append(series_class_code);
		sb.append(", center_code=").append(center_code);
		sb.append(", course_id=").append(course_id);
		sb.append(", study_mode=").append(study_mode);
		sb.append(", account_state=").append(account_state);
		sb.append(", create_time=").append(create_time);
		sb.append(", operation_mode=").append(operation_mode);
		sb.append(", modified_fields=").append(modified_fields);
		sb.append(", before_class_study_mode=").append(before_class_study_mode);
		sb.append(", after_class_study_mode=").append(after_class_study_mode);
		sb.append(", before_course_id=").append(before_course_id);
		sb.append(", after_course_id=").append(after_course_id);
		return sb.append("]").toString();
	}
}
准备数据类
package com.lj.fm;
import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.lj.domain.Stu;
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateExceptionHandler;
/**
 * Generates the jyjy.json test-data file by applying six sample Stu
 * records to the FreeMarker template "template/jyjy.ftl".
 *
 * Fixes over the original: the data model uses generic types instead of
 * raw Map/HashMap, and the output writer is closed via try-with-resources
 * so it is not leaked when template.process(...) throws.
 */
public class TTsFMDemo01 {
	public static void main(String[] args) throws Exception {
		// 1. FreeMarker configuration
		Configuration cfg = new Configuration(Configuration.VERSION_2_3_27);
		// -- directory the templates are loaded from
		cfg.setDirectoryForTemplateLoading(new File("template"));
		// -- encoding used to read template files
		cfg.setDefaultEncoding("UTF-8");
		// -- fail fast on template errors instead of writing error text into the output
		cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
		cfg.setLogTemplateExceptions(false);
		cfg.setWrapUncheckedExceptions(true);
		// 2. sample data: "update" records carry before/after fields, "add" records do not
		Stu stu1 = new Stu("pq@jyjy.cn","pq","male","110","xx","112233","123321","e1808","999","english","vip","nomal","2019-06-23","update",null,"vip","vip","english","math");
		Stu stu2 = new Stu("swk@jyjy.cn","swk","male","119","xx","112233","123321","m1808","999","math","vip","nomal","2019-06-23","add",null,null,null,null,null);
		Stu stu3 = new Stu("zbj@qq.cn","zbj","female","110","xx","112233","123321","w1808","999","write","vip","nomal","2019-06-23","update",null,"vip","vip","write","english");
		Stu stu4 = new Stu("swj@163.com","swj","female","110","xx","112233","123321","m1808","999","math","vip","nomal","2019-06-23","update",null,"vip","vip","math","write");
		Stu stu5 = new Stu("ts@qq.cn","ts","female","110","xx","112233","123321","e1808","999","english","vip","audition","2019-06-23","update",null,"vip","vip","english","math");
		Stu stu6 = new Stu("blm@souhu.cn","blm","female","110","xx","112233","123321","m1808","999","math","vip","audition","2019-06-23","add",null,null,null,null,null);
		List<Stu> stus = Arrays.asList(stu1,stu2,stu3,stu4,stu5,stu6);
		// fixed raw types: the template model is a name -> object map
		Map<String, Object> model = new HashMap<String, Object>();
		model.put("stus", stus);
		// 3. load the template
		Template template = cfg.getTemplate("jyjy.ftl");
		// 4. render the model to jyjy.json; try-with-resources flushes and
		//    closes the writer even if processing fails
		try (Writer writer = new FileWriter("jyjy.json")) {
			template.process(model, writer);
		}
	}
}
Hive中存储json格式的数据
加载hcatalog包,此包已经包含在hive中
add jar /home/software/apache-hive-1.2.0-bin/hcatalog/share/hcatalog/hive-hcatalog-core-1.2.0.jar;
创建可以处理json格式数据的表
CREATE TABLE vip(
student_email string,
student_name string,
student_gender string,
student_phone string,
student_education_background string,
student_order_code string,
class_code string,
series_class_code string,
center_code string,
course_id string,
study_mode string,
account_state string,
create_time string,
operation_mode string,
modified_fields array<
struct<
student_email:string,
student_name:string,
student_gender:string,
student_phone:string,
education_background:string,
student_order_code:string,
class_code:string,
series_class_code:string,
center_code:string,
study_mode:string,
course_id:string,
operation_mode:string,
create_time:string,
study_type:string
>
>,
before_class_study_mode string,
after_class_study_mode string,
before_course_id string,
after_course_id string
)ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' ;
加载数据
load data local inpath '/root/vip.json' into table vip;
Flume收集数据到Hive表
总体思路:1.hive可以存储json格式的数据
2.flume基于hive sink可以把数据写到hive表里面
要求:1.有事务机制,2.启动远程元数据服务
create database hive charset 'latin1';
1.配置环境变量
vim /etc/profile
export HIVE_HOME=/home/software/apache-hive-1.2.0-bin
export HCAT_HOME=/home/software/apache-hive-1.2.0-bin/hcatalog/
export PATH=$PATH:$HIVE_HOME
export PATH=$PATH:$HCAT_HOME
source /etc/profile
2.初始化hive的元数据库
./schematool -initSchema -dbType mysql
此时登录数据库,
use hive;
show tables;
我们会发现有53张表,是因为hive的元数据表已经全部初始化好了
启动hive:./hive
create database jyjy;
3.建立事务性vip表
此时这张表是带有事务的表,你可以对它进行增删改查的操作
要求:1.必须有桶,2.必须采用orc格式存储 3.必须开启表的事务功能
CREATE TABLE vip(
student_email string,
student_name string,
student_gender string,
student_phone string,
student_education_background string,
student_order_code string,
class_code string,
series_class_code string,
center_code string,
course_id string,
study_mode string,
account_state string,
create_time string,
operation_mode string,
modified_fields array<
struct<
student_email:string,
student_name:string,
student_gender:string,
student_phone:string,
student_education_background:string,
student_order_code:string,
class_code:string,
series_class_code:string,
center_code:string,
course_id:string,
study_mode:string,
account_state:string,
create_time:string
>
>,
before_class_study_mode string,
after_class_study_mode string,
before_course_id string,
after_course_id string
) partitioned by (reportTime string)
clustered by (student_order_code) into 4 buckets -- 必须有桶
stored as orc -- 必须采用orc格式存储
TBLPROPERTIES('transactional'='true'); -- 必须开启表的事务功能
-- 注意:不要再配置 ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe',采用orc格式存储时数据被压缩不再是json格式
4.启动Hive的Remote metastore server - hive远程元数据服务
./hive --service metastore
此服务默认监听9083端口,可以通过thrift://clientip:9083方式连接。
Flume中hive sink 操作hive时需要用到hive中的元数据信息,所以需要连接连接hive的远程元数据服务,需要先启动此服务。
5.编写flume配置文件
vim jyjy.conf
#配置Agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

#配置Source
#指定从文件夹读取数据
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/jyjy
#忽略.tmp文件
a1.sources.r1.ignorePattern = ^.*\\.tmp$
#配置拦截器,将日志体中的时间加到日志头,用于分区
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
a1.sources.r1.interceptors.i1.regex = ^.*"create_time":"(\\d{4}-\\d{2}-\\d{2})".*$
a1.sources.r1.interceptors.i1.serializers = s1
a1.sources.r1.interceptors.i1.serializers.s1.name = today

#配置Sink
a1.sinks.k1.type = hive
a1.sinks.k1.hive.metastore = thrift://lj01:9083
a1.sinks.k1.hive.database = jyjydb
a1.sinks.k1.hive.table = vip
a1.sinks.k1.hive.partition = %{today}
a1.sinks.k1.serializer = json

#配置Channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000

#绑定关系
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
6.启动flume agent
flume-ng agent --conf ../conf --conf-file ../conf/jyjy.conf --name a1 -Dflume.root.logger=INFO,console
7.将保存测试数据的jyjy.json.tmp上传到 /home/jyjy,上传完成后把 .tmp去掉,就会开始处理数据
select * from vip; 可以发现数据已经加到hive
在Hive中进行数据处理
数据清洗
所有统计都需要去除@jyjy.cn的员工管理账号
方法一:因为此时hive已经具有对表的增删改查,可以直接在表中对不符合的邮箱进行删除
方法二:建立一个新的clearvip清洗表,每天写入数据
为了保证数据不被破坏我选择方法二,这根据自己习惯选择
create table vipclear like vip;
insert into vipclear partition(reportTime = '2019-06-23')
select student_email,student_name,student_gender,student_phone,student_education_background,student_order_code,class_code,series_class_code,center_code,course_id,study_mode,account_state,create_time,operation_mode,modified_fields,before_class_study_mode,after_class_study_mode,before_course_id,after_course_id from vip where reportTime='2019-06-23' and student_email not rlike '^(\\w+@jyjy.cn)$';
此时将符合条件的数据查询到写入vipclear表中。
指标计算 - 新增vip开通数量
新增VIP开通数量=正式VIP学员开通数量+试听VIP学员开通数量 +脱产转VIP + VIP转VIP转入(只记新增,不计流失)
创建vip_open_num数据表:时间,课程编号,账户状态,数量,开通模式
create table vip_open_num(reporttime string , course_id string , account_state string ,num int,open_type string) row format delimited fields terminated by '|' ;
从数据清洗表中导入数据到vip_open_num数据表
新增VIP学员正式开通数量
insert into vip_open_num
select
reporttime,course_id,account_state,count(*),"new_add"
from
vipclear
where
reporttime = '2019-06-23'
and
operation_mode = 'add'
and
study_mode = 'vip'
and
account_state = 'nomal'
group by course_id
新增试听开通数量
insert into vip_open_num
select
reporttime,course_id,account_state,count(*),"new_add_audition"
from
vipclear
where
reporttime = '2019-06-23'
and
operation_mode = 'add'
and
study_mode = 'vip'
and
account_state = 'audition'
group by course_id
线下转vip
insert into vip_open_num
select
reporttime,after_course_id,account_state,count(*),"offline2vip"
from
vipclear
where
reporttime = '2019-06-23'
and
operation_mode = 'update'
and
before_class_study_mode='offline'
and
after_class_study_mode='vip'
group by
after_course_id,account_state
vip转vip转入
insert into vip_open_num
select
reporttime,after_course_id,account_state,count(*),"vip2vip"
from
vipclear
where
reporttime = '2019-06-23'
and
operation_mode = 'update'
and
before_class_study_mode='vip'
and
after_class_study_mode='vip'
and
before_course_id != after_course_id
group by
after_course_id
查询结果
查询指定课程方向新增vip开通数量
select
course_id,count(num)
from
vip_open_num
where
reportTime = '2019-06-23'
and
course_id = 'big'
查询整个精英教育新增vip开通数量
select
reportTime,count(num)
from
vip_open_num
where
reportTime ='2019-06-23'
and
open_type !='vip2vip'
导出到关系型数据库
导出数据技术方案
1.通过sqoop导出数据
2.通过SpringData直接将结果数据读取出来写入mysql
这次数据查询非常灵活,所以我选择方法2.
#在MySql中
#创建数据库
create database jyjydb;
use jyjydb;
创建事实表
create table vip_open_num(id int primary key auto_increment,time_id varchar(10),course_id varchar(10),account_state_id varchar(10),open_type_id varchar(10),num int);
创建时间维度表
create table jyjy_time(time_id varchar(10),year int,season int,month int,day int,week int);
#初始化时间维度表
insert into jyjy_time values('2019-06-23',2019, 1,6,13,7);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,1);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,2);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,3);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,4);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,5);
insert into jyjy_time values('2019-06-23',2019, 1,6,13,6);
#创建学科方向维度表
create table jyjy_course(
course_id varchar(10),
course_name varchar(10)
);
#初始化学科方向维度表
insert into jyjy_course values('math','数学');
insert into jyjy_course values('english','英语');
#创建账户状态维度表
create table jyjy_account_state(
account_state_id varchar(10),
account_state_name varchar(10)
);
#初始化账户状态维度表
insert into jyjy_account_state values('audition','试听');
insert into jyjy_account_state values('normal','正式');
#创建开通方式维度表
create table jyjy_opentype(
opentype_id varchar(40),
opentype_name varchar(40)
);
insert into jyjy_opentype values ('new_add','新增');
insert into jyjy_opentype values ('vip_to_vip','vip转vip转入');
insert into jyjy_opentype values ('offline_to_vip','脱产转vip');
开发SpringData
pom文件
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>SpringDataHive_Demo01</groupId>
<artifactId>SpringDataHive_Demo01</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>SpringDataHive_Demo01</name>
<url>http://maven.apache.org</url>
<properties>
<spring.version>4.1.6.RELEASE</spring.version>
<slf4j.version>1.7.6</slf4j.version>
<log4j.version>1.2.17</log4j.version>
<hamcrest.version>1.3</hamcrest.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spring.hadoop.version>2.3.0.M1</spring.hadoop.version>
<hadoop.version>2.7.1</hadoop.version>
<hive.version>1.2.1</hive.version>
</properties>
<dependencies>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.8</version>
<scope>system</scope>
<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
<!-- junit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>
<!-- Spring -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-beans</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aop</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<!-- Spring Data Hadoop -->
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-hadoop</artifactId>
<version>${spring.hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- mySql数据库驱动 -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<!-- Spring Data Jdbc -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>${spring.version}</version>
</dependency>
<!--SpringData Jpa -->
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-jpa</artifactId>
<version>1.8.0.RELEASE</version>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-entitymanager</artifactId>
<version>4.3.6.Final</version>
</dependency>
<!-- Spring Test -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- Spring Tx -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<scope>compile</scope>
</dependency>
<!-- Hive -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-service</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.thrift</groupId>
<artifactId>libfb303</artifactId>
<version>0.9.1</version>
</dependency>
<!-- runtime Hive deps start -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
<version>${hive.version}</version>
<scope>runtime</scope>
</dependency>
<!-- <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-builtins</artifactId>
<version>${hive.version}</version> <scope>runtime</scope> </dependency> -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-shims</artifactId>
<version>${hive.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-serde</artifactId>
<version>${hive.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-contrib</artifactId>
<version>${hive.version}</version>
<scope>runtime</scope>
</dependency>
<!-- runtime Hive deps end -->
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy</artifactId>
<version>1.8.5</version>
<scope>runtime</scope>
</dependency>
</dependencies>
</project>
ApplicationContext.xml
<?xml version="1.0" encoding="UTF-8"?>
<beans:beans xmlns="http://www.springframework.org/schema/hadoop"
xmlns:beans="http://www.springframework.org/schema/beans"
xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:jpa="http://www.springframework.org/schema/data/jpa"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx-4.0.xsd
http://www.springframework.org/schema/hadoop
http://www.springframework.org/schema/hadoop/spring-hadoop.xsd
http://www.springframework.org/schema/data/jpa
http://www.springframework.org/schema/data/jpa/spring-jpa-1.3.xsd
"
>
<!-- 1.配置MySql数据源 -->
<beans:bean id="dataSource"
class="org.springframework.jdbc.datasource.DriverManagerDataSource">
<beans:property name="driverClassName" value="com.mysql.jdbc.Driver"></beans:property>
<beans:property name="url" value="jdbc:mysql://lj01:3306/jyjydb"></beans:property>
<beans:property name="username" value="root"></beans:property>
<beans:property name="password" value="root"></beans:property>
</beans:bean>
<!-- 2.配置SpringDataJPA实体管理器工厂 -->
<beans:bean id="entityManagerFactory"
class="org.springframework.orm.jpa.LocalContainerEntityManagerFactoryBean">
<!-- 数据源 -->
<beans:property name="dataSource" ref="dataSource"></beans:property>
<beans:property name="jpaVendorAdapter">
<beans:bean class="org.springframework.orm.jpa.vendor.HibernateJpaVendorAdapter" />
</beans:property>
<!-- 配置扫描哪个包去寻找实体配置 -->
<beans:property name="packagesToScan" value="com.lj.domain" />
<!-- JPA相关配置 -->
<beans:property name="jpaProperties">
<beans:props>
<!-- 命名策略 -->
<beans:prop key="hibernate.ejb.naming_strategy">org.hibernate.cfg.ImprovedNamingStrategy</beans:prop>
<!-- 数据库方言 -->
<beans:prop key="hibernate.dialect">org.hibernate.dialect.MySQL5InnoDBDialect</beans:prop>
<!-- 是否显示sql -->
<beans:prop key="hibernate.show_sql">true</beans:prop>
<!-- 是否要格式化sql -->
<beans:prop key="hibernate.format_sql">true</beans:prop>
<!-- 是否要自动创建表 -->
<beans:prop key="hibernate.hbm2ddl.auto">update</beans:prop>
</beans:props>
</beans:property>
</beans:bean>
<!-- 3.配置SpringDataJPA事务管理器 -->
<beans:bean id="transactionManager" class="org.springframework.orm.jpa.JpaTransactionManager">
<beans:property name="entityManagerFactory" ref="entityManagerFactory" />
</beans:bean>
<!-- 4.配置支持注解的事务 -->
<tx:annotation-driven transaction-manager="transactionManager" />
<!-- 5.配置spring data -->
<jpa:repositories base-package="com.lj"
entity-manager-factory-ref="entityManagerFactory" />
<!-- 6.配置spring包扫描 -->
<context:component-scan base-package="com.lj.domain"></context:component-scan>
<context:component-scan base-package="com.lj.repository"></context:component-scan>
<!-- 7.配置HDFS地址 -->
<configuration>
fs.defaultFS=hdfs://lj01:9000
</configuration>
<!-- 8.Hive客户端工厂 -->
<hive-client-factory id="hiveClientFactory"
hive-data-source-ref="hiveDataSource" />
<!-- 9.Hive驱动 -->
<beans:bean id="hiveDriver" class="org.apache.hive.jdbc.HiveDriver" />
<!-- 10.Hive数据源 -->
<beans:bean id="hiveDataSource"
class="org.springframework.jdbc.datasource.SimpleDriverDataSource">
<beans:constructor-arg name="driver" ref="hiveDriver" />
<beans:constructor-arg name="url"
value="jdbc:hive2://lj01:10000/jyjydb" />
<beans:constructor-arg name="username" value="root" />
<beans:constructor-arg name="password" value="root" />
</beans:bean>
<!-- 11.Hive模板类,用于操作hive -->
<hive-template id="hiveTemplate" />
</beans:beans>
主要逻辑
package com.lj;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.data.hadoop.hive.HiveClient;
import org.springframework.data.hadoop.hive.HiveClientCallback;
import org.springframework.data.hadoop.hive.HiveTemplate;
import com.lj.domain.Vip_Open_Num_Bean;
import com.lj.repository.VONBRepository;
/**
 * Daily batch job: cleans the raw vip records in Hive, aggregates the
 * "new vip opening" metrics into vip_open_num, reads the result back
 * and persists it into MySQL via Spring Data JPA.
 *
 * Fixes over the original:
 *  - date pattern "YYYY-MM-dd" used the week-based year and produces wrong
 *    partition keys around New Year; it must be lower-case "yyyy".
 *  - the first insert read from a nonexistent table "dataclear"; the
 *    cleansed table created in step 2 is "vipclear".
 */
public class jyjyStart {
	public static void main(String[] args) {
		// 0. today's partition key, e.g. "2019-06-23"
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
		final String dateStr = sdf.format(new Date());
		// 1. bootstrap the Spring container and fetch the Hive template
		ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
		HiveTemplate hiveTemplate = (HiveTemplate) context.getBean("hiveTemplate");
		// 2. cleanse today's raw data into vipclear (drops @jyjy.cn staff accounts)
		String hql = "insert into vipclear partition(reportTime = '"+dateStr+"') select student_email,student_name,student_gender,student_phone,student_education_background,student_order_code,class_code,series_class_code,center_code,course_id,study_mode,account_state,create_time,operation_mode,modified_fields,before_class_study_mode,after_class_study_mode,before_course_id,after_course_id from vip where reportTime='"+dateStr+"' and student_email not rlike '^(\\w+@jyjy.cn)$';";
		hiveTemplate.query(hql);
		// 3. aggregate metrics into vip_open_num
		// -- newly opened vip accounts (formal + audition, split by account_state)
		hql = "insert into vip_open_num select reporttime,course_id,account_state,count(*),'new_add' from vipclear where reporttime = '"+dateStr+"' and operation_mode = 'add' and study_mode = 'vip' group by reporttime,course_id,account_state";
		hiveTemplate.query(hql);
		// -- offline students converted to vip
		hql = "insert into vip_open_num select reporttime,after_course_id,account_state,count(*),'offline2vip' from vipclear where reporttime = '"+dateStr+"' and operation_mode = 'update' and before_class_study_mode='offline' and after_class_study_mode='vip' group by reporttime,after_course_id,account_state ";
		hiveTemplate.query(hql);
		// -- vip-to-vip transfers (course changed)
		hql = "insert into vip_open_num select reporttime,after_course_id,account_state,count(*),'vip2vip' from vipclear where reporttime = '"+dateStr+"' and operation_mode = 'update' and before_class_study_mode='vip' and after_class_study_mode='vip' and before_course_id != after_course_id group by reporttime,after_course_id,account_state ";
		hiveTemplate.query(hql);
		// 4. read today's aggregated rows back from Hive via JDBC on the HiveClient
		List<Vip_Open_Num_Bean> list = hiveTemplate.execute(new HiveClientCallback<List<Vip_Open_Num_Bean>>() {
			public List<Vip_Open_Num_Bean> doInHive(HiveClient hiveClient) throws Exception {
				List<Vip_Open_Num_Bean> rows = new ArrayList<Vip_Open_Num_Bean>();
				String query = "select * from vip_open_num where reportTime=?";
				Connection conn = hiveClient.getConnection();
				PreparedStatement ps = conn.prepareStatement(query);
				ps.setString(1, dateStr);
				ResultSet rs = ps.executeQuery();
				while (rs.next()) {
					Vip_Open_Num_Bean vonb = new Vip_Open_Num_Bean();
					vonb.setTime_id(rs.getString("reporttime"));
					vonb.setCourse_id(rs.getString("course_id"));
					vonb.setAccount_state_id(rs.getString("account_state"));
					vonb.setOpen_type_id(rs.getString("open_type"));
					vonb.setNum(rs.getInt("num"));
					rows.add(vonb);
				}
				return rows;
			}
		});
		System.out.println(list.size());
		// 5. persist the result rows into the MySQL vip_open_num table
		VONBRepository vonbRepository = context.getBean(VONBRepository.class);
		vonbRepository.save(list);
	}
}
实体类
package com.lj.domain;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
/**
 * JPA entity mapped to the MySQL fact table "vip_open_num".
 * One row = the number of vip openings for a given day, course,
 * account state and opening type.
 */
@Entity(name="vip_open_num")
public class Vip_Open_Num_Bean {
	private int id;                   // surrogate primary key (auto-generated)
	private String time_id;           // report date, FK into jyjy_time
	private String course_id;         // FK into jyjy_course
	private String account_state_id;  // FK into jyjy_account_state
	private String open_type_id;      // FK into jyjy_opentype
	private int num;                  // opening count

	/** No-arg constructor required by JPA. */
	public Vip_Open_Num_Bean() {
	}

	/** Convenience all-args constructor. */
	public Vip_Open_Num_Bean(int id,String time_id, String course_id, String account_state_id, String open_type_id, int num) {
		this.id = id;
		this.time_id = time_id;
		this.course_id = course_id;
		this.account_state_id = account_state_id;
		this.open_type_id = open_type_id;
		this.num = num;
	}

	@Id
	@GeneratedValue
	public int getId() {
		return id;
	}
	public void setId(int id) {
		this.id = id;
	}

	public String getTime_id() {
		return time_id;
	}
	public void setTime_id(String time_id) {
		this.time_id = time_id;
	}

	public String getCourse_id() {
		return course_id;
	}
	public void setCourse_id(String course_id) {
		this.course_id = course_id;
	}

	public String getAccount_state_id() {
		return account_state_id;
	}
	public void setAccount_state_id(String account_state_id) {
		this.account_state_id = account_state_id;
	}

	public String getOpen_type_id() {
		return open_type_id;
	}
	public void setOpen_type_id(String open_type_id) {
		this.open_type_id = open_type_id;
	}

	public int getNum() {
		return num;
	}
	public void setNum(int num) {
		this.num = num;
	}

	/** Debug representation; field order matches declaration order. */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder("Vip_Open_Num_Bean [");
		sb.append("id=").append(id);
		sb.append(", time_id=").append(time_id);
		sb.append(", course_id=").append(course_id);
		sb.append(", account_state_id=").append(account_state_id);
		sb.append(", open_type_id=").append(open_type_id);
		sb.append(", num=").append(num);
		return sb.append("]").toString();
	}
}
核心接口CrudRepository
package com.lj.repository;
import org.springframework.data.repository.CrudRepository;
import com.lj.domain.Vip_Open_Num_Bean;
/**
 * Spring Data CRUD repository for {@link Vip_Open_Num_Bean}.
 *
 * Fix: the ID type parameter must match the entity's {@code int id}
 * primary key (boxed to {@code Integer}); the original declared
 * {@code String}, which breaks every by-id operation (findOne, exists,
 * delete) at runtime.
 */
public interface VONBRepository extends CrudRepository<Vip_Open_Num_Bean, Integer> {
}