Elite Education Business Data Processing and Analysis

Table of Contents

Business Background

Business Requirements

Generating Test Data

Importing Dependencies

Template File: jyjy.ftl

The Bean Class

The Data Generator Class

Storing JSON Data in Hive

Loading the HCatalog JAR (already bundled with Hive)

Creating a Table That Can Parse JSON Data

Loading the Data

Collecting Data into a Hive Table with Flume

1. Configure environment variables

2. Initialize the Hive metastore database

3. Create the transactional vip table

4. Start the Hive remote metastore server

5. Write the Flume configuration file

6. Start the Flume agent

Data Processing in Hive

Data Cleaning

Metric Calculation: New VIP Activations

New Formal VIP Activations

New Audition Activations

Offline-to-VIP Conversions

VIP-to-VIP Transfers In

Querying the Results

Exporting to a Relational Database

Export Options

Creating the Fact Table

Creating the Time Dimension Table

Developing with Spring Data

The POM File

ApplicationContext.xml

Main Logic

The Entity Class

The Core Interface: CrudRepository



Business Background

Business Requirements

Elite Education is a training institution that tutors elementary school students. Its courses include composition writing, mental abacus arithmetic, calligraphy, and English vocabulary recitation. The business now needs to count, per day, how many students come to study.

VIP activations across all of Elite Education:

Calculated overall:

Total VIP activations = formal VIP activations + audition VIP activations + offline-to-VIP conversions (count only new additions, not churn)

Calculated per course:

Total VIP activations = formal VIP activations + audition VIP activations + offline-to-VIP conversions + VIP-to-VIP transfers in (count only new additions, not churn)

**Formal VIP activations: students who have enrolled in a course.

**Offline-to-VIP conversions: the institution runs several branch locations across the townships of our county. Students in remote areas who do not want to attend classes in person can buy a course code from a teacher and have an account activated so they can study from home.

**VIP-to-VIP transfers: a student who was studying mental abacus no longer wants to continue and switches to the composition course instead.
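As a purely hypothetical worked example of these formulas: if on one day the composition course gains 3 formal activations, 2 audition activations, 1 offline-to-VIP conversion, and 1 VIP-to-VIP transfer in, its per-course total is 3 + 2 + 1 + 1 = 7, while the institution-wide total for that day is only 3 + 2 + 1 = 6, because a VIP-to-VIP transfer merely moves a student between courses.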

Generating Test Data

Importing Dependencies
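The generator below only needs the FreeMarker templating library on the classpath. The code targets the 2.3.27 API (see Configuration.VERSION_2_3_27 below), so freemarker-2.3.27.jar or any newer 2.3.x release should work.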

Template File: jyjy.ftl

<#list stus as stu>
{"student_email":"${stu.student_email}", "student_name":"${stu.student_name}", "student_gender":"${stu.student_gender}", "student_phone":"${stu.student_phone}", "student_education_background":"${stu.student_education_background}", "student_order_code":"${stu.student_order_code}", "class_code":"${stu.class_code}", "series_class_code":"${stu.series_class_code}", "center_code":"${stu.center_code}","course_id":"${stu.course_id}", "study_mode":"${stu.study_mode}", "account_state":"${stu.account_state}", "create_time":"${stu.create_time}", "operation_mode":"${stu.operation_mode}"<#if stu.operation_mode == "update">, "modified_fields":[ {"student_email":"${stu.student_email}"}, {"student_phone":"${stu.student_phone}"} ], "before_class_study_mode":"${stu.before_class_study_mode}", "after_class_study_mode":"${stu.after_class_study_mode}", "before_course_id":"${stu.before_course_id}", "after_course_id":"${stu.after_course_id}"</#if>}
</#list> 
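Each stu entry renders as a single JSON object on its own line. When operation_mode is "update", the template also emits the modified_fields array plus the before/after study-mode and course-id fields that the conversion metrics depend on later.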

The Bean Class

package com.lj.domain;

import java.util.List;
import java.util.Map;

public class Stu {
	private String student_email;
	private String student_name;
	private String student_gender;
	private String student_phone;
	private String student_education_background;
	private String student_order_code;
	private String class_code;
	private String series_class_code;
	private String center_code;
	private String course_id;
	private String study_mode;
	private String account_state;
	private String create_time;
	private String operation_mode;
	private List<Map<String,String>> modified_fields;
	private String before_class_study_mode;
	private String after_class_study_mode;
	private String before_course_id;
	private String after_course_id;
	
	public Stu() {
	}

	public Stu(String student_email, String student_name, String student_gender, String student_phone,
			String student_education_background, String student_order_code, String class_code, String series_class_code,
			String center_code, String course_id, String study_mode, String account_state, String create_time,
			String operation_mode, List<Map<String, String>> modified_fields, String before_class_study_mode,
			String after_class_study_mode, String before_course_id, String after_course_id) {
		this.student_email = student_email;
		this.student_name = student_name;
		this.student_gender = student_gender;
		this.student_phone = student_phone;
		this.student_education_background = student_education_background;
		this.student_order_code = student_order_code;
		this.class_code = class_code;
		this.series_class_code = series_class_code;
		this.center_code = center_code;
		this.course_id = course_id;
		this.study_mode = study_mode;
		this.account_state = account_state;
		this.create_time = create_time;
		this.operation_mode = operation_mode;
		this.modified_fields = modified_fields;
		this.before_class_study_mode = before_class_study_mode;
		this.after_class_study_mode = after_class_study_mode;
		this.before_course_id = before_course_id;
		this.after_course_id = after_course_id;
	}

	public String getStudent_email() {
		return student_email;
	}
	public void setStudent_email(String student_email) {
		this.student_email = student_email;
	}
	public String getStudent_name() {
		return student_name;
	}
	public void setStudent_name(String student_name) {
		this.student_name = student_name;
	}
	public String getStudent_gender() {
		return student_gender;
	}
	public void setStudent_gender(String student_gender) {
		this.student_gender = student_gender;
	}
	public String getStudent_phone() {
		return student_phone;
	}
	public void setStudent_phone(String student_phone) {
		this.student_phone = student_phone;
	}
	public String getStudent_education_background() {
		return student_education_background;
	}
	public void setStudent_education_background(String student_education_background) {
		this.student_education_background = student_education_background;
	}
	public String getStudent_order_code() {
		return student_order_code;
	}
	public void setStudent_order_code(String student_order_code) {
		this.student_order_code = student_order_code;
	}
	public String getClass_code() {
		return class_code;
	}
	public void setClass_code(String class_code) {
		this.class_code = class_code;
	}
	public String getSeries_class_code() {
		return series_class_code;
	}
	public void setSeries_class_code(String series_class_code) {
		this.series_class_code = series_class_code;
	}
	public String getCenter_code() {
		return center_code;
	}
	public void setCenter_code(String center_code) {
		this.center_code = center_code;
	}
	public String getCourse_id() {
		return course_id;
	}
	public void setCourse_id(String course_id) {
		this.course_id = course_id;
	}
	public String getStudy_mode() {
		return study_mode;
	}
	public void setStudy_mode(String study_mode) {
		this.study_mode = study_mode;
	}
	public String getAccount_state() {
		return account_state;
	}
	public void setAccount_state(String account_state) {
		this.account_state = account_state;
	}
	public String getCreate_time() {
		return create_time;
	}
	public void setCreate_time(String create_time) {
		this.create_time = create_time;
	}
	public String getOperation_mode() {
		return operation_mode;
	}
	public void setOperation_mode(String operation_mode) {
		this.operation_mode = operation_mode;
	}
	public List<Map<String, String>> getModified_fields() {
		return modified_fields;
	}
	public void setModified_fields(List<Map<String, String>> modified_fields) {
		this.modified_fields = modified_fields;
	}
	public String getBefore_class_study_mode() {
		return before_class_study_mode;
	}
	public void setBefore_class_study_mode(String before_class_study_mode) {
		this.before_class_study_mode = before_class_study_mode;
	}
	public String getAfter_class_study_mode() {
		return after_class_study_mode;
	}
	public void setAfter_class_study_mode(String after_class_study_mode) {
		this.after_class_study_mode = after_class_study_mode;
	}
	public String getBefore_course_id() {
		return before_course_id;
	}
	public void setBefore_course_id(String before_course_id) {
		this.before_course_id = before_course_id;
	}
	public String getAfter_course_id() {
		return after_course_id;
	}
	public void setAfter_course_id(String after_course_id) {
		this.after_course_id = after_course_id;
	}
	@Override
	public String toString() {
		return "Stu [student_email=" + student_email + ", student_name=" + student_name + ", student_gender="
				+ student_gender + ", student_phone=" + student_phone + ", student_education_background="
				+ student_education_background + ", student_order_code=" + student_order_code + ", class_code="
				+ class_code + ", series_class_code=" + series_class_code + ", center_code=" + center_code
				+ ", course_id=" + course_id + ", study_mode=" + study_mode + ", account_state=" + account_state
				+ ", create_time=" + create_time + ", operation_mode=" + operation_mode + ", modified_fields="
				+ modified_fields + ", before_class_study_mode=" + before_class_study_mode + ", after_class_study_mode="
				+ after_class_study_mode + ", before_course_id=" + before_course_id + ", after_course_id="
				+ after_course_id + "]";
	}

	
}

The Data Generator Class

package com.lj.fm;

import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.lj.domain.Stu;
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateExceptionHandler;

public class TTsFMDemo01 {
	public static void main(String[] args) throws Exception {
		//1. Create the configuration object
		Configuration cfg = new Configuration(Configuration.VERSION_2_3_27);
		//-- Directory to load templates from
		cfg.setDirectoryForTemplateLoading(new File("template"));
		//-- Encoding used when reading template files
		cfg.setDefaultEncoding("UTF-8");
		//-- Exception handling policy
		cfg.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
		cfg.setLogTemplateExceptions(false);
		cfg.setWrapUncheckedExceptions(true);
				
		//2. Prepare the data
		Stu stu1 = new Stu("pq@jyjy.cn","pq","male","110","xx","112233","123321","e1808","999","english","vip","normal","2019-06-23","update",null,"vip","vip","english","math");
		Stu stu2 = new Stu("swk@jyjy.cn","swk","male","119","xx","112233","123321","m1808","999","math","vip","normal","2019-06-23","add",null,null,null,null,null);
		Stu stu3 = new Stu("zbj@qq.cn","zbj","female","110","xx","112233","123321","w1808","999","write","vip","normal","2019-06-23","update",null,"vip","vip","write","english");
		Stu stu4 = new Stu("swj@163.com","swj","female","110","xx","112233","123321","m1808","999","math","vip","normal","2019-06-23","update",null,"vip","vip","math","write");
		Stu stu5 = new Stu("ts@qq.cn","ts","female","110","xx","112233","123321","e1808","999","english","vip","audition","2019-06-23","update",null,"vip","vip","english","math");
		Stu stu6 = new Stu("blm@souhu.cn","blm","female","110","xx","112233","123321","m1808","999","math","vip","audition","2019-06-23","add",null,null,null,null,null);
		List<Stu> stus = Arrays.asList(stu1,stu2,stu3,stu4,stu5,stu6);
		Map<String, Object> map = new HashMap<>();
		map.put("stus", stus);
		
		//3. Load the template
		Template template = cfg.getTemplate("jyjy.ftl");
		
		//4. Merge the template with the data and write the output
		Writer writer = new FileWriter("jyjy.json");
		template.process(map, writer);
		writer.flush();
		writer.close();
	}
}
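Running the generator writes jyjy.json with one JSON object per line. For example, the record built from stu2 (an "add" operation, so the <#if> branch does not fire) should render roughly as:

{"student_email":"swk@jyjy.cn", "student_name":"swk", "student_gender":"male", "student_phone":"119", "student_education_background":"xx", "student_order_code":"112233", "class_code":"123321", "series_class_code":"m1808", "center_code":"999","course_id":"math", "study_mode":"vip", "account_state":"normal", "create_time":"2019-06-23", "operation_mode":"add"}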

Storing JSON Data in Hive

Load the HCatalog JAR; it is already bundled with Hive:

add jar /home/software/apache-hive-1.2.0-bin/hcatalog/share/hcatalog/hive-hcatalog-core-1.2.0.jar; 

Create a table that can parse JSON-formatted data:

CREATE TABLE vip(
	student_email string,
	student_name string,
	student_gender string,
	student_phone string,
	student_education_background string,
	student_order_code string,
	class_code string,
	series_class_code string,
	center_code string,
	course_id string,
	study_mode string,
	account_state string,
	create_time string,
	operation_mode string,
	modified_fields array<
	    struct<
	        student_email:string,
	        student_name:string,
	        student_gender:string,
	        student_phone:string,
	        education_background:string,
	        student_order_code:string,
	        class_code:string,
	        series_class_code:string,
	        center_code:string,
	        study_mode:string,
	        course_id:string,
	        operation_mode:string,
	        create_time:string,
	        study_type:string
	    >
	>,
	before_class_study_mode string,
	after_class_study_mode string,
	before_course_id string,
	after_course_id string  
)ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' ;
		

Loading the Data

load data local inpath '/root/vip.json' into table vip;

Here the generated JSON file is assumed to have been uploaded to the server as /root/vip.json.

Collecting Data into a Hive Table with Flume

Overall approach: 1. Hive can store JSON-formatted data. 2. Flume's Hive sink can write that data into a Hive table.

Requirements: 1. the target table must be transactional; 2. Hive's remote metastore service must be running.

First, create the metastore database in MySQL:

create database hive charset 'latin1';

1. Configure environment variables

vim /etc/profile

export HIVE_HOME=/home/software/apache-hive-1.2.0-bin

export HCAT_HOME=/home/software/apache-hive-1.2.0-bin/hcatalog/

export PATH=$PATH:$HIVE_HOME/bin

export PATH=$PATH:$HCAT_HOME/bin

source /etc/profile

2. Initialize the Hive metastore database

./schematool -initSchema -dbType mysql

Now log in to MySQL and check:

use hive;

show tables;

You will find 53 tables: the Hive metastore schema has been fully initialized.

Start Hive: ./hive

Create the business database; the name must match the hive.database setting in the Flume configuration below:

create database jyjydb;
use jyjydb;

3. Create the transactional vip table

This table is transactional, so it supports inserts, updates, and deletes as well as queries.

Requirements: 1. the table must be bucketed; 2. it must be stored as ORC; 3. the table's transactional property must be enabled.

CREATE TABLE vip(
	student_email string,
	student_name string,
	student_gender string,
	student_phone string,
	student_education_background string,
	student_order_code string,
	class_code string,
	series_class_code string,
	center_code string,
	course_id string,
	study_mode string,
	account_state string,
	create_time string,
	operation_mode string,
	modified_fields array<
		struct<
		student_email:string,
		student_name:string,
		student_gender:string,
		student_phone:string,
		student_education_background:string,
		student_order_code:string,
		class_code:string,
		series_class_code:string,
		center_code:string,
		course_id:string,
		study_mode:string,
		account_state:string,
		create_time:string
		>
	>,
	before_class_study_mode string,
	after_class_study_mode string,
	before_course_id string,
	after_course_id string
)   partitioned by (reportTime string)
	clustered by (student_order_code) into 4 buckets -- must be bucketed
	stored as orc -- must be stored as ORC
	TBLPROPERTIES('transactional'='true'); -- must enable the table's transaction support

Note: do not add ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' here. With ORC storage the data is compressed and no longer JSON, so the JSON SerDe must not be configured.
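A note on server-side configuration: for inserts into a transactional table (including writes from the Flume Hive sink) to succeed, Hive's ACID support normally has to be enabled in hive-site.xml as well. The property names below are the standard ones for Hive 1.x; treat this as a sketch and verify the values against your own installation:

hive.support.concurrency=true
hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager
hive.compactor.initiator.on=true
hive.compactor.worker.threads=1
hive.enforce.bucketing=true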

4. Start the Hive remote metastore server

./hive --service metastore

This service listens on port 9083 by default and can be reached as thrift://<host>:9083.

Flume's Hive sink needs Hive's metadata to operate on the table, so it must connect to the remote metastore service; start this service before starting the agent.

5. Write the Flume configuration file

vim jyjy.conf

# Agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Source
a1.sources.r1.type = spooldir

# Directory to read files from
a1.sources.r1.spoolDir = /home/jyjy

# Ignore .tmp files
a1.sources.r1.ignorePattern = ^.*\\.tmp$

# Interceptor: copy the create_time date from the event body into a header,
# which the sink uses for partitioning
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
a1.sources.r1.interceptors.i1.regex = ^.*"create_time":"(\\d{4}-\\d{2}-\\d{2})".*$
a1.sources.r1.interceptors.i1.serializers = s1
a1.sources.r1.interceptors.i1.serializers.s1.name = today

# Sink
a1.sinks.k1.type = hive
a1.sinks.k1.hive.metastore = thrift://lj01:9083
a1.sinks.k1.hive.database = jyjydb
a1.sinks.k1.hive.table = vip
a1.sinks.k1.hive.partition = %{today}
a1.sinks.k1.serializer = json

# Channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000

# Bindings
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

6. Start the Flume agent

flume-ng agent --conf ../conf --conf-file ../conf/jyjy.conf --name a1 -Dflume.root.logger=INFO,console

7. Upload the test data as jyjy.json.tmp to /home/jyjy. Once the upload completes, remove the .tmp suffix and Flume will start processing the file.

select * from vip;  -- you will see the data has arrived in Hive

Data Processing in Hive

Data Cleaning

All statistics must exclude the staff/admin accounts whose email address ends in @jyjy.cn.

Option 1: since the Hive table now supports updates and deletes, delete the offending email rows directly from the table.

Option 2: create a separate cleaning table, vipclear, and write the cleaned data into it every day.

To keep the raw data intact I choose option 2; pick whichever suits your habits.

create table vipclear like vip;

insert into vipclear partition(reportTime = '2019-06-23')

select student_email,student_name,student_gender,student_phone,student_education_background,student_order_code,class_code,series_class_code,center_code,course_id,study_mode,account_state,create_time,operation_mode,modified_fields,before_class_study_mode,after_class_study_mode,before_course_id,after_course_id from vip where reportTime='2019-06-23' and student_email not rlike '^(\\w+@jyjy.cn)$';

The qualifying rows are now written into the vipclear table.
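With the six sample records generated earlier, this filter drops pq@jyjy.cn (stu1) and swk@jyjy.cn (stu2), leaving four genuine student records in vipclear.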

Metric Calculation: New VIP Activations

New VIP activations = formal VIP activations + audition VIP activations + offline-to-VIP conversions + VIP-to-VIP transfers in (count only new additions, not churn)

Create the vip_open_num table: report time, course id, account state, count, activation type.

create table vip_open_num(reporttime string , course_id string , account_state string ,num int,open_type string) row format delimited fields terminated by '|' ;

Load the results from the cleaning table into vip_open_num.

New Formal VIP Activations

insert into vip_open_num
	select
		reporttime,course_id,account_state,count(*),"new_add"
	from
		vipclear
	where
		reporttime = '2019-06-23'
		and
		operation_mode = 'add'
		and
		study_mode = 'vip'
		and
		account_state = 'normal'
	group by reporttime,course_id,account_state

New Audition Activations

insert into vip_open_num
	select
		reporttime,course_id,account_state,count(*),"new_add_audition"
	from
		vipclear
	where
		reporttime = '2019-06-23'
		and
		operation_mode = 'add'
		and
		study_mode = 'vip'
		and
		account_state = 'audition'
	group by reporttime,course_id,account_state

 

Offline-to-VIP Conversions

insert into vip_open_num
	select
		reporttime,after_course_id,account_state,count(*),"offline2vip"
	from
		vipclear
	where
		reporttime = '2019-06-23'
		and
		operation_mode = 'update'
		and
		before_class_study_mode='offline'
		and
		after_class_study_mode='vip'
	group by
		reporttime,after_course_id,account_state

VIP-to-VIP Transfers In

insert into vip_open_num
	select
		reporttime,after_course_id,account_state,count(*),"vip2vip"
	from 
		vipclear
	where
		reporttime = '2019-06-23'
		and
		operation_mode = 'update'
		and
		before_class_study_mode='vip'
		and
		after_class_study_mode='vip'
		and
		before_course_id != after_course_id
	group by
		reporttime,after_course_id,account_state

Querying the Results

Query the number of new VIP activations for a given course (math, for example):

select
    course_id,sum(num)
from
    vip_open_num
where
    reportTime = '2019-06-23'
    and
    course_id = 'math'
group by
    course_id

Query the number of new VIP activations across all of Elite Education:

select
    reportTime,sum(num)
from  
    vip_open_num
where
    reportTime ='2019-06-23'
    and 
    open_type !='vip2vip'
group by
    reportTime
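As a sanity check, assuming the Flume-loaded data is exactly the six sample records: after cleaning, the rows left are stu3, stu4, stu5 (all VIP-to-VIP course changes) and stu6 (a new audition signup for math). vip_open_num should therefore contain one new_add_audition row (math, 1) and three vip2vip rows (english, write, and math, 1 each); the per-course total for math is 1 + 1 = 2, while the institution-wide total, which excludes vip2vip, is 1.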

 

 

Exporting to a Relational Database

Export Options

1. Export the data with Sqoop.

2. Read the result data directly with Spring Data and write it into MySQL.

The queries involved here are quite flexible, so I choose option 2.

In MySQL, create the database:

create database jyjydb;
use jyjydb;

Creating the Fact Table

create table vip_open_num(id int primary key auto_increment,time_id varchar(10),course_id varchar(10),account_state_id varchar(10),open_type_id varchar(10),num int);

Creating the Time Dimension Table

create table jyjy_time(time_id varchar(10),year int,season int,month int,day int,week int);

Initialize the time dimension table (one row per date; 2019-06-23 falls in quarter 2 and is a Sunday):

insert into jyjy_time values('2019-06-23',2019,2,6,23,7);

Create the course dimension table:

create table jyjy_course(
    course_id varchar(10),
    course_name varchar(10)
);

Initialize the course dimension table:

insert into jyjy_course values('math','Math');
insert into jyjy_course values('english','English');
insert into jyjy_course values('write','Composition');

Create the account state dimension table:

create table jyjy_account_state(
    account_state_id varchar(10),
    account_state_name varchar(10)
);

Initialize the account state dimension table:

insert into jyjy_account_state values('audition','trial');
insert into jyjy_account_state values('normal','formal');

Create the activation type dimension table; the IDs must match the open_type values written by the Hive jobs above:

create table jyjy_opentype(
    opentype_id varchar(40),
    opentype_name varchar(40)
);
insert into jyjy_opentype values ('new_add','new formal signup');
insert into jyjy_opentype values ('new_add_audition','new audition signup');
insert into jyjy_opentype values ('offline2vip','offline-to-VIP');
insert into jyjy_opentype values ('vip2vip','VIP-to-VIP transfer in');

 

Developing with Spring Data

The POM File

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>SpringDataHive_Demo01</groupId>
	<artifactId>SpringDataHive_Demo01</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>SpringDataHive_Demo01</name>
	<url>http://maven.apache.org</url>

	<properties>
		<spring.version>4.1.6.RELEASE</spring.version>
		<slf4j.version>1.7.6</slf4j.version>
		<log4j.version>1.2.17</log4j.version>
		<hamcrest.version>1.3</hamcrest.version>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<spring.hadoop.version>2.3.0.M1</spring.hadoop.version>
		<hadoop.version>2.7.1</hadoop.version>
		<hive.version>1.2.1</hive.version>
	</properties>

	<dependencies>
		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.8</version>
			<scope>system</scope>
			<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
		</dependency>

		<!-- junit -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.10</version>
			<scope>test</scope>
		</dependency>

		<!-- Spring -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-core</artifactId>
			<version>${spring.version}</version>
			<exclusions>
				<exclusion>
					<groupId>commons-logging</groupId>
					<artifactId>commons-logging</artifactId>
				</exclusion>
			</exclusions>
		</dependency>

		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-beans</artifactId>
			<version>${spring.version}</version>
		</dependency>

		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context</artifactId>
			<version>${spring.version}</version>
		</dependency>

		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-aop</artifactId>
			<version>${spring.version}</version>
		</dependency>

		<!-- Logging -->
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>${slf4j.version}</version>
		</dependency>

		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>jcl-over-slf4j</artifactId>
			<version>${slf4j.version}</version>
		</dependency>

		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-log4j12</artifactId>
			<version>${slf4j.version}</version>
			<scope>runtime</scope>
		</dependency>

		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>${log4j.version}</version>
			<scope>runtime</scope>
		</dependency>

		<!-- Spring Data Hadoop -->
		<dependency>
			<groupId>org.springframework.data</groupId>
			<artifactId>spring-data-hadoop</artifactId>
			<version>${spring.hadoop.version}</version>
			<exclusions>
				<exclusion>
					<groupId>org.springframework</groupId>
					<artifactId>spring-context-support</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- MySQL JDBC driver -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.38</version>
		</dependency>
		<!-- Spring Data Jdbc -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-jdbc</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<!--SpringData Jpa -->
		<dependency>
			<groupId>org.springframework.data</groupId>
			<artifactId>spring-data-jpa</artifactId>
			<version>1.8.0.RELEASE</version>
		</dependency>
		<dependency>
			<groupId>org.hibernate</groupId>
			<artifactId>hibernate-entitymanager</artifactId>
			<version>4.3.6.Final</version>
		</dependency>
		<!-- Spring Test -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-test</artifactId>
			<version>${spring.version}</version>
		</dependency>

		<!-- Spring Tx -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-tx</artifactId>
			<version>${spring.version}</version>
		</dependency>

		<!-- Hadoop -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>${hadoop.version}</version>
			<scope>compile</scope>
		</dependency>

		<!-- Hive -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-metastore</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-service</artifactId>
			<version>${hive.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.thrift</groupId>
			<artifactId>libfb303</artifactId>
			<version>0.9.1</version>
		</dependency>

		<!-- runtime Hive deps start -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-common</artifactId>
			<version>${hive.version}</version>
			<scope>runtime</scope>
		</dependency>

		<!-- <dependency> <groupId>org.apache.hive</groupId> <artifactId>hive-builtins</artifactId> 
			<version>${hive.version}</version> <scope>runtime</scope> </dependency> -->
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-jdbc</artifactId>
			<version>${hive.version}</version>
			<scope>runtime</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-shims</artifactId>
			<version>${hive.version}</version>
			<scope>runtime</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-serde</artifactId>
			<version>${hive.version}</version>
			<scope>runtime</scope>
		</dependency>

		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-contrib</artifactId>
			<version>${hive.version}</version>
			<scope>runtime</scope>
		</dependency>

		<!-- runtime Hive deps end -->

		<dependency>
			<groupId>org.codehaus.groovy</groupId>
			<artifactId>groovy</artifactId>
			<version>1.8.5</version>
			<scope>runtime</scope>
		</dependency>

	</dependencies>
</project>

ApplicationContext.xml

<?xml version="1.0" encoding="UTF-8"?>
<beans:beans xmlns="http://www.springframework.org/schema/hadoop"
	xmlns:beans="http://www.springframework.org/schema/beans"
	xmlns:tx="http://www.springframework.org/schema/tx" 
	xmlns:jpa="http://www.springframework.org/schema/data/jpa"
	xmlns:context="http://www.springframework.org/schema/context"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="
		http://www.springframework.org/schema/beans 
		http://www.springframework.org/schema/beans/spring-beans.xsd
	  	http://www.springframework.org/schema/context 
	  	http://www.springframework.org/schema/context/spring-context.xsd
	  	http://www.springframework.org/schema/tx 
		http://www.springframework.org/schema/tx/spring-tx-4.0.xsd
	  	http://www.springframework.org/schema/hadoop 
	  	http://www.springframework.org/schema/hadoop/spring-hadoop.xsd
	  	http://www.springframework.org/schema/data/jpa 
		http://www.springframework.org/schema/data/jpa/spring-jpa-1.3.xsd
	  	"
	  	>
	
	<!-- 1. MySQL data source -->
	<beans:bean id="dataSource"
		class="org.springframework.jdbc.datasource.DriverManagerDataSource">
		<beans:property name="driverClassName" value="com.mysql.jdbc.Driver"></beans:property>
		<beans:property name="url" value="jdbc:mysql://lj01:3306/jyjydb"></beans:property>
		<beans:property name="username" value="root"></beans:property>
		<beans:property name="password" value="root"></beans:property>
	</beans:bean>

	<!-- 2. Spring Data JPA entity manager factory -->
	<beans:bean id="entityManagerFactory"
		class="org.springframework.orm.jpa.LocalContainerEntityManagerFactoryBean">
		<!-- Data source -->
		<beans:property name="dataSource" ref="dataSource"></beans:property>
		<beans:property name="jpaVendorAdapter">
			<beans:bean class="org.springframework.orm.jpa.vendor.HibernateJpaVendorAdapter" />
		</beans:property>
		<!-- Package to scan for entity classes -->
		<beans:property name="packagesToScan" value="com.lj.domain" />
		<!-- JPA properties -->
		<beans:property name="jpaProperties">
			<beans:props>
				<!-- Naming strategy -->
				<beans:prop key="hibernate.ejb.naming_strategy">org.hibernate.cfg.ImprovedNamingStrategy</beans:prop>
				<!-- Database dialect -->
				<beans:prop key="hibernate.dialect">org.hibernate.dialect.MySQL5InnoDBDialect</beans:prop>
				<!-- Log generated SQL -->
				<beans:prop key="hibernate.show_sql">true</beans:prop>
				<!-- Pretty-print the SQL -->
				<beans:prop key="hibernate.format_sql">true</beans:prop>
				<!-- Create/update tables automatically -->
				<beans:prop key="hibernate.hbm2ddl.auto">update</beans:prop>
			</beans:props>
		</beans:property>
	</beans:bean>

	<!-- 3. Spring Data JPA transaction manager -->
	<beans:bean id="transactionManager" class="org.springframework.orm.jpa.JpaTransactionManager">
		<beans:property name="entityManagerFactory" ref="entityManagerFactory" />
	</beans:bean>

	<!-- 4. Enable annotation-driven transactions -->
	<tx:annotation-driven transaction-manager="transactionManager" />

	<!-- 5. Spring Data repositories -->
	<jpa:repositories base-package="com.lj"
		entity-manager-factory-ref="entityManagerFactory" />

	<!-- 6. Spring component scanning -->
	<context:component-scan base-package="com.lj.domain"></context:component-scan>
	<context:component-scan base-package="com.lj.repository"></context:component-scan>

	<!-- 7. HDFS address -->
	<configuration>
		fs.defaultFS=hdfs://lj01:9000
	</configuration>

	<!-- 8. Hive client factory -->
	<hive-client-factory id="hiveClientFactory"
		hive-data-source-ref="hiveDataSource" />

	<!-- 9. Hive JDBC driver -->
	<beans:bean id="hiveDriver" class="org.apache.hive.jdbc.HiveDriver" />

	<!-- 10. Hive data source -->
	<beans:bean id="hiveDataSource"
		class="org.springframework.jdbc.datasource.SimpleDriverDataSource">
		<beans:constructor-arg name="driver" ref="hiveDriver" />
		<beans:constructor-arg name="url"
			value="jdbc:hive2://lj01:10000/jyjydb" />
		<beans:constructor-arg name="username" value="root" />
		<beans:constructor-arg name="password" value="root" />
	</beans:bean>

	<!-- 11. Hive template used to operate on Hive -->
	<hive-template id="hiveTemplate" />

</beans:beans>
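Note that the hiveDataSource above connects over JDBC to HiveServer2 at lj01:10000, so HiveServer2 must be running alongside the metastore service (typically started with ./hive --service hiveserver2) before the job below is launched.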

Main Logic

package com.lj;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.data.hadoop.hive.HiveClient;
import org.springframework.data.hadoop.hive.HiveClientCallback;
import org.springframework.data.hadoop.hive.HiveTemplate;

import com.lj.domain.Vip_Open_Num_Bean;
import com.lj.repository.VONBRepository;

public class jyjyStart {
	public static void main(String[] args) {
		//0. Parameters
		String hql = null;
		Date date = new Date();
		SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM-dd");
		final String dateStr = sdf.format(date);
		//1. Initialize the container
		ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
		HiveTemplate hiveTemplate = (HiveTemplate) context.getBean("hiveTemplate");
		//2. Clean the data into the vipclear table
		hql = "insert into vipclear partition(reportTime = '"+dateStr+"') select student_email,student_name,student_gender,student_phone,student_education_background,student_order_code,class_code,series_class_code,center_code,course_id,study_mode,account_state,create_time,operation_mode,modified_fields,before_class_study_mode,after_class_study_mode,before_course_id,after_course_id from vip where reportTime='"+dateStr+"' and student_email not rlike '^(\\w+@jyjy.cn)$'";
		hiveTemplate.query(hql);
		//3. Process the data and write the results into vip_open_num
		//-- New VIP activations
		hql = "insert into vip_open_num select reporttime,course_id,account_state,count(*),'new_add' from  vipclear where reporttime = '"+dateStr+"' and operation_mode = 'add' and study_mode = 'vip' group by reporttime,course_id,account_state";
		hiveTemplate.query(hql);
		//-- Offline-to-VIP conversions
		hql = "insert into vip_open_num select reporttime,after_course_id,account_state,count(*),'offline2vip' from  vipclear where reporttime = '"+dateStr+"' and operation_mode = 'update' and before_class_study_mode='offline' and after_class_study_mode='vip' group by reporttime,after_course_id,account_state ";
		hiveTemplate.query(hql);
		//-- VIP-to-VIP transfers in
		hql = "insert into vip_open_num select reporttime,after_course_id,account_state,count(*),'vip2vip' from  vipclear where reporttime = '"+dateStr+"' and operation_mode = 'update' and before_class_study_mode='vip' and after_class_study_mode='vip' and before_course_id != after_course_id group by reporttime,after_course_id,account_state ";
		hiveTemplate.query(hql);
		
		//4. Query the vip_open_num results back out of Hive
		List<Vip_Open_Num_Bean> list = hiveTemplate.execute(new HiveClientCallback<List<Vip_Open_Num_Bean>>() {
			public List<Vip_Open_Num_Bean> doInHive(HiveClient hiveClient) throws Exception {
				List<Vip_Open_Num_Bean> list = new ArrayList<Vip_Open_Num_Bean>();
				String hql = "select * from vip_open_num where reportTime=?";
				Connection conn = hiveClient.getConnection();
				PreparedStatement ps = conn.prepareStatement(hql);
				ps.setString(1, dateStr);
				ResultSet rs = ps.executeQuery();
				while(rs.next()){
					Vip_Open_Num_Bean vonb = new Vip_Open_Num_Bean();
					vonb.setTime_id(rs.getString("reporttime"));
					vonb.setCourse_id(rs.getString("course_id"));
					vonb.setAccount_state_id(rs.getString("account_state"));
					vonb.setOpen_type_id(rs.getString("open_type"));
					vonb.setNum(rs.getInt("num"));
					list.add(vonb);
				}
				return list;
			}
		});
		System.out.println(list.size());
		
		//5. Write them into the MySQL vip_open_num table
		VONBRepository vonbRepository = context.getBean(VONBRepository.class);
		vonbRepository.save(list);
	}
}

 

The Entity Class

package com.lj.domain;

import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;

@Entity(name="vip_open_num")
public class Vip_Open_Num_Bean {
	private int id;
	private String time_id;
	private String course_id;
	private String account_state_id;
	private String open_type_id;
	private int num;
	
	public Vip_Open_Num_Bean() {
	}
	
	public Vip_Open_Num_Bean(int id,String time_id, String course_id, String account_state_id, String open_type_id, int num) {
		this.id = id;
		this.time_id = time_id;
		this.course_id = course_id;
		this.account_state_id = account_state_id;
		this.open_type_id = open_type_id;
		this.num = num;
	}
	
	@Id
	@GeneratedValue
	public int getId() {
		return id;
	}

	public void setId(int id) {
		this.id = id;
	}
	
	public String getTime_id() {
		return time_id;
	}

	public void setTime_id(String time_id) {
		this.time_id = time_id;
	}
	public String getCourse_id() {
		return course_id;
	}
	public void setCourse_id(String course_id) {
		this.course_id = course_id;
	}
	public String getAccount_state_id() {
		return account_state_id;
	}
	public void setAccount_state_id(String account_state_id) {
		this.account_state_id = account_state_id;
	}
	public String getOpen_type_id() {
		return open_type_id;
	}
	public void setOpen_type_id(String open_type_id) {
		this.open_type_id = open_type_id;
	}
	public int getNum() {
		return num;
	}
	public void setNum(int num) {
		this.num = num;
	}

	@Override
	public String toString() {
		return "Vip_Open_Num_Bean [id=" + id + ", time_id=" + time_id + ", course_id=" + course_id
				+ ", account_state_id=" + account_state_id + ", open_type_id=" + open_type_id + ", num=" + num + "]";
	}
}

The Core Interface: CrudRepository

package com.lj.repository;

import org.springframework.data.repository.CrudRepository;

import com.lj.domain.Vip_Open_Num_Bean;

public interface VONBRepository extends CrudRepository<Vip_Open_Num_Bean, Integer> {

}
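CrudRepository already supplies save, findAll, findOne, count, and delete, which is all the main logic needs: vonbRepository.save(list) persists the whole result list in one call. If you later want to read a day's rows back out of MySQL, Spring Data can do it declaratively. A minimal sketch under that assumption (the findByDay method and its @Query are my illustration, not part of the original project):

package com.lj.repository;

import java.util.List;

import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.CrudRepository;

import com.lj.domain.Vip_Open_Num_Bean;

public interface VONBRepository extends CrudRepository<Vip_Open_Num_Bean, Integer> {

	// Hypothetical helper: a JPQL query against the entity named "vip_open_num"
	// (set by @Entity(name="vip_open_num") on the bean) returning one day's rows.
	@Query("select v from vip_open_num v where v.time_id = ?1")
	List<Vip_Open_Num_Bean> findByDay(String timeId);
}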

 
