1. Interacting with MySQL
Data warehouse
OLAP //OnLine Analytical Processing; high latency.
Database
OLTP //OnLine Transaction Processing; good real-time performance.
1.1 Connecting to the database
1. Create the MySQL database and table
create table users(id int primary key auto_increment , name varchar(20) , age int);
2. Create a jdbcDemo module in IDEA
package cn.ctgu.jdbcdemo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class App {
    public static void main(String[] args) {
        try {
            //create the connection
            String driverClass = "com.mysql.jdbc.Driver";
            String url = "jdbc:mysql://localhost:3306/bigdata";
            String username = "root";
            String password = "123456";
            Class.forName(driverClass);
            Connection conn = DriverManager.getConnection(url, username, password);
            //create the statement
            Statement st = conn.createStatement();
            String sql = "insert into users(name,age) values('tom',12)";
            st.execute(sql);
            st.close();
            conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
1.2 MySQL stored procedures
A stored procedure batches identical operations on the server side, which is far more efficient than issuing them one by one.
mysql>-- define a new statement terminator; *****no spaces in the delimiter*****
mysql>delimiter //
mysql>-- create a stored procedure
mysql>CREATE PROCEDURE simpleproc (OUT param1 INT)
BEGIN
    SELECT COUNT(*) INTO param1 FROM users; -- INTO is one way to assign a value
END
//
mysql>-- show the status of stored procedures
mysql>show procedure status //
mysql>-- show the CREATE statement of a given stored procedure
mysql>show create procedure simpleproc //
mysql>-- call the stored procedure; @a defines a session variable on the command line
mysql>call simpleproc(@a) //
mysql>select @a //  -- read back the OUT parameter
mysql>-- define an addition procedure; SET assigns with :=
mysql>create procedure sp_add(in a int,in b int, out c int)
begin
    set c := a + b ;
end
//
Calling stored procedures from Java
package cn.ctgu.jdbcdemo.test;

import org.junit.Test;

import java.sql.*;

/**
 * Tests of the basic JDBC operations.
 */
public class TestCRUD {
    @Test
    public void testStatement() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //turn off auto-commit
        conn.setAutoCommit(false);
        //create the statement
        Statement st = conn.createStatement();
        for (int i = 0; i < 10000; i++) {
            String sql = "insert into users(name,age) values('tomas" + i + "'," + (i % 100) + ")";
            st.execute(sql);
        }
        conn.commit();
        st.close();
        conn.close();
    }

    @Test
    public void testPreStatement() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //turn off auto-commit
        conn.setAutoCommit(false);
        //create the prepared statement
        String sql = "insert into users(name,age) values(?,?)";
        PreparedStatement ppst = conn.prepareStatement(sql);
        for (int i = 0; i < 10000; i++) {
            ppst.setString(1, "tom" + i);
            ppst.setInt(2, i % 100);
            //add the bound row to the current batch
            ppst.addBatch();
            if (i % 2000 == 0) {
                //flush the batch to the server in a single round trip
                ppst.executeBatch();
            }
        }
        //flush the remaining rows
        ppst.executeBatch();
        conn.commit();
        ppst.close();
        conn.close();
    }

    @Test
    public void testCallableStatement() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //turn off auto-commit
        conn.setAutoCommit(false);
        //create a callable statement that invokes the stored procedure
        CallableStatement cst = conn.prepareCall("{call sp_add(?,?,?)}");
        cst.setInt(1, 2); //bind the IN parameters
        cst.setInt(2, 3);
        //register the OUT parameter type
        cst.registerOutParameter(3, Types.INTEGER);
        cst.execute();
        int sum = cst.getInt(3);
        System.out.println(sum);
        conn.commit();
        conn.close();
    }

    @Test
    public void testCallableStatement1() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //turn off auto-commit
        conn.setAutoCommit(false);
        //create a callable statement that invokes the batch-insert procedure
        CallableStatement cst = conn.prepareCall("{call sp_batchinsert(?)}");
        cst.setInt(1, 1000000); //bind the IN parameter (no OUT parameter to register here)
        cst.execute();
        conn.commit();
        conn.close();
    }

    //call a MySQL function through a CallableStatement
    @Test
    public void testFunction() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //turn off auto-commit
        conn.setAutoCommit(false);
        //create a callable statement that invokes the function; "? =" captures the return value
        CallableStatement cst = conn.prepareCall("{? = call sf_add(?,?)}");
        cst.setInt(2, 100);
        cst.setInt(3, 200);
        //register the return value type
        cst.registerOutParameter(1, Types.INTEGER);
        cst.execute();
        System.out.println(cst.getInt(1));
        conn.commit();
        conn.close();
    }
}
Inserting a million rows: stored-procedure performance
------------------------------
1. Create the stored procedure
mysql>create procedure sp_batchinsert(in n int)
begin
    DECLARE name0 varchar(20); -- DECLARE must appear at the top of the begin block
    DECLARE age0 int;
    DECLARE i int default 0 ;
    while i < n do
        set name0 := concat('tom',i) ;
        set age0 := i % 100 ;
        insert into users(name,age) values(name0,age0);
        set i := i + 1 ;
    end while ;
end
//
2. Java code
@Test
public void testCallableStatement() throws Exception {
    long start = System.currentTimeMillis();
    //create the connection
    String driverClass = "com.mysql.jdbc.Driver";
    String url = "jdbc:mysql://localhost:3306/big4";
    String username = "root";
    String password = "root";
    Class.forName(driverClass);
    Connection conn = DriverManager.getConnection(url, username, password);
    //turn off auto-commit
    conn.setAutoCommit(false);
    //create a callable statement that invokes the batch-insert procedure
    CallableStatement cst = conn.prepareCall("{call sp_batchinsert(?)}");
    cst.setInt(1, 1000000); //bind the IN parameter
    cst.execute();
    conn.commit();
    conn.close();
    System.out.println(System.currentTimeMillis() - start);
}
MySQL functions
------------------
1. A function is similar to a stored procedure, except that it declares a return value.
2. Create a function
mysql>create function sf_add(a int ,b int) returns int
begin
    return a + b ;
end
//
3. Show the created functions
mysql>show function status //
mysql>show function status like '%add%' //
mysql>select sf_add(1,2) //
mysql>delimiter ;   -- restore the default terminator
4. Call the function from Java
@Test
public void testFunction() throws Exception {
    long start = System.currentTimeMillis();
    //create the connection
    String driverClass = "com.mysql.jdbc.Driver";
    String url = "jdbc:mysql://localhost:3306/big4";
    String username = "root";
    String password = "root";
    Class.forName(driverClass);
    Connection conn = DriverManager.getConnection(url, username, password);
    //turn off auto-commit
    conn.setAutoCommit(false);
    //create a callable statement that invokes the function
    CallableStatement cst = conn.prepareCall("{? = call sf_add(?,?)}");
    cst.setInt(2, 100);
    cst.setInt(3, 200);
    //register the return value type
    cst.registerOutParameter(1, Types.INTEGER);
    cst.execute();
    System.out.println(cst.getInt(1));
    conn.commit();
    conn.close();
    System.out.println(System.currentTimeMillis() - start);
}
1.3 Transactions
A transaction is a group of operations against the database, with the following properties:
a //atomic: indivisible; either all operations take effect or none do.
c //consistent: the database moves from one valid state to another; invariants are not broken.
i //isolated: concurrent transactions do not interfere with each other.
d //durable: once committed, changes are permanent.
truncate: truncates a table; similar to delete but fast, and the data cannot be rolled back.
truncate table users ;
commit //commit
rollback //roll back
savepoint //a named point inside a transaction to roll back to (see the JDBC sketch below)
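Savepoints are easiest to see from JDBC. A minimal sketch, reusing the bigdata database and users table from above (Connection.setSavepoint/rollback(Savepoint) are standard JDBC; the row values are illustrative):

package cn.ctgu.jdbcdemo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Savepoint;
import java.sql.Statement;

public class SavepointDemo {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/bigdata", "root", "123456");
        conn.setAutoCommit(false);
        Statement st = conn.createStatement();
        st.execute("insert into users(name,age) values('kept',1)");
        //mark a savepoint after the first insert
        Savepoint sp = conn.setSavepoint("afterFirst");
        st.execute("insert into users(name,age) values('discarded',2)");
        //roll back to the savepoint only: the second insert is undone, the first survives
        conn.rollback(sp);
        //commit what is left, i.e. the first insert
        conn.commit();
        st.close();
        conn.close();
    }
}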
Phenomena that can appear when transactions execute concurrently
1. Dirty read
Reading uncommitted data: one transaction reads data that another transaction has modified but not yet committed; the other transaction may later roll back.
2. Non-repeatable read
One transaction runs the same query two or more times and gets different results each time, because another transaction performed an update in between.
3. Phantom read
Like (2), except that the other transaction performed an insert.
Isolation levels
Each level exists to rule out some of the phenomena above (the numbers are the java.sql.Connection constant values; see the check below):
1 //read uncommitted
2 //read committed
4 //repeatable read
8 //serializable (pessimistic locking)
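Those numbers are exactly the java.sql.Connection constants, which a quick check confirms:

import java.sql.Connection;

public class IsolationConstants {
    public static void main(String[] args) {
        System.out.println(Connection.TRANSACTION_READ_UNCOMMITTED); // 1
        System.out.println(Connection.TRANSACTION_READ_COMMITTED);   // 2
        System.out.println(Connection.TRANSACTION_REPEATABLE_READ);  // 4
        System.out.println(Connection.TRANSACTION_SERIALIZABLE);     // 8
    }
}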
Walking through the MySQL isolation levels
1. Open a mysql client
mysql>
2. Turn off auto-commit
mysql>set autocommit = 0 ;
3. From now on, every operation must explicitly start and commit its own transaction.
Dirty read
------------
[A]
1)mysql>start transaction ; -- start a transaction
2)mysql>update users set age = age + 1 where id = 1 ; -- update without committing
6)mysql>rollback ; -- roll back
7)mysql>select * from users ;
[B]
3)mysql>set session transaction isolation level read uncommitted ; -- read uncommitted
4)mysql>start transaction ; -- start a transaction
5)mysql>select * from users ; -- sees the uncommitted value (13)
Avoiding the dirty read
----------------
[A]
1)mysql>start transaction ; -- start a transaction
2)mysql>update users set age = age + 1 where id = 1 ; -- update without committing
6)mysql>rollback ; -- roll back
7)mysql>select * from users ;
[B]
3)mysql>set session transaction isolation level read committed ; -- read committed
4)mysql>start transaction ; -- start a transaction
5)mysql>select * from users ; -- no longer sees the uncommitted value
Demonstrating the non-repeatable read (read committed does not prevent it)
------------------
[A]
1)mysql>commit ;
2)mysql>set session transaction isolation level read committed ; -- read committed
3)mysql>start transaction ; -- start a transaction
4)mysql>select * from users ; -- first read
9)mysql>select * from users ; -- second read differs
[B]
5)mysql>commit;
6)mysql>start transaction ;
7)mysql>update users set age = 15 where id = 1 ; -- update
8)mysql>commit;
Avoiding the non-repeatable read (raise the level to repeatable read)
------------------
[A]
1)mysql>commit ;
2)mysql>set session transaction isolation level repeatable read ; -- repeatable read
3)mysql>start transaction ; -- start a transaction
4)mysql>select * from users ; -- first read
9)mysql>select * from users ; -- second read is identical
[B]
5)mysql>commit;
6)mysql>start transaction ;
7)mysql>update users set age = 15 where id = 1 ; -- update
8)mysql>commit;
Avoiding the phantom read (raise the level to serializable)
------------------
[A]
1)mysql>commit ;
2)mysql>set session transaction isolation level serializable; -- serializable
3)mysql>start transaction ; -- start a transaction
4)mysql>select * from users ; -- query
9)mysql>select * from users ;
[B]
5)mysql>commit;
6)mysql>start transaction ;
7)mysql>insert into users(name,age) values('tomas',13); -- insert
8)mysql>commit;
Note: MySQL supports all four isolation levels; the default is repeatable read. Under serializable, concurrent writes are not allowed.
Reproducing the dirty read in code
package cn.ctgu.jdbcdemo.test;

import org.junit.Test;

import java.sql.*;

public class TestIsolationLevel {
    /**
     * Performs the write without committing: pause before the commit (e.g. on a
     * breakpoint at the println) so that testB can read the dirty value.
     */
    @Test
    public void testA() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //switch the transaction to manual commit
        conn.setAutoCommit(false);
        Statement st = conn.createStatement();
        st.execute("update users set age=80 where id=1");
        System.out.println("==================");
        conn.commit();
        conn.close();
    }

    /**
     * Queries and sees data that the other session has not committed.
     */
    @Test
    public void testB() throws Exception {
        //create the connection
        String driverClass = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/bigdata";
        String username = "root";
        String password = "123456";
        Class.forName(driverClass);
        Connection conn = DriverManager.getConnection(url, username, password);
        //isolation level read uncommitted ==> allows the dirty read
        conn.setTransactionIsolation(Connection.TRANSACTION_READ_UNCOMMITTED);
        conn.setAutoCommit(false);
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery("select age from users where id=1");
        rs.next();
        int age = rs.getInt(1);
        System.out.println(age);
        System.out.println("=======================");
        conn.commit();
        conn.close();
    }
}
1.4 Join queries
1. Prepare the tables [mysql.sql]
drop table if exists customers; -- drop the table
drop table if exists orders ; -- drop the table
create table customers(id int primary key auto_increment , name varchar(20) , age int); -- create the customers table
create table orders(id int primary key auto_increment , orderno varchar(20) , price float , cid int); -- create the orders table
-- insert customer data
insert into customers(name,age) values('tom',12);
insert into customers(name,age) values('tomas',13);
insert into customers(name,age) values('tomasLee',14);
insert into customers(name,age) values('tomason',15);
-- insert order data
insert into orders(orderno,price,cid) values('No001',12.25,1);
insert into orders(orderno,price,cid) values('No002',12.30,1);
insert into orders(orderno,price,cid) values('No003',12.25,2);
insert into orders(orderno,price,cid) values('No004',12.25,2);
insert into orders(orderno,price,cid) values('No005',12.25,2);
insert into orders(orderno,price,cid) values('No006',12.25,3);
insert into orders(orderno,price,cid) values('No007',12.25,3);
insert into orders(orderno,price,cid) values('No008',12.25,3);
insert into orders(orderno,price,cid) values('No009',12.25,3);
insert into orders(orderno,price,cid) values('No0010',12.25,NULL);
2. Queries: joins
mysql>-- Cartesian product: a query with no join condition
mysql>select a.*,b.* from customers a , orders b ;
mysql>-- inner join: only matching records
mysql>select a.*,b.* from customers a , orders b where a.id = b.cid ;
mysql>-- left outer join
mysql>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
mysql>-- right outer join
mysql>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
mysql>-- full outer join (MySQL does not support it, so this statement fails there; it can be emulated with a UNION of the left and right outer joins)
mysql>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;
3. Queries: grouping
-- column list, table, row filter, grouping, group filter, ordering, paging:
mysql>select ... from ... where ... group by ... having ... order by ... limit ..
mysql>-- distinct query
mysql>select distinct price,cid from orders ;
mysql>-- conditional query
mysql>select price,cid from orders where price > 12.27 ;
mysql>-- aggregate queries
mysql>select max(price) from orders ;
mysql>select min(price) from orders ;
mysql>select avg(price) from orders ;
mysql>select sum(price) from orders ;
mysql>select count(id) from orders ;
mysql>-- grouped query
mysql>select max(price) from orders where cid is not null group by cid ;
mysql>-- grouped query with a group-level filter
mysql>select cid ,orderno,max(price) as max_price,min(price) from orders where cid is not null group by cid having max_price > 20 ;
mysql>-- descending order
mysql>select cid ,orderno,max(price) as max_price,min(price) from orders where cid is not null group by cid having max_price > 20 order by max_price desc;
mysql>-- fuzzy (LIKE) queries
mysql>select * from customers where name like 'toma%'
mysql>select * from customers where name not like 'toma%'
mysql>-- range queries
mysql>select * from customers where id in (1,2,3)
mysql>select * from customers where id not in (1,2,3)
mysql>-- between 1 and 10 is a closed interval
mysql>select * from customers where id between 1 and 3 ;
mysql>select * from customers where id >= 1 and id <= 3 ;
mysql>-- nested subquery (customers with no orders)
mysql>select * from customers where id not in (select distinct cid from orders where cid is not null);
mysql>-- nested subquery (customers with more than 2 orders)
mysql>select * from customers where id in (select cid from orders group by cid having count(cid) > 2);
mysql>select * from customers where id in ( select t.cid from (select cid,count(*) as c from orders group by cid having c > 2) as t);
mysql>-- nested subquery (customer id and name, order count, most/least expensive order, average order price, for customers with more than 2 orders)
mysql>select a.id,a.name,b.c,b.max,b.min,b.avg
from customers a,((select cid,count(cid) c , max(price) max ,min(price) min,avg(price) avg from orders group by cid having c > 2) as b)
where a.id = b.cid ;
Implementing the join query in Hadoop
Data:
[customers.txt]
1,tom,12
2,tom,13
3,tom,14
4,tom,15
[orders.txt]
1,no001,12.23,1
2,no001,12.23,1
3,no001,12.23,2
4,no001,12.23,2
5,no001,12.23,2
6,no001,12.23,3
7,no001,12.23,3
8,no001,12.23,3
9,no001,12.23,3
Map-side join
MapJoinMapper.java
package cn.ctgu.mr.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Join performed on the map side: the small customers table is loaded into
 * memory in setup(), and each order record is joined against it in map().
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Map<String, String> allCustomers = new HashMap<String, String>();

    //on startup, load the customer data into memory
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        try {
            Configuration conf = context.getConfiguration();
            FileSystem fs = FileSystem.get(conf);
            FSDataInputStream fis = fs.open(new Path("file:///J:\\Program\\file\\mr\\customers.txt"));
            //wrap the stream in a buffered reader
            BufferedReader br = new BufferedReader(new InputStreamReader(fis));
            String line = null;
            while ((line = br.readLine()) != null) {
                //extract the cid (first field)
                String cid = line.substring(0, line.indexOf(","));
                allCustomers.put(cid, line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //one order record per line
        String line = value.toString();
        //extract the customer id (last field)
        String cid = line.substring(line.lastIndexOf(",") + 1);
        //order info without the cid
        String orderInfo = line.substring(0, line.lastIndexOf(","));
        //join: customer + "," + order
        String customerInfo = allCustomers.get(cid);
        context.write(new Text(customerInfo + "," + orderInfo), NullWritable.get());
    }
}
MapJoinApp.java
package cn.ctgu.mr.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the map-side join job.
 */
public class MapJoinApp {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);
        //set the job properties
        job.setJobName("MapJoinApp");           //job name
        job.setJarByClass(MapJoinApp.class);    //class used to locate the jar
        //add the input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        //set the output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //map-only job: no reducers
        job.setNumReduceTasks(0);
        job.setMapperClass(MapJoinMapper.class); //mapper class
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.waitForCompletion(true);
    }
}
Implementing a left outer join with MR (reduce-side join)
1. Custom key
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class ComboKey implements WritableComparable<ComboKey> {
    //0 = customer record, 1 = order record
    private int type;
    private int cid;
    private int oid;
    private String customerInfo = "";
    private String orderInfo = "";

    public int getType() {
        return type;
    }

    public void setType(int type) {
        this.type = type;
    }

    public int getCid() {
        return cid;
    }

    public void setCid(int cid) {
        this.cid = cid;
    }

    public int getOid() {
        return oid;
    }

    public void setOid(int oid) {
        this.oid = oid;
    }

    public String getCustomerInfo() {
        return customerInfo;
    }

    public void setCustomerInfo(String customerInfo) {
        this.customerInfo = customerInfo;
    }

    public String getOrderInfo() {
        return orderInfo;
    }

    public void setOrderInfo(String orderInfo) {
        this.orderInfo = orderInfo;
    }

    public int compareTo(ComboKey o) {
        int type0 = o.type;
        int cid0 = o.cid;
        int oid0 = o.oid;
        //same customer?
        if (cid == cid0) {
            //same type: two orders of the same customer, sort by order id
            if (type == type0) {
                return oid - oid0;
            }
            //a customer record plus one of its orders: the customer sorts first
            else {
                if (type == 0) {
                    return -1;
                } else {
                    return 1;
                }
            }
        }
        //different customers: sort by cid
        else {
            return cid - cid0;
        }
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(type);
        dataOutput.writeInt(cid);
        dataOutput.writeInt(oid);
        dataOutput.writeUTF(customerInfo);
        dataOutput.writeUTF(orderInfo);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.type = dataInput.readInt();
        this.cid = dataInput.readInt();
        this.oid = dataInput.readInt();
        this.customerInfo = dataInput.readUTF();
        this.orderInfo = dataInput.readUTF();
    }
}
2. Custom partitioner
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner: partition by CID so that all records of one customer
 * end up on the same reducer.
 */
public class CIDPartitioner extends Partitioner<ComboKey, NullWritable> {
    public int getPartition(ComboKey comboKey, NullWritable nullWritable, int numPartitions) {
        return comboKey.getCid() % numPartitions;
    }
}
3. Create the Mapper
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, ComboKey, NullWritable> {
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        //decide whether this split comes from customers or orders
        FileSplit split = (FileSplit) context.getInputSplit();
        String path = split.getPath().toString();
        ComboKey key2 = new ComboKey();
        //customer record
        if (path.contains("customers")) {
            String cid = line.substring(0, line.indexOf(","));
            String custInfo = line;
            key2.setType(0);
            key2.setCid(Integer.parseInt(cid));
            key2.setCustomerInfo(custInfo);
        }
        //order record
        else {
            String cid = line.substring(line.lastIndexOf(",") + 1);
            String oid = line.substring(0, line.indexOf(","));
            String oinfo = line.substring(0, line.lastIndexOf(","));
            key2.setType(1);
            key2.setCid(Integer.parseInt(cid));
            key2.setOid(Integer.parseInt(oid));
            key2.setOrderInfo(oinfo);
        }
        context.write(key2, NullWritable.get());
    }
}
4. Create the Reducer
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * ReduceJoinReducer: the join happens on the reduce side. Within each CID
 * group the customer record sorts first; Hadoop reuses the key object, so
 * after every it.next() the key holds the fields of the current record.
 */
public class ReduceJoinReducer extends Reducer<ComboKey, NullWritable, Text, NullWritable> {
    protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<NullWritable> it = values.iterator();
        //consume the first record of the group: the customer
        it.next();
        String cinfo = key.getCustomerInfo();
        boolean hasOrders = false;
        while (it.hasNext()) {
            it.next();
            hasOrders = true;
            String oinfo = key.getOrderInfo();
            context.write(new Text(cinfo + "," + oinfo), NullWritable.get());
        }
        //left outer join: also emit customers that have no orders
        if (!hasOrders) {
            context.write(new Text(cinfo), NullWritable.get());
        }
    }
}
5. Create the sort comparator
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Sort comparator for the combo key: delegates to ComboKey.compareTo().
 */
public class ComboKeyComparator extends WritableComparator {
    protected ComboKeyComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        return k1.compareTo(k2);
    }
}
6. Grouping comparator
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * CID grouping comparator: keys with the same cid form one reduce group.
 */
public class CIDGroupComparator extends WritableComparator {
    protected CIDGroupComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        return k1.getCid() - k2.getCid();
    }
}
7. App
package cn.ctgu.mr.join.reduceJoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ReduceJoinApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);
        //set the job properties
        job.setJobName("ReduceJoinApp");                //job name
        job.setJarByClass(ReduceJoinApp.class);         //class used to locate the jar
        //add the input path
        FileInputFormat.addInputPath(job, new Path("D:\\mr\\reducejoin"));
        //set the output path
        FileOutputFormat.setOutputPath(job, new Path("D:\\mr\\reducejoin\\out"));
        job.setMapperClass(ReduceJoinMapper.class);     //mapper class
        job.setReducerClass(ReduceJoinReducer.class);   //reducer class
        //map output types
        job.setMapOutputKeyClass(ComboKey.class);
        job.setMapOutputValueClass(NullWritable.class);
        //reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        //partitioner
        job.setPartitionerClass(CIDPartitioner.class);
        //grouping comparator
        job.setGroupingComparatorClass(CIDGroupComparator.class);
        //sort comparator
        job.setSortComparatorClass(ComboKeyComparator.class);
        job.setNumReduceTasks(2);                       //number of reducers
        job.waitForCompletion(true);
    }
}
2. Hive
2.1 Introduction and characteristics of Hive
Hive is Hadoop's data warehouse for processing structured data. It is not a relational database and not an OLTP system, and it does not support real-time queries or row-level updates.
Hive stores its schema in a database (the metastore), while the data it processes lives in HDFS. It supports OLAP and uses HQL/HiveQL, a query language that covers a large subset of MySQL syntax.
2.2 Installing Hive and basic command-line operations
Installing Hive
1. Download hive-2.1.0.tar.gz
2. Untar it
$>tar -xzvf hive-2.1.0.tar.gz -C /soft //untar
$>cd /soft/hive-2.1.0
$>ln -s hive-2.1.0 hive //symbolic link
3. Configure the environment variables
[/etc/profile]
HIVE_HOME=/soft/hive
PATH=...:$HIVE_HOME/bin
4. Verify the installation
$>hive --version
5. Configure hive to store its metadata in the MySQL instance on the Windows host.
a) Copy the MySQL driver jar into hive's lib directory.
...
b) Configure hive-site.xml
Copy hive-default.xml.template to hive-site.xml.
Change the connection properties to the MySQL address, and replace the ${system:... placeholders with concrete paths.
[hive/conf/hive-site.xml]
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>Username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.231.1:3306/hive2</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
6) In MySQL, create the database that will hold hive's metadata
mysql>create database hive2 ;
7) Initialize hive's metadata (table structure) into MySQL.
$>cd /soft/hive/bin
$>schematool -dbType mysql -initSchema
Hive command-line operations
$>hive --version
$>hive --help
//create and use a hive database
$hive>create database mydb2 ;
$hive>show databases ;
$hive>use mydb2 ;
$hive>create table mydb2.t(id int,name string,age int);
$hive>drop table t ;
$hive>drop table mydb2.t ;
$hive>select * from mydb2.t ; //query a table of a specific database
$hive>exit ; //quit
$>hive //starts the hive client (no concurrent access); same as hive --service cli
2.3 Connecting to the hive data warehouse over remote JDBC
1. Start the hiveserver2 server, listening on port 10000
$>hive --service hiveserver2 &
2. Connect to hiveserver2 with the beeline command line
$>beeline //enter the beeline command line (same as hive --service beeline)
$beeline>!help //show help
$beeline>!quit //quit
$beeline>!connect jdbc:hive2://localhost:10000/mydb2 //connect to the hive database
$beeline>show databases ;
$beeline>use mydb2 ;
$beeline>show tables ; //list tables
Accessing the remote data warehouse over JDBC with the Hive JDBC driver
1. Create a java module
2. Make it a maven project
3. Add the hive-jdbc dependency
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.ctgu</groupId>
<artifactId>HiveDemo</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.0</version>
</dependency>
</dependencies>
</project>
package cn.ctgu.hiveDemo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

/**
 * Connects to the hive data warehouse over JDBC; the warehouse must have the
 * hiveserver2 service running.
 */
public class App {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        Connection conn = DriverManager.getConnection("jdbc:hive2://172.25.11.200:10000/mydb2");
        //System.out.println(conn);
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery("select id,name,age from t");
        while (rs.next()) {
            System.out.println(rs.getInt(1) + "," + rs.getString(2) + "," + rs.getInt(3));
        }
        rs.close();
        st.close();
        conn.close();
    }
}
2.4 Table operations in hive
1. managed table
A managed table: when the table is dropped, its data is deleted as well.
2. external table
An external table: when the table is dropped, the data stays in place (see the drop sketch after the desc commands below).
//create a table; external makes it an external table
$hive>CREATE external TABLE IF NOT EXISTS t2(id int,name string,age int)
COMMENT 'xx' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ;
//inspect the table
$hive>desc t2 ;
$hive>desc formatted t2 ;
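The managed/external difference is easiest to see by dropping. A hedged sketch (t2 is the external table just created; t1 stands for any managed table and is hypothetical):
$hive>drop table t2 ; //external: the metadata is gone, the data files stay in hdfs
$hive>dfs -ls /user/centos/ ; //any loaded files are still there
$hive>drop table t1 ; //managed: metadata and warehouse data are both deleted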
//load data into a hive table
$hive>load data local inpath '/home/centos/customers.txt' into table t2 ; //local: uploads the file
$hive>load data inpath '/user/centos/customers.txt' [overwrite] into table t2 ; //moves the file within hdfs
//copy a table
mysql>create table tt as select * from users ; //copies data and table structure
mysql>create table tt like users ; //structure only, no data
hive>create table tt as select * from users ;
hive>create table tt like users ;
//count() queries are converted to MR jobs
$hive>select count(*) from t2 ;
$hive>select id,name from t2 ;
$hive>select * from t2 order by id desc ; //MR
//enable/disable dropping of a table
$hive>ALTER TABLE t2 ENABLE NO_DROP; //dropping not allowed
$hive>ALTER TABLE t2 DISABLE NO_DROP; //dropping allowed
//partitioned tables: an optimization that narrows the search scope at the directory level.
//create a partitioned table
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//show the table's partitions
$hive>SHOW PARTITIONS t3;
//add a partition (creates a directory)
$hive>alter table t3 add partition (year=2014, month=12);
//drop a partition
$hive>ALTER TABLE employee_partitioned DROP IF EXISTS PARTITION (year=2014, month=11);
//partition layout (creating a partition creates a directory)
/user/hive/warehouse/mydb2.db/t3/year=2014/month=11
/user/hive/warehouse/mydb2.db/t3/year=2014/month=12
//load data into a partitioned table
$hive>load data local inpath '/home/centos/customers.txt' into table t3 partition(year=2014,month=11);
//create a bucketed table (each bucket corresponds to a file)
$hive>CREATE TABLE t4(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//load data does NOT perform the bucketing
$hive>load data local inpath '/home/centos/customers.txt' into table t4 ;
//insert the rows of t3 into t4 (this does bucket the data)
$hive>insert into t4 select id,name,age from t3 ;
How many buckets should a table have?
Estimate the data volume and aim for each bucket to hold about twice the HDFS block size (see the worked example below).
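A worked example of that heuristic, assuming the default 128 MB HDFS block size (all numbers illustrative):
target bucket size ≈ 2 x 128 MB = 256 MB
data volume = 3 GB = 3072 MB
bucket count ≈ 3072 MB / 256 MB = 12
So a table expected to hold about 3 GB would be created with INTO 12 BUCKETS.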
2.5 Join queries
Create the tables
$hive>CREATE TABLE customers(id int,name string,age int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
$hive>CREATE TABLE orders(id int,orderno string,price float,cid int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//load the data into the tables (as in 2.4)
//inner join
$hive>select a.*,b.* from customers a , orders b where a.id = b.cid ;
//left outer join, right outer join, full outer join
$hive>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
$hive>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
$hive>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;
//explode: a table-generating function that turns a collection into rows
//word count implemented in hive
//1. create the table
$hive>CREATE TABLE doc(line string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
$hive>select split(line,' ') from doc //split each line on spaces into an array
$hive>select explode(split(line,' ')) from doc //explode the array into rows (explode only works on collections)
$hive>select explode(split(line, ' ')) as word from doc //alias the exploded column as word
$hive>(select explode(split(line, ' ')) as word from doc) as t //wrap it as a subquery named t
$hive>select t.word from (select explode(split(line, ' ')) as word from doc) as t group by t.word //group by word
$hive>select t.word,count(*) c from (select explode(split(line, ' ')) as word from doc) as t group by t.word order by c desc limit 2 //top-2 words by count, descending
The last statement alone is equivalent to all of the steps above.
To keep the result, create a new table stats(word string, c int)
and insert the query result into it, as sketched below.
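A minimal sketch of those two steps (the stats table comes from the note above; CREATE TABLE and INSERT OVERWRITE ... SELECT are standard HiveQL):
$hive>create table stats(word string, c int) ;
$hive>insert overwrite table stats
select t.word, count(*) c from (select explode(split(line, ' ')) as word from doc) as t
group by t.word ;
$hive>select * from stats ;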
2.6 Exporting data and sorting in hive
$hive>EXPORT TABLE customers TO '/user/centos/tmp.txt'; //exports table structure + data
//order by: total ordering (all data is funneled to a single reducer for sorting, which can easily exceed that node's disk and memory and fail the job)
$hive>select * from orders order by id asc ;
//sort by: map-side sort, locally ordered (with several reducers, each reducer's output is sorted on its own)
$hive>select * from orders sort by id asc ;
//DISTRIBUTE BY is similar to mysql's group by: it controls the partitioning.
//select cid , ... from orders distribute by cid sort by name ; //note the clause order
$hive>select id,orderno,cid from orders distribute by cid sort by cid desc ;
//cluster by cid ===> distribute by cid sort by cid
$hive>select id,orderno,cid from orders cluster by cid;
2.7 Job parameters
$hive>set hive.exec.reducers.bytes.per.reducer=xxx //bytes processed per reduce task
$hive>set hive.exec.reducers.max=0 //maximum number of reduce tasks
$hive>set mapreduce.job.reduces=0 //number of reduce tasks
Dynamic partitions
Dynamic partition modes: strict = at least one static partition must be specified on insert; nonstrict = no static partition is required.
set hive.exec.dynamic.partition.mode=nonstrict //switch to nonstrict mode
$hive>INSERT OVERWRITE TABLE employees PARTITION (country, state) SELECT ..., se.cnty, se.st FROM staged_employees se WHERE se.cnty = 'US';
Hive supports row-level transactions as of version 0.13.0.
Requirements:
1. All transactions are auto-committed.
2. Only the orc format is supported.
3. The table must be bucketed.
4. The hive parameters must be configured to enable transactions.
$hive>SET hive.support.concurrency = true; // concurrency support
$hive>SET hive.enforce.bucketing = true; // enforce bucketing
$hive>SET hive.exec.dynamic.partition.mode = nonstrict; // dynamic partition mode
$hive>SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; // transaction manager
$hive>SET hive.compactor.initiator.on = true;
$hive>SET hive.compactor.worker.threads = 1;
5. Use transactional operations
$hive>CREATE TABLE tx(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as orc TBLPROPERTIES ('transactional'='true');
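With the table in place, row-level DML can be issued like in a regular database; a short sketch (insert ... values, update, and delete on transactional tables need a sufficiently recent Hive, 0.14+; the row values are illustrative):
$hive>insert into tx values(1,'tom',12);
$hive>update tx set age = 13 where id = 1 ;
$hive>delete from tx where id = 1 ;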
Aggregation
$hive>select cid,count(*) c ,max(price) from orders group by cid having c > 1 ;
2.8 Views
view: a view is a virtual, logical table; it makes complex join queries easier to reuse.
//create a view
$hive>create view v1 as select a.id aid,a.name ,b.id bid , b.orderno from customers a left outer join orders b on a.id = b.cid ;
//inspect the view
$hive>show tables ;
$hive>select * from v1 ;
Map-side join
$hive>set hive.auto.convert.join=true //auto-convert joins to map joins; on by default
//force a map join with the mapjoin hint
$hive>select /*+ mapjoin(customers) */ a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
2.9 Tuning
explain: inspect the query plan
$hive>explain [extended] select count(*) from customers ;
$hive>explain select t.name , count(*) from (select a.name ,b.id,b.orderno from customers a ,orders b where a.id = b.cid) t group by t.name ;
//enable the limit optimization, avoiding full scans
$hive>set hive.limit.optimize.enable=true
//local mode
$hive>set mapred.job.tracker=local; //run jobs locally
$hive>set hive.exec.mode.local.auto=true //automatic local mode, for testing
//parallel execution: run stages that have no dependency between them at the same time
$hive>set hive.exec.parallel=true
//strict mode
$hive>set hive.mapred.mode=strict
In strict mode, queries on partitioned tables must filter on a partition, order by must come with a limit clause, and Cartesian products are not allowed (examples below).
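A few illustrative statements, assuming the partitioned table t3 from section 2.4 (the first and third are rejected under strict mode, the second and fourth pass):
$hive>select * from t3 ; //rejected: no partition filter
$hive>select * from t3 where year = 2014 and month = 11 ; //ok
$hive>select * from orders order by id ; //rejected: order by without limit
$hive>select * from orders order by id limit 10 ; //ok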
//number of reducers
$hive>set hive.exec.reducers.bytes.per.reducer=750000000; //bytes processed per reducer
//JVM reuse
$hive>set mapreduce.job.jvm.numtasks=1 //-1 means unlimited; useful with large numbers of small files
2.10 UDF (user-defined functions)
//list all functions
$hive>show functions;
$hive>select array(1,2,3) ;
//show the help of a specific function
$hive>desc function current_database ;
//table-generating functions return multiple rows:
//split(str, regex) splits str by regex into an array; explode(array) turns that array into rows.
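For example (both are standard built-ins; the literal string is just for illustration):
$hive>select split('hello world hello', ' ') ; //returns ["hello","world","hello"]
$hive>select explode(split('hello world hello', ' ')) ; //returns one word per row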
Custom addition function
1. Create a class that extends UDF
package com.it18zhang.hivedemo.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Custom hive function.
 */
@Description(name = "myadd",
        value = "myadd(int a , int b) ==> return a + b ",
        extended = "Example:\n"
                + " myadd(1,1) ==> 2 \n"
                + " myadd(1,2,3) ==> 6;")
public class AddUDF extends UDF {
    public int evaluate(int a, int b) {
        return a + b;
    }

    public int evaluate(int a, int b, int c) {
        return a + b + c;
    }
}
2. Build the jar.
cmd>cd {directory containing the classes}
cmd>jar cvf HiveDemo.jar -C x/x/x/x/classes/ .
3. Add the jar to hive's classpath
$>cp /mnt/hgfs/downloads/bigdata/data/HiveDemo.jar /soft/hive/lib
4. Re-enter hive
5. Create a temporary function
CREATE TEMPORARY FUNCTION myadd AS 'com.it18zhang.hivedemo.udf.AddUDF';
6. Use the custom function in a query
$hive>select myadd(1,2) ;
Custom date function
1) Define the class
package com.it18zhang.hivedemo.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

import java.text.SimpleDateFormat;
import java.util.Date;

public class ToCharUDF extends UDF {
    /**
     * Returns the server's current system time, e.g. 2017/3/21 16:53:55.
     */
    public String evaluate() {
        Date date = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat();
        sdf.applyPattern("yyyy/MM/dd HH:mm:ss");
        return sdf.format(date);
    }

    public String evaluate(Date date) {
        SimpleDateFormat sdf = new SimpleDateFormat();
        sdf.applyPattern("yyyy/MM/dd HH:mm:ss");
        return sdf.format(date);
    }

    public String evaluate(Date date, String frt) {
        SimpleDateFormat sdf = new SimpleDateFormat();
        sdf.applyPattern(frt);
        return sdf.format(date);
    }
}
2) Export the jar and add it to hive's classpath with a command (no need to re-enter hive).
$hive>add jar /mnt/hgfs/downloads/bigdata/data/HiveDemo-1.0-SNAPSHOT.jar
3) Register the functions
$hive>CREATE TEMPORARY FUNCTION to_char AS 'com.it18zhang.hivedemo.udf.ToCharUDF';
$hive>CREATE TEMPORARY FUNCTION to_date AS 'com.it18zhang.hivedemo.udf.ToDateUDF';
Custom Nvl function
package com.it18zhang.hivedemo.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

/**
 * Custom null-handling function: nvl(a, b) returns a if it is not null, else b.
 */
public class Nvl extends GenericUDF {
    private GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
    private ObjectInspector[] argumentOIs;

    public ObjectInspector initialize(ObjectInspector[] arguments)
            throws UDFArgumentException {
        argumentOIs = arguments;
        //check the argument count
        if (arguments.length != 2) {
            throw new UDFArgumentLengthException(
                    "The operator 'NVL' accepts 2 arguments.");
        }
        returnOIResolver = new GenericUDFUtils.ReturnObjectInspectorResolver(true);
        //check the argument types
        if (!(returnOIResolver.update(arguments[0]) && returnOIResolver
                .update(arguments[1]))) {
            throw new UDFArgumentTypeException(2,
                    "The 1st and 2nd args of function NVL should have the same type, "
                            + "but they are different: \"" + arguments[0].getTypeName()
                            + "\" and \"" + arguments[1].getTypeName() + "\"");
        }
        return returnOIResolver.get();
    }

    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Object retVal = returnOIResolver.convertIfNecessary(arguments[0].get(), argumentOIs[0]);
        if (retVal == null) {
            retVal = returnOIResolver.convertIfNecessary(arguments[1].get(),
                    argumentOIs[1]);
        }
        return retVal;
    }

    public String getDisplayString(String[] children) {
        StringBuilder sb = new StringBuilder();
        sb.append("if ");
        sb.append(children[0]);
        sb.append(" is null ");
        sb.append("returns");
        sb.append(children[1]);
        return sb.toString();
    }
}
2) Add the jar to the classpath
3) Register the function (pointing at the custom class defined above)
$hive>CREATE TEMPORARY FUNCTION nvl AS 'com.it18zhang.hivedemo.udf.Nvl';
2.11 Avoiding data skew
Hive can mitigate data skew by setting just a few properties:
$hive>SET hive.optimize.skewjoin=true; //enable the runtime skew-join optimization
$hive>SET hive.skewjoin.key=100000; //a join key with more rows than this is treated as skewed
$hive>SET hive.groupby.skewindata=true; //spread skewed group-by work over an extra MR stage