炼数成金 课程。
1、安装mysql(ubuntu)
检查是否安装和启动了mysql:netstat -tap|grep mysql
安装mysql:sudo apt-get install mysql-server mysql-client
新建hadoop用户和建立hive 数据库:
[root@hadoop]# mysql -uroot -p
mysql> grant all on *.* to mysql@'%' identified by 'mysql' with grant option;
mysql> create user 'hadoop' identified by 'hadoop';
mysql> grant all on *.* to hadoop@'%' with grant option;
mysql> quit;
[root@hadoop9 hadoop]# mysql -uhadoop -p
mysql> create database hive;
mysql> quit;
关闭和启动mysql:sudo service mysql start|stop|restart
2、安装hive
解压hive
tar zxf apache-hive-0.13.1-bin.tar.gz
mv apache-hive-0.13.1-bin hive013
cd hive013/conf
cp hive-default.xml.template hive-site.xml
cp hive-env.sh.template hive-env.sh
vi hive-env.sh
HADOOP_HOME=~/h2/hadoop
vi hive-site.xml
<property>
<!-- MySQL Connector/J URL: options follow '?' as name=value pairs.
     createDatabaseIfNotExist=true asks the driver to create the "hive" schema if it is missing. -->
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://hadoop3:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hadoop</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hadoop</value>
<description>password to use against metastore database</description>
</property>
<property>
<!-- MySQL Connector/J URL: options follow '?' as name=value pairs.
     createDatabaseIfNotExist=true asks the driver to create the "hive" schema if it is missing. -->
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://hadoop3:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hadoop</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hadoop</value>
<description>password to use against metastore database</description>
</property>
增加驱动程序, 下载mysql-connector-java-5.1.26-bin.jar,放到/hive/lib下
启动hive后,即可正常使用。
3、hive client 安装
1)
主机,启动metastore服务
hive --service metastore
注:后台方式启动:nohup hive --service metastore > metastore.log 2>&1 &
2)
客户端,修改hive-site.xml,配置uris参数:
<property>
<name>hive.metastore.uris</name>
<value>thrift://master:9083</value>
<description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
</property>
<property>
<!-- Client-side setting: points the Hive client at the metastore service on host "master", port 9083. -->
<name>hive.metastore.uris</name>
<value>thrift://master:9083</value>
<description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
</property>
3)
客户端启动:
hive
4、Hive使用
1)
建立外部表:
hive 只认文件目录,需要把文件先移动到目录中(内部表无此问题)
-- External table SOGOUQ1: the data files are read from the LOCATION directory,
-- with tab-separated fields and one record per line.
create external table SOGOUQ1 (
    DT         string,
    WEBSESSION string,
    WORD       string,
    S_SEQ      int,
    C_SEQ      int,
    WEBSITE    string
)
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
stored as textfile
location '/dataguru/data/SogouQ1';
申明了列的分割"\t",行的分割"\n",存储在"/dataguru/data/SogouQ1"。
然后即可在mysql中查证:
select * from TBLS;
2)查询:
-- Count all rows in the table
select count(*) from SOGOUQ1;
-- Show the first 10 rows (no computation is involved, so this returns very quickly)
select * from SOGOUQ1 limit 10;
-- How many rows rank 1st in the search results but were clicked 2nd?
select count(*) from SOGOUQ1 where S_SEQ=1 and C_SEQ=2;
-- How many rows have a clicked URL containing "baidu"?
select count(*) from SOGOUQ1 where WEBSITE like '%baidu%';
-- How many rows rank 1st in the search results, were clicked 2nd, and have a URL containing "baidu"?
select count(*) from SOGOUQ1 where S_SEQ=1 and C_SEQ=2 and WEBSITE like '%baidu%';
-- Top 10 sessions by number of queries
select WEBSESSION,count(WEBSESSION) as cw from SOGOUQ1 group by WEBSESSION order by cw desc limit 10;
3)创建内部表并加载数据(注意:LOAD DATA INPATH 会把原数据文件移动到Hive默认的仓库路径)
-- Managed (internal) table with the same layout as SOGOUQ1: tab-separated fields, one record per line
CREATE TABLE SOGOUQ2(DT STRING,WEBSESSION STRING,WORD STRING,S_SEQ INT,C_SEQ INT,WEBSITE STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' ;
-- Load the file into the table; the source file is moved out of its original path
LOAD DATA INPATH '/dataguru/data/SogouQ2.txt' INTO TABLE SOGOUQ2;
-- Count the rows
select count(*) from SOGOUQ2;
-- Top 10 sessions by number of queries
select WEBSESSION,count(WEBSESSION) as cw from SOGOUQ2 group by WEBSESSION order by cw desc limit 10;
4)装载本地数据
CREATE DATABASE SALEDATA;
use SALEDATA;
-- qryTheDate.txt defines the date classification: each day is assigned its month, week, quarter and similar attributes
-- Columns: date, year-month, year, month, day, day of week, week number, quarter, ten-day period, half-month
CREATE TABLE tblDate(dateID string,theyearmonth string,theyear string,themonth string,thedate string,theweek string,theweeks string,thequot string,thetenday string,thehalfmonth string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
-- qrytblStock.txt holds the order headers
-- Columns: order number, transaction location, transaction date
CREATE TABLE tblStock(ordernumber STRING,locationid STRING,dateID string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
-- qryStockDetail.txt holds the order line items
-- Columns: order number, row number, item, quantity, price, amount
CREATE TABLE tblStockDetail(ordernumber STRING,rownum int,itemid STRING,qty INT,price int ,amount int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
-- LOCAL: the paths below are on the local filesystem of the machine running the Hive client
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryTheDate.txt' INTO TABLE tblDate;
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qrytblStock.txt' INTO TABLE tblStock;
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryStockDetail.txt' INTO TABLE tblStockDetail;
use SALEDATA;
-- This section is safe to re-run: IF NOT EXISTS skips tables that already exist, and
-- OVERWRITE replaces the table contents instead of appending a second copy of the rows.
-- qryTheDate.txt defines the date classification: each day is assigned its month, week, quarter and similar attributes
-- Columns: date, year-month, year, month, day, day of week, week number, quarter, ten-day period, half-month
CREATE TABLE IF NOT EXISTS tblDate(dateID string,theyearmonth string,theyear string,themonth string,thedate string,theweek string,theweeks string,thequot string,thetenday string,thehalfmonth string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
-- qrytblStock.txt holds the order headers
-- Columns: order number, transaction location, transaction date
CREATE TABLE IF NOT EXISTS tblStock(ordernumber STRING,locationid STRING,dateID string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
-- qryStockDetail.txt holds the order line items
-- Columns: order number, row number, item, quantity, price, amount
CREATE TABLE IF NOT EXISTS tblStockDetail(ordernumber STRING,rownum int,itemid STRING,qty INT,price int ,amount int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' ;
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryTheDate.txt' OVERWRITE INTO TABLE tblDate;
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qrytblStock.txt' OVERWRITE INTO TABLE tblStock;
LOAD DATA LOCAL INPATH '/home/mmicky/data/spark/saledata/qryStockDetail.txt' OVERWRITE INTO TABLE tblStockDetail;
//异常数据
select sum(b.amount) from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber;
68100782
select sum(b.amount) from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid;
68099079
select a.* from tblstock a where a.dateid not in (select dateid from tblDate);
//select * from tblstock where tblstock.dateid not in (select dateid from tblDate);
//所有订单中每年的销售单数、销售总额
select c.theyear,count(distinct a.ordernumber),sum(b.amount) from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear order by c.theyear;
2004 1094 3265696
2005 3828 13247234
2006 3772 13670416
2007 4885 16711974
2008 4861 14670698
2009 2619 6322137
2010 94 210924
//所有订单中季度销售额前10位
select c.theyear,c.thequot,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,c.thequot order by sumofamount desc limit 10;
2008 1 5252819
2007 4 4613093
2007 1 4446088
2006 1 3916638
2008 2 3886470
2007 3 3870558
2007 2 3782235
2006 4 3691314
2005 1 3592007
2005 3 3304243
//列出销售金额在100000以上的单据
select a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.ordernumber having sumofamount>100000;
HMJSL00009024 119058
HMJSL00009958 159126
//所有订单每年最大金额订单的销售额
第一步:
-- Total amount of each order, keyed by order date and order number
select a.dateid,a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.dateid,a.ordernumber;
第二步:
-- Join the per-order totals (d) to the date table and take the largest order amount per year.
-- ORDER BY yields one globally ordered result; SORT BY would only order rows within each reducer.
select c.theyear,max(d.sumofamount) from tbldate c,(select a.dateid,a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.dateid,a.ordernumber) d where c.dateid=d.dateid group by c.theyear order by c.theyear;
2004 23612
2005 38180
2006 36124
2007 159126
2008 55828
2009 25810
2010 13063
//所有订单中每年最畅销货品
第一步:
select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid;
第二步:
select d.theyear,max(d.sumofamount) as maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) d group by d.theyear ;
第三步:
select distinct e.theyear,e.itemid,f.maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) e , (select d.theyear,max(d.sumofamount) as maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) d group by d.theyear) f where e.theyear=f.theyear and e.sumofamount=f.maxofamount order by e.theyear;
2004 JY424420810101 53374
2005 24124118880102 56569
2006 JY425468460101 113684
2007 JY425468460101 70226
2008 E2628204040101 97981
2009 YL327439080102 30029
2010 SQ429425090101 4494
select sum(b.amount) from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber;
68100782
select sum(b.amount) from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid;
68099079
select a.* from tblstock a where a.dateid not in (select dateid from tblDate);
//select * from tblstock where tblstock.dateid not in (select dateid from tblDate);
//所有订单中每年的销售单数、销售总额
select c.theyear,count(distinct a.ordernumber),sum(b.amount) from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear order by c.theyear;
2004 1094 3265696
2005 3828 13247234
2006 3772 13670416
2007 4885 16711974
2008 4861 14670698
2009 2619 6322137
2010 94 210924
//所有订单中季度销售额前10位
select c.theyear,c.thequot,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,c.thequot order by sumofamount desc limit 10;
2008 1 5252819
2007 4 4613093
2007 1 4446088
2006 1 3916638
2008 2 3886470
2007 3 3870558
2007 2 3782235
2006 4 3691314
2005 1 3592007
2005 3 3304243
//列出销售金额在100000以上的单据
select a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.ordernumber having sumofamount>100000;
HMJSL00009024 119058
HMJSL00009958 159126
//所有订单每年最大金额订单的销售额
第一步:
-- Total amount of each order, keyed by order date and order number
select a.dateid,a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.dateid,a.ordernumber;
第二步:
-- Join the per-order totals (d) to the date table and take the largest order amount per year.
-- ORDER BY yields one globally ordered result; SORT BY would only order rows within each reducer.
select c.theyear,max(d.sumofamount) from tbldate c,(select a.dateid,a.ordernumber,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b where a.ordernumber=b.ordernumber group by a.dateid,a.ordernumber) d where c.dateid=d.dateid group by c.theyear order by c.theyear;
2004 23612
2005 38180
2006 36124
2007 159126
2008 55828
2009 25810
2010 13063
//所有订单中每年最畅销货品
第一步:
select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid;
第二步:
select d.theyear,max(d.sumofamount) as maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) d group by d.theyear ;
第三步:
select distinct e.theyear,e.itemid,f.maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) e , (select d.theyear,max(d.sumofamount) as maxofamount from (select c.theyear,b.itemid,sum(b.amount) as sumofamount from tblStock a,tblStockDetail b,tbldate c where a.ordernumber=b.ordernumber and a.dateid=c.dateid group by c.theyear,b.itemid) d group by d.theyear) f where e.theyear=f.theyear and e.sumofamount=f.maxofamount order by e.theyear;
2004 JY424420810101 53374
2005 24124118880102 56569
2006 JY425468460101 113684
2007 JY425468460101 70226
2008 E2628204040101 97981
2009 YL327439080102 30029
2010 SQ429425090101 4494