1,启动hdfs集群
2,启动hive
create database testdb;  -- 创建testdb库
关闭hive保留关键字检查功能:
-- Allow reserved words (e.g. `date`, `timestamp`) to appear as identifiers.
-- NOTE(review): this property exists only in Hive 1.x/2.x — confirm for your
-- Hive version; quoting the identifiers with backticks is the portable fix.
set hive.support.sql11.reserved.keywords=false;
创建一张userlog表:
-- userlog: raw user event records loaded from a tab-separated file.
-- `date` and `timestamp` are Hive reserved words; quoting them with backticks
-- makes the DDL valid without hive.support.sql11.reserved.keywords=false.
CREATE TABLE IF NOT EXISTS testdb.userlog(
    `date`      string,  -- event date (stored as string; presumably yyyy-MM-dd — TODO confirm)
    `timestamp` string,  -- event time (stored as string)
    user_id     string,
    page_id     string,
    channel     string,
    action      string   -- e.g. 'View' (see the PV/UV queries below)
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
表中插入数据:加载本地数据到hive
-- Load the local file into the userlog table (file is copied into the Hive warehouse).
LOAD DATA LOCAL INPATH '/root/resource/userLog' INTO TABLE testdb.userlog;
hive中的测试操作:是用来和Spark core性能对比
查询page的PV
-- Top-10 pages by page views (PV) per day.
-- `date` is a Hive reserved word; backtick-quote it instead of relying on
-- hive.support.sql11.reserved.keywords=false.
SELECT
    `date`,
    page_id,
    count(*) AS pv
FROM userlog
WHERE action = 'View'
GROUP BY `date`, page_id
ORDER BY pv DESC
LIMIT 10;
查询page的UV
-- Top-3 pages by unique visitors (UV) per day.
-- COUNT(DISTINCT user_id) is the idiomatic form: the original
-- count(distinct(user_id)) merely parenthesizes the column and reads as if
-- distinct were a function. `date` is backtick-quoted (reserved word).
SELECT
    `date`,
    page_id,
    count(DISTINCT user_id) AS uv
FROM userlog
WHERE action = 'View'
GROUP BY `date`, page_id
ORDER BY uv DESC
LIMIT 3;
最热门的channel
-- Top-10 channels by page views (PV) per day ("hottest" channels).
-- `date` is backtick-quoted (Hive reserved word); alias made explicit with AS.
SELECT
    `date`,
    channel,
    count(*) AS channelpv
FROM userlog
WHERE action = 'View'
GROUP BY `date`, channel
ORDER BY channelpv DESC
LIMIT 10;
spark core操作:
1,启动spark
2,开启Hive的metaStore服务 在~目录下 ./startHiveMetastoreService.sh启动
3,启动spark core : ./spark-shell --master spark://node1:7077 --total-executor-cores 3
4、import org.apache.spark.sql.hive.HiveContext
// Create a HiveContext over the spark-shell's SparkContext (sc) so Hive
// tables can be queried from Spark (requires the Hive metastore service).
val hiveCtx = new HiveContext(sc)
hiveCtx.sql("show databases").show // list available databases
hiveCtx.sql("use testdb").show // switch the session to the testdb database
// Same top-10 PV query as the Hive CLI version, for performance comparison.
hiveCtx.sql("SELECT date,page_id,count(*) pv FROM userlog WHERE action = 'View' GROUP BY date, page_id ORDER BY pv DESC LIMIT 10").show