Startup procedure
Start ZooKeeper first: zkServer.sh start
start-all.sh (HDFS + YARN)
Start HBase: start-hbase.sh
On node03: hive --service metastore
On node04: hive
nginx is set to start on boot, so nothing to do for it
Flume: write the config file, then run flume-ng agent --conf-file /root/flumedir/project --name a1 -Dflume.root.logger=INFO,console
To shut down, go in the reverse order (spelled out below).
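The same shutdown order spelled out (simply the reverse of the above; these are the standard ZooKeeper/Hadoop/HBase stop scripts):
# stop the Flume agent first (Ctrl-C in its console, or kill the flume-ng process)
# exit the Hive CLI on node04 and stop the metastore process on node03
stop-hbase.sh
stop-all.sh
zkServer.sh stop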
systemctl start mysqld    start MySQL on node1
systemctl status mysqld    check MySQL status
Sqoop: sqoop --options-file option3
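The contents of option3 are not recorded here. Purely as an illustration of the --options-file format (one option or value per line; the connection string, database and paths below are hypothetical), a Sqoop export options file looks roughly like:
# hypothetical export of the Hive result table to MySQL on node1
export
--connect
jdbc:mysql://node1:3306/report
--username
root
--password
123456
--table
stats_view_depth
--export-dir
/user/hive/warehouse/stats_view_depth
--input-fields-terminated-by
\t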
For interviews: say the cluster is around 30-40 machines, e.g. Hadoop 10, ZooKeeper 3, HBase 5.
When describing the machines, quote memory and cores rather than disk, e.g. 128G RAM / 32 cores.
Data volume: row count in the tens of millions, a few hundred GB in size.
1. access.log only receives hits from the nginx home page; events from the other pages are not collected.
Fix: first check that the JS on the other pages sends its data to the right target, i.e. your cluster host (hadoop102 in this setup);
then clear the browser cache/cookies, because otherwise the changes you just made to Tomcat won't take effect.
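A quick way to confirm whether those pages reach nginx at all (the log path depends on your nginx config; /var/log/nginx/access.log is only the default):
tail -f /var/log/nginx/access.log
# open the other pages in a fresh or incognito browser window and watch whether new request lines appear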
2. The Flume command has no effect: flume-ng agent --conf-file project --name a1 -Dflume.root.logger=INFO,console
Fix: this command only works when run from the directory containing the config file, or with the absolute path:
flume-ng agent --conf-file /root/flumedir/project --name a1 -Dflume.root.logger=INFO,console
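For reference, a minimal sketch of what a config file like /root/flumedir/project could contain (the source/sink details here are assumptions, not the actual project file): an exec source tailing the nginx access log, a memory channel, and an HDFS sink.
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /var/log/nginx/access.log
a1.sources.r1.channels = c1
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /log/%Y%m%d
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.channel = c1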
3. Package name 'xxx' does not correspond to the file path 'xxx'
Fix: the package name does not match the directory path; re-mark the correct directory as the sources root in the IDE.
4. Import the Maven dependencies correctly. If downloads are very slow, check whether you forgot to configure Maven's settings.xml (e.g. the repository mirror).
5. After entering hbase shell, list fails with org.apache.hadoop.hbase.ipc.ServerNotRunningYetException: Server is not running yet
Fix: this can be a compatibility problem caused by the high Hadoop version (3.3.x), or HDFS being in safe mode; in practice, editing hbase/conf/hbase-env.sh and uncommenting export HBASE_DISABLE_HADOOP_CLASSPATH_LOOKUP="true" made it run successfully.
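That is, in $HBASE_HOME/conf/hbase-env.sh uncomment (or add) the line below, then restart HBase:
export HBASE_DISABLE_HADOOP_CLASSPATH_LOOKUP="true"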
6. Creating a table in HBase fails with org.apache.hadoop.hbase.PleaseHoldException: Master is initializing
Fix: see the CSDN blog post "org.apache.hadoop.hbase.PleaseHoldException: Master is initializing" (weixin_43648549).
7. Running the example program fails with a "system cannot find the file" style error(??)
The HBase dependencies have to come after the Hadoop dependencies in the pom.
8. org.apache.hadoop.security.HadoopKerberosName.setRuleMechanism
Authentication information has to be provided when accessing Hadoop remotely, so add:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-auth</artifactId>
    <version>3.2.0</version>
</dependency>
9. IDEA keeps showing information that was changed long ago
Run a Maven clean and rebuild.
10. Hadoop error: HADOOP_HOME and hadoop.home.dir are unset (see the CSDN post by TQ2 for the fix).
Running a Hadoop MR program locally also throws: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
The point is that running Hadoop locally needs the local environment variables set, i.e. HADOOP_HOME / hadoop.home.dir pointing at a local Hadoop directory (on Windows, one that contains winutils).
11. MapReduce design approach
12. When parsing args[], the time parameter you passed in never shows up.
this.processArgs(conf, args); must be called before Job job = Job.getInstance(conf, "analyse_new_install_user");
i.e. the arguments have to be written into the Configuration before the Job object is created, because the Job copies the Configuration at creation time.
13. The view-depth (stats_view_depth) Hive SQL:
from (
select
pl, from_unixtime(cast(s_time/1000 as bigint),'yyyy-MM-dd') as day, u_ud,
(case when count(p_url) = 1 then "pv1"
when count(p_url) = 2 then "pv2"
when count(p_url) = 3 then "pv3"
when count(p_url) = 4 then "pv4"
when count(p_url) >= 5 and count(p_url) <10 then "pv5_10"
when count(p_url) >= 10 and count(p_url) <30 then "pv10_30"
when count(p_url) >=30 and count(p_url) <60 then "pv30_60"
else 'pv60_plus' end) as pv
from event_logs
where
en='e_pv'
and p_url is not null
and pl is not null
and s_time >= unix_timestamp('2019-04-29','yyyy-MM-dd')*1000
and s_time < unix_timestamp('2019-04-30','yyyy-MM-dd')*1000
group by
pl, from_unixtime(cast(s_time/1000 as bigint),'yyyy-MM-dd'), u_ud
) as tmp
insert overwrite table stats_view_depth_tmp
select pl,day,pv,count(u_ud) as ct where u_ud is not null group by pl,day,pv;
The middle query means: from event_logs I take every user's page-view records per platform for that day, and group them by platform : date : uuid.
That yields, for each user on each platform on that day, all of their p_url values; count(p_url) gives that user's view depth, which the CASE expression maps into a pv bucket.
The result looks like:
pl       date        uuid   pv
website  2019-08-20  uuid1  pv2
website  2019-08-20  uuid2  pv5
website  2019-08-20  uuid3  pv4
i.e. each user's view depth for that day.
But that is only the intermediate query; what I actually need is, for the website platform, the number of users at each view depth.
So the table above is grouped again by platform, date and pv, counting how many users fall into each pv bucket,
and the result is inserted into the intermediate table.
The intermediate table then contains:
pl       date        pv   ct
website  2019-08-20  pv1  5666
website  2019-08-20  pv2  2566
website  2019-08-20  pv3  89
...
This is still only the column-wise form of the result; what we finally need is all of these values in a single row.
with tmp as
(
select pl,`date` as date1,ct as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv1' union all
select pl,`date` as date1,0 as pv1,ct as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv2' union all
select pl,`date` as date1,0 as pv1,0 as pv2,ct as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv3' union all
select pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,ct as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv4' union all
select pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,ct as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv5_10' union all
select pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,ct as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv10_30' union all
select pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,ct as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv30_60' union all
select pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,ct as pv60_plus from stats_view_depth_tmp where col='pv60_plus' union all
select 'all' as pl,`date` as date1,ct as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv1' union all
select 'all' as pl,`date` as date1,0 as pv1,ct as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv2' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,ct as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv3' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,ct as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv4' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,ct as pv5_10,0 as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv5_10' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,ct as pv10_30,0 as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv10_30' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,ct as pv30_60,0 as pv60_plus from stats_view_depth_tmp where col='pv30_60' union all
select 'all' as pl,`date` as date1,0 as pv1,0 as pv2,0 as pv3,0 as pv4,0 as pv5_10,0 as pv10_30,0 as pv30_60,ct as pv60_plus from stats_view_depth_tmp where col='pv60_plus'
)
from tmp
insert overwrite table stats_view_depth
select 2,date_convert(date1),6,sum(pv1),sum(pv2),sum(pv3),sum(pv4),sum(pv5_10),sum(pv10_30),sum(pv30_60),sum(pv60_plus),'2019-03-29' group by pl,date1;
ct as pv1 means that for those rows the pv1 column takes the value ct (and all the other pv columns are filled with 0).
(case when count(p_url) = 1 then "pv1"
when count(p_url) = 2 then "pv2"
when count(p_url) = 3 then "pv3"
when count(p_url) = 4 then "pv4"
when count(p_url) >= 5 and count(p_url) <10 then "pv5_10"
when count(p_url) >= 10 and count(p_url) <30 then "pv10_30"
when count(p_url) >=30 and count(p_url) <60 then "pv30_60"
else 'pv60_plus' end) as pv
This says that the value in the pv column is pv1, or pv2, and so on.
The idea above is a row-to-column conversion (a pivot): one final row has to carry too many measures, so each measure is computed separately row by row and then summed per group.
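Using the intermediate-table numbers above purely as an illustration, after the pivot and the sum() aggregation the website data for that day collapses into one wide row along the lines of:
pl       date        pv1   pv2   pv3  ...  pv60_plus
website  2019-08-20  5666  2566  89   ...  ...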