HDFS processing flow
The NameNode is the master node. It stores the file system metadata: file names, the directory structure, file attributes (creation time, replication factor, permissions), and, for each file, the block list and the DataNodes each block resides on.
DataNodes store the file block data on their local file systems, along with checksums for the block data.
The Secondary NameNode is an auxiliary daemon that helps monitor HDFS state; at regular intervals it fetches a snapshot of the HDFS metadata.
To read a file, the HDFS client sends a request to the NameNode; after receiving the file metadata it picks the nearest DataNode, opens a socket stream to it, and then assembles the individual blocks on the client side, reading the file contents as a stream.
If the client detects a corrupted block on a DataNode, it reports the corruption to the NameNode; the NameNode then has a healthy replica copied from another DataNode to replace the corrupted block.
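A minimal sketch of this client-side read path using the Hadoop FileSystem API (the fs.defaultFS address hdfs://hadoop:8020 and the path /input/wordcount.txt are taken from the examples later in these notes):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HdfsReadDemo {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // NameNode address; matches the Location shown by `desc formatted` below
        conf.set("fs.defaultFS", "hdfs://hadoop:8020");
        // open() asks the NameNode for the block locations, then the returned
        // stream reads each block from the nearest DataNode holding a replica
        try (FileSystem fs = FileSystem.get(conf);
             FSDataInputStream in = fs.open(new Path("/input/wordcount.txt"))) {
            IOUtils.copyBytes(in, System.out, 4096, false);
        }
    }
}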
Install the virtual machine and operating system: VMware, CentOS
Configure the three basics: IP address (NAT, static), hostname, and hosts mapping (including the local machine)
Network configuration: set up DNS resolution and make sure the external network can be pinged. Create a regular user: useradd <username>, passwd <username>
Grant sudo privileges: visudo
Set up remote-access tools: SecureCRT, FileZilla, Notepad++
Disable the security subsystem and firewall: selinux, iptables, chkconfig iptables off
Uninstall the bundled OpenJDK and install the Oracle JDK: rpm -qa, rpm -e, vi /etc/profile
package com.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * @author qiqu
 */
public class FlowDemo extends Configured implements Tool {

    public static void main(String[] args) {
        int exitCode = 1;
        try {
            exitCode = ToolRunner.run(new FlowDemo(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        // get the configuration object (ToolRunner has already populated it)
        Configuration cfg = this.getConf();
        // create the job
        Job job = Job.getInstance(cfg, this.getClass().getSimpleName());
        job.setJarByClass(FlowDemo.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // configure the map stage
        job.setMapperClass(FlowMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // configure the reduce stage
        job.setReducerClass(FlowReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // submit the job and return its exit status (0 = success)
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static class FlowMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Text keyOut = new Text();
        private final IntWritable valueOut = new IntWritable(1);

        /**
         * @param key     byte offset of the line in the input split
         * @param value   one line of the log
         * @param context output collector
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            String s = split[23];
            try {
                // field 23 must be numeric; skip the record otherwise
                keyOut.set(Integer.toString(Integer.parseInt(s)));
            } catch (NumberFormatException e) {
                return;
            }
            context.write(keyOut, valueOut);
        }
    }

    public static class FlowReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * @param key     the field value emitted by the mapper
         * @param values  one count per occurrence of the key
         * @param context output collector
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable t : values) {
                sum += t.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
}
hive> show tables; # list the existing tables
OK
Time taken: 0.111 seconds
hive> create table student ( # create the student table
> id int comment 'id of student',
> name string comment 'name of student',
> age int comment 'age of student',
> gender string comment 'sex of student',
> addr string
> )
> comment 'this is a demo'
> row format delimited fields terminated by ','; # fields are separated by commas
OK
Time taken: 1.266 seconds
hive> load data local inpath '/home/hadoop/student.log' into table student;
Loading data to table mydb.student
Table mydb.student stats: [numFiles=1, totalSize=50]
OK
Time taken: 9.621 seconds
hive> select * from student; # view the result
OK
1 xiaoming 25 nan shanghai
2 xiaoli 24 nv beijing
Time taken: 1.063 seconds, Fetched: 2 row(s)
hive> dfs -ls /input; # browse the HDFS directory from inside Hive
Found 1 items
-rw-r--r-- 1 hadoop supergroup 47 2019-10-18 20:59 /input/wordcount.txt
hive> desc student; # describe the table
OK
id int id of student
name string name of student
age int age of student
gender string sex of student
addr string
Time taken: 0.283 seconds, Fetched: 5 row(s)
hive> desc formatted student; # show the table's full metadata
OK
# col_name data_type comment
id int id of student
name string name of student
age int age of student
gender string sex of student
addr string
# Detailed Table Information
Database: mydb
Owner: hadoop
CreateTime: Mon Oct 28 17:28:12 CST 2019
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://hadoop:8020/user/hive/warehouse/mydb.db/student
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE true
comment this is a demo
numFiles 1
totalSize 50
transient_lastDdlTime 1572254946
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim ,
serialization.format ,
Time taken: 0.145 seconds, Fetched: 35 row(s)
hive> !pwd; # run a Linux command from inside Hive
/home/hadoop
hive> dfs -ls /input
> ;
Found 1 items
-rw-r--r-- 1 hadoop supergroup 47 2019-10-18 20:59 /input/wordcount.txt
hive> dfs -put /home/hadoop/student.log /input; # upload the local log file to HDFS
hive> dfs -ls /input
> ;
Found 2 items
-rw-r--r-- 1 hadoop supergroup 50 2019-10-28 17:35 /input/student.log
-rw-r--r-- 1 hadoop supergroup 47 2019-10-18 20:59 /input/wordcount.txt
hive> create table stu( # create the table
> id int,
> name string,
> age int,
> gender string,
> addr string)
> row format delimited fields terminated by ',';
OK
Time taken: 0.589 seconds
hive> load data inpath '/input/student.log' into table stu; # load the file already on HDFS into the Hive table
Loading data to table mydb.stu
Table mydb.stu stats: [numFiles=1, totalSize=50]
OK
Time taken: 0.877 seconds
hive>
1. Create the Hive table and load the log data
0: jdbc:hive2://hadoop:10000/> select * from tongji;
OK
INFO : Compiling command(queryId=hadoop_20191030172727_a431b261-4f4c-44b1-9fd1-fee83e850ca1): select * from tongji
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:tongji.pv, type:string, comment:null), FieldSchema(name:tongji.uv, type:string, comment:null), FieldSchema(name:tongji.vv, type:string, comment:null), FieldSchema(name:tongji.ip, type:string, comment:null), FieldSchema(name:tongji.dt, type:string, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hadoop_20191030172727_a431b261-4f4c-44b1-9fd1-fee83e850ca1); Time taken: 0.112 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hadoop_20191030172727_a431b261-4f4c-44b1-9fd1-fee83e850ca1): select * from tongji
INFO : Completed executing command(queryId=hadoop_20191030172727_a431b261-4f4c-44b1-9fd1-fee83e850ca1); Time taken: 0.0 seconds
INFO : OK
2. Use Hive SQL to compute the day's pv, uv, vv, and ip from the log (the tongji table's columns are pv, uv, vv, ip, and the partition column dt):
insert into tongji partition(dt='2019-10-30')select count(id) pv,count(distinct guid) uv,count(distinct sessionid) vv,
count(distinct ip) ip from log_table;
+------------+------------+------------+------------+-------------+--+
| tongji.pv | tongji.uv | tongji.vv | tongji.ip | tongji.dt |
+------------+------------+------------+------------+-------------+--+
| 64972 | 23938 | 24148 | 19566 | 2019-10-30 |
+------------+------------+------------+------------+-------------+--+
1 row selected (0.17 seconds)
0: jdbc:hive2://hadoop:10000/> select * from avg_sessionid;
FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 'avg_sessionid'
Error: Error while compiling statement: FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 'avg_sessionid' (state=42S02,code=10001)
0: jdbc:hive2://hadoop:10000/> select * from avg_session;
OK
INFO : Compiling command(queryId=hadoop_20191030172828_00244016-51e2-4241-ae4b-5ef20afef1bc): select * from avg_session
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:avg_session.avg_time, type:bigint, comment:null), FieldSchema(name:avg_session.dt, type:string, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hadoop_20191030172828_00244016-51e2-4241-ae4b-5ef20afef1bc); Time taken: 0.109 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hadoop_20191030172828_00244016-51e2-4241-ae4b-5ef20afef1bc): select * from avg_session
INFO : Completed executing command(queryId=hadoop_20191030172828_00244016-51e2-4241-ae4b-5ef20afef1bc); Time taken: 0.0 seconds
INFO : OK
3. Use Hive SQL to compute the average session duration for the current date:
insert into avg_session partition (dt='2019-10-30')
select avg(t.sub)
from (select unix_timestamp(max(tracktime)) - unix_timestamp(min(tracktime)) sub
      from log_table) t;
+-----------------------+-----------------+--+
| avg_session.avg_time | avg_session.dt |
+-----------------------+-----------------+--+
| 300 | 2019-10-30 |
+-----------------------+-----------------+--+
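The queries above were issued through beeline against HiveServer2 at jdbc:hive2://hadoop:10000/; the same connection can be opened from Java with the Hive JDBC driver. A minimal sketch, assuming the hive-jdbc jar is on the classpath, the hadoop user needs no password, and the tongji table lives in the connection's default database:

package com.mapreduce;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveJdbcDemo {
    public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC driver (shipped with the Hive client libraries)
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:hive2://hadoop:10000/", "hadoop", "");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select * from tongji")) {
            while (rs.next()) {
                // columns: pv, uv, vv, ip, dt
                System.out.println(rs.getString(1) + "\t" + rs.getString(2) + "\t"
                        + rs.getString(3) + "\t" + rs.getString(4) + "\t" + rs.getString(5));
            }
        }
    }
}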
1. Create a user table in MySQL and insert a few rows of sample data.
2. Use Sqoop to import the MySQL table into HDFS:
sqoop import \
> --connect jdbc:mysql://hadoop:3306/test_db \
> --username hadoop \
> --password 123456 \
> --table user
Verify the result on HDFS:
hdfs dfs -ls /user/hadoop/user
19/10/30 21:00:53 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 4 items
-rw-r--r-- 1 hadoop supergroup 0 2019-10-30 20:59 /user/hadoop/user/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 18 2019-10-30 20:59 /user/hadoop/user/part-m-00000
-rw-r--r-- 1 hadoop supergroup 15 2019-10-30 20:59 /user/hadoop/user/part-m-00001
-rw-r--r-- 1 hadoop supergroup 16 2019-10-30 20:59 /user/hadoop/user/part-m-00002
Specify a custom target directory and the number of map tasks:
sqoop import \
> --connect jdbc:mysql://hadoop:3306/test_db \
> --username hadoop \
> --password 123456 \
> --table user \
> -m 1 \
> --target-dir /output/table/
hdfs dfs -ls /output/table/
19/10/30 21:09:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r-- 1 hadoop supergroup 0 2019-10-30 21:07 /output/table/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 49 2019-10-30 21:07 /output/table/part-m-00000
3. Use Sqoop to import the MySQL data into Hive:
sqoop import \
--connect jdbc:mysql://hadoop:3306/test_db \
--username hadoop \
--password 123456 \
--table user \
--fields-terminated-by '\t' \
--delete-target-dir \
-m 1 \
--hive-import \
--hive-database test_db \
--hive-table user \
--target-dir /input/table
hdfs dfs -ls /input/table
19/10/30 22:23:18 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r-- 1 hadoop supergroup 0 2019-10-30 22:09 /input/table/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 49 2019-10-30 22:09 /input/table/part-m-00000
4. Use Sqoop to set up an incremental import of the MySQL data into HDFS:
sqoop job \
--create incremental-job \
-- import \
--connect jdbc:mysql://hadoop:3306/test_db \
--username hadoop \
--password 123456 \
--table user \
-m 1 \
--target-dir /hive/incremental \
--check-column id \
--incremental append \
--last-value 0
sqoop job --list
Warning: /opt/modules/sqoop-1.4.6-cdh5.14.2/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /opt/modules/sqoop-1.4.6-cdh5.14.2/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /opt/modules/sqoop-1.4.6-cdh5.14.2/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /opt/modules/sqoop-1.4.6-cdh5.14.2/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
19/10/30 22:38:27 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.14.2
Available jobs:
incremental-job
hdfs dfs -text /user/hive/warehouse/test_db.db/user/part-m-00000
19/10/30 22:48:02 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
1 zhangsan 9000.0
2 lisi 10000.0
3 wangwu 6000.0
Handling data skew: when records are distributed unevenly across keys, prepend a random prefix (salt) to the keys when grouping so that hot keys are spread over several partitions, aggregate the salted keys first, then strip the salt and aggregate again for the final result; a combiner can also pre-aggregate locally to reduce the skew (see the mapper sketch below).
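A minimal sketch of the first-stage salting mapper, assuming the same Text/IntWritable counting setup as the FlowDemo job above; SaltMapper and SALT_BUCKETS are illustrative names, not from the original notes:

package com.mapreduce;

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SaltMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final int SALT_BUCKETS = 10;
    private final Random random = new Random();
    private final Text keyOut = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // emit "3_someHotKey" instead of "someHotKey", so one hot key is
        // spread across up to SALT_BUCKETS reduce partitions
        keyOut.set(random.nextInt(SALT_BUCKETS) + "_" + value.toString().trim());
        context.write(keyOut, one);
    }
}

A second job (not shown) strips the "N_" prefix from the partially aggregated keys and sums the partial counts to produce the final totals.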
explain <SQL statement>: inspect the query's execution plan to gauge its cost and speed
jobs -l: list the background jobs (e.g. the Hive services started from this shell) together with their PIDs
bg %1: resume job 1 in the background
fg %1: bring job 1 back to the foreground
kill -9 <PID>: force-kill a process by its process ID
If a script is named *.sh it does not need the #!/bin/bash shebang; place it in a bin directory on the PATH and grant it execute permission, and it can then be run without the ./ prefix or the sh command.
beeline -n <username> -p <password> -u <JDBC URL>
desc formatted user_location; -- show the table's metadata
1. Using window functions,