1. Prerequisites
1.1 Linux basics
- Features of the Linux kernel; installing Linux in VMware
- CentOS: known for stability
- Commonly used directories: /bin, /usr, /etc
- Xshell: use Xshell to operate CentOS remotely
- Text editors: vi/vim
1.2 Common commands
- Help: man
- Directories: mkdir, rmdir, mv, ls, rm -rf, cd
- Files: touch/vi, cat, cp, rm, more, grep
- Search: which, whereis, find
- Date/time: date, date -s
- User and group management: useradd…, groupadd…
- Processes: ps -ef; kill -9 <pid>; pkill -f <process pattern>
- Network: netstat -aux
- Disk: df
- Compression and extraction: zip, unzip, tar
  - tar -zcvf to compress
  - tar -zxvf to extract
- Software packages: yum
  - yum list
  - yum install
  - yum remove
  - rpm -ivh / -evh: awareness only
- Upload/download (lrzsz): rz, sz
- Scheduled tasks: crontab -e
  - fields: minute, hour, day of month, month, day of week (e.g. 0 3 * * 1 runs at 03:00 every Monday)
  - crontab -l: list entries
  - crontab -r: remove all entries
1.3 Shell scripts
- Variables: assign with x=value, reference with $x
- Arithmetic: $[3+6] (equivalently $((3+6)))
- Conditionals: if [ condition ]; then commands; fi
- Loops:
for ((i=0; i<10; i++)); do commands; done
for x in list; do commands; done
while [ condition ]; do commands; done
- Functions: define with function fun(){ commands; }, then call it with fun
2. Hadoop setup on Windows
- Extract the Hadoop archive
- Set HADOOP_HOME to the extracted directory
- Add the bin and sbin directories to PATH
- Test:
hadoop version
3. Building a Hadoop cluster on Linux
Cluster members:

| Host | HDFS | YARN |
| --- | --- | --- |
| master | NameNode, SecondaryNameNode | ResourceManager |
| slave1 | DataNode | NodeManager |
| slave2 | DataNode | NodeManager |
3.1 Installing JDK 8 and Hadoop 3.2.1
- Upload the archives and extract them (under /usr)
- Set environment variables (/etc/profile):
export JAVA_HOME=/usr/jdk8
export HADOOP_HOME=/usr/hadoop321
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
- Reload the profile:
. /etc/profile
- Test:
hadoop version
3.2 HDFS configuration
- core-site.xml:
<property>
  <name>fs.defaultFS</name>
  <value>hdfs://master:9000</value>
</property>
- hdfs-site.xml:
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>
<property>
  <name>dfs.http.address</name>
  <value>0.0.0.0:5700</value>
</property>
<property>
  <name>dfs.namenode.name.dir</name>
  <value>file:///root/hadoop/dfs/namenode</value>
</property>
<property>
  <name>dfs.datanode.data.dir</name>
  <value>file:///root/hadoop/dfs/datanode</value>
</property>
<property>
  <name>dfs.webhdfs.enabled</name>
  <value>true</value>
</property>
- Format the NameNode:
hdfs namenode -format
- In start-dfs.sh and stop-dfs.sh, set the run-as users:
# set users
HDFS_NAMENODE_USER=root
HDFS_DATANODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
- In hadoop-env.sh:
export JAVA_HOME=/usr/jdk8
3.3 Cluster member configuration
- Bind hostnames to IPs (/etc/hosts):
192.168.85.129 master
192.168.85.130 slave1
192.168.85.131 slave2
- Configure the worker nodes (/usr/hadoop321/etc/hadoop/workers):
slave1
slave2
- Set the replication factor to the number of DataNodes (hdfs-site.xml):
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>
3.4 YARN configuration
- yarn-site.xml:
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.resourcemanager.hostname</name>
  <value>master</value>
</property>
<property>
  <name>yarn.resourcemanager.webapp.address</name>
  <value>master:8088</value>
</property>
<property>
  <name>yarn.application.classpath</name>
  <value>/usr/hadoop321/etc/hadoop:/usr/hadoop321/share/hadoop/common/lib/*:/usr/hadoop321/share/hadoop/common/*:/usr/hadoop321/share/hadoop/hdfs:/usr/hadoop321/share/hadoop/hdfs/lib/*:/usr/hadoop321/share/hadoop/hdfs/*:/usr/hadoop321/share/hadoop/mapreduce/lib/*:/usr/hadoop321/share/hadoop/mapreduce/*:/usr/hadoop321/share/hadoop/yarn:/usr/hadoop321/share/hadoop/yarn/lib/*:/usr/hadoop321/share/hadoop/yarn/*</value>
</property>
- mapred-site.xml:
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
- In start-yarn.sh and stop-yarn.sh, set the run-as users:
YARN_RESOURCEMANAGER_USER=root
YARN_NODEMANAGER_USER=root
3.5 Cloning CentOS
- Change the hostname:
hostnamectl set-hostname <hostname>
- Delete the files under /tmp so the cloned DataNode shows up in the web UI (note: the firewall must be off):
systemctl stop firewalld (stop it now)
systemctl disable firewalld (do not start on boot)
3.6 Passwordless SSH from master to the slaves
- Generate a key pair in root's home directory:
ssh-keygen
- Append the public key to authorized_keys and copy it to each slave's .ssh directory:
cat id_rsa.pub >> authorized_keys
scp authorized_keys root@slave1:/root/.ssh
3.7 Starting the Hadoop cluster
- Start everything from master:
start-all.sh
- Verify the daemons on each node (master should show NameNode, SecondaryNameNode, and ResourceManager; the slaves should show DataNode and NodeManager):
jps
- Inspect the cluster's nodes:
hdfs dfsadmin -report
4. MapReduce examples
4.1 Word count (getting started)
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * English word count
 */
public class WordCounter {
    // mapper: split each line into words
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        public static Text text = new Text();
        public static IntWritable intWritable = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String v = value.toString();
            String[] words = v.split(" ");
            for (String word : words) {
                text.set(word);
                context.write(text, intWritable);
            }
        }
    }

    // reducer: sum the counts for each word
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            // job setup
            Job job = Job.getInstance(conf);
            job.setJobName("firstJob");
            job.setJarByClass(WordCounter.class);
            // set mapper and reducer
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // set output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // set input and output directories
            FileInputFormat.setInputPaths(job, "data6");
            FileOutputFormat.setOutputPath(job, new Path("dTemp"));
            // run the job and close it
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
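A standard refinement not in the original notes: because the reduce step here is a plain associative sum, the same MyReducer class can also be registered as a map-side combiner with job.setCombinerClass(MyReducer.class), which pre-aggregates counts before the shuffle and cuts network traffic.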
4.2 Chinese word-segmentation count (IK Analyzer)
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Chinese word count
 */
public class CNWordCounter {
    // mapper: segment Chinese text with the IK segmenter
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        public static Text text = new Text();
        public static IntWritable intWritable = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            byte[] bytes = value.toString().getBytes();
            ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
            InputStreamReader isReader = new InputStreamReader(bis);
            IKSegmenter ikSegmenter = new IKSegmenter(isReader, true); // true = smart segmentation mode
            Lexeme lexeme = null;
            while ((lexeme = ikSegmenter.next()) != null) {
                String word = lexeme.getLexemeText();
                text.set(word);
                context.write(text, intWritable);
            }
        }
    }

    // reducer: sum the counts, buffering results so they can be sorted at the end
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        public static Text text = new Text();
        public static List<Record> list = new ArrayList<Record>(); // Record: see the sketch after this listing

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            // context.write(key, new IntWritable(count));
            Record record = new Record(key.toString(), count);
            list.add(record);
        }

        // cleanup runs once after all reduce calls: sort by count, most frequent first, then emit
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Collections.sort(list);
            Collections.reverse(list);
            for (Record record : list) {
                text.set(record.getWord());
                context.write(text, new IntWritable(record.getCount()));
            }
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf);
            job.setJobName("secondJob");
            job.setJarByClass(CNWordCounter.class);
            // set mapper and reducer
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // set output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // set input and output directories
            FileInputFormat.setInputPaths(job, "/test99/data2");
            FileOutputFormat.setOutputPath(job, new Path("/test99/out"));
            // run the job and close it
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
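The Record helper class used by MyReducer is referenced but never defined in these notes. Below is a minimal sketch consistent with how it is used above (a constructor taking word and count, getters, and a natural ordering by count so that Collections.sort followed by Collections.reverse yields descending frequency); the original implementation may have differed:

// Hypothetical reconstruction of the Record helper referenced above
public class Record implements Comparable<Record> {
    private final String word;
    private final int count;

    public Record(String word, int count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() { return word; }
    public int getCount() { return count; }

    // ascending by count, so sort + reverse gives the most frequent words first
    @Override
    public int compareTo(Record other) {
        return Integer.compare(this.count, other.count);
    }
}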
4.3 Data cleaning (deduplicate, drop empty, drop invalid)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Data cleaning: drop empty lines, deduplicate, drop invalid records
 */
public class DataClear {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf);
            job.setJobName("clearJob");
            job.setJarByClass(DataClear.class);
            // mapper and reducer (sketched after this listing); the reducer registration
            // is an assumption, since the original notes break off below
            job.setMapperClass(RemoveReplyMapper.class);
            job.setReducerClass(DedupReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // input and output directories; "clearOut" is a placeholder for the lost original output path
            FileInputFormat.setInputPaths(job, "data4");
            FileOutputFormat.setOutputPath(job, new Path("clearOut"));
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
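The notes break off mid-listing: RemoveReplyMapper is referenced but never shown, and everything after the input path is lost. The following is a hedged sketch matching the section title (drop empty lines, drop malformed records, deduplicate); both classes would sit inside DataClear next to main, and the comma-based validity check is a placeholder since the real record format is not given:

// Hypothetical sketch; requires these additional imports: java.io.IOException,
// org.apache.hadoop.io.LongWritable, org.apache.hadoop.mapreduce.Mapper, org.apache.hadoop.mapreduce.Reducer
public static class RemoveReplyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // drop empty lines
        }
        if (line.split(",").length < 2) {
            return; // placeholder validity check: drop records that do not parse
        }
        context.write(new Text(line), NullWritable.get());
    }
}

public static class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // identical lines arrive grouped under one key; writing the key once removes duplicates
        context.write(key, NullWritable.get());
    }
}

Emitting each clean line as the map output key makes the shuffle do the heavy lifting: duplicates collapse into a single reducer call, so the reducer only has to write each distinct line once.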