Development environment: Windows + IDEA
Production environment: Hadoop + MapReduce + HBase
This setup uses a Hadoop user and mainly exercises Hadoop + MapReduce. A real production system would also need data collection, data storage, and data visualization layers; HBase storage is well documented online, so it is not covered here. The main effort lies in data processing and analysis, which requires defining a large number of rules.
The walkthrough below is a demo: it covers only the basic rules, pseudo-distributed cluster storage, and so on.
------------------------------------------------------------------------------------------
1. Unpack the Hadoop archive:
tar -zxvf hadoop-2.7.1.tar.gz
2. Configure the following files: hadoop-env.sh, core-site.xml, and hdfs-site.xml. They all live under etc/hadoop inside the Hadoop installation directory. The required settings:
core-site.xml
<configuration>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/usr/local/hadoop/tmp</value>
        <description>A base for other temporary directories.</description>
    </property>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
hdfs-site.xml
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/usr/local/hadoop/tmp/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/usr/local/hadoop/tmp/dfs/data</value>
    </property>
</configuration>
hadoop-env.sh
export JAVA_HOME="/usr/java/jdk1.8.0_261"
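If the JDK path on the machine is not known, a common way to locate it (standard Linux commands; the path shown is just this guide's example):
readlink -f $(which java)
# e.g. /usr/java/jdk1.8.0_261/jre/bin/java -> use /usr/java/jdk1.8.0_261 as JAVA_HOME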
3. Initialize HDFS
3.1 In the hadoop-2.7.1 directory, run:
bin/hdfs namenode -format
3.2 Start the NameNode and DataNode daemons with:
sbin/start-dfs.sh
Note: if passwordless SSH login has not been set up, you will be prompted for a password during startup.
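A minimal passwordless-SSH setup for localhost, using standard OpenSSH commands (key paths are the usual defaults; adjust for your user), plus jps to confirm the daemons came up:
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
jps    # should list NameNode, DataNode, and SecondaryNameNode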
-------------------------------------------------------------------------------------------------------------------------------
1. Create a directory in HDFS to hold the local word.txt file. In the hadoop-2.7.1 directory, run:
[root@master hadoop]# ./bin/hdfs dfs -mkdir -p /input
[root@master hadoop]# ./bin/hdfs dfs -ls /
Found 2 items
drwxr-xr-x - root supergroup 0 2021-05-07 11:29 /input
drwxr-xr-x - root supergroup 0 2021-05-07 11:24 /test
Upload the local file:
[root@master hadoop]# ./bin/hdfs dfs -put /usr/local/hadoop/input/word.txt /input
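As a side note, the same upload can be done programmatically through the HDFS Java API. A minimal sketch (the class name HdfsUpload is made up for illustration; the paths mirror the command above):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: programmatic equivalent of the "hdfs dfs -put" above.
public class HdfsUpload {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000"); // matches core-site.xml
        FileSystem fs = FileSystem.get(conf);
        // Copy the local file into the /input directory in HDFS.
        fs.copyFromLocalFile(new Path("/usr/local/hadoop/input/word.txt"),
                             new Path("/input"));
        fs.close();
    }
}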
2. Run the job:
[root@master hadoop]# bin/hadoop jar wordCountSimple-1.0.jar WordCountDriver /input/word.txt /input/out
3. Verify that the job succeeded (the empty _SUCCESS marker file below indicates normal completion):
[root@master ~]# cd /usr/local/hadoop
[root@master hadoop]# bin/hdfs dfs -ls /input
Found 2 items
drwxr-xr-x - root supergroup 0 2021-05-07 12:08 /input/out
-rw-r--r-- 1 root supergroup 179 2021-05-07 11:29 /input/word.txt
[root@master hadoop]# bin/hdfs dfs -ls /input/out
Found 2 items
-rw-r--r-- 1 root supergroup 0 2021-05-07 12:08 /input/out/_SUCCESS
-rw-r--r-- 1 root supergroup 131 2021-05-07 12:08 /input/out/part-r-00000
[root@master hadoop]# bin/hadoop fs -cat /input/out/part-r-00000
3
" 4
""hello"""hello"" 2
"hello 3
"hello" 3
"hello"""hello"""hello"""hello"""hello"""hello"""hello"" 1
hello" 1
hello""hello"" 1
------------------------------------------------------------------------------------------------------------------------
Contents of word.txt:
" hello"
" hello""hello""
"hello ""hello"""hello""
"hello ""hello"""hello""
" "hello"
"hello"
"hello"""hello"""hello"""hello"""hello"""hello"""hello""
"hello"
"hello "
--------------------------------------------------------------------------------------------------------------------------
Java code
Maven dependencies (note: these pin Hadoop 2.6.5 while the cluster runs 2.7.1; in practice the client dependency versions should match the cluster version):
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.6.5</version>
</dependency>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Driver class: configures the job parameters and submits the job.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Set the mapper and reducer classes.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Output types of the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Delete the output directory if it already exists.
        Path path = new Path(args[1]);
        FileSystem fileSystem = path.getFileSystem(conf); // resolve the filesystem for this path
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true: delete recursively even if non-empty
        }
        // Input and output format components.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input data location and output path for this job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Number of reduce tasks to launch.
        job.setNumReduceTasks(1);
        // Submit the job and wait for completion.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
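One optional tweak, not part of the original driver: since summing counts is associative and the reducer's input and output types match, the reducer can double as a combiner to pre-aggregate on the map side and cut shuffle traffic. This would be one extra line in main():
// Optional: pre-aggregate (word, 1) pairs on the map side before the shuffle.
job.setCombinerClass(WordCountReducer.class);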
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

// Map: emit (word, 1) for every word.
// Input key: byte offset of the line; input value: the line's text.
// Output key: the word; output value: a count of 1.
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // The MapTask calls map() once for each (keyIn, valueIn) record it reads.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the text of the current line and split it on spaces.
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
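The odd-looking tokens in the sample output (entries full of quote characters, plus a count for the empty string) come from splitting on a single space while the input lines contain quotes and runs of spaces. If plain word counts are wanted, a variant of the map body (an illustration, not the original code) could split on any whitespace and strip quotes:
// Illustrative variant of the map body (not the original code):
// split on runs of whitespace and strip quote characters.
String[] words = value.toString().trim().split("\\s+");
for (String word : words) {
    String cleaned = word.replace("\"", ""); // drop embedded quotes
    if (!cleaned.isEmpty()) {                // skip empty tokens
        context.write(new Text(cleaned), new IntWritable(1));
    }
}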
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

// Input key: the word (map output key); input values: that word's counts.
// Output key: the word; output value: its total count.
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // The ReduceTask first groups the received (k, v) pairs by key
    // (equal keys form one group), then calls reduce() once per group.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
                          Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key, new IntWritable(count));
    }
}
In IDEA, run clean and then package to build the jar.
Place the built jar in the Hadoop installation (unpacked) directory.
bin/hadoop jar wordCountSimple-1.0.jar WordCountDriver /input/word.txt /input/out
/input/word.txt is the path in HDFS created when the local word.txt was uploaded.
/input/out is the output location; it can be browsed through the NameNode web UI at <ip>:50070,
or listed with: bin/hdfs dfs -ls /input/out
To view the detailed results: bin/hadoop fs -cat /input/out/part-r-00000
Both commands were shown above.
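For completeness, the result file can also be read programmatically via the HDFS API. A minimal sketch (the class name ReadResult is made up for illustration):
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: read the reducer output from HDFS, equivalent to
// "bin/hadoop fs -cat /input/out/part-r-00000".
public class ReadResult {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000"); // matches core-site.xml
        FileSystem fs = FileSystem.get(conf);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path("/input/out/part-r-00000"))))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // each line: word <TAB> count
            }
        }
        fs.close();
    }
}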