Development environment: Windows + IDEA
Production environment: Hadoop + MapReduce + HBase
This setup uses a Hadoop user and mainly exercises Hadoop + MapReduce. A real production system would also need data collection, data storage, and data visualization layers; HBase storage is well documented online, so it is not covered here. The main effort lies in data processing and analysis, which requires defining a large number of rules.
The walkthrough below is a demo: it covers only the basic rules, pseudo-distributed cluster storage, and so on.
------------------------------------------------------------------------------------------
1. Unpack the Hadoop archive:
tar -zxvf hadoop-2.7.1.tar.gz
2. Configure the following files: hadoop-env.sh, core-site.xml, and hdfs-site.xml. They all live under etc/hadoop inside the Hadoop installation directory. The required settings:
core-site.xml
<configuration>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/usr/local/hadoop/tmp</value>
        <description>A base for other temporary directories.</description>
    </property>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
hdfs-site.xml
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/usr/local/hadoop/tmp/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/usr/local/hadoop/tmp/dfs/data</value>
    </property>
</configuration>
hadoop-env.sh
export JAVA_HOME="/usr/java/jdk1.8.0_261"
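If the JDK path on the machine is not known, a common way to locate it (standard Linux commands; the path shown is just this guide's example):
readlink -f $(which java)
# e.g. /usr/java/jdk1.8.0_261/jre/bin/java -> use /usr/java/jdk1.8.0_261 as JAVA_HOME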
3. Initialize HDFS
3.1 In the hadoop-2.7.1 directory, run:
bin/hdfs namenode -format
3.2 Start the NameNode and DataNode daemons with:
sbin/start-dfs.sh
Note: if passwordless SSH login has not been set up, you will be prompted for a password during startup.
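A minimal passwordless-SSH setup for localhost, using standard OpenSSH commands (key paths are the usual defaults; adjust for your user), plus jps to confirm the daemons came up:
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
jps    # should list NameNode, DataNode, and SecondaryNameNode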
-------------------------------------------------------------------------------------------------------------------------------
1. Create a directory in HDFS to hold the local word.txt file. In the hadoop-2.7.1 directory, run:
[root@master hadoop]# ./bin/hdfs dfs -mkdir -p /input
[root@master hadoop]# ./bin/hdfs dfs -ls /
Found 2 items
drwxr-xr-x - root supergroup 0 2021-05-07 11:29 /input
drwxr-xr-x - root supergroup 0 2021-05-07 11:24 /test
Upload the local file:
[root@master hadoop]# ./bin/hdfs dfs -put /usr/local/hadoop/input/word.txt /input
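As a side note, the same upload can be done programmatically through the HDFS Java API. A minimal sketch (the class name HdfsUpload is made up for illustration; the paths mirror the command above):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: programmatic equivalent of the "hdfs dfs -put" above.
public class HdfsUpload {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000"); // matches core-site.xml
        FileSystem fs = FileSystem.get(conf);
        // Copy the local file into the /input directory in HDFS.
        fs.copyFromLocalFile(new Path("/usr/local/hadoop/input/word.txt"),
                             new Path("/input"));
        fs.close();
    }
}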
2. Run the job:
[root@master hadoop]# bin/hadoop jar wordCountSimple-1.0.jar WordCountDriver /input/word.txt /input/out
3. Verify that the job succeeded (the empty _SUCCESS marker file below indicates normal completion):
[root@master ~]# cd /usr/local/hadoop
[root@master hadoop]# bin/hdfs dfs -ls /input
Found 2 items
drwxr-xr-x - root supergroup 0 2021-05-07 12:08 /input/out
-rw-r--r-- 1 root supergroup 179 2021-05-07 11:29 /input/word.txt
[root@master hadoop]# bin/hdfs dfs -ls /input/out
Found 2 items
-rw-r--r-- 1 root supergroup 0 2021-05-07 12:08 /input/out/_SUCCESS
-rw-r--r-- 1 root supergroup 131 2021-05-07 12:08 /input/out/part-r-00000
[root@master hadoop]# bin/hadoop fs -cat /input/out/part-r-00000
3
" 4
""hello"""hello"" 2
"hello 3
"hello" 3
"hello"""hello"""hello"""hello"""hello"""hello"""hello"" 1
hello" 1
hello""hello"" 1
------------------------------------------------------------------------------------------------------------------------
Contents of word.txt:
" hello"
" hello""hello""
"hello ""hello"""hello""
"hello ""hello"""hello""
" "hello"
"hello"
"hello"""hello"""hello"""hello"""hello"""hello"""hello""
"hello"
"hello "
--------------------------------------------------------------------------------------------------------------------------
Java code
Maven dependencies (note: these pin Hadoop 2.6.5 while the cluster runs 2.7.1; in practice the client dependency versions should match the cluster version):
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.5</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.6.5</version>
</dependency>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Driver class: configures the job parameters and submits the job.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Set the mapper and reducer classes.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Output types of the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Delete the output directory if it already exists.
        Path path = new Path(args[1]);
        FileSystem fileSystem = path.getFileSystem(conf); // resolve the filesystem for this path
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true: delete recursively even if non-empty
        }
        // Input and output format components.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input data location and output path for this job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Number of reduce tasks to launch.
        job.setNumReduceTasks(1);
        // Submit the job and wait for completion.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
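One optional tweak, not part of the original driver: since summing counts is associative and the reducer's input and output types match, the reducer can double as a combiner to pre-aggregate on the map side and cut shuffle traffic. This would be one extra line in main():
// Optional: pre-aggregate (word, 1) pairs on the map side before the shuffle.
job.setCombinerClass(WordCountReducer.class);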
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

// Map: emit (word, 1) for every word.
// Input key: byte offset of the line; input value: the line's text.
// Output key: the word; output value: a count of 1.
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // The MapTask calls map() once for each (keyIn, valueIn) record it reads.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the text of the current line and split it on spaces.
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
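The odd-looking tokens in the sample output (entries full of quote characters, plus a count for the empty string) come from splitting on a single space while the input lines contain quotes and runs of spaces. If plain word counts are wanted, a variant of the map body (an illustration, not the original code) could split on any whitespace and strip quotes:
// Illustrative variant of the map body (not the original code):
// split on runs of whitespace and strip quote characters.
String[] words = value.toString().trim().split("\\s+");
for (String word : words) {
    String cleaned = word.replace("\"", ""); // drop embedded quotes
    if (!cleaned.isEmpty()) {                // skip empty tokens
        context.write(new Text(cleaned), new IntWritable(1));
    }
}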
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

// Input key: the word (map output key); input values: that word's counts.
// Output key: the word; output value: its total count.
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // The ReduceTask first groups the received (k, v) pairs by key
    // (equal keys form one group), then calls reduce() once per group.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
                          Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key, new IntWritable(count));
    }
}
In IDEA, run clean and then package to build the jar.
Place the built jar in the Hadoop installation (unpacked) directory.
bin/hadoop jar wordCountSimple-1.0.jar WordCountDriver /input/word.txt /input/out
/input/word.txt is the path in HDFS created when the local word.txt was uploaded.
/input/out is the output location; it can be browsed through the NameNode web UI at <ip>:50070,
or listed with: bin/hdfs dfs -ls /input/out
To view the detailed results: bin/hadoop fs -cat /input/out/part-r-00000
Both commands were shown above.
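For completeness, the result file can also be read programmatically via the HDFS API. A minimal sketch (the class name ReadResult is made up for illustration):
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: read the reducer output from HDFS, equivalent to
// "bin/hadoop fs -cat /input/out/part-r-00000".
public class ReadResult {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000"); // matches core-site.xml
        FileSystem fs = FileSystem.get(conf);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path("/input/out/part-r-00000"))))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // each line: word <TAB> count
            }
        }
        fs.close();
    }
}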