Hadoop

HDFS Installation (Pseudo-Distributed Cluster)

  • Prepare one virtual machine

OS version: CentOS 7

  • Configure the network
# If you need a second NIC, add the extra network adapter to the VM first
vi /etc/sysconfig/network-scripts/ifcfg-ens33

1. Change the IP assignment method to static (BOOTPROTO=static)
2. Add IPADDR=<static IP address>
3. Add NETMASK=255.255.255.0
4. Bring the NIC up on boot: ONBOOT=yes
  • Disable the firewall
[root@bogon ~]# systemctl stop firewalld
[root@bogon ~]# systemctl disable firewalld
  • Change the hostname
[root@bogon ~]# vi /etc/hostname
HadoopNode00
  • Configure the hosts mapping
[root@HadoopNode00 ~]# vi /etc/hosts
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.191.100  HadoopNode00

[root@HadoopNode00 ~]# ping HadoopNode00
  • Passwordless SSH login
[root@HadoopNode00 ~]# ssh-keygen -t rsa   # generate the key pair
[root@HadoopNode00 ~]# ssh-copy-id HadoopNode00
  • Install JDK 8
[root@HadoopNode00 ~]# rpm -ivh jdk-8u171-linux-x64.rpm
Preparing...                          ################################# [100%]
Updating / installing...
   1:jdk1.8-2000:1.8.0_171-fcs        ################################# [100%]
Unpacking JAR files...
        tools.jar...
        plugin.jar...
        javaws.jar...
        deploy.jar...
        rt.jar...
        jsse.jar...
        charsets.jar...
        localedata.jar...
  • Extract Hadoop
[root@HadoopNode00 ~]# tar -zxf hadoop-2.9.2.tar.gz -C /usr
  • Edit the HDFS configuration files
[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/core-site.xml
<!-- NameNode access endpoint -->
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://HadoopNode00:9000</value>
</property>
<!-- HDFS working base directory -->
<property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/hadoop-2.9.2/hadoop-${user.name}</value>
</property>



[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/hdfs-site.xml

<!-- block replication factor -->
<property>
    <name>dfs.replication</name>
    <value>1</value>
</property>
<!-- host running the Secondary NameNode -->
<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>HadoopNode00:50090</value>
</property>
<!-- maximum number of files a DataNode can transfer concurrently -->
<property>
        <name>dfs.datanode.max.xcievers</name>
        <value>4096</value>
</property>
<!-- DataNode handler thread count (parallelism) -->
<property>
        <name>dfs.datanode.handler.count</name>
        <value>6</value>
</property>



[root@HadoopNode00 hadoop-2.9.2]# vim /usr/hadoop-2.9.2/etc/hadoop/slaves
HadoopNode00
  • Configure the JDK and Hadoop environment variables
[root@HadoopNode00 hadoop-2.9.2]# vim /root/.bashrc

export HADOOP_HOME=/usr/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

[root@HadoopNode00 hadoop-2.9.2]# source /root/.bashrc
  • Format the NameNode
[root@HadoopNode00 ~]# hdfs namenode -format
  • Start / stop
start-dfs.sh   # start HDFS
stop-dfs.sh    # stop HDFS
  • Web UI
http://<hostname>:50070

HDFS Shell

  • Upload a file
[root@HadoopNode00 ~]# hadoop fs -put  /root/install.log  /1.txt
  • List files
[root@HadoopNode00 ~]# hadoop fs -ls /
Found 1 items
-rw-r--r--   1 root supergroup       8901 2019-09-17 23:28 /1.txt
  • Download a file
[root@HadoopNode00 ~]# hadoop fs -get  /1.txt /root/user.txt
  • Delete a file
[root@HadoopNode00 ~]# hadoop fs -rm /2.txt
  • View a file
[root@HadoopNode00 ~]# hadoop fs -cat /1.txt
  • Create a directory
[root@HadoopNode00 ~]# hadoop fs -mkdir /user
  • Copy a file
[root@HadoopNode00 ~]# hadoop fs -cp /1.txt /user/
  • Enable the trash (recycle bin) mechanism in core-site.xml (the value is the retention time in minutes)
<property>
    <name>fs.trash.interval</name>
    <value>1</value>
</property>
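
With trash enabled, hadoop fs -rm moves files into the current user's .Trash directory instead of deleting them immediately. Deletes issued through the Java API bypass the trash unless they go through the Trash helper class; a minimal sketch (the class name and path are placeholders, and it assumes the core-site.xml above is on the classpath):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;

public class TrashDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.addResource("core-site.xml");   // must contain fs.trash.interval > 0
        FileSystem fs = FileSystem.get(conf);

        // moves /1.txt into the user's .Trash directory instead of removing it outright
        Trash.moveToAppropriateTrash(fs, new Path("/1.txt"), conf);
    }
}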

Java API

Dependencies

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.9.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.9.2</version>
</dependency>

Configuring the Hadoop Environment on Windows

  • Extract Hadoop to a directory of your choice
  • Copy hadoop.dll and winutils.exe into the hadoop/bin directory
  • Configure the Hadoop environment variables
  • Configure the hostname-to-IP mapping

Workarounds for Insufficient Permissions

  • -DHADOOP_USER_NAME=root

  • System.setProperty("HADOOP_USER_NAME", "root");

  • <property>
    <name>dfs.permissions.enabled</name>
    <value>false</value>
    </property>
    

Common Operations

public class App {

    private Configuration configuration;
    private FileSystem fileSystem;

    @Before
    public void getClient() throws Exception {

        System.setProperty("HADOOP_USER_NAME", "root");
        /*
         * Prepare the configuration object
         * */
        configuration = new Configuration();
        /*
         * Add the relevant configuration files */
        configuration.addResource("core-site.xml");
        configuration.addResource("hdfs-site.xml");

        /*
         * Obtain the client object via FileSystem.newInstance */
        fileSystem = FileSystem.newInstance(configuration);
    }

    @Test
    public void testUpload01() throws Exception {
        /*
         *
         * source file  |  destination file
         * as Path objects
         * */
        fileSystem.copyFromLocalFile(new Path("G:\\A.docx"), new Path("/user/2.docx"));
    }

    @Test
    public void testUpload02() throws Exception {

        /*
         * Open a local input stream
         * */
        FileInputStream inputStream = new FileInputStream("G:\\A.docx");

        /*
         * Create an HDFS output stream
         * */
        FSDataOutputStream outputStream = fileSystem.create(new Path("/user/3.docx"));

        /*
         * Copy with the IOUtils utility
         * */
        IOUtils.copyBytes(inputStream, outputStream, 1024, true);
    }

    @Test
    public void testDownload01() throws Exception {

        fileSystem.copyToLocalFile(false, new Path("/1.txt"), new Path("G:\\3.txt"), true);
    }

    @Test
    public void testDownload02() throws Exception {

        FileOutputStream outputStream = new FileOutputStream("G:\\4.txt");

        FSDataInputStream inputStream = fileSystem.open(new Path("/1.txt"));
        IOUtils.copyBytes(inputStream, outputStream, 1024, true);
    }

    @Test
    public void test011() throws IOException {

        RemoteIterator<LocatedFileStatus> list = fileSystem.listFiles(new Path("/"), true);

        while (list.hasNext()) {

            LocatedFileStatus locatedFileStatus = list.next();
            Path path = locatedFileStatus.getPath();
            System.out.println(path.toString());
        }
    }

    @Test
    public void test02() throws Exception{

        fileSystem.delete(new Path("/user"),false);
    }
    @Test
    public void test03() throws Exception{

        boolean exists = fileSystem.exists(new Path("/1.txt"));
        if (exists){
            System.out.println("文件存在");
        }else {
            System.out.println("文件不存在");
        }
    }

    @Test
    public void test04() throws Exception{

        fileSystem.mkdirs(new Path("/user1"));
    }
}

YARN Setup

Configure YARN

  • etc/hadoop/yarn-site.xml
 <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
 <property>
    <description>The hostname of the RM.</description>
    <name>yarn.resourcemanager.hostname</name>
    <value>HadoopNode00</value>
  </property> 
  • etc/hadoop/mapred-site.xml

This file does not exist under etc/hadoop/ by default; there is a template file (mapred-site.xml.template), so just copy or rename it to mapred-site.xml.

<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>

Start YARN

[root@HadoopNode00 ~]# start-yarn.sh

MR

Dependencies

  
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
    <version>2.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.9.2</version>
</dependency>

Mapper

/*
 * keyIn   LongWritable (Long): byte offset of the input line
 * valueIn Text (String): one line of input text
 * keyOut  Text (String)
 * valueOut IntWritable (int)
 * */
public class WCMapper  extends Mapper<LongWritable, Text,Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] names = value.toString().split(" ");
        for (String name : names) {
            context.write(new Text(name),new IntWritable(1));
        }
    }
}

Reducer

/*
 * keyIn Text: matches the mapper's keyOut type
 * valueIn IntWritable: matches the mapper's valueOut type
 * keyOut
 * valueOut
 * */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Job

public class JobRunner {
    public static void main(String[] args) throws Exception {

        /*
         * Get the configuration object
         * */

        Configuration conf = new Configuration();

        /*
         * Get the Job object
         * */
        Job job = Job.getInstance(conf);

        /*
         * Set the input/output format components
         * */
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        /*
         * Set the input/output paths
         * */
        TextInputFormat.setInputPaths(job, new Path("/wordcount.txt"));
        /*
         * Note: the output path must not already exist
         * */
        TextOutputFormat.setOutputPath(job, new Path("/test/out1"));


        /*
         * Set the Map and Reduce processing logic
         * */
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        /*
         * Set the output key/value types of the map and reduce tasks
         * */
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);


        // submit the job
        //job.submit();
        job.waitForCompletion(true);
    }
}

Deployment and Execution

Remote Jar Deployment

 job.setJarByClass(JobRunner.class);

Package the project into a jar

Run: hadoop jar <jar file name> <main class>

[root@HadoopNode00 ~]# hadoop jar Hadoop_Test-1.0-SNAPSHOT.jar com.zzj.mr.test01.JobRunner

Local Simulation

 <dependency>
     <groupId>log4j</groupId>
     <artifactId>log4j</artifactId>
     <version>1.2.17</version>
</dependency>

log4j.properties

### root logger ###
log4j.rootLogger = info,console

### console appender ###
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern =  %p %d{yyyy-MM-dd HH:mm:ss} %c %m%n

Cross-Platform Submission

Copy the relevant configuration files into the resources directory:

  • core-site.xml
  • hdfs-site.xml
  • yarn-site.xml
  • mapred-site.xml
        System.setProperty("HADOOP_USER_NAME", "root");

        conf.addResource("conf2/core-site.xml");
        conf.addResource("conf2/hdfs-site.xml");
        conf.addResource("conf2/mapred-site.xml");
        conf.addResource("conf2/yarn-site.xml");
        conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");

Configure mapred-site.xml

 <property>
    <name>mapreduce.app-submission.cross-platform</name>
    <value>true</value>
  </property>

Or set it in code:

  conf.set("mapreduce.app-submission.cross-platform", "true");

Custom Bean Object (Writable)

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {

    private String phone;
    private Long upFlow;
    private Long downFlow;
    private Long sumFlow;

    public FlowBean() {
    }

    public FlowBean(String phone, Long upFlow, Long downFlow, Long sumFlow) {
        this.phone = phone;
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = sumFlow;
    }


    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public Long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Long upFlow) {
        this.upFlow = upFlow;
    }

    public Long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Long downFlow) {
        this.downFlow = downFlow;
    }

    public Long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(Long sumFlow) {
        this.sumFlow = sumFlow;
    }


    @Override
    public String toString() {
        return "" +
                "phone='" + phone + '\'' +
                " upFlow=" + upFlow +
                " downFlow=" + downFlow +
                " sumFlow=" + sumFlow ;
    }

    /*
     * Serialization (encode)
     * */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.phone);
        dataOutput.writeLong(this.upFlow);
        dataOutput.writeLong(this.downFlow);
        dataOutput.writeLong(this.sumFlow);
    }


    /*
     * Deserialization (decode)
     * */
    public void readFields(DataInput dataInput) throws IOException {

        this.phone = dataInput.readUTF();
        this.upFlow = dataInput.readLong();
        this.downFlow = dataInput.readLong();
        this.sumFlow = dataInput.readLong();
    }
}

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    /*
     * 18611781163 700000 10000
     * 18611781163 700000 10000
     * 18611781163 700000 10000
     * */

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();
        String[] data = line.split(" ");
        /*
         *  phone
         * */
        context.write(new Text(data[0]), new FlowBean(data[0], Long.valueOf(data[1]), Long.valueOf(data[2]), (Long.valueOf(data[1]) + Long.valueOf(data[2]))));

    }
}
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
 * 18611781163  FlowBean[]
 *
 * */
public class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

        Long up = 0L;
        Long down = 0L;
        Long sum = 0L;

        for (FlowBean flowBean : values) {
            up += flowBean.getUpFlow();
            down += flowBean.getDownFlow();
            sum += flowBean.getSumFlow();
        }
       context.write(NullWritable.get(), new FlowBean(key.toString(), up, down, sum));
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FlowRunner {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");

        Configuration conf = new Configuration();
        conf.addResource("conf2/core-site.xml");
        conf.addResource("conf2/hdfs-site.xml");
        conf.addResource("conf2/mapred-site.xml");
        conf.addResource("conf2/yarn-site.xml");
        conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");
        conf.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(conf);


        job.setJarByClass(FlowRunner.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);


        TextInputFormat.setInputPaths(job, new Path("/flow.dat"));

        TextOutputFormat.setOutputPath(job, new Path("/test/out333"));
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);


        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(FlowBean.class);

        job.waitForCompletion(true);

    }
}

MR Components

InputFormat

Categories
  • FileInputFormat

    • TextInputFormat

      • key: LongWritable, byte offset of the line
      • value: Text, the line of text

      Splits: files are split individually, so there are at least as many splits as input files

    • NLineInputFormat

      • key: LongWritable, byte offset of the line
      • value: Text, the line of text

      Splits: every N lines form one split (default: 1 line per split); configurable via

      conf.set("mapreduce.input.lineinputformat.linespermap", "10")

    NLineInputFormat.setNumLinesPerSplit();

    • CombineTextInputFormat

      • key: LongWritable, byte offset of the line

      • value: Text, the line of text

      Splits: cut according to the configured split size; one split may cover multiple blocks

      CombineTextInputFormat.setMaxInputSplitSize();
      

    CombineTextInputFormat.setMinInputSplitSize();

    • SequenceFileInputFormat

      • key: the file name
      • value: the file contents
  • DBInputFormat (databases)

  • TableInputFormat(HBase)

NLineInputFormat
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class JobRunner {
    public static void main(String[] args) throws Exception {
        // ... (job setup omitted; see the sketch below)
        job.setInputFormatClass(NLineInputFormat.class);
        // ... (rest omitted)
    }
}
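
The elided job setup above also needs the lines-per-split setting; a minimal sketch of that part (10 lines per split is only an example value, equivalent to the conf.set shown earlier):

        // inside main, after Job job = Job.getInstance(conf); (sketch)
        job.setInputFormatClass(NLineInputFormat.class);
        // every 10 input lines form one split (example value)
        NLineInputFormat.setNumLinesPerSplit(job, 10);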
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.io.Serializable;

/*
 * keyIn   LongWritable (Long): byte offset of the input line
 * valueIn Text (String): one line of input text
 * keyOut  Text (String)
 * valueOut IntWritable (int)
 * */

public class WCMapper  extends Mapper<LongWritable, Text,Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String[] names = value.toString().split(" ");
        for (String name : names) {
            context.write(new Text(name),new IntWritable(1));
        }
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


/*
 * keyIn Text: matches the mapper's keyOut type
 * valueIn IntWritable: matches the mapper's valueOut type
 * keyOut
 * valueOut
 * */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
CombineTextInputFormat

Optimizes jobs that process many small files.

public class JobRunner {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobRunner.class);

        /*
         * Set the input/output format components
         * */
        job.setInputFormatClass(CombineTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        CombineTextInputFormat.setMinInputSplitSize(job, 1048576);
        CombineTextInputFormat.setInputPaths(job, new Path("G:\\Note\\Hadoop\\数据文件\\data"));
      
        TextOutputFormat.setOutputPath(job, new Path("G:\\Note\\Hadoop\\数据文件\\out111122"));

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
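
The setMaxInputSplitSize() mentioned earlier can be added next to the minimum; a minimal sketch (the 4 MB cap is only an illustrative value):

        // cap the size of each combined split (sets mapreduce.input.fileinputformat.split.maxsize)
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);   // 4 MB, example value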
DBInputFormat
public class JobRunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", "jdbc:mysql://hadoopnode00:3306/hadoop", "root", "1234");
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobRunner.class);

        job.setInputFormatClass(DBInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        DBInputFormat.setInput(job, User.class, "select id,name from user", "select count(1) from user");
        FileOutputFormat.setOutputPath(job, new Path("G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\src\\main\\java\\com\\zzj\\DBInputFormat\\out1"));


        job.setMapperClass(DBMapper.class);
        job.setReducerClass(DBReducer.class);


        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class DBMapper extends Mapper<LongWritable, User, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, User value, Context context) throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class DBReducer extends Reducer<LongWritable, Text, NullWritable, Text> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(NullWritable.get(), value);
        }
    }
}
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.lib.db.DBWritable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class User implements Writable, DBWritable {

    int id;
    String name;

    public User() {
    }

    public User(int id, String name) {
        this.id = id;
        this.name = name;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return "User{" +
                "id=" + id +
                ", name='" + name + '\'' +
                '}';
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.id);
        dataOutput.writeUTF(this.name);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.id = dataInput.readInt();
        this.name = dataInput.readUTF();
    }

    public void write(PreparedStatement preparedStatement) throws SQLException {
        preparedStatement.setInt(1, this.id);
        preparedStatement.setString(2, this.name);
    }

    public void readFields(ResultSet resultSet) throws SQLException {
        this.id = resultSet.getInt(1);
        this.name = resultSet.getString(2);
    }
}

  • If running locally, add the MySQL driver dependency:
      <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
  • If running on YARN, put the MySQL driver jar into /usr/hadoop-2.9.2/share/hadoop/yarn/ on HadoopNode00
  • If submitting remotely:
        System.setProperty("HADOOP_USER_NAME", "root");
        conf.addResource("conf2/core-site.xml");
        conf.addResource("conf2/hdfs-site.xml");
        conf.addResource("conf2/mapred-site.xml");
        conf.addResource("conf2/yarn-site.xml");
        conf.set(MRJobConfig.JAR, "G:\\IDEA_WorkSpace\\BigData\\Hadoop_Test\\target\\Hadoop_Test-1.0-SNAPSHOT.jar");
        conf.set("mapreduce.app-submission.cross-platform", "true");
Custom InputFormat

To solve the small-file storage problem, many small files are packed into a single SequenceFile (a Hadoop file format that stores binary key-value pairs): each file's path is stored as the key and its contents as the value.

public class JobRunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobRunner.class);

        job.setInputFormatClass(OwnInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
   
        OwnInputFormat.setInputPaths(job, new Path("G:\\Note\\Hadoop\\数据文件\\data"));
       
        SequenceFileOutputFormat.setOutputPath(job, new Path("G:\\Note\\Hadoop\\数据文件\\out12313"));

        job.setMapperClass(FileMapper.class);
        job.setReducerClass(FileReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        job.waitForCompletion(true);
    }
}
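
The FileMapper and FileReducer referenced by this JobRunner are not shown in these notes; a minimal sketch, assuming they simply forward the (path, bytes) pairs produced by the custom reader into the SequenceFile output:

// FileMapper.java (sketch)
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // key = file path, value = whole file contents, as emitted by OwnRecordReader
        context.write(key, value);
    }
}

// FileReducer.java (sketch)
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        for (BytesWritable value : values) {
            context.write(key, value);
        }
    }
}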
public class OwnRecordReader extends RecordReader<Text, BytesWritable> {

    FileSplit fileSplit;
    Configuration conf;
    BytesWritable value = new BytesWritable();
    Text key = new Text();

    boolean isProgress = true;

    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {

        fileSplit = (FileSplit) inputSplit;

        conf = taskAttemptContext.getConfiguration();
    }

    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (isProgress) {
            byte[] bytes = new byte[(int) fileSplit.getLength()];
            
            // obtain the FileSystem object
            /*
             * path of the current file
             * */
            Path path = fileSplit.getPath();

            FileSystem fileSystem = path.getFileSystem(conf);

            /*
             * Open the file's input stream
             * */
            FSDataInputStream inputStream = fileSystem.open(path);

            IOUtils.readFully(inputStream, bytes, 0, bytes.length);

            /*
             * Fill the value with the bytes
             * */
            value.set(bytes, 0, bytes.length);
            
            key.set(path.toString());

            IOUtils.closeStream(inputStream);

            isProgress = false;

            return true;

        }
        return false;
    }

    public Text getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }

    public void close() throws IOException {
    }
}
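
The OwnInputFormat class that plugs this RecordReader into the job is not included in these notes; a minimal sketch, assuming each small file must be read as a whole (so splitting is disabled):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class OwnInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // each small file is read whole, so never split it
        return false;
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // the framework calls initialize() on the returned reader
        return new OwnRecordReader();
    }
}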

Partitioner

import org.apache.hadoop.mapreduce.Partitioner;

import java.util.HashMap;

public class OwnPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    private static HashMap<String, Integer> areaMap = new HashMap<String, Integer>();
    static {
        areaMap.put("hn", 0);
        areaMap.put("henna", 0);
        areaMap.put("bj", 1);
        areaMap.put("tj", 2);
        areaMap.put("hb", 3);
    }

    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // keys not present in the map fall into the last partition
        Integer partition = areaMap.get(key.toString());
        return partition == null ? numPartitions - 1 : partition;
    }
}
package com.baizhi.partitioner;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {

    private String phone;
    private Long upFlow;
    private Long downFlow;
    private Long sumFlow;
    // getters and setters omitted
    /*
     * Serialization (encode)
     * */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.phone);
        dataOutput.writeLong(this.upFlow);
        dataOutput.writeLong(this.downFlow);
        dataOutput.writeLong(this.sumFlow);
    }

    public void readFields(DataInput dataInput) throws IOException {

        this.phone = dataInput.readUTF();
        this.upFlow = dataInput.readLong();
        this.downFlow = dataInput.readLong();
        this.sumFlow = dataInput.readLong();
    }
}
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();

        String[] data = line.split(" ");

        context.write(new Text(data[3]), new FlowBean(data[0], Long.valueOf(data[1]), Long.valueOf(data[2]), (Long.valueOf(data[1]) + Long.valueOf(data[2]))));
    }
}
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        Long up = 0L;
        Long down = 0L;
        Long sum = 0L;
        String phone = "";
        for (FlowBean flowBean : values) {
            up += flowBean.getUpFlow();
            down += flowBean.getDownFlow();
            sum += flowBean.getSumFlow();
            phone = flowBean.getPhone();

        }
        context.write(key, new FlowBean(phone, up, down, sum));
    }
}

public class FlowRunner {
    public static void main(String[] args) throws Exception {
        // ... (job setup omitted; see the sketch below)
        job.setNumReduceTasks(4);
        job.waitForCompletion(true);
    }
}
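
For the custom partitioner to take effect it has to be registered on the job; a minimal sketch of the part elided above (the input/output paths and remaining setup are assumed to match the earlier FlowRunner):

        // inside FlowRunner.main, before job.setNumReduceTasks(4) (sketch)
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        job.setPartitionerClass(OwnPartitioner.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

With 4 reduce tasks there are 4 output files, one per partition returned by OwnPartitioner.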

OutputFormat

Custom Output

public class FileMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(NullWritable.get(), value);
    }
}
public class FileReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(NullWritable.get(), value);
        }
    }
}
class Job{
    // ... (omitted)
    job.setOutputFormatClass(OwnOutputFormat.class);
    // ... (omitted)
}
public class OwnOutputFormat extends FileOutputFormat<NullWritable, Text> {
    public RecordWriter<NullWritable, Text> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        try {
            return new OwnRecordWriter(taskAttemptContext);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
public class OwnRecordWriter extends RecordWriter<NullWritable, Text> {
    FileSystem fileSystem;

    FSDataOutputStream outputStream;

    public OwnRecordWriter(TaskAttemptContext taskAttemptContext) throws Exception {
        fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());

        outputStream = fileSystem.create(new Path("G:\\Note\\Hadoop\\数据文件\\testoutputforamt.txt"));
    }

    public void write(NullWritable nullWritable, Text text) throws IOException, InterruptedException {
        outputStream.write(text.getBytes());
    }

    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {

        IOUtils.closeStream(outputStream);
        fileSystem.close();
    }
}

Combiner

  • A Combiner is a component of an MR program in addition to the Mapper and Reducer
  • The Combiner's parent class is Reducer
  • The difference between a Combiner and a Reducer is where it runs:
the Combiner runs on the node of each MapTask  (local aggregation)
the Reducer receives the results of all Mappers and then processes them  (global aggregation)
  • The point of a Combiner is to aggregate each MapTask's output locally, reducing the amount of data shipped over the network
  • A Combiner may only be used when it does not affect the final business result (summation is safe), and its output KV types must match the Reducer's input KV types

Use cases

Only suitable where cumulative aggregation does not change the final result.

Usage
  • Create a Combiner class that extends Reducer and register it with job.setCombinerClass() (see the sketch below)
  • Or use the Reducer class directly as the Combiner:
job.setCombinerClass(WCReducer.class);
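
A minimal sketch of a dedicated combiner class (a hypothetical WCCombiner; its input and output KV types must both match the mapper's output types):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// runs on each MapTask node and pre-sums the counts locally
public class WCCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

It is registered with job.setCombinerClass(WCCombiner.class); the final counts are unchanged because the partial sums are simply added again in the reducer.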

Hadoop High-Availability Cluster Setup

Role             HadoopNode01    HadoopNode02    HadoopNode03
NameNode         nn1             nn2             -
JournalNode      journal node    journal node    journal node
ZKFC             zkfc            zkfc            -
DataNode         datanode        datanode        datanode
ZooKeeper        zk01            zk02            zk03
ResourceManager  -               rm1             rm2
NodeManager      nodemanager     nodemanager     nodemanager

1. ZooKeeper Cluster Setup

Add the configuration file

[root@HadoopNode0X ~]tar -zxvf zookeeper-3.4.6.tar.gz  -C /usr/
[root@HadoopNode0X zookeeper-3.4.6]# mkdir data
[root@HadoopNode0X zookeeper-3.4.6]# cp conf/zoo_sample.cfg conf/zoo.cfg
[root@HadoopNode0X zookeeper-3.4.6]# vim conf/zoo.cfg
tickTime=2000     # basic time unit in ms (heartbeat; session timeouts are multiples of this)
initLimit=10
syncLimit=5
dataDir=/usr/zookeeper-3.4.6/data/  # ZK data directory
clientPort=2181  # client connection port
server.1=HadoopNode01:2887:3887
server.2=HadoopNode02:2888:3888
server.3=HadoopNode03:2889:3889

Create the myid file

In the data directory configured above, create a file named myid. On node 1 write 1 in it (just the digit 1), on node 2 write 2, and so on.

Copy the configuration to the other nodes and start ZooKeeper on each

2. Hadoop Cluster Setup

Basic environment preparation

  • Configure hostname-to-IP mappings
[root@HadoopNodeX ~]# vi /etc/hosts
192.168.191.21 hadoopNode00
192.168.191.22 hadoopNode01
192.168.191.23 hadoopNode02
192.168.191.24 hadoopNode03
192.168.191.31 ZK01
192.168.191.32 ZK02
192.168.191.33 ZK03
  • Disable the firewall
  • Synchronize the clocks
[root@HadoopNodeX ~]# yum -y install ntpdate
[root@HadoopNodeX ~]# ntpdate -u ntp.api.bz
25 Sep 11:19:26 ntpdate[1749]: step time server 114.118.7.163 offset 201181.363384 sec
[root@HadoopNodeX ~]# date
Wed Sep 25 11:19:52 CST 2019
  • Configure passwordless SSH login
[root@HadoopNodeX ~]# ssh-keygen -t rsa   # run this on all three machines first, then run the commands below
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode01
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode02
[root@HadoopNodeX ~]# ssh-copy-id hadoopNode03
  • Java environment
export JAVA_HOME=/home/java/jdk1.8.0_181
export PATH=$PATH:$JAVA_HOME/bin
  • Start ZooKeeper

  • Install Hadoop

    • Extract and configure environment variables
    export HADOOP_HOME=/home/hadoop/hadoop-2.9.2
    export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
    
    • Configure core-site.xml
    <property>
      <name>fs.defaultFS</name>
      <value>hdfs://mycluster</value>
    </property>
    
    <property>
      <name>hadoop.tmp.dir</name>
      <value>/usr/hadoop-2.9.2/hadoop-${user.name}</value>
    </property>
    
    <property>
      <name>fs.trash.interval</name>
      <value>30</value>  
    </property>
    <property>
      <name>net.topology.script.file.name</name>
      <value>/usr/hadoop-2.9.2/etc/hadoop/rack.sh</value>
    </property>
    
    • Create the rack-awareness script

    In /usr/hadoop-2.9.2/etc/hadoop/, create a file named rack.sh and paste the content below into it.

    while [ $# -gt 0 ] ; do
    	nodeArg=$1
    	exec</usr/hadoop-2.9.2/etc/hadoop/topology.data
    	result=""
    	while read line ; do
            ar=( $line )
            if [ "${ar[0]}" = "$nodeArg" ] ; then
            result="${ar[1]}"
            fi
        done
        shift
        if [ -z "$result" ] ; then
             echo -n "/default-rack"
        else
        	echo -n "$result "
        fi
    done
    

    Also, make the script executable: chmod u+x /usr/hadoop-2.9.2/etc/hadoop/rack.sh

    • Create the rack mapping file

    Create /usr/hadoop-2.9.2/etc/hadoop/topology.data:

    192.168.191.22 /rack1
    192.168.191.23 /rack1
    192.168.191.24 /rack2
    
    • Configure hdfs-site.xml
    <configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    
    <property> 
        <name>ha.zookeeper.quorum</name>
        <value>zk01:2181,zk02:2181,zk03:2181</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>hadoopNode01:9000</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>hadoopNode02:9000</value>
    </property>
    
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://hadoopNode01:8485;hadoopNode02:8485;hadoopNode03:8485/mycluster</value>
    </property>
    
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/root/.ssh/id_rsa</value>
    </property>
    </configuration>
    
    • Edit the slaves file
hadoopNode01
hadoopNode02
hadoopNode03

Startup

[root@HadoopNodeX ~]# rm -rf /usr/hadoop-2.9.2/hadoop-root/*   
[root@HadoopNodeX ~]# hadoop-daemons.sh  start journalnode   # running this on any one node is enough
[root@HadoopNode01 ~]# hdfs namenode -format
[root@HadoopNode01 ~]# hadoop-daemon.sh start namenode
[root@HadoopNode02 ~]# hdfs namenode  -bootstrapStandby  # copy the active NameNode's metadata
[root@HadoopNode02 ~]# hadoop-daemon.sh start namenode
[root@HadoopNode01|02 ~]# hdfs zkfc -formatZK   # format on either node 01 or 02
[root@HadoopNode01 ~]# hadoop-daemon.sh start zkfc
[root@HadoopNode02 ~]# hadoop-daemon.sh start zkfc
[root@HadoopNodeX ~]# hadoop-daemon.sh start datanode
[root@HadoopNode01 ~]# jps
2324 JournalNode
2661 DFSZKFailoverController
2823 Jps
2457 NameNode
2746 DataNode

[root@HadoopNode02 ~]# jps
2595 DataNode
2521 DFSZKFailoverController
2681 Jps
2378 NameNode
2142 JournalNode

[root@HadoopNode03 .ssh]# jps
2304 Jps
2146 JournalNode
2229 DataNode

Routine maintenance

[root@HadoopNode01 ~]# stop-dfs.sh
[root@HadoopNode01 ~]# start-dfs.sh

3. YARN High Availability

yarn-site.xml

<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
<property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
</property>
<property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>zk01:2181,zk02:2181,zk03:2181</value>
</property>
<property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>rmcluster01</value>
</property>
<property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
</property>
<property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>hadoopNode02</value>
</property>
<property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>hadoopNode03</value>
</property>

mapred-site.xml

<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>

Startup

[root@HadoopNode02 ~]# yarn-daemon.sh start resourcemanager
[root@HadoopNode03 ~]# yarn-daemon.sh start resourcemanager
[root@HadoopNodeX ~]# yarn-daemon.sh start nodemanager