HDFS Read/Write Operations
public class HdfsClient {
    Configuration configuration = null;
    FileSystem fs = null;

    @Before
    public void init() throws URISyntaxException, IOException, InterruptedException {
        configuration = new Configuration();
        /* connect to the NameNode as user "root" */
        fs = FileSystem.get(new URI("hdfs://test1.bigload.com:9000"), configuration, "root");
    }

    @After
    public void tearDown() throws IOException {
        fs.close();
    }

    /* create a directory */
    @Test
    public void test1() throws IOException {
        fs.mkdirs(new Path("/test3"));
    }
    /* upload a file */
    @Test
    public void test2() throws IOException {
        fs.copyFromLocalFile(new Path("G:\\upload\\bigload.txt"), new Path("/test3/bigload.txt"));
    }
    /* download a file */
    @Test
    public void download() throws IOException {
        fs.copyToLocalFile(new Path("/test3/bigload.txt"), new Path("G:\\download\\bigload.txt"));
    }
    /* delete a file or directory (true = recursive) */
    @Test
    public void delete() throws IOException {
        fs.delete(new Path("/lagou/bigdata/bigload.txt"), true);
    }
    /* list file name, permission, length and block information */
    @Test
    public void catDir() throws IOException {
        /* all files, recursively */
        RemoteIterator<LocatedFileStatus> locateFiles = fs.listFiles(new Path("/"), true);
        while (locateFiles.hasNext()) {
            LocatedFileStatus file = locateFiles.next();
            /* file path (use file.getPath().getName() for the bare name) */
            System.out.println(file.getPath());
            /* permission */
            System.out.println(file.getPermission());
            /* length */
            System.out.println(file.getLen());
            /* blocks */
            BlockLocation[] blockLocations = file.getBlockLocations();
            for (BlockLocation block : blockLocations) {
                /* hosts that hold a replica of this block */
                String[] hosts = block.getHosts();
                for (String host : hosts) {
                    System.out.println(host);
                }
            }
            System.out.println("--------------------------------------------------");
        }
    }
    /* recursively distinguish directories from files */
    public void checkDir(Path dir) throws IOException {
        FileStatus[] fileStatuses = fs.listStatus(dir);
        for (FileStatus fileStatus : fileStatuses) {
            if (fileStatus.isDirectory()) {
                System.out.println("directory: " + fileStatus.getPath());
                this.checkDir(fileStatus.getPath());
            } else {
                System.out.println("file: " + fileStatus.getPath());
            }
        }
    }

    /* test the directory check, starting from the root */
    @Test
    public void isDirTest() throws IOException {
        this.checkDir(new Path("/"));
    }
    /* upload via raw streams */
    @Test
    public void streamUpload() throws IOException {
        FileInputStream location = new FileInputStream(new File("G:\\upload\\bigload.txt"));
        FSDataOutputStream out = fs.create(new Path("/lagou/bigdata/bigload.txt"), new Progressable() {
            @Override
            public void progress() {
                /* called roughly once per 64 KB written; it also fires once when the
                 * write pipeline is first set up, so data flows from the second call on */
                System.out.println("%%%");
            }
        });
        /* copyBytes(in, out, conf) closes both streams when it finishes */
        IOUtils.copyBytes(location, out, configuration);
    }
    /* download via raw streams */
    @Test
    public void streamDownLoad() throws IOException {
        FSDataInputStream dataInputStream = fs.open(new Path("/lagou/bigdata/hadoop.txt"));
        FileOutputStream outputStream = new FileOutputStream(new File("G:\\download\\hadoop.txt"));
        IOUtils.copyBytes(dataInputStream, outputStream, configuration);
    }
    /* seek back to the start and print the file twice */
    @Test
    public void seekTest() throws IOException {
        FSDataInputStream dataInputStream = fs.open(new Path("/lagou/bigdata/hadoop.txt"));
        IOUtils.copyBytes(dataInputStream, System.out, 4096, false);
        /* rewind to offset 0 and copy again */
        dataInputStream.seek(0);
        IOUtils.copyBytes(dataInputStream, System.out, 4096, false);
        IOUtils.closeStream(dataInputStream);
    }
}
The MapReduce idea shows up everywhere in daily life, and most of us have applied it in some form. Its core is divide and conquer, which takes full advantage of parallel processing.
A MapReduce job runs in two stages:
- Map stage: its job is to "divide", breaking a complex task into a number of simple tasks that are processed in parallel. These tasks can run concurrently because they have no dependencies on each other.
- Reduce stage: its job is to "combine", globally aggregating the results of the Map stage (see the word-count sketch below).
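The classic word count shows both stages in miniature. A minimal sketch against the standard Hadoop API (class names are illustrative; imports omitted as elsewhere in these notes):
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /* Map = "divide": each mapper independently turns its split's lines into (word, 1) pairs */
        for (String w : value.toString().split("\\s+")) {
            if (w.isEmpty()) {
                continue; /* guard against leading whitespace */
            }
            word.set(w);
            context.write(word, one);
        }
    }
}

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        /* Reduce = "combine": sum the partial counts for one word from all mappers */
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}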
The speakBean object (a custom Writable carrying self time, other time, and their sum)
public class speakBean implements Writable {
    private Long selfTime;
    private Long otherTime;
    private Long sumTime;

    public speakBean() {
    }

    public speakBean(Long selfTime, Long otherTime) {
        setSelfTime(selfTime);
        setOtherTime(otherTime);
        setSumTime(selfTime + otherTime);
    }

    public Long getSelfTime() {
        return selfTime;
    }

    public void setSelfTime(Long selfTime) {
        this.selfTime = selfTime;
    }

    public Long getOtherTime() {
        return otherTime;
    }

    public void setOtherTime(Long otherTime) {
        this.otherTime = otherTime;
    }

    public Long getSumTime() {
        return sumTime;
    }

    public void setSumTime(Long sumTime) {
        this.sumTime = sumTime;
    }

    @Override
    public String toString() {
        return selfTime + "\t" + otherTime + "\t" + sumTime;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(selfTime);
        out.writeLong(otherTime);
        out.writeLong(sumTime);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.setSelfTime(in.readLong());
        this.setOtherTime(in.readLong());
        this.setSumTime(in.readLong());
    }
}
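Note that readFields() must read the fields in exactly the order write() serialized them: the framework hands the deserializer a bare byte stream, so a mismatched order silently corrupts every field after the first.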
speakMapper
public class speakMapper extends Mapper<LongWritable, Text, Text, speakBean> {
    Text k = new Text();
    speakBean v = new speakBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] datas = line.split("\t");
        v.setSelfTime(Long.parseLong(datas[datas.length - 3]));
        v.setOtherTime(Long.parseLong(datas[datas.length - 2]));
        v.setSumTime(Long.parseLong(datas[datas.length - 3]) + Long.parseLong(datas[datas.length - 2]));
        k.set(datas[1]);
        context.write(k, v);
    }
}
speakReducer
public class speakReducer extends Reducer<Text, speakBean, Text, speakBean> {
    speakBean sb = new speakBean();

    @Override
    protected void reduce(Text key, Iterable<speakBean> values, Context context) throws IOException, InterruptedException {
        Long selfTime = 0L;
        Long otherTime = 0L;
        for (speakBean value : values) {
            selfTime += value.getSelfTime();
            otherTime += value.getOtherTime();
        }
        sb.setSelfTime(selfTime);
        sb.setOtherTime(otherTime);
        sb.setSumTime(selfTime + otherTime);
        context.write(key, sb);
    }
}
speakDriver
public class speakDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        /* load the configuration and create the job */
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        /* jar containing the job classes */
        job.setJarByClass(speakDriver.class);
        /* mapper class */
        job.setMapperClass(speakMapper.class);
        /* reducer class */
        job.setReducerClass(speakReducer.class);
        /* map output key/value types */
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(speakBean.class);
        /* final output key/value types */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(speakBean.class);
        /* combiner for map-side pre-aggregation */
        job.setCombinerClass(speakReducer.class);
        /* input and output paths */
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        /* submit the job and wait for completion */
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
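A note on the combiner: reusing speakReducer for map-side pre-aggregation is safe here only because the reduce logic is a plain sum, which is associative and commutative. A combiner runs on partial map output and must never change the final result.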
ComparableBean (a WritableComparable that sorts by total time, descending)
public class ComparableBean implements WritableComparable<ComparableBean> {
    private String appKey;
    private Long selfTime;
    private Long otherTime;
    private Long sumTime;

    public ComparableBean() {
    }

    public ComparableBean(String appKey, Long selfTime, Long otherTime, Long sumTime) {
        this.appKey = appKey;
        this.selfTime = selfTime;
        this.otherTime = otherTime;
        this.sumTime = sumTime;
    }

    @Override
    public int compareTo(ComparableBean o) {
        /* negate the natural order to sort by sumTime descending */
        return -sumTime.compareTo(o.sumTime);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(appKey);
        out.writeLong(selfTime);
        out.writeLong(otherTime);
        out.writeLong(sumTime);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        appKey = in.readUTF();
        selfTime = in.readLong();
        otherTime = in.readLong();
        sumTime = in.readLong();
    }

    @Override
    public String toString() {
        return appKey + "\t" + selfTime + "\t" + otherTime + "\t" + sumTime;
    }
}
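compareTo() negates the natural order, so when ComparableBean is emitted as the map output key, the shuffle sorts records by sumTime descending. A minimal mapper sketch under an assumed input layout of appKey, selfTime, otherTime separated by tabs (the layout is not shown in the original):
public class ComparableSortMapper extends Mapper<LongWritable, Text, ComparableBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /* assumed column layout: appKey \t selfTime \t otherTime */
        String[] fields = value.toString().split("\t");
        long self = Long.parseLong(fields[1]);
        long other = Long.parseLong(fields[2]);
        /* the bean is the key, so the shuffle orders output by sumTime descending */
        context.write(new ComparableBean(fields[0], self, other, self + other), NullWritable.get());
    }
}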
OrderBean. The next example finds the highest-priced item in each order: OrderBean sorts by orderId and then by money descending, and a grouping comparator collapses each order into a single reduce call.
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private Double money;

    public OrderBean() {
    }

    public OrderBean(String orderId, Double money) {
        this.orderId = orderId;
        this.money = money;
    }

    /* getters/setters are required by OrderMapper and OrderGroupingComparator */
    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public Double getMoney() {
        return money;
    }

    public void setMoney(Double money) {
        this.money = money;
    }

    @Override
    public int compareTo(OrderBean o) {
        /* primary sort: orderId ascending; secondary sort: money descending */
        int compare = orderId.compareTo(o.getOrderId());
        if (compare == 0) {
            return -money.compareTo(o.getMoney());
        }
        return compare;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(money);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        orderId = in.readUTF();
        money = in.readDouble();
    }

    @Override
    public String toString() {
        return orderId + "\t" + money;
    }
}
comparator
public class OrderGroupingComparator extends WritableComparator {
    public OrderGroupingComparator() {
        /* true: instantiate OrderBean objects for the comparison */
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        /* group solely by orderId, ignoring money */
        OrderBean c = (OrderBean) a;
        OrderBean d = (OrderBean) b;
        return c.getOrderId().compareTo(d.getOrderId());
    }
}
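During the shuffle, this comparator decides which consecutive keys belong to the same reduce() call. Because map output was already sorted by compareTo() (orderId ascending, money descending), grouping on orderId alone means each reduce() call sees the records of one order, with the highest-money record as the key.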
Partitioner
public class OrderPartition extends Partitioner<OrderBean, NullWritable> {
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int numPartitions) {
        /* partition by orderId so all records of one order reach the same reduce task;
         * & Integer.MAX_VALUE masks the sign bit, keeping the index non-negative */
        return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
mapper
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    OrderBean bean = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        bean.setOrderId(fields[0]);
        bean.setMoney(Double.parseDouble(fields[2]));
        context.write(bean, NullWritable.get());
    }
}
reducer
public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
driver
public class OrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "OrderDriver");
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        /* custom partitioner: all records of one order go to the same reduce task */
        job.setPartitionerClass(OrderPartition.class);
        /* number of reduce tasks */
        job.setNumReduceTasks(2);
        /* grouping comparator: group keys by orderId only */
        job.setGroupingComparatorClass(OrderGroupingComparator.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
FileInputformat
public class CustomFileInputformat extends FileInputFormat<Text, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        /* treat each file as a single unsplittable unit */
        return false;
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        /* hand back our custom RecordReader */
        return new CustomRecordReader();
    }
}
RecordReader
public class CustomRecordReader extends RecordReader<Text, BytesWritable> {
    Configuration configuration = null;
    FileSplit split = null;
    /* the key/value pair handed to the mapper */
    Text text = new Text();
    BytesWritable value = new BytesWritable();
    /* true while the (single) record has not been read yet */
    Boolean isProgress = true;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        /* grab the file split and the job configuration */
        this.split = (FileSplit) split;
        configuration = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        /* return true to keep reading, false once the split is exhausted */
        if (isProgress) {
            byte[] contents = new byte[(int) split.getLength()];
            FSDataInputStream fis = null;
            try {
                /* open the file system that owns this split */
                Path path = split.getPath();
                FileSystem fs = path.getFileSystem(configuration);
                /* read the whole file into the buffer */
                fis = fs.open(path);
                IOUtils.readFully(fis, contents, 0, contents.length);
                /* key = file path, value = file content */
                value.set(contents, 0, contents.length);
                text.set(path.toString());
            } finally {
                /* close only the input stream; the FileSystem instance is shared and cached */
                IOUtils.closeStream(fis);
            }
            isProgress = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return text;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isProgress ? 0f : 1f;
    }

    @Override
    public void close() throws IOException {
    }
}
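Because nextKeyValue() emits the entire file as one (path, content) record, pairing this reader with SequenceFileOutputFormat, as the driver below does, packs many small files into a single SequenceFile: the classic remedy for HDFS's small-file problem.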
Mapper
public class SequenceFileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        /* pass the (path, content) pair straight through */
        context.write(key, value);
    }
}
Reducer
public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        /* each key is a unique file path, so there is exactly one value */
        context.write(key, values.iterator().next());
    }
}
Driver
public class SequenceFileDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        /* read whole files with the custom input format, write a SequenceFile */
        job.setInputFormatClass(CustomFileInputformat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
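To spot-check the result, the SequenceFile can be read back outside MapReduce. A small sketch (the path argument would be the part-r-00000 file the job produced):
public class SequenceFileViewer {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        /* args[0]: path to the SequenceFile produced by the job above */
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(args[0])));
        Text key = new Text();
        BytesWritable value = new BytesWritable();
        while (reader.next(key, value)) {
            /* print each packed file's original path and size */
            System.out.println(key + " -> " + value.getLength() + " bytes");
        }
        reader.close();
    }
}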
FileOutputFormat
public class OutputFileOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        /* note: these output paths are hard-coded to the local disk */
        FSDataOutputStream lagouOut = fs.create(new Path("G:\\click_log\\out\\lagou"));
        FSDataOutputStream otherOut = fs.create(new Path("G:\\click_log\\out\\other"));
        return new OutputRecordWriter(lagouOut, otherOut);
    }
}
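Hard-coding G:\ paths ties the writer to a single Windows machine. A variant (not in the original) derives both files from the job's configured output directory, so the format also works on HDFS; the file names here are illustrative:
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    /* args[1] from the driver, via FileOutputFormat.setOutputPath() */
    Path outDir = FileOutputFormat.getOutputPath(job);
    FileSystem fs = outDir.getFileSystem(job.getConfiguration());
    return new OutputRecordWriter(
            fs.create(new Path(outDir, "lagou.log")),
            fs.create(new Path(outDir, "other.log")));
}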
RecordWriter
public class OutputRecordWriter extends RecordWriter<Text, NullWritable> {
    FSDataOutputStream lagou = null;
    FSDataOutputStream other = null;

    public OutputRecordWriter(FSDataOutputStream lagouOut, FSDataOutputStream otherOut) {
        this.lagou = lagouOut;
        this.other = otherOut;
    }

    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        /* route lines containing "lagou" to one file, everything else to the other */
        String s = key.toString();
        if (s.contains("lagou")) {
            lagou.write(s.getBytes());
            lagou.write("\r\n".getBytes());
        } else {
            other.write(s.getBytes());
            other.write("\r\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        IOUtils.closeStream(lagou);
        IOUtils.closeStream(other);
    }
}
mapper, reducer
public class OutputMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}

public class OutputReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
Driver
public class OutputDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(OutputDriver.class);
        job.setMapperClass(OutputMapper.class);
        job.setReducerClass(OutputReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        /* plug in the custom output format */
        job.setOutputFormatClass(OutputFileOutputFormat.class);
        /* args[1] still receives the _SUCCESS marker even though data goes to the custom paths */
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Compression algorithms supported in Hadoop
Enable compression for map-stage output:
Configuration configuration = new Configuration();
configuration.set("mapreduce.map.output.compress", "true");
configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
Enable compression for reduce-stage (final) output:
configuration.set("mapreduce.output.fileoutputformat.compress", "true");
configuration.set("mapreduce.output.fileoutputformat.compress.type", "RECORD");
configuration.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
The same reduce-stage settings can also be made cluster-wide, e.g. in mapred-site.xml:
<property>
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>true</value>
</property>
<property>
    <name>mapreduce.output.fileoutputformat.compress.type</name>
    <value>RECORD</value>
</property>
<property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
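Equivalently, the same compression settings can be made in the driver; a sketch using FileOutputFormat's helper methods (Snappy assumes native library support on the cluster):
Configuration conf = new Configuration();
/* map-stage (intermediate) compression */
conf.setBoolean("mapreduce.map.output.compress", true);
conf.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class);
Job job = Job.getInstance(conf);
/* reduce-stage (final output) compression */
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);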