使用 MapReduce(MR)实现 distinct(去重)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.cc.pxj.wfy</groupId>
    <artifactId>phoneWcRuoZe</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- CDH build of Hadoop; presumably resolved from the Cloudera repository declared below (confirm) -->
        <hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
    </properties>
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>
        <!-- Hadoop client dependency; version comes from the hadoop.version property -->
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>


        <!-- JUnit 4, available on the test classpath only -->
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <!-- MySQL Connector/J (JDBC driver) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.17</version>
        </dependency>

    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
Java 代码
package com.ccj.pxj.homework.distinct;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map stage of the distinct job.
 *
 * <p>Each input record is written out unchanged as the output key, paired with a
 * {@link NullWritable} value, so the record text itself is what the framework groups on.
 */
public class DistinctMapper extends Mapper<LongWritable, Text,Text, NullWritable> {

    /**
     * Emits the incoming record as the key with an empty value.
     *
     * @param key     input key supplied by the framework; not used here
     * @param value   the input record, forwarded as the output key
     * @param context sink for the emitted key/value pair
     * @throws IOException          if the context fails to write the pair
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Only the key matters for de-duplication; the value is a placeholder.
        final NullWritable placeholder = NullWritable.get();
        context.write(value, placeholder);
    }
}
package com.ccj.pxj.homework.distinct;


import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of the distinct job.
 *
 * <p>Writes each key it receives exactly once, paired with a {@link NullWritable} value.
 */
public class DistinctReducer extends Reducer<Text, NullWritable,Text,NullWritable> {

    /**
     * Emits the key once, regardless of how many values accompany it.
     *
     * @param key     the key for this group, written as the output key
     * @param values  the values grouped under {@code key}; deliberately not iterated
     * @param context sink for the emitted key/value pair
     * @throws IOException          if the context fails to write the pair
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // A single write per reduce call is what collapses duplicates into one record.
        final NullWritable placeholder = NullWritable.get();
        context.write(key, placeholder);
    }
}
package com.ccj.pxj.homework.distinct;

import com.ccj.pxj.homework.two.wc.one.WcMapper;
import com.ccj.pxj.homework.two.wc.one.WcReducer;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the distinct (de-duplication) MapReduce job.
 *
 * <p>Wires {@link DistinctMapper} and {@link DistinctReducer} into a {@link Job} and runs it
 * through {@link ToolRunner}.
 *
 * <p>Usage: {@code DistinctDriver [inputPath [outputPath]]}. When a path is omitted the
 * built-in default ({@code data/distinct.txt} / {@code wc/distinct/out}) is used.
 */
public class DistinctDriver implements Tool {
    /** Input path used when none is supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "data/distinct.txt";
    /** Output path used when none is supplied on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "wc/distinct/out";

    private Configuration conf;

    /**
     * Configures and runs the distinct job, blocking until it finishes.
     *
     * @param args optional arguments: {@code args[0]} is the input path and {@code args[1]}
     *             the output path; missing entries fall back to the defaults
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1. Obtain the Job object
        Job job = Job.getInstance(getConf());
        // 2. Set the main class used to locate the job jar
        job.setJarByClass(DistinctDriver.class);
        // 3. Set the Mapper class
        job.setMapperClass(DistinctMapper.class);
        // 4. Set the Reducer class (it writes each grouped key once)
        job.setReducerClass(DistinctReducer.class);

        // 5. Set the map-output and final-output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Clear the output location, then set the input path.
        // NOTE(review): FileUtils.deleteOutput is a project helper; presumably it removes an
        // existing output directory so a re-run does not fail - confirm against its source.
        FileUtils.deleteOutput(getConf(), outputPath);
        FileInputFormat.setInputPaths(job, new Path(inputPath));

        // 7. Set the output path
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Stores the configuration handed over by {@link ToolRunner}.
     *
     * @param conf the configuration to use for the job
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Returns the configuration previously stored via {@link #setConf(Configuration)}.
     *
     * @return the current configuration, or {@code null} if none has been set
     */
    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Command-line entry point: runs the job, reports the outcome, and exits with the job's
     * result code so that callers (shells, schedulers) can detect failure.
     *
     * @param args command-line arguments, forwarded to {@link ToolRunner}
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        int resultCode = ToolRunner.run(new DistinctDriver(), args);
        if (resultCode == 0) {
            System.out.println("执行成功!");
        } else {
            System.out.println("执行失败!");
        }
        // Propagate the outcome; without this the JVM exits 0 even when the job failed.
        System.exit(resultCode);
    }
}

数据

pxj	pxj	pxj
pxj	pxj	pxj
ccj	pxj	wfy
ccj	ccj	pxj
pxj	wfy	pxj
pxj	pxj	ccj
pxj	wfy	wlp
pxj	wfy	pxj
wxc	ccj	jpeson
pxj	pxj	wfy
pxj	wlp	wfy
pxj	pxj	wlp
pxj	pxj	wfy
pxj	pxj	pxj
pxj	wxc	pxj
pxj	pxj	ccj
pk	wxc	pxj
pxj	pxj	ccj
pxj	pxj	zcl
wlp	pxj	lzh
pxj	wfy	wxc
pxj	pxj	pxj
wlp	pxj	wxc
ccj	lzh	pxj
pxj	pxj	pxj
pxj	wfy	pxj
ccj	pxj	wfy
pxj	pxj	lzh
pxj	pxj	ccj
wfy	wfy	ccj

结果

ccj	ccj	pxj
ccj	lzh	pxj
ccj	pxj	wfy
pk	wxc	pxj
pxj	pxj	ccj
pxj	pxj	lzh
pxj	pxj	pxj
pxj	pxj	wfy
pxj	pxj	wlp
pxj	pxj	zcl
pxj	wfy	pxj
pxj	wfy	wlp
pxj	wfy	wxc
pxj	wlp	wfy
pxj	wxc	pxj
wfy	wfy	ccj
wlp	pxj	lzh
wlp	pxj	wxc
wxc	ccj	jpeson

作者:pxj(潘陈)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值