Data Processing with Hadoop MapReduce in Java

This article walks through how to drive Hadoop MapReduce from Java for data processing: adding the Maven dependencies, writing Mappers and Reducers, partitioning data, and aggregating with a Combiner. The examples count word occurrences, group spending records by province, and sort records by name and spending amount.

1. Add the pom dependencies

......
<hadoop.version>3.1.2</hadoop.version>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>${hadoop.version}</version>
</dependency>
......

2. Basic usage

a. Create test1.txt as the input to count

hello zhangsan

lisi nihao

hi zhangsan

nihao lisi

x xiaoming

b. Create the Mapper

/**
 * The input to map() is read in automatically by the MapReduce framework.
 * This Mapper simply counts word occurrences.
 * KEYIN:    by default, the byte offset of the line being read; a Long, for which Hadoop has its own serializable type LongWritable
 * VALUEIN:  by default, the content of the line being read; Hadoop's serializable type is Text
 * KEYOUT:   the key emitted by the user-defined logic; here it is a word (String)
 * VALUEOUT: the value emitted by the user-defined logic; here it is the word count (Long)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The line of text read by MapReduce
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            // Emit the word as the key and 1 as the value; these pairs are sent to the reducer
            context.write(new Text(word), new LongWritable(1));
        }
    }
}

c. Create the Reducer

/**
 * First Text:          the word passed in from the Mapper
 * Second LongWritable: the per-occurrence counts for that word, grouped by MapReduce (e.g. "hello" appeared 11 times)
 * Third Text:          the word written to the output file
 * Fourth LongWritable: the total count written to the output file
 */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        for (LongWritable num : values) {
            count += num.get();
        }
        context.write(key, new LongWritable(count));
    }
}

d. Run the job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URI;

public class TestMain {

    private static final String HDFS_PREFIX = "hdfs://localhost:9527";
    private static FileSystem fs;

    /** Upload the test file to HDFS */
    public static void doBeforeWork(String sourceFilePath, String inFile, String outDir) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_PREFIX); // must match the port configured in core-site.xml
        // Get a FileSystem handle and set its user ("zwj" should be replaced with an administrator account because of Windows permission issues)
        fs = FileSystem.get(new URI(HDFS_PREFIX), conf, "zwj");
        FSDataOutputStream fout = fs.create(new Path(inFile), true);
        InputStream in = new FileInputStream(sourceFilePath);
        IOUtils.copyBytes(in, fout, 1024, true);
        // Delete the output directory if it already exists
        fs.delete(new Path(outDir), true);
    }

    /** Print the result */
    public static void doAfterWork(String outFilePath) throws Exception {
        FSDataInputStream fin = fs.open(new Path(outFilePath));
        IOUtils.copyBytes(fin, System.out, 1024, true);
    }

    /** Run the job */
    public static void run(String inFilePath, String outFilePath) throws Exception {
        Configuration conf = new Configuration();
        // When the jar is packaged and run on Linux, these two lines are not needed:
        // conf.set("mapreduce.framework.name", "yarn");            // run on YARN
        // conf.set("yarn.resourcemanager.hostname", "localhost");  // the ResourceManager host name

        Job job = Job.getInstance(conf);
        // Lets Hadoop locate the jar that contains this class
        job.setJarByClass(TestMain.class);
        // The Mapper class
        job.setMapperClass(WordCountMapper.class);
        // The Reducer class
        job.setReducerClass(WordCountReduce.class);
        // The Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // The final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // The input path
        FileInputFormat.setInputPaths(job, new Path(inFilePath));
        // The output path
        FileOutputFormat.setOutputPath(job, new Path(outFilePath));
        // To submit the job to YARN without waiting, use job.submit();
        try {
            job.waitForCompletion(true); // true: print progress and the result
        } catch (ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        try {
            String sourceFilePath = "E:/tmp/test1.txt";
            String inFile = "/mydir/test1.txt";
            String inFileUrl = HDFS_PREFIX + inFile;
            String outDir = "/outdir";
            String outDirUrl = HDFS_PREFIX + outDir;
            String outFileUrl = outDirUrl + "/part-r-00000";
            doBeforeWork(sourceFilePath, inFile, outDir);
            run(inFileUrl, outDirUrl);
            doAfterWork(outFileUrl);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
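With test1.txt as input, the contents of part-r-00000 printed by doAfterWork should look roughly like this (keys come out sorted by the default Text comparator):

hello	1
hi	1
lisi	2
nihao	2
x	1
xiaoming	1
zhangsan	2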

e. Note: if running the job throws the exception "HADOOP_HOME and hadoop.home.dir are unset.", configure the HADOOP_HOME environment variable on the client machine as well and then restart the machine.
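Alternatively, hadoop.home.dir can be set programmatically before any HDFS or job code runs. A minimal sketch, assuming a Hadoop distribution (with bin/winutils.exe on Windows) has been unpacked to E:/hadoop-3.1.2 (the path here is only an example):

// Point Hadoop's client-side code at a local Hadoop installation.
// On Windows the directory must contain bin/winutils.exe.
System.setProperty("hadoop.home.dir", "E:/hadoop-3.1.2");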

3. Data partitioning

a. Create test2.txt as the input

张三 江西 打车 200
李四 广东 住宿 600
王五 北京 伙食 320
张三 江西 话费 50
张三 湖南 打车 900
周六 上海 采购 3000
李四 西藏 旅游 1000
王五 北京 借款 500
李四 上海 话费 50
周六 北京 打车 600
张三 广东 租房 3050

b. Create the spending entity class

public class SpendBean implements Writable {

    private Text userName;
    private IntWritable money;
    private Text province;

    public SpendBean(Text userName, IntWritable money, Text province) {
        this.userName = userName;
        this.money = money;
        this.province = province;
    }

    /** A no-arg constructor is required for deserialization */
    public SpendBean() {
    }

    /**
     * Serialization
     * @param out
     * @throws IOException
     */
    @Override
    public void write(DataOutput out) throws IOException {
        userName.write(out);
        money.write(out);
        province.write(out);
    }

    /**
     * Deserialization
     * @param in
     * @throws IOException
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        userName = new Text();
        userName.readFields(in);
        money = new IntWritable();
        money.readFields(in);
        province = new Text();
        province.readFields(in);
    }

    public Text getUserName() {
        return userName;
    }

    public void setUserName(Text userName) {
        this.userName = userName;
    }

    public IntWritable getMoney() {
        return money;
    }

    public void setMoney(IntWritable money) {
        this.money = money;
    }

    public Text getProvince() {
        return province;
    }

    public void setProvince(Text province) {
        this.province = province;
    }

    @Override
    public String toString() {
        return "[SpendBean]: userName[" + userName + "], money[" + money + "], province[" + province + "]";
    }
}
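Before wiring SpendBean into a job, it can be handy to confirm that write() and readFields() mirror each other. A minimal stand-alone sketch (the class name SpendBeanRoundTripTest is made up for illustration):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class SpendBeanRoundTripTest {
    public static void main(String[] args) throws Exception {
        // Serialize a bean with write() ...
        SpendBean original = new SpendBean(new Text("张三"), new IntWritable(200), new Text("江西"));
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // ... then read it back with readFields(); the printed fields should match the original
        SpendBean copy = new SpendBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);
    }
}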

c. Create the Mapper

public class GroupUserMapper extends Mapper<LongWritable, Text, Text, SpendBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String val = value.toString();
        // No defensive string handling here; keep the core logic simple
        String[] split = val.split(" ");
        String name = split[0];
        String province = split[1];
        String type = split[2];
        int money = Integer.parseInt(split[3]);

        SpendBean groupUser = new SpendBean();
        groupUser.setUserName(new Text(name));
        groupUser.setMoney(new IntWritable(money));
        groupUser.setProvince(new Text(province));

        context.write(new Text(name), groupUser);
    }
}

d. Create the Reducer

public class GroupUserReducer extends Reducer<Text, SpendBean, Text, SpendBean> {

    /**
     * key is the user name
     * @param key
     * @param values
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<SpendBean> values, Context context) throws IOException, InterruptedException {
        int money = 0; // total spending
        Text province = null;
        for (SpendBean bean : values) {
            money += bean.getMoney().get();
            province = bean.getProvince();
        }
        // Write out the aggregated result
        context.write(key, new SpendBean(key, new IntWritable(money), province));
    }
}

e. Create the Partitioner

public class ProvincePartitioner extends Partitioner<Text, SpendBean> {

    private static Map<String, Integer> provinces = new HashMap<>();

    static {
        // Assign a partition number to each province
        provinces.put("江西", 0);
        provinces.put("广东", 1);
        provinces.put("北京", 2);
        provinces.put("湖南", 3);
        provinces.put("上海", 4);
        provinces.put("西藏", 5);
    }

    /**
     * Assign the given record to a partition
     * @param text
     * @param spendBean
     * @param numPartitions
     * @return
     */
    @Override
    public int getPartition(Text text, SpendBean spendBean, int numPartitions) {
        Integer province = provinces.get(spendBean.getProvince().toString());
        // Fall back to a default partition when the province is not in the map
        province = province == null ? 6 : province;
        return province;
    }
}
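The partitioner can also be checked on its own. A quick stand-alone sketch (the class name ProvincePartitionerTest is made up for illustration):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class ProvincePartitionerTest {
    public static void main(String[] args) {
        ProvincePartitioner partitioner = new ProvincePartitioner();
        SpendBean bean = new SpendBean(new Text("张三"), new IntWritable(200), new Text("江西"));
        // 江西 maps to partition 0; an unknown province would fall back to partition 6
        System.out.println(partitioner.getPartition(new Text("张三"), bean, 7));
    }
}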

f. Run the job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

public class TestMain {

    private static final String HDFS_PREFIX = "hdfs://localhost:9527";
    private static FileSystem fs;

    /** Upload the test file to HDFS */
    public static void doBeforeWork(String sourceFilePath, String inFile, String outDir) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_PREFIX); // must match the port configured in core-site.xml
        // Get a FileSystem handle and set its user ("zwj" should be replaced with an administrator account because of Windows permission issues)
        fs = FileSystem.get(new URI(HDFS_PREFIX), conf, "zwj");
        FSDataOutputStream fout = fs.create(new Path(inFile), true);
        InputStream in = new FileInputStream(sourceFilePath);
        IOUtils.copyBytes(in, fout, 1024, true);
        // Delete the output directory if it already exists
        fs.delete(new Path(outDir), true);
    }

    /** Print the results */
    public static void doAfterWork(List<String> outFilePathList) throws Exception {
        for (String outFilePath : outFilePathList) {
            FSDataInputStream fin = fs.open(new Path(outFilePath));
            IOUtils.copyBytes(fin, System.out, 1024, false);
            System.out.println("=======================================================================");
        }
    }

    /** Run the job */
    public static void run(String inFilePath, String outFilePath) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(TestMain.class);                   // the class used to locate the jar
        job.setMapperClass(GroupUserMapper.class);           // the Mapper class
        job.setReducerClass(GroupUserReducer.class);         // the Reducer class
        job.setMapOutputKeyClass(Text.class);                // the Mapper output key type
        job.setMapOutputValueClass(SpendBean.class);         // the Mapper output value type
        job.setPartitionerClass(ProvincePartitioner.class);  // custom partitioning rule; optional, depends on the business requirement
        job.setNumReduceTasks(7);                            // the number of reducers must match the maximum number of partitions
        job.setOutputKeyClass(Text.class);                   // the final output types
        job.setOutputValueClass(SpendBean.class);
        FileInputFormat.setInputPaths(job, new Path(inFilePath));   // the input path
        FileOutputFormat.setOutputPath(job, new Path(outFilePath)); // the output path
        boolean b = job.waitForCompletion(true); // wait for completion; true prints progress and the result
        if (b) {
            // success
        }
    }

    public static void main(String[] args) {
        try {
            String sourceFilePath = "E:/tmp/test2.txt";
            String inFile = "/mydir/test2.txt";
            String inFileUrl = HDFS_PREFIX + inFile;
            String outDir = "/outdir";
            String outDirUrl = HDFS_PREFIX + outDir;
            doBeforeWork(sourceFilePath, inFile, outDir);
            run(inFileUrl, outDirUrl);

            List<String> outFileUrlList = new ArrayList<>();
            for (int i = 0; i < 7; i++) {
                outFileUrlList.add(outDirUrl + "/part-r-0000" + i);
            }
            doAfterWork(outFileUrlList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
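With test2.txt as input, each partition file should hold one province's aggregated spending per user: part-r-00000 holds 江西, part-r-00001 holds 广东, and so on, while part-r-00006 (the fallback partition) stays empty. The output should look roughly like this, for example:

part-r-00000 (江西):
张三	[SpendBean]: userName[张三], money[250], province[江西]

part-r-00003 (湖南):
张三	[SpendBean]: userName[张三], money[900], province[湖南]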

4. Sorting data

a. Create test3.txt as the input

张三 2980
李四 8965
王五 1987
小黑 6530
小陈 2963
小梅 980

b. Create the spending entity class, implement WritableComparable, and override compareTo

public class Spend implements WritableComparable<Spend> {

    private Text name;          // name
    private IntWritable money;  // spending

    public Spend() {
    }

    public Spend(Text name, IntWritable money) {
        this.name = name;
        this.money = money;
    }

    public void set(Text name, IntWritable money) {
        this.name = name;
        this.money = money;
    }

    @Override
    public int compareTo(Spend o) {
        // Sort by spending in descending order (Integer.compare(o.getMoney().get(), this.money.get())
        // would avoid overflow for very large values)
        return o.getMoney().get() - this.money.get();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        name.write(out);
        money.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        name = new Text();
        name.readFields(in);
        money = new IntWritable();
        money.readFields(in);
    }

    public Text getName() {
        return name;
    }

    public void setName(Text name) {
        this.name = name;
    }

    public IntWritable getMoney() {
        return money;
    }

    public void setMoney(IntWritable money) {
        this.money = money;
    }

    @Override
    public String toString() {
        return name.toString() + "," + money.get();
    }
}
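A quick illustration of the ordering that compareTo defines (a made-up stand-alone snippet):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class SpendCompareTest {
    public static void main(String[] args) {
        Spend a = new Spend(new Text("张三"), new IntWritable(2980));
        Spend b = new Spend(new Text("李四"), new IntWritable(8965));
        // b spends more, so it sorts before a (descending order): compareTo returns a positive number
        System.out.println(a.compareTo(b)); // 8965 - 2980 = 5985
    }
}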

c. Create the Mapper

public class SortMapper extends Mapper<LongWritable, Text, Spend, Text> {

    private Spend spend = new Spend();
    private IntWritable moneyWritable = new IntWritable();
    private Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // No exception handling here; keep the core logic simple
        String[] split = value.toString().split(" ");
        String name = split[0];
        int money = Integer.parseInt(split[1]);

        text.set(name);
        moneyWritable.set(money);
        spend.set(text, moneyWritable);

        context.write(spend, text);
    }
}

d. Create the Reducer

public class SortReducer extends Reducer<Spend, Text, Text, Spend> {

    @Override
    protected void reduce(Spend key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Because compareTo only compares money, records with the same amount are grouped together,
        // so write out every name in the group rather than only the first one
        for (Text name : values) {
            context.write(name, key);
        }
    }
}

e. Run the job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URI;

public class TestMain {

    private static final String HDFS_PREFIX = "hdfs://localhost:9527";
    private static FileSystem fs;

    /** Upload the test file to HDFS */
    public static void doBeforeWork(String sourceFilePath, String inFile, String outDir) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_PREFIX); // must match the port configured in core-site.xml
        // Get a FileSystem handle and set its user ("zwj" should be replaced with an administrator account because of Windows permission issues)
        fs = FileSystem.get(new URI(HDFS_PREFIX), conf, "zwj");
        FSDataOutputStream fout = fs.create(new Path(inFile), true);
        InputStream in = new FileInputStream(sourceFilePath);
        IOUtils.copyBytes(in, fout, 1024, true);
        // Delete the output directory if it already exists
        fs.delete(new Path(outDir), true);
    }

    /** Print the result */
    public static void doAfterWork(String outFilePath) throws Exception {
        FSDataInputStream fin = fs.open(new Path(outFilePath));
        IOUtils.copyBytes(fin, System.out, 1024, true);
    }

    /** Run the job */
    public static void run(String inFilePath, String outFilePath) throws Exception {
        Configuration config = new Configuration();
        Job job = Job.getInstance(config);
        job.setJarByClass(TestMain.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setMapOutputKeyClass(Spend.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Spend.class);
        FileInputFormat.setInputPaths(job, new Path(inFilePath));
        FileOutputFormat.setOutputPath(job, new Path(outFilePath));
        boolean b = job.waitForCompletion(true);
        if (b) {
            // success
        }
    }

    public static void main(String[] args) {
        try {
            String sourceFilePath = "E:/tmp/test3.txt";
            String inFile = "/mydir/test3.txt";
            String inFileUrl = HDFS_PREFIX + inFile;
            String outDir = "/outdir";
            String outDirUrl = HDFS_PREFIX + outDir;
            String outFileUrl = outDirUrl + "/part-r-00000";
            doBeforeWork(sourceFilePath, inFile, outDir);
            run(inFileUrl, outDirUrl);
            doAfterWork(outFileUrl);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
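With test3.txt as input, part-r-00000 should come out sorted by spending in descending order, roughly:

李四	李四,8965
小黑	小黑,6530
张三	张三,2980
小陈	小陈,2963
王五	王五,1987
小梅	小梅,980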

5. Aggregation with a Combiner

a. Execution order: Mapper → Combiner → Reducer

b. Purpose: the Combiner runs a preliminary aggregation on each map task's output before the shuffle, which reduces the amount of data reaching the reduce phase and can improve efficiency considerably.
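As a rough illustration: if one map task emits the pairs

(hello,1) (hello,1) (hello,1) (zhangsan,1)

a summing Combiner running on that map's output shrinks them to

(hello,3) (zhangsan,1)

before the shuffle, and the Reducer still produces the same final totals. This only works because summing is associative; a Combiner must not change the final result.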


c. Add the ansj Chinese word-segmentation dependency, used to tokenize the text (this dependency is unrelated to the Combiner itself)

<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>5.1.1</version>
</dependency>

d. Create the Mapper

public class StoryMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    private Text text = new Text();
    private LongWritable longWritable = new LongWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        // Skip blank lines
        if (!StringUtils.isBlank(line)) {
            // Tokenize the line with ansj
            Result parse = ToAnalysis.parse(line);
            List<Term> terms = parse.getTerms();
            Iterator<Term> iterator = terms.iterator();
            while (iterator.hasNext()) {
                Term term = iterator.next();
                longWritable.set(1);
                text.set(term.getName());
                context.write(text, longWritable);
            }
        }
    }
}

e. Create the Reducer

public class StoryReducer extends Reducer<Text, LongWritable, LongWritable, Text> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<LongWritable> iterator = values.iterator();
        long num = 0;
        while (iterator.hasNext()) {
            LongWritable longWritable = iterator.next();
            num += longWritable.get();
        }
        // Swap key and value so the output lists the count first, then the word
        context.write(new LongWritable(num), key);
    }
}

f. Create the Combiner

public class StoryCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {

    private LongWritable longWritable = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<LongWritable> iterator = values.iterator();
        long num = 0;
        while (iterator.hasNext()) {
            LongWritable count = iterator.next();
            num += count.get();
        }
        longWritable.set(num);
        // Unlike StoryReducer, the Combiner must keep the Mapper's output types
        // (Text key, LongWritable value), because its output is fed back into the shuffle
        context.write(key, longWritable);
    }
}

g. Run the job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URI;

public class TestMain {

    private static final String HDFS_PREFIX = "hdfs://localhost:9527";
    private static FileSystem fs;

    /** Upload the test file to HDFS */
    public static void doBeforeWork(String sourceFilePath, String inFile, String outDir) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_PREFIX); // must match the port configured in core-site.xml
        // Get a FileSystem handle and set its user ("zwj" should be replaced with an administrator account because of Windows permission issues)
        fs = FileSystem.get(new URI(HDFS_PREFIX), conf, "zwj");
        FSDataOutputStream fout = fs.create(new Path(inFile), true);
        InputStream in = new FileInputStream(sourceFilePath);
        IOUtils.copyBytes(in, fout, 1024, true);
        // Delete the output directory if it already exists
        fs.delete(new Path(outDir), true);
    }

    /** Print the result */
    public static void doAfterWork(String outFilePath) throws Exception {
        FSDataInputStream fin = fs.open(new Path(outFilePath));
        IOUtils.copyBytes(fin, System.out, 1024, true);
    }

    /** Run the job */
    public static void run(String inFilePath, String outFilePath) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(TestMain.class);
        job.setMapperClass(StoryMapper.class);
        job.setReducerClass(StoryReducer.class);
        job.setCombinerClass(StoryCombiner.class); // set the Combiner
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(inFilePath));
        FileOutputFormat.setOutputPath(job, new Path(outFilePath));
        boolean b = job.waitForCompletion(true);
        if (b) {
            // success
        }
    }

    public static void main(String[] args) {
        try {
            String sourceFilePath = "E:/tmp/test4.txt";
            String inFile = "/mydir/test4.txt";
            String inFileUrl = HDFS_PREFIX + inFile;
            String outDir = "/outdir";
            String outDirUrl = HDFS_PREFIX + outDir;
            String outFileUrl = outDirUrl + "/part-r-00000";
            doBeforeWork(sourceFilePath, inFile, outDir);
            run(inFileUrl, outDirUrl);
            doAfterWork(outFileUrl);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

6. Reference: https://www.cnblogs.com/zhuxiaojie/p/7224772.html
