Data Preprocessing with MapReduce

This post walks through data preprocessing with MapReduce: writing a data transformation class, implementing the Mapper class for the map task, building the MapReduce driver class, configuring pom.xml, and packaging the program and submitting it to a Hadoop cluster. A worked example shows the steps involved when processing a large amount of data and what the job looks like when it runs on the cluster.

Contents

1. Create the data transformation class

2. Create the Mapper class for the map task

3. Create the MapReduce driver class

4. Write pom.xml

5. Package the program and submit it to the cluster


1. Create the data transformation class

package com.position.clean;

import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;

public class CleanJob {

    // Remove every occurrence of delChar from str, e.g. strip the 'k' from "15k-25k".
    public static String deleteString(String str, char delChar) {
        StringBuffer stringBuffer = new StringBuffer();
        for (int i = 0; i < str.length(); i++) {
            if (str.charAt(i) != delChar) {
                stringBuffer.append(str.charAt(i));
            }
        }
        return stringBuffer.toString();
    }

    // Merge the positionAdvantage text and the companyLabelList array into one
    // "-"-separated welfare string.
    public static String mergeString(String position, JSONArray company) throws JSONException {
        String result = "";
        if (company.length() != 0) {
            for (int i = 0; i < company.length(); i++) {
                result = result + company.get(i) + "-";
            }
        }
        if (!"".equals(position)) {
            // Split on the common separators (pipe, semicolon, comma and slash,
            // half-width and full-width), then strip any remaining punctuation.
            String[] positionList = position.split("\\||;|,|,|;|/");
            for (int i = 0; i < positionList.length; i++) {
                result = result + positionList[i].replaceAll("[\\pP\\p{Punct}]", "");
            }
        }
        // Drop the last character (intended to remove the trailing "-" separator).
        return result.substring(0, result.length() - 1);
    }

    // Join the skillLables array into a "-"-separated string, or return "null" if it is empty.
    public static String killResult(JSONArray killData) throws JSONException {
        String result = "";
        if (killData.length() != 0) {
            for (int i = 0; i < killData.length(); i++) {
                result = result + killData.get(i) + "-";
            }
            return result.substring(0, result.length() - 1);
        } else {
            return "null";
        }
    }

    // Convert the "result" array of job records into CSV-style lines: city,salary,welfare,skills
    public static String resultToString(JSONArray jobdata) throws JSONException {
        String jobResultData = "";
        for (int i = 0; i < jobdata.length(); i++) {
            String everyData = jobdata.get(i).toString();
            JSONObject everyDataJson = new JSONObject(everyData);
            String city = everyDataJson.getString("city");
            String salary = everyDataJson.getString("salary");
            String positionAdvantage = everyDataJson.getString("positionAdvantage");
            JSONArray companyLabelList = everyDataJson.getJSONArray("companyLabelList");
            JSONArray skillLables = everyDataJson.getJSONArray("skillLables");
            String salaryNew = deleteString(salary, 'k');
            String welfare = mergeString(positionAdvantage, companyLabelList);
            String kill = killResult(skillLables);
            if (i == jobdata.length() - 1) {
                jobResultData = jobResultData + city + "," + salaryNew + "," + welfare + "," + kill;
            } else {
                jobResultData = jobResultData + city + "," + salaryNew + "," + welfare + "," + kill + "\n";
            }
        }
        return jobResultData;
    }
}
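
To get a feel for what CleanJob produces before wiring it into MapReduce, here is a small standalone sketch (my own illustration, not part of the original project) that feeds resultToString a single hand-written record with the same fields the Mapper extracts later (city, salary, positionAdvantage, companyLabelList, skillLables):

package com.position.clean;

import org.codehaus.jettison.json.JSONArray;

// Hypothetical demo class, used only to illustrate CleanJob locally.
public class CleanJobDemo {
    public static void main(String[] args) throws Exception {
        String record = "{\"city\":\"上海\",\"salary\":\"15k-25k\","
                + "\"positionAdvantage\":\"带薪年假,五险一金\","
                + "\"companyLabelList\":[\"节日礼物\",\"定期体检\"],"
                + "\"skillLables\":[\"Hadoop\",\"Spark\"]}";
        JSONArray jobdata = new JSONArray("[" + record + "]");
        // Prints: 上海,15-25,节日礼物-定期体检-带薪年假五险一,Hadoop-Spark
        // (mergeString trims the final character, which may also explain why some
        // welfare strings in the cluster output in step 5 look cut short).
        System.out.println(CleanJob.resultToString(jobdata));
    }
}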

2. Create the Mapper class for the map task

package com.position.clean;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONObject;

public class CleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String jobResultData = "";
        String reptileData = value.toString();
        // Drop everything up to and including the second '=' so that only the JSON payload remains.
        String jobData = reptileData.substring(reptileData.indexOf("=", reptileData.indexOf("=") + 1) + 1);
        try {
            // Drill down through the crawled response: content -> positionResult -> result.
            JSONObject contentJson = new JSONObject(jobData);
            String contentData = contentJson.getString("content");
            JSONObject positionResultJson = new JSONObject(contentData);
            String positionResultData = positionResultJson.getString("positionResult");
            JSONObject resultJson = new JSONObject(positionResultData);
            JSONArray resultData = resultJson.getJSONArray("result");
            // Clean the records and emit them as the output key; no value is needed.
            jobResultData = CleanJob.resultToString(resultData);
            context.write(new Text(jobResultData), NullWritable.get());
        } catch (Exception e) {
            // Skip records that cannot be parsed, but log the failure.
            e.printStackTrace();
        }
    }
}
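
The substring expression above is the only part that depends on how the crawler wrote its files: it simply discards everything up to and including the second '=' and treats the rest of the line as the JSON payload. A throwaway sketch with a made-up line (the real prefix format is not shown in this post, so treat the example line as an assumption):

// Illustration only: the prefix "timestamp=20221016=" is invented; the real crawl
// files just need the JSON payload to start after the second '='.
public class PrefixStripDemo {
    public static void main(String[] args) {
        String reptileData = "timestamp=20221016={\"content\":\"...\"}";
        String jobData = reptileData.substring(
                reptileData.indexOf("=", reptileData.indexOf("=") + 1) + 1);
        System.out.println(jobData);   // prints {"content":"..."}
    }
}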

3. Create the MapReduce driver class

package com.position.clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.BasicConfigurator;

public class CleanMain {
    public static void main(String[] args) throws Exception {
        // Set up a basic log4j console appender so that job progress is printed.
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        // Separate generic Hadoop options (-D, -files, ...) from the program arguments.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: CleanMain <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "job");
        job.setJarByClass(CleanMain.class);
        job.setMapperClass(CleanMapper.class);

        // Combine the many small crawl files into larger splits:
        // at least 2 MB (2097152 bytes) and at most 4 MB (4194304 bytes) per split.
        // With 30 small input files this yields a single split, as the job log in step 5 shows.
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMinInputSplitSize(job, 2097152);
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
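
Note that the driver never sets a Reducer, so Hadoop falls back to the default identity Reducer; that is why the log in step 5 shows a reduce phase and why the output file is named part-r-00000. The reduce phase contributes nothing here beyond sorting the emitted records, so if you want to skip it, a one-line change (my suggestion, not part of the original code) would be:

// Optional: make this a map-only job. With zero reduce tasks the mapper output
// is written directly, and the output files are named part-m-00000 instead of part-r-00000.
job.setNumReduceTasks(0);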

4. Write pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.itcast.jobcase</groupId>
  <artifactId>jobcase-clean</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
  	<dependency>
  		<groupId>org.apache.hadoop</groupId>
  		<artifactId>hadoop-common</artifactId>
  		<version>2.7.1</version>
  	</dependency>
  	<dependency>
  		<groupId>org.apache.hadoop</groupId>
  		<artifactId>hadoop-client</artifactId>
  		<version>2.7.1</version>
  	</dependency>
  </dependencies>
</project>
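
Two notes on this pom. First, the org.codehaus.jettison classes used by CleanJob and CleanMapper normally arrive transitively through hadoop-common; if your build cannot resolve them, declare the jettison dependency explicitly. Second, a plain mvn package jar is enough here because the driver class is named on the hadoop jar command line in the next step and the Hadoop libraries are already on the cluster classpath. If you would rather run the jar without naming the main class, a minimal sketch of a build section that records CleanMain in the manifest (my addition, not part of the original pom) looks like this:

  <build>
    <plugins>
      <!-- Optional: record the driver class in META-INF/MANIFEST.MF -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.position.clean.CleanMain</mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
    </plugins>
  </build>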

5. Package the program and submit it to the cluster

Packaging option 1: open a terminal (cmd) in the project directory and run

mvn package

Packaging option 2: use the packaging built into IDEA

// Upload jobcase-clean-0.0.1-SNAPSHOT.jar to the cluster and run it

// Upload with WinSCP
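
// Alternatively, upload from the command line with scp (assumption: the jar is in the
// project's target directory and the node is reachable as hadoop1, as elsewhere in this post)

scp target/jobcase-clean-0.0.1-SNAPSHOT.jar root@hadoop1:~/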

// Rename the jar

[root@hadoop1 ~]# mv jobcase-clean-0.0.1-SNAPSHOT.jar  clean.jar

// Start the cluster

[root@hadoop1 ~]# start-all.sh 
[root@hadoop1 ~]# jps
2112 Jps
1745 ResourceManager
1365 NameNode
1494 DataNode
1894 NodeManager

// Run the job

[root@hadoop1 ~]# hadoop jar clean.jar com.position.clean.CleanMain /JobData/20221016/ /JobData/output

22/10/18 20:34:14 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /JobData/output
[root@hadoop1 ~]#  hadoop jar clean.jar com.position.clean.CleanMain /JobData/20221016/ /JobData/output
22/10/18 20:34:20 INFO client.RMProxy: Connecting to ResourceManager at hadoop1/192.168.100.160:8032
0 [main] INFO org.apache.hadoop.yarn.client.RMProxy  - Connecting to ResourceManager at hadoop1/192.168.100.160:8032
22/10/18 20:34:21 INFO input.FileInputFormat: Total input paths to process : 30
1074 [main] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat  - Total input paths to process : 30
22/10/18 20:34:21 INFO input.CombineFileInputFormat: DEBUG: Terminated node allocation with : CompletedNodes: 3, size left: 1669481
1097 [main] INFO org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat  - DEBUG: Terminated node allocation with : CompletedNodes: 3, size left: 1669481
22/10/18 20:34:21 INFO mapreduce.JobSubmitter: number of splits:1
1192 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter  - number of splits:1
22/10/18 20:34:21 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1666096184094_0001
1374 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter  - Submitting tokens for job: job_1666096184094_0001
22/10/18 20:34:22 INFO impl.YarnClientImpl: Submitted application application_1666096184094_0001
1874 [main] INFO org.apache.hadoop.yarn.client.api.impl.YarnClientImpl  - Submitted application application_1666096184094_0001
22/10/18 20:34:22 INFO mapreduce.Job: The url to track the job: http://hadoop1:8088/proxy/application_1666096184094_0001/
1895 [main] INFO org.apache.hadoop.mapreduce.Job  - The url to track the job: http://hadoop1:8088/proxy/application_1666096184094_0001/
22/10/18 20:34:22 INFO mapreduce.Job: Running job: job_1666096184094_0001
1896 [main] INFO org.apache.hadoop.mapreduce.Job  - Running job: job_1666096184094_0001
22/10/18 20:34:28 INFO mapreduce.Job: Job job_1666096184094_0001 running in uber mode : false
8041 [main] INFO org.apache.hadoop.mapreduce.Job  - Job job_1666096184094_0001 running in uber mode : false
22/10/18 20:34:28 INFO mapreduce.Job:  map 0% reduce 0%
8042 [main] INFO org.apache.hadoop.mapreduce.Job  -  map 0% reduce 0%
22/10/18 20:34:35 INFO mapreduce.Job:  map 100% reduce 0%
15112 [main] INFO org.apache.hadoop.mapreduce.Job  -  map 100% reduce 0%
22/10/18 20:34:41 INFO mapreduce.Job:  map 100% reduce 100%
21241 [main] INFO org.apache.hadoop.mapreduce.Job  -  map 100% reduce 100%
22/10/18 20:34:41 INFO mapreduce.Job: Job job_1666096184094_0001 completed successfully
21257 [main] INFO org.apache.hadoop.mapreduce.Job  - Job job_1666096184094_0001 completed successfully
................

// Inspect the cleaned data

[root@hadoop1 ~]# hadoop dfs -cat /JobData/output/part-r-00000
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

上海,15-25,带薪年假-五险一金-节日礼物-定期体检-全面福利体系work life blanc,null
上海,20-35,高薪诚,null
深圳,20-40,福,null
广州,12-24,扁平化管理-现象级产品-互联网百,null
南京,15-30,年底双薪-技能培训-免费班车-带薪年假-六险一金 通讯补助 大平台 核心部,Hadoop-Spark-MySQL-Hive
杭州,15-25,绩效奖金-专项奖金-扁平管理-扁平化管,null
上海,22-32,年底双薪-专项奖金-股票期权-年终分红-不内卷加班,null
北京,25-45,扁平管理-弹性工作-就近租房补贴-六险一金-六险一金弹性工作免费三餐租房补,null
北京,30-50,节日礼物-年度旅游-扁平管理-定期体检-不加班 团队氛围

.......
