目录
1.创建数据转换类
package com.position.clean;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * Cleaning helpers for raw job-posting JSON scraped from the recruitment site.
 * Produces CSV-style lines: city,salary,welfare,skills.
 */
public class CleanJob {

    /**
     * Returns {@code str} with every occurrence of {@code delChar} removed
     * (e.g. strips the 'k' unit from salary strings like "15k-25k").
     *
     * @param str     source string, must not be null
     * @param delChar character to delete
     * @return str without any delChar characters
     */
    public static String deleteString(String str, char delChar) {
        // StringBuilder instead of StringBuffer: no shared state, no need for synchronization.
        StringBuilder sb = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c != delChar) {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Merges the company welfare labels and the free-text position advantage
     * into one '-' separated string, stripping all punctuation from the
     * position text.
     *
     * @param position free-text "positionAdvantage" field (may be empty)
     * @param company  JSON array of welfare labels (may be empty)
     * @return merged string; empty when both inputs are empty
     * @throws JSONException if a company element cannot be read
     */
    public static String mergeString(String position, JSONArray company) throws JSONException {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < company.length(); i++) {
            result.append(company.get(i)).append('-');
        }
        // Bug fix: the original compared with `position != ""` (reference
        // comparison, effectively always true). Use isEmpty() instead.
        if (position != null && !position.isEmpty()) {
            // The original split on "|;|,||,|;|/" — a regex with empty
            // alternatives that splits between every character. Since the
            // pieces were then punctuation-stripped and re-concatenated, the
            // net effect is identical to stripping punctuation from the whole
            // string, which is what we do directly here.
            result.append(position.replaceAll("[\\pP\\p{Punct}]", ""));
        }
        // Bug fix: the original unconditionally chopped the last character,
        // which (a) threw StringIndexOutOfBoundsException on empty results
        // and (b) truncated the final character of the position text (the
        // sample output shows "work life blanc"). Only trim a trailing '-'.
        int len = result.length();
        if (len > 0 && result.charAt(len - 1) == '-') {
            result.setLength(len - 1);
        }
        return result.toString();
    }

    /**
     * Joins the skill labels with '-'; returns the literal string "null"
     * when the array is empty (downstream code expects that sentinel).
     *
     * @param killData JSON array of skill labels
     * @return '-'-joined labels, or "null" when empty
     * @throws JSONException if an element cannot be read
     */
    public static String killResult(JSONArray killData) throws JSONException {
        if (killData.length() == 0) {
            return "null";
        }
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < killData.length(); i++) {
            if (i > 0) {
                result.append('-');
            }
            result.append(killData.get(i));
        }
        return result.toString();
    }

    /**
     * Converts the "result" JSON array into newline-separated CSV rows of
     * city,salary,welfare,skills (no trailing newline).
     *
     * @param jobdata array of job-posting JSON objects
     * @return one CSV row per posting, joined with '\n'
     * @throws JSONException if a required field is missing or malformed
     */
    public static String resultToString(JSONArray jobdata) throws JSONException {
        // StringBuilder avoids the O(n^2) string concatenation of the original.
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < jobdata.length(); i++) {
            JSONObject everyDataJson = new JSONObject(jobdata.get(i).toString());
            String city = everyDataJson.getString("city");
            String salary = everyDataJson.getString("salary");
            String positionAdvantage = everyDataJson.getString("positionAdvantage");
            JSONArray companyLabelList = everyDataJson.getJSONArray("companyLabelList");
            JSONArray skillLables = everyDataJson.getJSONArray("skillLables");
            String salaryNew = deleteString(salary, 'k');
            String welfare = mergeString(positionAdvantage, companyLabelList);
            String kill = killResult(skillLables);
            if (i > 0) {
                out.append('\n');
            }
            out.append(city).append(',')
               .append(salaryNew).append(',')
               .append(welfare).append(',')
               .append(kill);
        }
        return out.toString();
    }
}
2.创建实现Map任务的Mapper类
package com.position.clean;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * Map-only cleaning step: each input line is one raw crawl record whose JSON
 * payload is extracted, cleaned via {@link CleanJob}, and emitted as the key
 * (value is NullWritable — no reduce-side data needed).
 */
public class CleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String reptileData = value.toString();
        // The raw line has the form "...=...=<json>": keep everything after
        // the second '='. NOTE(review): assumes every record contains at
        // least two '=' characters — confirm against the crawler's format.
        String jobData = reptileData.substring(
                reptileData.indexOf("=", reptileData.indexOf("=") + 1) + 1);
        try {
            JSONObject contentJson = new JSONObject(jobData);
            JSONObject positionResultJson = new JSONObject(contentJson.getString("content"));
            JSONObject resultJson = new JSONObject(positionResultJson.getString("positionResult"));
            JSONArray resultData = resultJson.getJSONArray("result");
            // Bug fix: the original invoked resultToString twice back-to-back
            // and discarded the first result — the full clean/format pass ran
            // twice per record for nothing.
            String jobResultData = CleanJob.resultToString(resultData);
            context.write(new Text(jobResultData), NullWritable.get());
        } catch (Exception e) {
            // Best-effort cleaning: log and skip malformed records rather
            // than failing the whole task attempt.
            e.printStackTrace();
        }
    }
}
3.创建MapReduce程序执行主类
package com.position.clean;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.BasicConfigurator;
/**
 * Driver for the job-data cleaning MapReduce job.
 * Usage: CleanMain &lt;input path&gt; &lt;output path&gt;
 */
public class CleanMain {

    public static void main(String[] args) throws Exception {
        // Default log4j setup so Hadoop client logging appears on the console.
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            // Bug fix: the original printed "wordcout<in> <out>" — a typo
            // copied from a WordCount template, with no space or newline.
            System.err.println("Usage: CleanMain <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor (deprecated since Hadoop 2.x).
        Job job = Job.getInstance(conf, "job");
        job.setJarByClass(CleanMain.class);
        job.setMapperClass(CleanMapper.class);
        // Combine many small crawl files into few splits (2–4 MB per split)
        // so the job does not spawn one mapper per tiny input file.
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMinInputSplitSize(job, 2097152);  // 2 MB
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);  // 4 MB
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4.编写pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.itcast.jobcase</groupId>
<artifactId>jobcase-clean</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<!-- Hadoop common classes (Configuration, Path, Writable types).
     NOTE(review): hadoop-client already pulls in hadoop-common transitively,
     so this explicit entry is likely redundant — verify before removing. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.1</version>
</dependency>
<!-- Hadoop MapReduce client API (Job, Mapper, input/output formats).
     Version must match the cluster's Hadoop version (2.7.1 here). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.1</version>
</dependency>
</dependencies>
</project>
5.将程序打包提交到集群运行
打包方式一、进入项目路径下打开cmd输入
mvn package
打包方式二、使用idea自带的打包
//上传jobcase-clean-0.0.1-SNAPSHOT.jar到集群并运行
//使用winscp上传
//改名
[root@hadoop1 ~]# mv jobcase-clean-0.0.1-SNAPSHOT.jar clean.jar
//启动集群
[root@hadoop1 ~]# start-all.sh
[root@hadoop1 ~]# jps
2112 Jps
1745 ResourceManager
1365 NameNode
1494 DataNode
1894 NodeManager
//运行
[root@hadoop1 ~]# hadoop jar clean.jar com.position.clean.CleanMain /JobData/20221016/ /JobData/output
22/10/18 20:34:14 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /JobData/output
[root@hadoop1 ~]# hadoop jar clean.jar com.position.clean.CleanMain /JobData/20221016/ /JobData/output
22/10/18 20:34:20 INFO client.RMProxy: Connecting to ResourceManager at hadoop1/192.168.100.160:8032
0 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at hadoop1/192.168.100.160:8032
22/10/18 20:34:21 INFO input.FileInputFormat: Total input paths to process : 30
1074 [main] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 30
22/10/18 20:34:21 INFO input.CombineFileInputFormat: DEBUG: Terminated node allocation with : CompletedNodes: 3, size left: 1669481
1097 [main] INFO org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat - DEBUG: Terminated node allocation with : CompletedNodes: 3, size left: 1669481
22/10/18 20:34:21 INFO mapreduce.JobSubmitter: number of splits:1
1192 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter - number of splits:1
22/10/18 20:34:21 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1666096184094_0001
1374 [main] INFO org.apache.hadoop.mapreduce.JobSubmitter - Submitting tokens for job: job_1666096184094_0001
22/10/18 20:34:22 INFO impl.YarnClientImpl: Submitted application application_1666096184094_0001
1874 [main] INFO org.apache.hadoop.yarn.client.api.impl.YarnClientImpl - Submitted application application_1666096184094_0001
22/10/18 20:34:22 INFO mapreduce.Job: The url to track the job: http://hadoop1:8088/proxy/application_1666096184094_0001/
1895 [main] INFO org.apache.hadoop.mapreduce.Job - The url to track the job: http://hadoop1:8088/proxy/application_1666096184094_0001/
22/10/18 20:34:22 INFO mapreduce.Job: Running job: job_1666096184094_0001
1896 [main] INFO org.apache.hadoop.mapreduce.Job - Running job: job_1666096184094_0001
22/10/18 20:34:28 INFO mapreduce.Job: Job job_1666096184094_0001 running in uber mode : false
8041 [main] INFO org.apache.hadoop.mapreduce.Job - Job job_1666096184094_0001 running in uber mode : false
22/10/18 20:34:28 INFO mapreduce.Job: map 0% reduce 0%
8042 [main] INFO org.apache.hadoop.mapreduce.Job - map 0% reduce 0%
22/10/18 20:34:35 INFO mapreduce.Job: map 100% reduce 0%
15112 [main] INFO org.apache.hadoop.mapreduce.Job - map 100% reduce 0%
22/10/18 20:34:41 INFO mapreduce.Job: map 100% reduce 100%
21241 [main] INFO org.apache.hadoop.mapreduce.Job - map 100% reduce 100%
22/10/18 20:34:41 INFO mapreduce.Job: Job job_1666096184094_0001 completed successfully
21257 [main] INFO org.apache.hadoop.mapreduce.Job - Job job_1666096184094_0001 completed successfully
................
//查看数据
[root@hadoop1 ~]# hadoop dfs -cat /JobData/output/part-r-00000
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.上海,15-25,带薪年假-五险一金-节日礼物-定期体检-全面福利体系work life blanc,null
上海,20-35,高薪诚,null
深圳,20-40,福,null
广州,12-24,扁平化管理-现象级产品-互联网百,null
南京,15-30,年底双薪-技能培训-免费班车-带薪年假-六险一金 通讯补助 大平台 核心部,Hadoop-Spark-MySQL-Hive
杭州,15-25,绩效奖金-专项奖金-扁平管理-扁平化管,null
上海,22-32,年底双薪-专项奖金-股票期权-年终分红-不内卷加班,null
北京,25-45,扁平管理-弹性工作-就近租房补贴-六险一金-六险一金弹性工作免费三餐租房补,null
北京,30-50,节日礼物-年度旅游-扁平管理-定期体检-不加班 团队氛围.......