数据及需求分析
数据样式
- json类型数据
字段分析:从左到右分别是
id编号 公司名称 学历要求 工作类型 工作名称 薪资 发布时间 截止时间 城市编码 公司规模 福利 岗位职责 地区 工作经验
- 城市数据
城市id和城市名
需求及分析
- 处理工资字段,将其转换为 (最高工资 - 最低工资) / 2 的数值
- 使用城市名替换城市id
- 每一个值都不能为空,只要有一个为空就删除整条数据
- 分析:使用Fastjson将json转换成对象,使用MapJoin可以实现需求二
代码实现阶段
自定义的对象
- 因为要实现 json 格式的转换,属性名要和 json 的 key 保持一致;同时因为该对象要在 MapReduce 中作为 key 进行序列化和排序,所以实现 WritableComparable 接口
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Job-posting record used as the MapReduce key. Field names intentionally
 * match the JSON keys so Fastjson can bind them directly; {@code city_name}
 * is not in the source JSON and is filled in by the map-side join.
 */
public class Data implements WritableComparable<Data> {
    private int id;
    private String company_name;
    private String eduLevel_name;
    private String emplType;
    private String jobName;
    private String salary;
    private String createDate;
    private String endDate;
    private int city_code;
    private String companySize;
    private String welfare;
    private String responsibility;
    private String place;
    private String workingExp;
    // Populated from the cached city table, not from the input JSON.
    private String city_name;

    /**
     * Orders records by {@code id}.
     *
     * <p>Fix: the previous implementation returned 0 for every pair, which
     * violates the compareTo contract and makes the shuffle/sort phase treat
     * all keys as equal. Comparing by the unique record id gives a proper
     * total order.
     */
    @Override
    public int compareTo(Data o) {
        return Integer.compare(this.id, o.id);
    }

    /** Serializes all fields; order must mirror {@link #readFields}. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(id);
        dataOutput.writeUTF(company_name);
        dataOutput.writeUTF(eduLevel_name);
        dataOutput.writeUTF(emplType);
        dataOutput.writeUTF(jobName);
        dataOutput.writeUTF(salary);
        dataOutput.writeUTF(createDate);
        dataOutput.writeUTF(endDate);
        dataOutput.writeInt(city_code);
        dataOutput.writeUTF(companySize);
        dataOutput.writeUTF(welfare);
        dataOutput.writeUTF(responsibility);
        dataOutput.writeUTF(place);
        dataOutput.writeUTF(workingExp);
        dataOutput.writeUTF(city_name);
    }

    /** Deserializes all fields in exactly the order written by {@link #write}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readInt();
        company_name = dataInput.readUTF();
        eduLevel_name = dataInput.readUTF();
        emplType = dataInput.readUTF();
        jobName = dataInput.readUTF();
        salary = dataInput.readUTF();
        createDate = dataInput.readUTF();
        endDate = dataInput.readUTF();
        city_code = dataInput.readInt();
        companySize = dataInput.readUTF();
        welfare = dataInput.readUTF();
        responsibility = dataInput.readUTF();
        place = dataInput.readUTF();
        workingExp = dataInput.readUTF();
        city_name = dataInput.readUTF();
    }

    // NOTE: this string is the job's final output format — city_name replaces
    // city_code per the requirement, so city_code is deliberately omitted.
    @Override
    public String toString() {
        return "Data{" +
                "id=" + id +
                ", company_name='" + company_name + '\'' +
                ", eduLevel_name='" + eduLevel_name + '\'' +
                ", emplType='" + emplType + '\'' +
                ", jobName='" + jobName + '\'' +
                ", salary='" + salary + '\'' +
                ", createDate='" + createDate + '\'' +
                ", endDate='" + endDate + '\'' +
                ", city_name='" + city_name + '\'' +
                ", companySize='" + companySize + '\'' +
                ", welfare='" + welfare + '\'' +
                ", responsibility='" + responsibility + '\'' +
                ", place='" + place + '\'' +
                ", workingExp='" + workingExp + '\'' +
                '}';
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getCompany_name() {
        return company_name;
    }

    public void setCompany_name(String company_name) {
        this.company_name = company_name;
    }

    public String getEduLevel_name() {
        return eduLevel_name;
    }

    public void setEduLevel_name(String eduLevel_name) {
        this.eduLevel_name = eduLevel_name;
    }

    public String getEmplType() {
        return emplType;
    }

    public void setEmplType(String emplType) {
        this.emplType = emplType;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getCreateDate() {
        return createDate;
    }

    public void setCreateDate(String createDate) {
        this.createDate = createDate;
    }

    public String getEndDate() {
        return endDate;
    }

    public void setEndDate(String endDate) {
        this.endDate = endDate;
    }

    public int getCity_code() {
        return city_code;
    }

    public void setCity_code(int city_code) {
        this.city_code = city_code;
    }

    public String getCompanySize() {
        return companySize;
    }

    public void setCompanySize(String companySize) {
        this.companySize = companySize;
    }

    public String getWelfare() {
        return welfare;
    }

    public void setWelfare(String welfare) {
        this.welfare = welfare;
    }

    public String getResponsibility() {
        return responsibility;
    }

    public void setResponsibility(String responsibility) {
        this.responsibility = responsibility;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getWorkingExp() {
        return workingExp;
    }

    public void setWorkingExp(String workingExp) {
        this.workingExp = workingExp;
    }

    public String getCity_name() {
        return city_name;
    }

    public void setCity_name(String city_name) {
        this.city_name = city_name;
    }
}
Map阶段
- Map阶段中的setup方法将城市表缓存到内存里,然后实现替换的操作。先使用JSONObject来逐字段取值,判断是否有任一字段为空:只要有一个为空就丢弃整条数据,全部非空才再进行下一步的处理。将salary按"-"分割,然后按要求进行处理
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
/**
 * Mapper that parses one JSON job posting per line, drops records with any
 * empty/missing field, replaces city_code with the city name via a map-side
 * join against the cached city table, and rewrites salary as
 * (max - min) / 2 per the requirement.
 */
public class MapTest extends Mapper<LongWritable, Text, Data, NullWritable> {

    /** All JSON keys that must be present and non-empty for a record to survive. */
    private static final String[] REQUIRED_FIELDS = {
            "id", "company_name", "eduLevel_name", "emplType", "jobName",
            "salary", "createDate", "endDate", "city_code", "companySize",
            "welfare", "responsibility", "place", "workingExp"
    };

    private Data k = new Data();
    // city_code -> city_name lookup, loaded once from the distributed cache.
    private Map<Integer, String> city = new HashMap<>();

    /**
     * Loads the cached city table ("code,name" per line) into memory.
     * Uses try-with-resources so the reader is closed (the original leaked it).
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] uris = context.getCacheFiles();
        File file = new File(uris[0]);
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split(",");
                city.put(Integer.parseInt(parts[0]), parts[1]);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        JSONObject jsonObject = JSONObject.parseObject(value.toString());

        // Requirement: drop the whole record if ANY field is null or empty.
        // Fix: the original tested s.equals("") before s == null and NPE'd on
        // missing keys — the null check must come first.
        for (String field : REQUIRED_FIELDS) {
            String v = jsonObject.getString(field);
            if (v == null || v.isEmpty()) {
                return;
            }
        }

        k = JSON.parseObject(value.toString(), Data.class);

        // Map-side join: swap the numeric city code for the city name.
        k.setCity_name(city.get(k.getCity_code()));

        // Requirement: salary becomes (max - min) / 2. The original took a
        // fixed one-character substring of each bound (breaking on values of
        // differing widths) and never divided by 2. Extract all digits from
        // each bound instead, e.g. "10K-20K" -> 10 and 20.
        String[] bounds = k.getSalary().split("-");
        if (bounds.length < 2) {
            return; // malformed salary range — treat as invalid data
        }
        int min = extractNumber(bounds[0]);
        int max = extractNumber(bounds[1]);
        k.setSalary(String.valueOf((max - min) / 2));

        context.write(k, NullWritable.get());
    }

    /** Strips every non-digit character and parses the rest; 0 if no digits. */
    private static int extractNumber(String s) {
        String digits = s.replaceAll("\\D", "");
        return digits.isEmpty() ? 0 : Integer.parseInt(digits);
    }
}
Reduce阶段
- 只要循环输出所有的结果即可
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Identity reducer: every surviving record is emitted once per grouped
 * value, so all mapper output reaches the final files unchanged.
 */
public class RedTest extends Reducer<Data, NullWritable, Data, NullWritable> {

    @Override
    protected void reduce(Data key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // One output line per input record; the value itself carries no data.
        for (NullWritable ignored : values) {
            context.write(key, NullWritable.get());
        }
    }
}
Driver阶段
- 注意输入要缓存的文件的路径,即城市表的路径位置
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.File;
import java.net.URI;
/**
 * Driver: wires the mapper/reducer together, registers the city table as a
 * cache file for the map-side join, and runs the job on local paths.
 */
public class DriTest {

    public static void main(String[] args) throws Exception {
        File file = new File("D:\\MP\\招聘数据\\output");
        // Hadoop refuses to start if the output directory already exists,
        // so clear it first. Fix: the original called driver() in both
        // branches of the if/else — hoisted out of the conditional.
        if (file.exists()) {
            delFile(file);
        }
        driver();
    }

    /** Recursively deletes {@code file} and everything beneath it. */
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null && files.length != 0) {
            for (int i = 0; i < files.length; i++) {
                delFile(files[i]);
            }
        }
        file.delete();
    }

    /** Configures and submits the job; exits with 0 on success, 1 on failure. */
    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setMapperClass(MapTest.class);
        job.setJarByClass(DriTest.class);
        job.setReducerClass(RedTest.class);
        job.setMapOutputKeyClass(Data.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Data.class);
        job.setOutputValueClass(NullWritable.class);
        // City table cached to every mapper for the map-side join.
        job.addCacheFile(new URI("file:///D:/MP/招聘数据/input/com.txt"));
        FileInputFormat.setInputPaths(job, "D:\\MP\\招聘数据\\input\\data.json");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\招聘数据\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}