Hadoop data cleaning
Code for finding the hottest search term of each month
FirstMapper.java
package com.hniu.bigdata.hadoop.First;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Date;

public class FirstMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Split the record into whitespace-separated fields
        String[] values = line.split("\\s");
        // Convert the trailing epoch-millisecond timestamp into a yyyy-MM-dd date
        String time = values[values.length - 1];
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        String dateTime = format.format(new Date(Long.parseLong(time)));
        values[values.length - 1] = dateTime;
        // Keep only the domain part of the URL field
        String url = values[4];
        String domain = url.split("/")[0];
        values[4] = domain;
        // Re-join the cleaned fields and emit the whole record as the key
        line = StringUtils.join(" ", values);
        context.write(new Text(line), NullWritable.get());
    }
}
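As a sanity check, the snippet below replays the mapper's per-record logic outside MapReduce. The sample line and its field layout are made up for illustration; the real log format may differ, but it must carry the URL at index 4 and an epoch-millisecond timestamp as the last field for FirstMapper to work.

// Illustration only (not part of the original code): a standalone check of the
// mapper's transformation on one hypothetical record.
public class FirstMapperDemo {
    public static void main(String[] args) {
        String line = "u001 hadoop 1 1 www.example.com/page?q=hadoop 1577836800000";
        String[] values = line.split("\\s");
        String dateTime = new java.text.SimpleDateFormat("yyyy-MM-dd")
                .format(new java.util.Date(Long.parseLong(values[values.length - 1])));
        values[values.length - 1] = dateTime;
        values[4] = values[4].split("/")[0];
        // Prints e.g. "u001 hadoop 1 1 www.example.com 2020-01-01"
        // (the exact date depends on the JVM's default time zone)
        System.out.println(String.join(" ", values));
    }
}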
FirstMapReduce.java
package com.hniu.bigdata.hadoop.First;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FirstMapReduce {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Point the client at the HDFS NameNode
        configuration.set("fs.defaultFS", "hdfs://192.168.179.46:8020");
        Job job = Job.getInstance(configuration, "data clean");
        job.setJarByClass(FirstMapReduce.class);
        job.setMapperClass(FirstMapper.class);
        // Map-only job: cleaning needs no reducer
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path("/syz"));
        // The output directory must not already exist when the job starts
        FileOutputFormat.setOutputPath(job, new Path("/Clean_Data"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
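The cleaning job above is only the first step toward the section's goal of finding the hottest search term of each month. The original does not show the counting step; the sketch below is one possible follow-up job, under two assumptions that need checking against the real log format: the search keyword sits at field index 2 of the cleaned record, and the yyyy-MM-dd date is its last field. It sums (month, keyword) pairs, from which the top keyword per month can be read off; the class name SecondMapReduce and the /Monthly_Count output path are placeholders.

package com.hniu.bigdata.hadoop.First;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecondMapReduce {

    // Emits ("yyyy-MM<TAB>keyword", 1) for every cleaned record.
    public static class SecondMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\\s");
            String keyword = values[2]; // assumption: keyword field index
            String month = values[values.length - 1].substring(0, 7); // yyyy-MM from yyyy-MM-dd
            context.write(new Text(month + "\t" + keyword), ONE);
        }
    }

    // Sums the counts for each (month, keyword) pair.
    public static class SecondReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable c : counts) {
                sum += c.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://192.168.179.46:8020");
        Job job = Job.getInstance(configuration, "monthly keyword count");
        job.setJarByClass(SecondMapReduce.class);
        job.setMapperClass(SecondMapper.class);
        job.setCombinerClass(SecondReducer.class);
        job.setReducerClass(SecondReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/Clean_Data"));
        FileOutputFormat.setOutputPath(job, new Path("/Monthly_Count"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}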