The following is a complete MapReduce implementation, covering:
1. Extracting the partition field from the input path;
2. Building the set of input paths;
3. A helper method that, given a date and a number of days, walks backwards and returns the corresponding list of dates;
4. Reading the arguments passed when the jar is executed (a usage example follows the code);
5. Setting the relevant job execution parameters.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;
/**
 * User: leen
 * Date: 2017/4/5
 * Time: 19:19
 */
public class test_dwd_ec_prod_info_cm {
    public static class MyMapper extends Mapper<Object,Text,Text,Text>{
        /**
         * Extract the partition date from the input file path.
         * @param filePath full path of the file the current split belongs to
         * @return the 8-digit date (yyyyMMdd) that follows "pt_date=", or null if not found
         */
        public String getDateFromPath(String filePath){
            if(filePath == null){
                return null;
            }
            int index = filePath.indexOf("pt_date=");
            // "pt_date=" is 8 characters long; the date itself is another 8 (yyyyMMdd).
            if(index < 0 || index + 16 > filePath.length()){
                return null;
            }
            return filePath.substring(index + 8, index + 16);
        }
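        // getDateFromPath example (hypothetical path): for
        //   "hdfs://ns1/group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=20170405/part-00000"
        // the "pt_date=" marker is found and the method returns "20170405".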
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Get the path of the file this split was read from.
            InputSplit inputSplit = context.getInputSplit();
            String filePath = ((FileSplit)inputSplit).getPath().toString();
            String[] arr = value.toString().split("\t");
            if(filePath.contains("dwd_ec_prod_info") && arr.length >= 20){
                String ec_prd_cd          = arr[0];  // e-commerce product code
                String ec_shop_cd         = arr[1];  // e-commerce shop code
                String ec_seller_name     = arr[2];  // e-commerce seller name
                String ec_cat_cd          = arr[3];  // product category ID
                String ec_brand_name      = arr[4];  // brand name
                String prd_price          = arr[5];  // product price
                String title              = arr[6];  // title
                String indu_cd            = arr[7];  // industry code
                String prd_type_cd        = arr[8];  // industry type code
                String brand_cd           = arr[9];  // brand code
                String prd_cd             = arr[10]; // product code
                String prd_comment_count  = arr[11]; // total number of product comments
                String url                = arr[12]; // product URL
                String imgurl             = arr[13]; // image URL
                String market_time        = arr[14]; // time the product went on the market
                String prd_sale_cnt       = arr[15]; // cumulative 30-day sales
                String today_prd_sale_cnt = arr[16]; // sales for the current day
                String prd_sale_income    = arr[17]; // sales revenue for the current day
                String domain             = arr[18]; // site domain
                String para_config        = arr[19]; // configuration parameters
                String pt_date = getDateFromPath(filePath); // partition date
                // Keep the row only if at least one of the four code fields (indu_cd, prd_type_cd,
                // brand_cd, prd_cd) is not the literal string "null" and the product code contains no '.'.
                if((!indu_cd.equalsIgnoreCase("null") || !prd_type_cd.equalsIgnoreCase("null") ||
                        !brand_cd.equalsIgnoreCase("null") || !prd_cd.equalsIgnoreCase("null")) && !ec_prd_cd.contains(".")){
                    context.write(new Text(domain + "\t" + ec_prd_cd), new Text(ec_shop_cd+"\t"+ec_seller_name+"\t"+ec_cat_cd+"\t"+
                            ec_brand_name+"\t"+prd_price+"\t"+title+"\t"+indu_cd+"\t"+prd_type_cd+"\t"+brand_cd+"\t"+prd_cd+"\t"+
                            prd_comment_count+"\t"+url+"\t"+imgurl+"\t"+market_time+"\t"+prd_sale_cnt+"\t"+today_prd_sale_cnt+"\t"+
                            prd_sale_income+"\t"+domain+"\t"+para_config+"\t"+pt_date));
                }
            }
        }
    }
    public static class MyReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // The map key is "domain \t ec_prd_cd"; among all records for that key,
            // keep only the one with the latest partition date.
            String[] arrKeys = key.toString().split("\t");
            if(arrKeys.length == 2){
                String ec_prd_cd = arrKeys[1]; // index 0 is the domain, index 1 the product code
                String latest_content = null, latest_date = null;
                for(Text val : values){
                    String[] arrValues = val.toString().split("\t");
                    String pt_date = arrValues[arrValues.length - 1]; // the partition date is the last field
                    if(latest_date == null || latest_date.compareTo(pt_date) < 0){
                        latest_date = pt_date;
                        latest_content = val.toString();
                    }
                }
                if(latest_content != null){
                    context.write(new Text(ec_prd_cd), new Text(latest_content));
                }
            }
        }
    }
    /**
     * Build the list of input paths that actually exist on HDFS.
     * @param selected_dates the partition dates (yyyyMMdd) to read
     * @param fs the HDFS file system handle
     * @return the paths of the existing pt_date partitions
     * @throws IOException
     */
    public static ArrayList<Path> GenerateInputPaths(ArrayList<String> selected_dates, FileSystem fs) throws IOException {
        ArrayList<Path> inputPaths = new ArrayList<Path>();
        for(String str : selected_dates){
            String pathStr = "/group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=" + str;
            Path path = new Path(pathStr);
            // Only add partitions that actually exist; missing days are skipped silently.
            if(fs.exists(path)){
                inputPaths.add(path);
                FileStatus[] fileStatuses = fs.listStatus(path);
                System.out.println(fileStatuses.length + "\t" + path);
            }
        }
        return inputPaths;
    }
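    // Example (hypothetical dates): for selected_dates = ["20170405", "20170404"], assuming only
    // the 20170405 partition exists on HDFS, GenerateInputPaths returns the single path
    //   /group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=20170405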
    /**
     * Starting from the given date and going back the given number of days,
     * return the corresponding list of dates.
     * @param cur_date the starting date (yyyyMMdd)
     * @param K how many days to include, counting backwards from cur_date
     * @return the list of dates, newest first
     * @throws ParseException
     */
    public static ArrayList<String> getPeriodDates(String cur_date, Integer K) throws ParseException {
        ArrayList<String> dates = new ArrayList<String>();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        Date cur = sdf.parse(cur_date);
        for(int i = 0; i < K; i++){
            Calendar cal = Calendar.getInstance();
            cal.setTime(cur);
            cal.add(Calendar.DATE, -i);
            dates.add(sdf.format(cal.getTime()));
        }
        return dates;
    }
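    // For example, getPeriodDates("20170405", 3) returns ["20170405", "20170404", "20170403"].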
    /**
     * Parse the command-line arguments, then configure and run the job.
     * @param args the reference date (yyyyMMdd) and the path of a properties file
     * @throws Exception
     */
    public static void Process(String[] args) throws Exception{
        System.out.println("---------------Process-->");
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 2){
            System.out.println("Two arguments are required: <date yyyyMMdd> <properties file>");
            return;
        }
        String cur_date = otherArgs[0], prop_filePath = otherArgs[1];
        ArrayList<String> selected_dates = getPeriodDates(cur_date, 30);
        Properties prop = new Properties();
        prop.load(new FileInputStream(prop_filePath)); // load the properties file from the given path
        // Start requesting resources for the reduce tasks only after 100% of the maps
        // have completed; the default is 0.05.
        conf.setInt("mapreduce.job.reduce.slowstart.completedmaps", 1);
        System.out.println("mapreduce.job.reduce.slowstart.completedmaps = " + conf.getStrings("mapreduce.job.reduce.slowstart.completedmaps", "1")[0]);
        Job job = Job.getInstance(conf, "test_dwd_ec_prod_info_cm - chl");
        job.setJarByClass(test_dwd_ec_prod_info_cm.class);
        job.setNumReduceTasks(100);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileSystem fs = FileSystem.get(conf);
        ArrayList<Path> inputPaths = GenerateInputPaths(selected_dates, fs);
        for (Path path : inputPaths){
            FileInputFormat.addInputPath(job, path);
        }
        Path outputPath = new Path("/group/user/tools/meta/hive-temp-table/chenhaolin.db/dwd_ec_prod_info_for_cm/pt_date=" + cur_date);
        // If the output directory already exists, move it to the trash first;
        // otherwise FileOutputFormat would refuse to start the job.
        if(fs.exists(outputPath)){
            Trash trash = new Trash(conf);
            trash.moveToTrash(outputPath);
        }
        System.out.println("---------------setOutputPath-->");
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
        fs.close();
    }

    public static void main(String[] args) throws Exception {
        Process(args);
    }
}
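With the main entry point above, the job takes two arguments: the reference date and a properties file. A hypothetical invocation (the jar name and the properties path are placeholders) would look like:

hadoop jar test_dwd_ec_prod_info_cm.jar test_dwd_ec_prod_info_cm 20170405 /path/to/job.properties

This reads the 30 pt_date partitions ending at 20170405 (skipping any that are missing) and writes, for each (domain, ec_prd_cd) pair, the record with the latest partition date into the pt_date=20170405 output partition.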