Simple Data Processing

Target result:

DEVICE_ID	sum(BUSINESS_ID)	CLIENT_ID+BUSINESS_ID
101	46	16
101	46	18
101	46	28
102	31	20
102	31	22
103	55	24
103	55	26
103	55	30

Each row's CLIENT_ID+BUSINESS_ID is the per-record sum of the two IDs (e.g. 3+13 = 16 for the first record), and sum(BUSINESS_ID) aggregates BUSINESS_ID per device (13+14+19 = 46 for device 101). The first job below produces the per-record column (table1); the second produces the per-device sums (table2).

Data in the database:
timestamp	row	column	value
2014-2-14 15:12	2014-02-01 13:2000000031013101	CLIENT_ID	3
2014-2-14 15:12	2014-02-01 13:2000000031013101	BUSINESS_ID	13
2014-2-14 15:12	2014-02-01 13:2000000031013101	DEVICE_ID	101
2014-2-14 15:15	2014-02-01 15:0000000041014101	CLIENT_ID	4
2014-2-14 15:15	2014-02-01 15:0000000041014101	BUSINESS_ID	14
2014-2-14 15:15	2014-02-01 15:0000000041014101	DEVICE_ID	101
2014-2-14 15:20	2014-02-01 16:0000000051015102	CLIENT_ID	5
2014-2-14 15:20	2014-02-01 16:0000000051015102	BUSINESS_ID	15
2014-2-14 15:20	2014-02-01 16:0000000051015102	DEVICE_ID	102
2014-2-14 15:27	2014-02-01 20:0000000061016102	CLIENT_ID	6
2014-2-14 15:27	2014-02-01 20:0000000061016102	BUSINESS_ID	16
2014-2-14 15:27	2014-02-01 20:0000000061016102	DEVICE_ID	102
2014-2-14 15:35	2014-02-02 10:0000000071017103	CLIENT_ID	7
2014-2-14 15:35	2014-02-02 10:0000000071017103	BUSINESS_ID	17
2014-2-14 15:35	2014-02-02 10:0000000071017103	DEVICE_ID	103
2014-2-14 15:39	2014-02-02 13:0000000081018103	CLIENT_ID	8
2014-2-14 15:39	2014-02-02 13:0000000081018103	BUSINESS_ID	18
2014-2-14 15:39	2014-02-02 13:0000000081018103	DEVICE_ID	103
2014-2-14 15:50	2014-02-02 19:0000000091019101	CLIENT_ID	9
2014-2-14 15:50	2014-02-02 19:0000000091019101	BUSINESS_ID	19
2014-2-14 15:50	2014-02-02 19:0000000091019101	DEVICE_ID	101
2014-2-14 16:10	2014-02-03 14:0000000101020103	CLIENT_ID	10
2014-2-14 16:10	2014-02-03 14:0000000101020103	BUSINESS_ID	20
2014-2-14 16:10	2014-02-03 14:0000000101020103	DEVICE_ID	103
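
Each record is stored as three cells sharing one row key, and the value we need is the last tab-separated field of each line. The mappers below recover it by collecting digits from a fixed offset (53), which lands inside the column name and therefore skips the digit-bearing timestamp and row key. A minimal standalone sketch of the same extraction, assuming the tab-separated layout shown above (the sample line and class name are illustrative only):

public class ExtractValue {
  public static void main(String[] args) {
    // One raw cell: timestamp, row key, column name, value (tab-separated).
    String line = "2014-2-14 15:12\t2014-02-01 13:2000000031013101\tCLIENT_ID\t3";
    String[] fields = line.split("\t");
    String column = fields[2];  // "CLIENT_ID"
    String value  = fields[3];  // "3"
    System.out.println(column + " = " + value);  // prints: CLIENT_ID = 3
  }
}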



import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, Text> {
    // Input rows arrive in groups of three (CLIENT_ID, BUSINESS_ID, DEVICE_ID).
    // NOTE: the counter assumes all three rows of a record are read in order
    // by a single mapper, which only holds for a single input split.
    private static int count = 1;
    private String c_id;
    private String b_id;
    private String d_id;

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringBuilder ans = new StringBuilder();
      // Collect the digits of the trailing value field, scanning from a fixed
      // offset (53) so the digits in the timestamp and row key are skipped.
      for (int i = 53; i < line.length(); i++) {
        char c = line.charAt(i);
        if (c >= '0' && c <= '9') {
          ans.append(c);
        }
      }
      if (count % 3 == 1) {          // CLIENT_ID row
        c_id = ans.toString();
      } else if (count % 3 == 2) {   // BUSINESS_ID row
        b_id = ans.toString();
      } else {                       // DEVICE_ID row: record complete, emit it
        d_id = ans.toString();
        context.write(new Text(d_id), new Text(b_id + " " + c_id));
      }
      count++;
    }
  }
  
  public static class IntSumReducer
       extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Each value is "BUSINESS_ID CLIENT_ID"; emit CLIENT_ID + BUSINESS_ID,
      // one output line per input record for this DEVICE_ID.
      for (Text val : values) {
        int tall = 0;
        for (String part : val.toString().split(" ")) {
          tall += Integer.parseInt(part);
        }
        context.write(key, new Text(String.valueOf(tall)));
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}


/*

Resulting data:
table1
101	16
101	18
101	28
102	20
102	22
103	24
103	26
103	30

*/
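
The reduce step for table1 is plain per-record addition: for device 101 the (BUSINESS_ID, CLIENT_ID) pairs (13, 3), (14, 4), and (19, 9) yield 16, 18, and 28, as listed above. A quick standalone check of the same parsing-and-summing logic (a hypothetical helper, not part of the job):

public class SumPairCheck {
  // Mirrors the reducer: parse "BUSINESS_ID CLIENT_ID" and add the two numbers.
  static int sumPair(String value) {
    int total = 0;
    for (String part : value.split(" ")) {
      total += Integer.parseInt(part);
    }
    return total;
  }

  public static void main(String[] args) {
    System.out.println(sumPair("13 3"));  // 16
    System.out.println(sumPair("14 4"));  // 18
    System.out.println(sumPair("19 9"));  // 28
  }
}

The second program below computes the per-device sums (table2) from the same raw data.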



import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, Text> {
    // Rows again arrive in groups of three (CLIENT_ID, BUSINESS_ID, DEVICE_ID);
    // the same single-split caveat on the counter applies as in the first job.
    private static int count = 1;
    private String b_id;
    private String d_id;

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringBuilder ans = new StringBuilder();
      // Collect the digits of the trailing value field, scanning from a fixed
      // offset (53) so the digits in the timestamp and row key are skipped.
      for (int i = 53; i < line.length(); i++) {
        char c = line.charAt(i);
        if (c >= '0' && c <= '9') {
          ans.append(c);
        }
      }
      if (count % 3 == 2) {          // BUSINESS_ID row
        b_id = ans.toString();
      } else if (count % 3 == 0) {   // DEVICE_ID row: record complete, emit it
        d_id = ans.toString();
        context.write(new Text(d_id), new Text(b_id));
      }                              // CLIENT_ID row is not needed for table2
      count++;
    }
  }
  
  public static class IntSumReducer
       extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Sum every BUSINESS_ID emitted for this DEVICE_ID. Because integer
      // addition is associative, this reducer is also safe as a combiner.
      int sum = 0;
      for (Text val : values) {
        sum += Integer.parseInt(val.toString());
      }
      context.write(key, new Text(String.valueOf(sum)));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
/*

Resulting data:
table2
101	46
102	31
103	55

*/
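
Since the second job is a pure integer sum, it could equally use Hadoop's numeric writables end to end, which avoids the string parsing in the reducer. A minimal sketch of that variant (an alternative to the code above, assuming the mapper is changed to emit IntWritable values; the class name is illustrative):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Alternative reducer for table2; assumes the mapper emits
// (DEVICE_ID as Text, BUSINESS_ID as IntWritable).
public class IntSumVariant
     extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();  // accumulate BUSINESS_ID values per DEVICE_ID
    }
    result.set(sum);
    context.write(key, result);  // e.g. (101, 46)
  }
}

With IntWritable values the driver would also set job.setOutputValueClass(IntWritable.class); the combiner stays safe for the same reason as before, since integer addition is associative and commutative.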

