Hadoop 0.20.2 DistributedCache program example

Code 1: AdlogETL.java

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AdlogETL extends Configured implements Tool {

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private IpCity ipcity;
		// TreeMap/HashMap are used to find which range of the IP table a given IP falls into
		private TreeMap<Long, String> tm;
		private HashMap<Long, Long> hm;
		// Pattern used to extract the required fields from each ad-log line;
		// m.group(): 1 -> time, 3 -> ad, 4 -> ap, 5 -> mac, 9 -> ip, 13 -> cookie
		private static final Pattern LOG_PATTERN = Pattern
				.compile("([^,]*),([^;]*);ad=([^;]*);ap=([^;]*);mac=([^;]*);([^\"]*)\",([^,]*),(\"[^\"]*)\",([^,]*),([^,]*),\"([^\"]*)\",\"([^\"]*)\",\"([^\"]*)\"");
		// df parses timestamps in "yyyy-MM-dd HH:mm:ss" format so they can be converted to milliseconds
		SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

		// setup runs once when the map task starts (the equivalent method in the pre-0.20 API is configure)
		@Override
		public void setup(Context context) {
			try {
				ipcity = new IpCity();
				// "ip" is the name of the distributed-cache file shipped with the -files option
				ipcity.initialize(new File("ip"));
				tm = ipcity.getTypeName();
				hm = ipcity.getIpregion();
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
		}
		

		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			Matcher m = LOG_PATTERN.matcher(line);
			// Only lines that match the expected ad-log format are emitted
			if (m.matches()) {
				// Look up the city id of the client IP; "999999" means unknown
				String city = "999999";
				long long_ip = ip2long(m.group(9));
				if (tm.floorEntry(long_ip) != null) {
					long tmp = tm.floorKey(long_ip);
					if (hm.containsKey(tmp) && long_ip <= hm.get(tmp))
						city = tm.get(tmp);
				}
				String mac = m.group(5);
				String time = m.group(1);
				Date date = new Date();
				try {
					date = df.parse(time);
				} catch (ParseException e) {
					// Fall back to the current time if the timestamp cannot be parsed
					e.printStackTrace();
				}
				long sec = date.getTime() / 1000;

				if (mac.isEmpty()) {
					mac = "NULL";
				}
				// Output value: time \t ap \t ad \t area_id \t cookie_id, keyed by mac
				context.write(new Text(mac), new Text(String.valueOf(sec) + "\t"
						+ m.group(4) + "\t" + m.group(3) + "\t" + city + "\t"
						+ m.group(13)));
			}
		}
	}


	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		// Identity reduce: pass every value through unchanged
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			for (Text val : values) {
				context.write(key, val);
			}
		}
	}

	public int run(String[] args) throws Exception {

		Configuration conf = getConf();
		Job job = new Job(conf, "AdlogETL");
		job.setJarByClass(AdlogETL.class);
		Path in = new Path(args[0]);
		Path out = new Path(args[1]);
		FileInputFormat.setInputPaths(job, in);
		FileOutputFormat.setOutputPath(job, out);
		
		job.setMapperClass(MyMap.class);
		job.setReducerClass(MyReduce.class);
//		job.setCombinerClass(MyReduce.class);

		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// With zero reduce tasks the job is map-only: MyReduce is never invoked
		// and the map output is written directly to the output directory
		job.setNumReduceTasks(0);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	// Convert a dotted-quad IPv4 string (e.g. "1.2.3.4") to its numeric value;
	// returns 0 for malformed input
	public static long ip2long(String str) {
		if (!str.matches("^\\d+\\.\\d+\\.\\d+\\.\\d+$"))
			return 0;
		String[] strFields = str.split("\\.");
		long[] nFields = new long[4];
		for (int i = 0; i < 4; i++) {
			nFields[i] = Long.parseLong(strFields[i]);
		}
		return (nFields[0] << 24) + (nFields[1] << 16) + (nFields[2] << 8)
				+ nFields[3];
	}

	
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new AdlogETL(), args);
		System.exit(res);
	}
}
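
The IP table is not packaged into the jar; it is shipped to every task through the distributed cache. Because the driver goes through ToolRunner, the generic -files option is available, and the shipped file shows up in each task's working directory under its base name, which is why the mapper can simply open new File("ip"). A hypothetical invocation (the jar name and paths below are made up for illustration):

hadoop jar adlogetl.jar AdlogETL -files /local/path/ip /input/adlog /output/adlog_etl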

Code 2: IpCity.java

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.TreeMap;

import org.apache.hadoop.io.IOUtils;

public class IpCity {
	// start_ip -> city_id
	TreeMap<Long, String> tm = new TreeMap<Long, String>();
	// start_ip -> end_ip
	HashMap<Long, Long> hm = new HashMap<Long, Long>();

	// Load the IP table: each line is tab-separated, with the range start in
	// field 0, the range end in field 1 and the city id in field 3
	public void initialize(File file) throws IOException {
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
			String line;
			while ((line = in.readLine()) != null) {
				String[] fields = line.trim().split("\t");
				long s_ip = Long.parseLong(fields[0]);
				long e_ip = Long.parseLong(fields[1]);
				String city_id = fields[3];
				tm.put(s_ip, city_id);
				hm.put(s_ip, e_ip);
			}
		} finally {
			IOUtils.closeStream(in);
		}
	}

	public TreeMap<Long, String> getTypeName() {
		return tm;
	}

	public HashMap<Long, Long> getIpregion() {
		return hm;
	}
	 
}
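
The lookup the mapper performs against these two maps can be tried outside of Hadoop. The sketch below is a minimal, self-contained example of the same floorKey/end-ip check, using two made-up table rows instead of a real IP table and assuming AdlogETL (for its ip2long helper) is on the classpath; the city ids are hypothetical:

import java.util.HashMap;
import java.util.TreeMap;

public class IpLookupDemo {
	public static void main(String[] args) {
		// Hypothetical table rows (start_ip -> city_id, start_ip -> end_ip)
		TreeMap<Long, String> tm = new TreeMap<Long, String>();
		HashMap<Long, Long> hm = new HashMap<Long, Long>();
		tm.put(16843008L, "110000");   hm.put(16843008L, 16843263L);      // 1.1.1.0 - 1.1.1.255
		tm.put(3232235520L, "310000"); hm.put(3232235520L, 3232301055L);  // 192.168.0.0 - 192.168.255.255

		long ip = AdlogETL.ip2long("192.168.1.20");
		String city = "999999";                                 // default: unknown
		if (tm.floorEntry(ip) != null) {                        // largest start_ip <= ip
			long start = tm.floorKey(ip);
			if (hm.containsKey(start) && ip <= hm.get(start))   // ip also below the range end
				city = tm.get(start);
		}
		System.out.println(city);                               // prints 310000
	}
}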



