利用Hadoop 根据IP地址进行地域统计

最新推荐文章于 2022-04-15 15:49:52 发布

weixin_34295316

最新推荐文章于 2022-04-15 15:49:52 发布

阅读量250

点赞数

文章标签：大数据

原文链接：https://my.oschina.net/wangjiankui/blog/50638

版权

2019独角兽企业重金招聘Python工程师标准>>>

最近一直在做地域统计的功能，用户下载app的日志中记录了IP，老大要根据这个IP做个地域统计，看看哪些地方的用户喜欢下载哪些应用。最初是用Java跑纯真IP地址数据库，然后对日志进行统计，不过效率不高，20+M的数据要跑几个小时，后来就把任务给我了，让我用Hadoop跑跑试试。

1. ip的解析，把ip地址解析成实际地址：使用了纯真ip数据库QQwry.dat

参考代码：http://blog.csdn.net/swazn_yj/article/details/1611020

2.新建一个hadoop项目，将ip解析的3个类放到一个包下面
3.QQwry的测试使用：项目中需要拿到ip的国家和省，然后追加到原始日志作为2个新字段，但是测试发现返回的数据不能直接拿来处理，所以就需要进一步的格式化处理。

@1.对于：内蒙古，广西，新疆，宁夏，西藏这5个自治区，直接返回中国+自治区名

@2，包含省字的，直接返回中国+省份，

@3，包含市的，主要是上海，北京，重庆，天津4个直辖市，返回中国+城市

@5，包含中国的，直接返回中国，省份字段留空；

@6.其他就直接返回解析的数据，省份留空 (基本就是国外的)

@4，测试中发现这样处理不干净，还有很多脏数据，例如包含大学和学院的，还有欧洲中部，XX网吧等。所以需要对这些脏数据进行处理。采用的方法是将脏数据按一定格式保存到dirtydata.txt中，然后初始化到一个map中，利用map进行格式化。

4.根据3中的格式化需求，编写格式化函数

private String formatCity(String country) {
	// 特殊地区处理，
	for (String spe : spelist) {
		if (country.indexOf(spe) != -1)
			return "中国," + spe;
	}
	if (country.indexOf("省") != -1) {
		String contrysplit[] = country.split("省");
		return "中国," + contrysplit[0] + "省";
	else if (country.indexOf("市") != -1) {
		String citysplist[] = country.split("市");
		return "中国," + citysplist[0] + "市";
	} else if (umap.containsKey(country)) {
		eturn "中国," + umap.get(country);
	} else if (country.indexOf("中国") != -1) {
		return "中国," + "";
	} else {
		return country + "," + "";
	}
}

5.对于脏数据的解析，读取txt文件，然后编写解析函数
/**
 * Loads the dirty-data lookup table from a comma-separated text file.
 *
 * <p>Only lines that split into exactly three fields are used; the first
 * field becomes the key and the third field the value. All other lines are
 * skipped. If a key occurs more than once the last occurrence wins.
 *
 * @param filepath path of the text file to read
 * @return a map from the first field to the third field of each valid line;
 *         if reading fails part-way, the entries read so far
 * @throws FileNotFoundException if the file cannot be opened
 */
public Map<String, String> getUniversMap(String filepath)
				throws FileNotFoundException {
	Map<String, String> universMap = new HashMap<String, String>();
	BufferedReader br = new BufferedReader(new FileReader(filepath));
	try {
		String readoneline;
		while ((readoneline = br.readLine()) != null) {
			String[] tmp = readoneline.split(",");
			if (tmp.length == 3) {
				universMap.put(tmp[0], tmp[2]);
			}
		}
	} catch (IOException e) {
		// Best effort: keep whatever was parsed before the read error.
		e.printStackTrace();
	} finally {
		// The reader was previously never closed, leaking a file handle.
		try {
			br.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if close() itself fails.
		}
	}
	return universMap;
}

6.编写map/reduce程序，调用1中的函数，然后对结果进行格式化

public class ConvertIp {

	public static class ItemMapper extends
			Mapper<Object, Text, Text, NullWritable> {
		private Text outkey = new Text("");
		private IPSeeker ipSeeker;
		private String filepath;
		Map<String, String> umap;
		final String spelist[] = { "内蒙古", "广西", "新疆", "宁夏", "西藏" };
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = value.toString();
			String details[] = line.split(",");
			if(details.length != 15){
				return;
			}
			String ip = details[3];
			String reg = "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})";
			// data clien 1.length=15 2.the university
			if (ip.matches(reg)) {
				outkey.set(new StringBuffer().append(line).append(
						",").append(formatCity(ipSeeker.getCountry(ip)))
						.toString());

				context.write(outkey, NullWritable.get());
			}
		}

		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			ipSeeker = new IPSeeker("qqwry.dat");  //初始化，这种写法需要在执行hadoop命令时
使用-files +qqwry.dat的路径,+dirtydata.txt的路径（上传到job的临时目录下）
                        filepath = "dirtydata.txt";         //初始化
			try {
				umap = getUniversMap(filepath);
			} catch (FileNotFoundException e) {
				e.printStackTrace();
			}
			super.setup(context);
		}

		private String formatCity(String country) {
			for (String spe : spelist) {
				if (country.indexOf(spe) != -1)
					return "中国," + spe;
			}
			if (country.indexOf("省") != -1) {
				String contrysplit[] = country.split("省");
				return "中国," + contrysplit[0] + "省";
			} else if (country.indexOf("市") != -1) {
				String citysplist[] = country.split("市");
				return "中国," + citysplist[0] + "市";
			} else if (umap.containsKey(country)) {
				return "中国," + umap.get(country);
			} else if (country.indexOf("中国") != -1) {
				return "中国," + "";
			} else {
				return country + "," + "";
			}
		}

		public Map<String, String> getUniversMap(String filepath)
				throws FileNotFoundException {
			Map<String, String> universMap = new HashMap<String, String>();
			FileReader fr = new FileReader(filepath);
			BufferedReader br = new BufferedReader(fr);
			String readoneline;
			String tmp[];
			try {
				while ((readoneline = br.readLine()) != null) {
					tmp = readoneline.split(",");
					if(tmp.length == 3){
						universMap.put(tmp[0], tmp[2]);
					}
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
			return universMap;
		}
	}

	public static class ItemReducer extends
			Reducer<Text, NullWritable, Text, NullWritable> {

		protected void reduce(Text key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}

	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: -file <file> <in> <out>");
			System.exit(2);
		}

		Job job = new Job(conf, "ipcount");
		job.setJarByClass(ConvertIp.class);
		job.setMapperClass(ItemMapper.class);
		job.setReducerClass(ItemReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

7.打包成jar，上传到服务器，然后进行测试

hadoop jar /home/wjk/ipcount.jar com.wjk.datastat.ip.ConvertIp -files /home/wjk/qqwry.dat,/home/wjk/dirtydata.txt  /user/hive/warehouse/active_log/dt=20120301  /user/wjk/output/datastat

转载于:https://my.oschina.net/wangjiankui/blog/50638