电商实战
使用mapreduce编程
运行环境搭建
1.maven配置
2.hadoop下载,并配置环境变量
3.导入依赖,pom.xml
<!-- 添加hadoop的依赖 -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.1.1</version>
</dependency>
4.idea创建maven项目
编写代码
ProvinceStatApp.java
省份浏览量统计
package com.task.ds.pro;
import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
/**
* 省份浏览量统计
*/
public class ProvinceStatApp {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.get(configuration);
Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out");
if (fileSystem.exists(outputPath)) {
fileSystem.delete(outputPath, true);
}
Job job = Job.getInstance(configuration);
job.setJarByClass(ProvinceStatApp.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
FileOutputFormat.setOutputPath(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out"));
job.waitForCompletion(true);
}
static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
private LongWritable ONE = new LongWritable(1);
private LogParser logParser;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
logParser = new LogParser();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String log = value.toString();
Map<String, String> info = logParser.parse(log);
String ip = info.get("ip");
if (StringUtils.isNotBlank(ip)) {
IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
if (regionInfo != null) {
String provine = regionInfo.getProvince();
if (StringUtils.isNotBlank(provine)) {
context.write(new Text(provine), ONE);
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
}
}
static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long count = 0;
System.out.println(context);
for (LongWritable value : values) {
count++;
}
context.write(key, new LongWritable(count));
}
}
}
结果展示
- 923
上海市 72898
云南省 1480
内蒙古自治区 1298
北京市 42501
台湾省 254
吉林省 1435
四川省 4442
天津市 11042
宁夏 352
安徽省 5429
山东省 10145
山西省 2301
广东省 51508
广西 1681
新疆 840
江苏省 25042
江西省 2238
河北省 7294
河南省 5279
浙江省 20627
海南省 814
湖北省 7187
湖南省 2858
澳门特别行政区 6
甘肃省 1039
福建省 8918
西藏 110
贵州省 1084
辽宁省 2341
重庆市 1798
陕西省 2487
青海省 336
香港特别行政区 45
黑龙江省 1968
etl.java
提取关键信息:ip、url、pageId(topicId对应的页面Id)、country、province、city
package com.task.ds.pro;
import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
/**
* etl提取关键信息
*/
/**
 * ETL job: extracts key fields (pageId, ip, country, province, city, time)
 * from each raw log line, keyed by URL.
 *
 * <p>Mapper: parses the log line with {@link LogParser}, derives the pageId
 * from the URL via {@code ContentUtils}, and emits
 * (url, ",pageId,ip,country,province,city,time").
 *
 * <p>Reducer: identity — forwards every (url, record) pair unchanged.
 */
public class etl {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        FileSystem fileSystem = FileSystem.get(configuration);
        // Delete the job's OWN output directory if it exists (the original code
        // deleted trackInfo_out but wrote to outInfo_out, so re-runs failed
        // with FileAlreadyExistsException).
        Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\outInfo_out");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        Job job = Job.getInstance(configuration);
        // Reference this class, not ProvinceStatApp, so Hadoop ships the right jar.
        job.setJarByClass(etl.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
        FileOutputFormat.setOutputPath(job, outputPath);
        // Propagate the job result so callers/scripts can detect failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private LogParser logParser;
        // Created once per task in setup() instead of once per record in map().
        private ContentUtils contentUtils;
        // Reused output objects to avoid per-record allocation.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
            contentUtils = new ContentUtils();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Map<String, String> info = logParser.parse(value.toString());
            String url = info.get("url");
            String out = "," + contentUtils.getPageId(url)
                    + "," + info.get("ip")
                    + "," + info.get("country")
                    + "," + info.get("province")
                    + "," + info.get("city")
                    + "," + info.get("time");
            outKey.set(url);
            outValue.set(out);
            context.write(outKey, outValue);
        }
    }

    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Identity reduce: emit each extracted record as-is.
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }
}
结果展示
http://b2b.yihaodian.com/cms/view.do?topicId=21407 ,21407,115.85.250.26,中国,甘肃省,兰州市,2013-07-21 19:25:27
http://b2b.yihaodian.com/cms/view.do?topicId=23542&tracker_u=1010825072 ,23542,101.227.253.115,中国,上海市,null,2013-07-21 05:30:27
http://b2b.yihaodian.com/cms/view.do?topicId=23704&ref=ad.12084_2108676_1 ,23704,180.155.238.184,中国,上海市,null,2013-07-21 09:12:56
http://b2b.yihaodian.com/cms/view.do?topicId=23705&ref=ad.7901_2115482_1 ,23705,119.85.21.23,中国,重庆市,null,2013-07-21 17:25:49
http://b2b.yihaodian.com/customerMessage.do ,,222.69.22.126,中国,上海市,南汇区,2013-07-21 18:59:50
http://b2b.yihaodian.com/index.do ,,222.177.67.13,中国,重庆市,null,2013-07-21 17:50:08