MapReduce电商日志文件分析(一)
MapReduce电商日志文件分析(二)
问题三
在这里我们需要进行ETL操作。
Mapper
在 map 阶段,我们逐条读取数据,调用编写好的 LogParser 类中的 parse 方法对数据进行解析(即生成对应的 map 键值对,如 “ip” 对应的 ip 地址),然后调用编写好的 GetPageId 类的 getPageId 方法,获取数据中的 PageId,接着使用 StringBuilder 对各个值进行拼接,拼接完成的字符串作为键,其值设置为 1。
LogParser
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Parses one raw access-log record into a map of named fields.
 *
 * <p>A record is a single line whose fields are separated by the {@code \001} control
 * character. The fields at fixed positions (url, sessionId, ip, time) are copied into the
 * result, and the region information returned by {@code IPParser} for the ip is added as
 * {@code country}, {@code province} and {@code city}.
 */
public class LogParser {

    private static final Logger logger = LoggerFactory.getLogger(LogParser.class);

    /** Separator between the fields of one record. */
    private static final String FIELD_SEPARATOR = "\001";

    /** Zero-based positions of the fields this parser extracts. */
    private static final int URL_INDEX = 1;
    private static final int SESSION_ID_INDEX = 10;
    private static final int IP_INDEX = 13;
    private static final int TIME_INDEX = 17;

    /** A record must have at least this many fields for every index above to be valid. */
    private static final int MIN_FIELD_COUNT = TIME_INDEX + 1;

    /**
     * Parses a single log record.
     *
     * @param log the raw record; may be {@code null} or blank
     * @return a map with the keys {@code ip}, {@code url}, {@code sessionId}, {@code time} and,
     *         when region info is available, {@code country}, {@code province} and
     *         {@code city}; an empty map (never {@code null}) when the record is blank or has
     *         too few fields
     */
    public Map<String, String> parse(String log) {
        Map<String, String> logInfo = new HashMap<String, String>();

        if (StringUtils.isBlank(log)) {
            logger.error("日志记录的格式不正确:" + log);
            return logInfo;
        }

        // A negative limit keeps trailing empty fields, so a record whose last columns are
        // empty still has the expected number of elements.
        String[] splits = log.split(FIELD_SEPARATOR, -1);
        if (splits.length < MIN_FIELD_COUNT) {
            // Truncated or malformed record: report it instead of failing with
            // ArrayIndexOutOfBoundsException and killing the task.
            logger.error("日志记录的格式不正确:" + log);
            return logInfo;
        }

        String ip = splits[IP_INDEX];
        logInfo.put("ip", ip);
        logInfo.put("url", splits[URL_INDEX]);
        logInfo.put("sessionId", splits[SESSION_ID_INDEX]);
        logInfo.put("time", splits[TIME_INDEX]);

        IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
        // NOTE(review): analyseIp is not visible here; guard in case it returns null for an
        // ip it cannot resolve — confirm against IPParser.
        if (regionInfo != null) {
            logInfo.put("country", regionInfo.getCountry());
            logInfo.put("province", regionInfo.getProvince());
            logInfo.put("city", regionInfo.getCity());
        }
        return logInfo;
    }
}
GetPageId
import org.apache.commons.lang.StringUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts the numeric page (topic) id from a request URL.
 */
public class GetPageId {

    /**
     * Matches {@code topicId=<digits>} and captures the digits. Compiled once because
     * {@link #getPageId(String)} is called for every log record.
     */
    private static final Pattern TOPIC_ID_PATTERN = Pattern.compile("topicId=([0-9]+)");

    /**
     * Returns the digits of the first {@code topicId=<digits>} occurrence in the URL.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return the topic id digits, or an empty string when {@code url} is {@code null} or
     *         contains no {@code topicId=<digits>} part
     */
    public static String getPageId(String url) {
        // Only null needs an explicit guard: empty or whitespace-only text can never match
        // the pattern, so it falls through to the empty result below.
        if (url == null) {
            return "";
        }
        Matcher matcher = TOPIC_ID_PATTERN.matcher(url);
        return matcher.find() ? matcher.group(1) : "";
    }
}
mapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Map;
/**
 * Map stage of the log ETL job.
 *
 * <p>Each input line is parsed with {@link LogParser}, the page id is extracted from the url
 * with {@link GetPageId}, and the non-empty fields are joined into one labelled string. That
 * string is emitted as the key with the value {@code 1}.
 */
public class LogETLMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private static final Logger logger = LoggerFactory.getLogger(LogETLMapper.class);

    /** Separator placed between two labelled fields of the output key. */
    private static final String FIELD_SEPARATOR = ", ";

    private final Text outputKey = new Text();
    private final LogParser logParser = new LogParser();

    /**
     * Turns one log line into a {@code (labelled fields, 1)} pair.
     *
     * @param key     the input key supplied by the framework; not used
     * @param value   the raw log line
     * @param context the context the output pair is written to
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if writing the output pair is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();

        // The parser reports an unusable record with an empty map, so an empty result has to
        // be treated as a failure as well as null.
        Map<String, String> logInfo = logParser.parse(line);
        if (logInfo == null || logInfo.isEmpty()) {
            logger.error("日志记录的格式不正确或解析失败:" + line);
            return;
        }

        String url = logInfo.get("url");
        String topicId = GetPageId.getPageId(url);

        // Only non-empty fields are written, each prefixed with its label.
        StringBuilder sb = new StringBuilder();
        appendField(sb, "IP: ", logInfo.get("ip"));
        appendField(sb, "URL: ", url);
        appendField(sb, "PageId: ", topicId);
        appendField(sb, "Country: ", logInfo.get("country"));
        appendField(sb, "Province: ", logInfo.get("province"));
        appendField(sb, "City: ", logInfo.get("city"));

        if (sb.length() == 0) {
            // Nothing usable in this record: report it rather than emitting an empty key.
            logger.error("所有字段为空,日志记录:" + line);
            return;
        }

        outputKey.set(sb.toString());
        context.write(outputKey, one);
    }

    /**
     * Appends {@code label + value} to the builder, preceded by the separator when the builder
     * already holds a field. Null and empty values are skipped.
     */
    private static void appendField(StringBuilder sb, String label, String value) {
        if (value == null || value.isEmpty()) {
            return;
        }
        if (sb.length() > 0) {
            sb.append(FIELD_SEPARATOR);
        }
        sb.append(label).append(value);
    }
}
Reducer
reduce 阶段无需多言,与前面的 reduce 阶段的处理逻辑一样。
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce stage of the log ETL job: adds up all counts received for a key and writes the key
 * with that total.
 */
public class LogETLReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Reused output value holding the total of the key being reduced. */
    private IntWritable total = new IntWritable();

    /**
     * Writes {@code key} together with the sum of {@code values}.
     *
     * @param key     the key being reduced
     * @param values  the counts collected for the key
     * @param context the context the result is written to
     * @throws IOException          if writing the result fails
     * @throws InterruptedException if writing the result is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        total.set(sumOf(values));
        context.write(key, total);
    }

    /** Adds up the int values of all elements. */
    private static int sumOf(Iterable<IntWritable> counts) {
        int accumulated = 0;
        for (IntWritable count : counts) {
            accumulated = accumulated + count.get();
        }
        return accumulated;
    }
}
Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the log ETL job.
 */
public class LogETLDriver {

    /**
     * Runs the job and exits the JVM with {@code 0} when it completes successfully, {@code 1}
     * when it does not, or {@code -1} when the arguments are wrong.
     *
     * @param args exactly two elements: the input path and the output path
     * @throws Exception if creating, configuring or running the job fails
     */
    public static void main(String[] args) throws Exception {
        boolean hasBothPaths = args.length == 2;
        if (!hasBothPaths) {
            System.err.println("Usage: LogETLDriver <input path> <output path>");
            System.exit(-1);
        }

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        Job etlJob = Job.getInstance(new Configuration(), "Log ETL");
        etlJob.setJarByClass(LogETLDriver.class);

        // The reducer class is registered as the combiner too.
        etlJob.setMapperClass(LogETLMapper.class);
        etlJob.setCombinerClass(LogETLReducer.class);
        etlJob.setReducerClass(LogETLReducer.class);

        etlJob.setOutputKeyClass(Text.class);
        etlJob.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(etlJob, inputPath);
        FileOutputFormat.setOutputPath(etlJob, outputPath);

        boolean succeeded = etlJob.waitForCompletion(true);
        int exitCode = succeeded ? 0 : 1;
        System.exit(exitCode);
    }
}