在Excel 中解析通话记录统计
现在假设数据存在于Excel 文件中。具体的实现方案为:
1、添加POI 解析Excel文件。
2、通过实现FileInputFormat 自定义文件输入流。
3、可以通过文件分片将不同的输出输出到不同的目录下。
4、最后将需要的包一并打包到!/some.jar/lib 目录下。最后的项目结构如图所示:
Excel 的文件格式如图5.2.2 所示:
上图的通讯记录(可以自行去移动或是联通的网上营业厅下载通讯记录的Excel 文件),为联通电话的通讯记录。可以自己通过联通或是移动的官方网站上下载自己手机的通话记录详单进行分析即可。
步1:添加依赖:作用是解析excel表格
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.16</version>
</dependency>
步2、开发JavaBean
通过实现接口WritableComparable 可以实现序列化和排序的功能。
package cn.hadoop.excel;
java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class PhoneBean implements WritableComparable<PhoneBean> {
private String tel;// 电话号码
private String type;// 类型,主叫或是被叫
private Long seconds;
private String fileName;//根据文件名进行分组
//其他方法略去....
}
步3、开发文件解析对象
这个解析类,只是一个独立的类,主要用于读取excel 文件,并从中读取数据。
package cn.hadoop.excel;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
public class ExcelParser {
/** 接收InputStream 返回解析的结果*/
public static List<PhoneBean> parse(InputStream in,String fileName) {
List<PhoneBean> list = new ArrayList<PhoneBean>();
HSSFWorkbook book = null;
try {
book = new HSSFWorkbook(in);
HSSFSheet sheet = book.getSheetAt(0);
Iterator<Row> it = sheet.iterator();
while (it.hasNext()) {
// 如果还有下一个
Row row = it.next();
String tel = row.getCell(5).getStringCellValue();
if (!tel.matches("\\d+")) {
continue;// 如果不是数字则下一个
}
String times = row.getCell(3).getStringCellValue();
Long time = parse(times);// 转成秒
String type = row.getCell(4).getStringCellValue();
PhoneBean bean = new PhoneBean(tel, type, time);
bean.setFileName(fileName);
list.add(bean);
}
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
try {
book.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return list;
}
/** 将:2 分40 秒转成160 秒<br>*/
public static Long parse(String time) {
Long times = 0L;
if (time.contains("分")) {
String min = time.substring(0, time.indexOf("分"));
times += Long.parseLong(min) * 60;
time = time.substring(time.lastIndexOf("分") + 1);
}
String ss = time.substring(0, time.length() - 1);
times += Long.parseLong(ss);
return times;
}
/**将秒转成分种*/
public static String parse(Long times){
String str = "";
if(times>(60*60)){
Long _times = times/(60*60);
str=_times+"时";
times=times%(60*60);
}
if(times>60){
Long _times=times/60;
str=str+_times+"分";
times=times%60;
}
str=str+times+"秒";
return str;
}
}
步4、开发ExcelFileInputFormat
这个类输出的数据, 就是Mapper 接收的数据类型。注意观察
FileInputFormat<PhoneBean,NullWritable>
/**用于从Excel 中解析数据<br>*/
public class ExcelFileInputFormat extends FileInputFormat<PhoneBean,
NullWritable> {
/** 读取文件*/
public RecordReader<PhoneBean, NullWritable>
createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
return new ExcelRecordReader();
}
class ExcelRecordReader extends RecordReader<PhoneBean, NullWritable> {
private List<PhoneBean> list;
private int size;
private int index = -1;
private PhoneBean bean = new PhoneBean();
private InputStream in;
/** 分片读取数据*/
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit fs = (FileSplit) split;
FileSystem fs2 =
FileSystem.get(context.getConfiguration());
in = fs2.open(fs.getPath());
String fileName =
fs.getPath().toString();// hdfs://server:port/a.txt
fileName =
fileName.substring(fileName.lastIndexOf("/") + 1);
list = ExcelParser.parse(in,fileName);// 解析xml 文件
size = list.size();
}
public boolean nextKeyValue() throws IOException,
InterruptedException {
index++;
if (index < list.size()) {
return true;
}
return false;
}
public PhoneBean getCurrentKey() throws IOException,
InterruptedException {
return list.get(index);
}
public NullWritable getCurrentValue() throws IOException,
}
public PhoneBean getCurrentKey() throws IOException,
InterruptedException {
return list.get(index);
}
public NullWritable getCurrentValue() throws IOException,
步5、开发主类及Mapper 和Reducer
这儿开发的Mapper 和Reducer 与之前的一样,只是Mapper 中接收到的Key 和value
为InputFormart 输出的Key 和value,请稍加注意。
public class ExcelParserMain {
public static void main(String[] args) throws Exception {
Configuration config = new Configuration();
Job job = Job.getInstance(config);
job.setJarByClass(ExcelParserMain.class);
job.setMapperClass(ExcelMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(PhoneBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(ExcelReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(ExcelFileInputFormat.class);
job.setPartitionerClass(ExcelFileNamePartitioner.class);
job.setNumReduceTasks(3);
job.waitForCompletion(true);
}
public static class ExcelMapper extends Mapper<PhoneBean,
NullWritable, Text, PhoneBean> {
private Text key2 = new Text();
private LongWritable times = new LongWritable();
public void map(PhoneBean key, NullWritable value,
Mapper<PhoneBean, NullWritable, Text, PhoneBean>.Context context)
throws IOException, InterruptedException {
String key2 = key.getTel() + ":" + key.getType();
this.key2.set(key2);
times.set(key.getSeconds());
context.write(this.key2, key);// 组成:xxxxxx 主叫
}
}
public static class ExcelReducer extends Reducer<Text,PhoneBean,Text,Text> {
private Text value4 = new Text();
protected void reduce(Text text, Iterable<PhoneBean> it,
Reducer<Text, PhoneBean, Text, Text>.Context context)
throws IOException, InterruptedException {
Long sum = new Long(0);
for (PhoneBean bean : it) {
sum+=bean.getSeconds();
}
//将秒转换
String str = ExcelParser.parse(sum);
value4.set(str);
context.write(text, value4);
}}}
步6、打包运行
由于poi.jar 并不是hadoop 的包,所以,可以先将项目打成jar 包,然后在包里
面创建一个lib 目录,并将poi.jar 放到这个目录里面即可。
可以将some.xls 即获取到的excel 文件上传到hdfs 然后执行以下命令,执
行mapreduce 任务:
命令:
$ hadoop jar phonedata.jar /a.xls /out