最近实验有一个需求:分析50G左右的大文本数据,提取出指定字段并序列化对象。我打算用MapReduce来读取数据并进行统计计算。首先建立bean类,主要用于记录Item的内部属性,如下所示:
package tju.hadoop.mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Bean describing a tendering/bidding (招投标) record. Implements
 * {@link org.apache.hadoop.io.WritableComparable} so instances can be
 * serialized and compared by the Hadoop MapReduce framework.
 */
public class ItemBean implements WritableComparable<ItemBean>{
// Project name.
public String name;
// Tendering (bid-inviting) organization.
public String unit;
// Bidding (bid-submitting) organization.
public String agency;
// Publication time of the announcement.
public String time;
// Planned budget.
public String budget;
// Actual amount spent.
public String amount;
// Unique identifier for the event.
// NOTE(review): all fields are public even though getters/setters exist;
// consider making them private to enforce encapsulation.
public String identifier;
/** Returns the unique event identifier. */
public String getIdentifier() {
    return this.identifier;
}

/** Sets the unique event identifier. */
public void setIdentifier(String identifier) {
    this.identifier = identifier;
}
/**
 * Creates a fully-populated item record.
 *
 * @param identifier unique event identifier
 * @param name       project name
 * @param unit       tendering (bid-inviting) organization
 * @param agency     bidding (bid-submitting) organization
 * @param time       publication time
 * @param budget     planned budget
 * @param amount     actual amount spent
 */
public ItemBean(String identifier,String name,String unit,String agency,String time,String budget,String amount) {
    // Assignments mirror the parameter order; each field is independent.
    this.identifier = identifier;
    this.name = name;
    this.unit = unit;
    this.agency = agency;
    this.time = time;
    this.budget = budget;
    this.amount = amount;
}
// No-arg constructor — presumably required so Hadoop can instantiate the bean
// reflectively during Writable deserialization; confirm against readFields()
// (defined elsewhere in this class).
public ItemBean() {}
/** Returns the project name. */
public String getName() {
    return this.name;
}

/** Sets the project name. */
public void setName(String name) {
    this.name = name;
}
/** Returns the tendering (bid-inviting) organization. */
public String getUnit() {
    return this.unit;
}

/** Sets the tendering (bid-inviting) organization. */
public void setUnit(String unit) {
    this.unit = unit;
}
public String get