思路:要按每个时段内的停留时长排序,可以把问题理解为基于 key 的二次排序。这里的 key 是一个对象,包含两个私有属性:imsi 和 duration。所谓二次排序,就是先按 imsi 排序,imsi 相同时再按 duration 排序。闲话少说,上代码。 key 类:ImsiAndDuration
- public class ImsiAndDuration implements WritableComparable<ImsiAndDuration> {
-
- private String imsi;
- private long duration;
-
- public void setImsi(String imsi) {
- this.imsi = imsi;
- }
- public void setDuration(long duration) {
- this.duration = duration;
- }
-
- public String getImsi() {
- return imsi;
- }
- public long getDuration() {
- return duration;
- }
-
-
- public ImsiAndDuration() {
- super();
- this.imsi = "";
- this.duration = 0;
- }
-
- public void setImsiAndDuration(String imsi, long duration) {
- this.imsi = imsi;
- this.duration = duration;
- }
-
-
- public int compareTo(ImsiAndDuration o) {
- // if(this.imsi != o.imsi)
- return this.imsi.compareTo(o.imsi);
- // else if(this.duration != o.duration)
- // return this.duration >= o.duration ? 1: -1;
- // else return 0;
- }
-
- public void readFields(DataInput in) throws IOException {
- this.imsi = in.readUTF();
- this.duration = in.readLong();
- }
- public void write(DataOutput out) throws IOException {
-
- out.writeUTF(imsi);
- out.writeLong(duration);
- }
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + Float.floatToIntBits(duration);
- result = prime * result + ((imsi == null) ? 0 : imsi.hashCode());
- return result;
- }
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- ImsiAndDuration other = (ImsiAndDuration) obj;
- if (Float.floatToIntBits(duration) != Float
- .floatToIntBits(other.duration))
- return false;
- if (imsi == null) {
- if (other.imsi != null)
- return false;
- } else if (!imsi.equals(other.imsi))
- return false;
- return true;
- }
- @Override
- public String toString() {
- return "ImsiAndDuration [imsi=" + imsi + ", duration=" + duration + "]";
- }
-
复制代码
LineException :
/**
 * Checked exception raised while validating one input line.
 *
 * <p>Besides the usual message it carries an integer flag chosen by the
 * thrower, which callers can inspect through {@link #getFlag()} to tell
 * different rejection reasons apart.
 */
public class LineException extends Exception {

    private static final long serialVersionUID = 1L;

    /** Reason code supplied by whoever threw the exception. */
    int flag;

    /**
     * @param msg  detail message forwarded to {@link Exception}
     * @param flag reason code to expose through {@link #getFlag()}
     */
    public LineException(String msg, int flag) {
        super(msg);
        this.flag = flag;
    }

    /** @return the reason code given at construction time */
    public int getFlag() {
        return this.flag;
    }
}
复制代码
TableLineWithImsiAndDuration:
- public class TableLineWithImsiAndDuration{
- /**
- * 位置数据格式:
- * IMSI IMEI UPDATETYPE LOC TIME
- * UPDATETYPE为 该用户的状态,开机关机或者上传
- *
- * 上网数据格式:
- * IMSI IMEI LOC TIME URL
- *
- * 公共字段为IMSI IMEI LOC TIME
- * LOC 即为position
- * 因为time为unix时间,timeflag为time转换为24小时中的时间段
- *
- */
- private String imsi,position,time;
- protected String timeFlag;
- private Date day;
- private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-
-
- /**
- * 初始化并检查该行的合法性
- *
- * @param line
- * @param source
- * @param date
- * @param timepoint
- * @throws LineException
- */
-
- public void set(String line ,boolean source ,String date, String [] timepoint) throws LineException{
- String [] lineSplit = line.split("\t");
-
- if(source){
- //POS
- this.imsi = lineSplit[0];
- this.position = lineSplit[3];
- this.time = lineSplit[4];
- }else{
- //NET
- this.imsi = lineSplit[0];
- this.position = lineSplit[2];
- this.time = lineSplit[3];
- }
-
- //检查日期的合法性
- if(!this.time.startsWith(date)){
- //年月日必须与date一直
- throw new LineException("", -1);
- }
-
- try {
- this.day = this.formatter.parse(this.time);
- } catch (ParseException e) {
- throw new LineException("", 0);
- }
-
-
- //计算所属的时间段
- int i = 0, n = timepoint.length;
- //yyyy-MM-dd HH:mm:ss截取HH
- int hour = Integer.valueOf(this.time.split(" ")[1].split(":")[0]);
- //用户的几个时间段(不一定是3个)小于当前hour的个数
- while(i < n && Integer.valueOf(timepoint[i]) <= hour)
- i++;
-
- if(i < n){
- //00-时间段
- if(i == 0){
- this.timeFlag = ("00-"+timepoint[i]);
- }else{
- //timepoint[i-1]时间段
- this.timeFlag = (timepoint[i-1] + "-" + timepoint[i]);
- }
-
- }else
- throw new LineException("", -1);
- }
-
-
-
- @Override
- public String toString() {
- return "TableLineWithComparable [imsi=" + imsi + ", position="
- + position + ", time=" + time + ", timeFlag=" + timeFlag
- + ", day=" + day + ", formatter=" + formatter.toString() + "]";
- }
-
- public ImsiAndDuration outKey(){
- long t = (day.getTime() / 1000L );
- ImsiAndDuration imsiAndDuration = new ImsiAndDuration();
- imsiAndDuration.setImsiAndDuration(imsi, t);
- return imsiAndDuration;
- }
-
- public Text outValue(){
- return new Text(this.position + "\t" + this.timeFlag);
-
- }
复制代码
BaseStationDataPreprocessFirstThree:
- public class BaseStationDataPreprocessFirstThree {
-
-
- enum Counter{
-
- TIMESKPI, //时间个数有误
- OUTOFTIMESKIP, //时间不在参数指定的时间段内
- LINESKIP, //源文件行有误
- USERSKIP //某个用户有个时间段被整个放弃
- }
-
-
- private static int time = 0;
-
-
- public static class MyMapper extends Mapper<LongWritable, Text, ImsiAndDuration, Text>{
-
- String date;
- String [] timepoint;
- boolean dataSource;
-
-
- @Override
- protected void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
-
- String line = value.toString();
- TableLineWithImsiAndDuration tableLine = new TableLineWithImsiAndDuration();
-
- try {
- tableLine.set(line, this.dataSource, this.date, timepoint);
- } catch (LineException e) {
- if(e.getFlag() == 1){
- context.getCounter(Counter.OUTOFTIMESKIP).increment(1);
- }else{
- context.getCounter(Counter.TIMESKPI).increment(1);
- }
- return;
- }catch (Exception e) {
- context.getCounter(Counter.LINESKIP).increment(1);
- return;
- }
-
- context.write(tableLine.outKey(), tableLine.outValue());
- // System.out.println("outKey : " + tableLine.outKey());
- // System.out.println("outValue : " + tableLine.outValue());
- }
-
-
- @Override
- protected void setup(Context context) throws IOException,
- InterruptedException {
- this.date = context.getConfiguration().get("date");
- this.timepoint = context.getConfiguration().get("timepoint").split("-");
-
- //提取文件名
- FileSplit fileSplit = (FileSplit) context.getInputSplit();
- String fileName = fileSplit.getPath().getName();
- if (fileName.startsWith("POS")) {
- dataSource = true;
- }else if (fileName.startsWith("NET")){
- dataSource = false;
- }else {
- throw new IOException("File Name should starts with POS or NET");
- }
- }
- }
-
-
- public static class MyReducer extends Reducer<ImsiAndDuration, Text, Text, Text>{
-
- private String date;
- private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-
- @Override
- protected void reduce(ImsiAndDuration key, Iterable<Text> values,Context context)
- throws IOException, InterruptedException {
-
- if( 0 == time ){
- context.write(new Text("IMSI\t"), new Text("POSITION|TIMEFLAG|DURATION(min)"));
- time++;
- }
-
- // System.out.println("ImsiAndDuration:" + key.toString());
-
- String imsi = key.getImsi();
- long duration = key.getDuration();
-
- String valueString = null,position = null,timeFlag = null,reduce_position = null,reduce_timeFlag=null;
- TreeMap<Long, String> uploads = new TreeMap<Long, String>();
-
- Iterator<Text> it = values.iterator();
- while(it.hasNext()){
- valueString = it.next().toString();
- position = valueString.split("\t")[0];
- timeFlag = valueString.split("\t")[1];
-
- try {
- uploads.put(duration,position);
- } catch (NumberFormatException e) {
- context.getCounter(Counter.TIMESKPI).increment(1);
- continue;
- }
- }
-
- //add the off position
- try {
-
- Date tmp = this.formatter.parse(this.date + " " +timeFlag.split("-")[1] + ":00:00" );
- uploads.put(tmp.getTime() / 1000L, "OFF");
-
- HashMap<String, Float> locs = getStayTime(uploads);
-
-
- for (Entry<String, Float> entrySet : locs.entrySet()) {
-
- reduce_position = entrySet.getKey();
- reduce_timeFlag = String.valueOf(entrySet.getValue());
- context.write(new Text(imsi), new Text(reduce_position + "|" + timeFlag +"|"+ reduce_timeFlag));
-
- }
-
- } catch (Exception e) {
- context.getCounter(Counter.USERSKIP).increment(1);
- return;
- }
- }
-
- //location time
- private HashMap<String, Float> getStayTime(TreeMap<Long, String> uploads) {
- Entry<Long,String> upload ,nextUpload;
- HashMap<String, Float> locs = new HashMap<String, Float>();
-
-
-
- Iterator<Entry<Long, String>> it = uploads.entrySet().iterator();
- upload = it.next();
- while(it.hasNext()){
- nextUpload = it.next();
- float diff = (float)(nextUpload.getKey() - upload.getKey()) / 60.0f;
-
- if(diff <= 60.0){
- if(locs.containsKey(upload.getValue())){
- locs.put(upload.getValue(), locs.get(upload.getValue()) + diff);
- }else{
- locs.put(upload.getValue(), diff);
- }
- }
- upload = nextUpload;
- }
-
- return locs;
- }
-
- @Override
- protected void setup(Context context)
- throws IOException, InterruptedException {
-
- this.date = context.getConfiguration().get("date");
-
- }
- }
-
-
-
- public static class firstThreePartitioner extends Partitioner<ImsiAndDuration, Text>{
-
- @Override
- public int getPartition(ImsiAndDuration key, Text value, int numReducers) {
- //imsi
- return (key.getImsi().hashCode() & Integer.MAX_VALUE) % numReducers;
- }
- }
-
- //每个分区内又调用job.setSortComparatorClass或者key的比较函数进行排序,以duration进行排序
- public static class firstThreeKeySortComparator extends WritableComparator{
-
- protected firstThreeKeySortComparator() {
- super(ImsiAndDuration.class,true);
- }
-
- @SuppressWarnings("rawtypes")
- @Override
- public int compare(WritableComparable a, WritableComparable b) {
-
- ImsiAndDuration imsi1 = (ImsiAndDuration)a;
- ImsiAndDuration imsi2 = (ImsiAndDuration)b;
-
- int cmp = imsi1.compareTo(imsi2);
- if(cmp != 0)
- return cmp;
- return imsi1.getDuration() != imsi2.getDuration() ? imsi1.getDuration() > imsi2.getDuration() ? 1: -1 : 0;
-
- }
-
- @Override
- public int compare(byte[] b1, int s1, int l1, byte[] b2,
- int s2, int l2) {
- return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
- }
- }
-
- @SuppressWarnings("rawtypes")
- //相同key的value在一起,以imsi 分组
- public static class firstThreeGroupingComparator extends WritableComparator{
-
- public firstThreeGroupingComparator() {
- super(ImsiAndDuration.class,true);
- }
-
-
- @Override
- public int compare(WritableComparable a, WritableComparable b) {
- ImsiAndDuration imsi1 = (ImsiAndDuration)a;
- ImsiAndDuration imsi2 = (ImsiAndDuration)b;
- return imsi1.compareTo(imsi2);
- }
-
- @Override
- public int compare(byte[] b1, int s1, int l1, byte[] b2,
- int s2, int l2) {
- return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
- }
- }
-
-
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
-
- Configuration conf = new Configuration();
-
- conf.set("date", "2013-09-12");
- conf.set("timepoint", "09-17-24");
-
- conf.set("mapred.job.tracker", "192.168.126.3:9001");
- conf.addResource("classpath:/hadoop/core-site.xml");
- conf.addResource("classpath:/hadoop/hdfs-site.xml");
- conf.addResource("classpath:/hadoop/mapred-site.xml");
- conf.addResource("classpath:/hadoop/master");
- conf.addResource("classpath:/hadoop/slave");
-
- Path inPath = new Path("/input/1217");
- Path outPath = new Path("/output/1217/BaseStation3");
-
- Job job = new Job(conf, "BaseStation3");
-
- FileInputFormat.addInputPath(job, inPath);
- FileOutputFormat.setOutputPath(job, outPath);
-
- job.setJarByClass(BaseStationDataPreprocess.class);
- job.setMapperClass(MyMapper.class);
- job.setReducerClass(MyReducer.class);
- job.setPartitionerClass(firstThreePartitioner.class);
- job.setGroupingComparatorClass(firstThreeGroupingComparator.class);
- job.setSortComparatorClass(firstThreeKeySortComparator.class);
-
-
- job.setMapOutputKeyClass(ImsiAndDuration.class);
- job.setMapOutputValueClass(Text.class);
-
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
-
- System.exit(job.waitForCompletion(true)? 0 : 1);
-
-
- }
-
-
-
- }
-
复制代码
因为之前跑 100000 个 imsi 时把虚拟机跑挂了,所以这次把数据量减小:本次运行 100 个 imsi,两个数据文件合计约 2 MB。 生成结果: 因为这个程序只是为了验证二次排序的效果,所以没有取前 3。