Hadoop/MapReduce: Smart Email Marketing with a Markov Model

Goal: a customer's purchase behavior looks random, but viewed in time order it often follows a pattern. For example, a customer may buy more right after each month's payday, or around fixed dates every year (Singles' Day, a birthday).
A Markov model can capture this temporal regularity. If we can infer the likely time of a customer's next purchase from the previous purchases, we can send a marketing email around that predicted date.
What to promote in the email can come from the output of other recommendation algorithms.



Input: <customerID>,<transactionID>,<purchaseDate>,<amount>
...
ZSY40NYPS6,1381872876,2013-01-01,110
...
ZSY40NYPS6,1381872920,2013-01-11,32
...
ZSY40NYPS6,1381873821,2013-03-04,111
...
ZSY40NYPS6,1381874034,2013-04-09,65
...

Step 1: produce <customerID>,<purchaseDate1>,<amount1>,<purchaseDate2>,<amount2>,<purchaseDate3>,<amount3>...
where purchaseDate1 <= purchaseDate2 <= purchaseDate3 ...
This requires sorting each customer's transactions by purchase date; the key techniques are secondary sort and a composite key, implemented by the classes below.
...
ZSY40NYPS6,2013-01-01,110,2013-01-11,32,2013-03-04,111,2013-04-09,65
...


package markov.step1;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * 
 * CompositeKey: represents a pair of 
 * (String customerID, long timestamp).
 * 
 * 
 * We do a primary grouping pass on the customerID field to get all  
 * of the data of one type together, and then our "secondary sort"  
 * during the shuffle phase uses the timestamp long member (representing  
 * the purchase-date) to sort the pairs of PairOfLongInt so that they
 * arrive at the reducer partitioned and in sorted order.
 *  
 * @author Mahmoud Parsian
 *
 */
public class CompositeKey implements WritableComparable<CompositeKey> {
    // natural key is (customerID)
    // composite key is a pair (customerID, timestamp)
	private String customerID;
	private long timestamp;

	public CompositeKey(String customerID, long timestamp) {
		set(customerID, timestamp);
	}
	
	public CompositeKey() {
	}

	public void set(String customerID, long timestamp) {
		this.customerID = customerID;
		this.timestamp = timestamp;
	}

	public String getCustomerID() {
		return this.customerID;
	}

	public long getTimestamp() {
		return this.timestamp;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.customerID = in.readUTF();
		this.timestamp = in.readLong();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.customerID);
		out.writeLong(this.timestamp);
	}

	@Override
	public int compareTo(CompositeKey other) {
		if (this.customerID.compareTo(other.customerID) != 0) {
			return this.customerID.compareTo(other.customerID);
		} 
		else if (this.timestamp != other.timestamp) {
			return timestamp < other.timestamp ? -1 : 1;
		} 
		else {
			return 0;
		}

	}

	public static class CompositeKeyComparator extends WritableComparator {
		public CompositeKeyComparator() {
			super(CompositeKey.class);
		}

		public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
			return compareBytes(b1, s1, l1, b2, s2, l2);
		}
	}

	static { // register this comparator
		WritableComparator.define(CompositeKey.class,
				new CompositeKeyComparator());
	}

    @Override
    public String toString() {
        return customerID;
    }

	
	
}


package markov.step1;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * CompositeKeyComparator
 * 
 * The purpose of this class is to enable comparison of two CompositeKey(s).
 * 
 *  
 * @author Mahmoud Parsian
 *
 */
public class CompositeKeyComparator extends WritableComparator {

	protected CompositeKeyComparator() {
		super(CompositeKey.class, true);
	}

	@Override
	public int compare(WritableComparable w1, WritableComparable w2) {
		CompositeKey key1 = (CompositeKey) w1;
		CompositeKey key2 = (CompositeKey) w2;

		int comparison = key1.getCustomerID().compareTo(key2.getCustomerID());
		if (comparison == 0) {
			 // customerID's are equal here
		     if (key1.getTimestamp() == key2.getTimestamp()) {
		     	return 0;
		     }
		     else if (key1.getTimestamp() < key2.getTimestamp()) {
		     	return -1;
		     }
		     else {
		     	return 1;
		     }
		}
		else {
			return comparison;
		}
	}
}


package markov.step1;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * 
 * NaturalKeyGroupingComparator
 * 
 * This class is used during Hadoop's shuffle phase to group 
 * composite keys by their natural (first) component.
 * The natural key is the "customerID".
 *  
 * @author Mahmoud Parsian
 *
 */
public class NaturalKeyGroupingComparator extends WritableComparator {

	protected NaturalKeyGroupingComparator() {
		super(CompositeKey.class, true);
	}

	@Override
	public int compare(WritableComparable w1, WritableComparable w2) {
		CompositeKey key1 = (CompositeKey) w1;
		CompositeKey key2 = (CompositeKey) w2;
		return key1.getCustomerID().compareTo(key2.getCustomerID());
	}

}

package markov.step1;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import edu.umd.cloud9.io.pair.PairOfLongInt;

/**
 * NaturalKeyPartitioner
 * 
 * This custom partitioner allows us to control how outputs from the 
 * map stage are distributed to the reducers.  NaturalKeyPartitioner partitions 
 * the data output from the map phase (SecondarySortProjectionMapper)
 * before it is sent through the shuffle phase. Since we want a single
 * reducer to receive all projected data for a single "customerID", we 
 * partition the map-phase output by only the natural key component 
 * ("customerID"). Note that (CompositeKey, PairOfLongInt) is the (key, value)
 * generated by mappers.
 * 
 * 
 * @author Mahmoud Parsian
 *
 */
public class NaturalKeyPartitioner implements
   Partitioner<CompositeKey, PairOfLongInt> {

	@Override
	public int getPartition(CompositeKey key, 
	                        PairOfLongInt value,
			                int numberOfPartitions) {
		return (int) (hash(key.getCustomerID()) % numberOfPartitions);
	}

	@Override
	public void configure(JobConf jobconf) {
	}
	
    /**
     *  adapted from String.hashCode()
     */
    static long hash(String str) {
		// mask the sign bit so the result is never negative
		// (Math.abs(Integer.MIN_VALUE) would still be negative)
		return str.hashCode() & Integer.MAX_VALUE;
    }	

    /**
     *  adapted from String.hashCode()
     */
    static long hash2(String str) {
       long h = 1125899906842597L; // prime
       int length = str.length();
       for (int i = 0; i < length; i++) {
          h = 31*h + str.charAt(i);
       }
       return h;
    }	
}


package markov.step1;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.commons.lang.StringUtils;

import yidongpingjun.DateUtil;
import edu.umd.cloud9.io.pair.PairOfLongInt;



/**
  * MapReduce job for projecting customer transaction data
  * by using MapReduce's "secondary sort" (sort by shuffle function).
  * Note that reducer values arrive sorted by implementing the "secondary sort"
  * design pattern (no data is sorted in memory).
  *
  * This class implements the map() function for "secondary sort" design pattern.
  * 
  * @author Mahmoud Parsian
  *
  */
public class SecondarySortProjectionMapper extends MapReduceBase 
   implements Mapper<LongWritable, Text, CompositeKey, PairOfLongInt> {
 
   // reuse Hadoop's Writable objects
   private final CompositeKey reducerKey = new CompositeKey();
   private final PairOfLongInt reducerValue = new PairOfLongInt();
 
	@Override
	public void map(LongWritable inkey, Text value,
			OutputCollector<CompositeKey, PairOfLongInt> output,
			Reporter reporter) throws IOException {
			   
       String[] tokens = StringUtils.split(value.toString(), ",");
       if (tokens.length != 4) {
          // not a proper format
          return;
       }
       // tokens[0] = customer-id
       // tokens[1] = transaction-id
       // tokens[2] = purchase-date
       // tokens[3] = amount
       long date;
       try {
       		date = DateUtil.getDateAsMilliSeconds(tokens[2]);
       }
       catch(Exception e) {
       		// date is in error, ignore the record
       		return;
       }
       int amount = Integer.parseInt(tokens[3]);
       reducerValue.set(date, amount);
       reducerKey.set(tokens[0], date); 
       // emit key-value pair
       output.collect(reducerKey, reducerValue);
   }
}
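
The DateUtil helper (package yidongpingjun) called by the mapper above and by the reducers below is not shown in the original post. A minimal sketch, assuming purchase dates in yyyy-MM-dd format, with method names taken from the calls in this code:

package yidongpingjun;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Minimal sketch of the DateUtil helper referenced by the mappers and reducers.
 * Assumes purchase dates are formatted as yyyy-MM-dd.
 */
public class DateUtil {

    private static final String DATE_FORMAT = "yyyy-MM-dd";

    // parse "2013-01-01" into a java.util.Date (null if the input is malformed)
    public static Date getDate(String dateAsString) {
        try {
            return new SimpleDateFormat(DATE_FORMAT).parse(dateAsString);
        }
        catch (ParseException e) {
            return null;
        }
    }

    // parse "2013-01-01" into milliseconds since the epoch
    public static long getDateAsMilliSeconds(String dateAsString) throws ParseException {
        return new SimpleDateFormat(DATE_FORMAT).parse(dateAsString).getTime();
    }

    // format milliseconds since the epoch back to "yyyy-MM-dd"
    public static String getDateAsString(long timestamp) {
        return new SimpleDateFormat(DATE_FORMAT).format(new Date(timestamp));
    }
}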

package markov.step1;

import java.util.Iterator;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.JobConf;

import yidongpingjun.DateUtil;
import edu.umd.cloud9.io.pair.PairOfLongInt;


/**
 * 
 * SecondarySortProjectionReducer 
 * 
 * Data arrive sorted to reducer.
 * 
 * MapReduce job for projecting customer transaction data
 * by using MapReduce's "secondary sort" (sort by shuffle 
 * function).
 * Note that reducer values arrive sorted by implementing 
 * the "secondary sort" design pattern (no data is sorted 
 * in memory).
 *
 * This class implements the reduce() function for "secondary sort" 
 * design pattern.
 * 
 * @author Mahmoud Parsian
 *
 */
public class SecondarySortProjectionReducer extends MapReduceBase 
   implements Reducer<CompositeKey, PairOfLongInt, Text, Text> {
	 
	public void reduce(CompositeKey key, 
	                   Iterator<PairOfLongInt> values,
			           OutputCollector<Text, Text> output, 
			           Reporter reporter)
		throws IOException {

		// note that values are sorted (by using MR's secondary sort)
        // below, builder will generate: 
        //    CustomerID,Date1,Amount1,Date2,Amount2,...,DateN,AmountN
        // where Date1 <= Date2 <= ... <= DateN
        StringBuilder builder = new StringBuilder();
        builder.append(key.toString());			
        while (values.hasNext()) {
			 builder.append(",");
             PairOfLongInt pair = values.next();
			 long timestamp = pair.getLeftElement(); // date as milliseconds
			 String date = DateUtil.getDateAsString(timestamp);
			 builder.append(date); // date as String			 
			 builder.append(",");
			 builder.append(pair.getRightElement()); // amount
		} 
		
        output.collect(null, new Text(builder.toString()));        
	} // reduce

}

package markov.step1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;
import edu.umd.cloud9.io.pair.PairOfLongInt;
import org.apache.log4j.Logger;


/**
  * MapReduce job for projecting customer transaction data
  * by using MapReduce's "secondary sort" (sort by shuffle function).
  * Note that reducer values arrive sorted by implementing the "secondary sort"
  * design pattern (no data is sorted in memory).
  * 
  * @author Mahmoud Parsian
  *
  */
public class SecondarySortProjectionDriver {

	private static final Logger theLogger = 
	   Logger.getLogger(SecondarySortProjectionDriver.class); 
 
    public static void main(String[] args) throws Exception {
        args = new String[2];
        args[0] = "input/smart_email_training.txt";
        args[1] = "output/smart_email_training";
        
		long startTime = System.currentTimeMillis();	
        Configuration conf = new Configuration();
		JobConf jobconf = new JobConf(conf, SecondarySortProjectionDriver.class);
		jobconf.setJobName("SecondarySortProjectionDriver");
    
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length != 2) {
          System.err.println("Usage: SecondarySortProjectionDriver  <input> <output>");
          System.exit(1);
       }

       // add jars to distributed cache
       // set mapper/reducer
       jobconf.setMapperClass(SecondarySortProjectionMapper.class);
       jobconf.setReducerClass(SecondarySortProjectionReducer.class);
       
       // define mapper's output key-value
       jobconf.setMapOutputKeyClass(CompositeKey.class);
       jobconf.setMapOutputValueClass(PairOfLongInt.class);
              
       // define reducer's output key-value
       jobconf.setOutputKeyClass(Text.class);
       jobconf.setOutputValueClass(Text.class);
       
       // define I/O
	   FileInputFormat.setInputPaths(jobconf, new Path(otherArgs[0]));
	   FileOutputFormat.setOutputPath(jobconf, new Path(otherArgs[1]));
       
       jobconf.setInputFormat(TextInputFormat.class); 
       jobconf.setOutputFormat(TextOutputFormat.class);
	   jobconf.setCompressMapOutput(true);       
       
       // the following three settings are needed for "secondary sort":
       // the Partitioner decides which reducer receives a given map output,
       // based only on the natural key (customerID), so all records of one
       // customer go to the same reducer. The Output Key Comparator sorts the
       // map output keys (by customerID, then by timestamp). Normally each
       // distinct key forms its own group (one Iterator at the reducer); the
       // Output Value Grouping Comparator overrides this so that all composite
       // keys with the same customerID are grouped into a single reduce call
       // (similar to a GROUP BY in SQL).
       jobconf.setPartitionerClass(NaturalKeyPartitioner.class);
       jobconf.setOutputKeyComparatorClass(CompositeKeyComparator.class);
       jobconf.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class);
       
       JobClient.runJob(jobconf).waitForCompletion();
       
	   long elapsedTime = System.currentTimeMillis() - startTime;
       theLogger.info("elapsedTime (in milliseconds): "+ elapsedTime);      
       System.exit(0);
       
    }

}









Step 2: convert each transaction sequence into a state sequence.
For each customer's transaction sequence, take two transactions at a time, e.g. 2013-01-01,110,2013-01-11,32 and 2013-03-04,111,2013-04-09,65.
Each such pair is labeled with a two-letter state derived from the time gap and the amount difference between the two transactions (first letter S/M/L for the elapsed time, second letter L/E/G for the relative amount), e.g.:

ZSY40NYPS6,ME,SL
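
For reference, the two-letter encoding implemented by StateSequenceMapper below can be written as a small standalone method. The 30/60-day and +/-10% thresholds mirror the mapper's code; the class name, method name and the example in main() are only illustrative:

package markov.step2;

/**
 * Standalone sketch of the two-letter state encoding used by
 * StateSequenceMapper (same thresholds; names are illustrative only).
 */
public class StateEncoder {

    public static String encodeState(long daysDiff, int priorAmount, int amount) {
        // first letter: elapsed time between the two purchases
        char dd;
        if (daysDiff < 30) {
            dd = 'S';              // small gap: under 30 days
        } else if (daysDiff < 60) {
            dd = 'M';              // medium gap: 30 to 59 days
        } else {
            dd = 'L';              // large gap: 60 days or more
        }
        // second letter: previous amount relative to the current amount
        char ad;
        if (priorAmount < 0.9 * amount) {
            ad = 'L';              // previous amount well below the current one
        } else if (priorAmount < 1.1 * amount) {
            ad = 'E';              // amounts roughly equal (within about 10%)
        } else {
            ad = 'G';              // previous amount well above the current one
        }
        return "" + dd + ad;
    }

    public static void main(String[] args) {
        // 45 days apart, previous amount 100, current amount 102 -> "ME"
        System.out.println(encodeState(45, 100, 102));
    }
}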


package markov.step2;

import java.io.IOException;
import java.util.Date;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import yidongpingjun.DateUtil;

public class StateSequenceMapper extends MapReduceBase 
implements Mapper<LongWritable, Text, Text, Text>{

    @Override
    public void map(LongWritable arg0, Text value,
            OutputCollector<Text, Text> output, Reporter arg3) throws IOException {
        // input line format: 000UDM50M4,2013-04-27,183,2013-06-19,62,2013-06-20,29
        String[] tokens = value.toString().split(",");
        if(tokens.length < 5)
        {
            return;
        }
        String customerID = tokens[0];
        int i = 4;
        while(i < tokens.length)
        {
            String sequence = "";
            int amount = Integer.valueOf(tokens[i]);
            int priorAmount = Integer.valueOf(tokens[i-2]);
            Date date = DateUtil.getDate(tokens[i-1]);
            Date priorDate = DateUtil.getDate(tokens[i-3]);
            long daysDiff = (date.getTime() - priorDate.getTime())/1000/60/60/24;
            int amountDif = amount - priorAmount;
            char dd; // first letter: elapsed-time bucket (S/M/L)
            char ad; // second letter: amount bucket (L/E/G)
            if(daysDiff < 30)
            {
                dd = 'S';
            }
            else if(daysDiff < 60)
            {
                dd = 'M';
            }
            else
            {
                dd = 'L';
            }
            if(priorAmount < 0.9 * amount)
            {
                ad = 'L';
            }
            else if(priorAmount < 1.1 * amount)
            {
                ad = 'E';
            }
            else
            {
                ad = 'G';
            }
            sequence = "" + dd + ad;
            Text outputKey = new Text();
            outputKey.set(customerID);
            Text outputValue = new Text();
            outputValue.set(sequence);
            output.collect(outputKey,outputValue);
            i+=2;
        }
    }
}


package markov.step2;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class StateSequenceReducer extends MapReduceBase 
   implements Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, 
                       Iterator<Text> values,
                       OutputCollector<Text, Text> output, 
                       Reporter reporter)
        throws IOException {
        StringBuilder builder = new StringBuilder();
        builder.append(key.toString());         
        while (values.hasNext()) {
             builder.append(",");
             builder.append(values.next().toString());
        } 
        output.collect(null, new Text(builder.toString()));        
    }

}

package markov.step2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;
import org.apache.log4j.Logger;


public class StateSequenceDriver {

	private static final Logger theLogger = 
	   Logger.getLogger(StateSequenceDriver.class); 
 
    public static void main(String[] args) throws Exception {
        args = new String[2];
        args[0] = "output/smart_email_training";
        args[1] = "output/smart_email_training2";
        
		long startTime = System.currentTimeMillis();	
        Configuration conf = new Configuration();
		JobConf jobconf = new JobConf(conf, StateSequenceDriver.class);
		jobconf.setJobName("StateSequenceDriver");
    
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length != 2) {
          System.err.println("Usage: SecondarySortProjectionDriver  <input> <output>");
          System.exit(1);
       }

       // add jars to distributed cache
       // set mapper/reducer
       jobconf.setMapperClass(StateSequenceMapper.class);
       jobconf.setReducerClass(StateSequenceReducer.class);
       
       // define mapper's output key-value
       jobconf.setMapOutputKeyClass(Text.class);
       jobconf.setMapOutputValueClass(Text.class);
              
       // define reducer's output key-value
       jobconf.setOutputKeyClass(Text.class);
       jobconf.setOutputValueClass(Text.class);
       
       // define I/O
	   FileInputFormat.setInputPaths(jobconf, new Path(otherArgs[0]));
	   FileOutputFormat.setOutputPath(jobconf, new Path(otherArgs[1]));
       
       jobconf.setInputFormat(TextInputFormat.class); 
       jobconf.setOutputFormat(TextOutputFormat.class);
	   jobconf.setCompressMapOutput(true);       
       JobClient.runJob(jobconf).waitForCompletion();
	   long elapsedTime = System.currentTimeMillis() - startTime;
       theLogger.info("elapsedTime (in milliseconds): "+ elapsedTime);      
       System.exit(0);
    }
}








Step 3: build the Markov state-transition count matrix. For every pair of adjacent states in each sequence from step 2, emit ([fromState,toState],1), e.g. ([ME,SL],1), and accumulate the counts.
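
The MapReduce job below does this counting at scale; as a minimal in-memory sketch of the same idea (the HashMap stands in for the (fromState,toState) -> count aggregation performed by the combiner and reducer, and the class name and sample line are illustrative only):

package markov.step3;

import java.util.HashMap;
import java.util.Map;

// In-memory sketch of the transition counting that the MapReduce job below
// performs across all customers.
public class TransitionCountSketch {

    public static void main(String[] args) {
        // one line of step-2 output: customerID followed by its state sequence
        String line = "ZSY40NYPS6,ME,SL,ME";
        String[] items = line.split(",");

        Map<String, Integer> counts = new HashMap<String, Integer>();
        // items[0] is the customerID; states start at index 1
        for (int i = 1; i < items.length - 1; i++) {
            String transition = items[i] + "," + items[i + 1];  // fromState,toState
            Integer old = counts.get(transition);
            counts.put(transition, old == null ? 1 : old + 1);
        }
        // e.g. {ME,SL=1, SL,ME=1} (iteration order of a HashMap is not guaranteed)
        System.out.println(counts);
    }
}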

package markov.step3;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.BufferedReader;
//
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.util.LineReader;

/**
 * This class provides convenient methods for accessing 
 * some Input/Output methods.
 *
 * @author Mahmoud Parsian (mahmoud.parsian@yahoo.com)
 *
 */
public class InputOutputUtil {

    public static void close(LineReader reader) {
        if (reader == null) {
            return;
        }
        //
        try {
            reader.close();
        } 
        catch (Exception ignore) {
        }
    }

    public static void close(OutputStream stream) {
        if (stream == null) {
            return;
        }
        //
        try {
            stream.close();
        } 
        catch (Exception ignore) {
        }
    }

    public static void close(InputStream stream) {
        if (stream == null) {
            return;
        }
        //
        try {
            stream.close();
        } 
        catch (Exception ignore) {
        }
    }

    public static void close(FSDataInputStream stream) {
        if (stream == null) {
            return;
        }
        //
        try {
            stream.close();
        } 
        catch (Exception ignore) {
        }
    }

    public static void close(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        //
        try {
            reader.close();
        } 
        catch (Exception ignore) {
        }
    }

}

package markov.step3;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import edu.umd.cloud9.io.pair.PairOfStrings;
import org.apache.commons.lang.StringUtils;

/**
 * The MarkovStateTransitionModelMapper class implements MapReduce's
 * map() method.
 *
 * 
 * @author Mahmoud Parsian
 *
 */
public class MarkovStateTransitionModelMapper 
	extends Mapper<LongWritable, Text, PairOfStrings, IntWritable> {

	private PairOfStrings reducerKey = new PairOfStrings();
	private static final IntWritable ONE  = new IntWritable(1);

	protected void map(LongWritable key, Text value, Context context)
		throws IOException, InterruptedException {
		// value = <customerID><,><State1><,><State2><,>...<,><StateN>
		String[] items = StringUtils.split(value.toString(), ",");	
		if (items.length > 2) {
			for (int i = 1; i < (items.length -1); i++) {
				reducerKey.set(items[i], items[i+1]);
				context.write(reducerKey, ONE);
			}
		}
	}              
}	
	

package markov.step3;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import edu.umd.cloud9.io.pair.PairOfStrings;

/**
 * The MarkovStateTransitionModelCombiner class implements MapReduce's
 * combine() method (in Hadoop, we call it reduce() method).
 *
 * This class implements the combine() function for Markov's 
 * state transition model.
 * 
 * @author Mahmoud Parsian
 *
 */
public class MarkovStateTransitionModelCombiner 
	extends Reducer<PairOfStrings, IntWritable, PairOfStrings, IntWritable> {
		
	protected void reduce(PairOfStrings  key, Iterable<IntWritable> values, Context context)
		throws IOException, InterruptedException {
		int partialSum = 0;
		for (IntWritable value : values) {
			partialSum += value.get();
		}
		context.write(key, new IntWritable(partialSum));       	
	}		
}	
	

package markov.step3;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import edu.umd.cloud9.io.pair.PairOfStrings;

/**
 * The MarkovStateTransitionModelReducer class implements MapReduce's
 * reduce() method.
 *
 * 
 * @author Mahmoud Parsian
 *
 */
public class MarkovStateTransitionModelReducer 
	extends Reducer<PairOfStrings, IntWritable, Text, IntWritable> {
	   	
	protected void reduce(PairOfStrings key, Iterable<IntWritable> values, Context context)
	throws IOException, InterruptedException {
		int finalCount = 0;
		for (IntWritable value : values) {
			finalCount += value.get();
		}
		
		String fromState = key.getLeftElement();
		String toState = key.getRightElement();
		String outputkey = fromState + "," + toState;
		context.write(new Text(outputkey), new IntWritable(finalCount));
	}	   	
}
	

package markov.step3;


import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import edu.umd.cloud9.io.pair.PairOfStrings;


/**
 * Markov state transition probability matrix Driver
 *
 * 
 * @author Mahmoud Parsian
 *
 */
public class MarkovStateTransitionModelDriver extends Configured implements Tool {
	@Override
	public int run(String[] args) throws Exception {
	    args = new String[2];
	    args[0] = "output/smart_email_training2";
	    args[1] = "output/smart_email_training3";
	    
        @SuppressWarnings("deprecation")
        Job job = new Job(getConf());
        job.setJobName("MarkovStateTransitionModelDriver");
        
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MarkovStateTransitionModelMapper.class);
        job.setReducerClass(MarkovStateTransitionModelReducer.class);
        job.setCombinerClass(MarkovStateTransitionModelCombiner.class);
        
        // PairOfStrings = (fromState, toState)
        job.setMapOutputKeyClass(PairOfStrings.class); 
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        int status =  job.waitForCompletion(true) ? 0 : 1;
        return status;
	}
	
	public static void main(String[] args) throws Exception {
        int statusCode = ToolRunner.run(new MarkovStateTransitionModelDriver(), args);
        System.exit(statusCode);
	}
}


 
Step 4: turn the state-transition count matrix into a transition-probability table.
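
StateTransitionTableBuilder (shown after the HDFS reading utilities below) row-normalizes the 9x9 count table, so that each entry becomes

    P(toState | fromState) = count(fromState, toState) / sum over toState' of count(fromState, toState')

with a simple Laplace correction applied first: if a row contains any zero count, every count in that row is incremented by 1 before normalizing, so no transition ends up with probability exactly 0.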
package markov.step3;

/**
 * TableItem represents an item of a Markov State Transition Model 
 * as a Tuple3<fromState, toState, count>
 *
 */
public class TableItem  {
	String fromState;
	String toState;
	int count;
	
	public TableItem(String fromState, String toState, int count) {
		this.fromState = fromState;
		this.toState = toState;
		this.count = count;
	}
	
	/**
	 * for debugging ONLY
	 */
	public String toString() {
		return "{"+fromState+"," +toState+","+count+"}";
	}
}


package markov.step3;

import java.util.List;
import java.util.ArrayList;

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;


/**
 * Class containing a number of utility methods for manipulating 
 * Hadoop's SequenceFiles.
 *
 *
 * @author Mahmoud Parsian
 *
 */
public class ReadDataFromHDFS {

	private static final Logger THE_LOGGER = 
		Logger.getLogger(ReadDataFromHDFS.class);

	private ReadDataFromHDFS() {
	}
	
	public static List<TableItem> readDirectory(String path) {
		return ReadDataFromHDFS.readDirectory(new Path(path));
	}
	
	public static List<TableItem> readDirectory(Path path) {
		FileSystem fs;
		try {
			fs = FileSystem.get(new Configuration());
		} 
		catch (IOException e) {
			THE_LOGGER.error("Unable to access the hadoop file system!", e);
			throw new RuntimeException("Unable to access the hadoop file system!");
		}
		
		List<TableItem> list = new ArrayList<TableItem>();
		try {
			FileStatus[] stat = fs.listStatus(path);
			for (int i = 0; i < stat.length; ++i) {
				if (stat[i].getPath().getName().startsWith("part")) {
					List<TableItem> pairs = readFile(stat[i].getPath(), fs);
					list.addAll(pairs);
				}
			}
		} 
		catch (IOException e) {
			THE_LOGGER.error("Unable to access the hadoop file system!", e);
			throw new RuntimeException("Error reading the hadoop file system!");
		}

		return list;		
	}	

	@SuppressWarnings("unchecked")
	public static List<TableItem> readFile(Path path, FileSystem fs) {
		THE_LOGGER.info("path="+path);
		List<TableItem> list = new ArrayList<TableItem>();
		FSDataInputStream stream = null;
		BufferedReader reader = null;
		try {
			stream = fs.open(path);
			reader = new BufferedReader(new InputStreamReader(stream));
			String line;
			while ((line = reader.readLine()) != null) {
				// line = <fromState><,><toState><TAB><count>
				THE_LOGGER.info("line="+line);
				String[] tokens = line.split("\t"); // TAB separator
				if (tokens.length == 2) {
					String states = tokens[0];
					int count = Integer.parseInt(tokens[1]);
					String[] twoStates =  states.split(",");
					TableItem item = new TableItem(twoStates[0], twoStates[1], count);
					list.add(item);
				}
			}		
		}
		catch (IOException e) {
			THE_LOGGER.error("readFileIntoCoxRegressionItem() failed!", e);
			throw new RuntimeException("readFileIntoCoxRegressionItem() failed!");
		}
		finally {
			InputOutputUtil.close(reader);
			InputOutputUtil.close(stream);
		}
			
		return list;
	}
	

	
	
	public static void main(String[] args) throws Exception {
		String path = args[0];
		List<TableItem> list = readDirectory(path);
		THE_LOGGER.info("list="+list.toString());
	}
		
}

package markov.step3;

import java.util.Map;
import java.util.List;
import java.util.HashMap;

/**
 * Markov state transition probability matrix builder
 *
 */
public class StateTransitionTableBuilder {

	//
	// model.states=SL,SE,SG,ML,ME,MG,LL,LE,LG
	//
	// states<key, value>: key is the state and value is row/column in table
	//
	private Map<String, Integer> states = null;
	private double[][] table = null;
    private int numberOfStates;
	private int scale = 100;

	private void initStates(){
	    states = new HashMap<String, Integer>();
		states.put("SL", 0);	
		states.put("SE", 1);	
		states.put("SG", 2);	
		states.put("ML", 3);	
		states.put("ME", 4);	
		states.put("MG", 5);	
		states.put("LL", 6);	
		states.put("LE", 7);	
		states.put("LG", 8);	
	}
			
	public StateTransitionTableBuilder(int numberOfStates) {
		this.numberOfStates = numberOfStates;
		table = new double[numberOfStates][numberOfStates];
		initStates();
	}
	
	public StateTransitionTableBuilder(int numberOfStates, int scale) {
		this(numberOfStates);
		this.scale = scale;
	}

    public void add(String fromState, String toState, int count) {
    	int row = states.get(fromState);
    	int column = states.get(toState);
        table[row][column] = count;
    }
    
	public void normalizeRows() {
		// Laplace correction: the usual solution is to do a 
		// Laplacian correction by upping all the counts by 1
		// see: http://cs.nyu.edu/faculty/davise/ai/bayesText.html		
		for (int r = 0; r < numberOfStates; r++) {
			boolean gotZeroCount = false;
			for (int c = 0; c < numberOfStates; c++) {
				if(table[r][c] == 0) {
					gotZeroCount = true;
					break;
				}
			}
			
			if (gotZeroCount) {
				for (int c = 0; c < numberOfStates; c++) {
					 table[r][c] += 1;
				}			
			}
		}		
		
		//normalize
		for (int r = 0; r < numberOfStates; r++) {
			double rowSum = getRowSum(r);
			for (int c = 0; c < numberOfStates; c++) {
				table[r][c] = table[r][c] / rowSum;
			}
		}
	}
	
    public double getRowSum(int rowNumber) {
        double sum = 0.0;
        for (int column = 0; column < numberOfStates; column++) {
            sum += table[rowNumber][column];
        }
        return sum;
    }

    public String serializeRow(int rowNumber) {
        StringBuilder builder = new StringBuilder();
        for (int column = 0; column < numberOfStates; column++) {
        	double element = table[rowNumber][column];
        	builder.append(String.format("%.4g", element));
            if (column < (numberOfStates-1)) {
            	builder.append(",");
            }
        }
        return builder.toString();
    }

    public void persistTable() {
		for (int row = 0; row < numberOfStates; row++) {
        	String serializedRow = serializeRow(row);
        	System.out.println(serializedRow);
        }
    }
   
	public static void generateStateTransitionTable(String hdfsDirectory) {
		List<TableItem> list = ReadDataFromHDFS.readDirectory(hdfsDirectory);
	    StateTransitionTableBuilder tableBuilder = new StateTransitionTableBuilder(9);
	    for (TableItem item : list) {
	    	tableBuilder.add(item.fromState, item.toState, item.count);
	    }
	    
	    tableBuilder.normalizeRows();
	    tableBuilder.persistTable();
	}
	
	public static void main(String[] args) {
		String hdfsDirectory = "output/smart_email_training3";
		generateStateTransitionTable(hdfsDirectory);
	}	
}

Result: the 9x9 row-normalized transition-probability table. Rows and columns follow the state order SL,SE,SG,ML,ME,MG,LL,LE,LG defined in initStates(), and each row sums to 1:
0.05033,0.008262,0.7487,0.1432,0.0003689,0.01423,0.03306,8.889e-05,0.001791
0.4791,0.01265,0.4071,0.07468,0.0002040,0.009386,0.01612,0.0002040,0.0006121
0.6671,0.008839,0.1261,0.1463,0.0009289,0.01387,0.03505,0.0002426,0.001555
0.04773,0.0004718,0.7681,0.01487,0.0001862,0.1385,0.02863,1.242e-05,0.001490
0.6215,0.002151,0.2925,0.01075,0.006452,0.05161,0.008602,0.002151,0.004301
0.1072,0.002772,0.7044,0.1364,0.0003616,0.01374,0.03247,8.036e-05,0.002612
0.06196,0.0004748,0.7678,0.02008,0.0001424,0.1262,0.003988,4.748e-05,0.01937
0.5036,0.007299,0.3431,0.04380,0.007299,0.05839,0.007299,0.007299,0.02190
0.1834,0.001920,0.6313,0.02544,0.0004801,0.1167,0.03889,0.0009602,0.0009602
Step 5: use the Markov model to predict each customer's next purchase date and schedule the marketing email accordingly.
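
The original post stops here without code for step 5. A minimal sketch of one way to do it, assuming the 9x9 table above has been loaded into a double[9][9] in the same SL..LG row/column order, and assuming we know each customer's last purchase date and last observed state: pick the most probable next state from that customer's row, then map its first letter back to a day window (S: under 30 days, M: 30-60 days, L: 60+ days) to decide when to send the email. The class name and the 15/45/75-day offsets below are illustrative only, not from the original post:

package markov.step5;

import java.util.Calendar;
import java.util.Date;

/**
 * Illustrative sketch for step 5 (not from the original post):
 * given a customer's last observed state, pick the most likely next
 * state from the transition-probability table and turn its time-gap
 * letter (S/M/L) into a proposed email date.
 */
public class NextPurchasePredictor {

    // same order as StateTransitionTableBuilder.initStates()
    private static final String[] STATES =
        {"SL", "SE", "SG", "ML", "ME", "MG", "LL", "LE", "LG"};

    private final double[][] table;   // 9x9 row-normalized probabilities

    public NextPurchasePredictor(double[][] table) {
        this.table = table;
    }

    private static int indexOf(String state) {
        for (int i = 0; i < STATES.length; i++) {
            if (STATES[i].equals(state)) {
                return i;
            }
        }
        throw new IllegalArgumentException("unknown state: " + state);
    }

    /** Most probable next state given the last observed state. */
    public String predictNextState(String lastState) {
        double[] row = table[indexOf(lastState)];
        int best = 0;
        for (int c = 1; c < row.length; c++) {
            if (row[c] > row[best]) {
                best = c;
            }
        }
        return STATES[best];
    }

    /** Proposed email date: last purchase date plus a day offset per the S/M/L letter. */
    public Date proposeEmailDate(Date lastPurchaseDate, String lastState) {
        char timeLetter = predictNextState(lastState).charAt(0);
        // illustrative offsets: middle of each S/M/L window
        int daysAhead = (timeLetter == 'S') ? 15 : (timeLetter == 'M') ? 45 : 75;
        Calendar cal = Calendar.getInstance();
        cal.setTime(lastPurchaseDate);
        cal.add(Calendar.DAY_OF_MONTH, daysAhead);
        return cal.getTime();
    }
}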
 
 
 
 

                