一阶马尔科夫:系统在t+1时刻的状态仅由t时刻状态决定
时序交易 Mapreduce
输入 customerID, transactionID, purchaseDate, amount
输出 customerID, (Date1, amount1)(Date2,amount2)(Date3,amount3)...(DateN,AmountN) 用其表示马尔科夫链,最终求转移矩阵
// Phase 1 map.
// key:   unused (input offset)
// value: one record — customer-id, transaction-id, purchase-date, amount
// Emits (customer-id, (purchase-date, amount)) so the reducer can group
// all transactions of one customer together.
map(key, value){
    Pair(Date, Integer) pair = (value.purchase-date, value.amount);
    emit(value.customer-id, pair);
}
// Phase 1 reduce.
// key:    customer-id
// values: List(Pair(Date, Integer)) — that customer's transactions
// Sorts the transactions chronologically to form the purchase sequence.
reduce(key, values){
    sortedValues = sortByDateInAscendingOrder(values);
    emit(key, sortedValues);
}
以上可以考虑二次排序方法实现序列按date排序,完成生成交易序列后,要将其转换为一个状态序列state sequence
即customerid, state1, state2,state3,...staten
可以通过其他脚本将(date,amount)转化为state。state其实由两个分量(s1,s2)确定:s1表示距上一次交易经过的时间,s2表示与上一次相比的交易额变化
s1:S 短,M 中,L 长
s2:L 显著小,E 基本相同,G 显著大
这个脚本可以由map实现
之后需要生成状态转移矩阵。s1、s2各有3种取值,共3*3=9个状态,因此状态转移矩阵是9*9
// Phase 2 map.
// key:   customer-id
// value: that customer's state sequence (s1, s2, ..., sn)
// Emits ((state[i], state[i+1]), 1) for every adjacent pair, i.e. one
// count per observed transition.
map(key, value){
    for (i = 0; i < n - 1; i++){
        reduceKey = pair(value[i], value[i+1]);
        emit(reduceKey, 1);
    }
}  // NOTE: the original notes were missing this closing brace
// Combiner — locally pre-sums transition counts to cut shuffle traffic.
// key:    Pair(state1, state2)
// values: list(1, 1, ..., 1)
combine(Pair(state1, state2) key, List<Integer> values){
    partialSum = 0;  // must be initialized before accumulating
    for (int count : values){
        partialSum += count;
    }
    emit(key, partialSum);
}
// Phase 2 reduce — total count for one transition-matrix cell.
// key:    Pair(state1, state2)  (the original had "state1" twice — typo)
// values: list of partial counts from the combiners
reduce(Pair(state1, state2) key, List<Integer> values){
    sum = 0;  // must be initialized before accumulating
    for (int count : values){
        sum += count;
    }
    emit(key, sum);
}
Spark 解决方案:此处不采用二次排序,而是在 groupByKey 之后于内存中对每个客户的交易按日期排序
public class SparkMarkov implements Serializable{
public static void main(String[] args) throws Exception{
//处理输入参数
//创建上下文对象
JavaSparkContext ctx =new JavaSparkContext();
//从hdfs创建RDD string
JavaRDD<String> record= ctx.textFile("",1);
//从stringRDD中抽取pairRDD
JavaPairRDD<String,Tuple2<long,Integer>>kv= record.mapToPair(
new PairFunction<String, String,Tuple2<Long,Integer>>(){
public Tuple2<String,Tuple2<Long,Integer>>call(String,s){
String[] tokens=s.split(",");
String customer=tokens[0];
String transaction =tokens[1];
Long date=Long.parsLong(tokens[2]);
Int amount= Integer.parseInt(tokens[3])
Tuple2<Long,Integer> pair= new Tuple2<Long,Integer>(date, amount);
return(new Tuple2<String,Tuple2<Long,Integer>>(customer, pair));
}
}
)
//按照customerid进行分组
JavaPairRDD<String,Iterable<Tuple2<Long,Integer>>> customerRDD=kv.groupByKey();
//创建状态序列
JavaPairRDD<String,Iterable<Tuple2<Long,Integer>>> statesequence= customerRDD.mapValues(
new Function<Iterable<Tuple2<Long,Integer>>, List<String>>(){
public List<String> call(Iterable<Tuple2<Long,Integer>>dateandamount){
List<Tuple2<Long,Integer>> list = toList(dateandamount);
collections.sort(list,TupleComparatorAscending.INSTANCE);
List<String> statesequence = toStateSequence(list);
return statesequence;
}
})
//生成状态转移矩阵
JavaPairRDD<Tuple2<String,String>, Integer> model =statesequence.flatMapToPair(
new PairFlatMapFunction<Tuple2<String,List<String>>,Tuple2<String,String>, Integer> (){
public Iterable<Tuple2<Tuple2<String,String>,Integer>>call(Tuple2<String,List<String>> s){
List<String> states =s._2;
List<Tuple2<Tuple2<String,String>,Integer>> mapout=
new ArrayList<Tuple2<Tuple2<String,String>,Integer>>();
for(int i=0; i<stats.size()-1;i++){
String fromstate=states.get(i);
String tostate=states.get(i+1);
Tuple2<String,String> pair= new Tuple2<<String,String>>(fromstate,tostate);
mapout.add(new Tuple2<Tuple2<String,String>,Integer>(pair, 1))
}
return mapout;
}
}))
JavaPairRDD<Tuple2<String,String>, Integer> markovmodel=model.reduceByKey(
new Function2<Integer,Integer,Integer(){
public Integer call(Integer i1,Integer i2){
return i1+i2;}}>)
//发出最终输出
JavaPairRDD<String> markovmodelformatted=markovmodel.map(
new Function<Tuple2<Tuple2<String,String>,Integer>,String>(){
public String call(Tuple2<Tuple2<String,String,Integer>> t){
return t._1._1+","+t._1._2+"/t"+t._2;
}})
//
}
static List<Tuple2<long, integer>> to List<Iterable<Tuple2<long,integer>>>{...};
static List<String> tOsatesquence(List<Tuple2<Long,Integer>> List){...};
}