用 Java 实现了 PrefixSpan，逻辑基本沿用 Spark 的 PrefixSpan；把 List 换成 RDD，基本就是 Spark 的实现。
代码
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import scala.Tuple2;
/**
 * In-memory PrefixSpan sequential-pattern miner that follows the logic of
 * Spark's PrefixSpan, using plain {@link List}s where Spark uses RDDs.
 *
 * <p>A sequence is a list of itemsets and an itemset is a list of items.
 * Internally every distinct item is mapped to a positive integer and a
 * sequence is flattened into a single integer list in which {@code 0}
 * delimits itemsets, e.g. {@code <(12)(3)>} becomes {@code [0, 1, 2, 0, 3, 0]}.
 *
 * <p>No minimum-support threshold is applied: every pattern that occurs in at
 * least one input sequence is reported together with the number of sequences
 * that contain it.
 *
 * <p>The class-level type parameter is not used; the static methods declare
 * their own.
 */
public class JavaPrefixSpan<T>
{
    /**
     * Small demo: mines two hard-coded sequences and prints every pattern,
     * its frequency, a {@code "/"} separator line, the pattern count and
     * finally {@code "Done"}.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        List<List<List<Integer>>> data = Arrays.asList(
            Arrays.asList(Arrays.asList(1, 2), Arrays.asList(2, 3)),
            Arrays.asList(Arrays.asList(1, 2, 5)));
        List<FreqSequence<Integer>> result = run(data);
        for (FreqSequence<Integer> seq : result)
        {
            System.out.println(seq.sequence());
            System.out.println(seq.freq());
            System.out.println("/");
        }
        System.out.println(result.size());
        System.out.println("Done");
    }

    /**
     * Mines all sequential patterns contained in {@code data}.
     *
     * @param data input database: a list of sequences, each a list of itemsets
     * @param <T>  item type; used as a hash-map key, so it needs consistent
     *             {@code equals}/{@code hashCode}
     * @return one {@link FreqSequence} per discovered pattern, in no particular order
     */
    public static <T> List<FreqSequence<T>> run(List<List<List<T>>> data)
    {
        List<Tuple2<T, Long>> freqItemAndCount = findAllItem(data);
        // Most frequent items first. Long.compare returns 0 for ties, which the
        // Comparator contract requires and which keeps the sort well-defined.
        List<T> freqItems = freqItemAndCount.stream()
            .sorted((a, b) -> Long.compare(b._2, a._2))
            .map(s -> s._1)
            .collect(Collectors.toList());
        Map<T, Integer> itemToInt = zipWithIndex(freqItems);
        List<List<Integer>> transData = transToSeqItem(data, itemToInt);
        List<Tuple2<List<Integer>, Long>> result = genFreqPatterns(transData);
        List<FreqSequence<T>> freqSequences = new ArrayList<FreqSequence<T>>(result.size());
        for (Tuple2<List<Integer>, Long> pattern : result)
        {
            freqSequences.add(new FreqSequence<T>(toPublicRepr(pattern._1, freqItems), pattern._2));
        }
        return freqSequences;
    }

    /**
     * Grows patterns level by level, starting from the empty prefix.
     *
     * <p>In each round every sequence is projected onto every current prefix;
     * each item that can extend the prefix is counted once per sequence. Every
     * counted (prefix, item) pair becomes a reported pattern and a prefix for
     * the next round. The loop ends when a round yields no extension.
     *
     * @param data sequences in the internal 0-delimited integer encoding
     * @return pairs of (pattern in internal encoding, terminated by 0; number
     *         of sequences containing it)
     */
    private static List<Tuple2<List<Integer>, Long>> genFreqPatterns(List<List<Integer>> data)
    {
        List<Postfix<Integer>> postfixes = new ArrayList<Postfix<Integer>>(data.size());
        for (List<Integer> sequence : data)
        {
            postfixes.add(new Postfix<Integer>(sequence));
        }
        List<Tuple2<List<Integer>, Long>> retValue = new ArrayList<Tuple2<List<Integer>, Long>>();
        Map<Integer, Prefix> prefixMap = new HashMap<Integer, Prefix>();
        Prefix empty = Prefix.empty();
        prefixMap.put(empty.id, empty);
        while (!prefixMap.isEmpty())
        {
            // (prefix id, extending item) -> number of sequences supporting the extension.
            Map<Tuple2<Integer, Integer>, Long> counts = new HashMap<Tuple2<Integer, Integer>, Long>();
            for (Postfix<Integer> postfix : postfixes)
            {
                for (Prefix prefix : prefixMap.values())
                {
                    // genPrefixItems reports each item at most once per postfix,
                    // so every entry contributes exactly 1 to the support count.
                    List<Tuple2<Integer, Long>> candidates = postfix.project(prefix).genPrefixItems();
                    for (Tuple2<Integer, Long> candidate : candidates)
                    {
                        counts.merge(new Tuple2<Integer, Integer>(prefix.id, candidate._1), 1L, Long::sum);
                    }
                }
            }
            Map<Integer, Prefix> newPrefixMap = new HashMap<Integer, Prefix>();
            for (Map.Entry<Tuple2<Integer, Integer>, Long> entry : counts.entrySet())
            {
                int id = entry.getKey()._1;
                int item = entry.getKey()._2;
                Prefix newPrefix = prefixMap.get(id).add(item);
                // Close the last itemset with the 0 delimiter for the reported pattern.
                List<Integer> items = new ArrayList<Integer>(newPrefix.items);
                items.add(0);
                retValue.add(new Tuple2<List<Integer>, Long>(items, entry.getValue()));
                newPrefixMap.put(newPrefix.id, newPrefix);
            }
            prefixMap = newPrefixMap;
        }
        return retValue;
    }

    /**
     * Converts a pattern from the internal encoding back to lists of items.
     *
     * @param pattern   internal pattern; index 0 is skipped (it holds the leading
     *                  delimiter) and every later 0 closes one itemset
     * @param freqItems items indexed by internal id minus one
     * @return the pattern as a list of itemsets; items after the last 0, if any,
     *         are not included
     */
    private static <T> List<List<T>> toPublicRepr(List<Integer> pattern, List<T> freqItems)
    {
        List<List<T>> sequenceBuilder = new ArrayList<List<T>>();
        List<T> itemsetBuilder = new ArrayList<T>();
        int n = pattern.size();
        for (int i = 1; i < n; i++)
        {
            int x = pattern.get(i);
            if (x == 0)
            {
                sequenceBuilder.add(itemsetBuilder);
                itemsetBuilder = new ArrayList<T>();
            }
            else
            {
                // Internal ids are 1-based.
                itemsetBuilder.add(freqItems.get(x - 1));
            }
        }
        return sequenceBuilder;
    }

    /**
     * Encodes the database into the internal representation: each sequence
     * becomes {@code [0, itemset..., 0, itemset..., 0]} with items replaced by
     * their 1-based ids and sorted ascending inside each itemset.
     *
     * <p>Items missing from {@code itemToInt} are skipped, itemsets that end up
     * empty are dropped, and sequences without any encoded item are omitted.
     *
     * @param data      input database
     * @param itemToInt item to 0-based index mapping
     * @return encoded sequences
     */
    private static <T> List<List<Integer>> transToSeqItem(List<List<List<T>>> data, Map<T, Integer> itemToInt)
    {
        List<List<Integer>> retValue = new ArrayList<List<Integer>>();
        for (List<List<T>> sequence : data)
        {
            List<Integer> allItems = new ArrayList<Integer>();
            allItems.add(0);
            boolean containsItems = false;
            for (List<T> itemset : sequence)
            {
                List<Integer> encoded = new ArrayList<Integer>(itemset.size());
                for (T item : itemset)
                {
                    Integer index = itemToInt.get(item);
                    // Guard against unmapped items instead of failing on null unboxing.
                    if (index != null)
                    {
                        encoded.add(index + 1);
                    }
                }
                if (!encoded.isEmpty())
                {
                    containsItems = true;
                    Collections.sort(encoded);
                    allItems.addAll(encoded);
                    allItems.add(0);
                }
            }
            if (containsItems)
            {
                retValue.add(allItems);
            }
        }
        return retValue;
    }

    /**
     * Counts, for every distinct item, the number of sequences that contain it
     * (an item occurring several times in one sequence is counted once).
     *
     * @param data input database
     * @return (item, number of containing sequences) pairs in hash-map order
     */
    private static <T> List<Tuple2<T, Long>> findAllItem(List<List<List<T>>> data)
    {
        Map<T, Long> support = new HashMap<T, Long>();
        for (List<List<T>> sequence : data)
        {
            Set<T> uniqItem = new HashSet<T>();
            for (List<T> itemset : sequence)
            {
                uniqItem.addAll(itemset);
            }
            for (T item : uniqItem)
            {
                support.merge(item, 1L, Long::sum);
            }
        }
        List<Tuple2<T, Long>> retValue = new ArrayList<Tuple2<T, Long>>(support.size());
        for (Map.Entry<T, Long> entry : support.entrySet())
        {
            retValue.add(new Tuple2<T, Long>(entry.getKey(), entry.getValue()));
        }
        return retValue;
    }

    /**
     * Maps every element of {@code freqItems} to its position in the list.
     * If an element occurs more than once, its last position wins.
     *
     * @param freqItems items to index
     * @return item to 0-based index mapping
     */
    private static <T> Map<T, Integer> zipWithIndex(List<T> freqItems)
    {
        Map<T, Integer> retValue = new HashMap<T, Integer>();
        for (int i = 0; i < freqItems.size(); i++)
        {
            retValue.put(freqItems.get(i), i);
        }
        return retValue;
    }

    /**
     * A discovered sequential pattern together with its frequency.
     *
     * @param <T> item type
     */
    public static class FreqSequence<T> implements Serializable
    {
        /** The pattern as a list of itemsets. */
        List<List<T>> sequence;
        /** Number of input sequences containing the pattern. */
        long freq;

        /**
         * @param sequence the pattern as a list of itemsets (stored as given, not copied)
         * @param freq     number of input sequences containing the pattern
         */
        public FreqSequence(List<List<T>> sequence, long freq)
        {
            this.sequence = sequence;
            this.freq = freq;
        }

        /** @return the pattern as a list of itemsets */
        public List<List<T>> sequence()
        {
            return sequence;
        }

        /** @return number of input sequences containing the pattern */
        public long freq()
        {
            return freq;
        }
    }

    /**
     * A view on the tail of an internally encoded sequence, i.e. what is left
     * of the sequence after a prefix has been matched.
     *
     * <p>{@code items} is the full encoded sequence and is expected to end with
     * the delimiter 0 (the scans in this class rely on that to terminate).
     * {@code start} is the index where the view begins. {@code partialStarts}
     * holds indices inside itemsets, each directly after a matched item, from
     * which the last itemset of the prefix can still be extended.
     *
     * @param <T> unused; kept so existing parameterized references keep compiling
     */
    public static class Postfix<T> implements Serializable
    {
        List<Integer> items;
        int start;
        List<Integer> partialStarts;

        /**
         * Creates a postfix covering the whole sequence.
         *
         * @param items encoded sequence
         */
        public Postfix(List<Integer> items)
        {
            this(items, 0, new ArrayList<Integer>());
        }

        /**
         * Creates a postfix starting at {@code start} without partial starts.
         *
         * @param items encoded sequence
         * @param start index where the postfix begins
         */
        public Postfix(List<Integer> items, int start)
        {
            this(items, start, new ArrayList<Integer>());
        }

        /**
         * @param items         encoded sequence (stored as given, not copied)
         * @param start         index where the postfix begins
         * @param partialStarts indices from which the prefix's last itemset may be extended
         */
        public Postfix(List<Integer> items, int start, List<Integer> partialStarts)
        {
            this.items = items;
            this.start = start;
            this.partialStarts = partialStarts;
        }

        /**
         * @return index of the first delimiter (0) at or after {@code start};
         *         complete itemsets of the postfix begin after it
         */
        int fullStart()
        {
            int i = start;
            while (items.get(i) != 0)
            {
                i++;
            }
            return i;
        }

        /**
         * Lists every item that can extend the prefix this postfix was projected on.
         *
         * <p>Items reachable from a partial start are reported negated (they extend
         * the prefix's last itemset); items found from {@link #fullStart()} onwards
         * are reported as-is (they open a new itemset). Each key is reported once,
         * for its first occurrence, paired with {@code lastIndex - position}.
         *
         * @return (signed item, remaining size) pairs in hash-map order
         */
        public List<Tuple2<Integer, Long>> genPrefixItems()
        {
            int n1 = items.size() - 1;
            Map<Integer, Long> prefixed = new HashMap<Integer, Long>();
            // a) items that can be added to the last itemset of the prefix
            for (int partialStart : partialStarts)
            {
                int i = partialStart;
                int x = -items.get(i);
                while (x != 0)
                {
                    prefixed.putIfAbsent(x, (long) (n1 - i));
                    i++;
                    x = -items.get(i);
                }
            }
            // b) items that can be appended to the prefix as a new itemset
            int i = fullStart();
            while (i < n1)
            {
                int x = items.get(i);
                if (x != 0)
                {
                    prefixed.putIfAbsent(x, (long) (n1 - i));
                }
                i++;
            }
            List<Tuple2<Integer, Long>> retValue = new ArrayList<Tuple2<Integer, Long>>(prefixed.size());
            for (Map.Entry<Integer, Long> entry : prefixed.entrySet())
            {
                retValue.add(new Tuple2<Integer, Long>(entry.getKey(), entry.getValue()));
            }
            return retValue;
        }

        /** @return {@code true} if at least one element follows {@code start} */
        public boolean nonEmpty()
        {
            return items.size() > start + 1;
        }

        /**
         * Projects this postfix on a single signed item.
         *
         * <p>A negative value {@code -a} looks for {@code a} inside the itemsets
         * reachable from the partial starts; a positive value looks for the item
         * in the complete itemsets from {@link #fullStart()} onwards. If nothing
         * matches, the result starts at the last index and is therefore empty.
         *
         * @param prefix signed item, must not be 0
         * @return the projected postfix sharing this postfix's {@code items}
         * @throws IllegalArgumentException if {@code prefix} is 0
         */
        Postfix<T> project(int prefix)
        {
            if (prefix == 0)
            {
                throw new IllegalArgumentException("prefix item must not be 0");
            }
            int n1 = items.size() - 1;
            boolean matched = false;
            int newStart = n1;
            List<Integer> newPartialStarts = new ArrayList<Integer>();
            if (prefix < 0)
            {
                // Search for partial projections.
                int target = -prefix;
                for (int partialStart : partialStarts)
                {
                    int i = partialStart;
                    int x = items.get(i);
                    while (x != target && x != 0)
                    {
                        i++;
                        x = items.get(i);
                    }
                    if (x == target)
                    {
                        i++;
                        if (!matched)
                        {
                            newStart = i;
                            matched = true;
                        }
                        if (items.get(i) != 0)
                        {
                            newPartialStarts.add(i);
                        }
                    }
                }
            }
            else
            {
                // Search for the item in the full postfix.
                int target = prefix;
                int i = fullStart();
                while (i < n1)
                {
                    int x = items.get(i);
                    if (x == target)
                    {
                        if (!matched)
                        {
                            newStart = i;
                            matched = true;
                        }
                        if (items.get(i + 1) != 0)
                        {
                            newPartialStarts.add(i + 1);
                        }
                    }
                    i++;
                }
            }
            return new Postfix<T>(items, newStart, newPartialStarts);
        }

        /**
         * Projects this postfix on a prefix given in the internal encoding.
         *
         * <p>A 0 marks the start of a new itemset: the item following it is
         * projected as a full (positive) match, every other item as a partial
         * (negated) match. Projection stops early once the postfix is empty.
         *
         * @param prefix encoded prefix items
         * @return the projected postfix, or this instance for an empty prefix
         */
        Postfix<T> project(List<Integer> prefix)
        {
            boolean partial = true;
            Postfix<T> cur = this;
            int np = prefix.size();
            for (int i = 0; i < np && cur.nonEmpty(); i++)
            {
                int x = prefix.get(i);
                if (x == 0)
                {
                    partial = false;
                }
                else if (partial)
                {
                    cur = cur.project(-x);
                }
                else
                {
                    cur = cur.project(x);
                    partial = true;
                }
            }
            return cur;
        }

        /**
         * Projects this postfix on the given prefix.
         *
         * @param prefix prefix whose {@code items} are used
         * @return the projected postfix
         */
        public Postfix<T> project(Prefix prefix)
        {
            return project(prefix.items);
        }

        /**
         * Drops the elements before {@code start}.
         *
         * @return a postfix over a copy of {@code items[start..]} with {@code start}
         *         reset to 0 and partial starts shifted accordingly, or this
         *         instance if {@code start} is already 0
         */
        public Postfix<T> compressed()
        {
            if (start <= 0)
            {
                return this;
            }
            final int offset = start;
            List<Integer> tail = new ArrayList<Integer>(items.subList(offset, items.size()));
            List<Integer> shifted = partialStarts.stream()
                .map(s -> s - offset)
                .collect(Collectors.toList());
            return new Postfix<T>(tail, 0, shifted);
        }
    }

    /**
     * A pattern prefix in the internal encoding (without the trailing
     * delimiter), carrying an id taken from a counter shared by all instances.
     */
    public static class Prefix implements Serializable
    {
        /** Source of ids; shared by all prefixes created in this JVM. */
        private static final AtomicInteger counter = new AtomicInteger(-1);

        /** Encoded items; every itemset is preceded by the delimiter 0. */
        public List<Integer> items;
        /** Number of items in the prefix, delimiters not counted. */
        public int length;
        /** Id assigned at construction from the shared counter. */
        public int id;

        /**
         * @param items  encoded items (stored as given, not copied)
         * @param length number of items, delimiters not counted
         */
        public Prefix(List<Integer> items, int length)
        {
            this.items = items;
            this.length = length;
            id = nextId();
        }

        /**
         * Returns a new prefix extended by one item; this prefix is not modified.
         *
         * @param item a negative value {@code -a} appends {@code a} to the last
         *             itemset; a positive value starts a new itemset (a 0 is
         *             inserted before it)
         * @return the extended prefix with a fresh id
         * @throws IllegalArgumentException if {@code item} is 0
         */
        public Prefix add(int item)
        {
            if (item == 0)
            {
                throw new IllegalArgumentException("item must not be 0");
            }
            List<Integer> newItems = new ArrayList<Integer>(items);
            if (item < 0)
            {
                newItems.add(-item);
            }
            else
            {
                newItems.add(0);
                newItems.add(item);
            }
            return new Prefix(newItems, length + 1);
        }

        /** @return the next id from the shared counter (the first call yields 0) */
        public static int nextId()
        {
            return counter.incrementAndGet();
        }

        /** @return a new prefix with no items and length 0 */
        public static Prefix empty()
        {
            return new Prefix(new ArrayList<Integer>(), 0);
        }
    }
}