// Driver pseudocode for iterative k-means on MapReduce.
// k:     desired number of clusters
// delta: acceptable convergence error
// data:  input data
kmeans(k, delta, data) {
    // Initialise the cluster centroids.
    initial_centroids = pick(k, data);
    // Writing to HDFS is how the centroids are broadcast to the mappers.
    writeToHDFS(initial_centroids);
    // Iterate as long as necessary.
    current_centroids = initial_centroids;
    while (true) {
        // theMapReduceJob performs two tasks:
        //   map uses current_centroids
        //   reduce creates new_centroids and writes them to HDFS
        theMapReduceJob();
        new_centroids = readFromHDFS();
        // Stop once the centroids moved by no more than delta; otherwise
        // the freshly computed centroids become the input of the next pass.
        if (change(new_centroids, current_centroids) <= delta) {
            break;
        } else {
            current_centroids = new_centroids;
        }
    }
    result = readFromHDFS();
    return result;
}
map()使用质心把各个点分配到最近的中心,输出(clusterID,vector)键值对
// Mapper: assigns each input point to its nearest centroid and emits
// (nearest centroid, point).
public class KmeansMapper...{
    // Cluster centroids; assigned in setup().
    private List<Vector> centers=null;

    private List<Vector> readCentersFromSequenceFile(){
        // Read the cluster centroids from the SequenceFile, where they are
        // stored as key-value pairs.
    }

    public void setup(Context context){
        this.centers=readCentersFromSequenceFile();
    }

    // key is generated by MapReduce and can be ignored;
    // value is a d-dimensional vector.
    map(Object key,Vector value){
        Vector nearest=null;
        double nearestDistance=Double.MAX_VALUE;
        for(Vector center: centers){
            double distance=EuclideanDistance.calculateDistance(center,value);
            // The first centre is always taken; afterwards only a strictly
            // closer centre replaces the current best.
            if (nearest==null || nearestDistance>distance){
                nearest=center;
                nearestDistance=distance;
            }
        }
        // Emit inside map(), once per input point.
        emit(nearest,value);
    }
}
combine() 累加向量对象各个维的值
// Combiner: adds up, dimension by dimension, the vectors that share a key
// and emits (key, sum).
combine(Vector key, Iterable<Vector> values){
    Vector sum=new Vector();
    for(Vector value:values){
        // The index must start at 0 so that every dimension is accumulated.
        for (int i=0; i<value.length; i++){
            sum[i]+=value[i];
        }
    }
    // NOTE(review): only the sum is emitted, not how many vectors went into
    // it. reduce() divides by the number of values it receives, so partial
    // sums from this combiner would skew the mean -- confirm whether a count
    // should be carried along with the sum.
    emit(key, sum);
}
reduce()计算每个簇的新平均值(质心),输出(clusterID, centroid)键值对
// Reducer: averages the vectors received for one key and emits
// (key.ID, averaged vector).
reduce(Vector key, Iterable<Vector> values){
    // Holds the per-dimension totals first, then the per-dimension means.
    Vector newCenter=new Vector();
    int count=0;
    for(Vector value:values){
        for(int d=0; d<value.length; d++){
            newCenter[d]+=value[d];
        }
        count++;
    }
    // Divide each dimension (as many as the key has) by the number of values seen.
    for(int d=0; d<key.length; d++){
        newCenter[d]/=count;
    }
    emit(key.ID,newCenter);
}
Spark MLlib 提供了常用的机器学习方法,下面用其中的 KMeans 实现聚类
/**
 * Runs k-means with Spark MLlib on a text file of points and prints the
 * resulting cluster centers.
 */
public final class JavaKMeans{
    /**
     * @param args {@code <input_file> <k> <max_iterations> [<runs>]};
     *             {@code runs} falls back to 1 when omitted
     */
    public static void main(String[] args){
        if (args.length < 3){
            System.err.println("Usage: JavaKMeans <input_file> <k> <max_iterations> [<runs>]");
            System.exit(1);
        }
        String inputFile=args[0];
        int k=Integer.parseInt(args[1]);
        int iterations=Integer.parseInt(args[2]);
        int runs=(args.length >= 4) ? Integer.parseInt(args[3]) : 1;

        SparkConf conf=new SparkConf().setAppName("javakmeans");
        JavaSparkContext sc=new JavaSparkContext(conf);
        try{
            JavaRDD<String> lines=sc.textFile(inputFile,1);
            // ParsePoint turns one input line into a Vector.
            JavaRDD<Vector> points=lines.map(new ParsePoint());
            KMeansModel model=KMeans.train(points.rdd(),k,iterations,runs,KMeans.K_MEANS_PARALLEL());
            System.out.println("cluster centers");
            for (Vector center:model.clusterCenters()){
                System.out.println(""+center);
            }
        } finally{
            // Stop the context even when training fails.
            sc.stop();
        }
    }
}