KMeans algorithm — Giraph implementation, v1.1.0

package org.apache.giraph.benchmark.kmeans;

import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.giraph.graph.BasicComputation;
import org.apache.giraph.graph.Vertex;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;


/*
 * KMeans algorithm overview:
 * centerPoints[numberOfClusters][numberOfDimensions] — the first index is the
 * cluster, the second is the per-vertex attribute dimension.
 * For each vertex read, its attribute values are compared against every cluster
 * center; the cluster with the smallest Euclidean distance is chosen, and the
 * vertex's values are aggregated toward that cluster's center on the master.
 * Each worker scans its own vertices; after the superstep barrier, the
 * aggregated per-cluster values become the new centerPoints for the next
 * round. The algorithm must wait for the last task to finish in order to
 * obtain the final aggregated values.
 */
public class KMeansComputation extends BasicComputation<Text, Text, Text, Text> {

	/** Comma separator for the vertex value; compiled once ({@link Pattern} is thread-safe). */
	private static final Pattern commonSpliter = Pattern.compile(",");

	/**
	 * Runs one k-means step for this vertex per superstep.
	 *
	 * <p>Superstep 0 publishes each dimension's value to the per-dimension
	 * min/max aggregators so the master can seed the initial centers. Every
	 * later superstep (until {@code maxIterations}) assigns the vertex to its
	 * nearest cluster center, aggregates its coordinates toward that center,
	 * and records the chosen cluster id as the trailing field of the vertex
	 * value.
	 *
	 * @param vertex   the vertex whose value is a comma-separated coordinate
	 *                 list (with a trailing cluster id from superstep 2 on)
	 * @param messages unused; this algorithm communicates via aggregators only
	 * @throws IOException propagated from the Giraph framework
	 */
	@Override
	public void compute(Vertex<Text, Text, Text> vertex, Iterable<Text> messages)
			throws IOException {
		KMeansNodeWorkerContext workerContext = (KMeansNodeWorkerContext) getWorkerContext();

		long superstep = getSuperstep();
		int numberOfDim = workerContext.getNumberOfDimensions();

		// NOTE: storing coordinates as a CSV Text value forces a parse on every
		// superstep; a binary value type would be faster.
		String origValue = vertex.getValue().toString();
		String[] pointsStrings = commonSpliter.split(origValue);

		if (superstep >= workerContext.getMaxIterations()) {
			vertex.voteToHalt();
		} else if (superstep == 0) {
			// Determine the global per-dimension max and min via aggregators.
			double[] points = new double[pointsStrings.length];
			for (int i = 0; i < pointsStrings.length; i++) {
				points[i] = Double.parseDouble(pointsStrings[i]);
			}

			for (int i = 0; i < numberOfDim; i++) {
				aggregate(Const.MAX_DIMENSION_PREFIX + "." + i, new DoubleWritable(points[i]));
				aggregate(Const.MIN_DIMENSION_PREFIX + "." + i, new DoubleWritable(points[i]));
			}
		} else {
			double[] points = parsePointsFromValue(superstep, pointsStrings);
			int clusterCenter = selectClusterCenter(workerContext, points); // pick nearest cluster
			applyClusterCenterAggregates(clusterCenter, points);
			updateValue(vertex, superstep, origValue, clusterCenter);
		}
	}

	/**
	 * Parses the coordinate fields of the vertex value.
	 *
	 * <p>At superstep 1 every field is a coordinate; from superstep 2 on the
	 * last field is the previously assigned cluster id and is dropped.
	 */
	private double[] parsePointsFromValue(long superstep, String[] pointsStrings) {
		double[] points = new double[pointsStrings.length - (superstep == 1 ? 0 : 1)];
		for (int i = 0; i < points.length; i++) {
			points[i] = Double.parseDouble(pointsStrings[i]);
		}
		return points;
	}

	/**
	 * Rewrites the vertex value so its trailing field is the chosen cluster id.
	 * At superstep 1 the id is appended; afterwards the old id is replaced.
	 */
	private void updateValue(Vertex<Text, Text, Text> vertex, long superstep, String origValue, int clusterCenter) {
		// NOTE: repeated string surgery per superstep is slow; a structured
		// value type would avoid it.
		if (superstep == 1) {
			vertex.setValue(new Text(origValue + "," + clusterCenter));
		} else {
			vertex.setValue(new Text(origValue.substring(0, origValue.lastIndexOf(',')) + "," + clusterCenter));
		}
	}

  /**
   * Returns the index of the cluster whose center is nearest to {@code points}
   * by Euclidean distance.
   *
   * <p>Squared distances are compared directly: sqrt is monotonic, so the
   * argmin is identical, and this avoids a sqrt/pow per dimension that the
   * naive incremental formulation would incur.
   *
   * @param workerContext supplies the current [cluster][dimension] centers
   * @param points        the vertex's coordinates
   * @return the nearest cluster index, or -1 if there are no centers
   */
	private int selectClusterCenter(KMeansNodeWorkerContext workerContext, double points[]) {
		double[][] centerPoints = workerContext.getCenters();

		int selectedCluster = -1;
		double shortestSquaredDistance = Double.MAX_VALUE;

		for (int c = 0; c < centerPoints.length; c++) {
			double squaredDistance = 0;

			for (int d = 0; d < centerPoints[c].length; d++) {
				double diff = centerPoints[c][d] - points[d];
				squaredDistance += diff * diff; // sum of squared per-dimension differences
			}

			if (squaredDistance < shortestSquaredDistance) { // keep the closest cluster seen so far
				selectedCluster = c;
				shortestSquaredDistance = squaredDistance;
			}
		}
		return selectedCluster;
	}

	/**
	 * Aggregates this vertex's coordinates toward its chosen cluster's center
	 * and increments that cluster's member count, so the master can recompute
	 * the center after the superstep barrier.
	 */
	private void applyClusterCenterAggregates(int clusterCenter, double[] points) {
		for (int d = 0; d < points.length; d++) {
			aggregate(Const.MAX_DIMENSION_PREFIX + "." + clusterCenter + "." + d, new DoubleWritable(points[d]));
			aggregate(Const.MIN_DIMENSION_PREFIX + "." + clusterCenter + "." + d, new DoubleWritable(points[d]));
		}
		aggregate(Const.CLUSTER_NODE_COUNT_PREFIX + "." + clusterCenter, new LongWritable(1));
	}

}

package org.apache.giraph.benchmark.kmeans;


import java.util.regex.Pattern;

import org.apache.giraph.worker.DefaultWorkerContext;
import org.apache.hadoop.io.Text;

public class KMeansNodeWorkerContext extends DefaultWorkerContext {

	/** Current cluster centers, indexed as [cluster][dimension]. */
	private double[][] centers;

	private int numberOfClusters;
	private int numberOfDimensions;
	private int maxIterations;

	/** Comma separator for the serialized center list; compiled once. */
	private static final Pattern commaPattern = Pattern.compile(",");

	/**
	 * Reads the cluster count, dimension count and iteration limit from the
	 * job configuration and allocates the center matrix.
	 */
	@Override
	public void preApplication() throws InstantiationException,
			IllegalAccessException {
		numberOfClusters = Integer.parseInt(getContext().getConfiguration().get(Const.NUMBER_OF_CLUSTERS));
		numberOfDimensions = Integer.parseInt(getContext().getConfiguration().get(Const.NUMBER_OF_DIMENSIONS));
		maxIterations = Integer.parseInt(getContext().getConfiguration().get(Const.MAX_ITERATIONS));

		centers = new double[numberOfClusters][numberOfDimensions];
	}

	@Override
	public void postApplication() {
	}

  /**
   * Refreshes the center matrix from the CENTER_POINTS aggregator before each
   * superstep. The aggregated text is a flat comma-separated list of
   * numberOfClusters * numberOfDimensions values, laid out cluster-major.
   */
	@Override
	public void preSuperstep() {
		Text aggregated = (Text) getAggregatedValue(Const.CENTER_POINTS);
		if (aggregated == null) {
			return; // no centers published yet
		}

		String serialized = aggregated.toString();
		if (serialized.isEmpty()) {
			return;
		}

		String[] values = commaPattern.split(serialized);
		int idx = 0;
		for (int c = 0; c < numberOfClusters; c++) {
			for (int d = 0; d < numberOfDimensions; d++) {
				centers[c][d] = Double.parseDouble(values[idx++]);
			}
		}
	}

	@Override
	public void postSuperstep() {
	}

	/** @return the current [cluster][dimension] center matrix */
	public double[][] getCenters() {
		return centers;
	}

	/** @return the configured number of clusters */
	public int getNumberOfClusters() {
		return numberOfClusters;
	}

	/** @return the configured number of dimensions per point */
	public int getNumberOfDimensions() {
		return numberOfDimensions;
	}

	/** @return the configured maximum number of k-means iterations */
	public int getMaxIterations() {
		return maxIterations;
	}

}




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip 【备注】 1、该资源内项目代码都经过测试运行成功,功能ok的情况下才上传的,请放心下载使用!有问题请及时沟通交流。 2、适用人群:计算机相关专业(如计科、信息安全、数据科学与大数据技术、人工智能、通信、物联网、自动化、电子信息等)在校学生、专业老师或者企业员工下载使用。 3、用途:项目具有较高的学习借鉴价值,不仅适用于小白学习入门进阶。也可作为毕设项目、课程设计、大作业、初期项目立项演示等。 4、如果基础还行,或热爱钻研,亦可在此项目代码基础上进行修改添加,实现其他不同功能。 欢迎下载!欢迎交流学习!不清楚的可以私信问我! 毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip毕设新项目-基于Java开发的智慧养老院信息管理系统源码+数据库(含vue前端源码).zip
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值