KMeans++ Algorithm and Key Hadoop Code Implementation
Key Points of the Algorithm
The k-means algorithm itself is simple; the hard part is choosing the initial clusters. A poor choice of initial centroids can cause empty clusters, failure to converge, and similar problems. KMeans++ is therefore k-means plus a principled method for choosing the initial centroids.
Roulette Wheel Selection
Roulette wheel selection is a way to sample from a non-uniform distribution using a uniform one: given a probability distribution, draw a random number uniformly from [0, 1], then walk through the distribution from the start, accumulating probabilities until the running sum reaches the random number; the element reached at that point is the sample. In KMeans++ the goal is to spread the initial centroids as far apart as possible, so each new centroid should be chosen with probability proportional to its distance from the centroids already picked, which is exactly the weighting roulette wheel selection produces. In the code it is convenient to seed the centroid list with the zero vector, select k+1 centroids in total, and discard the zero vector at the end. Because the data volume is large in practice, the implementation must stream points through files rather than hold them all in memory.
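To make the sampling step concrete, here is a minimal standalone sketch of roulette wheel selection (my own illustration, not part of the Hadoop code below; the pickIndex helper and the weights array are hypothetical). It scales a uniform draw to the total weight and walks the cumulative sum:

import java.util.Random;
public class RouletteDemo {
    // Pick an index with probability proportional to its weight.
    static int pickIndex(double[] weights, Random rng) {
        double total = 0.0;
        for (double w : weights) {
            total += w;                      // total mass of the wheel
        }
        double r = rng.nextDouble() * total; // uniform draw scaled to the total
        double cum = 0.0;
        for (int i = 0; i < weights.length; i++) {
            cum += weights[i];               // walk the cumulative sum
            if (r < cum) {
                return i;                    // first index whose cumulative weight passes r
            }
        }
        return weights.length - 1;           // guard against floating-point round-off
    }
    public static void main(String[] args) {
        // Larger distances -> larger weights -> higher chance of being picked.
        double[] distances = {0.5, 3.0, 0.1, 6.4};
        System.out.println(pickIndex(distances, new Random()));
    }
}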
Key Hadoop Code Implementation
Feature.java
package edu.bupt.kmeans;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
public class Feature implements Writable {
private ArrayList<Double> features;
public Feature() {
this.features = new ArrayList<Double>();
}
public Feature(ArrayList<Double> features) {
this.features = features;
}
public Feature(int size){ // construct a zero vector of the given dimension
features = new ArrayList<Double>();
for (int i=0;i<size;i++){
features.add(0.0);
}
}
public Feature(String line) { // parse a space-separated line of doubles
String[] valueString = line.split(" ");
features = new ArrayList<Double>();
for (String value : valueString) {
features.add(Double.parseDouble(value));
}
}
public double get(int i){
// Out-of-range indices read as 0.0, so shorter vectors behave as zero-padded.
if (i<getFeatureSize()){
return this.features.get(i);
}else{
return 0;
}
}
public void setFeatures(ArrayList<Double> features) {
this.features = features;
}
public int getFeatureSize() {
return features.size();
}
public ArrayList<Double> getFeatures() {
return features;
}
@Override
public String toString() {
StringBuilder s = new StringBuilder();
for (Double feature : features) {
s.append(feature).append(" ");
}
return s.toString().trim(); // no trailing space; safe even for an empty vector
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(features.size());
for (Double feature : features) {
out.writeDouble(feature);
}
}
@Override
public void readFields(DataInput in) throws IOException {
int size = in.readInt();
features = new ArrayList<Double>();
for (int i=0;i<size;i++){
features.add(in.readDouble());
}
}
public static void main(String[] args) {
// Quick check: parse a space-separated line and print the vector back.
Feature feature = new Feature("-1.246179058446342136e+00 2.288084176933262270e+00");
System.out.println(feature);
}
}
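A quick way to check the Writable contract is a serialization round trip in memory. The sketch below is my own test scaffolding (the FeatureRoundTrip class is hypothetical), not part of the original project:

package edu.bupt.kmeans;
import java.io.*;
public class FeatureRoundTrip {
    public static void main(String[] args) throws IOException {
        Feature original = new Feature("1.5 -2.0 3.25");
        // Serialize with write() ...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // ... and restore with readFields(); both vectors should print identically.
        Feature copy = new Feature();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(original + " -> " + copy);
    }
}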
KMeans.java
package edu.bupt.kmeans;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
public class KMeans {
public static class KmeansMapper extends Mapper<LongWritable, Text, IntWritable,Feature>{
private ArrayList<Feature> kClusters = new ArrayList<Feature>(); // centroids from the previous iteration
private Integer k;
private Integer feature_size;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
k = Integer.parseInt(context.getConfiguration().get("cluster_k"));
feature_size = Integer.parseInt(context.getConfiguration().get("feature_size"));
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FileStatus[] fileList = fileSystem.listStatus(new Path(context.getConfiguration().get("clusterPath")));
BufferedReader in = null;
FSDataInputStream fsi = null;
String line = null;
for (int i=0;i<fileList.length;i++){
if (!fileList[i].isDirectory()){
fsi = fileSystem.open(fileList[i].getPath());
in = new BufferedReader(new InputStreamReader(fsi,"UTF-8"));
while((line=in.readLine())!=null){
Feature feature = new Feature(line);
kClusters.add(feature);
}
// Close each file's streams before opening the next; closing once after the
// loop leaks streams and throws NullPointerException when no file matched.
in.close();
fsi.close();
}
}
// Pad with zero vectors if fewer than k centroids were read.
while (kClusters.size()<k){
kClusters.add(new Feature(feature_size));
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Assign the point to the nearest centroid by Euclidean distance.
Feature feature = new Feature(value.toString());
int temp_k = 0;
double minDis = Double.MAX_VALUE;
double newDis = 0.0;
for (int i=0;i<kClusters.size();i++){
Feature point = kClusters.get(i);
double temp = 0;
for (int j=0;j<feature_size;j++){
temp+=Math.pow((point.get(j)-feature.get(j)),2);
}
newDis = Math.sqrt(temp);
if (newDis<minDis){
minDis = newDis;
temp_k = i;
}
}
context.write(new IntWritable(temp_k),feature);
}
}
public static class KmeansReducer extends Reducer<IntWritable,Feature, NullWritable,Feature>{
private Integer k;
private Integer feature_size;
private ArrayList<Feature> kClusters = new ArrayList<Feature>(); // centroids from the previous iteration
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
k = Integer.parseInt(context.getConfiguration().get("cluster_k"));
feature_size = Integer.parseInt(context.getConfiguration().get("feature_size"));
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FileStatus[] fileList = fileSystem.listStatus(new Path(context.getConfiguration().get("clusterPath")));
BufferedReader in = null;
FSDataInputStream fsi = null;
String line = null;
for (int i=0;i<fileList.length;i++){
if (!fileList[i].isDirectory()){
fsi = fileSystem.open(fileList[i].getPath());
in = new BufferedReader(new InputStreamReader(fsi,"UTF-8"));
while((line=in.readLine())!=null){
Feature feature = new Feature(line);
kClusters.add(feature);
}
// Close each file's streams before opening the next (same fix as the mapper).
in.close();
fsi.close();
}
}
// Pad with zero vectors if fewer than k centroids were read.
while (kClusters.size()<k){
kClusters.add(new Feature(feature_size));
}
}
@Override
protected void reduce(IntWritable key, Iterable<Feature> values, Context context) throws IOException, InterruptedException {
// Component-wise sum of the cluster's points; divided by count at the end.
ArrayList<Double> mean_feature = new ArrayList<Double>();
for (int i=0;i<feature_size;i++){
mean_feature.add(0.0);
}
int count = 0;
for (Feature value:values){
count+=1;
for (int j=0;j<feature_size;j++){
mean_feature.set(j,mean_feature.get(j)+value.get(j));
}
}
if (count>0){
for (int i=0;i<feature_size;i++){
mean_feature.set(i,mean_feature.get(i)/count);
}
context.write(NullWritable.get(),new Feature(mean_feature));
}else{
// Empty cluster: re-emit the previous centroid so the cluster count stays at k.
context.write(NullWritable.get(),kClusters.get(key.get()));
}
}
}
}
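The listing above defines only the Mapper and Reducer; a driver is still needed to wire them into a Job and to set the cluster_k, feature_size, and clusterPath values that setup() reads. A minimal sketch of one iteration might look like the following (the KMeansDriver class name and the input/output paths are my own assumptions, carried over from the rest of the code, not part of the original listing):

package edu.bupt.kmeans;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KMeansDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop1:9000/");
        conf.set("cluster_k", "6");             // read by setup() in mapper and reducer
        conf.set("feature_size", "2");          // feature dimension
        conf.set("clusterPath", "KMeans_out/"); // centroids from the previous step
        Job job = Job.getInstance(conf, "kmeans-iteration");
        job.setJarByClass(KMeans.class);
        job.setMapperClass(KMeans.KmeansMapper.class);
        job.setReducerClass(KMeans.KmeansReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Feature.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Feature.class);
        FileInputFormat.addInputPath(job, new Path("KMeans_in/"));
        FileOutputFormat.setOutputPath(job, new Path("KMeans_iter1/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}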
KMeanspp.java
package edu.bupt.kmeans;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
public final class KMeanspp {
private int k; // number of clusters
private int feature_size; // feature dimension
private FileStatus[] fileList; // files under the source data directory
private FileSystem fs; // file system handle for the source data
private ArrayList<Feature> kClusters; // the set of initial centroids
private Configuration conf; // Hadoop configuration
private String filepath;
public KMeanspp(Configuration conf,String filePath,int k,int feature_size){
this.k = k;
this.filepath = filePath;
this.feature_size = feature_size;
try {
fs = FileSystem.get(conf);
fileList = fs.listStatus((new Path(filePath)));
// Seed the centroid list with the zero vector; k+1 centroids are selected
// in total and the zero vector is never written out (see writeNextCluster).
kClusters = new ArrayList<Feature>();
kClusters.add(new Feature(feature_size));
this.conf = conf;
} catch (IOException e) {
e.printStackTrace();
}
}
// Euclidean distance between two feature vectors.
public double getDis(Feature feature1,Feature feature2){
double newDis = 0.0;
for (int i=0;i<feature_size;i++){
newDis+=Math.pow((feature1.get(i)-feature2.get(i)),2);
}
newDis = Math.sqrt(newDis);
return newDis;
}
// Append one line to a file on HDFS, creating the file if it does not exist.
public void writeLineToTemp(String s, String temp){
Path hdfsPath = new Path(temp);
FSDataOutputStream fileOutputStream = null;
try {
if (!fs.exists(hdfsPath)) {
fileOutputStream = fs.create(hdfsPath,false);
}else{
fileOutputStream = fs.append(hdfsPath);
}
fileOutputStream.writeBytes(s+"\n");
} catch (IOException e) {
e.printStackTrace();
}finally {
if(fileOutputStream!=null){
try {
fileOutputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
// Roulette wheel step: write every point together with its distance to the
// nearest existing centroid into temp.txt, then walk that file until the
// cumulative distance passes a random threshold and return the point reached.
public Feature getNextCluster() throws IOException {
Text line = new Text();
FSDataInputStream fsi = null;
double count = 0; // total wheel mass: the sum of minimum distances
try {
if (fs.exists(new Path(filepath+"temp.txt"))){
fs.delete(new Path(filepath+"temp.txt"),false);}
for(int i = 0;i < fileList.length;i++){
if (!(fileList[i].isDirectory() || fileList[i].getPath().equals(new Path(filepath+"temp.txt")))){
fsi = fs.open(fileList[i].getPath());
LineReader lineReader = new LineReader(fsi,conf);
while(lineReader.readLine(line)>0){
if(line.toString().length()==0){
continue;
}
Feature point = new Feature(line.toString());
double disMin = Double.MAX_VALUE;
for (Feature point2:kClusters){
double dis = getDis(point,point2);
if (dis<disMin){disMin = dis;}
}
count+=disMin; // note: this weights by D(x); classic KMeans++ weights by D(x)^2
writeLineToTemp((point.toString()+" "+Double.toString(disMin)),filepath+"temp.txt");
}
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (fsi != null) { fsi.close(); } // fsi stays null when no data file matched
} catch (IOException e) {
e.printStackTrace();
}
}
// Draw a random threshold on the wheel. (The original new Random(0) fixed the
// seed, which makes every selection deterministic.)
int result = new Random().nextInt((int)count);
Feature newcluster = new Feature();
try {
Path hdfsPath = new Path(filepath+"temp.txt");
fsi = fs.open(hdfsPath);
LineReader lineReader = new LineReader(fsi,conf);
while((lineReader.readLine(line)>0)&&(result>0)) {
if (line.toString().length() == 0) {
continue;
}
String[] l1 = line.toString().split(" ");
double dis = Double.parseDouble(l1[l1.length-1]); // last column is the distance
// Everything before the last column is the feature vector itself.
String fea = line.toString().substring(0, line.toString().lastIndexOf(' '));
result -= dis; // walk the cumulative sum down toward zero
newcluster = new Feature(fea); // holds the point where the walk stops
}
return newcluster;
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (fsi != null) { fsi.close(); }
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
public void writeNextCluster(String firstClusterPath) throws IOException {
// kClusters starts with the zero-vector seed, so loop until k+1 entries;
// only the k real centroids are written to firstClusterPath.
while(kClusters.size()<k+1){
Feature newCluster = getNextCluster();
kClusters.add(newCluster);
writeLineToTemp(newCluster.toString(),firstClusterPath);
}
if (fs.exists(new Path(filepath+"temp.txt"))){
fs.delete(new Path(filepath+"temp.txt"),false);}
}
public static void main(String[] args) throws IOException, InterruptedException {
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://hadoop1:9000/");
KMeanspp test = new KMeanspp(conf,"KMeans_in/",6,2);
test.writeNextCluster("KMeans_out/123.txt");
}
}
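Putting the pieces together: run KMeanspp once to write the k initial centroids (KMeans_out/123.txt in the example above), then run the MapReduce job from KMeans.java repeatedly, each iteration pointing clusterPath at the centroids produced by the previous one, until the centroids stop moving or an iteration budget is exhausted. The spread-out KMeans++ seeds, together with the empty-cluster fallback in the reducer, are what keep the iteration stable.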