// k-means.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <fstream>
#include <math.h>
#include <vector>
#include <iostream>
// Number of clusters to build. A typed constant is used instead of a macro so
// that the preprocessor cannot silently rewrite unrelated identifiers named k.
const int k = 3;
using namespace std;
// Holds the attribute values of one data point (tuple) to be clustered.
// Both attributes default to zero so that a default-constructed Tuple never
// carries indeterminate values.
struct Tuple
{
    float attr1 = 0.0f; // first attribute
    float attr2 = 0.0f; // second attribute
};
// Returns the Euclidean distance between two tuples, treating attr1 and attr2
// as the two coordinates of each point.
float getDistXY(Tuple t1, Tuple t2)
{
    const float delta1 = t1.attr1 - t2.attr1;
    const float delta2 = t1.attr2 - t2.attr2;
    return sqrt(delta1 * delta1 + delta2 * delta2);
}
// Returns the index (0 .. k-1) of the centroid in means that is closest to
// tuple. When several centroids are equally close the lowest index wins,
// because a later centroid only replaces the current best if strictly closer.
int clusterOfTuple(Tuple means[], Tuple tuple)
{
    int nearest = 0; // index of the closest centroid seen so far
    float best = getDistXY(means[0], tuple);
    for (int idx = 1; idx < k; ++idx)
    {
        const float candidate = getDistXY(means[idx], tuple);
        if (!(candidate < best))
        {
            continue;
        }
        best = candidate;
        nearest = idx;
    }
    return nearest;
}
// Returns the clustering criterion for the given clusters: the sum, over all
// k clusters, of the distance from every member to that cluster's centroid.
// Note that plain distances (not squared distances) are accumulated.
//   clusters - array of k clusters
//   means    - array of k centroids, means[i] belonging to clusters[i]
float getVar( vector<Tuple> clusters[], Tuple means[] )
{
    float var = 0;
    for (int i = 0; i < k; i++)
    {
        // Bind by reference: copying the whole cluster on every call is wasted work.
        const vector<Tuple>& members = clusters[i];
        for (size_t j = 0; j < members.size(); j++)
        {
            var += getDistXY(members[j], means[i]);
        }
    }
    return var;
}
// Returns the centroid of a cluster: the arithmetic mean of attr1 and of attr2
// over all of its members. The sums are accumulated in double precision.
// An empty cluster has no mean; (0, 0) is returned for it instead of the NaN
// values a division by zero would produce.
Tuple getMeans(const vector<Tuple>& cluster)
{
    Tuple t;
    t.attr1 = 0;
    t.attr2 = 0;
    if (cluster.empty())
    {
        return t;
    }
    double sumX = 0;
    double sumY = 0;
    for (size_t i = 0; i < cluster.size(); i++)
    {
        sumX += cluster[i].attr1;
        sumY += cluster[i].attr2;
    }
    t.attr1 = static_cast<float>(sumX / cluster.size());
    t.attr2 = static_cast<float>(sumY / cluster.size());
    return t;
}
void KMeans(vector<Tuple> tuples)
{
vector<Tuple> clusters[k]; //设置有几簇
Tuple means[k]; //设置每簇的质心
int i = 0;
//默认一开始将前k个元组的值作为k个簇的质心
for (; i < k; i++)
{
means[i].attr1 = tuples[i].attr1;
means[i].attr2 = tuples[i].attr2;
}
//看当前元组属于哪个簇,并将其导入
int label = 0;
for (i = 0; i != tuples.size(); i++)
{
label = clusterOfTuple(means, tuples[i]);
clusters[label].push_back(tuples[i]);
}
//输出刚开始的簇
for (label = 0; label < k; label++)
{
cout << "第" <<label + 1 << "个簇:"<<endl;
vector<Tuple> t = clusters[label];
for ( i = 0; i < t.size(); i++)
{
cout <<"(" << t[i].attr1 <<", "<< t[i].attr2<<")"<<"\t";
}
cout<<endl;
}
float oldVar = -1;
float newVar = getVar(clusters, means);
while (abs(newVar - oldVar) >= 1) //当新旧函数值即平方差不到1即则准函数值不发生明显变化时,算法终止
{
for (i = 0; i < k; i++) //更新每个簇的中心点
{
means[i] = getMeans(clusters[i]);
}
oldVar = newVar;
newVar = getVar(clusters, means);
//清空每个簇
for (i = 0; i < k; i++)
{
clusters[i].clear();
}
//根据新的质心获得新的簇
for (i = 0; i != tuples.size(); i++)
{
label = clusterOfTuple(means, tuples[i]);
clusters[label].push_back(tuples[i]);
}
//输出当前簇
for (label = 0; label < k; label++)
{
cout << "第" <<label + 1 << "个簇:"<<endl;
vector<Tuple> t = clusters[label];
for ( i = 0; i < t.size(); i++)
{
cout <<"(" << t[i].attr1 <<", "<< t[i].attr2<<")"<<"\t";
}
cout<<endl;
}
}
}
// Program entry point: reads whitespace-separated (attr1, attr2) pairs from
// the data file, prints them, runs KMeans on them and waits for a key press.
// Returns 1 if the data file cannot be opened, 0 otherwise.
int _tmain(int argc, _TCHAR* argv[])
{
    const char* fileName = "C:\\Users\\sony\\Desktop\\data.txt";
    ifstream infile(fileName);
    if (!infile.is_open())
    {
        cout<<"不能打开输入文件"<<fileName<<endl;
        system("pause");
        return 1; // nothing to cluster without input
    }
    vector<Tuple> tuples;
    Tuple tuple;
    // Use the extraction itself as the loop condition: testing eof() before
    // reading would store a stale tuple once more when the file ends in
    // whitespace. A trailing unpaired value is discarded.
    while (infile >> tuple.attr1 >> tuple.attr2)
    {
        tuples.push_back(tuple);
    }
    // Echo the tuples that were read.
    for (auto it = tuples.begin(); it != tuples.end(); ++it)
    {
        cout << "(" << it->attr1 <<", "<< it->attr2<<")"<<"\t";
    }
    cout << endl;
    KMeans(tuples);
    system("pause");
    return 0;
}
原博文链接:点击打开链接
以上为二维的情况,一维的情况见下面的代码。这里要注意:用 ifstream 读取数据时,如果以 infile.eof() 来判断是否到达结尾,最后一个数据会被读取(保存)两次。原因是流的 eofbit 标记只有在尝试读取到文件结尾时才会被设立:读取完最后一个数据后,fstream 仍处于正常状态,所以下一次 while 判断不会跳出循环;再次执行 infile>>x 时,fstream 发现没有数据可读,此时才会设立 failbit。但由于已经进入循环体,虽然没有读到数据,x 仍保留上一次的值,所以又执行了一次 push_back。因此下面的代码改用 while (infile>>x) 作为循环条件。
代码如下:
// text.cpp : 定义控制台应用程序的入口点。
#include "stdafx.h"
#include <fstream>
#include <math.h>
#include <vector>
#include <iostream>
// Number of clusters to build. A typed constant is used instead of a macro so
// that the preprocessor cannot silently rewrite unrelated identifiers named k.
const int k = 3;
using namespace std;
// Holds the attribute value of one data point (tuple) to be clustered.
// This is the one-dimensional variant: only attr1 is present.
// The attribute defaults to zero so that a default-constructed Tuple never
// carries an indeterminate value.
struct Tuple
{
    float attr1 = 0.0f;
    // float attr2;  (second attribute, disabled in this one-dimensional variant)
};
// Returns the distance between two one-dimensional tuples, i.e. the absolute
// difference of their attr1 values.
float getDistXY(Tuple t1, Tuple t2)
{
    // |d| is computed directly: sqrt(d * d) can underflow to 0, overflow to
    // infinity or pick up rounding error in the intermediate square.
    return fabs(t1.attr1 - t2.attr1);
}
// Picks the cluster a tuple belongs to: returns the index (0 .. k-1) of the
// centroid in means with the smallest distance to tuple. A later centroid
// replaces the current choice only when it is strictly closer, so ties go to
// the lowest index.
int clusterOfTuple(Tuple means[], Tuple tuple)
{
    int chosen = 0;
    float shortest = getDistXY(means[0], tuple);
    int idx = 1;
    while (idx < k)
    {
        const float current = getDistXY(means[idx], tuple);
        if (current < shortest)
        {
            chosen = idx;
            shortest = current;
        }
        ++idx;
    }
    return chosen;
}
// Returns the clustering criterion for the given clusters: the sum, over all
// k clusters, of the distance from every member to that cluster's centroid.
// Note that plain distances (not squared distances) are accumulated.
//   clusters - array of k clusters
//   means    - array of k centroids, means[i] belonging to clusters[i]
float getVar( vector<Tuple> clusters[], Tuple means[] )
{
    float var = 0;
    for (int i = 0; i < k; i++)
    {
        // Bind by reference: copying the whole cluster on every call is wasted work.
        const vector<Tuple>& members = clusters[i];
        for (size_t j = 0; j < members.size(); j++)
        {
            var += getDistXY(members[j], means[i]);
        }
    }
    return var;
}
// Returns the centroid of a cluster: the arithmetic mean of attr1 over all of
// its members. The sum is accumulated in double precision.
// An empty cluster has no mean; 0 is returned for it instead of the NaN value
// a division by zero would produce.
Tuple getMeans(const vector<Tuple>& cluster)
{
    Tuple t;
    t.attr1 = 0;
    if (cluster.empty())
    {
        return t;
    }
    double sumX = 0;
    for (size_t i = 0; i < cluster.size(); i++)
    {
        sumX += cluster[i].attr1;
    }
    t.attr1 = static_cast<float>(sumX / cluster.size());
    return t;
}
void KMeans(vector<Tuple> tuples)
{
vector<Tuple> clusters[k]; //设置有几簇
Tuple means[k]; //设置每簇的质心
int i = 0;
//默认一开始将前k个元组的值作为k个簇的质心
for (i = 0; i < k; i++)
{
means[i].attr1 = tuples[i].attr1;
}
//看当前元组属于哪个簇,并将其导入
int label = 0;
for (i = 0; i != tuples.size(); i++)
{
label = clusterOfTuple(means, tuples[i]);
clusters[label].push_back(tuples[i]);
}
输出刚开始的簇
//for (label = 0; label < k; label++)
//{
// cout << "第" <<label + 1 << "个簇:"<<endl;
// vector<Tuple> t = clusters[label];
// for ( i = 0; i < t.size(); i++)
// {
// cout <<"(" << t[i].attr1 <<", "<< t[i].attr2<<")"<<"\t";
// }
// cout<<endl;
//}
float oldVar = -1;
float newVar = getVar(clusters, means);
while (abs(newVar - oldVar) >= 1) //当新旧函数值即平方差不到1即则准函数值不发生明显变化时,算法终止
{
for (i = 0; i < k; i++) //更新每个簇的中心点
{
means[i] = getMeans(clusters[i]);
}
oldVar = newVar;
newVar = getVar(clusters, means);
//清空每个簇
for (i = 0; i < k; i++)
{
clusters[i].clear();
}
//根据新的质心获得新的簇
for (i = 0; i != tuples.size(); i++)
{
label = clusterOfTuple(means, tuples[i]);
clusters[label].push_back(tuples[i]);
}
输出当前簇
//for (label = 0; label < k; label++)
//{
// /*cout << "第" <<label + 1 << "个簇:"<<endl;
// vector<Tuple> t = clusters[label];
// for ( i = 0; i < t.size(); i++)
// {
// cout <<"(" << t[i].attr1 <<", "<< t[i].attr2<<")"<<"\t";
// }
// cout<<endl;*/
// cout << "第" <<label + 1 << "个簇:"<<endl;
// vector<Tuple> t = clusters[label];
// for ( i = 0; i < t.size(); i++)
// {
// cout << t[i].attr1 <<"\t";
// }
// cout<<endl;
}
//输出当前簇
for (label = 0; label < k; label++)
{
/*cout << "第" <<label + 1 << "个簇:"<<endl;
vector<Tuple> t = clusters[label];
for ( i = 0; i < t.size(); i++)
{
cout <<"(" << t[i].attr1 <<", "<< t[i].attr2<<")"<<"\t";
}
cout<<endl;*/
cout << "第" <<label + 1 << "个簇:"<<endl;
vector<Tuple> t = clusters[label];
for ( i = 0; i < t.size(); i++)
{
cout << t[i].attr1 <<"\t";
}
cout<<endl;
}
}
// Program entry point: reads whitespace-separated values from the data file
// (one attr1 per tuple), runs KMeans on them and waits for a key press.
// Returns 1 if the data file cannot be opened, 0 otherwise.
int _tmain(int argc, _TCHAR* argv[])
{
    const char* fileName = "C:\\Users\\sony\\Desktop\\data.txt";
    ifstream infile(fileName, ios::in);
    if (!infile.is_open())
    {
        cout<<"不能打开输入文件"<<fileName<<endl;
        system("pause");
        return 1; // nothing to cluster without input
    }
    vector<Tuple> tuples;
    Tuple tuple;
    // The extraction itself is the loop condition, so a failed read at the end
    // of the file never stores a stale value.
    while (infile>>tuple.attr1)
    {
        tuples.push_back(tuple);
    }
    cout << endl;
    KMeans(tuples);
    system("pause");
    return 0;
}