基本Kmeans算法實現 C++代碼
#include?<iostream>#include?<sstream>
#include?<fstream>
#include?<vector>
#include?<math.h>
#include?<stdlib.h>
#define?k?3//簇的數目
using?namespace?std;
//存放元組的屬性信息
typedef?vector<double>?Tuple;//存儲每條數據記錄
int?dataNum;//數據集中數據記錄數目
int?dimNum;//每條記錄的維數
//計算兩個元組間的歐幾裏距離
double?getDistXY(const?Tuple&?t1,?const?Tuple&?t2)?
{
double?sum?=?0;
for(int?i=1;?i<=dimNum;?++i)
{
sum?+=?(t1[i]-t2[i])?*?(t1[i]-t2[i]);
}
return?sqrt(sum);
}
//根據質心,決定當前元組屬於哪個簇
int?clusterOfTuple(Tuple?means[],const?Tuple&?tuple){
double?dist=getDistXY(means[0],tuple);
double?tmp;
int?label=0;//標示屬於哪壹個簇
for(int?i=1;i<k;i++){
tmp=getDistXY(means[i],tuple);
if(tmp<dist)?{dist=tmp;label=i;}
}
return?label;
}
//獲得給定簇集的平方誤差
double?getVar(vector<Tuple>?clusters[],Tuple?means[]){
double?var?=?0;
for?(int?i?=?0;?i?<?k;?i++)
{
vector<Tuple>?t?=?clusters[i];
for?(int?j?=?0;?j<?t.size();?j++)
{
var?+=?getDistXY(t[j],means[i]);
}
}
//cout<<"sum:"<<sum<<endl;
return?var;
}
//獲得當前簇的均值(質心)
Tuple?getMeans(const?vector<Tuple>&?cluster){
int?num?=?cluster.size();
Tuple?t(dimNum+1,?0);
for?(int?i?=?0;?i?<?num;?i++)
{
for(int?j=1;?j<=dimNum;?++j)
{
t[j]?+=?cluster[i][j];
}
}
for(int?j=1;?j<=dimNum;?++j)
t[j]?/=?num;
return?t;
//cout<<"sum:"<<sum<<endl;
}
void?print(const?vector<Tuple>?clusters[])
{
for(int?lable=0;?lable<k;?lable++)
{
cout<<"第"<<lable+1<<"個簇:"<<endl;
vector<Tuple>?t?=?clusters[lable];
for(int?i=0;?i<t.size();?i++)
{
cout<<i+1<<".(";
for(int?j=0;?j<=dimNum;?++j)
{
cout<<t[i][j]<<",?";
}
cout<<")\n";
}
}
}
void?KMeans(vector<Tuple>&?tuples){
vector<Tuple>?clusters[k];//k個簇
Tuple?means[k];//k個中心點
int?i=0;
//壹開始隨機選取k條記錄的值作為k個簇的質心(均值)
srand((unsigned?int)time(NULL));
for(i=0;i<k;){
int?iToSelect?=?rand()%tuples.size();
if(means[iToSelect].size()?==?0)
{
for(int?j=0;?j<=dimNum;?++j)
{
means[i].push_back(tuples[iToSelect][j]);
}
++i;
}
}
int?lable=0;
//根據默認的質心給簇賦值
for(i=0;i!=tuples.size();++i){
lable=clusterOfTuple(means,tuples[i]);
clusters[lable].push_back(tuples[i]);
}
double?oldVar=-1;
double?newVar=getVar(clusters,means);
cout<<"初始的的整體誤差平方和為:"<<newVar<<endl;?
int?t?=?0;
while(abs(newVar?-?oldVar)?>=?1)?//當新舊函數值相差不到1即準則函數值不發生明顯變化時,算法終止
{
cout<<"第?"<<++t<<"?次叠代開始:"<<endl;
for?(i?=?0;?i?<?k;?i++)?//更新每個簇的中心點
{
means[i]?=?getMeans(clusters[i]);
}
oldVar?=?newVar;
newVar?=?getVar(clusters,means);?//計算新的準則函數值
for?(i?=?0;?i?<?k;?i++)?//清空每個簇
{
clusters[i].clear();
}
//根據新的質心獲得新的簇
for(i=0;?i!=tuples.size();?++i){
lable=clusterOfTuple(means,tuples[i]);
clusters[lable].push_back(tuples[i]);
}
cout<<"此次叠代之後的整體誤差平方和為:"<<newVar<<endl;?
}
cout<<"The?result?is:\n";
print(clusters);
}
int?main(){
char?fname[256];
cout<<"請輸入存放數據的文件名:?";
cin>>fname;
cout<<endl<<"?請依次輸入:?維數?樣本數目"<<endl;
cout<<endl<<"?維數dimNum:?";
cin>>dimNum;
cout<<endl<<"?樣本數目dataNum:?";
cin>>dataNum;
ifstream?infile(fname);
if(!infile){
cout<<"不能打開輸入的文件"<<fname<<endl;
return?0;
}
vector<Tuple>?tuples;
//從文件流中讀入數據
for(int?i=0;?i<dataNum?&&?!infile.eof();?++i)
{
string?str;
getline(infile,?str);
istringstream?istr(str);
Tuple?tuple(dimNum+1,?0);//第壹個位置存放記錄編號,第2到dimNum+1個位置存放實際元素
tuple[0]?=?i+1;
for(int?j=1;?j<=dimNum;?++j)
{
istr>>tuple[j];
}
tuples.push_back(tuple);
}
cout<<endl<<"開始聚類"<<endl;
KMeans(tuples);
return?0;
}