Max space clustering (Hamming)

21 篇文章 0 订阅
In this question your task is again to run the clustering algorithm from lecture, but on a MUCH bigger graph. So big, in fact, that the distances (i.e., edge costs) are only defined implicitly, rather than being provided as an explicit list.
The data set is here. The format is:
[# of nodes] [# of bits for each node's label]
[first bit of node 1] ... [last bit of node 1]
[first bit of node 2] ... [last bit of node 2]
...
For example, the third line of the file "0 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1" denotes the 24 bits associated with node #2.


The distance between two nodes u and v in this problem is defined as the Hamming distance — the number of differing bits — between the two nodes' labels. For example, the Hamming distance between the 24-bit label of node #2 above and the label "0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1" is 3 (since they differ in the 3rd, 7th, and 21st bits).


The question is: what is the largest value of k such that there is a k-clustering with spacing at least 3? That is, how many clusters are needed to ensure that no pair of nodes with all but 2 bits in common get split into different clusters?


NOTE: The graph implicitly defined by the data file is so big that you probably can't write it out explicitly, let alone sort the edges by cost. So you will have to be a little creative to complete this part of the question. For example, is there some way you can identify the smallest distances without explicitly looking at every pair of nodes?



#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <bitset>
#include <map>

using namespace std;

// ------------ Union-find ------------
// One slot of the disjoint-set forest; slots live in a flat array and refer
// to each other by array index.
struct UnionNode
{
	int parent; // index of the parent slot; a root stores its own index
	int rank;   // rank compared by Union to decide which root is attached under the other
};

// Turns the first n slots of `nodes` into n singleton sets: every slot becomes
// its own parent and starts with rank 0.
void IniUnionFind(UnionNode *nodes, int n)
{
	int idx = 0;
	while (idx < n)
	{
		UnionNode &slot = nodes[idx];
		slot.rank = 0;
		slot.parent = idx;
		++idx;
	}
}

// Returns the index of the root of the set that contains slot `th`, and points
// every slot visited along the way directly at that root (path compression).
// `n` is not read by this function.
int Find(UnionNode *nodes, int n, int th)
{
	// Pass 1: follow parent links until a slot that is its own parent is reached.
	int root = th;
	while (nodes[root].parent != root)
		root = nodes[root].parent;

	// Pass 2: walk the same chain again and hang each slot directly under the root.
	int cur = th;
	while (cur != root)
	{
		const int next = nodes[cur].parent;
		nodes[cur].parent = root;
		cur = next;
	}

	return root;
}

// Merges the sets containing th1 and th2; does nothing when both already share
// a root. The root with the smaller rank is attached under the other one. On a
// rank tie the root of th2 goes under the root of th1, whose rank grows by one.
void Union(UnionNode *nodes, int n, int th1, int th2)
{
	const int rootA = Find(nodes, n, th1);
	const int rootB = Find(nodes, n, th2);
	if (rootA != rootB)
	{
		UnionNode &a = nodes[rootA];
		UnionNode &b = nodes[rootB];
		if (a.rank < b.rank)
		{
			a.parent = rootB;
		}
		else
		{
			// rank(a) >= rank(b): b is attached under a; only a tie raises a's rank.
			if (a.rank == b.rank)
				a.rank += 1;
			b.parent = rootA;
		}
	}
}

// ----------------------------------
int ClusterNumber(bitset<24> *nodes, int nV, bitset<16777216> &tab, int nB)
{
    int nClu = nV;
    // ------------ Initial -----------------
	UnionNode *unodes = new UnionNode[nV];
	IniUnionFind(unodes, nV);
	map<int, int> posmap;
	for(int i=0; i<nV; i++)
	{
		posmap[nodes[i].to_ulong()] = i;
	}

    // ------------ The max space length =1 -------
    bitset<24> tempB;
    tempB.reset();
    bitset<24> v1;
    bitset<24> v2;
    int root1;
    int root2;
	for(int i=0; i<nB; i++)
    {
        tempB[i] = 1;
        for(int k=0; k<nV; k++)
        {
            v1 = nodes[k];
            v2 = v1^tempB;
            if(tab[v2.to_ulong()] == 0)
                continue;
            
			root1 = Find(unodes, nV, posmap[v1.to_ulong()]);
			root2 = Find(unodes, nV, posmap[v2.to_ulong()]);
			if(root1 != root2)
			{
				nClu--;         
				Union(unodes, nV, posmap[v1.to_ulong()], posmap[v2.to_ulong()]); 
			} 
        } 

        tempB.reset();
    }
    // --------- The max space length = 2 -----------
    for(int i=0; i<nB; i++)
    {
        for(int j=i+1; j<nB; j++)
        {
			tempB[i] = 1;
			tempB[j] = 1;
			for(int k=0; k<nV; k++)
			{
				v1 = nodes[k];
				v2 = v1^tempB;
				if(tab[v2.to_ulong()] == 0)
					continue;


            
				root1 = Find(unodes, nV, posmap[v1.to_ulong()]);
				root2 = Find(unodes, nV, posmap[v2.to_ulong()]);
				if(root1 != root2)
				{
					nClu--;         
					Union(unodes, nV, posmap[v1.to_ulong()], posmap[v2.to_ulong()]); 
				} 
			} 

			tempB.reset();
		}
    }
    
    // ---------------------------------------------- 
	delete unodes;
    return nClu;
}

// Reads "clustering_big.txt" and prints two lines: "Duplication: <d>", where d
// is the number of node labels that repeat an earlier label, and then the
// number of clusters reported by ClusterNumber minus d.
//
// Expected file layout: a header line "<node count> <bits per label>" followed
// by one line per node holding that many 0/1 tokens.
//
// Returns 0 on success and 1 when the file cannot be opened or the header is
// missing or invalid (labels are stored in 24 bits, so more than 24 bits per
// label is rejected).
int main()
{
	std::ifstream infile("clustering_big.txt");
	if (!infile)
	{
		std::cerr << "Cannot open clustering_big.txt" << std::endl;
		return 1;
	}

	// ---------- Header ----------
	std::string line;
	int nV = 0;
	int nB = 0;
	if (!std::getline(infile, line))
	{
		std::cerr << "clustering_big.txt is empty" << std::endl;
		return 1;
	}
	std::istringstream header(line);
	if (!(header >> nV >> nB) || nV < 0 || nB < 0 || nB > 24)
	{
		std::cerr << "Invalid header in clustering_big.txt" << std::endl;
		return 1;
	}

	// ---------- Node labels ----------
	std::vector<std::bitset<24> > nodes;
	nodes.reserve(nV);

	// 2^24 bits = 2 MiB: kept in static storage so it does not sit on main's stack.
	static std::bitset<16777216> tab;

	int dul = 0;
	while (static_cast<int>(nodes.size()) < nV && std::getline(infile, line))
	{
		// A fresh stream per line, so leftover or missing tokens on one line
		// cannot shift the bits of the following lines.
		std::istringstream bits(line);
		std::bitset<24> node;
		int b = 0;
		for (int i = 0; i < nB && (bits >> b); i++)
		{
			if (b == 1)
				node[i] = 1;
		}

		const unsigned long label = node.to_ulong();
		if (tab.test(label))
			dul++; // this label was already seen on an earlier line
		tab.set(label);
		nodes.push_back(node);
	}
	infile.close();

	// Only hand over the nodes that were actually read; a truncated file must
	// not contribute phantom all-zero labels.
	const int n = static_cast<int>(nodes.size());
	if (n < nV)
	{
		std::cerr << "Warning: header announces " << nV << " nodes but only "
		          << n << " were read" << std::endl;
	}

	std::cout << "Duplication: " << dul << std::endl;

	// ---------- Clustering ----------
	int numCluster = 0;
	if (n > 0)
		numCluster = ClusterNumber(&nodes[0], n, tab, nB);

	// Identical labels are at distance 0 and always share a cluster, but
	// ClusterNumber counts each of them separately, hence the subtraction.
	std::cout << numCluster - dul << std::endl;

	return 0;
}

需要注意的是,数据中存在重复的节点标签,所以最终结果需要减去重复的个数。tab 变量用一个 bit 标记相应的标签是否出现,本来就是为了能够快速查表;但是为了查找每个标签在 Union-Find 中的位置,还是建立了一个 map 结构,目前还不知道怎样才能省去它。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值