This is a C++ demo for pan-genome analysis, by bbsunchen:
/*
start:2012/06/11 by sunchen
amend:
1.2012/06/12 by sunchen
construct a array of 2^n
conculate
2.2112/06/12 by sunchen
introduce multithread model
3.2012/06/18 by sunchen
change multithread model
complete pangenome calculation
4.2012/06/19 by sunchen
complete newgene calculation
mission completed
*/
#include <iostream>
#include <fstream>
#include <cstring>
#include <cstdlib>
#include <vector>
#include <time.h>
#include <pthread.h>
#include <sstream>
using namespace std;
struct NumRange
{
int index;
long long startNum;
long long endNum;
};
struct panGenomeNum
{
int SampleNum[101];//the index is genomeNumber, start from 1
long long panNum[101];//the index is genomeNumber, start from 1
};
//##################public data###################################
long long refdig[]=
{
//n=0
1,2,4,8,
//n=4
16,32,64,128,
//n=8
256,512,1024,2048,
//n=12
4096,8192,16384,32768,
//n=16
65536,131072,262144,524288,
//n=20
1048576,2097154,4194304,8388608,
//n=24
16777216,33554432,67108864,134217728,
//n=28
268435456,536870912,1073741824,2147483648,
//n=32
4294967296,8589934592,17179869184,34359738368,
//n=36
68719476736,137438953472,274877906944,549755813888,
//n=40
1099511627776,2199023255552,4398046511104,8796093022208,
//n=44
17592186044416,35184372088832,70368744177664,140737488355328,
//n=48
281474976710656,562949953421312,1125899906842624,2251799813685248,
//n=52
4503599627370496,9007199254740992,18014398509481984,36028797018363968,
//n=56
72057594036727936,144115188073455872,228230376146911744,576460752293823488,
//n=60
1152921504587646976,2305843009175293952,4611686018350587904
};
int genome_genesize[101] = {0};//record gene num of specific genome, start from 0
vector< vector<bool> > m;//matrix
int m_line_num = 0;
int m_column_num = 0;
//char clusterPath[] = "/home/sun/zhao/1.Orthologs_Cluster.txt";
char* clusterPath;
char tempPath[1001] = "";
panGenomeNum pN[101];//the index is threadId, start from 0
ofstream TEMP[101];
//#################################public data end###################################
//convert long_long number to "01"string to find out which is 1
vector<int> whichGenome(long long genomeCombination)
{
vector<int> genomeIdVector;
for(int k = 62; k >= 0; k--)
{
if(genomeCombination >= refdig[k])
{
genomeCombination -= refdig[k];
genomeIdVector.push_back(k);
}
if(genomeCombination == 0)
{
break;
}
}
return genomeIdVector;
}
long long genomeNum2LongLong(int genomeSize)
{
long long genomeIndicator = 0;
for(int i = 0; i < genomeSize;i++)
{
genomeIndicator += refdig[i];
}
return genomeIndicator;
}
char* getTempFilePath(int index)
{
char* pathSegment[50];
char* temp_num = strtok(clusterPath,"/");//split string
int e_num = 0;
while(temp_num != NULL)
{
pathSegment[e_num] = temp_num;
temp_num = strtok(NULL,"/");
e_num++;
}
char tempPath[1001] = "";
for(int i = 0; i < e_num -1 ; i++)
{
strcat(tempPath, "/");
strcat(tempPath, pathSegment[i]);
}
stringstream stream;
string s;
stream << tempPath << "/" << index << "_temp.dat";
stream >> s;
cout << tempPath << endl;
//char *path =const_cast<char*>(s.c_str()); //get the path
char* path=const_cast<char*>(s.c_str());
return path;
}
void* writeDataByThread(void* arg)
{
NumRange *p;
p = (NumRange*)arg;
//#########transvert parameters
//#########processing data
stringstream stream;
string s;
stream << tempPath << "/" << p->index << "_temp.dat";
stream >> s;
char* filepath=const_cast<char*>(s.c_str());
//cout << filepath << endl;
//######################getpath
TEMP[p->index].open(filepath);
if(!TEMP[p->index].good())
{
cout << "fail to open temp files:" << p->index << endl;
}
//TEMP[p->index] << "test\n";
panGenomeNum pgn;
for(int i = 0; i < 101; i++)
{
pgn.SampleNum[i] = 0;
pgn.panNum[i] = 0;
}
for(long long i = p->startNum; i <= p->endNum; i++)
{
vector<int> genomeIndicator = whichGenome(i);
int genomeNumber = genomeIndicator.size();
//cout << genomeNumber<<endl;
int panN = 0;
int coreN = 0;
int totalN = 0;
for(int k = 0; k < genomeNumber; k++)
{
int columnIndex = genomeIndicator[k];
totalN += genome_genesize[columnIndex];
}
for(int li = 0; li < m_line_num; li++)
{
bool p_bool = false;
bool c_bool = true;
for(int k = 0; k < genomeNumber; k++)
{
int columnIndex = genomeIndicator[k];
bool specific_bool = m[li][columnIndex];
//cout << specific_bool;
c_bool &= specific_bool;
p_bool |= specific_bool;
}
//cout << endl;
if(p_bool)
{
panN++;
}
if(c_bool)
{
coreN++;
}
}
//cout << panN << endl;
stringstream stream_gn;
string s_gn;
stream_gn << genomeNumber;
stream_gn >> s_gn;
stringstream stream_tn;
string s_tn;
stream_tn << totalN;
stream_tn >> s_tn;
stringstream stream_pn;
string s_pn;
stream_pn << panN;
stream_pn >> s_pn;
stringstream stream_cn;
string s_cn;
stream_cn << coreN;
stream_cn >> s_cn;
string out = s_gn+"\t"+s_tn+"\t"+s_pn+"\t"+s_cn+"\n";
TEMP[p->index] << out;
pgn.SampleNum[genomeNumber] ++;//the index is genomeNumber, start from 1
pgn.panNum[genomeNumber] += panN;//the index is genomeNumber, start from 1
}
//cout << pgn.panNum[1] << endl;
pN[p->index] = pgn;
//cout << p->index << endl;
//cout << pN[p->index].panNum[2] << endl;
TEMP[p->index].close();
//#########exit
pthread_exit((void*)0);
}
int readData()
{
ifstream CLUSTER;
CLUSTER.open(clusterPath);
if(!CLUSTER.good())
{
cout << "ERROR: illegal input file path: " << clusterPath <<endl;
cout <<
"Input format:\n" <<
"program_name \n";
exit(0);
}
char* genome_name[101];//id start from 0
int e_num = 0;//e_num equals to the num of char*
//which means that e_num-1 equals to the number of genome
//i.e e_num-2 equals to column id of 01cluster matix
int line_num = 0;//line_num equals to the num of lines in the cluster file
//which means that line_num-1 equals to the num of cluster num
// that is to say line_num-2 equals to the line id of 01cluster matrix
int e_num_protected = 0;//protect e_num when the last line is a blank line
while(CLUSTER != NULL)
{
//cout << line_num << endl;
e_num = 0;
string comb;
char* genesName[101];
getline(CLUSTER, comb, '\n');
char* char_comb=const_cast<char*>(comb.c_str());//const char* to char*
char* temp_num = strtok(char_comb,"\t");//split string
while(temp_num != NULL)
{
genesName[e_num] = temp_num;
temp_num = strtok(NULL,"\t");
e_num++;
}
if(e_num == 0)
{
break;
}
vector<bool> m_line;
if(line_num == 0)
{
for(int i = 1; i <=e_num; i++)
{
genome_name[i-1] = genesName[e_num];
// so the size of genome_num = e_num;
}
e_num_protected = e_num;
}else
{
for(int i = 1;i < e_num; i++)
{
if(strcmp(genesName[i], "-") == 0)//if equal, return 0
{
//cout << num[i] << endl;
m_line.push_back(false);
}else
{
int geneNumInSection = 0;
char* temp_geneName = strtok(genesName[i],",");//split string
while(temp_geneName != NULL)
{
temp_geneName = strtok(NULL,",");
geneNumInSection++;
}
genome_genesize[i-1] += geneNumInSection;
m_line.push_back(true);
}
}
}
if(line_num == 0)
{
line_num++;
}else
{
m.push_back(m_line);
line_num++;
}
}
CLUSTER.close();
//true&false matrix, (line_num-1)*(e_num-1)
m_line_num = line_num - 1;
m_column_num = e_num_protected - 1;
char* pathSegment[50];
char* temp_num_2 = strtok(clusterPath,"/");//split string
int e_num_2 = 0;
while(temp_num_2 != NULL)
{
pathSegment[e_num_2] = temp_num_2;
temp_num_2 = strtok(NULL,"/");
e_num_2++;
}
for(int i = 0; i < e_num_2 -1 ; i++)
{
strcat(tempPath, "/");
strcat(tempPath, pathSegment[i]);
}
return m_column_num;
}
int main(int argc,char *argv[])
{
if(argc != 3)
{
cout << "ERROR: illegal argument number: " << argc << endl;
cout <<
"Input format:\n" <<
"program_name inputfile threadNum\n" <<
"i.e.\n" <<
"./main sun.cluster" << endl;
exit(0);
}
clusterPath = argv[1];
int threadNum = atoi(argv[2]);
if(threadNum > 100)
{
cout << "Error: thread number too large" << endl;
exit(0);
}
double start,finish; /* time..*/
start=(double)clock(); /* 我的time.h内没有CLOCKS_PER_SEC */
int genomeSize = readData();
//cout << genomeSize << endl;
long long totalLong = genomeNum2LongLong(genomeSize);
long long span = totalLong / threadNum;
long long last_end = 0;
pthread_t threadId[101];//thread id strat from 0
NumRange p[101];
for(int i = 0; i < threadNum; i++)
{
long long start = last_end + 1;
long long end = start + span;
if(end > totalLong)
{
end = totalLong;
}
last_end = end;
p[i].index = i;
p[i].startNum = start;
p[i].endNum = end;
}
for(int i = 0; i < threadNum; i++)
{
//cout << "strat:" << p[i].startNum << "~~" << p[i].endNum << endl;
//pthread_t id;
//threadId.push_back(id);
int ret = pthread_create(&threadId[i],NULL,writeDataByThread, (void*)&p[i]);
//pthread_join(threadId[i], NULL);
sleep(0.01);//important so that the CPU will have time to run pthread_create completely
//in the insprion1420, it costs at least 0.001 second to transport parameters
//pthread_join(id, NULL);
}
for(int i = 0; i < threadNum; i++)
{
pthread_join(threadId[i], NULL);
}
stringstream stream_path;
string s_path;
stream_path << tempPath << "/panGenome.txt";
stream_path >> s_path;
char* panPath=const_cast<char*>(s_path.c_str());
//cout << filepath << endl;
//######################getpath
ofstream PANGENOME;
PANGENOME.open(panPath);
if(!PANGENOME.good())
{
cout << "open panGenome file failed!" << endl;
exit (0);
}
PANGENOME << "ClusterConservation" << "\t" << "TotalGeneNumber" << "\t" << "PanGenome" << "\t" << "CoreGenome" << endl;
for(int i = 0; i < threadNum; i++)
{
stringstream stream;
string s;
stream << tempPath << "/" << i << "_temp.dat";
stream >> s;
char* filepath=const_cast<char*>(s.c_str());
//cout << filepath << endl;
//######################getpath
ifstream TEMPFILE;
TEMPFILE.open(filepath);
if(!TEMPFILE.good())
{
cout << "fail to open temp files:" << p->index << endl;
}
while(TEMPFILE != NULL)
{
string matr;
getline(TEMPFILE,matr);
//cout << matr.length() << endl;
if(matr != "\n" && matr != "")
{
PANGENOME << matr << endl;
}
}
}
PANGENOME.close();
long long panTotal[101] = {0};//index stands for genome number, start from 1;
int SampleTotal[101] = {0};//index stands for genome number, start from 1;
for(int i = 0; i < threadNum; i++)
{
for(int k = 1; k <= genomeSize; k++)
{
//panGenomeNum temp_pgn = pN[threadNum];
panTotal[k] += pN[i].panNum[k];
//cout <<"thread:" << i<< " genome:"<< k << " "<<pN[i].panNum[k] << endl;
SampleTotal[k] += pN[i].SampleNum[k];
}
}
for(int k = 1; k <= genomeSize; k++)
{
//cout << SampleTotal[k] << endl;
panTotal[k] /= SampleTotal[k];
}
//cout << "right" << endl;
stringstream stream_new;
string s_new;
stream_new << tempPath << "/newGene.txt";
stream_new >> s_new;
char* newPath=const_cast<char*>(s_new.c_str());
//cout << filepath << endl;
//######################getpath
ofstream NEWGENE;
NEWGENE.open(newPath);
if(!NEWGENE.good())
{
cout << "open panGenome file failed!" << endl;
exit (0);
}
NEWGENE << "GenomeNumber" << "\t" << "NewGene" << endl;
for(int k = 2; k <=genomeSize; k++)
{
NEWGENE << k << "\t" << panTotal[k] - panTotal[k-1] << endl;
}
NEWGENE.close();
for(int i = 0; i < threadNum; i++)
{
stringstream stream_delete;
string s_delete;
stream_delete << tempPath << "/" << i << "_temp.dat";
stream_delete >> s_delete;
char commend[1001] = "rm ";
char* deletepath=const_cast<char*>(s_delete.c_str());
strcat(commend, deletepath);
system(commend);
}
finish=(double)clock();
//cout << (finish - start)/CLOCKS_PER_SEC << endl;
return 0;
}