目录
实验目的:
建立算法和系统,可以为基因组序列建立自索引表示。
复原Hon et al., 2007的算法
实验原理
测试结果及分析
代码:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <time.h>
typedef struct
{
char* data;
int SA;
int SB;
int order;
} A,*B;
typedef struct
{
B S;
int lc;
int rc;
int la;
int ra;
int lg;
int rg;
int lt;
int rt;
} C,*D;
void calculatel(D Data,int length)
{
Data->la=0;
Data->ra=0;
Data->lc=0;
Data->rc=0;
Data->lg=0;
Data->rg=0;
Data->lt=0;
Data->rt=0;
for(int i=0; i<length-1; i++)
{
if(Data->S[i].data[0]!='A'&&Data->S[i+1].data[0]=='A')
{
Data->la=i+1;
Data->ra=Data->la;
Data->lc=Data->la;
Data->rc=Data->la;
Data->lg=Data->la;
Data->rg=Data->la;
Data->lt=Data->la;
Data->rt=Data->la;
}
if(Data->S[i].data[0]=='A'&&Data->S[i+1].data[0]!='A')
{
Data->ra=i;
Data->lc=Data->ra;
Data->rc=Data->ra;
Data->lg=Data->ra;
Data->rg=Data->ra;
Data->lt=Data->ra;
Data->rt=Data->ra;
}
if(Data->S[i].data[0]!='C'&&Data->S[i+1].data[0]=='C')
{
Data->lc=i+1;
Data->rc=Data->lc;
Data->lg=Data->lc;
Data->rg=Data->lc;
Data->lt=Data->lc;
Data->rt=Data->lc;
}
if(Data->S[i].data[0]=='C'&&Data->S[i+1].data[0]!='C')
{
Data->rc=i;
Data->lg=Data->rc;
Data->rg=Data->rc;
Data->lt=Data->rc;
Data->rt=Data->rc;
}
if(Data->S[i].data[0]!='G'&&Data->S[i+1].data[0]=='G')
{
Data->lg=i+1;
Data->rg=Data->lg;
Data->lt=Data->lg;
Data->rt=Data->lg;
}
if(Data->S[i].data[0]=='G'&&Data->S[i+1].data[0]!='G')
{
Data->rg=i;
Data->lt=Data->rg;
Data->rt=Data->rg;
}
if(Data->S[i].data[0]!='T'&&Data->S[i+1].data[0]=='T')
{
Data->lt=i+1;
Data->rt=Data->lt;
}
if(Data->S[i].data[0]=='T'&&Data->S[i+1].data[0]!='T')
{
Data->rt=i;
}
if(Data->S[length-1].data[0]=='T')
{
Data->rt=length-1;
}
if(Data->S[length-1].data[0]=='G')
{
Data->rg=length-1;
Data->lt=Data->rg;
Data->rt=Data->rg;
}
if(Data->S[length-1].data[0]=='C')
{
Data->rc=length-1;
Data->lg=Data->rc;
Data->rg=Data->rc;
Data->lt=Data->rc;
Data->rt=Data->rc;
}
if(Data->S[length-1].data[0]=='A')
{
Data->ra=length-1;
Data->lc=Data->ra;
Data->rc=Data->ra;
Data->lg=Data->ra;
Data->rg=Data->ra;
Data->lt=Data->ra;
Data->rt=Data->ra;
}
}
}
int location(int start,int end,D Data,int key)
{
if(start==end-1||start==end)
{
if(Data->S[start].SB>=key)
return start;
else if(Data->S[end].SB<key)
return end+1;
else
return end;
}
while(start<end)
{
if(start==end-1||start==end)
{
if(Data->S[start].SB>=key)
return start;
else if(Data->S[end].SB<key)
return end+1;
else
return end;
}
int mid=(start+end)/2;
if(Data->S[mid].SB<key)
{
start=mid;
}
else if(Data->S[mid].SB>key)
{
end=mid;
}
else if(Data->S[mid].SB==key)
{
return mid;
}
}
}
int suborders(D Data,char *str,int order)
{
int or=0;
if(str[0]=='A')
{
if(Data->la==0)
{
or=Data->la+1;
}
else
or=location(Data->la,Data->ra,Data,order);
}
else if(str[0]=='C')
{
if(Data->lc==Data->rc&&Data->ra==Data->lc)
{
or=Data->lc+1;
}
else
or=location(Data->lc,Data->rc,Data,order);
}
else if(str[0]=='G')
{
if(Data->lg==Data->rg&&Data->rc==Data->lg)
{
or=Data->lg+1;
}
else
or=location(Data->lg,Data->rg,Data,order);
}
else if(str[0]=='T')
{
if(Data->lt==Data->rt&&Data->rg==Data->lt)
{
or=Data->lt+1;
}
else
or=location(Data->lt,Data->rt,Data,order);
}
return or;
}
int order(A arr[],D Data,int L,int before)
{
arr[L-1].order=suborders(Data,arr[L-1].data,before);
for(int i=L-2; i>=0; i--)
{
arr[i].order=suborders(Data,arr[i].data,arr[i+1].order);
}
int k=0;
for(int i=0;i<L;i++){
if(arr[i].order<arr[0].order)
k++;
else if(arr[i].order==arr[0].order&&strcmp(arr[0].data,arr[i].data)>0)
k++;
}
return arr[0].order+k;
}
int partition(A arr[], int low, int high)
{
A key;
key=arr[low];
while(low<high)
{
while(low <high && strcmp(arr[high].data, key.data)>=0 )
high--;
if(low<high)
arr[low++]= arr[high];
while( low<high && strcmp(arr[low].data,key.data)<=0 )
low++;
if(low<high)
arr[high--]= arr[low];
}
arr[low] = key;
return low;
}
void quick_sort(A arr[], int start, int end)
{
int pos;
if (start<end)
{
pos = partition(arr, start, end);
quick_sort(arr,start,pos-1);
quick_sort(arr,pos+1,end);
}
return;
}
void calculateSB(A arr[],int l)
{
int *sa=(int *)malloc(sizeof(int)*(l+1));
for(int i=0; i<l; i++)
{
sa[arr[i].SA]=i;
if(arr[i].SA==0)
{
sa[l]=i;
}
}
for(int i=0; i<l; i++)
{
arr[i].SB=sa[arr[i].SA+1];
}
free(sa);
}
void getdata(A arr[],char *fname,char *str)
{
printf("开始读入数据\n");
char s[80];
FILE * r=fopen(fname,"r");
assert(r!=NULL);
while(fgets(s,100,r)!=NULL)
{
if(s[0]!='>')
{
s[strlen(s)-1]='\0';
strcat(str,s);
}
}
strcat(str,"$");
fclose(r);
printf("读入数据完毕\n");
}
void P(A arr[],int l)
{
for(int i=0; i<l; i++)
printf("%d\t%d\t%d\t%s\t%d\n",i,arr[i].SA,arr[i].SB,arr[i].data,arr[i].order);
printf("\n");
}
int locations(int start,int end,A arr[],int key)
{
while(start<end)
{
if(start==end-1)
{
if(arr[end].order<=key)
return end+1;
else if(arr[start].order>key)
return start;
else if(arr[start].order<=key&&arr[end].order>key)
return start+1;
else
return end+1;
}
int mid=(start+end)/2;
if(arr[mid].order<=key)
{
start=mid;
}
else if(arr[mid].order>key)
{
end=mid;
}
}
}
int F(int j,A arr[],int l){
int k=j;
k+=locations(0,l-1,arr,j);
return k;
}
B com(A arr1[],A arr2[],int l1,int l2,int l){
B S=(B)malloc(sizeof(A)*(l1+l2+1));
int f1=0;
int f2=1;
quick_sort(arr2,0,l2-1);
int *gg=(int *)malloc(sizeof(int)*l2);
printf("gg\n");
for(int i=0;i<l2;i++){
gg[arr2[i].SA]=i;
}
int *ff=(int *)malloc(sizeof(int)*l1);
printf("ff\n");
for(int i=0;i<l1;i++){
ff[i]=F(i,arr2,l2);
}
printf("合并\n");
S[0].SB=arr2[gg[0]].order+gg[0];
S[0].SA=arr1[0].SA+l2;
S[0].data=arr1[0].data;
for(int i=1;i<l1+l2;i++){
if(i==ff[f2]){
S[i].SA=arr1[f2].SA+l2;
S[i].SB=ff[arr1[f2].SB];
S[i].data=arr1[f2].data;
f2++;
}
else{
S[i].SA=arr2[f1].SA;
if(arr2[f1].SA==l2-1)
S[i].SB=F(arr1[0].SB,arr2,l2);
else
S[i].SB=arr2[gg[arr2[f1].SA+1]].order+gg[arr2[f1].SA+1];
S[i].data=arr2[f1].data;
f1++;
}
}
printf("\n");
free(gg);
free(ff);
free(arr1);
free(arr2);
return S;
}
void print(char * fname,A arr[],int l)
{
printf("开始写入数据\n");
FILE * R=fopen(fname,"w");
int i=0;
for(i=0; i<l; i++)
{
char ch[100];
itoa(i,ch,10);
fputs(ch,R);
fputc('\t',R);
itoa(arr[i].SA,ch,10);
fputs(ch,R);
fputc('\t',R);
itoa(arr[i].SB,ch,10);
fputs(ch,R);
fputc('\t',R);
ch[0]=arr[i].data[0];
ch[1]='\0';
fputs(ch,R);
fputc('\n',R);
}
fclose(R);
printf("写入数据完毕\n");
}
int main(int argc, char *argv[])
{
char *str=(char *)malloc(sizeof(char)*4938921);
B S=(B)malloc(sizeof(A)*4938921);
C Data;
Data.S=S;
getdata(S,"NC_008253.fna",str);
printf("开始初始化\n");
int l=strlen(str);
float k=1;
int lnum=k*log(l)+1;
printf("一共有%d块数据\n",lnum);
int a[lnum];
a[0]=0;
for(int i=0; i<lnum-1; i++)
{
a[i+1]=a[i]+l/(log(l)*k);
}
for(int i=0; i<l-a[lnum-1]; i++)
{
S[i].data=str+a[lnum-1]+i;
S[i].SA=i;
}
quick_sort(S,0,l-a[lnum-1]-1);
calculateSB(S,l-a[lnum-1]);
calculatel(&Data,l-a[lnum-1]);
int before=0;
for(int i=0;i<l-a[lnum-1];i++){
if(S[i].SA==0){
before=i;
}
}
for(int hy=0; hy<lnum-1; hy++)
{
B S1=(B)malloc(sizeof(A)*l/log(l));
printf("开始循环第%d块数据\n",hy+2);
printf("开始读入第%d块数据\n",hy+2);
for(int i=0; i<a[lnum-1-hy]-a[lnum-1-1-hy]; i++)
{
S1[i].data=str+a[lnum-1-1-hy]+i;
S1[i].SA=i;
S1[i].SB=0;
}
printf("开始第%d块数据orderl\n",hy+2);
before=order(S1,&Data,a[lnum-1-hy]-a[lnum-1-1-hy],before);
printf("开始第%d块数据合并\n",hy+2);
S=com(S,S1,l-a[lnum-1-hy],a[lnum-1-hy]-a[lnum-1-1-hy],l);
Data.S=S;
for(int i=0;i<l-a[lnum-1-1-hy]-1;i++){
if(strcmp(S[i].data,S[i+1].data)>0){
printf("error at %d\n",i);
}
}
printf("开始第%d块数据计算l\n",hy+2);
calculatel(&Data,l-a[lnum-1-1-hy]);
printf("%d::%d %d::%d %d::%d %d::%d\n",Data.la,Data.ra,Data.lg,Data.rg,Data.lc,Data.rc,Data.lt,Data.rt);
}
print("hy.csa",S,l);
printf("OK\n");
for(int i=0;i<l-1;i++){
if(strcmp(S[i].data,S[i+1].data)>0){
printf("error at %d\n",i);
}
}
char *Str=(char *)malloc(sizeof(char)*4938921);
Str[0]=S[S[0].SB].data[0];
int kkk=S[0].SB;
for(int i=0;i<l-1;i++){
Str[i+1]=S[S[kkk].SB].data[0];
kkk=S[kkk].SB;
}
FILE * RR=fopen("yanzheng.txt","w");
fputs(Str,RR);
fclose(RR);
if(strcmp(Str,str)==0){
printf("验证正确\n");
}
else{
printf("验证不正确\n");
}
free(S);
free(str);
free(Str);
return 0;
}