c语言程序朴素贝叶斯分类器,基于朴素贝叶斯分类器的文本分类算法(C语言)

#include <stdint.h> // intptr_t
#include <stdio.h>
#include <stdlib.h> // _MAX_PATH, system(), exit()
#include <string.h>

#include <direct.h> // _getcwd(), _chdir()
#include <io.h>     // _finddata_t, _findfirst(), _findnext(), _findclose()

char vocabulary[1000][20];

//@输入参数:要分类的文本

//@输出参数:该文本中总单词数

int SplitToWord(char text[])

{

int i=0;

char seps[]=", .\n";

char *substring;

substring=strtok(text,seps);

while(substring!=NULL)

{

strcpy(vocabulary[i],substring);//将单词存储到vocabulary数组中

substring=strtok(NULL,seps);

i++;

}

return i; //返回一共多少个单词

}

//@输入参数:无

//@输出参数:该目录下.txt文件数

int CountDirectory()

{

int count=0; //txt文件计数器

long hFile;

_finddata_t

fileinfo;

if

((hFile=_findfirst("*.txt",&fileinfo))!=-1L)

{

do

{

count++;

} while

(_findnext(hFile,&fileinfo) == 0);

}

return count;

}

//@输入参数:分类文本中单词数

//@输出参数:该类别下∏P(ai|vj)

float CalculateWordProbability(int wordCount)

{

int countSame; //分类文本中的某单词在所有训练样本中出现次数

int countAll=0; //训练样本中总单词数

char token;

FILE *fp;

float wordProbability=1; //为后面联乘做准备

int i,j;

long hFile;

_finddata_t

fileinfo;

for(j=0;j

{

countSame=0;

countAll=0;

if((hFile=_findfirst("*.txt",&fileinfo))!=-1L)

//对于该类别下每一个.txt文本

{

do

{

if((fp=fopen(fileinfo.name,"r"))==NULL)

//是否能打开该文本

{

printf("Sorry!Cannot open the file!\n");

exit(0);

}

while((token = fgetc(fp)) !=

EOF)

{

char

keyword[1024];

i =

0;

keyword[0] = token; // 将每个词第一个字符赋给数组第一个元素

while

((keyword[++i] = fgetc(fp)) != ' '

&& keyword[i] != '\t'

&& keyword[i] != EOF

&& keyword[i] != '\n'); //

开始读字符,直到遇到空白符,说明找到一个词

keyword[i] = '\0';// 加结束符

countAll++;

if

(strcmp(keyword,vocabulary[j]) == 0) //比较两个单词是否相同

countSame++;

}

fclose(fp);

}while

(_findnext(hFile,&fileinfo) ==

0);

}

wordProbability*=(float)(countSame+1)/(float)(wordCount+countAll)*300;

//计算∏P(wj|vi),为了扩大效果而*380

}

return wordProbability;

}

// Load class names, one per line, from the text file at `path`.
//
// At most `maxNames` lines are read; each name is truncated to 19 characters
// so that it fits a 20-byte row. A final line without a trailing newline is
// still stored and terminated.
//
// @return  number of names stored, or -1 if the file cannot be opened.
static int LoadClassList(const char *path, char names[][20], int maxNames)
{
    FILE *fp = fopen(path, "r");
    int row = 0; // index of the name being filled
    int col = 0; // next character position within that name
    int ch;      // int, not char, so EOF is distinguishable from a 0xFF byte

    if (fp == NULL)
        return -1;

    while (row < maxNames && (ch = fgetc(fp)) != EOF) {
        if (ch == '\n') {
            names[row][col] = '\0';
            row++;
            col = 0;
        } else if (col < 19) {
            names[row][col++] = (char)ch;
        }
    }

    if (row < maxNames && col > 0) {
        names[row][col] = '\0';
        row++;
    }

    fclose(fp);
    return row;
}

// Score the text held in `vocabulary` against each of the ten categories
// and print the per-category score and the best-scoring category.
//
// Class names are read from "ClassList.txt" in the current directory. For
// category k (1..10) the working directory is changed to
// "F:\SogouC\Sample\k"; the number of .txt files there gives the prior and
// CalculateWordProbability() gives the likelihood product. A category whose
// directory cannot be entered keeps a file count and likelihood of 0.
// When no training files are found at all, every prior is treated as 0.
//
// The working directory is left at the last directory successfully entered.
//
// @param wordCount  number of words in the text being classified.
void CalculateProbability(int wordCount)
{
    enum { CLASS_COUNT = 10 };

    char classList[CLASS_COUNT][20] = {{0}};  // category names
    int txtCount[CLASS_COUNT] = {0};          // training files per category
    float wordProbability[CLASS_COUNT] = {0}; // likelihood product per category
    int countAll = 0;                         // training files over all categories
    float max = 0;
    int classNo = 0;
    int i;

    if (LoadClassList("ClassList.txt", classList, CLASS_COUNT) < 0) {
        printf("Failed to open the file: ClassList.txt.\n");
        return;
    }

    for (i = 0; i < CLASS_COUNT; i++) {
        char path[64];

        snprintf(path, sizeof path, "F:\\SogouC\\Sample\\%d", i + 1);
        if (_chdir(path)) {
            printf("系统找不到指定路径!\n");
            continue;
        }

        txtCount[i] = CountDirectory();
        countAll += txtCount[i];
        wordProbability[i] = CalculateWordProbability(wordCount);
    }

    for (i = 0; i < CLASS_COUNT; i++) {
        // Prior: share of all training files that belong to this category.
        float prior = countAll > 0 ? (float)txtCount[i] / (float)countAll : 0.0f;
        float posterior = prior * wordProbability[i];

        if (posterior > max) {
            max = posterior;
            classNo = i;
        }

        printf("该文本为类别%s的概率为:%.5e\n", classList[i], posterior);
    }

    printf("\n经分析,该文本最有可能为%s类文本!\n", classList[classNo]);
}

// Classify `text`: split it into words (stored in `vocabulary`, modifying
// `text` in place), then compute and print the per-category results.
//
// @param text  writable, NUL-terminated text to classify.
void NaiveBayesClassifier(char text[])
{
    const int wordTotal = SplitToWord(text); // number of words found in the text

    CalculateProbability(wordTotal);
}

// Entry point: classify a fixed sample news text and print the result.
// Returns 0 so the process reports success to its caller.
int main(void)
{
    // Writable array (not a pointer to a literal) because the tokenizer
    // modifies the text in place. Adjacent literals are concatenated by the
    // compiler, which keeps the long sample readable.
    char text[] =
        "Microsoft offered 44.6 billion dollars to buy "
        "Yahoo.February 1st network reported the Associated Press news, "
        "Microsoft offered 44.6 billion dollars in cash and stock to buy "
        "Yahoo search site.Microsoft offered to pay 31 dollars per share for "
        "Yahoo.Microsoft's acquisition offer on Jan. 31 premium of 62% than "
        "Yahoo's closing price of 19.18 dollars.Microsoft said that Yahoo "
        "shareholders can choose cash or stock transactions. Microsoft and "
        "Yahoo have sought cooperation in late 2006 and early 2007.The last "
        "two years, Yahoo has been in a dilemma: the market share "
        "decline,poor operating performance,stock prices tumbled "
        "sharply.Trying to make a difference for Microsoft in the Internet "
        "market, the acquisition of Yahoo is a shortcut, because the two "
        "sides have very strong complementarity.";

    NaiveBayesClassifier(text);

    return 0;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值