这是以前做过的topcoder题目,继续贴上来,请朋友们指正。
============================================================================
Problem Statement
For computers it can be hard to determine in which language a given text is written. A simple way to try to determine the language is the following: for the given text and for some sample texts, for which we know the languages, we determine the letter frequencies and compare these.
The frequency of a letter is the total number of occurrences of that letter divided by the total number of letters in the text. To determine this, we ignore case and non-letter characters. Once the letter frequencies of the text and of a language are known, we can calculate the difference between the two. This difference we define by the sum of the squared differences of the frequencies:
$$\text{difference} = \sum_{c=\text{a}}^{\text{z}} \left( f_{\text{text}}(c) - f_{\text{language}}(c) \right)^2$$
The lesser this value, the closer text resembles that language. Compare text with each element of languages and return the (0-based) index of the language that has the smallest difference with text. In case of a tie, return the smallest index.
Definition
Class:
LanguageRecognition
Method:
whichLanguage
Parameters:
vector <string>, string
Returns:
int
Method signature:
int whichLanguage(vector <string> languages, string text)
(be sure your method is public)
Constraints
-languages contains between 1 and 50 elements, inclusive.
-Each element of languages has length between 1 and 50, inclusive.
-text has length between 1 and 50, inclusive.
-Each element of languages and text consists only of characters with ASCII value between 32 and 127, inclusive.
-Each element of languages and text contains at least one letter ('A'-'Z' and 'a'-'z').
Examples
0)
{"This is an English sentence.",
"Dieser ist ein Deutscher Satz.",
"C'est une phrase Francaise.",
"Dit is een Nederlandse zin."
}
"In welke taal is deze zin geschreven?"
Returns: 3
The differences are 0.0385, 0.0377, 0.0430 and 0.0276, so the sentence is written in language 3, Dutch. Note that Dutch is somewhat similar to German, somewhat less similar to English and not similar to French.
1)
{"aaaaa","bbbb","ccc","dd","e"}
"xxx"
Returns: 0
In case of a tie, return the language with the smallest index.
2)
{"AABB","AaBb","A? B!","ab!@#$%"}
"ab"
Returns: 0
Ignore case and the non-letter characters.
This problem statement is the exclusive and proprietary property of TopCoder, Inc. Any unauthorized use or reproduction of this information without the prior written consent of TopCoder, Inc. is strictly prohibited. (c)2003, TopCoder, Inc. All rights reserved.
============================================================================
以下是我的解题程序:
#include <stdio.h>

#include <string>
#include <vector>

using namespace std;
// Guesses which sample language a text most resembles by comparing letter
// frequencies (case-insensitive, non-letter characters ignored).
//
// The entry point is whichLanguage(); the remaining methods are the steps it
// is built from and operate on the counters stored in the object.
class LanguageRecognition
{
public:
    // Storage used by the test driver to hold a case's inputs. whichLanguage()
    // itself only reads its own parameters, not these two members.
    std::vector<std::string> m_vecLanguage; // some languages
    std::string m_strText;                  // the given text

    // Differences computed by the most recent whichLanguage() call, one entry
    // per language in input order; kept so DisplayDifference() can print them.
    std::vector<double> m_dblVecDifference;

    // Occurrence counts per letter (index 0 = 'a'/'A' ... 25 = 'z'/'Z').
    // These are raw counts; GetDifference() divides by the totals below.
    int m_arrayFreqLanguage[26]; // letter counts of the current language sample
    int m_arrayFreqText[26];     // letter counts of the given text
    int m_nTotalNumberLanguage;  // number of letters counted for the language
    int m_nTotalNumberText;      // number of letters counted for the text

    LanguageRecognition();
    ~LanguageRecognition();

    // Reset the language / text counters to zero.
    void InitializeFreqLanguage();
    void InitializeFreqText();

    // Add the letters of the argument to the language / text counters.
    void GetFreqLanguage(std::string language);
    void GetFreqText(std::string text);

    // Sum of squared frequency differences between the counted language
    // sample and the counted text.
    double GetDifference();

    // Prints the differences stored by the last whichLanguage() call
    // (used to check the difference calculation).
    void DisplayDifference();

    // Returns the 0-based index of the language with the smallest difference
    // to text; on a tie the smallest index wins.
    int whichLanguage(std::vector<std::string> languages, std::string text);
};
// Creates a recognizer with all letter counters and totals set to zero.
LanguageRecognition::LanguageRecognition()
{
    // The vector and string members are default-constructed empty, so only
    // the plain int counters need explicit initialization.
    InitializeFreqLanguage();
    InitializeFreqText();
}
// Nothing to release explicitly: the members are standard containers,
// strings and plain ints.
LanguageRecognition::~LanguageRecognition()
{
}
// initialize the frequency array of the language
void LanguageRecognition::InitializeFreqLanguage()
... {
m_nTotalNumberLanguage=0;
for(int i=0;i<26;i++)
m_arrayFreqLanguage[i]=0;
}
// initialize the frequency array of the given text
void LanguageRecognition::InitializeFreqText()
... {
m_nTotalNumberText=0;
for(int i=0;i<26;i++)
m_arrayFreqText[i]=0;
}
// get frequencies of all symbols in a language
void LanguageRecognition::GetFreqLanguage( string language)
... {
unsigned char ch;
int len=language.size();
for(int i=0;i<len;i++)
...{
ch=language.at(i);
if(ch>='A' && ch<='Z')
...{
m_nTotalNumberLanguage++;
m_arrayFreqLanguage[ch-'A']++;
}
else if(ch>='a' && ch<='z')
...{
m_nTotalNumberLanguage++;
m_arrayFreqLanguage[ch-'a']++;
}
}
}
// get frequencies of all symbols in the given text
void LanguageRecognition::GetFreqText( string text)
... {
unsigned char ch;
int len=text.size();
for(int i=0;i<len;i++)
...{
ch=text.at(i);
if(ch>='A' && ch<='Z')
...{
m_nTotalNumberText++;
m_arrayFreqText[ch-'A']++;
}
else if(ch>='a' && ch<='z')
...{
m_nTotalNumberText++;
m_arrayFreqText[ch-'a']++;
}
}
}
// get the difference of a language and the given text
double LanguageRecognition::GetDifference()
... {
double dblFreq1,dblFreq2,dblDifference=0,dblSum=0;
for(int i=0;i<26;i++)
...{
dblFreq1=1.0*m_arrayFreqLanguage[i]/m_nTotalNumberLanguage;
dblFreq2=1.0*m_arrayFreqText[i]/m_nTotalNumberText;
dblDifference=dblFreq1-dblFreq2;
dblSum+=dblDifference*dblDifference;
}
return dblSum;
}
// determine which language, ignore case and non-letter characters
int LanguageRecognition::whichLanguage(vector < string > languages, string text)
... {
double dblDifference=0,dblMin=0;
int nWhichLanguage;
//get frequencies of all letters in the given text
InitializeFreqText();
GetFreqText(text);
//initialize the minimum difference
InitializeFreqLanguage();
GetFreqLanguage(languages.at(0));
dblMin=GetDifference();
nWhichLanguage=0;
/**///
// to test the differency calculation
m_dblVecDifference.clear();
m_dblVecDifference.push_back(dblMin);
/**///
//get the minimum difference and its index
for(int i=1;i<languages.size();i++)
...{
InitializeFreqLanguage();
GetFreqLanguage(languages.at(i));
dblDifference=GetDifference();
/**///
// to test the differency calculation
m_dblVecDifference.push_back(dblDifference);
/**///
if(dblDifference<dblMin)
...{
dblMin=dblDifference;
nWhichLanguage=i;
}
}
return nWhichLanguage;
}
// display the result of difference calculation
void LanguageRecognition::DisplayDifference()
... {
int nCount=m_dblVecDifference.size();
for(int i=0;i<nCount;i++)
printf(" difference - language %d: %.4f ",i,m_dblVecDifference.at(i));
printf(" ");
}
void main()
... {
LanguageRecognition case0,case1,case2;
int nWhichLanguage0,nWhichLanguage1,nWhichLanguage2;
//case 0
case0.m_vecLanguage.push_back("This is an English sentence.");
case0.m_vecLanguage.push_back("Dieser ist ein Deutscher Satz.");
case0.m_vecLanguage.push_back("C'est une phrase Francaise.");
case0.m_vecLanguage.push_back("Dit is een Nederlandse zin.");
case0.m_strText="In welke taal is deze zin geschreven?";
//case 1
case1.m_vecLanguage.push_back("aaaaa");
case1.m_vecLanguage.push_back("bbbb");
case1.m_vecLanguage.push_back("ccc");
case1.m_vecLanguage.push_back("dd");
case1.m_vecLanguage.push_back("e");
case1.m_strText="xxx";
//case 2
case2.m_vecLanguage.push_back("AABB");
case2.m_vecLanguage.push_back("AaBb");
case2.m_vecLanguage.push_back("A? B!");
case2.m_vecLanguage.push_back("ab!@#$%");
case2.m_strText="ab";
//determine which language the text is likely to be
nWhichLanguage0=case0.whichLanguage(case0.m_vecLanguage,case0.m_strText);
printf("the language index of case 0 is: %d ",nWhichLanguage0);
case0.DisplayDifference();
nWhichLanguage1=case1.whichLanguage(case1.m_vecLanguage,case1.m_strText);
printf("the language index of case 1 is: %d ",nWhichLanguage1);
case1.DisplayDifference();
nWhichLanguage2=case2.whichLanguage(case2.m_vecLanguage,case2.m_strText);
printf("the language index of case 2 is: %d ",nWhichLanguage2);
case2.DisplayDifference();
}