C++课设--简单字符串编码与余弦相似度计算系统

前言:
生成特征空间的核心:采用“字母—下标—下标”的方法生成编码。按照一般做法,需要遍历每个字符串并对其中的每个字符进行处理;由于每处理一个字符都要线性检索一次aphb数组,处理单个字符的复杂度是O(N)(N为字典中不同字符的个数)。这里采用的方法是先使用map建立字符与位置间的映射,再引入中间数组vector<int> Temp存储各位置上字符的频数,这样生成字符串编码的操作就可以写成一句话:

 for (int i = 0;i < len;i++) Temp[FeatureSpace[str[i]]]++;	

T=FeatureSpace[str[i]] 是当前字符映射到的位置,那么Temp[T]就是该字符对应位置上的频数。通过下标访问Temp本身的复杂度为O(1),但是由于使用了map,而map每次查找的复杂度为O(lgN),因此处理单个字符的实际复杂度为O(lgN)。

代码:

#include<algorithm>
#include<cmath>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<iostream>
#include<map>
#include<set>
#include<sstream>
#include<string>
#include<vector>
using namespace std;
// One dictionary entry: a character paired with an integer count.
class Alphabet {
public:
	// Default entry: the character 'a' with a count of zero.
	Alphabet() : ch('a'), value(0) {}
	// Creates an entry for `ch` whose count starts at `val`.
	Alphabet(char ch, int val);

	// Returns the stored character.
	char GetCharact() const;
	// Returns the stored count.
	int GetValue() const;
	// Increases the stored count by one.
	void AddValue();
	// True when this entry's count is strictly greater than T's.
	bool operator >(Alphabet &T);

private:
	char ch;
	int value;
};
class StringEncoding {
private:
	vector<Alphabet>aphb;
	vector<string> String;
	map<char, int> FeatureSpace;
public:
	StringEncoding();
	void ReadFromFile(const char *fileName);
	void WriteToFile(const char *fileName);
	void SearchFromFile(const string str,int key);
	double CountCosine(vector<int> Temp, vector<int> tt);
	void AlphaSort();
	void RebuildCoding();	//按权重新生成26英文字母编码
};
// Driver class: owns the encoder and runs the interactive workflow.
class Manager {
public:
	// Stores `SearchString`; defaults to "none".
	Manager(std::string SearchString = "none");
	// Runs the read / encode / write / query sequence.
	void Call();

private:
	StringEncoding ss;
	std::string SearchString;
};
int main()
{
	Manager mm("none");
	mm.Call();
	system("pause");
	return 0;
}
// True when this entry's count is strictly greater than T's count.
bool Alphabet::operator>(Alphabet &T)
{
	return T.GetValue() < value;
}
char Alphabet::GetCharact()const
{
	return ch;
}
int Alphabet::GetValue() const
{
	return value;
}
// Builds an entry for character `ch` with an initial count of `val`.
Alphabet::Alphabet(char ch, int val) : ch(ch), value(val)
{
}
// Records one more occurrence of this entry's character.
void Alphabet::AddValue()
{
	value += 1;
}
// Nothing to set up: every member is a container that default-constructs empty.
StringEncoding::StringEncoding()
{
}
void StringEncoding::AlphaSort()
{
	int Size = aphb.size();
	for (int i = 0;i < Size;i++) {
		for (int j = 0;j < Size - i - 1;j++)
			if (aphb[j + 1] > aphb[j]) {
				Alphabet T;
				T = aphb[j];aphb[j] = aphb[j + 1];aphb[j + 1] = T;
			}
	}
}
void StringEncoding::ReadFromFile(const char *fileName)
{
	fstream file_in;
	try {
		file_in.open(fileName, ios::in | ios::out);
		if (!file_in)
			throw "File Open Error!";
	}
	catch (const char *Warning)
	{
		cout << Warning << endl;
		system("pause");
		exit(0);
	}
	//成功打开文件
	string line;
	int flag = 0;
	while (getline(file_in, line))
	{
		string buf;
		stringstream ss(line);
		while (ss >> buf)	//逐个字符串读取
		{
			int len = buf.length();
			for (int i = 0;i < len;i++) {
				if (!flag){
					aphb.push_back(Alphabet(buf[i], 1));
					flag = 1;
				}
				else {
					int mark = 0;
					for (vector<Alphabet>::iterator it = aphb.begin();it != aphb.end();it++)
						if (it->GetCharact() == buf[i]) {
							mark = 1;
							it->AddValue();
						}
					if (!mark) aphb.push_back(Alphabet(buf[i], 1));
				}
				flag++;
			}
			String.push_back(buf);
		}
	}
	file_in.close();
}
// Writes the encoding of every loaded word to `fileName` and the character
// dictionary to "dictionary.txt".
//
// Word file format, one line per word:   <word>: c0 c1 ... cN-1
// where ck is how many times the character at dictionary position k occurs
// in the word. Dictionary file format, one line per entry:  <char>: <count>
//
// Both files are rewritten from scratch. Appending would leave encodings from
// earlier runs at the top of the file, and the lookup in SearchFromFile
// returns the first matching line, i.e. the stale one.
//
// On failure to open either file this prints "Write To File Error!", runs the
// shell's "pause" command and terminates the process with a failure status.
void StringEncoding::WriteToFile(const char *fileName)
{
	ofstream file_out2("dictionary.txt", ios::out | ios::trunc);
	ofstream file_out1(fileName, ios::out | ios::trunc);
	if (!file_out1 || !file_out2) {
		cout << "Write To File Error!" << endl;
		system("pause");
		exit(EXIT_FAILURE);	// a failed run must not report success
	}

	const size_t Size = aphb.size();
	for (vector<string>::const_iterator it = String.begin(); it != String.end(); ++it)
	{
		const string &str = *it;
		vector<int> Temp(Size, 0);
		for (size_t i = 0; i < str.length(); i++) {
			// find() instead of operator[]: a lookup must not insert into the map.
			map<char, int>::const_iterator pos = FeatureSpace.find(str[i]);
			if (pos != FeatureSpace.end() && pos->second >= 0
				&& static_cast<size_t>(pos->second) < Size)
				Temp[pos->second]++;	// character -> index -> count
		}

		file_out1 << str << ": ";
		for (size_t i = 0; i < Size; i++)
			file_out1 << Temp[i] << ' ';
		file_out1 << endl;
	}
	for (vector<Alphabet>::const_iterator it = aphb.begin(); it != aphb.end(); ++it) {
		file_out2 << it->GetCharact() << ": " << it->GetValue() << endl;
	}
	// Both streams flush and close when they go out of scope.
}
// Builds the count vector of `text`: element k is the number of characters of
// `text` that `index` maps to position k. Characters missing from `index`
// (or mapped outside [0, dims)) are ignored, and `index` is never modified.
static vector<int> EncodeWithDictionary(const map<char, int> &index, size_t dims, const string &text)
{
	vector<int> code(dims, 0);
	for (size_t i = 0; i < text.length(); i++) {
		map<char, int>::const_iterator pos = index.find(text[i]);
		if (pos != index.end() && pos->second >= 0
			&& static_cast<size_t>(pos->second) < dims)
			code[pos->second]++;
	}
	return code;
}

// Answers one query about `str`.
//
// key == 2: looks for the line of "file_out.txt" whose first word is
//           "<str>:" and prints the rest of that line (the stored encoding),
//           or a "not found" message when there is none.
// otherwise: encodes `str` with the current dictionary, prints that encoding,
//           then prints the loaded word with the highest cosine similarity to
//           it, along with the similarity and that word's encoding.
//
// Characters of `str` that are not in the dictionary contribute nothing to
// its encoding. They used to be counted as the character at position 0 and
// inserted into the dictionary map, and wrote out of bounds when the
// dictionary was empty.
//
// On failure to open "file_out.txt" this prints "File Open Error!", runs the
// shell's "pause" command and terminates the process with a failure status.
void StringEncoding::SearchFromFile(const string str,int key)
{
	if (key == 2) {
		// Input only: opening for read+write would reject read-only files.
		ifstream file_in("file_out.txt");
		if (!file_in) {
			cout << "File Open Error!" << endl;
			system("pause");
			exit(EXIT_FAILURE);	// a failed run must not report success
		}

		const string label = str + ":";
		bool found = false;
		string line;
		while (getline(file_in, line))
		{
			// Only the first word of a line can carry the label; the
			// remaining words are plain integer counts.
			stringstream ss(line);
			string first;
			ss >> first;
			if (first == label) {
				cout << line.substr(label.length()) << endl;
				found = true;
				break;
			}
		}
		if (!found) {
			cout << "查无此字符串或其编码!" << endl;
		}
	}
	else {
		const size_t dims = aphb.size();

		// Encoding of the query string.
		const vector<int> tt = EncodeWithDictionary(FeatureSpace, dims, str);
		cout << "输入的字符串编码为:";
		for (size_t i = 0; i < dims; i++) cout << tt[i] << ' ';
		cout << endl;

		// Scan every loaded word and keep the first one with the highest
		// similarity. A NaN similarity never compares greater, so it is skipped.
		double best = -1;
		string maxString = "null";
		vector<int> vec;
		for (vector<string>::const_iterator it = String.begin(); it != String.end(); ++it)
		{
			const vector<int> Temp = EncodeWithDictionary(FeatureSpace, dims, *it);
			const double ans = CountCosine(Temp, tt);
			if (ans > best) {
				best = ans;
				maxString = *it;
				vec = Temp;
			}
		}
		cout << "最近字符串的余弦相似度为:" << best << endl;
		cout << "最相似的字符串为:" << maxString << endl;
		cout << "编码为:";
		for (size_t i = 0; i < vec.size(); i++) cout << vec[i] << ' ';
	}
}
void StringEncoding::RebuildCoding()
{
	AlphaSort();
	int Size = aphb.size();
	for (int i = 0;i < Size;i++)	//重新生成新的字典序并存放于特征空间中
		FeatureSpace[aphb[i].GetCharact()] = i;
}
// Cosine similarity of two count vectors:
//     dot(Temp, tt) / (|Temp| * |tt|)
//
// Returns 0.0 when either vector has zero length (norm), where the ratio
// would otherwise be 0/0 = NaN. The loops are bounded by the vectors' own
// sizes rather than by the dictionary size, so a short argument can no longer
// be read out of bounds; if the sizes differ, the missing tail of the shorter
// vector is treated as zeros. Products are accumulated in double to avoid int
// overflow.
double StringEncoding::CountCosine(vector<int> Temp, vector<int>tt)
{
	const size_t shared = Temp.size() < tt.size() ? Temp.size() : tt.size();

	double dot = 0;
	for (size_t i = 0; i < shared; i++)
		dot += static_cast<double>(Temp[i]) * tt[i];

	double sqTt = 0, sqTemp = 0;
	for (size_t i = 0; i < tt.size(); i++)
		sqTt += static_cast<double>(tt[i]) * tt[i];
	for (size_t i = 0; i < Temp.size(); i++)
		sqTemp += static_cast<double>(Temp[i]) * Temp[i];

	// Sums of integer squares are exactly zero only for all-zero vectors.
	if (sqTt == 0 || sqTemp == 0)
		return 0.0;

	return dot / (sqrt(sqTt) * sqrt(sqTemp));
}
// Stores the given search string; the encoder member is default-constructed.
Manager::Manager(string SearchString) : SearchString(SearchString)
{
}
// Runs one full session:
//   1. reads the words of "file_in.txt" and builds the character dictionary,
//   2. rebuilds the character -> index encoding,
//   3. writes all encodings to "file_out.txt",
//   4. asks the user for an option and a string, then runs the query.
//
// Option 2 looks up the stored encoding of the string; any other number is
// passed on to SearchFromFile, which treats it as a cosine-similarity query.
// If either value cannot be read from standard input (non-numeric option,
// end of input), an error is printed and no query is run.
void Manager::Call()
{
	ss.ReadFromFile("file_in.txt");
	ss.RebuildCoding();
	ss.WriteToFile("file_out.txt");

	cout << "请输入选项:\n";
	cout << "1.查找余弦相似度 2.查找字符串编码\n";
	int key = 0;
	if (!(cin >> key)) {
		cout << "输入无效!" << endl;
		return;
	}

	cout << "请输入您要查询编码的字符串:";
	string str;
	if (!(cin >> str)) {
		cout << "输入无效!" << endl;
		return;
	}
	ss.SearchFromFile(str,key);
}
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值