文本文件检索

最新推荐文章于 2024-01-08 01:26:49 发布

吃了三只小熊

最新推荐文章于 2024-01-08 01:26:49 发布

阅读量441

点赞数 1

文章标签： c++

本文链接：https://blog.csdn.net/qq_46142764/article/details/115048373

版权

2.1 项目总体框架
本题主要是针对字符串检索，常见的字符串检索算法有：朴素模式匹配（暴力匹配），KMP模式匹配。
2.2 关键算法分析
算法1：朴素模式
【5】算法功能
指定字符串检索
【6】算法基本思想
从主串 s 和子串 t 的第一个字符开始，将两个字符串的字符逐一比对；如果出现某个字符不匹配，主串回溯到第二个字符，子串回溯到第一个字符，再重新逐一比对。如果仍然出现不匹配，
主串回溯到第三个字符，子串回溯到第一个字符，再重新逐一比对……如此反复，直到子串的字符全部匹配成功，或者主串的所有起始位置都已尝试完毕（匹配失败）为止。
【7】算法空间、时间复杂度分析
时间复杂度：最坏情况下为 O（m*n），其中 n 为主串长度，m 为子串长度；空间复杂度：O（1），只使用了常数个辅助变量。
【8】代码逻辑（可用伪代码描述）
/**
 * Naive (brute-force) substring search.
 *
 * Tries every start offset in `s` from `begin` onwards and compares `goal`
 * against the text one character at a time.
 *
 * @param s      text to search; reads s.data[0 .. s.length-1]
 * @param goal   pattern to look for; reads goal.data[0 .. goal.length-1]
 * @param begin  0-based offset in `s` where the search starts (not range
 *               checked; a negative value is the caller's error)
 * @return 1-based position of the first match at or after `begin`,
 *         or -1 when the pattern does not occur.
 */
int simple(str s, str goal, int begin)
{
    const int m = goal.length; // pattern length
    const int n = s.length;    // text length

    for (int i = begin; i < n; i++)
    {
        int j = 0; // cursor in the pattern
        int k = i; // cursor in the text, starting at the candidate offset

        // Both cursors are bounded: `j < m` keeps the comparison inside the
        // pattern (the former `j <= m` could step past it and hide a real
        // match), and `k < n` keeps it inside the text.
        while (j < m && k < n && s.data[k] == goal.data[j])
        {
            k++;
            j++;
        }

        if (j == m) // whole pattern consumed: match starts at offset i
            return i + 1;
        // otherwise retry from the next start offset
    }
    return -1;
}

算法2：KMP模式匹配
【9】算法功能
指定字符串检索
【9】算法基本思想
如果字符匹配，则主串和子串字符同时右移。至于子串回溯到哪个字符，这个问题我们先放一放。
现在发现了不匹配的地方，根据KMP的思想我们要将子串向后移动，现在解决要移动多少的问题。
a.说明一下：一个字符串最长相等前缀和后缀。
在“aba”中，前缀集就是除掉最后一个字符’a’后的子串集合{a,ab}，同理后缀集为除掉最前一个字符a后的子串集合{a,ba}，那么两者最长的重复子串就是a，k=1；
在“ababa”中，前缀集是{a,ab,aba,abab}，后缀集是{a,ba,aba,baba}，二者最长重复子串是aba，k=3；
事实上，每一个字符前的字符串都有最长相等前后缀，而且最长相等前后缀的长度是我们移位的关键，因为在P的每一个位置都可能发生不匹配，也就是说我们要计算每一个位置j对应的k，所以用一个数组next来保存，next[j] = k，表示当T[i] != P[j]时，j指针的下一个位置。另一个非常有用且恒等的定义，因为下标从0开始的，k值实际是j位前的子串的最大重复子串的长度。所以next[i]=j,含义是：下标为i 的字符前的字符串最长相等前后缀的长度为j。
【10】代码逻辑（可用伪代码描述）
/*
 * 查找的单词的最长相等前后缀
 * next[i]=j,含义是：下标为i 的字符前的字符串最长相等前后缀的长度为j。
 */
/**
 * Builds the KMP failure table for the pattern `goal_s`, in its optimised
 * ("nextval") form.
 *
 * For every pattern index j, next[j] is the pattern index at which matching
 * resumes after a mismatch at j. A value of -1 means "advance the text by one
 * and restart the pattern from index 0".
 *
 * @param goal_s  pattern; reads goal_s.data[0 .. goal_s.length-1]
 * @return pointer to a table with goal_s.length valid entries. The table
 *         lives in static storage, so it is overwritten by the next call and
 *         the function is not reentrant.
 *
 * NOTE(review): assumes goal_s.length <= MAXSIZE (MAXSIZE is declared
 * elsewhere) - confirm against the definition of `str`.
 */
int* GetNext(str goal_s)
{
    // Static storage: a plain local array would be destroyed on return and
    // could not be handed back to the caller.
    static int next[MAXSIZE];

    const int gls = goal_s.length; // pattern length
    int j = 0, k = -1;
    next[0] = -1; // nothing precedes the first character

    // Each pass may fill next[j + 1], so stop at gls - 1. This keeps every
    // read of goal_s.data and every write to next inside [0, gls).
    while (j < gls - 1)
    {
        if (k == -1 || goal_s.data[j] == goal_s.data[k])
        {
            j++; k++;
            if (goal_s.data[j] != goal_s.data[k])
            {
                // data[k] is the character we would fall back to after a
                // mismatch at j; it differs from data[j], so it is worth
                // trying.
                next[j] = k;
            }
            else
            {
                // Falling back to an identical character would fail the same
                // way, so skip one more level and reuse its fallback.
                next[j] = next[k];
            }
        }
        else
        {
            // Shrink the candidate border: continue with the fallback
            // recorded for index k.
            k = next[k];
        }
    }
    return next;
}

/*KMP算法
 *begin为开始搜索的位置
 */
int KMP(str s, str goal_s, int next, int begin)

{
int sl = s.length;
int gsl = goal_s.length;//要查找的字符长度
int i = begin;
int j = 0;
while (i < sl && j < gsl)
{
if (s.data[i] == goal_s.data[j] || j == -1)
{
i++;j++;
}
else j = next[j];
}
if (j >= gsl)
return(i - gsl + 1);
else
return(-1);//未找到
}

#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// Interactive console tool working on plain text files: it can create a file
// from typed lines, tally the whitespace-separated tokens of a file, and
// report how often and where a given word occurs.
class Dataset {
private:
    // Every counter has an in-class initialiser so that no member function
    // ever increments an indeterminate value.
    int _command = 0;           // last menu choice read by executeCommand()
    int _currentLine = 0;       // number of the line being processed (1-based once reading starts)
    int _wordCount = 0;         // tokens starting with an ASCII letter; wordPosition() reuses it as a per-line hit count
    int _nonWordCount = 0;      // tokens that do not start with an ASCII letter
    string _currentWords;       // the line most recently read from the file
    map<string, int> _wordMap;  // token -> number of occurrences (needs <map>)

public:
    Dataset() = default;                 // all members start from the initialisers above
    void createFile();                   // create a text file from lines typed by the user
    void printMenu();                    // print the main menu
    void textCount();                    // list every token with its count, plus word / non-word totals
    void wordLocate();                   // sub-menu: occurrence count or positions of one word
    void executeCommand();               // main loop: read a menu choice and dispatch it
    void wordPosition(string fileName);  // print, per line, how often and where a word occurs
    void wordCount(string fileName);     // accumulate token statistics for a file
};

// Prints the main menu (choices 1-4) to standard output.
void Dataset::printMenu() {
    cout << "******************************************************" << endl;
    cout << "----------------文本文件单词的检索与计数---------------" << endl;
    cout << " * 【1】建立文本文档 * " << endl;
    cout << " * 【2】文本单词汇总 * " << endl;
    cout << " * 【3】待查单词定位 * " << endl;
    cout << " * 【4】退出检索系统 * " << endl;
    cout << "******************************************************" << endl;
}
// Interactively creates a text file: asks for a file name, then writes the
// lines typed by the user, one per prompt, for as long as the user answers
// 'n' to the "finished?" question.
void Dataset::createFile() {
    string fileName;
    cout << "输入要建立的文件名" << endl;
    cin >> fileName;

    ofstream file(fileName); // needs <fstream>; closed by the destructor
    if (!file) {
        // Without this check every typed line would be dropped silently.
        cerr << "无法创建文件：" << fileName << endl;
        return;
    }

    string line;
    string restOfLine;
    char command = 'n';
    while (command == 'n') {
        cout << "请输入一行文本：";
        // The preceding `cin >>` stops before the end of its input line
        // (normally just the newline); discard that remainder so the next
        // getline reads a fresh line.
        getline(cin, restOfLine);
        if (!getline(cin, line))
            break; // input exhausted: stop instead of writing empty lines forever
        file << line << endl;
        cout << "输入结束了吗 y or n :";
        if (!(cin >> command))
            break; // a failed read leaves `command` at 'n', which would never end the loop
    }
    cout << "建立文件结束！" << endl;
}
// Reads `fileName` line by line and tallies its whitespace-separated tokens:
//   - _currentLine is incremented once per line,
//   - _wordCount / _nonWordCount count tokens that do / do not start with an
//     ASCII letter,
//   - _wordMap counts the occurrences of every token.
// The counters and the map are accumulated into, not reset here.
void Dataset::wordCount(string fileName) {
    ifstream file(fileName); // closed by the destructor
    if (!file) {
        cerr << "无法打开文件：" << fileName << endl;
        return;
    }

    while (getline(file, _currentWords)) {
        _currentLine++;
        // Split the line on whitespace by streaming it through a stringstream.
        stringstream ss(_currentWords);
        string word;
        while (ss >> word) {
            // `>>` never yields an empty token, so word[0] always exists.
            const char first = word[0];
            const bool startsWithLetter =
                (first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z');
            if (startsWithLetter)
                _wordCount++;
            else
                _nonWordCount++;
            // operator[] value-initialises a missing entry to 0, so a plain
            // increment covers both the first and later occurrences.
            ++_wordMap[word];
        }
    }
}
// Asks for a file name, tallies the file with wordCount() and prints every
// distinct token with its count, followed by the word / non-word totals.
void Dataset::textCount() {
    // Start from a clean slate: wordCount() only accumulates.
    _currentLine = 0;
    _wordCount = 0;
    _nonWordCount = 0;

    cout << "请输入文件名:";
    string fileName;
    cin >> fileName;

    _wordMap.clear();
    wordCount(fileName);

    cout << ">>>>>>>>>>>>单词<<<>>>>个数<<<<<<<" << endl;
    cout.fill(' ');
    for (const auto& entry : _wordMap) {
        // token right-aligned in 15 columns, its count in 10
        cout << setw(15) << entry.first << setw(10) << entry.second << endl;
    }
    cout << endl << ">>>>>>>>>>>>>>>>" << fileName << "的单词总数为" << _wordCount << "个" << endl << endl;
    cout << ">>>>>>>>>>>>>>>>" << fileName << "的非单词总数为" << _nonWordCount << "个" << endl << endl;
}
// Asks for a word, then scans `fileName` line by line and, for every line
// containing that word as a whole whitespace-separated token, prints the line
// number, the number of hits and the 1-based starting column of each hit.
// Columns are counted in bytes of the line as read.
void Dataset::wordPosition(string fileName) {
    string word;
    cout << "要检索的单词:";
    cin >> word;

    _currentLine = 0;
    ifstream file(fileName); // closed by the destructor
    if (!file) {
        cerr << "无法打开文件：" << fileName << endl;
        return;
    }

    // Same separators that `>>` on a stream skips in the default "C" locale.
    const char* const blanks = " \t\n\v\f\r";

    vector<int> hits; // needs <vector>; start columns of the matches on the current line
    while (getline(file, _currentWords)) {
        _currentLine++;
        hits.clear();

        // Walk the tokens through their real offsets in the line. Adding
        // "token length + 1" instead gives wrong columns when a line starts
        // with blanks or tokens are separated by more than one blank.
        string::size_type start = _currentWords.find_first_not_of(blanks);
        while (start != string::npos) {
            string::size_type stop = _currentWords.find_first_of(blanks, start);
            if (stop == string::npos)
                stop = _currentWords.size();
            if (_currentWords.compare(start, stop - start, word) == 0)
                hits.push_back(static_cast<int>(start) + 1);
            start = _currentWords.find_first_not_of(blanks, stop);
        }

        // The member keeps holding the hit count of the line just processed.
        _wordCount = static_cast<int>(hits.size());
        if (_wordCount != 0) {
            cout << "行号：" << _currentLine << ",出现次数为：" << _wordCount << ",起始位置分别为：第";
            for (const int column : hits)
                cout << "   " << column;
            cout << "个字符" << endl;
        }
    }
}
//文本文件字串的定位统计及定位
void Dataset::wordLocate() {
cout << “=" << endl;
cout << “|| 文本文件字串的定位统计及定位 ||” << endl;
cout << "||=||" << endl;
cout << “|| a.单词出现的次数 ||” << endl;
cout << “|| ||” << endl;
cout << “|| ||” << endl;
cout << “|| b.单词出现的位置 ||” << endl;
cout << “|| ||” << endl;
cout << "||=========================================||” << endl;

string command;
cout << "请输入a或b：";
cin >> command;
string word;
if (command == "a") {
	string fileName;
	cout << "请输入文本文件名：";
	cin >> fileName;

	cout << "请输入要统计计数的单词：";
	cin >> word;
	_wordMap.clear();
	wordCount(fileName);
	cout << "单词" << word << "在文本文件" << fileName << "中共出现" << _wordMap[word] << "次" << endl;
}
else if (command == "b") {
	string fileName;
	cout << "请输入文本文件名：";
	cin >> fileName;
	wordPosition(fileName);
}
else
	cout << "输入错误，已退出！" << endl;

}
// Main loop: prints the menu, reads a numeric choice and dispatches it, until
// the user picks 4 (exit) or standard input is exhausted.
void Dataset::executeCommand() {
    while (_command != 4) {
        printMenu();
        cout << "请选择<1-4>:";

        if (!(cin >> _command)) {
            // Nothing left to read, or the stream is broken: retrying would
            // spin forever, so leave the loop.
            if (cin.eof() || cin.bad())
                break;
            // Non-numeric input: clear the error state and drop the offending
            // line, otherwise every following read fails the same way.
            cin.clear();
            string rejected;
            getline(cin, rejected);
            _command = 0;
            cout << "输入错误，请重新输入" << endl;
            continue;
        }

        switch (_command) {
        case 1:
            createFile();
            break;
        case 2:
            textCount();
            break;
        case 3:
            wordLocate();
            break;
        case 4:
            break; // the loop condition ends the session
        default:
            cout << "输入错误，请重新输入" << endl;
            break;
        }
    }
}