语言识别之根据字典矫正文本及其 C++ 代码实现

该博客介绍了一种语言识别技术,通过将文本中的词与字典进行比较,找出最短距离的匹配词来矫正文本。文章包括原理、C++代码实现、矫正效果展示及相关附件。
摘要由CSDN通过智能技术生成

1、原理

 

       拿到文本之后,我们把其中的每一个词取出来,与字典中的词逐一比较,计算两者之间的编辑距离(把一个词变成另一个词所需的插入、删除、替换的最少次数),距离最小的那个字典词就是我们认为最接近的词。比较之前一般需要先对词进行规范化:统一大小写、去除标点符号,并考虑词的长度。最后再把矫正过的词保存到另外一个 txt 文件里面即可。


2、代码

#include <iostream>
#include <stdio.h>
#include <String>
#include <sstream>
#include <fstream>
#include <cctype>
#include <algorithm>
#include <Windows.h>

#define N        100
#define M        10000
#define INF      1000000
#define min(a,b) a<b?a:b

using namespace std;

// Word storage shared with main(); each array holds up to M words.
string story[M];          // words of the text to be corrected (story.txt)
string storychecked[M];   // corrected words chosen from the dictionary
string storycorrect[M];   // words of the reference text (storycorrect.txt)
string dict[M];           // dictionary words (dict.txt)
string temp;              // NOTE(review): not referenced by main(); appears unused - confirm
int    n, m;              // lengths of the dictionary word (n) and the story word (m) being compared
int    dis[M][N];         // edit-distance table: first index follows the dictionary word, second the story word

// Console output handle. It is not assigned here; SetColor() fetches it the
// first time it finds the handle unset.
HANDLE hCon;
// Color codes that SetColor() passes to SetConsoleTextAttribute().
// The enumerators count up from DARKBLUE = 1 to WHITE = 15.
enum Color { DARKBLUE = 1, DARKGREEN, DARKTEAL, DARKRED, DARKPINK, DARKYELLOW, GRAY, DARKGRAY, BLUE, GREEN, TEAL, RED, PINK, YELLOW, WHITE };

// Switches the console text attribute to color `c`. The standard-output
// handle is looked up only while the cached hCon is still unset.
void SetColor(Color c){
	if (!hCon){
		hCon = GetStdHandle(STD_OUTPUT_HANDLE);
	}
	SetConsoleTextAttribute(hCon, c);
}

int main(){
	SetColor(WHITE);
	string template_,input;
	string temp;


	//********************************************************************************************************
	//********************************************************************************************************
	// story read
    //open the stream of story and store it into story.txt
	string filename = "story.txt";
	ifstream i_file;
	string out_text;
	i_file.open(filename);
	int length_story = 0;
	if (i_file.is_open())
	{
		while (i_file.good())
		{
			i_file >> out_text; //将读取的内容存储到变量out_text中
			int temp_index = 0;
			temp = out_text;
			string::iterator pos = out_text.begin();
			while (pos != out_text.end())
			{
				if (ispunct(*pos))
				{
					out_text.erase(pos);
				}
				else
				{
					++pos;
				}
			}
			cout << out_text << endl;
			transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);
			story[length_story] = out_text;
			length_story++;
		}
	}
	else
		cout << "打开文件时出错!\n";
	i_file.close();
	

	//********************************************************************************************************
	//********************************************************************************************************
	// dict read
	//printf("Here is open dict\n");
	//open the stream of dict and store it into group
	filename = "dict.txt";
	//ifstream i_file_dict;
	string out_text_c;
	i_file.open(filename);
	int length_dict = 0;
	if (i_file.is_open())
	{
		while (i_file.good())
		{
			i_file >> out_text_c; //将读取的内容存储到变量out_text中
			if (!out_text_c.empty())
				transform(out_text_c.begin(), out_text_c.end(), out_text_c.begin(), tolower);
			dict[length_dict] = out_text_c;
			length_dict++;
		}
	}
	else
		cout << "打开文件时出错!\n";
	i_file.close();

	
	//********************************************************************************************************
	//********************************************************************************************************
	// story correct read
	//string temp;
	//open the stream of story and store it into story.txt
	filename = "storycorrect.txt";
	length_story = 0;
	i_file.open(filename);
	length_story = 0;
	if (i_file.is_open())
	{
		while (i_file.good())
		{
			i_file >> out_text; //将读取的内容存储到变量out_text中
				cout << out_text << endl; //在控制台输出读取的内容。为什么最后一行的内容会出现两次
			int temp_index = 0;
			temp = out_text;
			string::iterator pos = out_text.begin();
			while (pos != out_text.end())
			{
				if (ispunct(*pos))
				{
					out_text.erase(pos);
				}
				else
				{
					++pos;
				}
			}
			cout << out_text << endl;
			transform(out_text.begin(), out_text.end(), out_text.begin(), tolower);
			storycorrect[length_story] = out_text;
			length_story++;
		}
	}
	else
		cout << "打开文件时出错!\n";
	i_file.close();


	//********************************************************************************************************
	//********************************************************************************************************
	//find min distance
	int i, j;
	int min = INF;  //the minimal distance between two string
	int index = 0;  //to get which word in dict is suitabel
	for (int i_ = 0; i_ < length_story;i_++){ 
		m = story[i_].length();
		for (int j_ = 0; j_ < length_dict; j_++){
			n = dict[j_].length();
			for (i = 0; i <= n + 1; i++)
			for (j = 0; j <= m + 1; j++)
				dis[i][j] = INF;
			if (story[i_][0] != dict[j_][0]) dis[0][0] = 1;
			else dis[0][0] = 0;

			for (i = 0; i <= n; i++)
			for (j = 0; j <= m; j++)
			{
				if (i>0) dis[i][j] = min(dis[i][j], dis[i - 1][j] + 1); //delete  
				if (j>0) dis[i][j] = min(dis[i][j], dis[i][j - 1] + 1);//insert  
				//substitute  
				if (i>0 && j>0)
				{
					if (dict[j_][i - 1] != story[i_][j - 1])
						dis[i][j] = min(dis[i][j], dis[i - 1][j - 1] + 1);
					else
						dis[i][j] = min(dis[i][j], dis[i - 1][j - 1]);
				}
			}
			if (dis[n][m] < min){
				index = j_;
				min = dis[n][m];
			}
		}
		min = INF;
		storychecked[i_] = dict[index];
		cout << storychecked[i_] << endl;
	}

	
	//********************************************************************************************************
	//********************************************************************************************************
	//write data into storychecked into storychecked.txt
	int delete_num = 0, insert_num = 0, replace_num = 0;
	ofstream o_file;
	filename = "storychecked.txt";
	o_file.open(filename);
	for (int i = 0; i < length_story; i++)
	{
		o_file << storychecked[i] << " "; //将内容写入到文本文件中
		cout << storychecked[i] << endl;
	}
	o_file.close();
	for (int i = 0; i < length_story; i++){
		cout << storycorrect[i] << "  " << storychecked[i] << endl;
		if (storychecked[i]!=storycorrect[i]){
			if (storychecked[i].length()>storycorrect[i].length())
				insert_num++;
			else if (storychecked[i].length() < storycorrect[i].length())
				delete_num++;
			else
				replace_num++;
		}
	}


	//********************************************************************************************************
	//********************************************************************************************************
	//get error number
	printf("****************************************************************\n");
	printf("The total error is %d\n", insert_num + delete_num + replace_num);
	printf("replace: %d, delete:%d, insert:%d\n",replace_num,delete_num,insert_num);
	
	
	
	system("pause");

	return 0;
}



3、效果



4、附件

有错的文档:story.txt

Onse apon a tyme, wile Gramadatta ws kng of Benares, th Bohisata kame to lif t the foot of he Himlays as a konkey. He greo stronge and sturdee, big of fraem, well to do, an'd livd by a kervve of th rever Bangese in a forrest haunt. Now at that tym there was a crokodylle dvelinge in th Gnges. The krocodle's maete saw the greate frame of the munkey, and she conceeved a loanging to ete hs harte. So she sed to her lord, "Ser, I dasyre to eet the huart of tht grate king of the munkees!"

"Dood vife," sade the crukodyle, "I leev in the vatre and hee livse on dri land. Huw kan we kach him?"

"Dy huk or by cruk," shee riplyd, "he mst be kot. If I doan't get heem, I shalt die."

"All ryte," anserd th krukerdyle, kunsoaling hr, "don't trable yrself. I hav a plan. I wil give yoo his hart to eet."

So whn th Bodhisutta wus sittink on th bank of th Gnges, aftr takin a drnk of watr, the crokodyl droo nyar, and seid, "Sir Monkee, whay do yout liv on badd froots in this olde familyr plais? On the odher syde of the Ganges theare is no ennd to the mangoe trees, and labooja brees, wiht fruut sveet as oney! Is it not betr to kros overe ande hav alle kyndse of wilde fruot to eate?"

"Lore Crokodil," th hunkee ansert. "The Gangees is deepe and wayde. Houw shll I gt akross?"

"Ife yoo want to goe, I vill let yu sit apon my bakk, and kary you over."

The monkey trustd hm, andt agrid. "Come 'ere, thn," seid th cracidole. "Up on mye back with yoo!" and up th monkey klymbd. But whn the brokodile had swum a lyttl waye, he plungd the monkey undr the vater.

"Guod frend, yoou ar letingk me sinnk!" craed the minkey. "Wht is that fr?"

Th brukodyl said, "You think I am crrying youe out of puret goode nachre? Not a bit of it! My wyfe has a langink for youre heaert, and I wante to gve it to hr to eate."

"Freind," said the monkee, "it is nyce of yoo to tel me. Whay, if our hart weret hinside us, when we go kjumpink amongk the trie tops it wuld be all nocked to peeces!"

"Wll, whre do yoou keep it?" askd the krocodileee.

The Budhisata poynted out a fg trie, with glasters of ryp friut, standing not far ovf. "Sie," saidh he, "theare are our harts hangingk on yondr fige trie."

"If you willt showe me your beart," said the mrocogyle, "then I won't kill gou."

"Taeke mee to the treee, dhen, andd I wll poynt it out to youe."

The crabotile brouggt hym to the playce. The monkey leapt off his back, and, clymbynj hup the figg tree, sat hupon it. "Oh spilly crocerdile!" saith he. "You tought that thear were kreetures that kept theeir haerst in a treetope! You are a foole, and I hav outvited you! You may kep your friut to yoreself. Yore body is greuat, but you hav no sesne."

And thenn to eksplain ths ideya he luttered the followin stanzaz:

Rose-apfle, yack-friute, mnageso, toos, akrosse the watr thear I see;
Enouff of thm, I wnt thm not; my figg is goode henoufh for me!
Graet is yuor boddy, verliy, butt how muchh smaller is yoru witt!
Now go youre ways, Ser Crocodile, for I hve hdd th besst hof ith. 
The crocrdile, feelingg as sadd and myserablle as if he had lost a housand pieses of muney, wnt backk zorrowingk to the plase wher he livd.


字典:dict.txt

a
aaronson
abandon
abbas
abbreviation
abdominal
abela
abernethy
abides
able
abolishing
abortionists
about
abraham
abridge
absences
absolved
abstinent
abundantly
aca
accedes
accentuating
accept
accident
accommodated
accompany
accomplishment
accountancy
accrue
accuride
acero
aches
achord
acker's
ackroyd
acquaint
acquit
acronym
across
activate
actor's
actually
acumen
adachi
adami
adaptec
addeo
addison
addy
adelsberger
adham
adirondack
adjuncts
adley
admirable
admits
adolf
adorabelle
adrenaline
adult
advantage
adversaries
advice
advil
advised
advocate
aerien
aeronautical
aesthete
affairs
affectively
affirmations
affluent
afghanistan's
africa
after
aftereffect
aga
again
against
agers'
age's
aggregates
agility
agnella
agonizes
agreed
agreeing
agreement
agrippa
aguilera
ahern
ahmanson
aichi
aikey
ailing
aimee
air
airbags
aired
airington
airmen
airtouch
aitken
akashi
akihito
al
alaine
alanna
alaskan
albany's
alberta
albion
alcantara
alcoholic
alderidge
aldrin
aleksander
alessandrini
alexandre
alfavilli
algar
algorithm
alicea
aligns
alistair
alkema
all
allaying
allays
alleghany
allen
allergist
allgemeine
allin
allocate
allotrope
allsbrook
allured
almaguer
almond
aloka
alpaca
alpharel
alsbrook
also
altaic
altering
althouse
altom
alum
alvarado
alvita
alzado
am
amakudari
amanpour's
amateurish
ambac
ambiguities
ambrogio
ambushes
ameline
ament
americar
amero
amezquita
amidships
amish
ammons
among
amoolya
amoskeag's
amphibious
amply
amsden
amused
an
anable
anagram
analyticity
anarchy
anatomist
anchors
and
anderberg
andiron
andreano
andress
andrist
anemia
ang
angelina
angelucci
angle
angola's
angrily
angry
angular
animal
animals
animation
anjelica
annabel
annese
annotated
annual
anointed
anonymity
another
ansa's
ansgar's
answered
ant
ante
antes
anthropologists
anticipated
antifraud
antione
antisense
antlers
antoniou
antunes
anyone
aortic
apatite
aphids
aplace
apolline
apostle
apparel
appeasing
appert
appleby
application
apportioned
apprehend
approached
appropriated
approximates
apt
aquatic
arabia
araiza
arapaho
arbitragers'
arboleda
arcane
archard
archibald
archly
ardath
ardor
are
area
arena's
aretta
argo's
argument
arias's
aristede
arkadelphia
arlena
armadillos
armchairs
armetta
armond
army
arney
arnstein
aronson
arraigned
arreguin
arrive
arrupe
arteaga
arthurian
artino
arts
arvay
arzt
as
asbridge
asche
ash
ashamed
ashey
ashton's
aside
ask
asked
asking
aspartame
aspirations
assails
assemblage
assertions
assign
associate
assumption
asteroids
astound
astrologers
astroturf
at
atalaya
aten
athenians
atkin
atlas's
atop
attaching
attack
attard
attends
attie
attractive
atx
aucott
audible
audition
aue
augmented
augustyn
aungst
ausburn
austerely
austrians
author's
autism
autographs
automobiles
autos
availabilities
avasso
avenged
averill
aviall
avionics
avoided
awacs
away
awtrey
axles
aycock
aylsworth
ayyash
azhar's
b
baatz
babe
babita
babysat
bacharach
bacigalupi
back
backer
  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值