文章分词
1.问题描述
中文自然语言处理中,分词是对文本进行分析的最基础工作。现有一段文章,要求对文章进行分词并且对词按词频进行排序。
基本要求:
查找并下载中文词典,根据该词典对给出的3年内政府工作报告等文章进行分词,统计各篇文章的词频,并分析近3年政府工作报告中最高词频的变化。用快速排序、冒泡排序两种方法对词频进行排序,并将排序结果保存在不同的文件中。
提高要求:
在基本要求完成的基础上,用直接插入排序、归并排序、堆排序对词频进行排序,给出效率对比结果(包括时间效率和空间效率),并保存在不同文件中。分析不同词频情况下(例如正序、逆序、随机)上述几种排序算法的效率。
2.需求分析
软件的基本功能:
根据本地中文词典对给定的文章进行分词,并统计词频。用快速排序、冒泡排序对词频进行排序,并将排序结果保存在不同文件中。再用直接插入排序、归并排序、堆排序对词频进行排序,并对这五种排序算法的时间效率和空间效率进行比较,分析不同词频情况下排序算法的效率,将对比结果保存在不同文件中。
输入/输出形式:
用户可以通过文件进行操作。程序将输出分词结果、词频统计结果、排序结果以及排序算法的效率对比结果到不同文件中。
输入形式:
在“文章输入.txt”文件中输入文章内容。
输出形式:
将分词结果、词频统计结果、排序结果以及排序算法的效率对比结果输出到不同文件中。
测试数据要求:用户可以输入包含大量文本的文章,以及具有不同词频情况的测试数据,用于测试排序算法的效率和准确性。
3.概要设计
3.1 主程序流程
3.2 函数调用关系
4.主函数实现
4.1 main.h
#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>
using namespace std;
// Reports a failed file open on stderr and terminates the program.
// The body is wrapped in do/while(0) so that `ERROR;` expands to exactly one
// statement and stays correct inside an unbraced if/else. The process exits
// with a non-zero status so that whoever launched it can detect the failure.
#define ERROR do { std::cerr << "Open error !" << std::endl; std::exit(EXIT_FAILURE); } while (0)
// One vocabulary entry together with the number of times it was seen.
struct Word
{
    std::string word; // the token itself
    int sum = 0;      // occurrence count; zero by default so `new Word[n]` never holds garbage
};
extern int maxLen; // size() of the longest dictionary entry (set by processDic)
extern int totalWord; // number of distinct dictionary words found in the article (counted by forwardMax)
extern unordered_map<string, int> hashMap; // dictionary word -> occurrence count in the article
// Loads the dictionary file _inFile into hashMap and records maxLen.
void processDic(string _inFile);
// Segments the text in _inFile by forward maximum matching and writes the
// '/'-separated result to _outFile; also fills in the counts in hashMap.
void forwardMax(string _inFile, string _outFile);
// Copies every word with a non-zero count into arr and lists them in _outFile.
void countWordFreq(Word* arr, string _outFile);
// NOTE(review): no definition is visible in this file. The *Result functions call
// it on a freshly sorted array to prepare the "reverse order" timing run, so it
// presumably reverses that order - confirm against the implementation.
void reversequickSort(Word* r, int low, int high);
// Benchmark drivers. For the three-file variants: _outFile1 = sorted word list,
// _outFile2 = timing report, _outFile3 = memory report.
void quickResult(Word* arr, string _outFile1, string _outFile2, string _outFile3);
void bubbleResult(Word* arr, string _outFile1, string _outFile2, string _outFile3);
// For the two-file variants: _outFile1 = timing report, _outFile2 = memory report.
void insertResult(Word* arr, string _outFile1, string _outFile2);
void mergeResult(Word* arr, string _outFile1, string _outFile2);
void heapResult(Word* arr, string _outFile1, string _outFile2);
4.2 main.cpp
#include "main.h"
// size() of the longest dictionary entry; raised by processDic as entries are read.
int maxLen = 0;
// Number of distinct dictionary words seen in the article; incremented by forwardMax.
int totalWord = 0;
// Dictionary word -> occurrence count. processDic inserts every entry with a
// count of 0, forwardMax increments the count on each match.
unordered_map<string, int> hashMap;
int main()
{
cout << "正在运行中..." << endl;
processDic("中文词典.txt");
forwardMax("文章输入.txt", "分词结果.txt");
Word* arr = new Word[totalWord];
countWordFreq(arr, "词频统计.txt");
quickResult(arr, "快速排序结果.txt", "时间效率对比结果.txt", "空间效率对比结果.txt");
bubbleResult(arr, "冒泡排序结果.txt", "时间效率对比结果.txt", "空间效率对比结果.txt");
insertResult(arr, "时间效率对比结果.txt", "空间效率对比结果.txt");
mergeResult(arr, "时间效率对比结果.txt", "空间效率对比结果.txt");
heapResult(arr, "时间效率对比结果.txt", "空间效率对比结果.txt");
system("cls");
cout << "已完成!" << endl;
delete[] arr;
return 0;
}
5. 函数实现
5.1 processDic函数
用于处理词典,将词典词语存储在哈希表中,并统计词语的最大长度。
// Filters out entries that do not look like Chinese text.
// Returns true when the first byte of `s` is outside the 7-bit ASCII range
// (high bit set), false for an empty string or one that starts with an ASCII
// character.
bool Check(const std::string& s)
{
    if (s.empty())
    {
        return false;
    }
    // Inspect the byte as unsigned char: whether plain `char` is signed is
    // implementation-defined, so a signed comparison is not portable.
    return static_cast<unsigned char>(s[0]) >= 0x80;
}
// Loads the dictionary.
// Reads whitespace-separated entries from _inFile; every entry accepted by
// Check() is stored in hashMap with a count of 0, and maxLen is raised to the
// size() of the longest accepted entry. Terminates via ERROR if the file
// cannot be opened.
void processDic(string _inFile)
{
    ifstream dict(_inFile);
    if (!dict)
    {
        ERROR;
    }

    for (string entry; dict >> entry; )
    {
        if (!Check(entry))
        {
            continue;
        }
        const int length = static_cast<int>(entry.size());
        if (length > maxLen)
        {
            maxLen = length;
        }
        hashMap[entry] = 0;
    }
    // `dict` is closed by its destructor.
}
5.2 forwardMax函数
用正向最大匹配算法对输入的文章进行分词,在查找过程中统计文章词语数量,便于动态申请数组空间,并统计词频,最后将分词结果保存在文件中。
// Segments an article with the forward maximum matching algorithm.
// At each position the longest prefix (at most maxLen long) that is a key of
// hashMap is emitted as a token and its count is incremented; totalWord grows
// the first time a given word is matched. When nothing matches, a single
// position is emitted on its own. Every token is followed by "/" in _outFile.
// Terminates via ERROR if either file cannot be opened.
void forwardMax(string _inFile, string _outFile)
{
    ifstream inFile(_inFile);
    ofstream outFile(_outFile);
    if (!inFile || !outFile)
    {
        ERROR;
    }

    // Slurp the whole article into memory.
    ostringstream buffer;
    buffer << inFile.rdbuf();
    const string text = buffer.str();
    const int textLen = text.size();

    int pos = 0;
    while (pos < textLen)
    {
        bool matched = false;
        // Try the longest candidate first and shrink until a dictionary hit.
        for (int len = min(maxLen, textLen - pos); len > 0 && !matched; len--)
        {
            const string candidate = text.substr(pos, len);
            unordered_map<string, int>::iterator hit = hashMap.find(candidate);
            if (hit == hashMap.end())
            {
                continue;
            }
            if (hit->second == 0)
            {
                totalWord++; // first occurrence of this word
            }
            outFile << candidate;
            pos += len;
            hit->second++;
            matched = true;
        }
        if (!matched)
        {
            // NOTE(review): substr/size work on chars, so this fallback advances
            // one byte at a time; for multi-byte text that looks like it would
            // put a "/" inside a character - confirm the intended encoding.
            outFile << text.substr(pos, 1);
            pos += 1;
        }
        outFile << "/";
    }
    // Both streams are closed by their destructors.
}
5.3 countWordFreq函数
用于统计文章词频,并将统计结果保存在文件中。
// Collects the word frequencies.
// Every hashMap entry whose count is greater than zero is copied into the next
// free slot of `arr` and written to _outFile as one line. `arr` must have room
// for all such entries (the caller sizes it with totalWord). Terminates via
// ERROR if the file cannot be opened.
void countWordFreq(Word* arr, string _outFile)
{
    ofstream outFile(_outFile);
    if (!outFile)
    {
        ERROR;
    }

    int next = 0;
    for (const auto& entry : hashMap)
    {
        if (entry.second <= 0)
        {
            continue; // dictionary word that never occurred in the article
        }
        Word& slot = arr[next++];
        slot.word = entry.first;
        slot.sum = entry.second;
        outFile << entry.first << "\t\t出现次数:\t" << entry.second << endl;
    }
    // `outFile` is closed by its destructor.
}
5.4 quickResult函数
实现对随机词频、正序词频、逆序词频的快速排序,并将排序结果、排序时间效率和空间效率分别保存在不同文件中。
// Running total, in bytes, of the local-variable storage charged to quick sort:
// Part and quickSort add to it on every call, and quickResult prints it in KB.
double quickMemory = 0;
// Quick sort: partition step (descending by `sum`).
// Uses r[low] as the pivot and rearranges r[low..high] so that larger counts
// end up before the pivot and smaller ones after it. Returns the pivot's final
// index and charges two ints to quickMemory.
int Part(Word* r, int low, int high)
{
    int left = low;
    int right = high;
    while (left < right)
    {
        // Pivot is at `left`: skip right-hand elements that are not larger than it.
        for (; left < right && r[left].sum >= r[right].sum; right--)
        {
        }
        if (left < right)
        {
            swap(r[left], r[right]); // pivot moves to `right`
            left++;
        }
        // Pivot is at `right`: skip left-hand elements that are not smaller than it.
        for (; left < right && r[left].sum >= r[right].sum; left++)
        {
        }
        if (left < right)
        {
            swap(r[left], r[right]); // pivot moves back to `left`
            right--;
        }
    }
    quickMemory += 2 * sizeof(int);
    return left;
}
// Sorts r[low..high] in descending order of `sum` with quick sort.
//
// Only the smaller partition is handled recursively; the larger one is
// processed by the loop. Part() always picks the first element as pivot, so
// already-sorted or reversed input produces maximally unbalanced partitions;
// recursing into both halves would then nest one call per element and could
// exhaust the stack on a large vocabulary. This form keeps the depth at
// O(log n). The partitions cover disjoint index ranges, so the order in which
// they are processed does not affect the result.
//
// quickMemory bookkeeping is unchanged: one int is charged for every call the
// fully recursive form would make, including those replaced by the loop.
void quickSort(Word* r, int low, int high)
{
    while (low < high)
    {
        const int pivot = Part(r, low, high);
        if (pivot - low < high - pivot)
        {
            quickSort(r, low, pivot - 1);
            low = pivot + 1;
        }
        else
        {
            quickSort(r, pivot + 1, high);
            high = pivot - 1;
        }
        quickMemory += sizeof(int); // stands in for the call the loop replaced
    }
    quickMemory += sizeof(int);
}
// Quick sort benchmark.
// Works on a private copy of `arr` and times quickSort three times: on the
// array as passed in, on the already sorted array, and after reversequickSort
// has been applied to it (reversequickSort is defined elsewhere; presumably it
// reverses the order - TODO confirm). Writes the sorted list to _outFile1 and
// starts the timing report (_outFile2) and the memory report (_outFile3),
// headers included. Terminates via ERROR if a file cannot be opened.
void quickResult(Word* arr, string _outFile1, string _outFile2, string _outFile3)
{
    const auto elapsed = [](clock_t from, clock_t to) { return double(to - from) / CLOCKS_PER_SEC; };

    Word* work = new Word[totalWord];
    copy(arr, arr + totalWord, work);

    ofstream sortedOut(_outFile1);
    ofstream timeOut(_outFile2);
    ofstream spaceOut(_outFile3);
    if (!sortedOut || !timeOut || !spaceOut)
    {
        ERROR;
    }

    // Run 1: input order as received.
    const clock_t randomBegin = clock();
    quickSort(work, 0, totalWord - 1);
    const clock_t randomEnd = clock();

    // The memory figure is taken after the first run only.
    spaceOut << "排序方法\t\t所占内存大小\n\n";
    spaceOut << "快速排序\t\t" << quickMemory / 1024 << "KB" << endl << endl;

    // Run 2: input already in sorted order.
    const clock_t sortedBegin = clock();
    quickSort(work, 0, totalWord - 1);
    const clock_t sortedEnd = clock();

    // Run 3: input prepared by reversequickSort.
    reversequickSort(work, 0, totalWord - 1);
    const clock_t reversedBegin = clock();
    quickSort(work, 0, totalWord - 1);
    const clock_t reversedEnd = clock();

    for (const Word* it = work; it != work + totalWord; ++it)
    {
        sortedOut << it->word << "\t\t出现次数:\t" << it->sum << endl;
    }

    timeOut << "排序方法\t\t随机词频用时\t\t正序词频用时\t\t逆序词频用时\n\n";
    timeOut << "快速排序\t\t" << elapsed(randomBegin, randomEnd) << "s\t\t\t"
            << elapsed(sortedBegin, sortedEnd) << "s\t\t\t" << elapsed(reversedBegin, reversedEnd) << "s\n\n";

    delete[] work;
    // The three streams are flushed and closed by their destructors.
}
5.5 bubbleResult函数
// Running total, in bytes, of the local-variable storage charged to bubble sort:
// bubbleSort adds to it on every call, and bubbleResult prints it in KB.
double bubbleMemory = 0;
// Bubble sort, descending by `sum`, over r[0..n-1].
// Each pass remembers the position of its last swap; everything from there on
// is already in place, so the next pass stops at that position. The sort ends
// when a pass makes no swap. Charges three ints to bubbleMemory per call.
void bubbleSort(Word* r, int n)
{
    for (int bound = n; bound != 0; )
    {
        int lastSwap = 0;
        for (int k = 1; k < bound; k++)
        {
            if (r[k - 1].sum < r[k].sum)
            {
                swap(r[k - 1], r[k]);
                lastSwap = k;
            }
        }
        bound = lastSwap;
    }
    bubbleMemory += 3 * sizeof(int);
}
// Bubble sort benchmark.
// Works on a private copy of `arr` and times bubbleSort three times: on the
// array as passed in, on the already sorted array, and after reversequickSort
// has been applied to it (defined elsewhere; presumably reverses the order -
// TODO confirm). Writes the sorted list to _outFile1 and appends one line each
// to the timing report (_outFile2) and the memory report (_outFile3).
// Terminates via ERROR if a file cannot be opened.
void bubbleResult(Word* arr, string _outFile1, string _outFile2, string _outFile3)
{
    const auto elapsed = [](clock_t from, clock_t to) { return double(to - from) / CLOCKS_PER_SEC; };

    Word* work = new Word[totalWord];
    copy(arr, arr + totalWord, work);

    ofstream sortedOut(_outFile1);
    ofstream timeOut(_outFile2, ofstream::app);
    ofstream spaceOut(_outFile3, ofstream::app);
    if (!sortedOut || !timeOut || !spaceOut)
    {
        ERROR;
    }

    // Run 1: input order as received.
    const clock_t randomBegin = clock();
    bubbleSort(work, totalWord);
    const clock_t randomEnd = clock();

    // The memory figure is taken after the first run only.
    spaceOut << "冒泡排序\t\t" << bubbleMemory / 1024 << "KB" << endl << endl;

    // Run 2: input already in sorted order.
    const clock_t sortedBegin = clock();
    bubbleSort(work, totalWord);
    const clock_t sortedEnd = clock();

    // Run 3: input prepared by reversequickSort.
    reversequickSort(work, 0, totalWord - 1);
    const clock_t reversedBegin = clock();
    bubbleSort(work, totalWord);
    const clock_t reversedEnd = clock();

    for (const Word* it = work; it != work + totalWord; ++it)
    {
        sortedOut << it->word << "\t\t出现次数:\t" << it->sum << endl;
    }

    timeOut << "冒泡排序\t\t" << elapsed(randomBegin, randomEnd) << "s\t\t\t"
            << elapsed(sortedBegin, sortedEnd) << "s\t\t\t" << elapsed(reversedBegin, reversedEnd) << "s\n\n";

    delete[] work;
    // The three streams are flushed and closed by their destructors.
}
5.7 insertResult函数
// Running total, in bytes, of the local-variable storage charged to insertion
// sort: insertSort adds to it on every call, and insertResult prints it in KB.
double insertMemory = 0;
// Straight insertion sort, descending by `sum`, over r[0..n-1].
// Each element is lifted out, strictly smaller predecessors are shifted one
// slot to the right, and the element is dropped into the gap. Charges two ints
// and one Word to insertMemory per call.
void insertSort(Word* r, int n)
{
    for (int next = 1; next < n; next++)
    {
        Word pending = r[next];
        int slot = next;
        while (slot > 0 && r[slot - 1].sum < pending.sum)
        {
            r[slot] = r[slot - 1];
            slot--;
        }
        r[slot] = pending;
    }
    insertMemory += sizeof(Word) + 2 * sizeof(int);
}
// Insertion sort benchmark.
// Works on a private copy of `arr` and times insertSort three times: on the
// array as passed in, on the already sorted array, and after reversequickSort
// has been applied to it (defined elsewhere; presumably reverses the order -
// TODO confirm). Appends one line to the timing report (_outFile1) and one to
// the memory report (_outFile2). Terminates via ERROR if a file cannot be
// opened.
void insertResult(Word* arr, string _outFile1, string _outFile2)
{
    const auto elapsed = [](clock_t from, clock_t to) { return double(to - from) / CLOCKS_PER_SEC; };

    Word* work = new Word[totalWord];
    copy(arr, arr + totalWord, work);

    ofstream timeOut(_outFile1, ofstream::app);
    ofstream spaceOut(_outFile2, ofstream::app);
    if (!timeOut || !spaceOut)
    {
        ERROR;
    }

    // Run 1: input order as received.
    const clock_t randomBegin = clock();
    insertSort(work, totalWord);
    const clock_t randomEnd = clock();

    // The memory figure is taken after the first run only.
    spaceOut << "直接插入排序\t" << insertMemory / 1024 << "KB" << endl << endl;

    // Run 2: input already in sorted order.
    const clock_t sortedBegin = clock();
    insertSort(work, totalWord);
    const clock_t sortedEnd = clock();

    // Run 3: input prepared by reversequickSort.
    reversequickSort(work, 0, totalWord - 1);
    const clock_t reversedBegin = clock();
    insertSort(work, totalWord);
    const clock_t reversedEnd = clock();

    timeOut << "直接插入排序\t" << elapsed(randomBegin, randomEnd) << "s\t\t\t"
            << elapsed(sortedBegin, sortedEnd) << "s\t\t\t" << elapsed(reversedBegin, reversedEnd) << "s\n\n";

    delete[] work;
    // Both streams are flushed and closed by their destructors.
}
5.8 mergeResult函数
// Running total, in bytes, of the auxiliary storage charged to merge sort:
// Merge and mergeSort add to it on every call.
// It starts at zero. The size of the scratch array cannot be folded into this
// initialiser, because the initialiser runs before main() while totalWord is
// still 0; that size has to be accounted for at run time, once the word count
// is known.
double mergeMemory = 0;
// Merge sort: merge step (descending by `sum`).
// Merges the two sorted runs a[low..mid] and a[mid+1..high] into the scratch
// array `b` (filled from index 0) and copies the result back into a[low..high].
// On equal counts the element of the left run goes first. Charges three ints
// to mergeMemory per call.
void Merge(Word* a, Word* b, int low, int mid, int high)
{
    int left = low;
    int right = mid + 1;
    int out = 0;

    while (left <= mid && right <= high)
    {
        if (a[left].sum < a[right].sum)
        {
            b[out++] = a[right++];
        }
        else
        {
            b[out++] = a[left++];
        }
    }
    // At most one of the runs still has elements left.
    for (; left <= mid; left++)
    {
        b[out++] = a[left];
    }
    for (; right <= high; right++)
    {
        b[out++] = a[right];
    }

    for (int dst = low, src = 0; dst <= high; dst++, src++)
    {
        a[dst] = b[src];
    }
    mergeMemory += 3 * sizeof(int);
}
// Sorts r1[low..high] in descending order of `sum` with top-down merge sort,
// using r2 as scratch space for the merge step. Charges one int to
// mergeMemory per call, including calls on ranges of fewer than two elements.
void mergeSort(Word* r1, Word* r2, int low, int high)
{
    if (low >= high)
    {
        mergeMemory += sizeof(int);
        return;
    }
    const int mid = low + (high - low) / 2;
    mergeSort(r1, r2, low, mid);
    mergeSort(r1, r2, mid + 1, high);
    Merge(r1, r2, low, mid, high);
    mergeMemory += sizeof(int);
}
// Merge sort benchmark.
// Works on a private copy of `arr` (r1) plus an equally sized scratch array
// (r2) and times mergeSort three times: on the array as passed in, on the
// already sorted array, and after reversequickSort has been applied to it
// (defined elsewhere; presumably reverses the order - TODO confirm). Appends
// one line to the timing report (_outFile1) and one to the memory report
// (_outFile2). Terminates via ERROR if a file cannot be opened.
//
// The reported memory includes the scratch array. mergeMemory cannot contain
// it on its own: its initialiser runs before main(), when totalWord is still 0.
void mergeResult(Word* arr, string _outFile1, string _outFile2)
{
    Word* r1 = new Word[totalWord];
    Word* r2 = new Word[totalWord]; // scratch buffer used by Merge
    clock_t start1, end1, start2, end2, start3, end3;
    copy(arr, arr + totalWord, r1);
    ofstream outFile1(_outFile1, ofstream::app);
    ofstream outFile2(_outFile2, ofstream::app);
    if (!outFile1 || !outFile2)
    {
        ERROR;
    }

    // Run 1: input order as received.
    start1 = clock();
    mergeSort(r1, r2, 0, totalWord - 1);
    end1 = clock();

    // Memory is reported after the first run only; add the scratch array that
    // the per-call counters do not see.
    const double scratchBytes = static_cast<double>(sizeof(Word)) * totalWord;
    outFile2 << "归并排序\t\t" << (mergeMemory + scratchBytes) / 1024 << "KB" << endl << endl;

    // Run 2: input already in sorted order.
    start2 = clock();
    mergeSort(r1, r2, 0, totalWord - 1);
    end2 = clock();

    // Run 3: input prepared by reversequickSort.
    reversequickSort(r1, 0, totalWord - 1);
    start3 = clock();
    mergeSort(r1, r2, 0, totalWord - 1);
    end3 = clock();

    outFile1 << "归并排序\t\t" << double(end1 - start1) / CLOCKS_PER_SEC << "s\t\t\t"
             << double(end2 - start2) / CLOCKS_PER_SEC << "s\t\t\t" << double(end3 - start3) / CLOCKS_PER_SEC << "s\n\n";

    delete[] r1;
    delete[] r2;
    // Both streams are flushed and closed by their destructors.
}
5.9 heapResult函数
// Running total, in bytes, of the local-variable storage charged to heap sort:
// Sift and heapSort add to it on every call, and heapResult prints it in KB.
double heapMemory = 0;
// Heap sort: sift-down on a min-heap keyed by `sum`.
// Moves r[start] down within r[0..end-1] by swapping it with its smaller child
// until it is strictly smaller than that child or has no children left.
// Charges two ints to heapMemory per call.
void Sift(Word* r, int start, int end)
{
    int parent = start;
    for (int child = 2 * parent + 1; child < end; child = 2 * parent + 1)
    {
        // Pick the smaller of the two children, if a right child exists.
        const int sibling = child + 1;
        if (sibling < end && r[child].sum > r[sibling].sum)
        {
            child = sibling;
        }
        if (r[parent].sum < r[child].sum)
        {
            break; // heap property holds from here down
        }
        swap(r[parent], r[child]);
        parent = child;
    }
    heapMemory += 2 * sizeof(int);
}
// Sorts r[0..n-1] in descending order of `sum` with heap sort: builds a
// min-heap, then repeatedly swaps the minimum to the end of the shrinking heap
// and restores the heap. Charges one int to heapMemory per call.
void heapSort(Word* r, int n)
{
    // Heapify: sift down every internal node, last one first.
    int node = n / 2;
    while (node-- > 0)
    {
        Sift(r, node, n);
    }

    // Extract: r[0] is the current minimum; park it at the end of the heap.
    int heapSize = n;
    while (--heapSize > 0)
    {
        swap(r[0], r[heapSize]);
        Sift(r, 0, heapSize);
    }
    heapMemory += sizeof(int);
}
// Heap sort benchmark.
// Works on a private copy of `arr` and times heapSort three times: on the
// array as passed in, on the already sorted array, and after reversequickSort
// has been applied to it (defined elsewhere; presumably reverses the order -
// TODO confirm). Appends one line to the timing report (_outFile1) and one to
// the memory report (_outFile2). Terminates via ERROR if a file cannot be
// opened.
void heapResult(Word* arr, string _outFile1, string _outFile2)
{
    const auto elapsed = [](clock_t from, clock_t to) { return double(to - from) / CLOCKS_PER_SEC; };

    Word* work = new Word[totalWord];
    copy(arr, arr + totalWord, work);

    ofstream timeOut(_outFile1, ofstream::app);
    ofstream spaceOut(_outFile2, ofstream::app);
    if (!timeOut || !spaceOut)
    {
        ERROR;
    }

    // Run 1: input order as received.
    const clock_t randomBegin = clock();
    heapSort(work, totalWord);
    const clock_t randomEnd = clock();

    // The memory figure is taken after the first run only.
    spaceOut << "堆排序\t\t" << heapMemory / 1024 << "KB" << endl << endl;

    // Run 2: input already in sorted order.
    const clock_t sortedBegin = clock();
    heapSort(work, totalWord);
    const clock_t sortedEnd = clock();

    // Run 3: input prepared by reversequickSort.
    reversequickSort(work, 0, totalWord - 1);
    const clock_t reversedBegin = clock();
    heapSort(work, totalWord);
    const clock_t reversedEnd = clock();

    timeOut << "堆排序\t\t" << elapsed(randomBegin, randomEnd) << "s\t\t\t"
            << elapsed(sortedBegin, sortedEnd) << "s\t\t\t" << elapsed(reversedBegin, reversedEnd) << "s\n\n";

    delete[] work;
    // Both streams are flushed and closed by their destructors.
}
6. 使用说明
在程序文件同一目录下的“文章输入.txt”中输入所要进行操作的文章。
运行程序,待控制台显示由“正在运行中…”转变成“已完成!”时,分词、统计、排序等操作便一并完成,并以文件形式保存在同一目录下,通过文件名称即可查看相应操作结果。
7. 程序具体实现
点击下方链接即可 ^ ^
此为链接,可查看/下载