使用二叉搜索树（BST）和顺序搜索树（SST）实现文本词频统计（c++）

最新推荐文章于 2022-10-24 10:06:39 发布

Elylicery

最新推荐文章于 2022-10-24 10:06:39 发布

阅读量917

点赞数 1

分类专栏：一些小实战demo（c/c++/python）文章标签： C++ 二叉搜索树

本文链接：https://blog.csdn.net/sinat_38368658/article/details/87910058

版权

一些小实战demo（c/c++/python）专栏收录该内容

7 篇文章 1 订阅

订阅专栏

1. 定义数据结构：

定义二叉搜索树（Binary Search Tree）:

之前已经讲过：https://blog.csdn.net/sinat_38368658/article/details/87652790

BST.h

#ifndef BST_H_INCLUDED
#define BST_H_INCLUDED

#include<iostream>
#include<vector>
#include<string>

using namespace std;

// Binary search tree that maps unique keys to values.
//
// Key must be copyable and support operator== and operator<; Value must be
// copyable. The tree is not self-balancing and the helpers recurse once per
// tree level, so the recursion depth equals the height of the tree.
//
// The tree owns its nodes: they are released by the destructor, and copying a
// tree makes an independent deep copy.
template<typename Key,typename Value>
class BST{

private:
    // One (key, value) entry together with the links to its two subtrees.
    struct Node {
        Key key;
        Value value;
        Node *left;
        Node *right;

        Node(const Key& k,const Value& v)
            : key(k), value(v), left(NULL), right(NULL) {}
    };

    Node *root;   // root of the tree, NULL while the tree is empty
    int count;    // number of nodes currently stored

public:
    // Creates an empty tree.
    BST() : root(NULL), count(0) {}

    // Deep-copies other, so both trees own separate nodes.
    BST(const BST& other) : root(clone(other.root)), count(other.count) {}

    // Replaces the contents of this tree with a deep copy of other.
    BST& operator=(const BST& other){
        if(this != &other){
            // Copy first: if the copy throws, this tree is left untouched.
            Node *copy = clone(other.root);
            destroy(root);
            root = copy;
            count = other.count;
        }
        return *this;
    }

    // Frees every node of the tree.
    ~BST(){
        destroy(root);
    }

    // Returns the number of keys stored in the tree.
    int size() const{
        return count;
    }

    // Returns true when the tree holds no keys.
    bool isEmpty() const{
        return count==0;
    }

    // Inserts (key, value); if key is already present its value is overwritten.
    void insert(const Key& key,const Value& value){
        root = insert(root,key,value);
    }

    // Returns true when key is stored in the tree.
    bool contain(const Key& key) const{
        return contain(root,key);
    }

    // Returns a pointer to the value stored under key, or NULL when key is
    // absent. A pointer (rather than a reference) is returned so that a
    // missing key can be reported.
    Value* search(const Key& key){
        return search(root,key);
    }


private:
    // Inserts (key, value) into the subtree rooted at node and returns the
    // root of the resulting subtree.
    Node* insert(Node* node,const Key& key,const Value& value ){

        // Reached an empty link: this is where the new node belongs.
        if(node == NULL)
        {
            Node *created = new Node(key,value);
            count++;   // only counted once the allocation has succeeded
            return created;
        }

        if(key == node->key)
            node->value = value;
        else if(key<node->key)
            node->left = insert(node->left,key,value);
        else // key > node->key
            node->right = insert(node->right,key,value);

        return node;
    }

    // Returns true when the subtree rooted at node contains key.
    static bool contain(const Node* node,const Key& key){

        if(node==NULL)
            return false;

        if(key==node->key)
            return true;
        else if(key<node->key)
            return contain(node->left,key);
        else
            return contain(node->right,key);
    }

    // Returns a pointer to the value stored under key in the subtree rooted
    // at node, or NULL when key is absent.
    static Value* search(Node* node,const Key& key){

        if(node==NULL)
            return NULL;

        if(key==node->key)
            return &(node->value);
        else if(key<node->key)
            return search(node->left,key);
        else
            return search(node->right,key);
    }

    // Deletes every node of the subtree rooted at node (children first).
    static void destroy(Node* node){
        if(node==NULL)
            return;
        destroy(node->left);
        destroy(node->right);
        delete node;
    }

    // Returns a deep copy of the subtree rooted at node.
    static Node* clone(const Node* node){
        if(node==NULL)
            return NULL;

        Node *copy = new Node(node->key,node->value);
        try{
            copy->left = clone(node->left);
            copy->right = clone(node->right);
        }
        catch(...){
            // Release the partially built copy before reporting the failure.
            destroy(copy);
            throw;
        }
        return copy;
    }
};



#endif // BST_H_INCLUDED

定义顺序查找表（Sequential Search Table，下文简称 SST）:

我们的顺序查找表，内部本质是一个链表

SST.h

#ifndef SST_H_INCLUDED
#define SST_H_INCLUDED

//Sequence Search Tree

#include<iostream>
#include<cassert>

using namespace std;

//顺序查找表
//BINARY_SEARCH_TREE_SEARCH_SEQUENCEST
// Sequential search table: an unordered map from unique keys to values that is
// stored as a singly linked list and searched linearly from the head.
//
// Key must be copyable and support operator==; Value must be copyable.
//
// The table owns its nodes: they are released by the destructor, and copying a
// table makes an independent deep copy.
template<typename Key,typename Value>
class SequenceSt{

private:
    // One (key, value) entry of the list. The node type is private because
    // users of the table do not need to know how entries are stored.
    struct Node{
        Key key;
        Value value;
        Node *next;

        Node(const Key& k,const Value& v)
            : key(k), value(v), next(NULL) {}
    };

    Node* head;   // first node of the list, NULL while the table is empty
    int count;    // number of nodes currently stored

public:
    // Creates an empty table.
    SequenceSt() : head(NULL), count(0) {}

    // Deep-copies other, keeping the same node order.
    SequenceSt(const SequenceSt& other) : head(NULL), count(0)
    {
        try{
            Node **tail = &head;
            for(const Node *cur = other.head; cur != NULL; cur = cur->next){
                *tail = new Node(cur->key,cur->value);
                tail = &(*tail)->next;
                count++;
            }
        }
        catch(...){
            // The destructor does not run for a half-constructed object, so
            // release the nodes copied so far before reporting the failure.
            clear();
            throw;
        }
    }

    // Replaces the contents of this table with a deep copy of other.
    SequenceSt& operator=(const SequenceSt& other)
    {
        if(this != &other){
            // Copy first, then swap: if the copy throws, this table is left
            // untouched; the old nodes are freed when copy goes out of scope.
            SequenceSt copy(other);

            Node *oldHead = head;
            head = copy.head;
            copy.head = oldHead;

            int oldCount = count;
            count = copy.count;
            copy.count = oldCount;
        }
        return *this;
    }

    // Frees every node of the table.
    ~SequenceSt()
    {
        clear();
    }

    // Returns the number of keys stored in the table.
    int size() const{
        return count;
    }

    // Returns true when the table holds no keys.
    bool isEmpty() const{
        return count==0;
    }

    // Inserts (key, value); if key is already present its value is overwritten.
    void insert(const Key& key,const Value& value){

        Node *existing = find(key);
        if(existing != NULL){
            existing->value = value;
            return;
        }

        // New key: link the node in at the head of the list.
        Node *newNode = new Node(key,value);
        newNode->next = head;
        head = newNode;
        count++;
    }

    // Returns true when key is stored in the table.
    bool contain(const Key& key) const{
        return find(key) != NULL;
    }

    // Returns a pointer to the value stored under key, or NULL when key is
    // absent.
    Value* search(const Key& key){
        Node *node = find(key);
        return node != NULL ? &(node->value) : NULL;
    }

    // Removes the entry stored under key; does nothing when key is absent or
    // the table is empty.
    void remove(const Key& key){

        // link always points at the pointer that leads to the node under
        // inspection (head or some node's next), so the head node needs no
        // special case.
        Node **link = &head;
        while(*link != NULL && !(key == (*link)->key))
            link = &(*link)->next;

        if(*link == NULL)
            return;

        Node *delNode = *link;
        *link = delNode->next;
        delete delNode;
        count--;
    }

private:
    // Returns the node holding key, or NULL when key is absent.
    Node* find(const Key& key) const{
        for(Node *node = head; node != NULL; node = node->next)
            if(key == node->key)
                return node;
        return NULL;
    }

    // Deletes every node and leaves the table empty.
    void clear(){
        while(head != NULL){
            Node *node = head;
            head = head->next;
            delete node;
            count--;
        }
    }

};
#endif // SST_H_INCLUDED

2. 编写文件（统计文本）相关操作

简单分词规则：
找到文本中第一个是字母的位置，从该位置开始逐个遍历文本，将该字母位置到下一个不是字母的位置之间的这段字符串转为小写后，作为一个 string 类型的变量，放入 words（vector<string> &words）中

FileOps.h

#ifndef INC_04_BINARY_SEARCH_TREE_SEARCH_FILEOPS_H
#define INC_04_BINARY_SEARCH_TREE_SEARCH_FILEOPS_H

#include <cctype>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

// 文件相关操作
// File helpers for the word-frequency demo.
//
// The functions are defined in a header, so they are declared inline to keep
// the header usable from more than one translation unit.
namespace FileOps{

    // Returns the index of the first alphabetic character of s at or after
    // position start, or s.length() when there is none. A negative start is
    // treated as out of range and also yields s.length().
    inline int firstCharacterIndex(const std::string& s, int start){
        const int length = static_cast<int>(s.length());
        if( start < 0 )
            return length;
        for( int i = start ; i < length ; i ++ )
            // <cctype> functions require a value representable as unsigned
            // char; passing a negative char is undefined behaviour.
            if( std::isalpha(static_cast<unsigned char>(s[i])) )
                return i;
        return length;
    }

    // Returns a copy of s with every letter converted to lower case.
    inline std::string lowerS( const std::string& s){

        std::string ret;
        ret.reserve(s.length());
        for( std::string::size_type i = 0 ; i < s.length() ; i ++ )
            ret += static_cast<char>(std::tolower(static_cast<unsigned char>(s[i])));
        return ret;
    }

    // Reads the file called filename and appends every word it contains, in
    // lower case and in file order, to words.
    //
    // A word is a maximal run of alphabetic characters; everything else is a
    // separator. This tokenizer is deliberately simple and ignores many of
    // the special cases of real text processing - it is for demo use only.
    //
    // Returns true on success. If the file cannot be opened, prints a message
    // to standard output, leaves words untouched and returns false.
    inline bool readFile( const std::string &filename, std::vector<std::string> &words){

        std::ifstream file(filename.c_str());
        if( !file.is_open() ){
            std::cout<<"Can not open "<<filename<<" !!!"<<std::endl;
            return false;
        }

        // A line break is not alphabetic, so a word never spans two lines and
        // each line can be tokenized on its own without loading the whole
        // file into memory.
        std::string line;
        while( std::getline(file, line) ){
            const int length = static_cast<int>(line.length());
            int start = firstCharacterIndex(line, 0);
            while( start < length ){
                int end = start + 1;
                while( end < length && std::isalpha(static_cast<unsigned char>(line[end])) )
                    end ++;
                words.push_back( lowerS( line.substr(start, end - start) ) );
                start = firstCharacterIndex(line, end);
            }
        }

        return true;
    }

}

#endif //INC_04_BINARY_SEARCH_TREE_SEARCH_FILEOPS_H

3.编写主程序

main.cpp

#include<ctime>
#include<cassert>
#include<iostream>

#include"SST.h"
#include"BST.h"
#include"FileOps.h"

//测试二分搜索树和顺序查找表之间的性能差异
//二分搜索树的性能远远优于顺序查找表
int main()
{
    string filename="bible.txt";
    vector<string> words;
    if(FileOps::readFile(filename,words)){

        cout<<"There are totally"<<words.size()<<"words in"<<filename<<endl;
        cout<<endl;

        //测试BST
        time_t startTime=clock();

        //统计文章中所有词的词频
        BST<string,int>bst = BST<string,int>();
        for(vector<string>::iterator iter = words.begin();iter!=words.end();iter++){
            int *res=bst.search(*iter);
            if(res==NULL)
                bst.insert(*iter,1);
            else
                (*res)++;
        }

        //输出god一词出现的频率
        if(bst.contain("god"))
            cout<<"god:"<<*bst.search("god")<<endl;
        else
            cout<<"No word god in"<<filename<<endl;

        time_t endTime = clock();

        cout<<"BST,time:"<<double(endTime-startTime)/CLOCKS_PER_SEC<<"s."<<endl;


        //测试顺序查找表SST
        startTime=clock();

        SequenceSt<string,int> sst = SequenceSt<string,int>();
        for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
            int *res = sst.search(*iter);
            if (res == NULL)
                sst.insert(*iter, 1);
            else
                (*res)++;
       }

        // 输出圣经中god一词出现的频率
        if(sst.contain("god"))
            cout << "'god' : " << *sst.search("god") << endl;
        else
            cout << "No word 'god' in " << filename << endl;

        endTime = clock();

        cout << "SST , time: " << double(endTime - startTime) / CLOCKS_PER_SEC << " s." << endl;

     }

    return 0;
}

测试结果
在这里插入图片描述

Elylicery

关注

1
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
使用二叉搜索树（BST）和顺序搜索树（SST）实现文本词频统计（c++）

首先定义数据结构：定义二叉搜索树（Binary Search Tree）: 之前已经讲过：https://blog.csdn.net/sinat_38368658/article/details/87652790 BST.h #ifndef BST_H_INCLUDED #define BST_H_INCLUDED #include<iostream> #include<v...
复制链接

扫一扫