用C++实现对txt文件中的单词词频的统计

最新推荐文章于 2024-04-29 19:13:51 发布

DChuancey

最新推荐文章于 2024-04-29 19:13:51 发布

阅读量847

点赞数 1

分类专栏：小工具文章标签： c++ 开发语言 visual studio

本文链接：https://blog.csdn.net/NGU_Jq/article/details/121287018

版权

小工具专栏收录该内容

1 篇文章 0 订阅

订阅专栏

代码在CentOS 7上编译运行通过；由于使用了POSIX的<unistd.h>（getopt）解析命令行参数，Windows下无法直接编译运行。统计规则如下：

- 形如to-night用一个连字符连起来的单词记为一个单词

- 在行尾用连字符连起来的单词算一个单词，例如：

xxxxxxxxxxxxxxxxxxx beauti-

ful

beautiful算一个单词

- 多个连字符相连的视为多个单词，例如it---is---a---dog，视为it is a dog共计为4个单词

- 文件路径名不可超过1024个字符

- 使用-h参数查看使用帮助

下面是代码:

#include <ctype.h>
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>

// Word -> occurrence count. process_data() increments an entry for every
// completed word and main() adds the final pending word, if any.
std::map<const std::string, int> data;
// (word, count) entries copied out of `data` in main() so they can be sorted
// with cmp() before printing.
std::vector<std::pair<std::string, int> > outputlist;
// Capacity in bytes of the `path` buffer below.
#define PATH_LEN 1024
// Size in bytes of the read buffer used by calculate(); the carry-over buffer
// `segment` is declared as twice this size.
#define BUFF_SIZE 1024

// Input file name, filled from the -s option by init_arguments().
char path[PATH_LEN];
// Value of the -m option; main() redirects stdout to output.txt when it is
// non-zero and prints to the existing stdout when it is 0.
int output_mode = 0;

// Prints the command-line usage summary to stdout.
//
// The -m description states what main() actually does: mode 0 keeps the
// report on the command line, a non-zero mode (1) writes it to a file. The
// previous text had the two meanings swapped.
static void show_help_message() {
    printf(
        "Usage:\n-s 文件名，\n-m 输出格式,0表示命令行 1表示文件\n"
        "-h 显示帮助文档\n\n文件名不能超过1024个字符\n");
}

// Parses the command line with getopt() and fills the global settings.
//
//   -s <file>  input file name, copied into `path`
//   -m <mode>  output mode, stored in `output_mode` via atoi()
//   -h         print the usage text
//
// argc/argv are pointers to main()'s argument count and vector.
// Returns 0 when a file name was supplied, -1 otherwise (unknown option,
// over-long file name, or no -s given).
static int init_arguments(int* argc, char*** argv) {
    const char* regex = "s:m:h";  // getopt option string, not a regular expression
    int opt;
    bool help_shown = false;
    while ((opt = getopt(*argc, *argv, regex)) != -1) {
        switch (opt) {
            case 's':
                // Reject names that would not fit (including the terminating
                // NUL) instead of letting strcpy() overrun `path`.
                if (strlen(optarg) >= sizeof(path)) {
                    fprintf(stderr, "file name is too long!\n");
                    return -1;
                }
                strcpy(path, optarg);
                break;
            case 'm':
                output_mode = atoi(optarg);
                break;
            case 'h':
                show_help_message();
                help_shown = true;
                break;
            default:
                fprintf(stderr, "Please use -h to get help infomation\n");
                return -1;
        }
    }
    // `path` is an array, so only emptiness needs checking.
    if (path[0] == '\0') {
        // After -h alone the usage text has already been printed; do not
        // follow it with a complaint about the missing file name.
        if (!help_shown) {
            fprintf(stderr, "file name is not given!\n");
        }
        return -1;
    }
    return 0;
}

// Word that is still being assembled when a chunk ends. It is kept
// lower-cased and is carried over to the next process_data() call; main()
// counts whatever remains here after the last chunk.
char segment[BUFF_SIZE * 2];
// Number of valid bytes currently stored in `segment`.
int pre_segment_len = 0;

// Splits one chunk of input into words and counts them in `data`.
//
// words: chunk bytes (need not be NUL-terminated); len: number of bytes.
// A word may span several chunks: the unfinished part stays in
// `segment`/`pre_segment_len` between calls.
//
// Rules applied, one input byte at a time:
//   - letters (isalpha) extend the current word, lower-cased;
//   - a single '-' between two letters is kept ("to-night");
//   - "-" followed by CR/LF and then a letter joins the two halves without
//     the hyphen ("beauti-\nful" -> "beautiful");
//   - any other byte ends the current word, so "it---is" is two words.
//
// Words longer than the buffer are truncated rather than overflowing it.
static void process_data(const char* words, const int len) {
    // What has been seen since the last letter of the current word.
    enum HyphenState {
        NO_HYPHEN,       // nothing pending
        HYPHEN_SEEN,     // exactly one '-' directly after a letter
        HYPHEN_WRAPPED   // '-' followed by one or more CR/LF characters
    };
    // Persists across calls because a hyphen can be the last byte of a chunk.
    static HyphenState pending = NO_HYPHEN;
    // Leave one byte free: main() writes a terminator at segment[pre_segment_len].
    const int max_word_len = static_cast<int>(sizeof(segment)) - 1;

    for (int i = 0; i < len; ++i) {
        // <ctype.h> functions are only defined for unsigned char values.
        const unsigned char ch = static_cast<unsigned char>(words[i]);

        if (isalpha(ch)) {
            // An in-word hyphen is kept; a line-wrap hyphen is dropped.
            if (pending == HYPHEN_SEEN && pre_segment_len < max_word_len) {
                segment[pre_segment_len++] = '-';
            }
            pending = NO_HYPHEN;
            if (pre_segment_len < max_word_len) {
                segment[pre_segment_len++] = static_cast<char>(tolower(ch));  // ignore case
            }
            continue;
        }

        if (pre_segment_len == 0) {
            continue;  // separator outside of any word
        }
        if (ch == '-' && pending == NO_HYPHEN) {
            pending = HYPHEN_SEEN;  // decide once the next byte is known
            continue;
        }
        if ((ch == '\r' || ch == '\n') && pending != NO_HYPHEN) {
            pending = HYPHEN_WRAPPED;  // hyphen at end of line
            continue;
        }

        // Any other byte terminates the word; a pending hyphen is discarded.
        data[std::string(segment, pre_segment_len)]++;
        pre_segment_len = 0;
        pending = NO_HYPHEN;
    }
}

// Reads the file named by `path` in BUFF_SIZE-byte chunks and feeds every
// non-empty chunk to process_data().
//
// Returns 0 when the whole file was read, -1 when the file cannot be opened
// or a read error occurs. The file is always closed before returning.
static int calculate() {
    FILE* fp = fopen(path, "r");
    if (fp == NULL) {
        fprintf(stderr, "文件名无效\n");
        return -1;
    }
    char buff[BUFF_SIZE];
    int status = 0;
    for (;;) {
        const size_t got = fread(buff, 1, sizeof(buff), fp);
        if (got > 0) {
            process_data(buff, static_cast<int>(got));
        }
        if (got < sizeof(buff)) {
            // A short read means end of file or an error; tell them apart so
            // a failing stream cannot keep the loop spinning.
            if (ferror(fp)) {
                fprintf(stderr, "failed to read file\n");
                status = -1;
            }
            break;
        }
    }
    fclose(fp);
    return status;
}

// Ordering predicate for sorting (word, count) entries: higher counts come
// first; entries with equal counts are ordered alphabetically so the result
// is deterministic.
//
// The parameters use the same pair type as the entries being sorted, so a
// comparison binds directly to the elements instead of building temporary
// pairs (and copying both strings) on every call.
bool cmp(const std::pair<std::string, int>& a,
         const std::pair<std::string, int>& b) {
    if (a.second != b.second)
        return a.second > b.second;
    return a.first < b.first;
}

// Program entry point: parse options, count the words of the input file and
// print "word:count" lines sorted with cmp().
//
// When output_mode is non-zero the report is written to output.txt instead
// of the existing stdout. Returns 0 on success and -1 when argument parsing,
// reading the input, or opening output.txt fails.
int main(int argc, char** argv) {
// Debug builds skip option parsing and use a fixed input file.
#ifndef _DEBUG
    if (init_arguments(&argc, &argv) == -1)
        return -1;

    printf("path:%s\nmode:%d\n", path, output_mode);
#else
    strcpy(path, "/home/chuancey/homework/doc/100M.txt");
    output_mode = 1;
    puts("DEBUG!\n");
#endif
    if (calculate() == -1)
        return -1;

    // A word that was still pending when the input ended has not been
    // counted yet; count it now, ignoring case.
    if (pre_segment_len > 0) {
        std::string tail(segment, pre_segment_len);
        for (std::string::size_type i = 0; i < tail.size(); ++i) {
            tail[i] = static_cast<char>(
                tolower(static_cast<unsigned char>(tail[i])));
        }
        data[tail]++;
    }

    if (output_mode) {
        // Without this check a failed redirect would silently lose the report.
        if (freopen("output.txt", "w", stdout) == NULL) {
            fprintf(stderr, "cannot open output.txt for writing\n");
            return -1;
        }
    }

    // size() is an unsigned type wider than int on 64-bit targets, so it is
    // converted explicitly to match the format specifier.
    printf("words number: %lu\n", static_cast<unsigned long>(data.size()));

    // Copy the map entries into the vector so they can be sorted.
    outputlist.assign(data.begin(), data.end());
    std::sort(outputlist.begin(), outputlist.end(), cmp);

    typedef std::vector<std::pair<std::string, int> >::size_type index_type;
    for (index_type i = 0; i < outputlist.size(); ++i) {
        printf("%s:%d\n", outputlist[i].first.c_str(), outputlist[i].second);
    }
    return 0;
}