用c语言和python进行较大容量文件的中文文本词频统计

这篇博文是笔者在学术探讨时根据现实情况写的,可作为文本分析的参考;如果读者觉得还能以更高效率、更优良的算法方式实现,非常欢迎与笔者沟通。

一、词频统计
        操作对象:

                一个文件A存储关键词,一个目标检测文件B

        思路:

                1.将A文件手动整理成程序可识别的txt文件后存储起来

                代码:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000
#define Maxnum2 500
#define KeyWN 550
#define SN 1000
#define PN 3000
/* One fixed-size text slot: holds a single keyword line read from the file. */
typedef struct
{
    char word[Maxnum2];
}Words;
/* Integer counter wrapper (declared but not used by the code shown here). */
typedef struct
{
    int num;
}Num;
int main()
{
    Words keywords[KeyWN];              /* keyword table, one entry per line    */
    int i = 0;                          /* number of keywords actually loaded   */
    int nkey;                           /* final keyword count                  */
    char filename1[] = "keywords.csv";

    FILE *keywords_file = fopen(filename1, "r");
    if (keywords_file == NULL)
    {
        perror("Error opening file keywords");
        exit(1);
    }
    /* Read the whole keyword file into one heap buffer. */
    fseek(keywords_file, 0, SEEK_END);
    long file_size = ftell(keywords_file);
    rewind(keywords_file);
    char *file_contents = malloc(file_size + 1);
    if (file_contents == NULL)          /* bug fix: malloc result was unchecked */
    {
        perror("Error allocating memory");
        fclose(keywords_file);
        exit(1);
    }
    size_t nread = fread(file_contents, 1, file_size, keywords_file);
    file_contents[nread] = '\0';        /* bug fix: terminate at bytes actually read */
    fclose(keywords_file);

    /* Split the buffer on newlines; each line is one keyword. */
    char *line = strtok(file_contents, "\n");
    while (line != NULL && i < KeyWN)   /* bug fix: bound i so the table cannot overflow */
    {
        /* bug fix: bounded copy so an over-long line cannot overflow word[] */
        snprintf(keywords[i].word, sizeof keywords[i].word, "%s", line);
        line = strtok(NULL, "\n");
        i++;
    }
    nkey = i;
    (void)nkey;                         /* kept for symmetry with the combined program below */
    free(file_contents);
    return 0;
}

                2.在B文件中的统计区间统计词频

                B文件内容结构:

层次一(以“第”开头)
    文字描述
    层次二(以“1(...),2(...),...”开头)
        文字描述
        层次三(以“1-01(...),1-02(...),2-01(...),...”开头)
            文字描述
            层次四(以“1-01-01(...),2-03-02(...),3-02-01(...)”开头)
                文字描述
                层次五(以“1-01-01-01(...),2-01-02-03(...)”开头) //(最底层)
                    文字描述    //(统计)
                ...
            ...
        ...
    ...
    层次二
        文字描述
        层次三
            文字描述
            层次四
                文字描述
                层次五
                    文字描述(统计)
                ...
            ...
        ...
    ...
    层次二
        文字描述
        层次三
            文字描述
            层次四
                文字描述
                层次五
                    文字描述(统计)
                ...
            ...
        ...
    ...
层次一
...

                统计条件:只统计第五个层次对应的文字描述的词频

                代码:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000
#define Maxnum2 500
#define KeyWN 550
#define SN 1000
#define PN 3000
/* One fixed-size text slot: holds a keyword or a level-5 title line. */
typedef struct
{
    char word[500];
}Words;
/* Integer counter wrapper (declared but not used by the code shown here). */
typedef struct
{
    int num;
}Num;

    
/* Classify a line of file B.
 *
 * num == 0: return 1 when the line is a level 1-4 title
 *           ("1-01-01(...)", "1-01(...)", "1(...)" or a line starting
 *           with the UTF-8 character 第), 0 otherwise.
 * num != 0: return 1 only when the line is a level-5 (deepest) title,
 *           i.e. it looks like "1-01-01-01(...)".
 *
 * Bug fix: the original strcpy'd the line into a fixed char[SN] buffer,
 * which overflows for lines of SN bytes or more.  Reading the input
 * directly with length guards preserves the original behavior (the old
 * buffer was zero-filled past the copied string) without the overflow. */
int judge(char *sent, int num)
{
    size_t len = strlen(sent);
    /* Bytes past the end of the string read as '\0', exactly as in the
     * original zero-initialized copy buffer. */
    char c1 = len > 1 ? sent[1] : '\0';
    char c4 = len > 4 ? sent[4] : '\0';
    char c7 = len > 7 ? sent[7] : '\0';
    int last = (c1 == '-' && c4 == '-' && c7 == '-');   /* level-5 pattern */

    if (num)
        return last;                 /* num!=0: detect level-5 titles only */
    if (last)
        return 0;                    /* level-5 titles are not "other" titles */
    if (c1 == '-' || c1 == '(')
        return 1;                    /* level 2-4 titles: "n-..", "n(.." */
    /* Level-1 titles start with 第 (3 bytes in UTF-8). */
    if (strncmp(sent, "第", 3) == 0)
        return 1;
    return 0;
}

int main()
{
    /* NOTE(review): this step-2 fragment assumes the keyword table from
     * step 1 (keywords[], with i entries) is already in scope; see the
     * combined program below for the merged version. */
    FILE *write = fopen("outcome.csv", "w");    /* result table, one row per level-5 title */
    int i = 0;                                  /* keyword count (filled by step 1) */
    char filename2[] = "b_file.txt";            /* target file B */
    /* static: ~6.6 MB of counters would overflow the stack as a local;
     * static storage is also zero-initialized, replacing the old init loop */
    static int numca[Maxkeywords][KeyWN];
    static Words ca[Maxkeywords];               /* level-5 titles in order of appearance */

    FILE *fp = fopen(filename2, "r");           /* bug fix: was misspelled "filenam2" */
    if (fp == NULL || write == NULL)
    {
        perror("Error opening file target");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    long file_sizeof = ftell(fp);
    rewind(fp);
    char *target_content = malloc(file_sizeof + 1);
    if (target_content == NULL)                 /* bug fix: malloc was unchecked */
    {
        perror("Error allocating memory");
        exit(1);
    }
    fread(target_content, 1, file_sizeof, fp);
    target_content[file_sizeof] = '\0';         /* B text; a "####" line marks the end */
    char *paragraph = malloc(PN);               /* content gathered under the current level-5 title */
    if (paragraph == NULL)
    {
        perror("Error allocating memory");
        exit(1);
    }
    int flag[3] = {0}, k, j0, j1;
    int num = -1;                               /* index of the current level-5 title */
    char *sentence = strtok(target_content, "\n");   /* current line (points into target_content) */
    paragraph[0] = '\0';                        /* bug fix: '#' left the buffer unterminated (UB in strcat) */
    j0 = judge(sentence, 0);                    /* j0=1: level 1-4 title */
    j1 = judge(sentence, 1);                    /* j1=1: level-5 title   */
    while (sentence != NULL && strcmp(sentence, "####") != 0)
    {
        if (j0)
            flag[0] = 1, flag[1] = 0, flag[2] = 0;   /* upper title: stop collecting */
        else if (j1)
            flag[0] = 0, flag[1] = 1, flag[2] = 1;   /* level-5 title: start a new paragraph */
        else
            flag[0] = flag[1] = 0;                   /* plain content line */
        if (!j0 && !j1 && flag[2]                    /* content under a level-5 title */
            && strlen(paragraph) + strlen(sentence) < PN)   /* bug fix: bound strcat */
            strcat(paragraph, sentence);
        if (flag[1] && num + 1 < Maxkeywords)        /* record the new title */
        {
            num++;
            snprintf(ca[num].word, sizeof ca[num].word, "%s", sentence);
        }

        sentence = strtok(NULL, "\n");               /* next line (strtok skips blank lines) */
        if (sentence == NULL)
            break;                                   /* bug fix: strcmp(NULL, ...) crashed here */
        j0 = judge(sentence, 0), j1 = judge(sentence, 1);
        if (flag[2] && (j0 || j1) && num >= 0)       /* next line is a title: flush this paragraph */
        {
            fprintf(write, "\"%s\",", ca[num].word);
            for (k = 0; k < i; k++)                  /* count each keyword in the paragraph */
            {
                /* bug fix: the scratch malloc here leaked; a plain pointer walk suffices */
                char *ptr = paragraph;
                while ((ptr = strstr(ptr, keywords[k].word)) != NULL)
                {
                    numca[num][k]++;
                    ptr++;                           /* overlapping occurrences count too */
                }
                fprintf(write, "%d,", numca[num][k]);
            }
            fprintf(write, "\n");
            paragraph[0] = '\0';                     /* bug fix: memset cleared only one byte */
        }
    }
    if (flag[2] && num >= 0 && paragraph[0] != '\0') /* bug fix: the final paragraph was dropped */
    {
        fprintf(write, "\"%s\",", ca[num].word);
        for (k = 0; k < i; k++)
        {
            char *ptr = paragraph;
            while ((ptr = strstr(ptr, keywords[k].word)) != NULL)
            {
                numca[num][k]++;
                ptr++;
            }
            fprintf(write, "%d,", numca[num][k]);
        }
        fprintf(write, "\n");
    }
    free(paragraph);
    free(target_content);                            /* bug fix: was free(sentence) — not a malloc'd pointer */
    fclose(fp);
    fclose(write);
    return 0;
}
        全部代码:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000
#define Maxnum2 500
#define KeyWN 550
#define SN 1000
#define PN 3000
/* One fixed-size text slot: holds a keyword or a level-5 title line. */
typedef struct
{
    char word[500];
}Words;
/* Integer counter wrapper (declared but not used by the code shown here). */
typedef struct
{
    int num;
}Num;

    
/* Classify a line of file B.
 *
 * num == 0: return 1 when the line is a level 1-4 title
 *           ("1-01-01(...)", "1-01(...)", "1(...)" or a line starting
 *           with the UTF-8 character 第), 0 otherwise.
 * num != 0: return 1 only when the line is a level-5 (deepest) title,
 *           i.e. it looks like "1-01-01-01(...)".
 *
 * Bug fix: the original strcpy'd the line into a fixed char[SN] buffer,
 * which overflows for lines of SN bytes or more.  Reading the input
 * directly with length guards preserves the original behavior (the old
 * buffer was zero-filled past the copied string) without the overflow. */
int judge(char *sent, int num)
{
    size_t len = strlen(sent);
    /* Bytes past the end of the string read as '\0', exactly as in the
     * original zero-initialized copy buffer. */
    char c1 = len > 1 ? sent[1] : '\0';
    char c4 = len > 4 ? sent[4] : '\0';
    char c7 = len > 7 ? sent[7] : '\0';
    int last = (c1 == '-' && c4 == '-' && c7 == '-');   /* level-5 pattern */

    if (num)
        return last;                 /* num!=0: detect level-5 titles only */
    if (last)
        return 0;                    /* level-5 titles are not "other" titles */
    if (c1 == '-' || c1 == '(')
        return 1;                    /* level 2-4 titles: "n-..", "n(.." */
    /* Level-1 titles start with 第 (3 bytes in UTF-8). */
    if (strncmp(sent, "第", 3) == 0)
        return 1;
    return 0;
}

int main()
{
    FILE *write=fopen("outcome.csv","w");
    int line_num=0;
    Words keywords[KeyWN];
    int numkey=0,i=0;
    int nkey;
    char filename1[]="keywords.csv";
    FILE* keywords_file=fopen(filename1,"r");
    if( keywords_file==NULL)
	{
		perror("Error opening file keywords");
		exit(1);
	}
    fseek( keywords_file,0,SEEK_END);
    long file_size=ftell( keywords_file);
    rewind( keywords_file); 
    char *file_contents=(char *)malloc(file_size+1);
    fread(file_contents,1,file_size, keywords_file);
    file_contents[file_size]='\0';
    fclose( keywords_file);
    fprintf(write,",");
    char *line=strtok(file_contents,"\n");
	while(line!=NULL)
    {
        fprintf(write,"\"%s\"",line);//使得较长的字符串输出进excel文件的一个单元格当中
        fprintf(write,",");//表示换格
        strcpy(keywords[i].word,line);
        line=strtok(NULL,"\n");//获取下一行
        i++;
    }
    fprintf(write,"\n");
    nkey=i;
    free(file_contents);

    char filename2[]="b_file.txt";
    int numca[Maxkeywords][KeyWN];
    Words ca[Maxkeywords];
    for(int m=0;m<Maxkeywords;m++)
    {
        for(int mn=0;mn<i;mn++)
            {
            numca[m][mn]=0;
            }
    }

    FILE *fp=fopen(filenam2,"r");
    if(fp==NULL||write==NULL)
    {
        perror("Error opening file target");
        exit(1);
    }
    fseek( fp,0,SEEK_END);
    long file_sizeof=ftell(fp);
    rewind( fp);
    char *target_content=(char *)malloc(file_sizeof+1);
    fread(target_content,1,file_sizeof, fp);
    target_content[file_sizeof]='\0';
    char *sentence=(char*)malloc(sizeof(char)*SN);
    char *paragraph=(char*)malloc(sizeof(char)*3000);
    int flag[3]={0},k=0,j1,j0;
    int num=-1,start=0,door=0;
    int size,size1,s1=0,s2=0;
    sentence=strtok(target_content,"\n");
    paragraph[0]=' ';
    j0=judge(sentence,0),j1=judge(sentence,1);
    while(strcmp(sentence,"####"))
    {   
        if( j0 ) 
        {
            flag[2]=0;
        }
        else 
        {    
        if( !j0 && j1 )        
        {
            flag[2]=1,door=1;
        }
        else flag[0]=flag[1]=0;
        }
        if( !j0 &&!j1 &&flag[2] )
        {
            if(sentence!=NULL&&paragraph!=NULL)   strcat(paragraph,sentence);
        }
        if(flag[1]&&sentence!=NULL) 
        {   
            num++;
            strcpy(ca[num].word ,sentence);
        }
        
        sentence=strtok(NULL,"\n");//获取下一行内容
        if(sentence==NULL) sentence=strtok(NULL,"\n");
        if(!strcmp(sentence,"####") ) break;
        j0=judge(sentence,0),j1=judge(sentence,1);
        if( (j0&&flag[2]) ||(!j0&&j1&&flag[2]))
        {
            fprintf(write,"\"%s\",",ca[num].word);
            for( k=0;k<i;k++)//统计该区间关键词频数
            {
            char *ptr=(char*)malloc(sizeof(char)*3000);
            ptr=paragraph;
            while((ptr=strstr(ptr,keywords[k].word))!=NULL)
            {
                numca[num][k]++;
                ptr++;
            }
            fprintf(write,"%d,",numca[num][k]);
            }
            fprintf(write,"\n");
            memset(paragraph,'\0',sizeof(*paragraph));
        }
    }
    free(paragraph);
    free(sentence);
    fclose(fp);
    return 0;
}
二、使用jieba中文分词器
实现步骤:

1)途径一:使用lightly在线编译器,用python进行jieba的安装、调用和相关代码编写,使用lightly可以跳过个人亲自下载jieba文件,因为lightly提供可以在该平台上方便安装和调用的jieba库

2)途径二:对python不熟悉可以使用C语言,此种途径需要自行下载jieba-c,具体步骤如下

1. 首先,需要下载jieba-c的源代码。先下载python,若是已经安装python中没有setuptools需要自行下载;后下载jieba-c  (https://github.com/yanyiwu/cppjieba),或者本篇文章“资源绑定”上下载已经附载的jiebazip(我是从github上下载来的)

2. 下载源代码后,解压缩并进入源代码目录。

3. 在python终端中,使用以下命令安装jieba:
     

pip install jieba

4. 安装完成后,在python代码中使用jieba:
   

import jieba

5. 可以创建一个Jieba分词的对象,并使用它进行分词:

对目标文本每一行进行分词代码:
 

import jieba
#只保留汉字
def remove_special_chars(lst):
    """Keep only the items of lst that are strings made purely of Chinese
    characters (CJK range U+4E00..U+9FA5); everything else is dropped."""
    import re  # bug fix: this snippet never imported re
    pattern = '[^\u4e00-\u9fa5]'  # matches any non-Chinese character
    result = []
    for item in lst:
        # keep the item only when it contains no non-Chinese character
        if isinstance(item, str) and not re.search(pattern, item):
            # bug fix: was new_lst.append(...) — new_lst was never defined
            # (the list was initialized as `result`), raising NameError
            result.append(re.sub(pattern, ' ', item))
    return result
 #获取要分解的内容
 with open("segmentation_tar.csv","r") as f:
    lines=f.readlines()
    #对每一行进行分词并将每一个词输入到excel的一个格子里面
    seg_list=jieba.cut(line)
    seg_list=remove_speical_chars(seg_list)
    for i,item in enumerate(seg_list) :
        df.loc[len(df) - 1, i] = item
df.to_excel('segmentation.xlsx', index=False, engine='xlsxwriter')

    

对B文件当中最后一层title对应的内容进行分词(B文件内容结构参考“一、词频统计”):

import jieba
import pandas as pd
import re
def convert_to_string(textline):
    """Return textline unchanged when it is already a string; otherwise
    join its elements into one string separated by single spaces."""
    return textline if isinstance(textline, str) else ' '.join(textline)
def remove_special_chars(lst):
    """Filter lst down to the strings that contain no punctuation,
    digits or newline characters; non-strings are dropped."""
    special = r'[^\w\s]|[\d]+|\n'  # pattern of characters to reject
    kept = []
    for entry in lst:
        if not isinstance(entry, str):
            continue
        # keep the entry only when none of the special characters occur
        if re.search(special, entry) is None:
            kept.append(re.sub(special, ' ', entry))
    return kept

# --- Segment the content under every level-5 title of file B ---
# Each level-5 title becomes one spreadsheet row: the title goes into the
# 'title' column, followed by the jieba tokens of the text collected
# under it, one token per cell.
#
# Bug fixes vs. the original:
#  * lines.index(line) returned the FIRST occurrence of a line's text, so
#    any duplicated line (common for "文字描述") broke the lookahead —
#    replaced by a plain streaming pass with no index lookups.
#  * line == "####" was never true because readlines() keeps the trailing
#    '\n' — lines are now stripped up front.
#  * lines[index + 1] raised IndexError on the last line — no lookahead
#    is needed at all: a paragraph is flushed when the NEXT title starts.
#  * the final paragraph was silently dropped — flushed after the loop.

def _is_last_title(s):
    """Level-5 title: looks like 1-01-01-01(...)."""
    return len(s) > 8 and s[1] == '-' and s[4] == '-' and s[7] == '-'

def _is_other_title(s):
    """Any level 1-4 title: 1-01-01(...), 1-01(...), 1(...) or 第...."""
    if _is_last_title(s):
        return False
    if len(s) > 2 and s[1] == '-':
        return True
    if len(s) > 3 and '0' <= s[0] <= '9' and s[1] == '(':
        return True
    return len(s) > 0 and s[0] == '第'

with open("b-file", "r") as f:
    # keep only non-blank lines, stripped of newlines, so "####" and the
    # title patterns compare correctly
    lines = [ln.rstrip('\n') for ln in f if ln.strip()]

df = pd.DataFrame(columns=['title'])
title_line = ""
textline = []       # content lines collected under the current level-5 title
collecting = False  # True while we are inside a level-5 paragraph

def _flush():
    """Emit the current title and its jieba tokens as one row of df."""
    global title_line, textline
    df.loc[len(df), 'title'] = title_line
    if textline:
        tokens = remove_special_chars(
            list(jieba.cut(convert_to_string(textline))))  # segment, keep Chinese words
        for col, word in enumerate(tokens):
            df.loc[len(df) - 1, col] = word
    title_line = ""
    textline = []

for line in lines:
    if line == "####":          # explicit end marker of file B
        break
    if _is_last_title(line):
        if collecting:          # a new level-5 title closes the previous paragraph
            _flush()
        collecting = True
        title_line = line
    elif _is_other_title(line):
        if collecting:          # an upper-level title also closes the paragraph
            _flush()
        collecting = False
    elif collecting:            # plain content under the current level-5 title
        textline.append(line)
if collecting:
    _flush()                    # emit the last paragraph of the file

df.to_excel('segmentation.xlsx', index=False, engine='xlsxwriter')

最后可得到存储分割后词语的excel表segmentation.xlsx

  • 5
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值