这篇博文是笔者在学术探讨中结合现实情况写成的,可作为文本分析的参考。如果读者认为还能以更高效、算法更优的方式实现,非常欢迎与笔者沟通。
一、词频统计
操作对象:
一个文件A存储关键词,一个目标检测文件B
思路:
1.将A文件手动整理成程序可识别的txt文件后存储起来
代码:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000   /* max number of bottom-level sections in file B */
#define Maxnum2 500        /* capacity of one keyword / title string */
#define KeyWN 550          /* max number of keywords read from file A */
#define SN 1000            /* line buffer size (see judge) */
#define PN 3000            /* capacity of one accumulated paragraph */
/* One fixed-size string slot: holds a keyword or a section title. */
typedef struct
{
char word[Maxnum2];
}Words;
/* Single counter (declared for symmetry; not used in this snippet). */
typedef struct
{
int num;
}Num;
/*
 * Step 1: read keywords.csv (file A, one keyword per line) into keywords[].
 *
 * Fixes vs. the original: malloc result checked; the buffer is terminated
 * at the number of bytes actually read (fread may return less than the
 * seek-based size on Windows text streams); the line-copy loop is bounded
 * both in entry count (KeyWN) and string length (snprintf), where the
 * original strcpy could overflow keywords[] on a long or oversized file.
 */
int main()
{
    Words keywords[KeyWN];            /* parsed keyword list, one per line */
    int i = 0;                        /* number of keywords read so far */
    int nkey;                         /* final keyword count (used by step 2) */
    char filename1[] = "keywords.csv";

    FILE *keywords_file = fopen(filename1, "r");
    if (keywords_file == NULL)
    {
        perror("Error opening file keywords");
        exit(1);
    }

    /* Slurp the whole file into one heap buffer. */
    fseek(keywords_file, 0, SEEK_END);
    long file_size = ftell(keywords_file);
    rewind(keywords_file);
    char *file_contents = malloc(file_size + 1);
    if (file_contents == NULL)        /* fix: malloc was unchecked */
    {
        perror("malloc");
        fclose(keywords_file);
        exit(1);
    }
    size_t nread = fread(file_contents, 1, file_size, keywords_file);
    file_contents[nread] = '\0';      /* fix: terminate at bytes actually read */
    fclose(keywords_file);

    /* Split on newlines, bounded in both dimensions. */
    char *line = strtok(file_contents, "\n");
    while (line != NULL && i < KeyWN)
    {
        snprintf(keywords[i].word, sizeof keywords[i].word, "%s", line);
        line = strtok(NULL, "\n");
        i++;
    }
    nkey = i;
    (void)nkey;                       /* consumed by the step-2 code */

    free(file_contents);
    return 0;
}
2.在B文件中的统计区间统计词频
B文件内容结构:
层次一(以“第”开头)
文字描述
层次二(以“1(...),2(...),...”开头)
文字描述
层次三(以“1-01(...),1-02(...),2-01(...),...”开头)
文字描述
层次四(以“1-01-01(...),2-03-02(...),3-02-01(...)”开头)
文字描述
层次五(以“1-01-01-01(...),2-01-02-03(...)”开头) //(最底层)
文字描述 //(统计)
...
...
...
...
层次二
文字描述
层次三
文字描述
层次四
文字描述
层次五
文字描述(统计)
...
...
...
...
层次二
文字描述
层次三
文字描述
层次四
文字描述
层次五
文字描述(统计)
...
...
...
...
层次一
...
统计条件:只统计第五个层次对应的文字描述的词频
代码:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000   /* max number of bottom-level sections in file B */
#define Maxnum2 500        /* nominal keyword / title capacity */
#define KeyWN 550          /* max number of keywords read from file A */
#define SN 1000            /* line buffer size (see judge) */
#define PN 3000            /* capacity of one accumulated paragraph */
/* One fixed-size string slot: holds a keyword or a section title. */
typedef struct
{
char word[500];
}Words;
/* Single counter (declared for symmetry; not used in this snippet). */
typedef struct
{
int num;
}Num;
/* SN is defined earlier in the file; this guard only matters when the
   function is compiled in isolation. */
#ifndef SN
#define SN 1000
#endif

/*
 * judge - classify one line of the B file by its heading level.
 *
 * num == 0: returns 1 when the line is a NON-bottom-level heading
 *           ("x-xx(", "x-xx-xx(", "x(", or a line starting with "第");
 *           returns 0 for content lines and bottom-level headings.
 * num != 0: returns 1 only for a bottom-level heading of the form
 *           "x-xx-xx-xx(...)" (dashes at offsets 1, 4 and 7); else 0.
 *
 * Fix vs. the original: the caller's string was copied into a fixed
 * 1000-byte stack buffer with strcpy(), overflowing the stack on long
 * lines. The copy is now bounded; behaviour for in-range input is
 * unchanged (bytes past the end of a short string still read as '\0').
 */
int judge(char *sent,int num)
{
    /* Zero-filled local copy so index 4/7 probes on short lines see '\0',
       exactly as in the original. */
    char sentence[SN] = {0};
    strncpy(sentence, sent, SN - 1);   /* bounded; last byte stays '\0' */

    /* First three bytes: one UTF-8 CJK character such as "第" is 3 bytes. */
    char ch[4] = { sentence[0], sentence[1], sentence[2], '\0' };

    int is_bottom = (sentence[1]=='-' && sentence[4]=='-' && sentence[7]=='-');

    if (num == 0)
    {
        if (is_bottom) return 0;            /* bottom-level title */
        if (sentence[1]=='-') return 1;     /* "x-xx(" or "x-xx-xx(" */
        if (sentence[1]=='(') return 1;     /* "x(" */
        if (strcmp(ch, "第") == 0) return 1;/* chapter heading */
        return 0;                           /* plain content line */
    }
    /* num != 0: true only for the bottom-level pattern. */
    return is_bottom ? 1 : 0;
}
/*
 * Step 2: scan file B and, for each bottom-level section, count every
 * keyword's occurrences in the section's content, writing one CSV row
 * per section ("title",count,count,...).
 *
 * NOTE: this excerpt assumes keywords[] and the keyword count i were
 * loaded by the step-1 code (see the combined program below).
 *
 * Fixes vs. the original excerpt: "filenam2" typo; undeclared `door`
 * removed (it was never read); ~7 MB of auto arrays moved to static
 * storage (stack overflow) which also zero-initialises numca — the old
 * init loop ran to i==0 and initialised nothing; unchecked mallocs;
 * paragraph NUL-terminated before the first strcat (it held garbage);
 * per-keyword malloc leak removed; memset(...,sizeof(*paragraph))
 * cleared one byte instead of resetting the paragraph; NULL `sentence`
 * no longer crashes strcmp; the final section before "####" is now
 * emitted; target_content is freed instead of free()ing a strtok
 * pointer (undefined behaviour).
 */
int main()
{
    FILE *write = fopen("outcome.csv", "w");      /* result spreadsheet */
    int i = 0;                                    /* keyword count, filled in step 1 */
    char filename2[] = "b_file.txt";              /* target file B */
    static int numca[Maxkeywords][KeyWN];         /* numca[section][keyword] = hits */
    static Words ca[Maxkeywords];                 /* section titles */

    FILE *fp = fopen(filename2, "r");             /* fix: typo "filenam2" */
    if (fp == NULL || write == NULL)
    {
        perror("Error opening file target");
        exit(1);
    }

    fseek(fp, 0, SEEK_END);
    long file_sizeof = ftell(fp);
    rewind(fp);
    char *target_content = malloc(file_sizeof + 1);
    if (target_content == NULL)
    {
        perror("malloc");
        exit(1);
    }
    size_t nread = fread(target_content, 1, file_sizeof, fp);
    target_content[nread] = '\0';                 /* B text; a "####" line marks the end */

    /* paragraph accumulates the content under the current bottom title. */
    char *paragraph = malloc(PN);
    if (paragraph == NULL)
    {
        perror("malloc");
        exit(1);
    }
    paragraph[0] = '\0';                          /* fix: buffer started uninitialised */

    /* sentence points into target_content via strtok — no separate malloc. */
    char *sentence = strtok(target_content, "\n");
    int flag[3] = {0};                            /* flag[1]: new section; flag[2]: inside one */
    int num = -1;                                 /* current section index */
    int j0, j1;                                   /* j0: non-bottom title; j1: bottom title */

    j0 = judge(sentence, 0), j1 = judge(sentence, 1);
    while (sentence != NULL && strcmp(sentence, "####"))
    {
        if (j0)
        {
            flag[0] = 1, flag[1] = 0, flag[2] = 0;   /* higher-level title ends collection */
        }
        else if (j1)
        {
            flag[0] = 0, flag[1] = 1, flag[2] = 1;   /* bottom title: open a section */
        }
        else
        {
            flag[0] = flag[1] = 0;                   /* plain content line */
        }

        if (!j0 && !j1 && flag[2])                   /* content inside a section */
        {
            /* NOTE(review): still assumes one section stays under PN bytes. */
            strcat(paragraph, sentence);
        }
        if (flag[1] && num + 1 < Maxkeywords)        /* remember the new section's title */
        {
            num++;
            snprintf(ca[num].word, sizeof ca[num].word, "%s", sentence);
        }

        sentence = strtok(NULL, "\n");               /* next line */
        if (sentence == NULL) break;                 /* fix: original crashed on NULL */
        j0 = judge(sentence, 0), j1 = judge(sentence, 1);

        /* Any new title closes the current section: emit title + counts. */
        if ((j0 && flag[2]) || (!j0 && j1 && flag[2]))
        {
            fprintf(write, "\"%s\",", ca[num].word);
            for (int k = 0; k < i; k++)              /* per-keyword frequency */
            {
                char *ptr = paragraph;               /* fix: was a leaked malloc */
                while ((ptr = strstr(ptr, keywords[k].word)) != NULL)
                {
                    numca[num][k]++;
                    ptr++;                           /* count overlapping matches */
                }
                fprintf(write, "%d,", numca[num][k]);
            }
            fprintf(write, "\n");
            paragraph[0] = '\0';                     /* fix: reset the whole paragraph */
        }
    }

    /* fix: the original dropped the last section (loop ends on "####"
       before the emit check); flush it here. */
    if (flag[2] && num >= 0)
    {
        fprintf(write, "\"%s\",", ca[num].word);
        for (int k = 0; k < i; k++)
        {
            char *ptr = paragraph;
            while ((ptr = strstr(ptr, keywords[k].word)) != NULL)
            {
                numca[num][k]++;
                ptr++;
            }
            fprintf(write, "%d,", numca[num][k]);
        }
        fprintf(write, "\n");
    }

    free(paragraph);
    free(target_content);                            /* fix: was never freed */
    fclose(fp);
    fclose(write);
    return 0;
}
全部代码:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define Maxkeywords 3000   /* max number of bottom-level sections in file B */
#define Maxnum2 500        /* nominal keyword / title capacity */
#define KeyWN 550          /* max number of keywords read from file A */
#define SN 1000            /* line buffer size (see judge) */
#define PN 3000            /* capacity of one accumulated paragraph */
/* One fixed-size string slot: holds a keyword or a section title. */
typedef struct
{
char word[500];
}Words;
/* Single counter (declared for symmetry; not used in this program). */
typedef struct
{
int num;
}Num;
/* SN is defined earlier in the file; this guard only matters when the
   function is compiled in isolation. */
#ifndef SN
#define SN 1000
#endif

/*
 * judge - classify one line of the B file by its heading level.
 *
 * num == 0: returns 1 when the line is a NON-bottom-level heading
 *           ("x-xx(", "x-xx-xx(", "x(", or a line starting with "第");
 *           returns 0 for content lines and bottom-level headings.
 * num != 0: returns 1 only for a bottom-level heading of the form
 *           "x-xx-xx-xx(...)" (dashes at offsets 1, 4 and 7); else 0.
 *
 * Fix vs. the original: the caller's string was copied into a fixed
 * 1000-byte stack buffer with strcpy(), overflowing the stack on long
 * lines. The copy is now bounded; behaviour for in-range input is
 * unchanged (bytes past the end of a short string still read as '\0').
 */
int judge(char *sent,int num)
{
    /* Zero-filled local copy so index 4/7 probes on short lines see '\0',
       exactly as in the original. */
    char sentence[SN] = {0};
    strncpy(sentence, sent, SN - 1);   /* bounded; last byte stays '\0' */

    /* First three bytes: one UTF-8 CJK character such as "第" is 3 bytes. */
    char ch[4] = { sentence[0], sentence[1], sentence[2], '\0' };

    int is_bottom = (sentence[1]=='-' && sentence[4]=='-' && sentence[7]=='-');

    if (num == 0)
    {
        if (is_bottom) return 0;            /* bottom-level title */
        if (sentence[1]=='-') return 1;     /* "x-xx(" or "x-xx-xx(" */
        if (sentence[1]=='(') return 1;     /* "x(" */
        if (strcmp(ch, "第") == 0) return 1;/* chapter heading */
        return 0;                           /* plain content line */
    }
    /* num != 0: true only for the bottom-level pattern. */
    return is_bottom ? 1 : 0;
}
int main()
{
FILE *write=fopen("outcome.csv","w");
int line_num=0;
Words keywords[KeyWN];
int numkey=0,i=0;
int nkey;
char filename1[]="keywords.csv";
FILE* keywords_file=fopen(filename1,"r");
if( keywords_file==NULL)
{
perror("Error opening file keywords");
exit(1);
}
fseek( keywords_file,0,SEEK_END);
long file_size=ftell( keywords_file);
rewind( keywords_file);
char *file_contents=(char *)malloc(file_size+1);
fread(file_contents,1,file_size, keywords_file);
file_contents[file_size]='\0';
fclose( keywords_file);
fprintf(write,",");
char *line=strtok(file_contents,"\n");
while(line!=NULL)
{
fprintf(write,"\"%s\"",line);//使得较长的字符串输出进excel文件的一个单元格当中
fprintf(write,",");//表示换格
strcpy(keywords[i].word,line);
line=strtok(NULL,"\n");//获取下一行
i++;
}
fprintf(write,"\n");
nkey=i;
free(file_contents);
char filename2[]="b_file.txt";
int numca[Maxkeywords][KeyWN];
Words ca[Maxkeywords];
for(int m=0;m<Maxkeywords;m++)
{
for(int mn=0;mn<i;mn++)
{
numca[m][mn]=0;
}
}
FILE *fp=fopen(filenam2,"r");
if(fp==NULL||write==NULL)
{
perror("Error opening file target");
exit(1);
}
fseek( fp,0,SEEK_END);
long file_sizeof=ftell(fp);
rewind( fp);
char *target_content=(char *)malloc(file_sizeof+1);
fread(target_content,1,file_sizeof, fp);
target_content[file_sizeof]='\0';
char *sentence=(char*)malloc(sizeof(char)*SN);
char *paragraph=(char*)malloc(sizeof(char)*3000);
int flag[3]={0},k=0,j1,j0;
int num=-1,start=0,door=0;
int size,size1,s1=0,s2=0;
sentence=strtok(target_content,"\n");
paragraph[0]=' ';
j0=judge(sentence,0),j1=judge(sentence,1);
while(strcmp(sentence,"####"))
{
if( j0 )
{
flag[2]=0;
}
else
{
if( !j0 && j1 )
{
flag[2]=1,door=1;
}
else flag[0]=flag[1]=0;
}
if( !j0 &&!j1 &&flag[2] )
{
if(sentence!=NULL&¶graph!=NULL) strcat(paragraph,sentence);
}
if(flag[1]&&sentence!=NULL)
{
num++;
strcpy(ca[num].word ,sentence);
}
sentence=strtok(NULL,"\n");//获取下一行内容
if(sentence==NULL) sentence=strtok(NULL,"\n");
if(!strcmp(sentence,"####") ) break;
j0=judge(sentence,0),j1=judge(sentence,1);
if( (j0&&flag[2]) ||(!j0&&j1&&flag[2]))
{
fprintf(write,"\"%s\",",ca[num].word);
for( k=0;k<i;k++)//统计该区间关键词频数
{
char *ptr=(char*)malloc(sizeof(char)*3000);
ptr=paragraph;
while((ptr=strstr(ptr,keywords[k].word))!=NULL)
{
numca[num][k]++;
ptr++;
}
fprintf(write,"%d,",numca[num][k]);
}
fprintf(write,"\n");
memset(paragraph,'\0',sizeof(*paragraph));
}
}
free(paragraph);
free(sentence);
fclose(fp);
return 0;
}
二、使用jieba中文分词器
实现步骤:
1)途径一:使用lightly在线编译器,用python进行jieba的安装、调用和相关代码编写,使用lightly可以跳过个人亲自下载jieba文件,因为lightly提供可以在该平台上方便安装和调用的jieba库
2)途径二:对python不熟悉可以使用C语言,此种途径需要自行下载jieba-c,具体步骤如下
1. 首先,需要下载jieba-c的源代码。先下载python,若是已经安装python中没有setuptools需要自行下载;后下载jieba-c (https://github.com/yanyiwu/cppjieba),或者本篇文章“资源绑定”上下载已经附载的jiebazip(我是从github上下载来的)
2. 下载源代码后,解压缩并进入源代码目录。
3. 在python终端中,使用以下命令安装jieba:
pip install jieba
4. 安装完成后,在python代码中使用jieba:
import jieba
5. 可以创建一个Jieba分词的对象,并使用它进行分词:
对目标文本每一行进行分词代码:
import jieba
#只保留汉字
def remove_special_chars(lst):
    """Keep only the items of `lst` that consist purely of Chinese characters.

    Non-string items and strings containing any non-CJK character are
    dropped; surviving strings are returned unchanged (the re.sub is a
    no-op because it only runs when no match exists).

    Fixes vs. the original: `re` is imported, and one accumulator list is
    used throughout — the original declared `result` but appended to and
    returned the undefined `new_lst`, raising NameError.
    """
    import re  # local import: the surrounding snippet does not import re
    pattern = '[^\u4e00-\u9fa5]'  # matches any character that is not a CJK ideograph
    result = []
    for item in lst:
        if isinstance(item, str) and not re.search(pattern, item):
            result.append(re.sub(pattern, ' ', item))
    return result
# Read the target text, segment every line with jieba, and write one word
# per Excel cell (one row per input line).
# Fixes vs. the original fragment: `df` is created before use, the lines are
# actually iterated (the original referenced a single undefined `line`), and
# the "remove_speical_chars" typo is corrected.
import pandas as pd

with open("segmentation_tar.csv", "r") as f:
    lines = f.readlines()

df = pd.DataFrame()
for row, line in enumerate(lines):
    seg_list = jieba.cut(line)
    seg_list = remove_special_chars(list(seg_list))  # keep only pure-CJK tokens
    for i, item in enumerate(seg_list):
        df.loc[row, i] = item  # one word per column in this line's row
df.to_excel('segmentation.xlsx', index=False, engine='xlsxwriter')
对B文件当中最后一层title对应的内容进行分词(B文件内容结构参考“一、词频统计”):
import jieba
import pandas as pd
import re
def convert_to_string(textline):
    """Return `textline` as a single string.

    A value that is already a string passes through untouched; any other
    iterable of strings is flattened with single-space separators.
    """
    if isinstance(textline, str):
        return textline
    return ' '.join(textline)
def remove_special_chars(lst):
    """Filter `lst` down to the string items that contain no special
    characters (punctuation, digits, or newlines).

    An item survives only when the pattern finds no match at all, in which
    case the re.sub is a no-op and the item is kept verbatim.
    """
    pattern = r'[^\w\s]|[\d]+|\n'  # punctuation, digit runs, or newlines
    kept = []
    for entry in lst:
        is_clean = isinstance(entry, str) and re.search(pattern, entry) is None
        if is_clean:
            kept.append(re.sub(pattern, ' ', entry))
    return kept
# Collect, per bottom-level title of the B file, the content lines that
# belong to it, then segment each collected block with jieba and write one
# Excel row per section (title in the first column, one word per cell).
#
# j0/j1 classify the CURRENT line and j00/j11 the NEXT line; flag is 1 while
# we are inside a bottom-level section. When the next line is any title, the
# accumulated textline is segmented and flushed.
#
# NOTE(review): indentation below is reconstructed from the control flow —
# the pasted source had lost it; verify against the original script.
j0=0
j1=0
flag=0
j00=0
j11=0
with open("b-file","r") as f:
    lines=f.readlines()
filtered_lines=[]
textline=[]    # content lines of the current section
title_line=[]  # title of the current section
next_line=[]
df = pd.DataFrame(columns=['title'])
# NOTE(review): readlines() keeps the trailing "\n", so `line == "####"`
# only matches when "####" is the final line without a newline — confirm the
# sentinel comparison against the real B file.
for line in lines:
    if line == "####":
        break
    if line == "\n":
        # Skip up to two consecutive blank lines.
        # NOTE(review): lines.index(line) returns the FIRST occurrence of the
        # string, so duplicate lines (blank lines especially) can make this
        # fetch the wrong neighbour.
        line= lines[lines.index(line) + 1]
        if line =="\n":
            line= lines[lines.index(line)+2]
    if line == "####":
        break
    if len(line)>8 and line[1]=='-' and line[4]=='-' and line[7]=='-':  # bottom-level title, e.g. "1-01-01-01(...)"
        j0=1
        j1=0
        flag=1
        title_line=line
    elif ((len(line) > 5) and line[1] == '-' and line[4] == '-') or (line[1] == '-') or (len(line)>3 and line[0] >= '0' and line[0] <= '9' and line[1] == '(') or line[0] == '第':  # any other title level
        j0=0
        j1=1
        flag=0
    else:  # plain content line
        j0=0
        j1=0
    if not j0 and not j1 and flag :  # content under the current bottom-level title
        textline.append(line)
    # Peek at the next line to detect the end of the section.
    next_line = lines[lines.index(line) + 1]
    if next_line == "\n":  # tolerate blank lines left in the B file
        next_line = lines[lines.index(line) + 2]
        if next_line == "\n":
            next_line = lines[lines.index(line) + 3]
    if len(next_line)>8 and next_line[1]=='-' and next_line[4]=='-' and next_line[7]=='-':  # next is a bottom-level title
        j00 = 1
        j11 = 0
    elif ( len(next_line)>5 and next_line[1]=='-' and next_line[4]=='-' ) or ( len(next_line)>2 and next_line[1]=='-') or ( len(next_line)>3 and next_line[0]>='0' and next_line[0]<='9' and next_line[1]=='(' ) or ( len(next_line)>2 and next_line[0]=='第') :  # next is any other title
        j00 = 0
        j11 = 1
    else:
        j00 = 0
        j11 = 0
    if (j00 or j11) and flag:
        # A new title follows: flush the current section.
        # Title goes into the 'title' column of a fresh row.
        df.loc[len(df),'title'] = title_line
        # Segment the accumulated content with jieba.
        if textline == []:
            seg_list = []
        else:
            textline=convert_to_string(textline)
            ori_list = [word for word in jieba.cut(textline)]  # jieba tokens as a list
            seg_list=remove_special_chars(ori_list)  # drop tokens with special characters
        for i,item in enumerate(seg_list) :  # one word per successive column of this row
            df.loc[len(df) - 1, i] = item
        # Reset the per-section accumulators.
        title_line = ""
        seg_list= []
        textline = []
df.to_excel('segmentation.xlsx', index=False, engine='xlsxwriter')
最后可得到存储分割后词语的excel表segmentation.xlsx