Python中文语料批量预处理手记jieba

最新推荐文章于 2024-07-27 12:20:46 发布

weixin_33872660

最新推荐文章于 2024-07-27 12:20:46 发布

阅读量663

点赞数

文章标签： python 人工智能 数据库

原文链接：https://yq.aliyun.com/articles/495523

版权

阅读目录

手记实用系列文章：

1 结巴分词和自然语言处理HanLP处理手记

2 Python中文语料批量预处理手记

3 自然语言处理手记

4 Python中调用自然语言处理工具HanLP手记

5 Python中结巴分词使用手记

语料预处理封装类：

 
         #coding=utf-8 
        
         import os 
        
         import jieba 
        
         import sys 
        
         import re 
        
         import time 
        
         import jieba.posseg  
         as  
         pseg 
        
         # Add the parent directory ("../") to the module search path.
         sys.path.append( 
         "../" 
         ) 
        
         jieba.load_userdict( 
         "../Database/userdict.txt" 
         ) # load the custom segmentation dictionary 
        
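         """
         Title: text-corpus preprocessing with jieba segmentation -- a single-file
         processor and a batch (folder) processor.

         1. Walk the source texts.
         2. Recreate the directory structure of the source texts for the output.
         3. Run jieba segmentation and stop-word removal on each text.
         4. Normalise the preprocessed result and save it under the original
            file/folder layout.

         Author: 白宁超 (Bai Ningchao)
         Blog: http://www.cnblogs.com/baiboy/
         """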
        
         def cutTxtWord(dealpath, savepath, stopwordspath):
             """Segment one text file with POS tags, drop stop words, and save it.

             Args:
                 dealpath: Path of the UTF-8 text file to preprocess.
                 savepath: Path the preprocessed result is saved to (passed on to
                     getFlag, which filters by part of speech and writes the file).
                 stopwordspath: Path of the UTF-8 stop-word list, one entry per line.
             """
             # Read the stop words inside a context manager so the handle is closed;
             # a set gives the same membership test as the former dict keys.
             with open(stopwordspath, "r", encoding='utf-8') as stop_file:
                 stopwords = {line.rstrip() for line in stop_file}

             with open(dealpath, "r", encoding='utf-8') as f:
                 txtlist = f.read()  # whole text to process

             words = pseg.cut(txtlist)  # (word, flag) pairs with POS tags

             # Keep every non-stop word as "word/flag " and join once instead of
             # growing a string with += inside the loop.
             cutresult = "".join(
                 word + "/" + flag + " "
                 for word, flag in words
                 if word not in stopwords
             )
             getFlag(cutresult, savepath)
        
         def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
             """Segment every file of every category folder and save the results.

             Each sub-folder of ``read_folder_path`` is treated as one category. Every
             file inside it is read as UTF-8, segmented with POS tags, stripped of
             stop words, and handed to getFlag, which saves it under the same
             ``<folder>/<file>`` name below ``write_folder_path``.

             Args:
                 read_folder_path: Root folder holding the category sub-folders.
                 write_folder_path: Root folder the preprocessed files are saved under.
                 stopwordspath: Path of the UTF-8 stop-word list, one entry per line.
             """
             # Read the stop words inside a context manager so the handle is closed.
             with open(stopwordspath, "r", encoding='utf-8') as stop_file:
                 stopwords = {line.rstrip() for line in stop_file}

             # Loop over the categories found under the root folder.
             for folder in os.listdir(read_folder_path):
                 new_folder_path = os.path.join(read_folder_path, folder)
                 # Stray files in the root are not categories; listing them would raise.
                 if not os.path.isdir(new_folder_path):
                     continue

                 # Build the output folder with os.path.join so the directory that is
                 # created is the same one the files are later saved into, even when
                 # write_folder_path has no trailing separator.
                 save_folder_path = os.path.join(write_folder_path, folder)
                 if not os.path.exists(save_folder_path):
                     os.makedirs(save_folder_path)
                     print(save_folder_path + ' 创建成功')

                 print('--> 请稍等，正在处理中...')

                 # Loop over the files of this category. The former per-folder counter
                 # could never exceed len(files), so it was removed as dead code.
                 for file_name in os.listdir(new_folder_path):
                     dealpath = os.path.join(new_folder_path, file_name)
                     with open(dealpath, "r", encoding='utf-8') as f:
                         txtlist = f.read()

                     words = pseg.cut(txtlist)  # (word, flag) pairs with POS tags

                     # One text: "word/flag " for every word that is not a stop word.
                     cutresult = "".join(
                         word + "/" + flag + " "
                         for word, flag in words
                         if word not in stopwords
                     )
                     savepath = os.path.join(save_folder_path, file_name)
                     getFlag(cutresult, savepath)
        
         def getFlag(cutresult, savepath):
             """Filter tokens by part of speech, strip the tags, and save the text.

             Args:
                 cutresult: str of space-separated "word/flag" tokens (the initial
                     segmentation result); newlines separate paragraphs.
                 savepath: Path of the file the result is written to (by standdata).
             """
             # POS markers to drop, as they appear inside a "word/flag" token.
             # NOTE(review): the test below is a substring match, so e.g. "/d" also
             # drops any token containing "/df" -- confirm this is intended.
             cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]

             txtlist = []  # tokens that survive the POS filter
             for line in cutresult.split('\n'):
                 tokens = line.split(' ')
                 tokens.append("\n")  # keep the original paragraph breaks
                 # Single filtering pass instead of list.remove() per dropped token.
                 txtlist.extend(
                     seg for seg in tokens
                     if not any(K in seg for K in cixing)
                 )

             # Strip the POS tag: keep the text before the first "/" plus a space;
             # tokens without a "/" (empty strings, "\n") pass through unchanged.
             pieces = []
             for v in txtlist:
                 word, slash, _ = v.partition("/")
                 pieces.append(word + " " if slash else v)

             standdata("".join(pieces), savepath)
        
         def standdata(flagresult, savepath):
             """Normalise the filtered text and write it to ``savepath`` as UTF-8.

             Lines shorter than two characters are dropped. In every other line,
             runs of whitespace are collapsed and the words are re-joined with
             "/ ", followed by a space and a newline.

             Args:
                 flagresult: The POS-filtered text, paragraphs separated by "\\n".
                 savepath: Path of the output file (overwritten if it exists).
             """
             # The context manager closes the file even if a write fails.
             with open(savepath, "w", encoding='utf-8') as f2:
                 for line in flagresult.split('\n'):
                     if len(line) >= 2:
                         line_clean = "/ ".join(line.split())
                         f2.write(line_clean + " " + "\n")
        
         if __name__ == '__main__':
             # perf_counter is monotonic, so the measured duration cannot be skewed
             # by system clock adjustments the way time.time() can.
             t1 = time.perf_counter()

             # Paths for testing a single file.
             dealpath = "../Database/SogouC/FileTest/1.txt"
             savepath = "../Database/SogouCCut/FileTest/1.txt"

             stopwordspath = '../Database/stopwords/CH_stopWords.txt'
             stopwordspath1 = '../Database/stopwords/HG_stopWords.txt'  # HIT stop-word list

             # Root folder of the files to process in batch.
             # rfolder_path = '../Database/SogouC/Sample/'
             rfolder_path = '../Database/SogouC/FileNews/'
             # Root folder the segmented results are saved under.
             wfolder_path = '../Database/SogouCCut/'

             # Chinese corpus preprocessors.
             # cutTxtWord(dealpath,savepath,stopwordspath) # single-text preprocessor
             cutFileWord(rfolder_path, wfolder_path, stopwordspath)  # batch preprocessor

             t2 = time.perf_counter()
             print("中文语料语处理完成，耗时：" + str(t2 - t1) + "秒。")  # report the result

执行结果：

转自：https://www.cnblogs.com/baiboy/p/7676251.html

weixin_33872660

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
Python中文语料批量预处理手记jieba

阅读目录手记实用系列文章：语料预处理封装类：执行结果：手记实用系列文章：1结巴分词和自然语言处理HanLP处理手记2Python中文语料批量预处理手记3自然语言处理手记4Python中调用自然语言处理工具HanLP手记5Python中结巴分词使用手记语料预处理封装类：123456789...
复制链接

扫一扫