中文信息处理--分句

最新推荐文章于 2022-03-21 20:06:31 发布

beifeng600

最新推荐文章于 2022-03-21 20:06:31 发布

阅读量2.7k

点赞数 3

分类专栏： NLP_语言学 Python 开源项目文章标签： NLP 分句

本文链接：https://blog.csdn.net/beifeng600/article/details/47928567

版权

NLP_语言学同时被 3 个专栏收录

3 篇文章 0 订阅

订阅专栏

Python

3 篇文章 0 订阅

订阅专栏

开源项目

3 篇文章 0 订阅

订阅专栏

中文信息处理--分句

工欲善其事，必先利其器。本文介绍中文信息处理中的一个基础步骤：中文分句。

按“。！？…!?”这几个标点对中文进行分句时，一般会遇到一些问题：比如成对的标点《》“”‘’{}（）()【】""，如果其中包含分句标点，就会将完整的一句话拆分成几个句子。

以下是使用 Python 实现的中文分句程序，可以处理单个文件或者文件夹，但文件编码需为 UTF-8：

# coding=utf-8
# python

import io
import os
import sys

# Characters that terminate a sentence.
cutlist = u"。！？…!?"
# Paired punctuation, stored as consecutive (opener, closer) characters.
# A terminator that appears between an opener and its closer does not end
# the sentence.
punct_pair_str = u"《》“”‘’{}（）()【】\"\""
# Lookup table closer -> opener; filled on first use by ConstPunctPair().
punct_pair_hm = {}

# Running total of sentences written by handle_file() across all files.
sent_count = 0

def FindTok(char):
    """Return True if *char* is one of the sentence-ending marks in ``cutlist``.

    Returns False otherwise.
    """
    return char in cutlist

def CutSent(cut_str):
    """Split *cut_str* into sentences.

    A sentence ends at a terminator character (see ``FindTok``) that is not
    enclosed in paired punctuation tracked by ``AddPunct``. A terminator that
    directly follows a split (the second half of "……", or the "！" in "？！")
    is attached to the sentence just emitted instead of becoming a
    punctuation-only sentence of its own.

    Args:
        cut_str: the text to split; it is processed character by character.

    Returns:
        A list of sentence strings in input order. Concatenating them yields
        *cut_str*; trailing text without a terminator is the last item.
    """
    sent_list = []
    sent = []
    open_puncts = []

    for ch in cut_str:
        AddPunct(open_puncts, ch)
        is_end = FindTok(ch)

        # A terminator arriving right after a split belongs to the same run
        # of end marks, so glue it onto the previous sentence.
        if is_end and not sent and sent_list:
            sent_list[-1] += ch
            continue

        sent.append(ch)
        # Only split when no paired punctuation is still open.
        if is_end and not open_puncts:
            sent_list.append(''.join(sent))
            sent = []

    # Keep any unterminated remainder as a final sentence.
    if sent:
        sent_list.append(''.join(sent))

    return sent_list

def ConstPunctPair():
    """Populate ``punct_pair_hm`` with closer -> opener entries.

    ``punct_pair_str`` is read two characters at a time: the first character
    of each pair is the opening mark, the second is the closing mark.
    """
    pos = 0
    total = len(punct_pair_str)
    while pos < total:
        opener = punct_pair_str[pos]
        closer = punct_pair_str[pos + 1]
        punct_pair_hm[closer] = opener
        pos += 2


def AddPunct(punct_pair, ch):
    """Update the stack of currently open paired punctuation with *ch*.

    Args:
        punct_pair: list of opening marks seen so far that have not been
            closed yet; it is modified in place.
        ch: the character being processed.

    Returns:
        The same ``punct_pair`` list, after the update.

    Behavior:
        * Characters that are not in ``punct_pair_str`` leave the stack alone.
        * An opening mark is pushed.
        * A closing mark removes the nearest matching opener, if there is one.
        * An unmatched mark whose opener and closer are the same character
          (the ASCII double quote) is pushed, because it opens a new pair.
        * Any other unmatched closing mark is ignored. Pushing it would leave
          an entry that nothing can ever remove, which would block sentence
          splitting for the rest of the text (common with "1）..." lists).
    """
    if ch not in punct_pair_str:
        return punct_pair

    # Build the closer -> opener table on first use.
    if not punct_pair_hm:
        ConstPunctPair()

    if ch not in punct_pair_hm:
        # Not a closing mark, so it must be an opener.
        punct_pair.append(ch)
        return punct_pair

    opener = punct_pair_hm[ch]
    # Search from the top of the stack for the nearest matching opener.
    for index in range(len(punct_pair) - 1, -1, -1):
        if punct_pair[index] == opener:
            del punct_pair[index]
            return punct_pair

    # Nothing to close: a symmetric mark starts a new pair; a stray
    # asymmetric closer is dropped.
    if opener == ch:
        punct_pair.append(ch)

    return punct_pair

def handle_file(input_path, output_path, multi_line=False):
    """Split one UTF-8 text file into sentences, one sentence per output line.

    Args:
        input_path: path of the UTF-8 encoded file to read.
        output_path: path of the file to write; it is overwritten.
        multi_line: if True, all input lines are concatenated (line breaks
            removed) and split as one text, so a sentence may span several
            input lines. If False, every input line is split on its own.

    Side effects:
        Increments the module-level ``sent_count`` once per sentence written.
    """
    global sent_count

    # io.open decodes/encodes UTF-8 at the file boundary, and the ``with``
    # block closes both files even if splitting raises.
    with io.open(input_path, 'r', encoding='utf-8') as fin, \
            io.open(output_path, 'w', encoding='utf-8') as fout:
        # Remove only the line terminator (io.open normalizes it to "\n").
        # Slicing off the last character unconditionally would corrupt a
        # final line that has no trailing newline.
        lines = (line.rstrip(u"\n") for line in fin)
        texts = [u"".join(lines)] if multi_line else lines

        for text in texts:
            for sent in CutSent(text):
                sent_count += 1
                fout.write(sent + u"\n")
    
def handle_dir(input_path, output_path, multi_line=False):
    """Recursively split every file under *input_path* into *output_path*.

    The directory layout below *input_path* is mirrored below *output_path*;
    each regular entry is processed with ``handle_file``.

    Args:
        input_path: directory to read from.
        output_path: directory to write to; created (including missing
            parent directories) if it does not exist.
        multi_line: forwarded unchanged to ``handle_file``.
    """
    # makedirs also creates missing parents, which plain mkdir cannot.
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    for file_name in os.listdir(input_path):
        src = os.path.join(input_path, file_name)
        dst = os.path.join(output_path, file_name)
        if os.path.isdir(src):
            handle_dir(src, dst, multi_line)
        else:
            handle_file(src, dst, multi_line)


def handle(input_path, output_path, multi_line=False):
    """Entry point: split *input_path*, which may be a directory or a file.

    Directories are delegated to ``handle_dir``; anything else is delegated
    to ``handle_file``. All three arguments are passed through unchanged.
    """
    worker = handle_dir if os.path.isdir(input_path) else handle_file
    worker(input_path, output_path, multi_line)

if __name__ == "__main__":
    # Expect exactly two arguments: the input path and the output path.
    if len(sys.argv) != 3:
        # Report usage on stderr and exit non-zero so callers can detect the
        # failure. sys.stderr.write works on both Python 2 and Python 3,
        # unlike the print statement.
        sys.stderr.write("python %s input_path, output_path\n" % sys.argv[0])
        sys.exit(1)
    handle(sys.argv[1], sys.argv[2], False)


#cutlist="[。，,！!《》<>\"':：？\?、、|“”‘’；]{}(){}【】（）;~-_——+=*&……#@`·\n\r".decode('utf-8')

或源代码见：

https://github.com/beifeng600/nlp_storeroom/tree/master/tools/%E5%88%86%E5%8F%A5

参考：

Python 中文处理问题--分句，

http://m.blog.csdn.net/blog/yhc13429826359/4141471