1、缩写词还原
在预处理文本的时候,我们经常会遇到 we’ll、don’t、i’m、i’ve、he’s 之类的缩写词。对这类缩写词的还原,目前大多数时候用自定义的正则表达式进行匹配和替换即可。下面给出一些例子:
# Ordered (regex, replacement) pairs used to expand English contractions.
#
# Order matters: the irregular forms (won't, can't, i'm, ain't) must come
# before the generic "n't" rule, which would otherwise rewrite "won't" as
# "wo not" and "can't" as "ca not".
#
# Every element is a raw string so that "\w" and "\g<1>" reach the re module
# verbatim instead of being parsed as (invalid) string escape sequences.
#
# NOTE: the "'s" rule always expands to "is" and the "'d" rule always expands
# to "would", so possessives ("John's") and "had"/"has" contractions are
# expanded incorrectly. This is a known limitation of the pattern-only approach.
replacement_patterns = [
    (r"won't", r"will not"),
    (r"can't", r"cannot"),
    (r"i'm", r"i am"),
    (r"ain't", r"is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]
class RegexpReplacer(object):
    """Rewrite text by applying a list of regex substitutions in order.

    Args:
        patterns: Iterable of ``(regex, replacement)`` pairs. Each regex is
            compiled once at construction time. When omitted (or ``None``),
            the module-level ``replacement_patterns`` list is used.
    """

    def __init__(self, patterns=None):
        # Resolve the default lazily rather than binding the shared
        # module-level list as a default argument at definition time.
        if patterns is None:
            patterns = replacement_patterns
        self.patterns = [(re.compile(regex), repl) for regex, repl in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern applied, in list order.

        Later patterns see the output of earlier ones, so the order of the
        pattern list affects the result.
        """
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text
if __name__ == '__main__':
    # ---- Demo: expand contractions in a sample sentence ----
    expander = RegexpReplacer()
    expanded = expander.replace("she must've gone to the market but she didn't go")
    print(expanded)
2、去除重复字符
很多时候一些文本并不规范,比如推特、微博文本等,用户为了表达自己的情绪,会用重复字符来加重语气,比如“goooooood”、"loooove"等。对于这类词我们该怎么办呢?一般英文单词里面连续相同的字母不会超过三个,因此可以用正则匹配重复字符,每次删去一个重复的字符(重复字符的个数依次减少为三个、两个、一个),每删一次就借助nltk的WordNet判断当前字符串是否为单词,一旦是单词就停止(详情点击https://blog.csdn.net/qq_40438165/article/details/84572888)。
class RepeatReplacer():
    """Shrink runs of repeated characters until WordNet knows the word.

    One repeated character is removed per step; shrinking stops as soon as
    ``wordnet.synsets`` returns a non-empty result or no doubled character
    is left.
    """

    def __init__(self):
        # prefix, a character, that same character again (backreference), suffix
        self.repeat_reg = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the match with one copy of the doubled character dropped.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with repeated characters collapsed as described above."""
        current = word
        # Stop as soon as WordNet has at least one synset for the string.
        while not wordnet.synsets(current):
            shortened = self.repeat_reg.sub(self.repl, current)
            if shortened == current:
                # Nothing left to collapse; give up and return what we have.
                break
            current = shortened
        return current
if __name__ == '__main__':
    # ---- Demo: collapse repeated characters inside single words ----
    deduper = RepeatReplacer()
    collapsed = [deduper.replace(w) for w in ('looooooove', 'happy', 'gooooood')]
    print(*collapsed)
3、命名实体识别
命名实体识别使用nltk训练好的nltk.ne_chunk()即可。初次调用时nltk可能会提示你下载一些文件,按照提示下载即可。注意该方法需要利用词性标注的结果,因此在使用前请先将文本分词并进行词性标注。下面是自己实现的一个类,getNERItem()可以以字典形式(单词:所属类别)返回文本text里面的地名、机构名等命名实体。
class NER():
    """Named-entity recognition helper built on NLTK's pre-trained chunker."""

    def __init__(self):
        pass

    def chunk(self, text):
        """Tokenize and POS-tag ``text``, then return ``nltk.ne_chunk``'s result.

        The POS tags are also stored on ``self.tags``, because the chunker
        works on tagged tokens rather than raw text.
        """
        tokens = nltk.word_tokenize(text)  # tokenization
        self.tags = nltk.pos_tag(tokens)  # part-of-speech tagging
        return nltk.ne_chunk(self.tags)  # chunking consumes the POS tags

    def getNERItem(self, text):
        """Return the named entities found in ``text`` as a dict.

        Returns:
            A dict mapping each entity's text to its entity label. The words
            of a multi-token entity are joined with single spaces. If the
            same entity text occurs more than once, the last label wins.
        """
        entities = {}
        for node in self.chunk(text):
            # Plain tokens are (word, tag) tuples; named entities are subtrees
            # that expose label(). Checking for label() rather than the node
            # length keeps multi-token entities, which a length test would
            # confuse with ordinary two-element (word, tag) tuples.
            if not hasattr(node, 'label'):
                continue
            # Read the words from the leaves instead of parsing str(node), so
            # tokens that contain '/' are not truncated.
            name = ' '.join(word for word, _tag in node.leaves())
            entities[name] = node.label()
        return entities
if __name__ == '__main__':
    # ---- Demo: named-entity recognition on two sample sentences ----
    recognizer = NER()
    samples = (
        "UESTC in ChengDu, SiChuan province, founded in 1956",
        "Xi is the chairman of China in the year 2013.",
    )
    # Run both extractions before printing anything, as the results are
    # reported together at the end.
    found = [recognizer.getNERItem(sentence) for sentence in samples]
    for entities in found:
        print(entities)
所有程序
import re
from nltk.corpus import wordnet
import nltk
# Ordered (regex, replacement) pairs used to expand English contractions.
#
# Order matters: the irregular forms (won't, can't, i'm, ain't) must come
# before the generic "n't" rule, which would otherwise rewrite "won't" as
# "wo not" and "can't" as "ca not".
#
# Every element is a raw string so that "\w" and "\g<1>" reach the re module
# verbatim instead of being parsed as (invalid) string escape sequences.
#
# NOTE: the "'s" rule always expands to "is" and the "'d" rule always expands
# to "would", so possessives ("John's") and "had"/"has" contractions are
# expanded incorrectly. This is a known limitation of the pattern-only approach.
replacement_patterns = [
    (r"won't", r"will not"),
    (r"can't", r"cannot"),
    (r"i'm", r"i am"),
    (r"ain't", r"is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]
class RegexpReplacer(object):
    """Rewrite text by applying a list of regex substitutions in order.

    Args:
        patterns: Iterable of ``(regex, replacement)`` pairs. Each regex is
            compiled once at construction time. When omitted (or ``None``),
            the module-level ``replacement_patterns`` list is used.
    """

    def __init__(self, patterns=None):
        # Resolve the default lazily rather than binding the shared
        # module-level list as a default argument at definition time.
        if patterns is None:
            patterns = replacement_patterns
        self.patterns = [(re.compile(regex), repl) for regex, repl in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern applied, in list order.

        Later patterns see the output of earlier ones, so the order of the
        pattern list affects the result.
        """
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text
class RepeatReplacer():
    """Shrink runs of repeated characters until WordNet knows the word.

    One repeated character is removed per step; shrinking stops as soon as
    ``wordnet.synsets`` returns a non-empty result or no doubled character
    is left.
    """

    def __init__(self):
        # prefix, a character, that same character again (backreference), suffix
        self.repeat_reg = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the match with one copy of the doubled character dropped.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with repeated characters collapsed as described above."""
        current = word
        # Stop as soon as WordNet has at least one synset for the string.
        while not wordnet.synsets(current):
            shortened = self.repeat_reg.sub(self.repl, current)
            if shortened == current:
                # Nothing left to collapse; give up and return what we have.
                break
            current = shortened
        return current
class NER():
    """Named-entity recognition helper built on NLTK's pre-trained chunker."""

    def __init__(self):
        pass

    def chunk(self, text):
        """Tokenize and POS-tag ``text``, then return ``nltk.ne_chunk``'s result.

        The POS tags are also stored on ``self.tags``, because the chunker
        works on tagged tokens rather than raw text.
        """
        tokens = nltk.word_tokenize(text)  # tokenization
        self.tags = nltk.pos_tag(tokens)  # part-of-speech tagging
        return nltk.ne_chunk(self.tags)  # chunking consumes the POS tags

    def getNERItem(self, text):
        """Return the named entities found in ``text`` as a dict.

        Returns:
            A dict mapping each entity's text to its entity label. The words
            of a multi-token entity are joined with single spaces. If the
            same entity text occurs more than once, the last label wins.
        """
        entities = {}
        for node in self.chunk(text):
            # Plain tokens are (word, tag) tuples; named entities are subtrees
            # that expose label(). Checking for label() rather than the node
            # length keeps multi-token entities, which a length test would
            # confuse with ordinary two-element (word, tag) tuples.
            if not hasattr(node, 'label'):
                continue
            # Read the words from the leaves instead of parsing str(node), so
            # tokens that contain '/' are not truncated.
            name = ' '.join(word for word, _tag in node.leaves())
            entities[name] = node.label()
        return entities
if __name__ == '__main__':
    # ---- Demo 1: expand contractions in a sample sentence ----
    expander = RegexpReplacer()
    expanded = expander.replace("she must've gone to the market but she didn't go")
    print(expanded)

    # ---- Demo 2: collapse repeated characters inside single words ----
    deduper = RepeatReplacer()
    collapsed = [deduper.replace(w) for w in ('looooooove', 'happy', 'gooooood')]
    print(*collapsed)

    # ---- Demo 3: named-entity recognition on two sample sentences ----
    recognizer = NER()
    samples = (
        "UESTC in ChengDu, SiChuan province, founded in 1956",
        "Xi is the chairman of China in the year 2013.",
    )
    # Run both extractions before printing anything, as the results are
    # reported together at the end.
    found = [recognizer.getNERItem(sentence) for sentence in samples]
    for entities in found:
        print(entities)