一、切分(切分成句子、切分成单词)
- 文本切分成独立的句子
# Split a text into individual sentences with NLTK's default tokenizer.
import nltk
from nltk.tokenize import sent_tokenize

text = " Welcome readers. I hope you find it interesting. Please do reply."
print(sent_tokenize(text))
- 如果要切分大批量的句子
# Load the pickled Punkt sentence tokenizer directly -- preferable when
# splitting a large batch of texts, since the model is loaded only once.
import nltk

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = " Welcome readers. I hope you find it interesting. Please do reply."
print(tokenizer.tokenize(text))
- 其他语言的切分
找到对应的pickle文件,方法和上面一样,比如french就用french.pickle
- 将句子切分成单词
NLTK有三种切分单词的方法,分别是TreebankWordTokenizer、PunktWordTokenizer(已弃用)和RegexpTokenizer。
1、TreebankWordTokenizer
# 1. TreebankWordTokenizer: word tokenization following the Penn Treebank
#    conventions.
import nltk
from nltk.tokenize import TreebankWordTokenizer

text = " Welcome readers. I hope you find it interesting. Please do reply."
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))
# 2. PunktWordTokenizer -- deprecated (no longer available in modern NLTK).
3.1、RegexpTokenizer -- WordPunctTokenizer
# 3.1 RegexpTokenizer family -- WordPunctTokenizer: splits text into
#     alphabetic tokens and punctuation tokens.
import nltk
from nltk.tokenize import WordPunctTokenizer

text = " Welcome readers. I hope you find it interesting. Please do reply."
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(text))
# 3.2 RegexpTokenizer family -- WhitespaceTokenizer (next example).
# 3.2 WhitespaceTokenizer: splits only on whitespace, so punctuation stays
#     attached to the adjacent word.
import nltk
from nltk.tokenize import WhitespaceTokenizer

text = " Welcome readers. I hope you find it interesting. Please do reply."
tokenizer = WhitespaceTokenizer()
print(tokenizer.tokenize(text))
# 3.3 RegexpTokenizer with a custom pattern (next example).
# 3.3a RegexpTokenizer with gaps=True: the pattern describes the *separators*
#      (here, runs of whitespace), not the tokens themselves.
import nltk
from nltk.tokenize import RegexpTokenizer

text = " Welcome readers. I hope you find it interesting. Please do reply."
tokenizer = RegexpTokenizer(r'\s+', gaps=True)  # raw string: '\s' is an invalid str escape
print(tokenizer.tokenize(text))
# 3.3b RegexpTokenizer with a token pattern: keep only words that start with
#      a capital letter.
from nltk.tokenize import RegexpTokenizer

text = " Welcome readers. I hope you find it interesting. Please do reply."
tokenizer = RegexpTokenizer(r'[A-Z]\w+')  # raw string for the regex pattern
print(tokenizer.tokenize(text))
二、标准化(消除标点符号、文本大小写的转换、处理停用词、计算英语中的停用词、替换和校正标识符)
- 消除标点符号
# Remove punctuation from already-tokenized documents.
import nltk
import string
import re
from nltk.tokenize import word_tokenize

text = [" It is a Pleasant evening. ", "Guests, who came from US arrived at the venue", "Food was tasty"]
tokenized_docs = [word_tokenize(doc) for doc in text]

# string.punctuation is a *string* of punctuation characters; escape it and
# wrap it in a character class so any single punctuation char matches.
x = re.compile('[%s]' % re.escape(string.punctuation))

tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        # pattern.sub(repl, s) is equivalent to re.sub(pattern, repl, s)
        new_token = x.sub(u'', token)
        if new_token:  # drop tokens that were pure punctuation
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)
- 文本大小写的转换
text.lower() <--> text.upper()
- 处理停用词
# Filter stop words out of a token list using NLTK's English stop word set.
import nltk
from nltk.corpus import stopwords

# print(stopwords.fileids())  # uncomment to list the available languages
stops = set(stopwords.words('english'))
print(stopwords.words('english'))

words = ["Don't", "to", "is", "are"]
word = [w for w in words if w not in stops]  # tokens that survive the filter
print(word)
- 处理重复字符
# Save as repeat_clean.py -- collapses repeated characters in a word.
import re


class RepeatReplacer(object):
    """Recursively collapse doubled characters in a word.

    Each pass removes one character of a doubled pair; recursion continues
    until the word is stable ('lotttt' -> 'lot', 'ohhhhh' -> 'oh').
    NOTE: this also mangles legitimate doubles, e.g. 'happy' -> 'hapy'.
    """

    def __init__(self):
        # (prefix)(char)\2(suffix): matches a character immediately repeated.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'  # rebuild the word with one repeat dropped

    def replace(self, word):
        """Return *word* with all repeated-character runs collapsed."""
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            # Something was collapsed; keep going until a fixed point.
            return self.replace(repl_word)
        return repl_word
# Demo for RepeatReplacer (defined in repeat_clean.py).
from repeat_clean import RepeatReplacer

replacer = RepeatReplacer()
for noisy in ('lotttt', 'ohhhhh'):
    print(replacer.replace(noisy))
- 替换一些你不想要的字符
# Save as replace_clean.py -- expand common English contractions.
import re

# (pattern, replacement) pairs, applied in order: irregular contractions
# first, then generic suffix rules. Raw strings keep '\g<1>' literal.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]


class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a text."""

    def __init__(self, patterns=replacement_patterns):
        # Pre-compile once so replace() can be called repeatedly.
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return *text* with every pattern substituted, in order."""
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
# Demo for RegexpReplacer (defined in replace_clean.py).
from replace_clean import RegexpReplacer

replacer = RegexpReplacer()
print(replacer.replace("Don't hesitate to ask questions"))
- 同义词转换
# Save as map_replace_clean.py -- word-for-word synonym substitution.
class WordReplacer(object):
    """Replace individual words according to a caller-supplied mapping."""

    def __init__(self, word_map):
        self.word_map = word_map  # e.g. {'congrats': 'congratulations'}

    def replace(self, word):
        """Return the mapped synonym, or *word* itself when unmapped."""
        return self.word_map.get(word, word)
# Demo for WordReplacer (defined in map_replace_clean.py).
from map_replace_clean import WordReplacer

replacer = WordReplacer({'congrats': 'congratulations'})
print(replacer.replace('congrats'))  # mapped -> 'congratulations'
print(replacer.replace('maths'))     # not in the map -> returned unchanged
三、相似度度量
为了评估一个标注器、分块器等,可以使用信息检索中的标准评分(准确率、精确率、召回率、F值等)。
# IR-style scores for comparing a tagger/chunker's output against gold labels.
from nltk.metrics import *

training = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()

# Token-level accuracy over the two aligned sequences.
print(accuracy(training, testing))

# Set-based precision / recall / F-measure over the label inventories.
trainset = set(training)
testset = set(testing)
print(precision(trainset, testset))
print(recall(trainset, testset))
print(f_measure(trainset, testset))

# Edit (Levenshtein) distance between two strings.
print(edit_distance("", ""))

# Jaccard distance between two sets (original used undefined setA/setB).
print(jaccard_distance(trainset, testset))
# Binary distance: 0.0 when the two labels are identical, otherwise 1.0.
def binary_distance(label1, label2):
    """Return 0.0 if the labels compare equal, 1.0 otherwise."""
    return 0.0 if label1 == label2 else 1.0


# Concrete example (original called it with undefined setA/setB).
print(binary_distance('PERSON', 'PERSON'))  # 0.0
print(binary_distance('PERSON', 'OTHER'))   # 1.0