1.1.1 将文本切分为语句
def main1():
    """Split a short English text into sentences with NLTK's sent_tokenize.

    Demonstrates that the default Punkt model handles abbreviations
    such as "U.S." without breaking the sentence.
    """
    # Fix: dropped the unused `import nltk` — only sent_tokenize is needed.
    from nltk.tokenize import sent_tokenize
    text = " Welcome readers from U.S. I hope you find it interesting. Please do reply."
    print(sent_tokenize(text))

main1()
# 执行结果
[' Welcome readers from U.S.', 'I hope you find it interesting.', 'Please do reply.']
# 切分大批量的句子,加载tokenize函数
def main2():
    """Load the pre-trained English Punkt model and sentence-split a text.

    Loading the pickle once is the efficient approach when tokenizing
    large batches of text.
    """
    import nltk
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sample = " Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
    print(punkt.tokenize(sample))

main2()
# 执行结果
[' Hello everyone.', 'Hope all are fine and doing well.', 'Hope you find the book interesting']
1.1.2 其他语言文本的切分
# 加载各自的pickle文件即可
def main3():
    """Sentence-split French text by loading the language-specific Punkt pickle."""
    import nltk
    french_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    passage = (
        'Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège '
        'franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, '
        'voilà ce qui a motivé hier matin le débrayage Levallois. '
        'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , '
        'd’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , '
        'd’un professeur d’histoire'
    )
    print(french_tokenizer.tokenize(passage))

main3()
# 执行结果
['Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britanniquedeLevallois-Perret.', 'Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois.', 'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire.', 'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire']
1.1.3 将句子切分成单词
# 使用word_tokenize()函数
# word_tokenize()函数使用NLTK包的一个叫做TreebankWordTokenizer类的实例用于执行单词的切分
def main4():
    """Word-tokenize a sentence with nltk.word_tokenize.

    word_tokenize delegates to a TreebankWordTokenizer instance under the hood.
    """
    import nltk
    tokens = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")
    print(tokens)

main4()
# 执行结果
['PierreVinken', ',', '59', 'years', 'old', ',', 'will', 'join', 'as', 'a', 'nonexecutive', 'director', 'on', 'Nov.', '29', '.']
1.1.4 使用TreebankWordTokenizer执行单词的切分
def main5():
    """Word-tokenize text with an explicit TreebankWordTokenizer instance."""
    # Fix: dropped the unused `import nltk` — the tokenizer class is
    # imported directly from nltk.tokenize.
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))

main5()
# 执行结果
['Have', 'a', 'nice', 'day.', 'I', 'hope', 'you', 'find', 'the', 'book', 'interesting']
# TreebankWordTokenizer通过分离缩略词来实现切分
def main6():
    """Show that word_tokenize separates contractions ("Don't" -> "Do" + "n't")."""
    import nltk
    tokens = nltk.word_tokenize(" Don't hesitate to ask questions")
    print(tokens)

main6()
# 执行结果
['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
# WordPunctTokenizer通过将标点转化为一个全新的标识符来实现切分
def main7():
    """Show that WordPunctTokenizer emits punctuation as standalone tokens."""
    from nltk.tokenize import WordPunctTokenizer
    splitter = WordPunctTokenizer()
    print(splitter.tokenize(" Don't hesitate to ask questions"))

main7()
# 执行结果
['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']
1.1.5 使用正则表达式实现切分
# 匹配单词
# 匹配空格
# 导入NLTK包的RegexpTokenizer模块
def main8():
    """Tokenize with a regex matching runs of word characters and apostrophes."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import RegexpTokenizer
    # Fix: raw string — "[\w']+" triggers an invalid-escape-sequence
    # warning (SyntaxWarning on modern Python) for \w.
    tokenizer = RegexpTokenizer(r"[\w']+")
    print(tokenizer.tokenize("Don't hesitate to ask questions"))

main8()
# 执行结果
["Don't", 'hesitate', 'to', 'ask', 'questions']
# 不用实例化的切分方式
# 单词|美元金额|非空白字符
def main9():
    """Tokenize without instantiating a tokenizer, via regexp_tokenize.

    Pattern alternatives: word characters | dollar amount | any
    non-whitespace run.
    """
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import regexp_tokenize
    sent = "Don't hesitate to ask questions"
    # Fix: raw string avoids invalid-escape-sequence warnings for \w, \$, \d, \S.
    print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))

main9()
# 执行结果
['Don', "'t", 'hesitate', 'to', 'ask', 'questions']
# 通过空格来执行切分
def main10():
    """Split on whitespace: gaps=True makes the pattern mark separators, not tokens."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import RegexpTokenizer
    # Fix: raw string avoids the invalid-escape-sequence warning for \s.
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print(tokenizer.tokenize("Don't hesitate to ask questions"))

main10()
# 执行结果
["Don't", 'hesitate', 'to', 'ask', 'questions']
# 筛选以大写字母开头的单词
def main11():
    """Keep only tokens that begin with an uppercase letter."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import RegexpTokenizer
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    # Fix: raw string avoids the invalid-escape-sequence warning for \w.
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print(capt.tokenize(sent))

main11()
# 执行结果
['She', 'She']
# RegexpTokenizer的子类使用预定义正则表达式
def main12():
    """BlanklineTokenizer splits on blank lines; text without any stays whole."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import BlanklineTokenizer
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    print(BlanklineTokenizer().tokenize(sent))

main12()
# 执行结果
[' She secured 90.56 % in class X . She is a meritorious student']
# 字符串的切分可以通过空格、间隔、换行等来完成
def main13():
    """WhitespaceTokenizer splits on spaces, tabs and newlines."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import WhitespaceTokenizer
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    print(WhitespaceTokenizer().tokenize(sent))

main13()
# 执行结果
['She', 'secured', '90.56', '%', 'in', 'class', 'X', '.', 'She', 'is', 'a', 'meritorious', 'student']
# 使用split()方法进行切分
def main14():
    """Demonstrate plain str.split with no argument, ' ', and a newline separator.

    With no argument, split() collapses runs of whitespace and drops
    leading/trailing whitespace; split(' ') and split('\\n') split on each
    single occurrence, which can produce empty strings.
    """
    # Fix: dropped the unused `import nltk` — this example is pure str methods.
    sent = "She secured 90.56 % in class X. She is a meritorious student"
    print(sent.split())
    print(sent.split(' '))
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(sent.split('\n'))

main14()
# 执行结果
['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
[' She secured 90.56 % in class X ', '. She is a meritorious student', '']
# LineTokenizer通常将文本切分为行来执行切分
def main15():
    """Compare BlanklineTokenizer with LineTokenizer in keep/discard modes.

    The sample contains newlines but no blank line, so BlanklineTokenizer
    returns it unsplit, while LineTokenizer splits per line; 'discard'
    additionally drops trailing empty lines.
    """
    # Fix: dropped the unused `import nltk` and merged the two
    # nltk.tokenize imports into one statement.
    from nltk.tokenize import BlanklineTokenizer, LineTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(BlanklineTokenizer().tokenize(sent))
    print(LineTokenizer(blanklines='keep').tokenize(sent))
    print(LineTokenizer(blanklines='discard').tokenize(sent))

main15()
# 执行结果
[' She secured 90.56 % in class X \n. She is a meritorious student\n']
[' She secured 90.56 % in class X ', '. She is a meritorious student']
[' She secured 90.56 % in class X ', '. She is a meritorious student']
# SpaceTokenizer与sent.split(' ')类似
def main16():
    """Show that SpaceTokenizer matches the output of sent.split(' ')."""
    # Fix: dropped the unused `import nltk`; import the tokenizer up front.
    from nltk.tokenize import SpaceTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(sent.split(' '))
    print(SpaceTokenizer().tokenize(sent))

main16()
# 执行结果
['', 'She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']
['', 'She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']
# span_tokenize方法通过返回元组形式的序列来执行切分,每个元组为(起始位置,结束位置)
def main17():
    """span_tokenize yields (start, end) character offsets for each token."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import WhitespaceTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(WhitespaceTokenizer().span_tokenize(sent)))

main17()
# 执行结果
[(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (33, 34), (35, 38), (39, 41), (42, 43), (44, 55), (56, 63)]
# 返回跨度序列
def main18():
    """spans_to_relative converts absolute (start, end) spans to relative ones.

    Each output pair is (gap from previous token's end, token length).
    """
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize import WhitespaceTokenizer
    from nltk.tokenize.util import spans_to_relative
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))

main18()
# 执行结果
[(1, 3), (1, 7), (1, 5), (1, 1), (1, 2), (1, 5), (1, 1), (2, 1), (1, 3), (1, 2), (1, 1), (1, 11), (1, 7)]
# 在每一个分隔符的连接处进行分割
def main19():
    """string_span_tokenize yields spans of the pieces between separator occurrences."""
    # Fix: dropped the unused `import nltk`.
    from nltk.tokenize.util import string_span_tokenize
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(string_span_tokenize(sent, " ")))

main19()
# 执行结果
[(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (32, 34), (35, 38), (39, 41), (42, 43), (44, 55), (56, 64)]