01_String Processing------01_Tokenization

1.1.1 Tokenizing text into sentences

def main1():
    from nltk.tokenize import sent_tokenize
    import nltk

    text = " Welcome readers from U.S. I hope you find it interesting. Please do reply."
    print(sent_tokenize(text))

main1()
# Output
[' Welcome readers from U.S.', 'I hope you find it interesting.', 'Please do reply.']
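# sent_tokenize() relies on NLTK's pre-trained Punkt models. If they are not installed yet, a
# one-time download is needed; a minimal sketch, following the pattern of the examples above
# (the helper name main1_setup is just illustrative, 'punkt' is the standard NLTK resource name):
def main1_setup():
    import nltk
    nltk.download('punkt')

main1_setup()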
# To tokenize sentences in bulk, load the pre-trained tokenizer once and reuse it
def main2():
    import nltk
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = " Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
    print(tokenizer.tokenize(text))

main2()
# Output
[' Hello everyone.', 'Hope all are fine and doing well.', 'Hope you find the book interesting']

1.1.2 Tokenizing text in other languages

# Just load the pickle file for the corresponding language
def main3():
    import nltk
    french_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    print(french_tokenizer.tokenize(
        'Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage  collège '
        'franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, '
        'voilà ce qui a motivé hier matin le débrayage  Levallois. '
        'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , '
        'd’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , '
        'd’un professeur d’histoire'))

main3()
# Output
['Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage  collège franco-britanniquedeLevallois-Perret.', 'Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage  Levallois.', 'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire.', 'L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire']
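# Note: sent_tokenize() also accepts a language argument and loads the matching Punkt pickle
# internally, so the manual nltk.data.load() call can be avoided. A minimal sketch, assuming the
# French Punkt model is installed (the helper name main3_alt and the short sentence are illustrative):
def main3_alt():
    from nltk.tokenize import sent_tokenize
    text = "Deux agressions en quelques jours. L’équipe pédagogique a été choquée."
    print(sent_tokenize(text, language='french'))

main3_alt()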

1.1.3 Tokenizing sentences into words

# Use the word_tokenize() function
# word_tokenize() uses an instance of NLTK's TreebankWordTokenizer class to perform word tokenization
def main4():
    import nltk
    text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")
    print(text)

main4()
# Output
['PierreVinken', ',', '59', 'years', 'old', ',', 'will', 'join', 'as', 'a', 'nonexecutive', 'director', 'on', 'Nov.', '29', '.']

1.1.4 Tokenizing words with TreebankWordTokenizer

def main5():
    import nltk
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))

main5()
# Output
['Have', 'a', 'nice', 'day.', 'I', 'hope', 'you', 'find', 'the', 'book', 'interesting']
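# Note that 'day.' keeps its period: the Treebank tokenizer only splits off a period at the very
# end of the input, so it expects one sentence per call. A minimal sketch of the difference
# (main5_note is an illustrative helper; the expected output is worth verifying on your NLTK version):
def main5_note():
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    # expected: ['Have', 'a', 'nice', 'day', '.']
    print(tokenizer.tokenize("Have a nice day."))

main5_note()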
# TreebankWordTokenizer tokenizes by splitting contractions apart
def main6():
    import nltk
    text = nltk.word_tokenize(" Don't hesitate to ask questions")
    print(text)

main6()
# Output
['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
# WordPunctTokenizer tokenizes by turning each punctuation mark into a separate token
def main7():
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    print(tokenizer.tokenize(" Don't hesitate to ask questions"))
    
main7()
# Output
['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']

1.1.5 Tokenizing with regular expressions

# The pattern can match the tokens themselves (words)
# or the gaps between them (whitespace)
# Import the RegexpTokenizer class from the NLTK package
def main8():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"[\w']+")
    print(tokenizer.tokenize("Don't hesitate to ask questions"))

main8()
# Output
["Don't", 'hesitate', 'to', 'ask', 'questions']
# The module-level regexp_tokenize() function tokenizes without instantiating a tokenizer class
# Pattern: word characters | dollar amounts | any remaining non-whitespace characters
def main9():
    import nltk
    from nltk.tokenize import regexp_tokenize
    sent = "Don't hesitate to ask questions"
    print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))

main9()
# Output
['Don', "'t", 'hesitate', 'to', 'ask', 'questions']
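# The middle alternative \$[\d\.]+ exists to keep dollar amounts together as a single token.
# A minimal sketch illustrating it (main9_dollar and the example sentence are illustrative):
def main9_dollar():
    from nltk.tokenize import regexp_tokenize
    sent = "The book costs $10.50 today"
    # expected: ['The', 'book', 'costs', '$10.50', 'today']
    print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))

main9_dollar()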
# Tokenize on whitespace: with gaps=True the pattern matches the separators rather than the tokens
def main10():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print(tokenizer.tokenize("Don't hesitate to ask questions"))

main10()
# Output
["Don't", 'hesitate', 'to', 'ask', 'questions']
# Select only the words that begin with a capital letter
def main11():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print(capt.tokenize(sent))

main11()
# Output
['She', 'She']
# Subclasses of RegexpTokenizer use predefined regular expressions (BlanklineTokenizer splits on blank lines)
def main12():
    import nltk
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    from nltk.tokenize import BlanklineTokenizer
    print(BlanklineTokenizer().tokenize(sent))

main12()
# Output
[' She secured 90.56 % in class X . She is a meritorious student']
# WhitespaceTokenizer splits strings on whitespace characters: spaces, tabs, newlines, etc.
def main13():
    import nltk
    sent = " She secured 90.56 % in class X . She is a meritorious student"
    from nltk.tokenize import WhitespaceTokenizer
    print(WhitespaceTokenizer().tokenize(sent))

main13()
# Output
['She', 'secured', '90.56', '%', 'in', 'class', 'X', '.', 'She', 'is', 'a', 'meritorious', 'student']
# Tokenizing with the built-in split() string method
def main14():
    import nltk
    sent = "She secured 90.56 % in class X. She is a meritorious student"
    # splits on any whitespace by default
    print(sent.split())
    print(sent.split(' '))
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(sent.split('\n'))

main14()
# Output
['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
[' She secured 90.56 % in class X ', '. She is a meritorious student', '']
# LineTokenizer tokenizes text by splitting it into lines; blanklines='keep'/'discard' controls whether blank lines are returned
def main15():
    import nltk
    from nltk.tokenize import BlanklineTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(BlanklineTokenizer().tokenize(sent))
    from nltk.tokenize import LineTokenizer
    print(LineTokenizer(blanklines='keep').tokenize(sent))
    print(LineTokenizer(blanklines='discard').tokenize(sent))

main15()
# Output
[' She secured 90.56 % in class X \n. She is a meritorious student\n']
[' She secured 90.56 % in class X ', '. She is a meritorious student']
[' She secured 90.56 % in class X ', '. She is a meritorious student']
# SpaceTokenizer behaves like sent.split(' '), splitting only on the space character
def main16():
    import nltk
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(sent.split(' '))
    from nltk.tokenize import SpaceTokenizer
    print(SpaceTokenizer().tokenize(sent))

main16()
# Output
['', 'She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']
['', 'She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']
# span_tokenize() returns the token positions as a sequence of (start, end) offset tuples instead of the token strings (helpers for working with spans live in nltk.tokenize.util)
def main17():
    import nltk
    from nltk.tokenize import WhitespaceTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(WhitespaceTokenizer().span_tokenize(sent)))

main17()
# Output
[(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (33, 34), (35, 38), (39, 41), (42, 43), (44, 55), (56, 63)]
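# The token strings can be recovered by slicing the original string with these offsets.
# A minimal sketch (main17_slices is an illustrative helper):
def main17_slices():
    from nltk.tokenize import WhitespaceTokenizer
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    spans = WhitespaceTokenizer().span_tokenize(sent)
    # slicing each (start, end) span reproduces the tokens from WhitespaceTokenizer().tokenize(sent)
    print([sent[start:end] for start, end in spans])

main17_slices()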
# spans_to_relative() converts absolute (start, end) spans into relative (gap before the token, token length) pairs,
# e.g. 'She' at (1, 4) becomes (1, 3): 1 character after the previous token, 3 characters long
def main18():
    import nltk
    from nltk.tokenize import WhitespaceTokenizer
    from nltk.tokenize.util import spans_to_relative
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))

main18()
# Output
[(1, 3), (1, 7), (1, 5), (1, 1), (1, 2), (1, 5), (1, 1), (2, 1), (1, 3), (1, 2), (1, 1), (1, 11), (1, 7)]
# string_span_tokenize() splits at each occurrence of the given separator and returns the spans of the pieces
def main19():
    import nltk
    from nltk.tokenize.util import string_span_tokenize
    sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
    print(list(string_span_tokenize(sent, " ")))
    
main19()
# Output
[(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (32, 34), (35, 38), (39, 41), (42, 43), (44, 55), (56, 64)]

 
