01_String Processing-----02_Normalization

1.2.1 Removing Punctuation

def main1():
    text = [" It is a pleasant evening.", "Guests, who came from US arrived at the venue", "Food was tasty."]
    from nltk.tokenize import word_tokenize
    tokenized_docs = [word_tokenize(doc) for doc in text]
    print(tokenized_docs)

main1()
# Output
[['It', 'is', 'a', 'pleasant', 'evening', '.'], ['Guests', ',', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty', '.']]
# Remove punctuation from the tokenized text
def main2():
    import re
    import string
    text = [" It is a pleasant evening.", "Guests, who came from US arrived at the venue", "Food was tasty."]
    from nltk.tokenize import word_tokenize
    tokenized_docs = [word_tokenize(doc) for doc in text]
    # Before removing punctuation
    print(tokenized_docs)
    x = re.compile('[%s]' % re.escape(string.punctuation))
    tokenized_docs_no_punctuation = []
    for review in tokenized_docs:
        new_review = []
        for token in review:
            new_token = x.sub('', token)
            if new_token:
                new_review.append(new_token)
        tokenized_docs_no_punctuation.append(new_review)
    print(tokenized_docs_no_punctuation)

main2()
# Output
[['It', 'is', 'a', 'pleasant', 'evening', '.'], ['Guests', ',', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty', '.']]
[['It', 'is', 'a', 'pleasant', 'evening'], ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty']]
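
As a supplementary sketch (not from the original example), the same cleanup can be done without a regular expression, using str.translate with a table built by str.maketrans; it assumes the NLTK tokenizer data (punkt) is installed, and main2b is a name introduced here for illustration.

def main2b():
    import string
    from nltk.tokenize import word_tokenize
    text = [" It is a pleasant evening.", "Guests, who came from US arrived at the venue", "Food was tasty."]
    # Translation table that maps every punctuation character to None
    table = str.maketrans('', '', string.punctuation)
    tokenized_docs_no_punctuation = []
    for doc in text:
        tokens = [token.translate(table) for token in word_tokenize(doc)]
        # Drop tokens that were pure punctuation and are now empty
        tokenized_docs_no_punctuation.append([t for t in tokens if t])
    print(tokenized_docs_no_punctuation)

main2b()
# Output
[['It', 'is', 'a', 'pleasant', 'evening'], ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty']]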

1.2.2 Case Conversion of Text

# lower() converts to lowercase
# upper() converts to uppercase
def main3():
    text = 'HARdWork IS KEy to SUCCESS'
    print(text.lower())
    print(text.upper())

main3()
# Output
hardwork is key to success
HARDWORK IS KEY TO SUCCESS
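
Beyond lower() and upper(), Python strings offer a few related case methods; this is a small supplementary sketch (main3b is a name introduced here):

def main3b():
    text = 'HARdWork IS KEy to SUCCESS'
    print(text.title())       # Capitalize the first letter of every word
    print(text.capitalize())  # Capitalize only the first character
    print(text.casefold())    # Aggressive lowercasing for caseless comparison

main3b()
# Output
Hardwork Is Key To Success
Hardwork is key to success
hardwork is key to success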

1.2.3 Handling Stopwords

# Stopwords are words that are filtered out in information retrieval
# and other natural language processing tasks, because they contribute
# little to the overall meaning of a sentence.
def main4():
    import nltk
    from nltk.corpus import stopwords
    stops = set(stopwords.words('english'))
    # 'to' is a stopword
    words = ["Don't", 'hesitate', 'to', 'ask', 'questions']
    print([word for word in words if word not in stops])

main4()
# Output
["Don't", 'hesitate', 'ask', 'questions']
def main5():
    from nltk.corpus import stopwords
    # Stopword lists are available for multiple languages
    print(stopwords.fileids())
    stops = list(stopwords.words('arabic'))
    # Show the first 5 stopwords for 'arabic'
    print(stops[:5])

main5()
# Output
['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
['إذ', 'إذا', 'إذما', 'إذن', 'أف']
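
The size of each stopword list varies by language and by the installed corpus version; a quick sketch to inspect the counts (exact numbers depend on your NLTK data, so none are shown here):

def main5b():
    from nltk.corpus import stopwords
    # Print the number of stopwords shipped for the first five languages
    for lang in stopwords.fileids()[:5]:
        print(lang, len(stopwords.words(lang)))

main5b()
# Output: one "<language> <count>" line per language; counts depend on the corpus version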

1.2.4 Computing Stopwords in English

def main6():
    import nltk
    from nltk.corpus import stopwords
    stops = stopwords.words('english')
    print(stops[:5])

    def para_fraction(text):
        stopwords = nltk.corpus.stopwords.words('english')
        para = [w for w in text if w.lower() not in stopwords]
        return len(para) / len(text)
    # Words of the Reuters corpus
    print(nltk.corpus.reuters.words())
    # Fraction of non-stopword tokens in each corpus
    print(para_fraction(nltk.corpus.reuters.words()))
    print(para_fraction(nltk.corpus.inaugural.words()))

main6()
# Output
['i', 'me', 'my', 'myself', 'we']
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]
0.735240435097661
0.5228599855902837
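
Since para_fraction returns the share of non-stopword tokens, the stopword share is simply its complement; a short sketch (main6b is illustrative and assumes the reuters corpus has been downloaded):

def main6b():
    import nltk
    stops = set(nltk.corpus.stopwords.words('english'))
    words = nltk.corpus.reuters.words()
    # Count tokens whose lowercase form is a stopword
    n_stop = sum(1 for w in words if w.lower() in stops)
    print(n_stop / len(words))

main6b()
# Output
0.264759564902339  # i.e. 1 - 0.735240435097661 from the run above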
