1.2.1 消除标点符号
def main1():
    """Tokenize each sentence of a small sample corpus and print the token lists."""
    from nltk.tokenize import word_tokenize

    corpus = [
        " It is a pleasant evening.",
        "Guests, who came from US arrived at the venue",
        "Food was tasty.",
    ]
    tokens_per_doc = [word_tokenize(sentence) for sentence in corpus]
    print(tokens_per_doc)


main1()
# 执行结果
[['It', 'is', 'a', 'pleasant', 'evening', '.'], ['Guests', ',', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty', '.']]
# 从切分的文本中删除标点符号
def main2():
    """Tokenize a sample corpus, then strip punctuation from the tokens.

    Prints the token lists twice: once raw, once with punctuation characters
    removed and punctuation-only tokens dropped.
    """
    import re
    import string

    from nltk.tokenize import word_tokenize

    text = [
        " It is a pleasant evening.",
        "Guests, who came from US arrived at the venue",
        "Food was tasty.",
    ]
    tokenized_docs = [word_tokenize(doc) for doc in text]
    # Punctuation not yet removed
    print(tokenized_docs)

    # Character class matching any single punctuation character
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))
    tokenized_docs_no_punctuation = []
    for review in tokenized_docs:
        stripped = (punct_re.sub(u'', token) for token in review)
        # Keep only tokens that still have content after stripping
        tokenized_docs_no_punctuation.append([t for t in stripped if t != u''])
    print(tokenized_docs_no_punctuation)


main2()
# 执行结果
[['It', 'is', 'a', 'pleasant', 'evening', '.'], ['Guests', ',', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty', '.']]
[['It', 'is', 'a', 'pleasant', 'evening'], ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty']]
1.2.2 文本的大小写转换
# lower()小写
# upper()大写
def main3():
    """Demonstrate case conversion with str.lower() and str.upper()."""
    text = 'HARdWork IS KEy to SUCCESS'
    print(text.lower())
    print(text.upper())


# BUG FIX: the original wrote `main3` (no parentheses), which only references
# the function object and never calls it — so the advertised output was never
# produced. Adding `()` actually invokes it.
main3()
# 执行结果
hardwork is key to success
HARDWORK IS KEY TO SUCCESS
1.2.3 处理停止词
# 停止词是指在执行信息检索任务或其他自然语言处理任务时需要被过滤掉的词,
# 因为这些词对理解句子的整体意思没有多大的意义。
def main4():
    """Filter English stop words out of a short token list and print the rest."""
    from nltk.corpus import stopwords

    english_stops = set(stopwords.words('english'))
    # 'to' is an English stop word, so it gets dropped below
    tokens = ["Don't", 'hesitate', 'to', 'ask', 'questions']
    kept = [tok for tok in tokens if tok not in english_stops]
    print(kept)


main4()
# 执行结果
["Don't", 'hesitate', 'ask', 'questions']
def main5():
    """List the languages NLTK ships stop words for, then sample Arabic stops."""
    from nltk.corpus import stopwords

    # Every language with a bundled stop-word file
    print(stopwords.fileids())

    arabic_stops = list(stopwords.words('arabic'))
    # Show the first five Arabic stop words
    print(arabic_stops[:5])


main5()
# 执行结果
['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
['إذ', 'إذا', 'إذما', 'إذن', 'أف']
1.2.4 计算英语中的停止词
def main6():
    """Print sample stop words, then the non-stop-word token ratio of two corpora."""
    import nltk
    from nltk.corpus import stopwords

    # FIX: the original bound the list to the name `stopwords`, shadowing the
    # imported corpus module — confusing and error-prone. Use a distinct name.
    english_stops = stopwords.words('english')
    print(english_stops[:5])

    def para_fraction(text):
        """Return the fraction of tokens in `text` that are not English stop words."""
        # FIX: membership tests against a list are O(n) per token; over the
        # Reuters/inaugural corpora that was accidentally quadratic. A set
        # gives O(1) lookups with identical results.
        stops = set(nltk.corpus.stopwords.words('english'))
        para = [w for w in text if w.lower() not in stops]
        return len(para) / len(text)

    # Sample of the Reuters corpus words
    print(nltk.corpus.reuters.words())
    # Ratio of content (non-stop-word) tokens in each corpus
    print(para_fraction(nltk.corpus.reuters.words()))
    print(para_fraction(nltk.corpus.inaugural.words()))


main6()
# 执行结果
['i', 'me', 'my', 'myself', 'we']
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]
0.735240435097661
0.5228599855902837