#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# Download the NLTK data packages
# import nltk
# nltk.download()
#Create a Text object
# from nltk import word_tokenize
# from nltk import Text
# tokens=word_tokenize("here is some not very interesting text")
# text=Text(tokens)
#Count word frequencies
# from nltk import FreqDist
# from nltk.book import *
# #Ratio of distinct words to total words in the book
# #len(text6)/len(words)
#
# #Top ten most frequent words
# fdist=FreqDist(text6)
# fdist.most_common(10)
# #Look up the frequency of a specific word
# fdist["Grail"]
#Build and query a bigram (2-gram) model
# from nltk import bigrams
# from nltk.book import *
# bigrams=bigrams(text6,2)
# bigramsDict=FreqDist(bigrams)
# bigramsDict[("Sir","Robin")]
#Part-of-speech tagging with NLTK
# from nltk import word_tokenize
# from nltk import pos_tag
# text=word_tokenize("the dust was thick so he had to dust")
# pos_tag(text)
#Find sentences where "google" is used as a noun (vs. the verb "to google")
from nltk import word_tokenize, sent_tokenize, pos_tag

# Split the sample text into sentences. The original string had no space
# after "world.", so sent_tokenize could not split it into two sentences,
# defeating the per-sentence noun/verb distinction — fixed here.
sentences = sent_tokenize(
    "Google is one of the best companies in the world. "
    "I constantly google myself to see what i am up to"
)

# Penn Treebank noun tags: singular, plural, proper singular, proper plural.
nouns = {'NN', 'NNS', 'NNP', 'NNPS'}

for sentence in sentences:
    # Cheap pre-filter: skip sentences that never mention "google".
    if "google" not in sentence.lower():
        continue
    tagged_words = pos_tag(word_tokenize(sentence))
    for word, tag in tagged_words:
        if word.lower() == 'google' and tag in nouns:
            print(sentence)
            break  # print each matching sentence once, not once per token