Python爬虫商业项目实战，python中Counter用法实例

最新推荐文章于 2024-03-14 22:12:53 发布

林森见鹿

最新推荐文章于 2024-03-14 22:12:53 发布

阅读量360

点赞数

文章标签： python爬虫爬虫项目实战

本文链接：https://blog.csdn.net/qianyuanruqu/article/details/103399036

版权

爬虫案例，对美国总统的一篇演讲稿分析

要点： 1、Counter用法：用于计数统计，类似于TF-IDF中的词频（TF）统计，常用的方法有subtract、update 2、python中'delimiter'.join(words)的使用：用delimiter作为分隔符，把words中的各个字符串连接成一个字符串（与split按分隔符拆分正好相反） from urllib.request import urlopen from bs4 import BeautifulSoup import re import string from collections import Counter

def cleanSentence(sentence):
    """Split one sentence into cleaned words.

    The sentence is split on single spaces, every token has leading and
    trailing punctuation/whitespace stripped, and tokens of length 0 or 1
    are dropped unless they are the words "a" or "i" (case-insensitive).

    Args:
        sentence: The sentence text to tokenize.

    Returns:
        The list of cleaned words, in their original order.
    """
    # Characters removed from both ends of every token.
    strip_chars = string.punctuation + string.whitespace
    words = [word.strip(strip_chars) for word in sentence.split(' ')]
    # Keep multi-character words; of the single-character tokens only
    # "a" and "i" are kept.
    words = [word for word in words
             if len(word) > 1 or word.lower() in ('a', 'i')]
    print('cleanSentence>', words)
    return words


def cleanInput(content):
    """Normalize raw text and break it into sentences of cleaned words.

    Args:
        content: The raw text to process.

    Returns:
        A list with one entry per sentence; each entry is the word list
        produced by ``cleanSentence``.
    """
    content = content.upper()
    # Replace newlines with spaces so sentences can span several lines.
    content = content.replace('\n', ' ')
    # Encode as UTF-8, then decode as ASCII ignoring errors: this drops every
    # non-ASCII character, so the analysis is only suitable for English text.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    # Sentences are delimited by a period followed by a space.
    sentences = content.split('. ')
    print('cleanInput>', sentences)
    return [cleanSentence(sentence) for sentence in sentences]


def getNgramFromSentence(content, n):
    """Return every run of ``n`` consecutive items of ``content``.

    Args:
        content: A sequence of words (one sentence).
        n: The n-gram size.

    Returns:
        A list of slices of ``content``, each of length ``n``; empty when
        ``content`` has fewer than ``n`` items.
    """
    output = [content[i:i + n] for i in range(len(content) - n + 1)]
    print('getNgramFromSentence>', output)
    return output


def getNgrams(content, n):
    """Count the n-grams of a text, sentence by sentence.

    N-grams never cross sentence boundaries because every sentence is
    processed separately.

    Args:
        content: The raw text to analyze.
        n: The n-gram size (2 for bigrams, 3 for trigrams, ...).

    Returns:
        A ``Counter`` mapping each space-joined n-gram to its number of
        occurrences.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Honor the requested size instead of a hard-coded 2.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramFromSentence(sentence, n)
        )
    return ngrams


if __name__ == '__main__':
    # Download the speech only when run as a script, and close the HTTP
    # response as soon as its body has been read.
    with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
        content = str(response.read(), 'utf-8')
    ngrams = getNgrams(content, 2)
    print(ngrams)