#0715-----------------------------
#数据清洗
'''
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
def ngrams(input, n):
    """Split text into character n-grams, one punctuation-delimited piece at a time.

    ASCII spaces are removed, the text is cut at common Chinese punctuation
    marks, and every run of ``n`` consecutive characters inside each piece
    is collected, so no n-gram spans a punctuation mark.

    Args:
        input: The text to process. The name shadows the builtin but is
            kept so existing callers keep working.
        n: Length of each n-gram in characters; expected to be >= 1.

    Returns:
        A list of n-gram strings in order of appearance, duplicates
        included. Pieces shorter than ``n`` contribute nothing.
    """
    text = input.replace(" ", "")
    # str.split() treats its argument as a literal separator, so the
    # character class has to go through re.split() to cut at punctuation.
    segments = re.split("[,。、?!:“”]", text)
    output = []
    for segment in segments:
        # A segment of length m holds m - n + 1 n-grams; range() is empty
        # when the segment is shorter than n.
        output.extend(segment[i:i + n] for i in range(len(segment) - n + 1))
    return output
# Fetch the Baidu Baike entry and extract the text of the first
# <div class="para"> element. The response is used as a context manager so
# the connection is closed once the page has been parsed.
with urlopen("https://baike.baidu.com/item/%E5%BC%A0%E4%BA%91%E9%9B%B7/17149") as html:
    bs0bj = BeautifulSoup(html, "html.parser")
content = bs0bj.find("div", {"class": "para"}).get_text()
# Keep the result under its own name: rebinding `ngrams` would replace the
# function with a list and break any later call to ngrams().
bigrams = ngrams(content, 2)
print(bigrams)
print(len(bigrams))
'''
#排序 -------将序列频率转换成OrderedDict对象
'''
from collections import Counter, OrderedDict

# ngrams() returns a plain list, which has no .items(); count how often each
# 2-gram occurs first, then order the (ngram, count) pairs by descending count.
# NOTE: `ngrams` must still be bound to the function and `content` to the
# scraped text when this runs.
ngram_counts = Counter(ngrams(content, 2))
ngram_counts = OrderedDict(
    sorted(ngram_counts.items(), key=lambda t: t[1], reverse=True)
)
print(ngram_counts)
'''
# 0715----------爬虫
# 最新推荐文章于 2023-05-13 00:28:32 发布