from bs4 import BeautifulSoup
import requests
import jieba
import operator
# Scrape one Sohu news article, segment its Chinese text with jieba,
# and print word frequencies in descending order.
url = "https://www.sohu.com/a/667138596_121332532?edtsign=97FD0595D380312998E3F9B2DF745EEDB3FF46D4&edtcode=v1kPcsuVpwONqQTmI9mDPg%3D%3D&scm=1103.plate:280:0.0.1_1.0&spm=smpc.home.top-news2.1.1681652134887q6G7VGh_1467&_f=index_news_0"
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
article = soup.find("article", attrs={"class": "article"})

# Gather all paragraph text; str.join avoids the quadratic `data += ...`
# concatenation loop of the original.
data = "".join(p_tag.text for p_tag in article.find_all("p"))

# Persist the raw text and read it back (round-trip kept from original).
with open("中文分字.txt", "w", encoding="utf-8") as f:
    f.write(data)
with open("中文分字.txt", "r", encoding="utf-8") as fp:
    data = fp.read()

# Remove punctuation and whitespace in a single C-level translate pass.
data = data.translate({ord(c): None for c in " ( ) , . ,。“ ” :;!、| \n / 《 》"})

# BUG FIX: jieba.cut() returns a one-shot generator. The original code
# iterated it once to print the segmentation, then iterated it AGAIN to
# count frequencies — the second pass saw nothing and word_freq was
# always empty. Materialize the tokens once so both passes see them.
words = list(jieba.cut(data))
for word in words:
    print(word, '/', end="")

# Count word frequencies (dict.get with default replaces the
# if/else branch and the debug prints of the original).
word_freq = dict()
for word in words:
    word_freq[word] = word_freq.get(word, 0) + 1
print(word_freq)

# Report words ordered by frequency, highest first.
ordered_freq = sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
for w, c in ordered_freq:
    print(w, c)