'''
Python 新浪实时新闻词云 by 郑瑞国
'''
from wordcloud import WordCloud
import re
import urllib.request
import time
def get_wordcloud(text):
    """Build a word cloud from ``text`` and display the resulting image.

    The cloud is 1200x600, limited to 50 words, and rendered with the
    ``simfang.ttf`` font from the Windows font directory.
    """
    cloud = WordCloud(
        font_path="c:/Windows/Fonts/simfang.ttf",
        width=1200,
        height=600,
        max_words=50,
    )
    rendered = cloud.generate(text)
    picture = rendered.to_image()
    picture.show()
def get_wordcloud_fromfile(path=r'd:\test.txt'):
    """Display a word cloud of the most recently saved headlines.

    Only the last 8 lines of the headline file are used, newest first.

    Args:
        path: Text file holding one headline per line. Defaults to the
            file this script appends headlines to.

    Raises:
        OSError: If the file cannot be opened (for example, it does not
            exist yet).
    """
    # Read with the platform default encoding, the same way the file is
    # written, so both sides agree on the bytes on disk.
    with open(path, 'rt') as f:
        lines = f.readlines()

    # Negative-step slice: the last 8 lines in reverse (newest-first) order.
    recent = lines[:-9:-1]

    # Join the raw lines rather than passing str(list): the list repr would
    # feed brackets, quotes and escaped "\n" sequences to the tokenizer.
    get_wordcloud("".join(recent))
def open_url(url):
    """Fetch ``url`` and return its body as text.

    The body is decoded as UTF-8; byte sequences that are not valid UTF-8
    are silently dropped.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
    """
    # The context manager closes the underlying connection even when
    # reading fails, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")
def find_url(url):
    """Return the absolute link targets found on the page at ``url``.

    Every double-quoted ``href`` value that starts with ``http://`` or
    ``https://`` is returned, in document order; duplicates are kept.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of URL strings.
    """
    # Accept https as well as http so that links on TLS-only sites are not
    # silently skipped.
    return re.findall(r'href="(https?://.*?)"', open_url(url))
def find_text(url):
    """Return the visible text of every ``<a>`` element on the page at ``url``.

    Args:
        url: Address of the page to scan.

    Returns:
        A list with one string per anchor whose opening and closing tags
        enclose content on a single line, in document order. Markup
        nested inside an anchor is removed, so an image-only link yields
        an empty string.
    """
    html = open_url(url)
    # Match the opening tag with a bounded [^>]* instead of a greedy ".*":
    # the greedy form swallowed every earlier anchor on the same line and
    # also matched other tags whose names start with "a" (<abbr>, <article>).
    anchors = re.findall(r'<a(?:\s[^>]*)?>(.*?)</a>', html)
    # Drop nested tags (<span>, <img>, ...) so only the link text remains.
    return [re.sub(r'<[^>]*>', '', inner) for inner in anchors]
def save_text(text, path=r'd:\test.txt'):
    """Append unseen headlines to the headline file and echo them to stdout.

    A headline is stored only when it is longer than 8 characters and is
    not already present, either in the file or earlier in the same batch.

    Args:
        text: Iterable of headline strings.
        path: File to append to, one headline per line. It is read and
            written with the platform default encoding.
    """
    seen = set()
    try:
        with open(path, 'r') as existing:
            seen.update(existing)
    except (OSError, UnicodeError):
        # Best effort: a missing or unreadable file only means there is
        # nothing (more) to de-duplicate against.
        pass

    with open(path, 'a') as out:
        for headline in text:
            if len(headline) <= 8:
                continue
            line = headline + "\n"
            if line in seen:
                continue
            try:
                out.write(line)
            except UnicodeEncodeError:
                # The headline is not representable in the file's
                # encoding; skip it rather than abort the batch.
                continue
            # Remember it so repeats within this batch are not rewritten.
            seen.add(line)
            try:
                print(headline)
            except UnicodeEncodeError:
                # The console may use a narrower encoding than the file.
                pass
def main():
    """Crawl the pages linked from the Sina news front page in an endless loop.

    The link list is collected once at start-up. Each cycle then fetches
    every linked page, stores its new headlines, shows a word cloud of the
    latest ones, and waits 20 seconds before starting over.
    """
    import sys

    url = 'http://news.sina.com.cn/'
    url_list = find_url(url)
    while True:
        for c_url in url_list:
            try:
                save_text(find_text(c_url))
            except Exception as err:
                # One dead or malformed link must not stop the crawl.
                # KeyboardInterrupt is not an Exception, so Ctrl-C still
                # ends the program.
                print('skipped', c_url, err, file=sys.stderr)
        try:
            get_wordcloud_fromfile()
        except Exception as err:
            print('word cloud failed:', err, file=sys.stderr)
        # Always pause between cycles, even when rendering failed, so the
        # site is not re-crawled back to back.
        time.sleep(20)


if __name__ == "__main__":
    main()