import requests
from lxml import etree
import jieba
import numpy as np
import matplotlib.pyplot as plt
import time
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
def _thread_page_url(url, page):
    """Return *url* with its page-number field replaced by *page*.

    The URL is expected to end in ``-<page>-<suffix>``, for example
    ``.../thread-924213-1-1.html``.  Splitting from the right keeps any
    hyphen that appears earlier in the URL (such as in the host name) intact.

    Raises:
        ValueError: if *url* contains fewer than two hyphens.
    """
    prefix, _old_page, suffix = url.rsplit('-', 2)
    return prefix + '-' + str(page) + '-' + suffix


def _fetch_tree(url, timeout):
    """Download *url* and return its body parsed as an lxml HTML tree."""
    response = requests.get(url, timeout=timeout)
    return etree.HTML(response.text)


def _post_text(tree):
    """Join the direct text nodes of every element whose class is ``t_f``."""
    return ''.join(tree.xpath('//*[@class="t_f"]/text()'))


def _collect_thread_text(url, timeout):
    """Return the concatenated ``t_f`` text of every page of the thread.

    The page count is read from the text of the elements whose class is
    ``last`` on the page at *url*.  When no such element exists, only the
    page at *url* is used.
    """
    first_tree = _fetch_tree(url, timeout)
    last_labels = first_tree.xpath('//*[@class="last"]/text()')
    if not last_labels:
        print('回复只有 1 页,这数据是不是太少了...')
        return _post_text(first_tree)

    # Use the final match instead of a fixed index 1: with two matches this
    # is the same element, and with a single match it no longer raises
    # IndexError.
    # NOTE(review): presumably the pager is rendered more than once per
    # page, which is why index 1 was used originally - confirm.
    pagenum = last_labels[-1].replace('... ', '')
    print('回复共 ' + str(pagenum) + ' 页,请稍候...')

    pages = []
    for i in range(1, int(pagenum) + 1):
        print('第 ' + str(i) + ' 页')
        pages.append(_post_text(_fetch_tree(_thread_page_url(url, i), timeout)))
        time.sleep(1)  # give the server some time to breathe
    return ''.join(pages)


def _show_wordcloud(text, mask_path, font_path):
    """Render *text* as a word cloud shaped and coloured by the mask image."""
    # Load the pixels inside a context manager so the image file is closed
    # as soon as the array has been built.
    with Image.open(mask_path) as mask_file:
        background_image = np.array(mask_file)
    cloud = WordCloud(
        font_path=font_path,
        background_color="white",
        mask=background_image,
    ).generate(text)
    image_colors = ImageColorGenerator(background_image)
    plt.imshow(cloud.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")
    plt.show()


def GetData(url, mask_path='c:\\bg.jpg',
            font_path="C:/Windows/Fonts/simfang.ttf", timeout=10):
    """Scrape the text of a forum thread and display it as a word cloud.

    Args:
        url: Address of a thread page whose last two hyphen-separated fields
            are the page number and a suffix, e.g.
            ``https://www.52pojie.cn/thread-924213-1-1.html``.
        mask_path: Image file used both as the cloud's mask and as the
            source of its colours.
        font_path: Font file handed to ``WordCloud``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None.  Progress is printed and the figure is shown with matplotlib.

    Raises:
        IndexError / ValueError: when the page count or the URL cannot be
            parsed.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    text = _collect_thread_text(url, timeout)
    # NOTE(review): the text is handed to WordCloud unsegmented; jieba is
    # imported at file level but not used here - confirm whether word
    # segmentation was intended.
    _show_wordcloud(text, mask_path, font_path)
if __name__ == '__main__':
    # Script entry point: announce the start, then build the word cloud
    # for one fixed thread.
    print('开始获取,请稍候...')
    thread_url = 'https://www.52pojie.cn/thread-924213-1-1.html'
    GetData(thread_url)