# 话不多说，先上效果图
import re
import requests
import jieba
import wordcloud
from lxml import etree
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
# Multi-threaded crawler: collects the names of the Douban movies that have
# the most featured reviews, and builds a word cloud from the review texts.
# Disabled alternative: select the 'Microsoft YaHei' font globally for
# matplotlib text instead of passing a font to each call.
#matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Draw minus signs with an ASCII hyphen rather than the Unicode minus glyph
# (presumably because the CJK font in use lacks that glyph -- TODO confirm).
matplotlib.rcParams['axes.unicode_minus'] = False
def getHtml(targetUrl, timeout=10):
    """Download a page and return its decoded text.

    The request is sent with a desktop Firefox User-Agent and uses the
    target URL itself as the Referer header.

    Args:
        targetUrl: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the calling thread forever.

    Returns:
        The response body as text, decoded with the encoding detected from
        the content, or None when the request fails or the server answers
        with an HTTP error status.
    """
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Referer': targetUrl,
    }
    try:
        r = requests.get(targetUrl, headers=my_headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; KeyboardInterrupt and
        # programming errors are no longer swallowed by a bare except.
        print('抓取错误,返回异常,爬虫结束...')
        return None
    # Trust the encoding sniffed from the body over the HTTP header.
    r.encoding = r.apparent_encoding
    return r.text
def getContent(content):
    """Extract the short review texts from a page and collect them.

    Every ``<div class="short-content">`` element is flattened to its text
    and appended to the module-level list ``content_list_full``.

    Args:
        content: HTML source of a review listing page. None or an empty
            string (for example after a failed download) is ignored.

    Returns:
        None. Results are appended to ``content_list_full``.
    """
    if not content:
        # A failed download hands us None; there is nothing to parse.
        return
    html_xpath = etree.HTML(content)
    if html_xpath is None:
        # The parser produced no document tree for this input.
        return
    review_nodes = html_xpath.xpath(r"//div[@class='short-content']")
    content_list_full.extend(
        [node.xpath("string(.)") for node in review_nodes]
    )
def getJiebaStr(content_list_full):
    """Merge review texts, strip filler words and segment them with jieba.

    Args:
        content_list_full: Iterable of review text strings.

    Returns:
        The tokens produced by jieba's full-mode cut of the merged text,
        joined into one string with "|" as the separator.
    """
    merged = "".join(content_list_full)
    # Remove these words, in this order, before segmentation.
    for filler in ("影评", "可能", "展开", "电影", "没有"):
        merged = merged.replace(filler, "")
    tokens = jieba.cut(merged, cut_all=True)
    return "|".join(tokens)
def getWordcloud(content_jieba_str, output_path="d://1.jpg",
                 mask_path="ciyun.jpg", font_path="simsun.ttc"):
    """Render a word cloud from segmented text and save it as an image.

    Args:
        content_jieba_str: Text to visualise, with words separated by
            non-word characters (the caller joins them with "|").
        output_path: File the image is written to.
        mask_path: Image file read with ``plt.imread`` and used as the
            shape mask of the cloud.
        font_path: Font file used to draw the words.

    Returns:
        None. The image is written to ``output_path``; nothing is written
        when the input text is empty.
    """
    if not content_jieba_str or not content_jieba_str.strip():
        # Generating a cloud needs at least one word; skip instead of
        # failing when no review text was collected.
        print('没有可用于生成词云的文本,已跳过词云生成...')
        return
    wc = wordcloud.WordCloud(
        font_path=font_path,
        # NOTE(review): per the wordcloud documentation, width/height are
        # ignored when a mask is given -- kept for parity with the original.
        width=1920,
        height=1080,
        background_color="white",
        mask=plt.imread(mask_path),
    )
    wc.generate(content_jieba_str)
    wc.to_file(output_path)
# Serialises updates of the shared ``hotCounter`` dict: getHotList is run
# from several threads at once and its increment is a read-modify-write.
_HOT_COUNTER_LOCK = threading.Lock()


def getHotList(hotContent):
    """Count how often each movie poster appears on a review listing page.

    Every ``<img alt=... title=... src=... rel="v:image" />`` tag is matched
    and its ``alt`` text is used as the key in the module-level dict
    ``hotCounter``. Counts are stored as decimal strings, as before.

    Args:
        hotContent: HTML source of a review listing page. None or an empty
            string (for example after a failed download) is ignored.

    Returns:
        None. Results are accumulated in ``hotCounter``.
    """
    if not hotContent:
        # A failed download hands us None, which re.findall cannot scan.
        return
    pattern = r'<img alt="(.*?)" title="(.*?)" src="(.*?)" rel="v:image" />'
    matches = re.findall(pattern, hotContent)
    with _HOT_COUNTER_LOCK:
        for alt_text, _title, _src in matches:
            previous = int(hotCounter.get(alt_text, '0'))
            hotCounter[alt_text] = str(previous + 1)
def printPic(moviesName, moviesNum):
    """Show a horizontal bar chart of review counts per movie.

    At most the first 25 entries are drawn. Bars are placed at y = 0, 1, ...
    so the first entry ends up at the bottom of the chart.

    Args:
        moviesName: Movie titles, used as y-axis tick labels.
        moviesNum: Review counts, parallel to ``moviesName``.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    myfont = matplotlib.font_manager.FontProperties(fname='simsun.ttc')
    # Never index past the end of either list if their lengths differ.
    showN = min(len(moviesName), len(moviesNum), 25)
    shown_names = moviesName[:showN]
    shown_counts = moviesNum[:showN]

    plt.barh(range(showN), shown_counts, height=0.4, color='steelblue', alpha=0.4)
    plt.yticks(range(showN), shown_names, fontproperties=myfont)
    # Keep the original 0..9 window for small counts, but widen it when a
    # movie has more reviews so its bar and label are not clipped.
    plt.xlim(0, max(9, max(shown_counts, default=0) + 1))
    plt.ylabel("电影名称", fontproperties=myfont)
    plt.xlabel("热评数量", fontproperties=myfont)
    # Write each count just to the right of the end of its bar.
    for row, num in enumerate(shown_counts):
        plt.text(num + 0.2, row - 0.1, str(num))
    plt.show()
def getContentThread(page):
    """Thread worker: fetch one review listing page and collect its reviews.

    Args:
        page: Zero-based page index; the listing offset is ``page * 20``.
    """
    offset = page * 20
    getContent(getHtml(startUrl + str(offset)))
def getHotListThread(page):
    """Thread worker: fetch one review listing page and count its movies.

    Args:
        page: Zero-based page index; the listing offset is ``page * 20``.
    """
    offset = page * 20
    url = "https://movie.douban.com/review/best/?start=" + str(offset)
    getHotList(getHtml(url))
def is_uchar(uchar):
    """Return True when ``uchar`` sorts between U+4E00 and U+9FA5 inclusive.

    For a single character this means it is a Chinese character in that
    range. Longer strings are compared lexicographically, following normal
    Python string comparison.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
def _run_in_threads(worker, pages, timeout=30):
    """Run ``worker(page)`` in one thread per page and wait for them.

    Args:
        worker: Callable taking a single page index.
        pages: Iterable of page indexes.
        timeout: Upper bound, in seconds, on the total time spent waiting.
            Threads are daemons, so a worker that is still stuck after the
            deadline cannot keep the program alive.
    """
    threads = [
        threading.Thread(target=worker, args=(page,), daemon=True)
        for page in pages
    ]
    for thread in threads:
        thread.start()
    deadline = time.monotonic() + timeout
    for thread in threads:
        thread.join(max(0.0, deadline - time.monotonic()))


if __name__ == '__main__':
    # Module-level state shared with the worker functions above.
    startUrl = 'https://movie.douban.com/review/best/?start='
    content_list_full = []
    hotCounter = {}
    moviesName = []
    moviesNum = []

    # Step 1: count movies over 20 listing pages and draw the ranking chart.
    # Joining the threads replaces the fixed sleep, which could read the
    # counters before slow downloads had finished.
    _run_in_threads(getHotListThread, range(0, 20))
    # Counts are stored as strings; compare them numerically so that '10'
    # ranks above '9'.
    sortedHotCounter = sorted(
        hotCounter.items(), key=lambda item: int(item[1]), reverse=True
    )
    for key, value in sortedHotCounter:
        if is_uchar(key):
            moviesName.append(key)
            moviesNum.append(int(value))
    print(moviesName)
    printPic(moviesName, moviesNum)

    # Step 2: collect review texts from 9 listing pages and build the cloud.
    _run_in_threads(getContentThread, range(0, 9))
    content_jieba_str = getJiebaStr(content_list_full)
    getWordcloud(content_jieba_str)