Python爬虫实战项目 ----- 多线程爬取豆瓣热评较多的电影名称和热评词云

最新推荐文章于 2024-03-16 11:40:23 发布

henuyl

最新推荐文章于 2024-03-16 11:40:23 发布

阅读量1k

点赞数

分类专栏：语言类

本文链接：https://blog.csdn.net/henuyl/article/details/107532586

版权

语言类专栏收录该内容

23 篇文章 0 订阅

订阅专栏

话不多说，先上效果图

import re
import requests
import jieba
import wordcloud
from lxml import etree
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
# Multi-threaded scraper: finds the Douban movies with the most popular reviews
# and builds a word cloud from those reviews.

# Optional font override, left disabled by the author:
#matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph,
# which CJK fonts frequently lack.
matplotlib.rcParams['axes.unicode_minus'] = False

def getHtml(targetUrl, timeout=10):
    """Download a page and return its decoded text.

    Args:
        targetUrl: URL to fetch; it is also sent as the ``Referer`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely, which would stall
            the worker thread that called this function.

    Returns:
        The response body as ``str``, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). Callers must be
        prepared for ``None``.
    """
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Referer': targetUrl
    }
    try:
        r = requests.get(targetUrl, headers=my_headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; programming errors and
        # KeyboardInterrupt are no longer silently swallowed.
        print('抓取错误,返回异常,爬虫结束...')
        return None
    # Use the encoding detected from the body rather than the header guess.
    r.encoding = r.apparent_encoding
    return r.text

def getContent(content):
    """Extract the short review texts from a page and store them.

    Every ``<div class="short-content">`` element is flattened to plain text
    and appended to the module-level ``content_list_full`` list.

    Args:
        content: HTML source of a review list page, or ``None``/empty when
            the download failed; in that case nothing is collected.
    """
    if not content:
        # A failed download yields None; the parser would raise on it.
        return
    html_xpath = etree.HTML(content)
    if html_xpath is None:
        # The HTML parser can produce no root element for a blank document.
        return
    review_nodes = html_xpath.xpath(r"//div[@class='short-content']")
    content_list_full.extend(
        [node.xpath("string(.)") for node in review_nodes]
    )

def getJiebaStr(content_list_full):
    """Join the collected reviews, drop filler words and segment the text.

    Args:
        content_list_full: Iterable of review strings.

    Returns:
        The tokens produced by ``jieba.cut(..., cut_all=True)``, joined into
        one string with ``|`` as separator.
    """
    merged_text = "".join(content_list_full)
    # Remove frequent filler words, in this order, before segmentation.
    for filler in ("影评", "可能", "展开", "电影", "没有"):
        merged_text = merged_text.replace(filler, "")
    tokens = jieba.cut(merged_text, cut_all=True)
    return "|".join(tokens)

def getWordcloud(content_jieba_str, mask_path="ciyun.jpg",
                 output_path="d://1.jpg", font_path="simsun.ttc"):
    """Render a word cloud image from segmented text and save it to disk.

    Args:
        content_jieba_str: Segmented text to visualise.
        mask_path: Image file read with ``plt.imread`` and passed to
            WordCloud as ``mask``.
        output_path: Destination image file. The default keeps the original
            Windows location; pass another path on other systems.
        font_path: Font file passed to WordCloud as ``font_path``.

    Returns:
        None. When the text is blank nothing is rendered, because WordCloud
        cannot lay out a text without words.
    """
    if not content_jieba_str or not content_jieba_str.strip():
        print('没有可用的评论文本,跳过词云生成...')
        return
    wc = wordcloud.WordCloud(
        font_path=font_path,
        width=1920,
        height=1080,
        background_color="white",
        mask=plt.imread(mask_path)
    )
    wc.generate(content_jieba_str)
    wc.to_file(output_path)

# Serialises updates to hotCounter: getHotList is run from several threads and
# its read-increment-write sequence is not atomic.
_hotCounterLock = threading.Lock()


def getHotList(hotContent):
    """Count how often each movie appears on a "best reviews" page.

    The movie title is taken from the ``alt`` attribute of each poster
    ``<img>`` tag and tallied in the module-level ``hotCounter`` dict. Counts
    are stored as decimal strings (``'1'``, ``'2'``, ...), the format the
    rest of the script reads.

    Args:
        hotContent: HTML source of the page, or ``None``/empty when the
            download failed; in that case nothing is counted.
    """
    if not hotContent:
        # A failed download yields None; re.findall would raise on it.
        return
    pattern = r'<img alt="(.*?)" title="(.*?)" src="(.*?)" rel="v:image" />'
    titles = [match[0] for match in re.findall(pattern, hotContent)]
    with _hotCounterLock:
        for utf_str in titles:
            hotCounter[utf_str] = str(int(hotCounter.get(utf_str, '0')) + 1)

def printPic(moviesName, moviesNum):
    """Show a horizontal bar chart of review counts per movie.

    At most the first 25 entries are drawn.

    Args:
        moviesName: Movie titles, in display order.
        moviesNum: Review counts, parallel to ``moviesName``.
    """
    myfont = matplotlib.font_manager.FontProperties(fname='simsun.ttc')
    showN = min(len(moviesName), 25)
    shownNames = moviesName[:showN]
    shownNums = moviesNum[:showN]

    # Bars are drawn bottom-up.
    plt.barh(range(showN), shownNums, height=0.4, color='steelblue', alpha=0.4)
    plt.yticks(range(showN), shownNames, fontproperties=myfont)
    # Keep the original 0..9 axis for small counts, but widen it when a movie
    # has more reviews so that its bar and value label are not clipped.
    plt.xlim(0, max(9, max(shownNums, default=0) + 1))
    plt.ylabel("电影名称", fontproperties=myfont)
    plt.xlabel("热评数量", fontproperties=myfont)
    # Write each count just to the right of its bar.
    for row, num in enumerate(shownNums):
        plt.text(num + 0.2, row - 0.1, '%s' % num)
    plt.show()

def getContentThread(page):
    """Thread worker: download one review page and collect its review texts.

    Args:
        page: Zero-based page index; the ``start`` offset is ``page * 20``.
    """
    htmlContent = getHtml(startUrl+str(page * 20))
    if htmlContent is None:
        # Download failed; skip parsing instead of raising inside the thread.
        return
    getContent(htmlContent)

def getHotListThread(page):
    """Thread worker: download one review page and tally its movie titles.

    Args:
        page: Zero-based page index; the ``start`` offset is ``page * 20``.
    """
    hotContent = getHtml("https://movie.douban.com/review/best/?start="+ str(page * 20))
    if hotContent is None:
        # Download failed; skip counting instead of raising inside the thread.
        return
    getHotList(hotContent)

def is_uchar(uchar):
    """Return True when ``uchar`` lies between U+4E00 and U+9FA5 inclusive.

    The check is a plain string comparison, so a longer string is compared
    as a whole rather than character by character.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

if __name__ == '__main__':

    startUrl = 'https://movie.douban.com/review/best/?start='
    content_list_full = []   # review texts collected by the content workers
    hotCounter = {}          # movie title -> review count (decimal string)
    moviesName = []
    moviesNum = []

    # --- Ranking chart -----------------------------------------------------
    hotThreads = [
        threading.Thread(target=getHotListThread, args=(page, ))
        for page in range(0, 20)
    ]
    for thd in hotThreads:
        thd.start()
    # Wait for every worker instead of sleeping a fixed time, so the counts
    # are complete before they are read.
    for thd in hotThreads:
        thd.join()
    # Counts are stored as strings; compare them numerically, otherwise '10'
    # would rank below '9'.
    sortedHotCounter = sorted(hotCounter.items(), key=lambda x: int(x[1]), reverse=True)
    for key, value in sortedHotCounter:
        if is_uchar(key):
            moviesName.append(key)
            moviesNum.append(int(value))
    print(moviesName)
    printPic(moviesName, moviesNum)

    # --- Word cloud --------------------------------------------------------
    contentThreads = [
        threading.Thread(target=getContentThread, args=(page, ))
        for page in range(0, 9)
    ]
    for thd in contentThreads:
        thd.start()
    for thd in contentThreads:
        thd.join()
    content_jieba_str = getJiebaStr(content_list_full)
    getWordcloud(content_jieba_str)

henuyl

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
3
评论
Python爬虫实战项目 ----- 多线程爬取豆瓣热评较多的电影名称和热评词云

话不多说，先上效果图。import re；import requests；import jieba；import wordcloud；from lxml import etree；import matplotlib；import numpy as np；import matplotlib.pyplot as plt；import threading；import time。#多线程爬取豆瓣热评较多的电影名称和热评词云 #matplotlib.rcParams['font.sans-serif…
复制链接

扫一扫