# python 爬取《延禧攻略》所有的演员参演的电视剧
# (Crawl every TV series featuring the cast of "Story of Yanxi Palace")

# -*- coding: utf-8 -*-
#@Time :18-9-23 上午11:22
#@Author : LiMeng
#@Email : 925762221@qq.com
#@File : yanxigonglvu.py
#Software:PyCharm
import  requests
import  ppretty
import collections
from wordcloud import WordCloud
from bs4 import  BeautifulSoup
import matplotlib.pyplot as plt
from scipy.misc import imread
import jieba
def get():
    """Scrape the cast page of "Story of Yanxi Palace" on tvzn.com.

    Collects (a) the TV-series titles of every lead actor's filmography
    via a second-level crawl of each actor's profile page, and (b) the
    names of the supporting cast. Writes the names to ./data.txt and
    renders a word cloud of series-title frequencies.

    Side effects: HTTP requests, writes ./data.txt, opens a plot window.
    Returns None.
    """
    url = 'http://www.tvzn.com/14784/yanyuanbiao.html'
    res = requests.get(url=url)
    html = res.content
    dianshiju_list = []  # TV-series titles collected across all lead actors
    nameList = []        # supporting-cast actor names
    soup = BeautifulSoup(html, 'lxml')

    # 主演 — lead actors: follow each actor's profile link and gather the
    # titles of every series they appeared in.
    contents1 = soup.find('ul', attrs={'class': 'gclearfix'}).findAll("li")
    for content in contents1:
        actorNamezhuyan = content.find('a', attrs={'class': 'mh-actor'})
        href = actorNamezhuyan.attrs['href']
        # Second-level crawl: the actor's own page lists their filmography.
        res1 = requests.get('http://www.tvzn.com/' + href)
        soup1 = BeautifulSoup(res1.text, "lxml")
        content1 = soup1.find('ul', attrs={'class': 'tn-avatar-list tn-helper-reset tn-helper-clearfix'})
        for x in content1.strings:  # every text node under the list
            if x:  # some nodes carry no text — filter them out
                dianshiju_list.append(x)

    # 配角 — supporting cast: collect names only.
    contents2 = soup.find('div', attrs={'class': 'mh-name-list'}).findAll('li')
    for contentx in contents2:
        for p_tag in contentx.findAll('p'):
            nameList.append(p_tag.find('', attrs={'class': 'mh-actor'}).getText())

    # Surname/given-name breakdown of the supporting cast.
    # BUG FIX: the original incremented surname_dict inside the loop over the
    # given-name characters, counting a surname once per character and never
    # counting it for names where actorname[2:] is empty. Count once per actor.
    # NOTE(review): index 1 is skipped (actorname[2:]) — presumably the site
    # inserts a separator or two-char given names start at 2; verify.
    surnamelist = []
    givennamelist = []
    surname_dict = {}
    for actorname in nameList:
        surname = actorname[0]
        surnamelist.append(surname)
        givennamelist.extend(actorname[2:])
        surname_dict[surname] = surname_dict.get(surname, 0) + 1

    # Persist the names. Use a context manager so the file is closed even on
    # error, and an explicit encoding since the names are Chinese text.
    with open('./data.txt', 'w', encoding='utf-8') as out_file:
        for name in nameList:
            out_file.write(name)
            out_file.write(" ")

    word_count = collections.Counter(dianshiju_list)
    # FIX: scipy.misc.imread was removed in SciPy 1.2 — use matplotlib's
    # imread (plt is already imported) to load the mask image.
    bg_pic = plt.imread('mask.jpeg')
    wordcloud = WordCloud(font_path='./simhei.ttf', mask=bg_pic, background_color="white",
                          width=1000, height=860,
                          margin=2).generate_from_frequencies(word_count)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    get()



 

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值