# python 爬取《延禧攻略》所有的演员参演的电视剧
# (Crawl every TV series featuring the cast of "Story of Yanxi Palace")

# -*- coding: utf-8 -*-
#@Time :18-9-23 上午11:22
#@Author : LiMeng
#@Email : 925762221@qq.com
#@File : yanxigonglvu.py
#Software:PyCharm
import  requests
import  ppretty
import collections
from wordcloud import WordCloud
from bs4 import  BeautifulSoup
import matplotlib.pyplot as plt
from scipy.misc import imread
import jieba
def get():
    """Scrape the cast page of "Story of Yanxi Palace" on tvzn.com.

    Collects (a) the TV-series titles of every lead actor's filmography
    via a second-level crawl of each actor's profile page, and (b) the
    names of the supporting cast. Writes the names to ./data.txt and
    renders a word cloud of series-title frequencies.

    Side effects: HTTP requests, writes ./data.txt, opens a plot window.
    Returns None.
    """
    url = 'http://www.tvzn.com/14784/yanyuanbiao.html'
    res = requests.get(url=url)
    html = res.content
    dianshiju_list = []  # TV-series titles collected across all lead actors
    nameList = []        # supporting-cast actor names
    soup = BeautifulSoup(html, 'lxml')

    # 主演 — lead actors: follow each actor's profile link and gather the
    # titles of every series they appeared in.
    contents1 = soup.find('ul', attrs={'class': 'gclearfix'}).findAll("li")
    for content in contents1:
        actorNamezhuyan = content.find('a', attrs={'class': 'mh-actor'})
        href = actorNamezhuyan.attrs['href']
        # Second-level crawl: the actor's own page lists their filmography.
        res1 = requests.get('http://www.tvzn.com/' + href)
        soup1 = BeautifulSoup(res1.text, "lxml")
        content1 = soup1.find('ul', attrs={'class': 'tn-avatar-list tn-helper-reset tn-helper-clearfix'})
        for x in content1.strings:  # every text node under the list
            if x:  # some nodes carry no text — filter them out
                dianshiju_list.append(x)

    # 配角 — supporting cast: collect names only.
    contents2 = soup.find('div', attrs={'class': 'mh-name-list'}).findAll('li')
    for contentx in contents2:
        for p_tag in contentx.findAll('p'):
            nameList.append(p_tag.find('', attrs={'class': 'mh-actor'}).getText())

    # Surname/given-name breakdown of the supporting cast.
    # BUG FIX: the original incremented surname_dict inside the loop over the
    # given-name characters, counting a surname once per character and never
    # counting it for names where actorname[2:] is empty. Count once per actor.
    # NOTE(review): index 1 is skipped (actorname[2:]) — presumably the site
    # inserts a separator or two-char given names start at 2; verify.
    surnamelist = []
    givennamelist = []
    surname_dict = {}
    for actorname in nameList:
        surname = actorname[0]
        surnamelist.append(surname)
        givennamelist.extend(actorname[2:])
        surname_dict[surname] = surname_dict.get(surname, 0) + 1

    # Persist the names. Use a context manager so the file is closed even on
    # error, and an explicit encoding since the names are Chinese text.
    with open('./data.txt', 'w', encoding='utf-8') as out_file:
        for name in nameList:
            out_file.write(name)
            out_file.write(" ")

    word_count = collections.Counter(dianshiju_list)
    # FIX: scipy.misc.imread was removed in SciPy 1.2 — use matplotlib's
    # imread (plt is already imported) to load the mask image.
    bg_pic = plt.imread('mask.jpeg')
    wordcloud = WordCloud(font_path='./simhei.ttf', mask=bg_pic, background_color="white",
                          width=1000, height=860,
                          margin=2).generate_from_frequencies(word_count)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    get()



 

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值