python_爬虫获取优酷节目单

使用 requests 和 BeautifulSoup 库抓取优酷视频列表,并将节目类别、片名、主演等信息追加写入 vList.txt。


# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup


def getHtmlText(url, header):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.
        header: Dict of HTTP headers sent with the request.

    Returns:
        The decoded response body. When the request fails or the server
        answers with an error status, the sentinel string '异常' is
        returned instead of raising.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(url, headers=header, timeout=10)
        # Check the status before paying for encoding detection.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print(url)
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; programming errors
        # and KeyboardInterrupt are no longer swallowed.
        return '异常'


# Film details
def getVName(href, header, Vtype):
    """Scrape every page of a listing and append its rows to vList.txt.

    Starting at *href*, the title and starring text of each list item are
    printed and appended to ``vList.txt`` as ``Vtype,title,starring``. The
    pager's "next" link is then followed until no such link is found.

    Args:
        href: URL of the first listing page.
        header: HTTP headers forwarded to getHtmlText.
        Vtype: Label written as the first column of every row.
    """
    item_selector = ('body > div.s-body > div > div.vaule_main > '
                     'div.box-series > ul > li')
    next_selector = ('body > div.s-body > div > div.vaule_main > '
                     'div.yk-pager > ul > li.next > a')

    # Iterate instead of recursing so long listings cannot hit the recursion
    # limit; the visited set stops a pager that links back to a seen page.
    visited = set()
    while href not in visited:
        visited.add(href)
        soup = BeautifulSoup(getHtmlText(href, header), 'html.parser')

        rows = []
        for item in soup.select(item_selector):
            try:
                # Title
                vName = item.select('div > div > a')[0].attrs['title']
                # Starring
                starring = item.select('div > ul.info-list > li.actor')[0].text
            except (IndexError, KeyError):
                # Entry lacks the expected markup; skip just this one.
                continue
            print(vName, starring)
            rows.append(Vtype + ',' + vName + ',' + starring + '\n')

        if rows:
            # One open per page, with an explicit encoding so the output does
            # not depend on the platform's default code page.
            with open(r'vList.txt', 'a', encoding='utf-8') as f:
                f.writelines(rows)

        try:
            href = 'https:' + soup.select(next_selector)[0].attrs['href']
        except (IndexError, KeyError):
            # No "next" link: this was the last page.
            return


def getUrls(href, header):
    """Map each entry of the page's filter panel to its listing URL.

    Args:
        href: URL of the page whose ``#filterPanel`` is read.
        header: HTTP headers forwarded to getHtmlText.

    Returns:
        Dict of link text -> 'https://list.youku.com' + the link's href.
        Entries without an anchor or without an href are skipped.
    """
    # Fetch the page named by the parameter; the original read a module-level
    # ``url`` here and so only worked when called from this script's main block.
    text = getHtmlText(href, header)
    soup = BeautifulSoup(text, 'html.parser')

    urls = {}
    for item in soup.select('#filterPanel > div > ul > li'):
        anchors = item.select('a')
        if not anchors or 'href' not in anchors[0].attrs:
            continue
        link = anchors[0]
        urls[link.text] = 'https://list.youku.com' + str(link.attrs['href'])
    return urls

def getYearUrls(href, header, type):
    """Collect the links of the fourth filter-panel group on a page.

    Presumably this group is the year filter (per the function name); the
    code itself only selects ``#filterPanel > div:nth-of-type(4)``.

    Args:
        href: URL of the page to read.
        header: HTTP headers forwarded to getHtmlText.
        type: Label copied into the first slot of every result entry.
            (The name shadows the builtin but is kept for caller
            compatibility.)

    Returns:
        List of ``[type, link_text, url]`` lists, where ``url`` is
        'https:' + the link's href. Entries without an anchor or without an
        href are skipped.
    """
    text = getHtmlText(href, header)
    soup = BeautifulSoup(text, 'html.parser')

    urls = []
    for item in soup.select('#filterPanel > div:nth-of-type(4) > ul > li'):
        anchors = item.select('a')
        if not anchors or 'href' not in anchors[0].attrs:
            continue
        link = anchors[0]
        urls.append([type, link.text, 'https:' + str(link.attrs['href'])])
    return urls

if __name__ == '__main__':
    # Script entry point: read the category index, ask which category to
    # crawl, then scrape every sub-listing returned by getYearUrls.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    url = 'https://list.youku.com/category/video/c_0.html?spm=a2h1n.8251845.filterPanel.5~1~3~A'
    urls = getUrls(url, header)
    vNmaes = input('输入要爬去的名字:')
    if vNmaes not in urls:
        # Tell the user instead of exiting silently on an unknown category.
        print('未找到类别:', vNmaes)
        print('可选类别:', ', '.join(urls))
    else:
        for entry in getYearUrls(urls[vNmaes], header, vNmaes):
            # Each entry is [type, link_text, url]; the URL is the third
            # element (the original passed the link text as the URL).
            try:
                getVName(entry[2], header, entry[0])
            except Exception as exc:
                # Best effort: report the failed listing and carry on.
                print('抓取失败:', entry[2], exc)


 

 

发布了3 篇原创文章 · 获赞 5 · 访问量 2万+
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览