Crawler case study: scraping Movie Heaven (dytt8.net) with the requests, fake_useragent, and lxml libraries. The script covers one large category and can be scaled up into a full-site crawler.

Problems encountered:

1. detail_urls = html.xpath("//table[@class='tbspan']//a/@href") yields relative URLs, which must be completed with the site's base URL afterwards (see the urljoin sketch after this list).

2. try:
       ...
   except IndexError:
       pass

   Movie Heaven has fake movie entries, i.e. pages with only a title and no detail information, so the whole detail parse is wrapped in an IndexError guard (a more targeted check is sketched after this list).

3.("◎主  演")以下信息为多行多种形式的信息展示:为多行信息,要总共爬取,分开爬取。

4. This article scrapes Movie Heaven's "latest movies" section, one large category; the same structure can be scaled up into a full-site crawler (a sketch follows the script).
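
For problem 1, urllib.parse.urljoin completes the relative hrefs more robustly than plain string concatenation, since it also leaves already-absolute URLs untouched. A minimal sketch (the example path is made up):

from urllib.parse import urljoin

BASE_URL = 'http://www.dytt8.net'

hrefs = ['/html/gndy/dyzz/example.html']  # hypothetical relative href
detail_urls = [urljoin(BASE_URL, href) for href in hrefs]
print(detail_urls)  # ['http://www.dytt8.net/html/gndy/dyzz/example.html']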
                
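For problem 2, a more targeted alternative to the blanket except IndexError is to test each xpath result before indexing into it. A sketch (safe_first is a hypothetical helper, not part of the script):

def safe_first(results, default=None):
    # return the first xpath match, or `default` for fake entries
    # whose pages carry a title but no detail block
    return results[0] if results else default

# illustrative use inside parse_detail_page:
# title = safe_first(html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()"))
# if title is None:
#     return None  # skip the fake entry instead of raising IndexError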

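For problem 3, the three multi-line fields (cast, synopsis, awards) can share one collection pattern. A sketch, assuming infos is the Zoom div's text-node list used in the script below:

def collect_lines(infos, index, stops=("◎", "【下载地址】")):
    # starting just after the field's own line, gather the lines that
    # belong to a multi-line field until the next field marker or the
    # download section appears (str.startswith accepts a tuple)
    lines = []
    for raw in infos[index + 1:]:
        line = raw.strip()
        if line.startswith(stops):
            break
        if line:
            lines.append(line)
    return lines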
# -*- coding: utf-8 -*-
# @Time    : 2019/1/25 0025 16:51
# @Author  : huangtao
# @Site    : 
# @File    : 2.电影天堂.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/Programmer_huangtao
from lxml import etree
import requests
from fake_useragent import UserAgent

BASE_URL = 'http://www.dytt8.net'
HEADERS = {'User-Agent': UserAgent().chrome}
# imports, constants, and the request headers

def get_detail_urls(url):
    # collect every movie detail-page URL from one list page
    res = requests.get(url, headers=HEADERS)
    html = etree.HTML(res.text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # the hrefs are relative, so prepend the site's base URL
    return [BASE_URL + detail_url for detail_url in detail_urls]

def parse_detail_page(url):
    # extract the details of one movie
    try:
        # Movie Heaven has fake entries that carry only a title and no
        # detail information, so the whole parse is guarded against IndexError
        movie = {}
        res = requests.get(url, headers=HEADERS)
        text = res.content.decode('gbk', errors='ignore')  # GBK-encoded pages; skip undecodable bytes
        html = etree.HTML(text)
        titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie['titles'] = titles

        # the Zoom div holds the poster, a screenshot, and all "◎" info lines
        ZOOM = html.xpath('//div[@id="Zoom"]')[0]
        imgs = ZOOM.xpath('.//img/@src')
        cover = imgs[0]  # poster
        jietu = imgs[1]  # screenshot
        movie['cover'] = cover
        movie['jietu'] = jietu

        # every text node inside Zoom; each "◎" line opens one field
        infos = ZOOM.xpath('.//text()')
        for index, info in enumerate(infos):
            if info.startswith("◎译  名"):
                info = info.replace("◎译  名",'').strip()
                movie['译名'] = info
            elif info.startswith("◎片  名"):
                info = info.replace("◎片  名",'').strip()
                movie['片名'] = info
            elif info.startswith("◎年  代"):
                info = info.replace("◎年  代", '').strip()
                movie['年代'] = info
            elif info.startswith("◎产  地"):
                info = info.replace("◎产  地", '').strip()
                movie['产地'] = info
            elif info.startswith("◎类  别"):
                info = info.replace("◎类  别", '').strip()
                movie['类别'] = info
            elif info.startswith("◎语  言"):
                info = info.replace("◎语  言", '').strip()
                movie['语言'] = info
            elif info.startswith("◎字  幕"):
                info = info.replace("◎字  幕", '').strip()
                movie['字幕'] = info
            elif info.startswith("◎上映日期"):
                info = info.replace("◎上映日期", '').strip()
                movie['上映日期'] = info
            elif info.startswith("◎IMDb评分"):
                info = info.replace("◎IMDb评分", '').strip()
                movie['IMDb'] = info
            elif info.startswith("◎豆瓣评分"):
                info = info.replace("◎豆瓣评分", '').strip()
                movie['douban'] = info
            elif info.startswith("◎文件格式"):
                info = info.replace("◎文件格式", '').strip()
                movie['wenjiangeshi'] = info
            elif info.startswith("◎视频尺寸"):
                info = info.replace("◎视频尺寸", '').strip()
                movie['shipincicun'] = info
            elif info.startswith("◎文件大小"):
                info = info.replace("◎文件大小", '').strip()
                movie['wenjiandaxiao'] = info
            elif info.startswith("◎片  长"):
                info = info.replace("◎片  长", '').strip()
                movie['pianchang'] = info
            elif info.startswith("◎导  演"):
                info = info.replace("◎导  演", '').strip()
                movie['director'] = info
            elif info.startswith("◎主  演"):
                info = info.replace("◎主  演", '').strip()
                actors = [info]
                for x in range(index+1,len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                    movie['actors'] = actors
            elif info.startswith("◎标  签"):
                info = info.replace("◎标  签", '').strip()
                movie['bianqian'] = info
            elif info.startswith("◎简  介"):
                info = info.replace("◎简  介", '').strip()
                jianjies = [info]
                for x in range(index+1,len(infos)):
                    jianjie = infos[x].strip()
                    if jianjie.startswith("【下载地址】"):
                        break
                    jianjies.append(jianjie)
                    movie['jianjie'] = jianjies[1]
            elif info.startswith("◎获奖情况"):
                info = info.replace("◎获奖情况", '').strip()
                huojiangqingkuangs = [info]
                for x in range(index + 1, len(infos)):
                    huojiangqingkuang = infos[x].strip()
                    if huojiangqingkuang.startswith("【下载地址】"):
                        break
                    huojiangqingkuangs.append(huojiangqingkuang)
                    movie['huojiangqingkuang'] = huojiangqingkuangs[1:-1]

        # download links sit in the yellow-background table cell
        download_url = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
        movie['download_url'] = download_url
        return movie
    except IndexError:
        # fake entry with no detail block: skip it
        return None


def spider():
    # list pages of the "latest movies" category; {} is the page number
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(1, 2):
        # controls how many list pages get crawled
        url = base_url.format(page)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # walk every detail URL found on this list page
            movie = parse_detail_page(detail_url)
            if movie:  # skip fake entries that parsed to None
                movies.append(movie)
                print(movie)
    # print(movies)


if __name__ == '__main__':
    # entry point
    spider()
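
For problem 4, the same loop generalizes into a full-site crawler by iterating over more category list templates. A hedged sketch: only the dyzz (latest movies) template is taken from the script, and any further slugs would have to be checked against the site first:

import time

CATEGORY_TEMPLATES = [
    'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html',  # latest movies
    # append further category templates here after verifying them on the site
]

def full_site_spider(pages_per_category=2):
    movies = []
    for template in CATEGORY_TEMPLATES:
        for page in range(1, pages_per_category + 1):
            for detail_url in get_detail_urls(template.format(page)):
                movie = parse_detail_page(detail_url)
                if movie:
                    movies.append(movie)
            time.sleep(1)  # be polite: pause between list pages
    return movies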
