Problems encountered:
1. detail_urls = html.xpath("//table[@class='tbspan']//a/@href") only yields relative paths, so the base URL has to be prepended afterwards to get usable links (see the short sketch after this list).
2. try:
       ...
   except IndexError:
       pass
   电影天堂 contains fake movie entries that have only a title and no detail information, so the detail parsing is wrapped in this exception handler.
3. The information following "◎主 演" is spread over several lines in varying formats, so it cannot be read from a single line; the lines that follow the marker have to be collected together, separately from the single-line fields.
4. This script only covers the "latest movies" section of 电影天堂, one large category; it can be upgraded into a full-site crawler.
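One way to finish the relative paths from problem 1 (a minimal sketch, not part of the original script, which simply prepends BASE_URL by string concatenation; the function name complete_urls is made up here) is urllib.parse.urljoin, which also leaves already-absolute links unchanged:

from urllib.parse import urljoin

BASE_URL = 'http://www.dytt8.net'

def complete_urls(relative_urls):
    # urljoin joins relative paths onto BASE_URL and keeps absolute URLs as they are
    return [urljoin(BASE_URL, u) for u in relative_urls]

# e.g. complete_urls(['/html/gndy/dyzz/list_23_1.html'])
# -> ['http://www.dytt8.net/html/gndy/dyzz/list_23_1.html']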
# -*- coding: utf-8 -*-
# @Time : 2019/1/25 0025 16:51
# @Author : huangtao
# @Site :
# @File : 2.电影天堂.py
# @Software: PyCharm
# @Blog :https://blog.csdn.net/Programmer_huangtao
from lxml import etree
import time
import ssl
import requests
from fake_useragent import UserAgent
BASE_URL = 'http://www.dytt8.net'
HEADERS = {'User-Agent':UserAgent().chrome}
# imported libraries, constants and the request header (time and ssl are imported but not used below)
def get_detail_urls(url):
    # collect the detail-page URL of every movie shown on a listing page
    res = requests.get(url, headers=HEADERS)
    text = res.text
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # the hrefs are relative paths, so prepend the base URL (problem 1 above)
    detail_urls = map(lambda url: BASE_URL + url, detail_urls)
    return detail_urls
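# Note (not from the original post): in Python 3, map() returns a lazy, one-shot iterator,
# so the value returned by get_detail_urls() can only be looped over once; wrap it in
# list() first if the URLs need to be inspected or reused.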
def parse_detail_page(url):
    # parse one detail page and return the movie's information as a dict
    try:
        # 电影天堂 has fake movie entries that contain only a title and no detail
        # information, so the whole parse is wrapped in an exception handler (problem 2 above)
        movie = {}
        res = requests.get(url, headers=HEADERS)
        # the site is GBK-encoded, so decode the raw bytes explicitly
        text = res.content.decode('gbk')
        html = etree.HTML(text)
        titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie['titles'] = titles
        ZOOM = html.xpath('//div[@id="Zoom"]')[0]
        imgs = ZOOM.xpath('.//img/@src')
        cover = imgs[0]   # poster
        jietu = imgs[1]   # screenshot
        movie['cover'] = cover
        movie['jietu'] = jietu
        infos = ZOOM.xpath('.//text()')
        for index, info in enumerate(infos):
            if info.startswith("◎译 名"):
                info = info.replace("◎译 名", '').strip()
                movie['译名'] = info
            elif info.startswith("◎片 名"):
                info = info.replace("◎片 名", '').strip()
                movie['片名'] = info
            elif info.startswith("◎年 代"):
                info = info.replace("◎年 代", '').strip()
                movie['年代'] = info
            elif info.startswith("◎产 地"):
                info = info.replace("◎产 地", '').strip()
                movie['产地'] = info
            elif info.startswith("◎类 别"):
                info = info.replace("◎类 别", '').strip()
                movie['类别'] = info
            elif info.startswith("◎语 言"):
                info = info.replace("◎语 言", '').strip()
                movie['语言'] = info
            elif info.startswith("◎字 幕"):
                info = info.replace("◎字 幕", '').strip()
                movie['字幕'] = info
            elif info.startswith("◎上映日期"):
                info = info.replace("◎上映日期", '').strip()
                movie['上映日期'] = info
            elif info.startswith("◎IMDb评分"):
                info = info.replace("◎IMDb评分", '').strip()
                movie['IMDb'] = info
            elif info.startswith("◎豆瓣评分"):
                info = info.replace("◎豆瓣评分", '').strip()
                movie['douban'] = info
            elif info.startswith("◎文件格式"):
                info = info.replace("◎文件格式", '').strip()
                movie['wenjiangeshi'] = info
            elif info.startswith("◎视频尺寸"):
                info = info.replace("◎视频尺寸", '').strip()
                movie['shipincicun'] = info
            elif info.startswith("◎文件大小"):
                info = info.replace("◎文件大小", '').strip()
                movie['wenjiandaxiao'] = info
            elif info.startswith("◎片 长"):
                info = info.replace("◎片 长", '').strip()
                movie['pianchang'] = info
            elif info.startswith("◎导 演"):
                info = info.replace("◎导 演", '').strip()
                movie['director'] = info
            elif info.startswith("◎主 演"):
                # the cast spans several of the following lines (problem 3 above):
                # keep collecting until the next "◎" field marker
                info = info.replace("◎主 演", '').strip()
                actors = [info]
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                movie['actors'] = actors
            elif info.startswith("◎标 签"):
                info = info.replace("◎标 签", '').strip()
                movie['bianqian'] = info
            elif info.startswith("◎简 介"):
                # the synopsis also spans several lines, up to the download section
                info = info.replace("◎简 介", '').strip()
                jianjies = [info]
                for x in range(index + 1, len(infos)):
                    jianjie = infos[x].strip()
                    if jianjie.startswith("【下载地址】"):
                        break
                    jianjies.append(jianjie)
                movie['jianjie'] = jianjies[1]
            elif info.startswith("◎获奖情况"):
                # awards also span several lines, up to the download section
                info = info.replace("◎获奖情况", '').strip()
                huojiangqingkuangs = [info]
                for x in range(index + 1, len(infos)):
                    huojiangqingkuang = infos[x].strip()
                    if huojiangqingkuang.startswith("【下载地址】"):
                        break
                    huojiangqingkuangs.append(huojiangqingkuang)
                movie['huojiangqingkuang'] = huojiangqingkuangs[1:-1]
        download_url = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
        movie['download_url'] = download_url
        return movie
    except IndexError:
        # a fake entry: the title or Zoom block is missing, so indexing fails
        pass
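# A possible tidy-up (just a sketch, not part of the original post): most single-line
# fields above follow the same "strip the marker, store under a key" pattern, so the long
# if/elif chain could be driven by a marker-to-key table instead. The table below only
# lists a few markers, and the helper name parse_single_line_fields is made up here.
FIELD_MARKERS = {
    "◎译 名": '译名',
    "◎片 名": '片名',
    "◎年 代": '年代',
    "◎上映日期": '上映日期',
    "◎豆瓣评分": 'douban',
    # ... the remaining single-line markers would be added in the same way
}

def parse_single_line_fields(infos, movie):
    # store every single-line field found in infos into movie, using the table above
    for info in infos:
        for marker, key in FIELD_MARKERS.items():
            if info.startswith(marker):
                movie[key] = info.replace(marker, '').strip()
                break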
def spider():
    # entry function: walk the listing pages of the "latest movies" section
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for pages in range(1, 2):
        # controls how many listing pages are crawled
        url = base_url.format(pages)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # visit every detail-page URL found on this listing page
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # print(movies)

if __name__ == '__main__':
    # program entry point
    spider()
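For problem 4, extending this into a full-site crawler is mostly a matter of feeding the same two functions more listing-page templates. The sketch below is an outline under assumptions rather than part of the original script: the first template is the one used above, any further section templates would have to be copied from the site's navigation, and the name spider_all is made up here. It also skips the None that parse_detail_page returns for fake entries.

def spider_all(listing_templates, max_pages=2):
    # crawl several listing sections; each template contains a {} for the page number
    movies = []
    for template in listing_templates:
        for page in range(1, max_pages + 1):
            listing_url = template.format(page)
            for detail_url in get_detail_urls(listing_url):
                movie = parse_detail_page(detail_url)
                if movie:  # parse_detail_page returns None for fake title-only entries
                    movies.append(movie)
                    print(movie)
    return movies

# usage: spider_all(['https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'])
# further section templates would be appended to that list once taken from the site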