python3-爬虫之电影天堂

最新推荐文章于 2024-06-28 10:31:48 发布

海螺烧香

最新推荐文章于 2024-06-28 10:31:48 发布

阅读量1.7k

点赞数

分类专栏： python3爬虫　｜　文章标签： python3爬虫爬虫电影信息 python爬虫

本文链接：https://blog.csdn.net/weixin_43141153/article/details/84642541

版权

python3爬虫专栏收录该内容

15 篇文章 0 订阅

订阅专栏

python3爬虫之——电影天堂最新电影信息

#导入所需要的库
import requests
from lxml import etree

# Module-level settings shared by the functions in this file.
# Site root; the relative links scraped from listing pages are appended to it.
BASE_URL = "https://www.dytt8.net"
# Headers sent with every request; only a desktop-browser User-Agent is set.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

#解析最新电影列表页面，获取当前页面每个电影的url
def get_detil_urls(url, timeout=10):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        A list of strings, each one ``BASE_URL`` followed by an ``href``
        found on an ``<a>`` below a ``<table class="tbspan">`` element.
        The list is empty when nothing matches.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (including timeouts).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    html = etree.HTML(response.text)
    if html is None:
        # NOTE(review): guards against the parser yielding no tree for an
        # empty body; confirm against the installed lxml version.
        return []
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # A real list (not a lazy map object) so callers can take len() of it
    # or iterate it more than once.
    return [BASE_URL + href for href in hrefs]

#解析电影详情页面，获取电影详情信息
# Labels whose value sits on the same text line as the label itself, paired
# with the key under which that value is stored in the movie dict.
_SINGLE_LINE_FIELDS = (
    ("◎年　　代", "year"),
    ("◎产　　地", "country"),
    ("◎类　　别", "category"),
    ("◎豆瓣评分", "score"),
    ("◎片　　长", "duration"),
    ("◎导　　演", "director"),
)
_ACTORS_LABEL = "◎主　　演"
_PROFILE_LABEL = "◎简　　介"
_DOWNLOAD_MARKER = "【下载地址】"


def _strip_label(line, label):
    """Return *line* with its leading *label* removed and whitespace trimmed."""
    return line[len(label):].strip()


def _extract_fields(infos):
    """Collect the labelled movie fields from the text lines of the detail block."""
    fields = {}
    for index, info in enumerate(infos):
        for label, key in _SINGLE_LINE_FIELDS:
            if info.startswith(label):
                fields[key] = _strip_label(info, label)
                break
        else:
            if info.startswith(_ACTORS_LABEL):
                # The first name shares the label's line; the remaining names
                # follow on their own lines until the next "◎" label.
                actors = [_strip_label(info, _ACTORS_LABEL)]
                for following in infos[index + 1:]:
                    actor = following.strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                # Whitespace-only text nodes would otherwise become "" entries.
                fields["actors"] = [actor for actor in actors if actor]
            elif info.startswith(_PROFILE_LABEL):
                # Use the first non-blank line at or after the label, and stop
                # at the download section. Falls back to "" instead of raising
                # when no such line exists.
                profile = _strip_label(info, _PROFILE_LABEL)
                for following in infos[index + 1:]:
                    if profile:
                        break
                    candidate = following.strip()
                    if candidate.startswith(_DOWNLOAD_MARKER):
                        break
                    profile = candidate
                fields["profile"] = profile
    return fields


def parse_detail_page(url, timeout=10):
    """Download one movie detail page and return its data as a dict.

    Args:
        url: Address of the detail page.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        A dict that always contains ``"title"``, ``"cover"`` and
        ``"screenshot"`` (each ``None`` when the page lacks the element)
        and ``"download_url"`` (a list of ``href`` values, possibly empty).
        Depending on which "◎" labels appear in the page text it also
        contains ``"year"``, ``"country"``, ``"category"``, ``"score"``,
        ``"duration"``, ``"director"`` (strings), ``"actors"`` (list of
        strings) and ``"profile"`` (string).

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (including timeouts).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # The page is decoded as GBK; undecodable bytes are dropped so that one
    # malformed byte does not abort the page with UnicodeDecodeError.
    page = response.content.decode("gbk", errors="ignore")
    html = etree.HTML(page)

    movie = {"title": None, "cover": None, "screenshot": None}
    if html is None:
        # NOTE(review): guards against the parser yielding no tree for an
        # empty body; confirm against the installed lxml version.
        movie["download_url"] = []
        return movie

    titles = html.xpath("//div[@class='title_all']//font/text()")
    if titles:
        movie["title"] = titles[0]

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if zoom_nodes:
        zoom = zoom_nodes[0]
        # First image is stored as the cover, second as the screenshot;
        # either may be absent.
        imgs = zoom.xpath(".//img/@src")
        if imgs:
            movie["cover"] = imgs[0]
        if len(imgs) > 1:
            movie["screenshot"] = imgs[1]
        movie.update(_extract_fields(zoom.xpath(".//text()")))

    movie["download_url"] = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    return movie

def spider(n):
    """Crawl the listing pages numbered 1 to n-1 and gather every movie on them.

    Each movie dict is printed as soon as it has been parsed.

    Args:
        n: Exclusive upper bound of the listing-page numbers to visit.

    Returns:
        A list with one dict per detail page, in crawl order (previously the
        list was built and then discarded).
    """
    # Built from BASE_URL so the host name is defined in a single place.
    list_url = BASE_URL + "/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # NOTE(review): range(1, n) visits n-1 pages, so spider(1) fetches nothing.
    # Kept as-is because callers may rely on it; confirm whether n was meant
    # to be the page count.
    for page in range(1, n):
        # Outer loop: one listing page per iteration.
        for detail_url in get_detil_urls(list_url.format(page)):
            # Inner loop: every movie detail link found on that page.
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies