python3-爬虫之电影天堂

python3爬虫之——电影天堂最新电影信息

#导入所需要的库
import requests
from lxml import etree
# Define module-level constants (定义全局变量)
# Site root; prepended to the relative hrefs scraped from list pages.
BASE_URL="https://www.dytt8.net"
# Desktop-Chrome User-Agent so the site serves its normal HTML pages.
HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
# Parse one "latest movies" list page and collect each movie's detail URL.
def get_detil_urls(url):
    """Fetch a movie-list page and return the absolute detail-page URLs.

    url: address of one list page (e.g. .../list_23_1.html).
    Returns an iterable of absolute URLs, one per movie on the page.
    (Name kept misspelled for compatibility with existing callers.)
    """
    resp = requests.get(url, headers=HEADERS)
    page = etree.HTML(resp.text)
    # Each movie entry on a list page lives inside <table class="tbspan">.
    hrefs = page.xpath('//table[@class="tbspan"]//a/@href')
    # The scraped hrefs are site-relative; prefix the site root.
    return [BASE_URL + href for href in hrefs]
#解析电影详情页面,获取电影详情信息
def parse_detail_page(url):
    """Fetch a movie detail page and extract its metadata.

    url: absolute URL of a detail page on dytt8.net.
    Returns a dict with keys: title, cover, screenshot, download_url
    (always present) plus year, country, category, score, duration,
    director, actors, profile when the matching "◎" field exists on
    the page. cover/screenshot are None when the page lacks the image.
    """
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages; some pages contain stray bytes
    # that are invalid GBK, so ignore them instead of raising
    # UnicodeDecodeError and losing the whole page.
    page_text = response.content.decode("gbk", errors="ignore")
    html = etree.HTML(page_text)
    movie["title"] = html.xpath("//div[@class='title_all']//font/text()")[0]

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # Some pages carry fewer than two images; indexing blindly raised
    # IndexError in the original. Fall back to None instead.
    movie["cover"] = imgs[0] if len(imgs) > 0 else None
    movie["screenshot"] = imgs[1] if len(imgs) > 1 else None

    def parse_info(info, rule):
        # Strip the "◎xxx" field label and surrounding whitespace.
        return info.replace(rule, "").strip()

    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['year'] = parse_info(info, "◎年  代")
        elif info.startswith("◎产  地"):
            movie["country"] = parse_info(info, "◎产  地")
        elif info.startswith("◎类  别"):
            movie["category"] = parse_info(info, "◎类  别")
        elif info.startswith("◎豆瓣评分"):
            movie["score"] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片  长"):
            movie["duration"] = parse_info(info, "◎片  长")
        elif info.startswith("◎导  演"):
            movie["director"] = parse_info(info, "◎导  演")
        elif info.startswith("◎主  演"):
            actors = [parse_info(info, "◎主  演")]
            # Additional actor names continue on the following text nodes
            # until the next "◎" field starts.
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎简  介"):
            paragraphs = []
            # The synopsis spans several text nodes up to the download
            # section header.
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("【下载地址】"):
                    break
                if profile:
                    paragraphs.append(profile)
            # Join every paragraph; the original kept only the first
            # text node (often empty), dropping most of the synopsis.
            movie['profile'] = "\n".join(paragraphs)

    movie["download_url"] = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    return movie

def spider(n=8):
    """Crawl the "latest movies" listing, printing and collecting movies.

    n: exclusive upper bound of the page loop — pages 1 .. n-1 are
       crawled (bound kept exclusive for backward compatibility).
       Defaults to 8 so ``spider()`` works without an argument.
    Returns the list of movie dicts; the original built this list but
    never returned it, making the result unreachable to callers.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Outer loop walks the list pages; inner loop visits every movie
    # detail page found on the current list page.
    for page in range(1, n):
        for detail_url in get_detil_urls(base_url.format(page)):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies
if __name__ =="__main__":
    spider()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值