# 爬取电影天堂电影详情和磁力链接 — scrape movie details and download links from dy2018 (Movie Heaven)

import requests
from lxml import etree

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

base_url = 'https://www.dy2018.com'
detail_urls = []


#  获取每部影片详情页面的url
def get_detail_urls(num):
    """Collect the detail-page URL of every movie on the first *num* listing pages.

    Appends each absolute URL to the module-level ``detail_urls`` list
    (kept for backward compatibility with existing callers) and also
    returns that list for convenience.

    :param num: number of listing pages to crawl, starting from page 1.
    :return: the shared ``detail_urls`` list.
    """
    for page in range(1, num + 1):
        # Page 1 has no numeric suffix; later pages are index_2.html, index_3.html, ...
        if page == 1:
            url = base_url + '/html/gndy/dyzz/index.html'
        else:
            url = base_url + '/html/gndy/dyzz/index_%d.html' % page
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url, headers=headers, timeout=10)
        # The site serves GBK; these pages often contain bytes that are not
        # valid GBK, so ignore undecodable bytes instead of raising.
        result = response.content.decode('gbk', errors='ignore')
        html = etree.HTML(result)
        for href in html.xpath("//table[@class='tbspan']//a/@href"):
            detail_urls.append(base_url + href)
    return detail_urls

# 提取需要的数据
# 提取需要的数据
def parse_detail_page(url, movies):
    """Fetch one movie detail page and append a parsed movie dict to *movies*.

    Extracts the title, year, country, category, language, Douban score,
    director, actor list, intro text and download links from the page's
    ``#Zoom`` section. Mutates *movies* in place; returns ``None``.

    :param url: absolute URL of the detail page.
    :param movies: list the parsed movie dict is appended to.
    """
    # Simple "◎label -> dict key" fields that need no extra processing.
    field_map = {
        "◎片  名": 'movie_name',
        "◎年  代": 'movie_year',
        "◎产  地": 'movie_address',
        "◎类  别": 'movie_category',
        "◎语  言": 'movie_language',
        "◎豆瓣评分": 'douban_score',
        "◎导  演": 'movie_director',
    }
    response = requests.get(url, headers=headers, timeout=10)
    # Pages are GBK-encoded and frequently contain invalid bytes; ignore
    # them instead of raising UnicodeDecodeError.
    result = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(result)
    movie = {}
    for detail in html.xpath("//div[@id='Zoom']"):
        infos = detail.xpath(".//text()")
        for index, info in enumerate(infos):
            matched = False
            for label, key in field_map.items():
                if info.startswith(label):
                    movie[key] = info.replace(label, "").strip()
                    matched = True
                    break
            if matched:
                continue
            if info.startswith("◎主  演"):
                # The first actor shares the label line; the rest follow,
                # one per text node, until the intro label appears.
                actors = [info.replace("◎主  演", "").strip()]
                for x in range(index + 1, len(infos)):
                    if infos[x].startswith("◎简  介"):
                        break
                    actors.append(infos[x].strip())
                movie['movie_actors'] = actors
            elif info.startswith("◎简  介"):
                # BUG FIX: the original loop overwrote movie_intro on each
                # pass, keeping only the last line. Join the intro lines
                # instead, and guard against running past the end of infos.
                intro_lines = [infos[x].strip()
                               for x in range(index + 1, min(index + 3, len(infos)))]
                movie['movie_intro'] = "\n".join(intro_lines)
    links = html.xpath(".//td[@bgcolor='#fdfddf']//a/text()")
    # BUG FIX: `if url.find("电影天堂"):` was truthy for every link that did
    # NOT start with the ad text (find() returns -1 when absent), and
    # del-while-iterating skipped elements. Filter out the site-name ad
    # links with a comprehension instead.
    movie['download_url'] = [link for link in links if "电影天堂" not in link]
    movies.append(movie)



if __name__ == '__main__':
    # Ask how many listing pages to crawl, gather every detail-page URL,
    # then parse each page and print the collected movie records.
    movies = []
    page_count = int(input("请输入需要的爬取的页数:"))
    get_detail_urls(page_count)
    for detail_url in detail_urls:
        parse_detail_page(detail_url, movies)
    # Dump each movie as "key:value" lines, blank-line separated.
    for record in movies:
        for field, content in record.items():
            print(f"{field}:{content}")
            print()
        print()

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值