Python XPath 爬取电影天堂

最新推荐文章于 2022-11-20 15:39:57 发布

stay down

最新推荐文章于 2022-11-20 15:39:57 发布

阅读量874

点赞数

本文链接：https://blog.csdn.net/qq_43515464/article/details/102969930

版权

import requests
from lxml import html

# Scheme and host of the target site; relative links are resolved against it.
BASE_DOMAIN = 'https://www.dytt8.net'

# Example listing page: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html

# Request headers sent with every HTTP call (a desktop browser User-Agent).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.26 Safari/537.36 '
        'Core/1.63.6788.400 QQBrowser/10.3.2843.400'
    ),
}

def spider(start_page=1, end_page=7):
    """Crawl the movie listing pages and return every parsed movie.

    Each listing page is fetched with ``get_detail_urls``; every detail URL
    it yields is parsed with ``parse_detail_page``. Each movie is printed as
    soon as it is parsed and also collected into the returned list.

    Args:
        start_page: Number of the first listing page to fetch (inclusive).
        end_page: Number of the last listing page to fetch (inclusive).

    Returns:
        A list of the dicts returned by ``parse_detail_page``, in crawl order.
    """
    base_url = BASE_DOMAIN + "/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for num in range(start_page, end_page + 1):
        url = base_url.format(num)
        for detail_url in get_detail_urls(url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Hand the collected results back so callers can use them, not just read stdout.
    return movies




def get_detail_urls(url):
    """Return the detail-page URLs linked from one listing page.

    Args:
        url: Absolute URL of a listing page.

    Returns:
        A list of URLs, one per ``href`` found on an ``<a>`` inside a
        ``<table class="tbspan">``, each resolved against ``url``.
    """
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The body is decoded as GBK; bytes that are not valid GBK are dropped.
    text = response.content.decode('gbk', 'ignore')
    tree = html.etree.HTML(text)
    hrefs = tree.xpath('//table[@class="tbspan"]//a/@href')
    # urljoin handles root-relative, page-relative and already-absolute hrefs;
    # a list (not a lazy map) lets callers iterate the result more than once.
    return [urljoin(url, href) for href in hrefs]

def parse_info(info, rule):
    """Delete every occurrence of ``rule`` from ``info`` and trim whitespace.

    Args:
        info: The text to clean.
        rule: The substring to remove.

    Returns:
        ``info`` without ``rule`` and without leading/trailing whitespace.
    """
    without_rule = info.replace(rule, "")
    cleaned = without_rule.strip()
    return cleaned

# (label prefix, key in the movie dict, whether the value spans several lines)
# NOTE: the key "categary" is misspelled but kept so existing consumers of the
# returned dict keep working.
_MOVIE_FIELDS = (
    ("◎年　　代", "year", False),
    ("◎产　　地", "country", False),
    ("◎类　　别", "categary", False),
    ("◎豆瓣评分", "douban", False),
    ("◎导　　演", "director", True),
    ("◎主　　演", "actor", True),
    ("◎简　　介", "profile", True),
)

# Every labelled line in the info block starts with this marker.
_FIELD_MARKER = "◎"


def _parse_movie_fields(infos):
    """Extract the labelled fields from the text lines of the info block.

    Args:
        infos: Iterable of text fragments, in document order.

    Returns:
        A dict holding only the fields that were found. Single-line fields
        map to a string; multi-line fields map to a list of non-empty
        strings (the rest of the label line followed by the lines after it).
    """
    fields = {}
    collecting = None  # list of the multi-line field currently being filled
    for raw in infos:
        line = raw.strip()
        if not line:
            # Whitespace-only text nodes carry no information.
            continue
        if not line.startswith(_FIELD_MARKER):
            if collecting is not None:
                collecting.append(line)
            continue
        # Any new labelled line ends the previous multi-line field, so a
        # missing "expected" follow-up label cannot swallow the rest of the page.
        collecting = None
        for label, key, multiline in _MOVIE_FIELDS:
            if not line.startswith(label):
                continue
            value = line[len(label):].strip()
            if multiline:
                collecting = fields[key] = [value] if value else []
            else:
                fields[key] = value
            break
    return fields


def parse_detail_page(url):
    """Fetch one movie detail page and extract its metadata.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict with ``title``, and, when the ``<div id="Zoom">`` block
        exists, ``cover`` and ``screenshot`` (first and second image URL in
        that block) plus whichever of ``year``, ``country``, ``categary``,
        ``douban``, ``director``, ``actor`` and ``profile`` appear in its
        text. ``title``, ``cover`` and ``screenshot`` are ``None`` when the
        page does not provide them.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The body is decoded as GBK; bytes that are not valid GBK are dropped.
    text = response.content.decode('gbk', 'ignore')
    page = html.etree.HTML(text)

    movie = {}
    titles = page.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')
    movie['title'] = titles[0] if titles else None

    zoom_blocks = page.xpath('//div[@id="Zoom"]')
    if not zoom_blocks:
        # Without the info block there is nothing more to extract.
        return movie
    zoom = zoom_blocks[0]

    # Pages with fewer than two images must not abort the crawl.
    imgs = zoom.xpath(".//img/@src")
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    movie.update(_parse_movie_fields(zoom.xpath('.//text()')))
    return movie





# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()

1. spider() 中可以用 yield movie 逐条产出结果，代替先把全部结果存入 movies 列表，从而降低内存占用。
2. 为啥我的背景是黑色的？（此处原有一张图片）

stay down

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python xpath爬取电影天堂

import requests
from lxml import html
BASE_DOMAIN = 'https://www.dytt8.net'
# url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0...
复制链接

扫一扫