Python Crawler 3 (电影天堂 / Movie Heaven)

from lxml import etree
import requests

BASE_DOMAIN = 'http://www.ygdy8.net'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2577.400'
}
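Both functions below issue the same kind of request and decode the same GBK payload. As a sketch, a small shared helper (hypothetical, not part of the original script) could centralize the headers, a timeout, and the decoding:

def get_page(url):
    # Hypothetical helper: fetch a page with the browser headers above,
    # give up after 10 seconds, and decode the site's GBK content.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return response.content.decode('gbk', errors='ignore')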
def get_detail_urls(url):
    # Given one paginated listing URL, return the absolute URL of every movie detail page it links to
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages, so decode explicitly instead of relying on response.text
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # // selects matching tags anywhere in the tree, @ selects an attribute value
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are site-relative, so prepend the domain to each one
    return [BASE_DOMAIN + detail_url for detail_url in detail_urls]
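To see what the XPath in get_detail_urls matches, here is a self-contained toy run against a static snippet shaped like the listing page (the HTML below is made up for illustration):

from lxml import etree

snippet = """
<table class="tbspan">
  <tr><td><a href="/html/gndy/dyzz/1.html">Movie A</a></td></tr>
  <tr><td><a href="/html/gndy/dyzz/2.html">Movie B</a></td></tr>
</table>
"""
doc = etree.HTML(snippet)
# //table[@class='tbspan']//a/@href collects every href under the listing table
print(doc.xpath("//table[@class='tbspan']//a/@href"))
# ['/html/gndy/dyzz/1.html', '/html/gndy/dyzz/2.html']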


def parse_detail_page(url):
    # Given a movie detail-page URL, return a dict with the movie's title, metadata, and download link
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoom = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoom.xpath(".//img/@src")
    movie['cover'] = imgs[0]
    if len(imgs) > 1:  # some pages only carry a cover image
        movie['screenshot'] = imgs[1]
    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):  # strip the field label, keep the value
            movie['year'] = info.replace("◎年  代", "").strip()
        elif info.startswith("◎产  地"):
            movie['country'] = info.replace("◎产  地", "").strip()
        elif info.startswith("◎类  别"):
            movie['category'] = info.replace("◎类  别", "").strip()
        elif info.startswith("◎主  演"):
            # The lead actor shares the label's line; the rest follow one per line until the next ◎ field
            actors = [info.replace("◎主  演", "").strip()]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            # The synopsis spans the following lines until the next ◎ field,
            # so accumulate them instead of overwriting movie['profile'] on every pass
            profile_lines = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎"):
                    break
                profile_lines.append(line)
            movie['profile'] = ''.join(profile_lines)
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
    return movie
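A quick way to sanity-check parse_detail_page before running the full crawl: fetch the first listing page, parse the first detail page it links to, and print a couple of fields.

detail_urls = get_detail_urls('http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html')
movie = parse_detail_page(detail_urls[0])
print(movie['title'], movie['download_url'])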



def spider():
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for x in range(1, 8):
        # Build the listing URL for pages 1 through 7 from the base template
        url = base_url.format(x)
        # Grab every detail-page URL on this listing page
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # Parse the detail page into a movie record: title, metadata, download link
            movie = parse_detail_page(detail_url)
            movies.append(movie)
    print(movies)




if __name__ == '__main__':
    spider()
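print(movies) only dumps the records to the console. As a minimal sketch, the final print in spider() could be replaced with a JSON dump (the save_movies helper and the file name are illustrative, not part of the original):

import json

def save_movies(movies, path='movies.json'):
    # Hypothetical helper: write the scraped records as UTF-8 JSON;
    # ensure_ascii=False keeps the Chinese titles and synopses readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)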