Problems encountered:
1. detail_urls = html.xpath("//table[@class='tbspan']//a/@href") only yields relative paths, so the base URL has to be prepended afterwards to get usable links (see the short sketch after this list).
2. try:
       ...
   except IndexError:
       pass
   电影天堂 contains fake movie entries that have only a title and no detail information, so the detail parsing is wrapped in this exception handler.
3. The information following "◎主 演" is spread over several lines in varying formats, so it cannot be read from a single line; the lines that follow the marker have to be collected together, separately from the single-line fields.
4. This script only covers the "latest movies" section of 电影天堂, one large category; it can be upgraded into a full-site crawler.
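One way to finish the relative paths from problem 1 (a minimal sketch, not part of the original script, which simply prepends BASE_URL by string concatenation; the function name complete_urls is made up here) is urllib.parse.urljoin, which also leaves already-absolute links unchanged:

from urllib.parse import urljoin

BASE_URL = 'http://www.dytt8.net'

def complete_urls(relative_urls):
    # urljoin joins relative paths onto BASE_URL and keeps absolute URLs as they are
    return [urljoin(BASE_URL, u) for u in relative_urls]

# e.g. complete_urls(['/html/gndy/dyzz/list_23_1.html'])
# -> ['http://www.dytt8.net/html/gndy/dyzz/list_23_1.html']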
# -*- coding: utf-8 -*-
# @Time : 2019/1/25 0025 16:51
# @Author : huangtao
# @Site :
# @File : 2.电影天堂.py
# @Software: PyCharm
# @Blog :https://blog.csdn.net/Programmer_huangtao
from lxml import etree
import time
import ssl
import requests
from fake_useragent import UserAgent
BASE_URL = 'http://www.dytt8.net'
HEADERS = {'User-Agent':UserAgent().chrome}
# imported libraries, constants and the request header (time and ssl are imported but not used below)
def get_detail_urls(url):
    # collect the detail-page URL of every movie shown on a listing page
    res = requests.get(url, headers=HEADERS)
    text = res.text
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # the hrefs are relative paths, so prepend the base URL (problem 1 above)
    detail_urls = map(lambda url: BASE_URL + url, detail_urls)
    return detail_urls
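# Note (not from the original post): in Python 3, map() returns a lazy, one-shot iterator,
# so the value returned by get_detail_urls() can only be looped over once; wrap it in
# list() first if the URLs need to be inspected or reused.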
def parse_detail_page(url):
    # parse one detail page and return the movie's information as a dict
    try:
        # 电影天堂 has fake movie entries that contain only a title and no detail
        # information, so the whole parse is wrapped in an exception handler (problem 2 above)
        movie = {}
        res = requests.get(url, headers=HEADERS)
        # the site is GBK-encoded, so decode the raw bytes explicitly
        text = res.content.decode('gbk')
        html = etree.HTML(text)
        titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie['titles'] = titles
        ZOOM = html.xpath('//div[@id="Zoom"]')[0]
        imgs = ZOOM.xpath('.//img/@src')
        cover = imgs[0]   # poster
        jietu = imgs[1]   # screenshot
        movie['cover'] = cover
        movie['jietu'] = jietu
        infos = ZOOM.xpath('.//text()')
        for index, info in enumerate(infos):
            if info.startswith("◎译 名"):
                info = info.replace("◎译 名", '').strip()
                movie['译名'] = info
            elif info.startswith("◎片 名"):
                info = info.replace("◎片 名", '').strip()
                movie['片名'] = info
            elif info.startswith("◎年 代"):
                info = info.replace("◎年 代", '').strip()
                movie['年代'] = info
            elif info.startswith("◎产 地"):
                info = info.replace("◎产 地", '').strip()
                movie['产地'] = info
            elif info.startswith("◎类 别"):
                info = info.replace("◎类 别", '').strip()
                movie['类别'] = info
            elif info.startswith("◎语 言"):
                info = info.replace("◎语 言", '').strip()
                movie['语言'] = info
            elif info.startswith("◎字 幕"):
                info = info.replace("◎字 幕", '').strip()
                movie['字幕'] = info
            elif info.startswith("◎上映日期"):
                info = info.replace("◎上映日期", '').strip()
                movie['上映日期'] = info
            elif info.startswith("◎IMDb评分"):
                info = info.replace("◎IMDb评分", '').strip()
                movie['IMDb'] = info
            elif info.startswith("◎豆瓣评分"):
                info = info.replace("◎豆瓣评分", '').strip()
                movie['douban'] = info
            elif info.startswith("◎文件格式"):
                info = info.replace("◎文件格式", '').strip()
                movie['wenjiangeshi'] = info
            elif info.startswith("◎视频尺寸"):
                info = info.replace("◎视频尺寸", '').strip()
                movie['shipincicun'] = info
            elif info.startswith("◎文件大小"):
                info = info.replace("◎文件大小", '').strip()
                movie['wenjiandaxiao'] = info
            elif info.startswith("◎片 长"):
                info = info.replace("◎片 长", '').strip()
                movie['pianchang'] = info
            elif info.startswith("◎导 演"):
                info = info.replace("◎导 演", '').strip()
                movie['director'] = info
            elif info.startswith("◎主 演"):
                # the cast spans several of the following lines (problem 3 above):
                # keep collecting until the next "◎" field marker
                info = info.replace("◎主 演", '').strip()
                actors = [info]
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                movie['actors'] = actors
            elif info.startswith("◎标 签"):
                info = info.replace("◎标 签", '').strip()
                movie['bianqian'] = info
            elif info.startswith("◎简 介"):
                # the synopsis also spans several lines, up to the download section
                info = info.replace("◎简 介", '').strip()
                jianjies = [info]
                for x in range(index + 1, len(infos)):
                    jianjie = infos[x].strip()
                    if jianjie.startswith("【下载地址】"):
                        break
                    jianjies.append(jianjie)
                movie['jianjie'] = jianjies[1]
            elif info.startswith("◎获奖情况"):
                # awards also span several lines, up to the download section
                info = info.replace("◎获奖情况", '').strip()
                huojiangqingkuangs = [info]
                for x in range(index + 1, len(infos)):
                    huojiangqingkuang = infos[x].strip()
                    if huojiangqingkuang.startswith("【下载地址】"):
                        break
                    huojiangqingkuangs.append(huojiangqingkuang)
                movie['huojiangqingkuang'] = huojiangqingkuangs[1:-1]
        download_url = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
        movie['download_url'] = download_url
        return movie
    except IndexError:
        # a fake entry: the title or Zoom block is missing, so indexing fails
        pass
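# A possible tidy-up (just a sketch, not part of the original post): most single-line
# fields above follow the same "strip the marker, store under a key" pattern, so the long
# if/elif chain could be driven by a marker-to-key table instead. The table below only
# lists a few markers, and the helper name parse_single_line_fields is made up here.
FIELD_MARKERS = {
    "◎译 名": '译名',
    "◎片 名": '片名',
    "◎年 代": '年代',
    "◎上映日期": '上映日期',
    "◎豆瓣评分": 'douban',
    # ... the remaining single-line markers would be added in the same way
}

def parse_single_line_fields(infos, movie):
    # store every single-line field found in infos into movie, using the table above
    for info in infos:
        for marker, key in FIELD_MARKERS.items():
            if info.startswith(marker):
                movie[key] = info.replace(marker, '').strip()
                break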
def spider():
    # entry function: walk the listing pages of the "latest movies" section
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for pages in range(1, 2):
        # controls how many listing pages are crawled
        url = base_url.format(pages)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # visit every detail-page URL found on this listing page
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # print(movies)

if __name__ == '__main__':
    # program entry point
    spider()
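For problem 4, extending this into a full-site crawler is mostly a matter of feeding the same two functions more listing-page templates. The sketch below is an outline under assumptions rather than part of the original script: the first template is the one used above, any further section templates would have to be copied from the site's navigation, and the name spider_all is made up here. It also skips the None that parse_detail_page returns for fake entries.

def spider_all(listing_templates, max_pages=2):
    # crawl several listing sections; each template contains a {} for the page number
    movies = []
    for template in listing_templates:
        for page in range(1, max_pages + 1):
            listing_url = template.format(page)
            for detail_url in get_detail_urls(listing_url):
                movie = parse_detail_page(detail_url)
                if movie:  # parse_detail_page returns None for fake title-only entries
                    movies.append(movie)
                    print(movie)
    return movies

# usage: spider_all(['https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'])
# further section templates would be appended to that list once taken from the site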