# python3爬虫之——电影天堂最新电影信息
#导入所需要的库
import requests
from lxml import etree
#定义全局变量
# Site root; relative links scraped from the pages are appended to it.
BASE_URL = "https://www.dytt8.net"

# Headers sent with every HTTP request: a desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.132 Safari/537.36"
    ),
}
def get_detil_urls(url):
    """Parse a "latest movies" list page and collect its detail-page URLs.

    Args:
        url: Address of one list page.

    Returns:
        A list of absolute URLs, one per ``href`` found on a link inside a
        ``<table class="tbspan">`` element, each prefixed with ``BASE_URL``.
        The list is empty when the page yields no parse tree or no links.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    if html is None:
        # Defensive guard in case the parser produces no tree for the body.
        return []
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # Return a real list rather than a lazy map object, so the result can be
    # iterated more than once and measured with len().
    return [BASE_URL + href for href in hrefs]
# Labels of the single-line fields in the detail text, paired with the key
# under which each value is stored in the movie dict.
_SIMPLE_FIELDS = (
    ("◎年 代", "year"),
    ("◎产 地", "country"),
    ("◎类 别", "category"),
    ("◎豆瓣评分", "score"),
    ("◎片 长", "duration"),
    ("◎导 演", "director"),
)
_ACTORS_LABEL = "◎主 演"
_PROFILE_LABEL = "◎简 介"
_FIELD_MARKER = "◎"
_DOWNLOAD_MARKER = "【下载地址】"


def _value_after_label(info, label):
    """Return the text following ``label`` at the start of ``info``, else None.

    Whitespace is ignored while matching (before the label and between its
    characters), so a label padded with a different kind or number of spaces
    than the literal above still matches. The returned value is stripped.
    """
    pos = 0
    for expected in "".join(label.split()):
        while pos < len(info) and info[pos].isspace():
            pos += 1
        if pos == len(info) or info[pos] != expected:
            return None
        pos += 1
    return info[pos:].strip()


def _match_simple_field(info):
    """Return ``(key, value)`` if ``info`` is one of the single-line fields."""
    for label, key in _SIMPLE_FIELDS:
        value = _value_after_label(info, label)
        if value is not None:
            return key, value
    return None


def _collect_actors(first_actor, infos, start):
    """Return the actor list: the label line's value plus following lines.

    Lines from ``infos[start:]`` are appended (stripped) until one starts
    with the field marker, which begins the next field.
    """
    actors = [first_actor]
    for raw in infos[start:]:
        name = raw.strip()
        if name.startswith(_FIELD_MARKER):
            break
        actors.append(name)
    return actors


def _first_profile_line(infos, start):
    """Return the first non-blank line of the synopsis, or "" if none.

    Scanning stops at the download-section marker.
    """
    for raw in infos[start:]:
        line = raw.strip()
        if line.startswith(_DOWNLOAD_MARKER):
            break
        if line:
            return line
    return ""


def _fill_labelled_fields(infos, movie):
    """Scan the detail text lines and store every recognised field in movie."""
    for index, info in enumerate(infos):
        matched = _match_simple_field(info)
        if matched is not None:
            key, value = matched
            movie[key] = value
            continue
        first_actor = _value_after_label(info, _ACTORS_LABEL)
        if first_actor is not None:
            movie["actors"] = _collect_actors(first_actor, infos, index + 1)
        elif _value_after_label(info, _PROFILE_LABEL) is not None:
            movie["profile"] = _first_profile_line(infos, index + 1)


def parse_detail_page(url):
    """Parse a movie detail page and collect the movie's information.

    Args:
        url: Address of one movie detail page.

    Returns:
        A dict that always contains:
          * "title": first ``<font>`` text under ``div.title_all``, or None.
          * "cover": ``src`` of the first image in ``div#Zoom``, or None.
          * "screenshot": ``src`` of the second image there, or None.
          * "download_url": list of ``href`` values of links directly inside
            ``<td bgcolor="#fdfddf">`` cells (possibly empty).
        and, only when the matching label occurs in the ``div#Zoom`` text:
        "year", "country", "category", "score", "duration", "director"
        (strings), "actors" (list of strings) and "profile" (string).
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The body is decoded as GBK; undecodable bytes are replaced instead of
    # raising UnicodeDecodeError and aborting the whole page.
    html = etree.HTML(response.content.decode("gbk", errors="replace"))

    movie = {"title": None, "cover": None, "screenshot": None}
    if html is None:
        # Defensive guard in case the parser produces no tree for the body.
        movie["download_url"] = []
        return movie

    titles = html.xpath("//div[@class='title_all']//font/text()")
    if titles:
        movie["title"] = titles[0]

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if zoom_nodes:
        zoom = zoom_nodes[0]
        imgs = zoom.xpath(".//img/@src")
        if imgs:
            movie["cover"] = imgs[0]
        if len(imgs) > 1:
            movie["screenshot"] = imgs[1]
        _fill_labelled_fields(zoom.xpath(".//text()"), movie)

    movie["download_url"] = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    return movie
def spider(n=2):
    """Crawl the movie list pages and parse every movie linked from them.

    Args:
        n: Exclusive upper bound of the list-page numbers; pages
            ``1 .. n-1`` are crawled. The default of 2 crawls only page 1.
            NOTE(review): if ``n`` was meant to be the number of pages, the
            bound should be ``n + 1`` -- confirm with callers before changing.

    Returns:
        A list of the movie dicts produced by ``parse_detail_page``, in crawl
        order. Each movie is also printed as soon as it is parsed.
    """
    list_url = BASE_URL + "/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Outer loop: one iteration per list page.
    for page_number in range(1, n):
        # Inner loop: one iteration per movie detail URL on that page.
        for detail_url in get_detil_urls(list_url.format(page_number)):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies
if __name__ == "__main__":
    # Pass the page bound explicitly: spider(n) walks list pages 1 .. n-1,
    # so 2 crawls only the first list page. Raise it to crawl more pages.
    spider(2)