import requests
from lxml import html
# Scheme and host of the site; prepended to the hrefs scraped from listing pages.
BASE_DOMAIN = 'https://www.dytt8.net'

# Example listing page: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html

# Headers sent with every request; the User-Agent is a desktop browser string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.26 Safari/537.36 '
        'Core/1.63.6788.400 QQBrowser/10.3.2843.400'
    ),
}
def spider(first_page=1, last_page=7):
    """Crawl the movie listing pages and parse every linked detail page.

    Each parsed movie is printed as soon as it is available and also
    collected, so callers can use the results instead of scraping stdout.

    Args:
        first_page: Number of the first listing page to fetch (inclusive).
        last_page: Number of the last listing page to fetch (inclusive).
            The defaults cover pages 1 through 7.

    Returns:
        A list with one dict per detail page, in crawl order, as produced
        by ``parse_detail_page``.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for num in range(first_page, last_page + 1):
        listing_url = base_url.format(num)
        for detail_url in get_detail_urls(listing_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Hand the collected results back instead of dropping them on exit.
    return movies
def get_detail_urls(url, timeout=10):
    """Fetch one listing page and return the detail-page URLs it links to.

    Args:
        url: URL of the listing page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled connection would block forever.

    Returns:
        A list of URLs, each formed by prefixing ``BASE_DOMAIN`` to an
        ``href`` found inside a ``<table class="tbspan">`` element.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode as GBK and drop any bytes that are not valid GBK.
    page_text = response.content.decode('gbk', 'ignore')
    root = html.etree.HTML(page_text)
    hrefs = root.xpath('//table[@class="tbspan"]//a/@href')
    # Return a real list (not a one-shot ``map`` iterator) so the result
    # can be iterated more than once, indexed, or measured with len().
    return [BASE_DOMAIN + href for href in hrefs]
def parse_info(info, rule):
    """Remove every occurrence of ``rule`` from ``info`` and trim the result.

    Args:
        info: The text to clean.
        rule: The substring (for example a field label) to remove.

    Returns:
        ``info`` without ``rule`` and without leading/trailing whitespace.
    """
    without_rule = info.replace(rule, "")
    return without_rule.strip()
def _collect_continuation(infos, start):
    """Return the stripped text nodes from ``start`` up to the next "◎" field.

    Multi-line fields (directors, actors, synopsis) continue over the text
    nodes that follow their label. Collection stops at the first node that
    begins a new "◎" field, or at the end of ``infos``.
    """
    collected = []
    for raw in infos[start:]:
        text = raw.strip()
        if text.startswith("◎"):
            break
        collected.append(text)
    return collected


def parse_detail_page(url, timeout=10):
    """Fetch a movie detail page and extract its fields into a dict.

    Args:
        url: URL of the detail page.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled connection would block forever.

    Returns:
        A dict that always contains ``title`` (``None`` when the title
        element is absent). When the ``<div id="Zoom">`` element exists it
        also contains ``cover`` and ``screenshot`` (first and second image
        ``src``, ``None`` when missing) plus whichever of ``year``,
        ``country``, ``categary``, ``douban``, ``director``, ``actor`` and
        ``profile`` are present on the page. ``director``, ``actor`` and
        ``profile`` are lists of strings; the other values are strings.
    """
    movie = {}
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Decode as GBK and drop any bytes that are not valid GBK.
    page_text = response.content.decode('gbk', 'ignore')
    root = html.etree.HTML(page_text)

    titles = root.xpath(
        '//div[@class="title_all"]//font[@color="#07519a"]/text()'
    )
    movie['title'] = titles[0] if titles else None

    zoom_nodes = root.xpath('//div[@id="Zoom"]')
    if not zoom_nodes:
        # Nothing else can be extracted without the content container.
        return movie
    zoom = zoom_nodes[0]

    # Some pages carry fewer than two images; do not crash on them.
    imgs = zoom.xpath(".//img/@src")
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    # Single-line fields: page label -> key in the result dict.
    # NOTE: the "categary" spelling is kept because callers may read it.
    single_line_fields = (
        ("◎年 代", "year"),
        ("◎产 地", "country"),
        ("◎类 别", "categary"),
        ("◎豆瓣评分", "douban"),
    )

    infos = zoom.xpath('.//text()')
    for index, info in enumerate(infos):
        # A text node carries at most one label, so the first match wins.
        matched = False
        for label, key in single_line_fields:
            if info.startswith(label):
                movie[key] = parse_info(info, label)
                matched = True
                break
        if matched:
            continue

        if info.startswith("◎导 演"):
            first = parse_info(info, "◎导 演")
            movie["director"] = [first] + _collect_continuation(infos, index + 1)
        elif info.startswith("◎主 演"):
            first = parse_info(info, "◎主 演")
            movie["actor"] = [first] + _collect_continuation(infos, index + 1)
        elif info.startswith("◎简 介 "):
            # The synopsis text lives entirely in the following nodes.
            movie["profile"] = _collect_continuation(infos, index + 1)
    return movie
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    spider()
# Notes:
# 1. `return movie` could be replaced with `yield` to improve efficiency.
# 2. Why is my background black?