转载自知了课堂的教程:
自己尝试一下,检验自己有没有理解、会不会操作。输入时出现了很多错误。这是 2017 年的教程,电影网站的形式大部分未变化,才有机会应用。对于新手的我来说,需要理解 map 函数、format、startswith 等用法。
流程主要是分析出分页的形式,解析每个分页中的电影详情页,把每个详情页中获取出信息。
其中lambda 作用类似于:
positionLink = HTML.xpath('//td[1]/a/@href')
for index in range(len(positionLink)-1):
    # 爬取职位链接,进行深度提取信息
    page_url = "https://hr.tencent.com/" + positionLink[index]
    # print(page_url)
from lxml import etree
import requests
# Browser-like request headers so the target site serves normal pages to the
# crawler instead of rejecting it.
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'referer':'https://www.dytt8.net/index.htm'}
# Site root prepended to the site-relative detail-page hrefs.
# NOTE(review): the referer above uses www.dytt8.net while this omits the
# "www." prefix — confirm both hostnames resolve to the same site.
BASE_DOMAIN='https://dytt8.net'
def get_detail_urls(url):
    """Fetch one listing (index) page and return the absolute URLs of every
    movie detail page linked from it.

    Args:
        url: full URL of one listing page (e.g. list_23_1.html).

    Returns:
        list[str]: absolute detail-page URLs.
    """
    resp = requests.get(url, headers=HEADERS)
    html = etree.HTML(resp.text)
    # The listing table links are site-relative ("/html/..."); prefix the
    # domain to build absolute URLs. A list comprehension replaces the
    # original map(lambda ...) — clearer, and the result can be iterated
    # more than once.
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]
def parse_info(info,rule):
return info.replace(rule, " ").strip() #将多余字符替换为空格
def parse_detail_page(url):
    """Download one movie detail page and extract its metadata.

    Args:
        url: absolute URL of a movie detail page.

    Returns:
        dict with keys such as 'title', 'cover', 'photoshot', 'year',
        'country', 'language', 'douban_rating', 'duration', 'director',
        'actors' (list), 'profile', 'download_url' — a key is present only
        when the page actually carried that piece of information.
    """
    movie = {}
    resp = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages; ignore the occasional byte that is
    # not valid GBK rather than aborting the whole crawl with a decode error.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # BUG FIX: guard imgs[0] — a page with no images at all used to raise
    # IndexError here. Some pages also have no screenshot (second image).
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['photoshot'] = imgs[1]

    # Walk every text node under #Zoom; each "◎..." label starts one field.
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie['year'] = parse_info(info, "◎年 代")
        elif info.startswith("◎产 地"):
            movie['country'] = parse_info(info, "◎产 地")
        elif info.startswith("◎语 言"):
            movie['language'] = parse_info(info, "◎语 言")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_rating'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片 长"):
            movie['duration'] = parse_info(info, "◎片 长")
        elif info.startswith("◎导 演"):
            movie['director'] = parse_info(info, "◎导 演")
        elif info.startswith("◎主 演"):
            # The first actor shares the label line; the rest follow one per
            # line until the next "◎" label.
            actors = [parse_info(info, "◎主 演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简 介"):
            # BUG FIX: 'profile' was referenced after this loop without being
            # initialized — if the very next line already started with a
            # marker, the break fired first and a NameError followed.
            profile = ""
            for x in range(index + 1, len(infos)):
                line = infos[x]
                # startswith accepts a tuple: stop at the download section,
                # the next label, or an indented layout line.
                if line.startswith(("【下载地址】", "◎", " ")):
                    break
                profile = line.strip()
            if not profile and index + 4 < len(infos):
                # Some pages pad the synopsis with blank lines; fall back to
                # the line four entries down (original heuristic), now
                # BUG FIX: bounds-checked to avoid IndexError near the end.
                profile = infos[index + 4].strip()
            movie['profile'] = profile
        elif info.startswith("ftp://"):
            # Some pages are irregular and the download link only appears as
            # a bare text node, hence matching on the "ftp://" prefix.
            movie['download_url'] = info
    return movie
def spider():
    """Crawl the movie listing, parse every detail page, and return the
    collected movie dicts (also printed one per movie as they arrive)."""
    movies = []
    base_url = 'https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
    # range(1, 2) crawls only the first listing page; widen the range to
    # crawl more pages. {} in base_url is filled with the page number.
    for page in range(1, 2):
        url = base_url.format(page)
        for detail_url in get_detail_urls(url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            # BUG FIX: the original printed the whole accumulated list on
            # every iteration (O(n^2) output); print just the new movie.
            print(movie)
    return movies
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    spider()