分步获取信息

最新推荐文章于 2024-05-29 10:17:54 发布

哈斯卡:no pain no gain

最新推荐文章于 2024-05-29 10:17:54 发布

阅读量270

点赞数

本文链接：https://blog.csdn.net/sfwwdd/article/details/106764126

版权

转载自知了课堂的教程：
自己尝试一下，发现自己有没有理解，会不会操作。输入时有很多错误，这是2017年的教程，电影网站形式大部分未变化，才有机会应用。对于新手的我来说，要理解map函数，format,startswith…
流程主要是分析出分页的形式，解析每个分页中的电影详情页，把每个详情页中获取出信息。
其中lambda 作用类似于：
positionLink=HTML.xpath(’//td[1]/a/@href’)
for index in range(len(positionLink)-1):
#爬取职位链接，进行深度提取信息
page_url=“https://hr.tencent.com/” + positionLink[index]
#print(page_url)

from lxml import etree
import requests
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'referer':'https://www.dytt8.net/index.htm'}
BASE_DOMAIN='https://dytt8.net'
def get_detail_urls(url):
	resp = requests.get(url,headers=HEADERS)
	text = resp.text
	html = etree.HTML(text)
	detail_urls = html.xpath("//table[@class='tbspan']//a/@href") #得到分页中所有电影url半个网址的列表
	detail_urls =map(lambda x: BASE_DOMAIN+x,detail_urls) #从右边的+x为 ,detail_urls列表中每一个电影半个url,加上DOMAIN组成完整的URL x,放入map列表中。
	return detail_urls

def parse_info(info,rule):
	return info.replace(rule, " ").strip()  #将多余字符替换为空格

def parse_detail_page(url):
	movie = {}
	resp = requests.get(url,headers = HEADERS)
	text = resp.content.decode('gbk')
	html = etree.HTML(text)
	title =html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0] #得到lxml.etree._ElementUnicodeResult，在此条件下还能xpath
	movie['title'] = title
	zoomE =html.xpath("//div[@id='Zoom']")[0]
	imgs =zoomE.xpath(".//img/@src")
	cover=imgs[0]
	movie['cover'] = cover
	if len(imgs)>1:
		photoshot = imgs[1] #有的电影没有截图
		movie['photoshot'] = photoshot
	infos = zoomE.xpath(".//text()") #得到列表用于遍历
	for index,info in enumerate(infos):
		#print(info)
		#print(index) 方便查看索引和信息
		#print('='*25)

		if info.startswith("◎年　　代"):
			year = parse_info(info,"◎年　　代")
			movie['year']= year
		elif info.startswith("◎产　　地"):
			country = parse_info(info,"◎产　　地")
			movie['country']= country
		elif info.startswith("◎语　　言"):
			language = parse_info(info,"◎语　　言")
			movie['language']= language
		elif info.startswith("◎豆瓣评分"):
			douban_rating = parse_info(info,"◎豆瓣评分")
			movie['douban_rating']= douban_rating
		elif info.startswith("◎片　　长"):
			duration = parse_info(info,"◎片　　长")
			movie['duration']= duration
		elif info.startswith("◎导　　演"):
			director = parse_info(info,"◎导　　演")
			movie['director']= director
		elif info.startswith("◎主　　演"):
			info = parse_info(info,"◎主　　演")
			actors = [info] #主演当前行加入列表
			for x in range(index+1,len(infos)): #主演下一行开始循环，len(infos)列表最大行数，防止列表越界
				actor = infos[x].strip()
				if actor.startswith("◎"):
					break
				actors.append(actor)
			movie['actors']= actors
		elif info.startswith("◎简　　介"):
			for x in range(index+1,len(infos)):
				if infos[x].startswith("【下载地址】") or infos[x].startswith("◎") or infos[x].startswith(" "):
					break #当索引中出现以下字符中断，要用infos[x],是列表中的第几行
				else:
					profile =infos[x].strip()
					movie['profile']= profile #如果简介索引中空出4行，字典值为空，重新取值
					if movie['profile']: 
						movie['profile']= profile
					else:
						movie['profile'] = infos[index+4].strip()
		elif info.startswith("ftp://"):
			download_url = infos[index]
			movie['download_url']= download_url #电影天堂有些不规则，xpath 没有text。 html.xpath("//td[@bgcolor='#fdfddf']/a/text()")
	return movie
def spider():
	movies=[]
	base_url ='https://dytt8.net/html/gndy/dyzz/list_23_{}.html'
	for x in range(1,2): #遍历分页，两个循环不能拆开
		url=base_url.format(x)#占一个空位，循环填充
		detail_urls = get_detail_urls(url)
		for detail_url in detail_urls:
			movie =  parse_detail_page(detail_url) #遍历分页中的每一个详情页
			movies.append(movie)
			print(movies)
			#break #中断第一个分页下的第一个详情页网站，方便修改时观察结果
if __name__ == '__main__':
	spider()

哈斯卡:no pain no gain

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
分步获取信息

转载自知了课堂的教程：自己尝试一下，发现自己有没有理解，会不会操作。输入时有很多错误，这是2017年的教程，电影网站形式大部分未变化，才有机会应用。对于新手的我来说，要理解map函数，format,startswith…流程主要是分析出分页的形式，解析每个分页中的电影详情页，把每个详情页中获取出信息。from lxml import etreeimport requestsHEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) A
复制链接

扫一扫