import requests
from lxml import etree
urls = 'https://www.dy2018.com/html/gndy/dyzz/index_12.html'  # sample list-page URL (unused; spider() builds its own)
BASE_URL = 'https://www.dy2018.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
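
# Optional sketch (illustration only, not used below): a shared Session would
# reuse one TCP connection across the many requests this spider makes:
#     session = requests.Session()
#     session.headers.update(headers)
#     response = session.get(url)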
# Collect the detail-page URLs from one list page
def get_detail_urls(urls):
    response = requests.get(urls, headers=headers)
    text = response.text
    # Note: response.content.decode('gb2312') fails on the list pages; some
    # characters fall outside gb2312 and raise UnicodeDecodeError (mojibake).
html = etree.HTML(text)
detail_urls = html.xpath("//div[@class='co_content8']//ul//a/@href")
detail_urls = map(lambda url: BASE_URL + url, detail_urls)
    # The map() above is equivalent to this explicit loop:
    # index = 0
    # for detail_url in detail_urls:
    #     detail_urls[index] = BASE_URL + detail_url
    #     index += 1
return detail_urls
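
# Illustrative usage (the page number is arbitrary): get_detail_urls returns a
# lazy map object, so wrap it in list() for len() or repeated iteration:
#     page = list(get_detail_urls(BASE_URL + '/html/gndy/dyzz/index_2.html'))
#     print(len(page), page[:3])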
# Parse one detail page into a movie dict
def parse_detail_page(url):
    response = requests.get(url, headers=headers)
    # The detail pages declare gb2312; gbk is a superset and decodes them cleanly.
    text = response.content.decode('gbk')
html = etree.HTML(text)
    # Approach 1: index into fixed paragraph positions (brittle if the layout shifts)
title = html.xpath("//div[@class='title_all']/h1/text()")[0]
thumb = html.xpath("//div[@id='Zoom']/p[1]/img/@src")
# field1 = html.xpath("//div[@id='Zoom']/p[2]/text()")[0].strip().replace('◎译 名','').strip()
# field2 = html.xpath("//div[@id='Zoom']/p[3]/text()")[0].strip().replace('◎片 名','').strip()
# field3 = html.xpath("//div[@id='Zoom']/p[4]/text()")[0].strip().replace('◎年 代','').strip()
# field4 = html.xpath("//div[@id='Zoom']/p[5]/text()")[0].strip().replace('◎产 地','').strip()
    # Approach 2: relative XPath queries inside the Zoom div
# zoom = html.xpath("//div[@id='Zoom']")
# for i in zoom:
# thumb = i.xpath(".//p[1]/img/@src")
# field1 = i.xpath(".//p[2]/text()")[0].strip().replace('◎译 名','').strip()
    # Approach 3: scan every text node and dispatch on the "◎" field markers
def parse_info(info,rule):
return info.replace(rule,'').strip()
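    # e.g. parse_info("◎年 代 2018", "◎年 代") -> "2018"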
    # Defaults so the movie dict below never raises NameError when a page
    # omits one of the "◎" fields.
    field1 = field2 = field3 = field4 = field6 = ''
    field5 = []
    infos = html.xpath("//div[@id='Zoom']//text()")  # scope the scan to the info block
for index,info in enumerate(infos):
# print(info)
if info.startswith("◎译 名"):
field1 = parse_info(info,"◎译 名")
elif info.startswith("◎片 名"):
field2 = parse_info(info,"◎片 名")
elif info.startswith("◎年 代"):
field3 = parse_info(info,"◎年 代")
elif info.startswith("◎产 地"):
field4 = parse_info(info, "◎产 地")
elif info.startswith("◎主 演"):
info = parse_info(info,"◎主 演")
actors = [info]
for i in range(index+1,len(infos)):
actor = infos[i].strip()
if actor.startswith("◎"):
break
# print(type(actor),'---',actor)
            if actor != '':  # already stripped above
actors.append(actor)
field5 = actors
elif info.startswith("◎简 介"):
# info = parse_info(info,"◎简 介")
descs = []
for i in range(index+1,len(infos)):
desc = infos[i].strip()
if desc.startswith("◎"):
break
            if desc != '':  # already stripped above
descs.append(desc)
field6 = ''.join(descs)
preview = html.xpath("//div[@id='Zoom']/div/img/@src")
    # xpath() already returns a list of hrefs, so no copy loop is needed.
    down_urls = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
movie = {
'title': title,
'thumb': thumb,
'field1': field1,
'field2': field2,
'field3': field3,
'field4': field4,
'field5': field5,
'field6': field6,
'preview': preview,
'down_urls': down_urls,
}
return movie
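
# Hedged sketch (hypothetical helper, not in the original script): a long crawl
# will hit dead links and pages missing fields, so a thin wrapper keeps the
# loop alive instead of crashing on the first bad URL.
def safe_parse_detail_page(url):
    try:
        return parse_detail_page(url)
    except (requests.RequestException, UnicodeDecodeError, IndexError) as exc:
        # IndexError covers the xpath(...)[0] lookups on pages missing a field.
        print('skipping', url, '->', exc)
        return None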
# Crawl the list pages and hand each detail URL to the parser
def spider():
base_url = "https://www.dy2018.com/html/gndy/dyzz/index_{}.html"
    for x in range(2, 4):  # pages 2-3 only; widen the range to crawl more
url = base_url.format(x)
detail_urls = get_detail_urls(url)
        for u in detail_urls:
            movie = parse_detail_page(u)
            print(movie)
            break  # debug: stop after the first movie
        break  # debug: stop after the first list page
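
# Courtesy sketch (assumption, not in the original): once the debug breaks
# above are removed, pacing requests is kinder to the site, e.g. inside the
# inner loop:
#     import time
#     time.sleep(1)  # pause between detail-page requests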
if __name__ == '__main__':
spider()