from lxml import etree
import requests
# Site root that scraped href values are appended to.
BASE_DOMIN = "http://www.ygdy8.net"

# Headers sent with every request; the User-Agent is a mobile Chrome string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Mobile Safari/537.36"
    ),
}

# Address of list page number 1.
url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html".format(1)
def get_detail_url(url, timeout=10):
    """Return the absolute detail-page URLs linked from one list page.

    Args:
        url: Address of a list page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of URLs, one per ``href`` found inside ``table.tbspan``
        elements, each resolved against ``BASE_DOMIN``. Empty when the page
        could not be parsed.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # requests decodes with the encoding it guesses, which garbles this
    # site's Chinese text; only href attributes are read from this page.
    # NOTE(review): hrefs are presumably ASCII paths, so the mis-decoding
    # should be harmless here - confirm against a live page.
    html = etree.HTML(response.text)
    if html is None:
        return []
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin also copes with hrefs that are already absolute or that lack
    # a leading slash, which plain string concatenation would mangle.
    return [urljoin(BASE_DOMIN, href) for href in hrefs]
def parse_replace(text1, rule):
    """Drop every occurrence of *rule* from *text1* and trim the result.

    Args:
        text1: The raw text to clean.
        rule: The substring to remove.

    Returns:
        *text1* without *rule* and without leading/trailing whitespace.
    """
    without_rule = text1.replace(rule, "")
    return without_rule.strip()
def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def _match_label(text, label):
    """Return the value following the ``◎`` + *label* prefix of *text*.

    Whitespace between the label characters is skipped, so the label
    matches however it is padded. Returns ``None`` when *text* does not
    start with the label.
    """
    rest = text.lstrip()
    for char in "◎" + label:
        if not rest.startswith(char):
            return None
        # str.lstrip() with no argument also removes full-width spaces.
        rest = rest[len(char):].lstrip()
    return rest.strip()


def _collect_until(lines, start, stop_prefix):
    """Return stripped, non-empty lines from *start* up to *stop_prefix*.

    Collection stops at the first line that starts with *stop_prefix*
    (that line is not included) or at the end of *lines*.
    """
    collected = []
    for raw in lines[start:]:
        line = raw.strip()
        if line.startswith(stop_prefix):
            break
        if line:
            collected.append(line)
    return collected


def _parse_detail_fields(lines):
    """Extract the labelled ``◎`` fields from the detail text nodes."""
    fields = {}
    # Page label -> key stored in the movie dict.
    simple_fields = (("年代", "年份"), ("产地", "产地"), ("类别", "类别"))
    for index, text in enumerate(lines):
        if not text.lstrip().startswith("◎"):
            continue
        for label, key in simple_fields:
            value = _match_label(text, label)
            if value is not None:
                fields[key] = value

        first_actor = _match_label(text, "主演")
        if first_actor is not None:
            # The cast continues on the following lines until the next
            # "◎" label.
            cast = [first_actor] if first_actor else []
            fields["actor"] = cast + _collect_until(lines, index + 1, "◎")

        intro = _match_label(text, "简介")
        if intro is not None:
            # Keep every line of the synopsis instead of only the last one.
            paragraphs = [intro] if intro else []
            paragraphs += _collect_until(lines, index + 1, "【下载地址】")
            fields["profile"] = "\n".join(paragraphs)
    return fields


def parse_detail_url(url, timeout=10):
    """Download one movie detail page and return its data as a dict.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the keys ``title``, ``cover``, ``screeshot`` and
        ``下载地址`` (each ``None`` when the page lacks the element), plus
        ``年份``, ``产地``, ``类别``, ``actor`` (list of str) and ``profile``
        (str) for every labelled field found in the page text. Empty when
        the page could not be parsed.
    """
    movie = {}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode explicitly as GBK; a stray invalid byte is dropped rather than
    # aborting the whole page.
    page = response.content.decode("gbk", errors="ignore")
    html = etree.HTML(page)
    if html is None:
        return movie

    # Movie title.
    movie["title"] = _first(html.xpath("//font[@color='#07519a']/text()"))

    # Images in the detail container: the first is used as the poster and
    # the second as the screenshot.
    zoom = _first(html.xpath("//div[@id='Zoom']"))
    images = zoom.xpath(".//img/@src") if zoom is not None else []
    movie["cover"] = _first(images)
    movie["screeshot"] = images[1] if len(images) > 1 else None

    # NOTE(review): the fixed-space label comparison used previously never
    # matched - presumably the page pads labels with full-width spaces
    # (U+3000); labels are now matched ignoring whitespace. Confirm against
    # a live page.
    detail_text = html.xpath("//div[@id='Zoom']//text()")
    movie.update(_parse_detail_fields(detail_text))

    # Download link.
    movie["下载地址"] = _first(html.xpath("//td[@bgcolor='#fdfddf']/a/@href"))
    return movie
def spiders(first_page=1, last_page=2):
    """Crawl a range of list pages and collect every movie found.

    Args:
        first_page: Number of the first list page to crawl.
        last_page: Number of the last list page to crawl (inclusive).

    Returns:
        A list with one movie dict per detail page, across all crawled
        list pages. The list is also printed once the crawl finishes.
    """
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    # Created once, outside the page loop, so results from earlier pages
    # are not discarded when the next page starts.
    movies = []
    for page in range(first_page, last_page + 1):
        for detail_url in get_detail_url(base_url.format(page)):
            movies.append(parse_detail_url(detail_url))
            # Progress marker: one line per parsed movie.
            print("$" * 20)
    print(movies)
    return movies
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    spiders()