from lxml import etree
import requests

BASE_DOMAIN = 'http://www.ygdy8.net'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2577.400'
}


def get_detal_urls(url):
    """Fetch one listing page and return absolute URLs of every movie detail page on it.

    :param url: URL of a paginated listing page (list_23_N.html).
    :return: list of absolute detail-page URLs.
    """
    response = requests.get(url, headers=HEADERS)
    # BUG FIX: the site serves GBK-encoded pages; `response.text` guesses the
    # wrong charset and yields mojibake, so decode the raw bytes explicitly.
    # errors='ignore' guards against the occasional malformed byte.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # // selects at any depth; @href extracts the attribute value.
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Return a real list (the original returned a one-shot `map` iterator,
    # which silently becomes empty if consumed twice).
    return [BASE_DOMAIN + detail_url for detail_url in detail_urls]


def parge_detail_page(url):
    """Fetch a movie detail page and extract its metadata.

    :param url: absolute URL of a detail page produced by get_detal_urls().
    :return: dict with keys 'title', 'cover', 'screenshot', 'info' (year),
             'contry' (sic — key kept for backward compatibility), 'category',
             'actors', 'profile', 'download_url'.  Keys are absent when the
             page lacks the corresponding section.
    """
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoome = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoome.xpath(".//img/@src")
    # ROBUSTNESS: the original indexed imgs[0]/imgs[1] unconditionally and
    # crashed on pages with fewer than two images.
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]
    infos = zoome.xpath(".//text()")
    for index, info in enumerate(infos):
        # Each metadata line starts with a "◎<label>" marker; strip the
        # marker and keep the value.
        if info.startswith("◎年 代"):
            movie['info'] = info.replace("◎年 代", "").strip()
        elif info.startswith("◎产 地"):
            # NOTE: key 'contry' is a historical typo, preserved so existing
            # consumers of the dict keep working.
            movie['contry'] = info.replace("◎产 地", "").strip()
        elif info.startswith("◎类 别"):
            movie['category'] = info.replace("◎类 别", "").strip()
        elif info.startswith("◎主 演"):
            # The cast list continues on the following text nodes until the
            # next "◎" section marker.
            actors = [info.replace("◎主 演", "").strip()]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简 介"):
            # BUG FIX: the original overwrote movie['profile'] on every
            # iteration (keeping only the last text node) and never stopped
            # at the next "◎" section.  Accumulate all synopsis lines and
            # join them instead.
            profile_parts = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎"):
                    break
                if line:
                    profile_parts.append(line)
            movie['profile'] = "\n".join(profile_parts)
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
    return movie


def spider():
    """Crawl listing pages 1-7, scrape every movie's detail page, and print the results."""
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(1, 8):
        # Build the paginated listing URL, then follow each detail link.
        url = base_url.format(page)
        for detail_url in get_detal_urls(url):
            movies.append(parge_detail_page(detail_url))
    print(movies)


if __name__ == '__main__':
    spider()
python爬虫3(电影天堂)
最新推荐文章于 2024-07-24 17:05:28 发布