1.程序如下
import requests
from lxml import etree
import json
# Site root that is prefixed to each movie's relative link to build its full URL.
Base_download = 'http://www.9rmb.com'

# Request headers sent with every HTTP call; the User-Agent identifies the
# client as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}
def spider(first_page=1, last_page=7, output_path='11.txt'):
    """Crawl the movie listing pages and append each movie to a JSON-lines file.

    For every listing page from ``first_page`` to ``last_page`` (inclusive),
    the detail-page URLs are collected with ``get_detail_urls`` and each one
    is parsed with ``parse_detail_page``. Every parsed movie is printed and
    appended to ``output_path`` as one JSON object per line.

    Args:
        first_page: Number of the first listing page to crawl.
        last_page: Number of the last listing page to crawl (inclusive).
        output_path: File the records are appended to (UTF-8 encoded).

    Returns:
        A list containing every movie dict parsed during the crawl.
    """
    base_url = 'http://www.9rmb.com/type/1/{}.html'  # listing-page URL template
    movies = []
    # Open the output once for the whole crawl instead of once per movie.
    with open(output_path, 'a', encoding='utf-8') as f:
        for num in range(first_page, last_page + 1):
            join_url = base_url.format(num)
            # Materialise the result so the URLs themselves are printed
            # rather than the repr of a lazy iterator.
            detail_urls = list(get_detail_urls(join_url))
            print(detail_urls)
            for url in detail_urls:
                movie = parse_detail_page(url)
                movies.append(movie)
                print(movie)
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dump(movie, f, ensure_ascii=False)
                f.write('\n')
    return movies
def get_detail_urls(urls, timeout=10):
    """Return the detail-page URLs of the movies linked from one listing page.

    Args:
        urls: URL of a single listing page.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A list of detail-page URLs, each built by prefixing an extracted
        ``href`` with ``Base_download``.
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    response = requests.get(url=urls, headers=headers, timeout=timeout)
    page = etree.HTML(response.text)
    hrefs = page.xpath('//div[@class="movie-item"]/a/@href')
    # A real list (not a one-shot map iterator) can be printed, measured and
    # iterated more than once by the caller.
    return [Base_download + href for href in hrefs]
def _first_match(element, expression):
    """Return the first result of an XPath query, or None if nothing matches."""
    matches = element.xpath(expression)
    return matches[0] if matches else None


def parse_detail_page(urls, timeout=10):
    """Download one movie detail page and extract its basic fields.

    Args:
        urls: URL of a single movie detail page.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A dict with the keys ``title``, ``actors``, ``country`` and
        ``evaluate``. A field whose XPath query matches nothing is stored as
        ``None`` so that one incomplete page does not abort the whole crawl
        with an IndexError.
    """
    raw = requests.get(urls, headers=headers, timeout=timeout).content
    # Decode explicitly as UTF-8 and drop undecodable bytes instead of failing.
    page = etree.HTML(raw.decode('utf-8', 'ignore'))
    return {
        'title': _first_match(page, '//div[@class="col-md-12"]/h1/text()'),
        'actors': _first_match(page, '//td[@id="casts"]/text()'),
        # Positional query: second cell of a fourth table row, which the
        # original code treats as the country field.
        'country': _first_match(page, '//tr[4]/td[2]/text()'),
        'evaluate': _first_match(page, '//a[@class="score"]/text()'),
    }
if __name__ == '__main__':
    # Empty the output file BEFORE crawling: spider() appends to '11.txt', so
    # truncating it after the crawl would discard every scraped record.
    with open('11.txt', 'w', encoding='utf-8'):
        pass
    spider()
编写过程参考了:
https://blog.csdn.net/qq_43515464/article/details/102969930