from fake_useragent import UserAgent
from lxml import etree
import requests
def get_stree(url):
    """Fetch a page and parse it into an lxml element tree.

    :param url: address of the page to download
    :return: the tree produced by ``etree.HTML`` from the response text
    """
    # Certificate verification is disabled below, so silence the warning
    # urllib3 would otherwise emit on every request.
    requests.packages.urllib3.disable_warnings()
    headers = {'User-Agent': UserAgent().random}
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would put the User-Agent into
    # the query string instead of sending it as an HTTP header.
    response = requests.get(url, headers=headers, verify=False)
    return etree.HTML(response.text)
def _strip_spaces_and_newlines(text):
    """Return ``text`` with every newline and space character removed."""
    return text.replace("\n", "").replace(" ", "")


def get_movie(e):
    """Collect name, director, actors and detail URL for each listed movie.

    For every title link found in ``e`` the detail page is downloaded with
    ``get_stree`` and the director and actor names are read from it.

    :param e: parsed listing page; must provide an ``xpath`` method
    :return: list of dicts with the keys ``name``, ``director``, ``actor``
        and ``url``; an empty list when the page has no title links
    """
    names = e.xpath('//div[@class="channel-detail movie-item-title"]/a/text()')
    hrefs = e.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')
    movie = []
    # Titles and links are paired positionally; if the two queries return
    # different counts, the unmatched tail is ignored rather than raising.
    for name, href in zip(names, hrefs):
        url = "https://maoyan.com" + href
        detail = get_stree(url)
        directors = detail.xpath(
            '//ul[@class="celebrity-list clearfix"]/li[@class="celebrity "]'
            '/div[@class="info"]/a/text()'
        )
        # Use the first match as the director, or the placeholder when the
        # detail page has no matching entry.
        director = _strip_spaces_and_newlines(directors[0]) if directors else "暂无"
        actors = detail.xpath('//li[@class="celebrity actor"]/div[@class="info"]/a/text()')
        # Every actor name is followed by three spaces, including the last.
        actor_text = "".join(
            _strip_spaces_and_newlines(actor) + " " * 3 for actor in actors
        )
        movie.append({
            'name': name,
            'director': director,
            'actor': actor_text,
            'url': url,
        })
    return movie
def get_num(num, movies=None):
    """Scrape ``num`` listing pages and collect one record per page.

    Page ``i`` (counting from 0) is requested with ``offset=i * 30``. Each
    record is a dict with ``id`` (1-based page number), ``type`` (a label
    containing the page number) and ``child`` (the result of ``get_movie``
    for that page).

    :param num: number of listing pages to fetch; zero or a negative value
        fetches nothing
    :param movies: optional list to append the page records to; a new list
        is created when omitted
    :return: the list the records were appended to
    """
    # A fresh list per call: a mutable default argument would be shared by
    # every call that omits ``movies`` and keep growing between calls.
    if movies is None:
        movies = []
    for page in range(num):
        url = 'https://maoyan.com/films?showType=3&offset=' + str(page * 30)
        movies.append({
            'id': page + 1,
            'type': '第{0}页数据'.format(page + 1),
            'child': get_movie(get_stree(url)),
        })
    return movies
# Script body: scrape the first two listing pages into ``movies`` and print
# the collected records.
movies = list()
print(get_num(num=2, movies=movies))