今天给大家介绍如何爬取某猫即将上映电影的详细数据。
这是我们今天爬取的页面
今天需要用到3个模块
import parsel
import requests as r
import xlwt
parsel 是从 Scrapy 中独立出来的 HTML/XML 解析库,支持 XPath、正则表达式(re)和 CSS 选择器三种提取方式
xlwt 是用于创建 Excel(.xls)文件并向其中写入数据的库
现在开始上代码
import parsel
import requests as r
import xlwt
# Request headers sent with every HTTP call; presents a desktop Firefox
# User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
}

# Listing URL prefix; each value from `page` is appended as the offset.
url = 'https://maoyan.com/films?showType=2&offset='

# Offsets of the four listing pages: 0, 30, 60, 90.
page = list(range(0, 120, 30))

# Index of the next worksheet row to fill; rows 0-1 hold the merged header.
rows = 2
def movie_url():
    """Collect the film-page URL of every film on the listing pages.

    Requests the listing at ``url`` once for each offset in ``page`` and
    extracts the ``href`` of every film title link.

    Returns:
        list: Full film-page URLs, in the order they were encountered.
    """
    film = []
    for offset in page:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = r.get(f'{url}{offset}', headers=headers, timeout=10)
        listing = parsel.Selector(response.text)
        film_order = listing.xpath(
            '//div[@class="channel-detail movie-item-title"]/a/@href'
        ).extract()
        # Prefix the host to complete each extracted href into a full URL.
        film.extend('https://maoyan.com' + href for href in film_order)
    return film
def movie_detail(film):
    """Scrape every film page and write one worksheet row per film.

    For each URL in ``film`` the page is downloaded and the title, genre,
    country, running time, director, up to three cast names and synopsis
    are extracted, then passed to ``excel`` together with the module-level
    ``sheet1``. Fields that cannot be found are filled with '未知' so that a
    single incomplete page does not abort the run; a page without a title
    is skipped.

    Args:
        film: Iterable of film-page URLs.
    """
    for film_page in film:
        # A timeout keeps one stalled connection from hanging the whole run.
        response1 = r.get(film_page, headers=headers, timeout=10)
        data = parsel.Selector(response1.text)

        # Film title. Without one the page is not usable, so skip it
        # instead of raising IndexError and losing all rows gathered so far.
        titles = data.re('>(.+)</h1>')
        if not titles:
            print(f'未找到电影名,跳过:{film_page}')
            continue
        movie_name = titles[0]

        # The second-to-last "ellipsis" item is read as
        # "<country>\n<running time>"; either part may be absent.
        movie_message = data.xpath('//ul/li[@class="ellipsis"]/text()').extract()
        if len(movie_message) >= 2:
            movie_time = movie_message[-2].strip().split('\n')
        else:
            movie_time = ['未知']
        if len(movie_time) == 1:
            movie_time.append('未知')
            # NOTE(review): SOURCE's indentation was lost; this print is
            # assumed to belong to the fallback branch -- confirm.
            print(movie_time)
        movie_country = movie_time[0]
        movie_time = movie_time[1].strip()

        # Genre: all matched link texts concatenated without a separator.
        movie_type = ''.join(data.re('class="text-link".+>(.+)</a>'))

        # Synopsis. ``excel`` indexes element 0, so never hand it an empty
        # list.
        movie_story = data.re('class="dra">(.+)<')
        if not movie_story:
            movie_story = ['未知']

        # The first name is treated as the director and the next three as
        # the cast. Pad to four entries so every index used below exists
        # (a non-positive multiplier adds nothing).
        actors = data.xpath('//div[@class="info"]/a[@class="name"]/text()').extract()
        actors += ['未知'] * (4 - len(actors))
        director = actors[0].strip()
        # Each cast name is followed by a single space, including the last.
        actor = ''.join(name.strip() + ' ' for name in actors[1:4])

        excel(movie_name, movie_type, movie_country, movie_time,
              director, actor, movie_story, sheet1)
def set_style(name, height):
    """Build an xlwt cell style that uses the given font.

    Args:
        name: Font name assigned to the style's font.
        height: Font height assigned to the style's font.

    Returns:
        xlwt.XFStyle: A new style whose ``font`` carries ``name`` and
        ``height``.
    """
    typeface = xlwt.Font()
    typeface.name, typeface.height = name, height

    cell_style = xlwt.XFStyle()
    cell_style.font = typeface
    return cell_style
def excel(movie_name, movie_type, movie_country, movie_time, dirctor, actor, movie_story, sheet1):
    """Write one film into the next free row of the worksheet.

    The row index comes from the module-level ``rows`` counter, which is
    incremented after the row has been written.

    Args:
        movie_name: Film title, written to column 0.
        movie_type: Genre text, written to column 1.
        movie_country: Country text, written to column 2.
        movie_time: Running-time text, written to column 3.
        dirctor: Director name, written to column 4.
        actor: Cast names, written to column 5.
        movie_story: Sequence whose first element is the synopsis, written
            to column 6; '未知' is written when the sequence is empty.
        sheet1: Worksheet object providing ``write(row, col, value, style)``.
    """
    global rows
    print(f'正在写入【{movie_name}】数据')

    # An empty synopsis list would raise IndexError on [0]; fall back to the
    # same '未知' placeholder the rest of the script uses.
    story = movie_story[0] if movie_story else '未知'

    # Build the body style once instead of once per cell.
    body_style = set_style('宋体', 250)

    sheet1.write(rows, 0, movie_name, set_style('微软雅黑', 245))
    body_values = (movie_type, movie_country, movie_time, dirctor, actor, story)
    for col, value in enumerate(body_values, start=1):
        sheet1.write(rows, col, value, body_style)
    rows += 1
if __name__ == '__main__':
    # Script entry point: build the workbook, write the header, scrape every
    # film and save the result as an .xls file.
    f = xlwt.Workbook()
    # `sheet1` is read as a module-level name by movie_detail().
    # Pass a real boolean: the string 'True' only worked because any
    # non-empty string (even 'False') is truthy.
    sheet1 = f.add_sheet('即将上映电影', cell_overwrite_ok=True)

    # Header titles, one per column; each is merged across rows 0-1.
    row0 = ['电影名称', '类型', '国家', '时间', '导演', '演员', '简介']
    header_style = set_style('微软雅黑', 260)
    for col, title in enumerate(row0):
        sheet1.write_merge(0, 1, col, col, title, header_style)

    film = movie_url()
    try:
        movie_detail(film)
    finally:
        # Save even when scraping stops early, so rows already collected
        # are not lost.
        f.save('即将上映电影.xls')
    print('数据全部写入完成!')
运行结果如下
运行后会在当前工作目录(通常就是该 py 文件所在目录)下生成‘即将上映电影.xls’文件
关于xlwt的用法大家可以百度,官方文档写的有些模糊