补如何抓取豆瓣网正在热映电影信息以及海报

5 篇文章 0 订阅
2 篇文章 0 订阅

废话不多说直接上代码
#!/usr/bin/env python

'''
Fetch the full information for every film on Douban's "now playing" page.
'''
import re
import requests
from requests import RequestException
from demo01.util import buid_proxy
from urllib.parse import urlencode
from lxml import etree
import json
import time
import codecs
import os
# Proxy mapping passed to every requests call; built by the project helper.
proxies = buid_proxy()

# Request headers that make the crawler look like a regular browser. Sites
# with stricter anti-crawler checks may additionally need Cookie and
# Connection headers here.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
#第一步,获取豆瓣网的电影信息
def get_one_page(url):
    """Download one page and return its text.

    :param url: address of the page to fetch.
    :return: the response body as text when the server answers 200,
        otherwise ``None`` (also returned when the request itself fails).
    """
    try:
        # A timeout keeps the crawler from hanging forever on a dead proxy.
        res = requests.get(url, proxies=proxies, headers=headers, timeout=10)
    except RequestException as exc:
        print("request failed for %s: %s" % (url, exc))
        return None
    if res.status_code == 200:
        return res.text
    return None

def parse_one_page(html):
    """Extract the basic record of every film on the "now playing" page.

    Each ``<li>`` under the ``nowplaying`` block is read with XPaths that are
    relative to that item, so every listed film (including the last one)
    yields exactly one record.

    :param html: page text as returned by ``get_one_page`` (may be ``None``).
    :return: list of dicts with the keys ``movie_id``, ``movie_name``,
        ``movie_detail_url`` and ``movie_img_url``; empty when there is
        nothing to parse.
    """
    if not html:
        return []
    content = etree.HTML(str(html))
    if content is None:
        return []

    movie_info = []
    for item in content.xpath('//div[@id="nowplaying"]/div[2]/ul/li'):
        detail_urls = item.xpath('./ul/li[1]/a/@href')
        img_urls = item.xpath('./ul/li[1]/a/img/@src')
        # Dict records are used because the data is later stored as JSON
        # (e.g. for Elasticsearch).
        movie_info.append({
            "movie_id": item.get("id"),
            "movie_name": item.get("data-title"),
            "movie_detail_url": detail_urls[0] if detail_urls else "",
            "movie_img_url": img_urls[0] if img_urls else "",
        })
    return movie_info

def _parse_movie_detail(movie_detail):
    """Pull the detail fields out of one movie page's HTML text.

    NOTE(review): the HTML tags inside the original regex literals were
    stripped when this script was published, so the tag parts of the patterns
    below are reconstructed from the surviving fragments and from the usual
    Douban page layout. Verify each pattern against a live page.

    :param movie_detail: HTML text of a movie detail page ("" is allowed).
    :return: dict with one string value per field; fields that cannot be
        found stay "".
    """
    flags = re.S | re.M
    # Every key is present from the start so new fields are easy to add and
    # consumers always see the same schema.
    movie_d = {
        "movie_title": "",
        "directedBy": "",
        "screenwriter": "",
        "actors": "",
        "genre": "",
        "country": "",
        "release_time": "",
        "runtime": "",
        "score": "",
        "tags": "",
        "intro": "",
        "award": "",
        "hot_comment": "",
        "relat_like": "",
        "type_rank": "",
    }

    def first_block(pattern):
        """Return the first text matching *pattern* in the page, or ""."""
        found = re.search(pattern, movie_detail, flags)
        return found.group() if found else ""

    # --- title ---------------------------------------------------------
    title_block = first_block(r'<h1>.*?</h1>')
    movie_title = re.findall(r'<span.*?itemreviewed">(.*?)</span>', title_block, flags)
    movie_d["movie_title"] = " ".join(movie_title)

    # --- basic info paragraph, one labelled entry per <br/> -------------
    cont_info = first_block(r'<div id="info">.*?</div>')
    info_dict = {}
    for info in cont_info.split('<br/>'):
        if "导演" in info:
            info_dict["directorBy"] = info
        elif "编剧" in info:
            info_dict["screenwriter"] = info
        elif "主演" in info:
            info_dict["actors"] = info
        elif "类型" in info:
            info_dict["genre"] = info
        elif "制片国家" in info:
            info_dict["country"] = info
        # The original only tested "上映时间"; the page label is presumably
        # "上映日期", so both are accepted - TODO confirm on a live page.
        elif "上映时间" in info or "上映日期" in info:
            info_dict["release_time"] = info
        elif "片长" in info:
            info_dict["runtime"] = info

    directors = re.findall(r'<.*?directedBy">(.*?)</a>', info_dict.get("directorBy", ""), flags)
    screenwriter = re.findall(r'<.*?\w+/?">(.*?)</a>', info_dict.get("screenwriter", ""), flags)
    actor = re.findall(r'.*?starring">(.*?)</a>', info_dict.get("actors", ""), flags)
    genre = re.findall(r'.*?genre">(.*?)</span>', info_dict.get("genre", ""), flags)
    country = re.findall(r'</span>(.*?)$', info_dict.get("country", ""), flags)
    country = country[0] if country else ""
    release_date = re.findall(
        r'.*?content="\d{4}-\d{2}-\d{2}\([\u4e00-\u9fa5]+\)">(.*?)</span>',
        info_dict.get("release_time", ""), flags)
    runtime = re.findall(r'<span property="v:runtime".*?>(.*?)</span>',
                         info_dict.get("runtime", ""), flags)

    movie_d["directedBy"] = "".join(directors)
    movie_d["actors"] = " ".join(actor)
    # The original stored the cast under "actor" although the template key is
    # "actors"; the old key is kept so existing consumers keep working.
    movie_d["actor"] = movie_d["actors"]
    movie_d["screenwriter"] = "".join(screenwriter)
    movie_d["genre"] = " ".join(genre)
    movie_d["country"] = country.strip().replace("/", "").strip(" ")
    movie_d["release_time"] = "".join(release_date)
    movie_d["runtime"] = " ".join(runtime)

    # --- score ----------------------------------------------------------
    rating_res = re.search(r'average">([\d.]*?)</strong>', movie_detail, flags)
    # An empty rating element means the film has no score yet.
    movie_d["score"] = (rating_res.group(1) if rating_res else "") or "暂无评分"

    # --- "better than N% of <genre>" ranking ----------------------------
    better_block = first_block(r'<div class="rating_betterthan">.*?</div>')
    better_list = re.findall(r'action=">(.*?)</a>', better_block, flags)
    movie_d["type_rank"] = "|".join("好于%s" % b for b in better_list)

    # --- awards: [title, content] per award list -------------------------
    award_list = []
    for block in re.findall(r'<ul class="award">.*?</ul>', movie_detail, flags):
        award_res = re.search(r'<a.*?>(.*?)</a>.*?<li>(.*?)</li>', block, flags)
        aw_title = award_res.group(1) if award_res else ""
        aw_cont = award_res.group(2) if award_res else ""
        award_list.append([aw_title, aw_cont])
    movie_d["award"] = "|".join("|".join(a) for a in award_list)

    # --- tags -------------------------------------------------------------
    tags_block = first_block(r'<div class="tags-body">.*?</div>')
    movie_d["tags"] = "|".join(re.findall(r'<a.*?>(.*?)</a>', tags_block, flags))

    # --- "people who liked this also liked" -----------------------------
    recommendations_block = first_block(r'<div class="recommendations-bd">.*?</div>')
    movie_d["relat_like"] = "|".join(
        re.findall(r'alt="(.*?)"', recommendations_block, flags))

    # --- plot summary ---------------------------------------------------
    intro_block = first_block(r'<div class="indent" .*?report">.*?</div>')
    intro = re.findall(r'<span[^>]*property="v:summary"[^>]*>(.*?)</span>',
                       intro_block, flags)
    movie_d["intro"] = "".join(intro)

    return movie_d


def get_detail_info(one_info):
    """Fetch and parse the detail page of every film in *one_info*.

    :param one_info: list of records produced by ``parse_one_page``; each
        record supplies ``movie_name`` and ``movie_detail_url``.
    :return: list with one detail dict per input record, in the same order.
        A page that cannot be downloaded yields a record whose fields are
        empty apart from the title, which falls back to ``movie_name``.
    """
    info_movie_detail = []
    for items in one_info:
        movie_name = items.get("movie_name") or ""
        movie_detail_url = items.get("movie_detail_url")

        movie_detail = ""
        if movie_detail_url:
            try:
                res = requests.get(movie_detail_url, proxies=proxies,
                                   headers=headers, timeout=10)
                if res.status_code == 200:
                    movie_detail = res.content.decode("utf-8", errors="replace")
            except RequestException as exc:
                # One unreachable page must not abort the whole crawl.
                print("request failed for %s: %s" % (movie_detail_url, exc))

        movie_d = _parse_movie_detail(movie_detail)
        if not movie_d["movie_title"]:
            movie_d["movie_title"] = movie_name
        info_movie_detail.append(movie_d)
    return info_movie_detail

    # Output directory for the JSON dump (and the posters).
    path = 'E:/test001/photo/douban/'

    def save_to_file(detail_info):
        '''
        Write the detail records of all films to ``<path><YYYY-MM-DD>.json``.

        The records are serialised as real JSON (not a Python repr) so the
        file can be loaded by JSON consumers, and the file is written once
        rather than being reopened for every film.

        :param detail_info: list of detail dicts from ``get_detail_info``.
        :return: None
        '''
        file_path = path + time.strftime("%Y-%m-%d") + '.json'
        print("正在下载热映中的%s部电影" % len(detail_info))
        for i, items in enumerate(detail_info):
            print(i, items.get("movie_title"))

        # Create the output directory on first use.
        os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f_f:
            json.dump(detail_info, f_f, ensure_ascii=False)
            f_f.write("\n")

    def save_img_dir(one_info):
        '''
        Download the poster of every film and save it as ``<movie_name>.jpg``.

        Only the poster URL and the film name of each record are used: the
        URL to download the image, the name so that each poster file can be
        matched to its film.

        :param one_info: list of records from ``parse_one_page``.
        :return: None
        '''
        path = 'E:/test001/photo/douban/'
        for items in one_info:
            movie_img_url = items.get("movie_img_url")
            movie_name = items.get("movie_name")
            if not movie_img_url or not movie_name:
                continue
            try:
                res = requests.get(movie_img_url, proxies=proxies,
                                   headers=headers, timeout=10)
            except RequestException as exc:
                print("request failed for %s: %s" % (movie_img_url, exc))
                continue
            if res.status_code != 200:
                continue
            # Film names may contain characters that are illegal in file names.
            safe_name = re.sub(r'[\\/:*?"<>|]', "_", movie_name)
            img_path = path + safe_name + '.jpg'
            os.makedirs(path, exist_ok=True)
            # Image data is binary: write the raw bytes, never decode them.
            with open(img_path, 'wb') as f:
                f.write(res.content)

    def main():
        """Run the crawl: list page -> detail pages -> JSON file -> posters."""
        url = 'https://movie.douban.com/cinema/nowplaying/nanjing/'
        html = get_one_page(url)
        if html is None:
            # Nothing to parse when the list page could not be downloaded.
            print("could not download %s" % url)
            return
        one_info = parse_one_page(html)
        detail_info = get_detail_info(one_info)
        save_to_file(detail_info)
        save_img_dir(one_info)

    # Run the crawler only when the file is executed as a script.
    if __name__ == '__main__':
        main()

    在这里插入图片描述
    在这里插入图片描述在这里插入图片描述
    数据还没有整理,暂时先这样了。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值