Scrape Maoyan's TOP 100 movie data and write it to a JSON-format file

# coding=utf-8

import json
import os
from urllib.parse import urljoin

import requests
from lxml import etree
'''
Scrape the TOP 100 movie data from the Maoyan site and write it out as a JSON-format file
'''
# Build the list of URLs to crawl from the start URL
def get_url(url):
    url_list = [url]
    for num in range(10, 91, 10):
        # Every board page after the first is addressed by an offset
        # query parameter: 10, 20, ..., 90
        url_list.append('{}?offset={}'.format(url, num))

    return url_list
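
# get_url('http://maoyan.com/board/4') yields the ten board pages:
# ['http://maoyan.com/board/4',
#  'http://maoyan.com/board/4?offset=10',
#  ...
#  'http://maoyan.com/board/4?offset=90']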

# Fetch the page HTML for a given URL
def get_html(url):
    # Send a browser User-Agent so the request looks like ordinary traffic
    header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    html = requests.get(url, headers=header)
    # html.encoding = 'utf-8'
    html = html.text
    return html
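
# Optional hardening sketch (an assumption, not something this page flow
# requires): fail fast on HTTP errors instead of parsing an error page.
#     resp = requests.get(url, headers=header)
#     resp.raise_for_status()
#     return resp.text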

# Locate elements with XPath and pull out the raw data
def get_element(html):

    html = etree.HTML(html)

    # Every query returns a parallel list with one entry per movie <dd> node
    img_href = html.xpath("//dl[@class='board-wrapper']/dd/a/@href")
    title = html.xpath("//dl[@class='board-wrapper']/dd/a/@title")
    actress = html.xpath("//div[@class='movie-item-info']/p[2]/text()")
    releasetime = html.xpath("//div[@class='movie-item-info']/p[3]/text()")

    all_data = [img_href, title, actress, releasetime]

    return all_data
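
# Illustrative shape of all_data for one page (example values, assuming the
# Maoyan markup of the time, where the full-width colon is part of the text):
# [['/films/1203', ...],                      # relative detail-page links
#  ['霸王别姬', ...],                          # titles
#  ['\n  主演：张国荣,张丰毅,巩俐\n', ...],     # actor lines, whitespace-padded
#  ['上映时间：1993-01-01', ...]]              # release times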

# Clean the scraped data
def clear_data(data):
    url = 'http://maoyan.com/'
    '''
    Prefix the relative href values in img_href (e.g. /films/1203) with the site URL;
    strip the newlines and extra blank space from the actress entries
    '''
    img_url = []
    actor = []
    img_href = data[0]
    actress = data[2]

    for tail in img_href:
        img_url.append(urljoin(url, tail))
    data[0] = img_url

    for act in actress:
        # split() with no argument drops all surrounding whitespace,
        # leaving a one-element list such as ['主演：...']
        actor.append(act.split())
    data[2] = actor
    return data
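
# After cleaning (illustrative values):
# data[0] = ['http://maoyan.com/films/1203', ...]   # absolute links
# data[2] = [['主演：张国荣,张丰毅,巩俐'], ...]      # stripped, wrapped in lists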

# Organize the scraped data into a dictionary
def json_dict(data):
    '''
    Organize the data as:
    {title: {主演: actors, 上映时间: release time, img_url: image link}}
    '''
    json_dic = {}
    img = data[0]
    title = data[1]
    actress = data[2]
    releasetime = data[3]

    # The four lists are parallel, so zip() pairs every title with its own
    # actor line, release time and image link
    for item, each_actor, each_time, each_img in zip(title, actress, releasetime, img):
        # each_actor is a one-element list such as ['主演：...']; split the
        # field name from its value on the full-width colon (assumed to be
        # the separator Maoyan uses)
        key, _, value = each_actor[0].partition('：')
        rt, _, T = each_time.strip().partition('：')
        json_dic[item] = {key: value, rt: T, 'img_url': each_img}

    return json_dic
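
# Illustrative shape of the result (example values):
# {'霸王别姬': {'主演': '张国荣,张丰毅,巩俐',
#              '上映时间': '1993-01-01',
#              'img_url': 'http://maoyan.com/films/1203'}}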

# Dump the dictionary to a JSON file
def dump_json(dic, filename):
    abspath = os.path.join(os.path.abspath('.'), filename)

    # Without ensure_ascii=False the Chinese text is escaped to ASCII in the file
    # indent=4 pretty-prints the saved dictionary

    with open(abspath, 'w', encoding='utf-8') as f:
        json.dump(dic, f, indent=4, ensure_ascii=False)

# Main function
def main(url):
    url_list = get_url(url)
    json_dic = {}
    for url in url_list:
        html = get_html(url)
        data = get_element(html)
        json_dic.update(json_dict(clear_data(data)))

    dump_json(json_dic, 'maoyan.json')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    main(url)
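
Running the script crawls all ten board pages and writes a maoyan.json file, one entry per movie, into the directory the script is launched from.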

 

Reposted from: https://www.cnblogs.com/scorpionSpace/p/9274292.html
