SQL | Practice Data: Kaggle's tmdb-box-office-prediction (converted into relational tables for SQL practice)

Dataset Description

Data source: https://www.kaggle.com/c/tmdb-box-office-prediction/data (training set)

Data download: the converted csv files are listed in the table below.

| Table | Contents | Data file | Columns | Rows |
| --- | --- | --- | --- | --- |
| movies | movies | movies.csv | 16 | 3000 |
| belongs_to_collection | movie collections (series) | belongs_to_collection.csv | 4 | 422 |
| person | people (cast and crew members) | person.csv | 10 | 134908 |
| cast_rela | movie-cast relation | cast_rela.csv | 3 | 61811 |
| crew_rela | movie-crew relation | crew_rela.csv | 3 | 73097 |
| genres | genres | genres.csv | 2 | 20 |
| genres_rela | movie-genre relation | genres_rela.csv | 3 | 7511 |
| keywords | keywords | keywords.csv | 2 | 7400 |
| keywords_rela | movie-keyword relation | keywords_rela.csv | 3 | 21680 |
| production_companies | production companies | production_companies.csv | 2 | 3712 |
| production_companies_rela | movie-production-company relation | production_companies_rela.csv | 3 | 8095 |
| production_countries | production countries | production_countries.csv | 2 | 74 |
| production_countries_rela | movie-production-country relation | production_countries_rela.csv | 3 | 3979 |
| spoken_languages | spoken languages | spoken_languages.csv | 2 | 79 |
| spoken_languages_rela | movie-spoken-language relation | spoken_languages_rela.csv | 3 | 4357 |

The table creation (DDL) statements are as follows:

CREATE TABLE movies (
    movie_id INT NOT NULL COMMENT "Movie ID",
    belongs_to_collection INT COMMENT "ID of the collection the movie belongs to",
    budget BIGINT COMMENT "Budget",
    homepage VARCHAR(200) COMMENT "Homepage URL",
    imdb_id VARCHAR(20) COMMENT "IMDB ID",
    original_language CHAR(2) COMMENT "Original language",
    original_title VARCHAR(100) COMMENT "Original title",
    overview TEXT COMMENT "Plot overview",
    popularity FLOAT COMMENT "Popularity score",
    poster_path VARCHAR(40) COMMENT "Poster path",
    release_date DATE COMMENT "Release date",
    runtime INT COMMENT "Runtime",
    status VARCHAR(20) COMMENT "Release status (whether the movie has been released)",
    tagline TEXT COMMENT "Tagline",
    title VARCHAR(100) COMMENT "Title",
    revenue BIGINT COMMENT "Box office revenue",
    PRIMARY KEY (movie_id)
) ENGINE=InnoDB;

CREATE TABLE belongs_to_collection (
    collect_id INT NOT NULL COMMENT "Collection ID",
    name VARCHAR(60) NOT NULL COMMENT "Collection name",
    poster_path VARCHAR(40) COMMENT "Poster path",
    backdrop_path VARCHAR(40) COMMENT "Backdrop path",
    PRIMARY KEY (collect_id)
) ENGINE=InnoDB;

CREATE TABLE person (
    person_id INT NOT NULL COMMENT "Person ID",
    cast_id INT COMMENT "Index of the cast member within the movie",
    `character` TEXT COMMENT "Character name",
    credit_id VARCHAR(100) COMMENT "Credit ID",
    gender TINYINT COMMENT "Gender",
    name VARCHAR(100) COMMENT "Name",
    `order` INT COMMENT "Billing order",
    profile_path VARCHAR(100) COMMENT "Profile page path",
    department VARCHAR(40) COMMENT "Department",
    job VARCHAR(40) COMMENT "Job"
) ENGINE=InnoDB;

CREATE TABLE cast_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    person_id INT NOT NULL COMMENT "Cast member (person) ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE crew_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    person_id INT NOT NULL COMMENT "Crew member (person) ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE genres (
    genre_id INT NOT NULL COMMENT "Genre ID",
    name VARCHAR(30) NOT NULL COMMENT "Genre name",
    PRIMARY KEY (genre_id)
) ENGINE=InnoDB;

CREATE TABLE genres_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    genre_id INT NOT NULL COMMENT "Genre ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE keywords (
    keyword_id INT NOT NULL COMMENT "Keyword ID",
    name VARCHAR(60) NOT NULL COMMENT "Keyword name",
    PRIMARY KEY (keyword_id)
) ENGINE=InnoDB;

CREATE TABLE keywords_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    keyword_id INT NOT NULL COMMENT "Keyword ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE production_companies (
    company_id INT NOT NULL COMMENT "Production company ID",
    name VARCHAR(100) NOT NULL COMMENT "Production company name",
    PRIMARY KEY (company_id)
) ENGINE=InnoDB;

CREATE TABLE production_companies_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    company_id INT NOT NULL COMMENT "Production company ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE production_countries (
    country_id CHAR(2) NOT NULL COMMENT "Production country ID (ISO 3166-1 code)",
    name VARCHAR(100) NOT NULL COMMENT "Production country name",
    PRIMARY KEY (country_id)
) ENGINE=InnoDB;

CREATE TABLE production_countries_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    country_id CHAR(2) NOT NULL COMMENT "Production country ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;

CREATE TABLE spoken_languages (
    language_id CHAR(2) NOT NULL COMMENT "Language ID (ISO 639-1 code)",
    name VARCHAR(100) COMMENT "Language name",
    PRIMARY KEY (language_id)
) ENGINE=InnoDB;

CREATE TABLE spoken_languages_rela (
    id INT NOT NULL COMMENT "Primary key",
    movie_id INT NOT NULL COMMENT "Movie ID",
    language_id CHAR(2) NOT NULL COMMENT "Language ID",
    PRIMARY KEY (id)
) ENGINE=InnoDB;
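
The csv files can be loaded into these tables with MySQL's LOAD DATA statement. The sketch below is only illustrative: the file path is a placeholder, local_infile must be enabled on both client and server, the line terminator may need to be '\r\n' depending on how the csv files were written, and the column list must follow the header order of the csv being loaded (shown here for genres_rela.csv, whose header is id, movie_id, genre_id).

-- Illustrative import of one relation table; adjust the path, line terminator and
-- column list to match the csv files produced by the conversion script below.
LOAD DATA LOCAL INFILE '/path/to/genres_rela.csv'
INTO TABLE genres_rela
FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES
(id, movie_id, genre_id);

Once the data is loaded, a typical practice query joins a relation table back to movies, for example movie count and total revenue per genre:

-- Movie count and total box office revenue per genre
SELECT g.name AS genre,
       COUNT(*) AS movie_cnt,
       SUM(m.revenue) AS total_revenue
FROM movies m
JOIN genres_rela gr ON gr.movie_id = m.movie_id
JOIN genres g ON g.genre_id = gr.genre_id
GROUP BY g.name
ORDER BY total_revenue DESC;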

Original Data and the Conversion Process

Original data: the training dataset of the Kaggle TMDB Box Office Prediction competition

Original data location: https://www.kaggle.com/c/tmdb-box-office-prediction/data

Training data: train.csv (3000 records in total)

The 8 fields belongs_to_collection, genres, production_companies, production_countries, spoken_languages, keywords, cast and crew in the training set are split out into child tables. Note that:

  • Only belongs_to_collection maps each movie to at most one collection (0 or 1); every other field can map a movie to multiple values
  • Only cast and crew share the same ID space (both refer to people); the IDs of the remaining fields are independent of one another
| # | Field | Type | Meaning |
| --- | --- | --- | --- |
| 1 | id | Integer | ID |
| 2 | belongs_to_collection | String (JSON) | Collection the movie belongs to |
| 3 | budget | Integer | Budget |
| 4 | genres | String (JSON) | Genres |
| 5 | homepage | String | Homepage URL |
| 6 | imdb_id | String | IMDB ID |
| 7 | original_language | String | Original language |
| 8 | original_title | String | Original title |
| 9 | overview | String | Plot overview |
| 10 | popularity | Float | Popularity score |
| 11 | poster_path | String | Poster path |
| 12 | production_companies | String (JSON) | Production companies |
| 13 | production_countries | String (JSON) | Production countries |
| 14 | release_date | Date | Release date |
| 15 | runtime | Integer | Runtime |
| 16 | spoken_languages | String (JSON) | Spoken languages |
| 17 | status | String | Release status (whether the movie has been released) |
| 18 | tagline | String | Tagline |
| 19 | title | String | Title |
| 20 | Keywords | String (JSON) | Keywords |
| 21 | cast | String (JSON) | Cast |
| 22 | crew | String (JSON) | Crew |
| 23 | revenue | Integer | Box office revenue (train.csv only) |
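
The fields marked String (JSON) above are stored in train.csv as Python-literal strings (single-quoted dicts inside a list) rather than strict JSON, which is why the conversion script below parses them with a dedicated helper. A minimal illustration of the format, using a value shaped like the real data rather than copied from it:

from ast import literal_eval

# A genres cell in train.csv looks roughly like this Python-literal string
raw = "[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}]"

genres = literal_eval(raw) if raw else None  # empty cells are left as None
print(genres[0]["name"])  # -> Comedy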

The conversion is implemented with the following logic:

from collections import defaultdict
from csv import DictReader
from datetime import date
import os

import pandas as pd

from scaffold.formatter import format_float, format_int, format_json_list


def main(path: str) -> None:
    """数据集改造主逻辑

    Parameters
    ----------
    path : str
        数据集所在路径,改造后的数据集也存储于这个路径
    """

    # ---------- Parse and normalize the values in train.csv ----------
    all_data = []
    with open(os.path.join(path, "train.csv"), "r", encoding="UTF-8") as file:
        for row in DictReader(file):
            row["movie_id"] = row.pop("\ufeffid")
            row["keywords"] = row.pop("Keywords")

            # Parse the JSON-like strings (Python-literal lists of dicts)
            for field in ["belongs_to_collection", "production_companies", "production_countries",
                          "genres", "spoken_languages", "keywords", "cast", "crew"]:
                row[field] = format_json_list(row.get(field, ""), debug=True)

            # Parse string-encoded floats
            for field in ["popularity"]:
                row[field] = format_float(row.get(field, ""), debug=True)

            # Parse string-encoded integers
            for field in ["budget", "runtime", "revenue"]:
                row[field] = format_int(row.get(field, ""), debug=True)

            # Parse month/day/year dates; two-digit years below 30 are mapped to 20xx, the rest to 19xx
            month, day, year = row["release_date"].split("/")
            year = int(year)
            if year < 30:
                year += 2000
            else:
                year += 1900
            row["release_date"] = date(year=int(year), month=int(month), day=int(day))

            all_data.append(row)

    # ---------- Rename the ID fields that become keys of the child tables ----------
    for row in all_data:
        for old_key, new_key, field in [("id", "collect_id", "belongs_to_collection"),
                                        ("id", "company_id", "production_companies"),
                                        ("iso_3166_1", "country_id", "production_countries"),
                                        ("id", "genre_id", "genres"),
                                        ("iso_639_1", "language_id", "spoken_languages"),
                                        ("id", "keyword_id", "keywords"),
                                        ("id", "person_id", "cast"),
                                        ("id", "person_id", "crew")]:
            if row[field] is not None:
                for field_item in row[field]:
                    field_item[new_key] = field_item.pop(old_key)

    # ---------- Collect the rows of the child tables ----------
    outer_table_list = defaultdict(list)
    for row in all_data:
        for field in ["belongs_to_collection", "production_companies", "production_countries",
                      "genres", "spoken_languages", "keywords", "cast", "crew"]:
            if row[field] is not None:
                for field_item in row[field]:
                    outer_table_list[field].append(field_item)

    # ---------- Write every child table except cast and crew to file ----------
    for field in ["belongs_to_collection", "production_companies", "production_countries",
                  "genres", "spoken_languages", "keywords"]:
        # De-duplicate and write the rows to a csv file
        df = pd.DataFrame(outer_table_list[field])
        df.drop_duplicates(inplace=True)
        df.to_csv(os.path.join(path, f"{field}.csv"), index=False)

    # ---------- Merge cast and crew into the person table and write it to file ----------
    df = pd.DataFrame(outer_table_list["cast"] + outer_table_list["crew"])
    df.drop_duplicates(inplace=True)
    df.to_csv(os.path.join(path, f"person.csv"), index=False)

    # ---------- Build the relation (_rela) tables and write them to file ----------
    for _, new_key, field in [("id", "company_id", "production_companies"),
                              ("iso_3166_1", "country_id", "production_countries"),
                              ("id", "genre_id", "genres"),
                              ("iso_639_1", "language_id", "spoken_languages"),
                              ("id", "keyword_id", "keywords"),
                              ("id", "person_id", "cast"),
                              ("id", "person_id", "crew")]:
        rela_table_list = []
        for row in all_data:
            if row[field] is not None:
                for field_item in row[field]:
                    rela_table_list.append(
                        {"movie_id": row["movie_id"], new_key: field_item[new_key]})
        df = pd.DataFrame(rela_table_list)
        df.to_csv(os.path.join(path, f"{field}_rela.csv"), index_label="id")

    # ---------- Build the main movies table and write it out ----------
    final_data = []
    for row in all_data:
        # Drop the fields that have been moved into relation tables
        for field in ["production_companies", "production_countries",
                      "genres", "spoken_languages", "keywords", "cast", "crew"]:
            if field in row:
                del row[field]

        # Collapse belongs_to_collection to a single collection ID (each movie has 0 or 1)
        row["belongs_to_collection"] = (int(row["belongs_to_collection"][0]["collect_id"])
                                        if row["belongs_to_collection"] else None)
        final_data.append(row)

    df = pd.DataFrame(final_data)
    df.to_csv(os.path.join(path, f"movies.csv"), index=False)
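
The scaffold.formatter helpers imported at the top of the script are not included in this post. The stand-ins below are an assumption based only on how the helpers are called (empty values come back as None, and debug=True is taken to mean "report values that fail to parse"); they are not the actual implementation, but with stand-ins like these in place the script can be run as, say, main("./data") with train.csv in a hypothetical ./data directory.

from ast import literal_eval


def _report(value, error, debug):
    # Hypothetical debug hook: print values that fail to parse
    if debug:
        print(f"failed to parse {value!r}: {error}")


def format_json_list(value, debug=False):
    """Parse a Python-literal list of dicts; return None for empty cells."""
    if not value:
        return None
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError) as error:
        _report(value, error, debug)
        return None


def format_float(value, debug=False):
    """Parse a string-encoded float; return None for empty cells."""
    if not value:
        return None
    try:
        return float(value)
    except ValueError as error:
        _report(value, error, debug)
        return None


def format_int(value, debug=False):
    """Parse a string-encoded integer; return None for empty cells."""
    if not value:
        return None
    try:
        return int(value)
    except ValueError as error:
        _report(value, error, debug)
        return None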