1. 建表
-- Raw MovieLens 100K ratings: one tab-separated line per rating.
-- unixtime is kept as STRING because the source file stores it as text;
-- it is converted to an ISO weekday later by the Python TRANSFORM step.
-- IF NOT EXISTS makes the script safe to re-run.
CREATE TABLE IF NOT EXISTS u_data (
  userid INT,
  movieid INT,
  rating INT,
  unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
2. 下载数据集
**从 GroupLens datasets 官网下载数据集 100K**
1. wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
解压数据集
2. unzip ml-100k.zip
3. 加载数据集-Hive
**将本地数据文件 u.data 加载（OVERWRITE）到 Hive 表 u_data 中**
-- Load the raw MovieLens ratings file from the local filesystem into
-- u_data, replacing any previously loaded data (OVERWRITE).
LOAD DATA LOCAL INPATH '/opt/data/ml-100k/u.data'
OVERWRITE INTO TABLE u_data;
4. 创建Python脚本
"""Hive TRANSFORM mapper: replaces the unixtime column with the ISO weekday.

Reads tab-separated (userid, movieid, rating, unixtime) rows on stdin and
writes tab-separated (userid, movieid, rating, weekday) rows to stdout,
where weekday is 1 (Monday) .. 7 (Sunday) in the local timezone.
"""
import sys
import datetime


def transform_line(line):
    """Convert one input row: swap the unix timestamp for its ISO weekday."""
    userid, movieid, rating, unixtime = line.strip().split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


if __name__ == '__main__':
    # Guarded so importing this module never consumes stdin.
    for line in sys.stdin:
        # print() with a single argument is valid on both Python 2 and 3;
        # the original bare `print` statement was Python 2 only.
        print(transform_line(line))
5. Hive查询
创建 u_data_new 表
-- Target table for the transformed rows: the unixtime column is replaced
-- by weekday (1 = Monday .. 7 = Sunday), computed by weekday_mapper.py.
-- STORED AS TEXTFILE and IF NOT EXISTS added for consistency with u_data
-- and to make the script safe to re-run.
CREATE TABLE IF NOT EXISTS u_data_new (
  userid INT,
  movieid INT,
  rating INT,
  weekday INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
添加脚本到hive中
-- Ship the mapper script into Hive's distributed cache so every task node
-- can execute it from the TRANSFORM clause below.
add FILE weekday_mapper.py;
从u_data中获取数据经过Python处理装载到u_data_new
-- Stream every u_data row through weekday_mapper.py (tab-separated rows on
-- the script's stdin/stdout) and overwrite u_data_new with the result.
-- The AS clause names the four tab-separated output fields of the script.
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;
统计 u_data_new 表中每个 weekday 的评分行数
-- Ratings per ISO weekday (1 = Monday .. 7 = Sunday).
-- The aggregate is aliased and ORDER BY added so the output column is
-- named and the row order is deterministic.
SELECT
  weekday,
  COUNT(*) AS rating_count
FROM u_data_new
GROUP BY weekday
ORDER BY weekday;