1.
原始表
CREATE TABLE ml_100k (userid INT, movieid INT, rating INT, unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
数据清洗后的表
CREATE TABLE ml_100k2 (userid INT, movieid INT, rating INT, weekday int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
3.数据导入
LOAD DATA LOCAL INPATH '/home/centos/ml-100k/u.data' into table ml_100k;
4.脚本编写和脚本加载
clean_ml_100k.py
import sys
import datetime
for line in sys.stdin: #接收输入
line = line.strip()
userid, movieid, rating, unixtime = line.split('\t')
weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday() #转化unixtime时间戳为日期时间,获取对应的星期值
print('\t'.join([userid, movieid, rating, str(weekday)])) # 输出清洗后的数据
然后 hdfs dfs -put
add file /home/centos/clean_ml_100k.py;
5.数据清洗+转储
INSERT OVERWRITE TABLE ml_100k2
SELECT
TRANSFORM (userid, movieid, rating, unixtime) --输入值(基表)
USING 'python clean_ml_100k.py' --使用脚本清洗
AS (userid, movieid, rating, weekday) --输出值(子表)
FROM ml_100k;
然后我就失败了
标签:脚本,rating,100k,python,ml,userid,INT,movieid,清洗