1. 创建目录存放yelp数据
[vagrant@master ~]$ hadoop fs -mkdir /yelp
[vagrant@master ~]$ hadoop fs -mkdir /yelp/business
[vagrant@master ~]$ hadoop fs -mkdir /yelp/checkin
[vagrant@master ~]$ hadoop fs -mkdir /yelp/review
[vagrant@master ~]$ hadoop fs -mkdir /yelp/tip
[vagrant@master ~]$ hadoop fs -mkdir /yelp/user
2. 上传已解压的json文件
hadoop fs -put yelp_academic_dataset_business.json /yelp/business
hadoop fs -put yelp_academic_dataset_checkin.json /yelp/checkin
hadoop fs -put yelp_academic_dataset_review.json /yelp/review
hadoop fs -put yelp_academic_dataset_tip.json /yelp/tip
hadoop fs -put yelp_academic_dataset_user.json /yelp/user
3. 查看上传成功的文件
hadoop fs -ls -R -h /yelp/
4. 创建hive表并获取数据
4.1 json_business表
CREATE
EXTERNAL TABLE json_business
(
json_body string
) stored AS textfile LOCATION '/yelp/business';
CREATE TABLE business
(
business_id string,
name string,
address string,
city string,
state string,
postal_code string,
latitude float,
longitude float,
stars float,
review_count int,
is_open tinyint,
attributes string,
categories string,
hours string
);
FROM json_business
INSERT
OVERWRITE
TABLE
business
SELECT get_json_object(json_body, '$.business_id'),
get_json_object(json_body, '$.name'),
get_json_object(json_body, '$.address'),
get_json_object(json_body, '$.city'),
get_json_object(json_body, '$.state'),
get_json_object(json_body, '$.postal_code'),
get_json_object(json_body, '$.latitude'),
get_json_object(json_body, '$.longitude'),
get_json_object(json_body, '$.stars'),
get_json_object(json_body, '$.review_count'),
get_json_object(json_body, '$.is_open'),
cast(get_json_object(json_body, '$.attributes') AS string),
get_json_object(json_body, '$.categories'),
get_json_object(json_body, '$.hours');
4.2 json_review表
create external table json_review
(
json_body string
) stored as textfile location '/yelp/review';
CREATE TABLE review
(
review_id string,
rev_user_id string,
rev_business_id string,
rev_stars int,
rev_useful int,
rev_funny int,
rev_cool int,
rev_text string,
rev_timestamp string,
rev_date date
);
FROM json_review
INSERT
OVERWRITE
TABLE
review
SELECT get_json_object(json_body, '$.review_id'),
get_json_object(json_body, '$.user_id'),
get_json_object(json_body, '$.business_id'),
get_json_object(json_body, '$.stars'),
get_json_object(json_body, '$.useful'),
get_json_object(json_body, '$.funny'),
get_json_object(json_body, '$.cool'),
regexp_replace(regexp_replace(get_json_object(json_body, '$.text'), '\n', ' '), '\r', ' '),
get_json_object(json_body, '$.date'),
cast(substr(get_json_object(json_body, '$.date'), 0, 10) as date);
4.3 json_user表
create external table json_user
(
json_body string
) stored as textfile location '/yelp/user';
CREATE TABLE users
(
user_id string,
user_name string,
user_review_count int,
user_yelping_since string,
user_friends string,
user_useful int,
user_funny int,
user_cool int,
user_fans int,
user_elite string,
user_average_stars float,
user_compliment_hot int,
user_compliment_more int,
user_compliment_profile int,
user_compliment_cute int,
user_compliment_list int,
user_compliment_note int,
user_compliment_plain int,
user_compliment_cool int,
user_compliment_funny int,
user_compliment_writer int,
user_compliment_photos int
);
FROM json_user
INSERT
OVERWRITE
TABLE
users
SELECT get_json_object(json_body, '$.user_id'),
get_json_object(json_body, '$.name'),
get_json_object(json_body, '$.review_count'),
get_json_object(json_body, '$.yelping_since'),
get_json_object(json_body, '$.friends'),
get_json_object(json_body, '$.useful'),
get_json_object(json_body, '$.funny'),
get_json_object(json_body, '$.cool'),
get_json_object(json_body, '$.fans'),
get_json_object(json_body, '$.elite'),
get_json_object(json_body, '$.average_stars'),
get_json_object(json_body, '$.compliment_hot'),
get_json_object(json_body, '$.compliment_more'),
get_json_object(json_body, '$.compliment_profile'),
get_json_object(json_body, '$.compliment_cute'),
get_json_object(json_body, '$.compliment_list'),
get_json_object(json_body, '$.compliment_note'),
get_json_object(json_body, '$.compliment_plain'),
get_json_object(json_body, '$.compliment_cool'),
get_json_object(json_body, '$.compliment_funny'),
get_json_object(json_body, '$.compliment_writer'),
get_json_object(json_body, '$.compliment_photos');
4.4 json_checkin表
create external table json_checkin
(
json_body string
) stored as textfile location '/yelp/checkin';
CREATE TABLE checkin
(
business_id string,
checkin_dates string
);
FROM json_checkin
INSERT
OVERWRITE
TABLE
checkin
SELECT get_json_object(json_body, '$.business_id'), get_json_object(json_body, '$.date');