数据分析之准备yelp数据

1. 创建目录存放yelp数据

[vagrant@master ~]$ hadoop fs -mkdir /yelp
[vagrant@master ~]$ hadoop fs -mkdir /yelp/business
[vagrant@master ~]$ hadoop fs -mkdir /yelp/checkin
[vagrant@master ~]$ hadoop fs -mkdir /yelp/review
[vagrant@master ~]$ hadoop fs -mkdir /yelp/tip
[vagrant@master ~]$ hadoop fs -mkdir /yelp/user

2. 上传已解压的json文件

hadoop fs -put yelp_academic_dataset_business.json /yelp/business
hadoop fs -put yelp_academic_dataset_checkin.json /yelp/checkin
hadoop fs -put yelp_academic_dataset_review.json /yelp/review
hadoop fs -put yelp_academic_dataset_tip.json /yelp/tip
hadoop fs -put yelp_academic_dataset_user.json /yelp/user

3. 查看上传成功的文件

hadoop fs -ls -R -h /yelp/

4. 创建hive表并获取数据

        4.1 json_business表

CREATE
    EXTERNAL TABLE json_business
(
    json_body string
) stored AS textfile LOCATION '/yelp/business';
 
CREATE TABLE business
(
    business_id      string,
    name             string,
    address          string,
    city             string,
    state            string,
    postal_code  string,
    latitude     float,
    longitude    float,
    stars        float,
    review_count int,
    is_open      tinyint,
    attributes   string,
    categories   string,
    hours        string
);
 
 
FROM json_business
INSERT
OVERWRITE
TABLE
business
SELECT get_json_object(json_body, '$.business_id'),
       get_json_object(json_body, '$.name'),
       get_json_object(json_body, '$.address'),
       get_json_object(json_body, '$.city'),
       get_json_object(json_body, '$.state'),
       get_json_object(json_body, '$.postal_code'),
       get_json_object(json_body, '$.latitude'),
       get_json_object(json_body, '$.longitude'),
       get_json_object(json_body, '$.stars'),
       get_json_object(json_body, '$.review_count'),
       get_json_object(json_body, '$.is_open'),
       cast(get_json_object(json_body, '$.attributes') AS string),
       get_json_object(json_body, '$.categories'),
       get_json_object(json_body, '$.hours');

        4.2 json_review表

create external table json_review
(
    json_body string
) stored as textfile location '/yelp/review';
 
CREATE TABLE review
(
    review_id       string,
    rev_user_id     string,
    rev_business_id string,
    rev_stars       int,
    rev_useful      int,
    rev_funny       int,
    rev_cool        int,
    rev_text        string,
    rev_timestamp   string,
    rev_date        date
);
 
FROM json_review
INSERT
OVERWRITE
TABLE
review
SELECT get_json_object(json_body, '$.review_id'),
       get_json_object(json_body, '$.user_id'),
       get_json_object(json_body, '$.business_id'),
       get_json_object(json_body, '$.stars'),
       get_json_object(json_body, '$.useful'),
       get_json_object(json_body, '$.funny'),
       get_json_object(json_body, '$.cool'),
       regexp_replace(regexp_replace(get_json_object(json_body, '$.text'), '\n', ' '), '\r', ' '),
       get_json_object(json_body, '$.date'),
       cast(substr(get_json_object(json_body, '$.date'), 0, 10) as date);

        4.3 json_user表

create external table json_user
(
    json_body string
) stored as textfile location '/yelp/user';
 
CREATE TABLE users
(
    user_id                 string,
    user_name               string,
    user_review_count       int,
    user_yelping_since      string,
    user_friends            string,
    user_useful             int,
    user_funny              int,
    user_cool               int,
    user_fans               int,
    user_elite              string,
    user_average_stars      float,
    user_compliment_hot     int,
    user_compliment_more    int,
    user_compliment_profile int,
    user_compliment_cute    int,
    user_compliment_list    int,
    user_compliment_note    int,
    user_compliment_plain   int,
    user_compliment_cool    int,
    user_compliment_funny   int,
    user_compliment_writer  int,
    user_compliment_photos  int
);
 
FROM json_user
INSERT
OVERWRITE
TABLE
users
SELECT get_json_object(json_body, '$.user_id'),
       get_json_object(json_body, '$.name'),
       get_json_object(json_body, '$.review_count'),
       get_json_object(json_body, '$.yelping_since'),
       get_json_object(json_body, '$.friends'),
       get_json_object(json_body, '$.useful'),
       get_json_object(json_body, '$.funny'),
       get_json_object(json_body, '$.cool'),
       get_json_object(json_body, '$.fans'),
       get_json_object(json_body, '$.elite'),
       get_json_object(json_body, '$.average_stars'),
       get_json_object(json_body, '$.compliment_hot'),
       get_json_object(json_body, '$.compliment_more'),
       get_json_object(json_body, '$.compliment_profile'),
       get_json_object(json_body, '$.compliment_cute'),
       get_json_object(json_body, '$.compliment_list'),
       get_json_object(json_body, '$.compliment_note'),
       get_json_object(json_body, '$.compliment_plain'),
       get_json_object(json_body, '$.compliment_cool'),
       get_json_object(json_body, '$.compliment_funny'),
       get_json_object(json_body, '$.compliment_writer'),
       get_json_object(json_body, '$.compliment_photos');

        4.4 json_checkin表

create external table json_checkin
(
    json_body string
) stored as textfile location '/yelp/checkin';
 
CREATE TABLE checkin
(
    business_id   string,
    checkin_dates string
);
 
FROM json_checkin
INSERT
OVERWRITE
TABLE
checkin
SELECT get_json_object(json_body, '$.business_id'), get_json_object(json_body, '$.date');
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值