Data Preprocessing Course Design: Operation Notes

Data Preprocessing

Change the file permissions
chmod 755 ./small_user.csv
Delete the first line (the header row)
sed -i '1d' small_user.csv
#-i edits the source file in place, so use it with caution.
#d  deletes the addressed line
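Since -i rewrites the source file directly, a safer variant (just a sketch, not part of the original steps; the .bak suffix is arbitrary) keeps a backup and then confirms the header is really gone:
sed -i.bak '1d' small_user.csv              #same edit, but keeps the original as small_user.csv.bak
head -1 small_user.csv                      #should now show the first data row, not the header
wc -l small_user.csv small_user.csv.bak     #the edited file should be exactly one line shorter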
Drop the fourth field of every input line, extract the effective date by truncating the timestamp string, and add a province column
#Create a script file named pretreatment.sh
vim pretreatment.sh
#!/bin/bash
infile=$1       #input CSV file
outfile=$2      #output file

awk -F "," 'BEGIN{
        srand();        #seed the random number generator
        id=0;           #running row id
        Province[0]="山东";Province[1]="山西";Province[2]="河南";Province[3]="河北";Province[4]="陕西";Province[5]="内蒙古";Province[6]="上海市";
        Province[7]="北京市";Province[8]="重庆市";Province[9]="天津市";Province[10]="福建";Province[11]="广东";Province[12]="广西";Province[13]="云南"; 
        Province[14]="浙江";Province[15]="贵州";Province[16]="新疆";Province[17]="西藏";Province[18]="江西";Province[19]="湖南";Province[20]="湖北";
        Province[21]="黑龙江";Province[22]="吉林";Province[23]="辽宁"; Province[24]="江苏";Province[25]="甘肃";Province[26]="青海";Province[27]="四川";
        Province[28]="安徽"; Province[29]="宁夏";Province[30]="海南";Province[31]="香港";Province[32]="澳门";Province[33]="台湾";
    }
    {
        id=id+1;                  #sequential id for each row
        value=int(rand()*34);     #random index into the Province array (0-33)
        #drop $4, keep only the first 10 characters (the date) of $6, and append a random province
        print id"\t"$1"\t"$2"\t"$3"\t"$5"\t"substr($6,1,10)"\t"Province[value]
    }' "$infile" > "$outfile"
#Run the pretreatment.sh script to preprocess the data
bash ./pretreatment.sh small_user.csv result.txt
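A quick sanity check on the output (a sketch; the field count of 7 matches the columns printed by the awk script above):
wc -l small_user.csv result.txt     #both files should have the same number of lines
head -3 result.txt                  #each line should contain 7 tab-separated fields
awk -F '\t' 'NF!=7{bad++} END{print bad+0, "malformed lines"}' result.txt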
Check the encoding of the result file and convert it if necessary
vim result.txt
:set fileencoding		#view the current file encoding
:set fileencoding=utf-8		#convert the file encoding to UTF-8
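The encoding can also be checked (and converted) from the shell without opening vim (a sketch; the exact output depends on the local file/iconv versions):
file -i result.txt      #prints the detected charset, e.g. charset=utf-8
#iconv -f GBK -t UTF-8 result.txt -o result_utf8.txt    #only needed if the file is not UTF-8 yet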

Upload to HDFS

#Append a _raw suffix to the name of the raw source data to tell it apart
#Create the target directory and upload the data
hdfs dfs -mkdir /hadoop_course_design
hdfs dfs -put result.txt /hadoop_course_design
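Verify the upload before building a table on top of it (a sketch):
hdfs dfs -ls /hadoop_course_design                          #result.txt should be listed with the expected size
hdfs dfs -cat /hadoop_course_design/result.txt | head -3    #spot-check the first few rows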

Build the Hive data warehouse on the file uploaded to HDFS

hive
create database db_hcd;
use db_hcd;

#Create an external table
#LOCATION below is the directory created on HDFS above
CREATE EXTERNAL TABLE hadoop_course_design(id INT, user_id STRING, item_id STRING, behavior_type INT, item_category STRING, the_date DATE, province STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/hadoop_course_design';

#View the table schema
desc hadoop_course_design;

#View the first 10 rows
select * from hadoop_course_design limit 10;

Data Analysis in Hive

(1) Count how many distinct rows there are by user_id.
#select count(*) from hadoop_course_design;            #check the total number of rows first
#select distinct user_id from hadoop_course_design;    #deduplicate the user_id column
select count(distinct user_id) from hadoop_course_design;
(2) Count how many fully distinct rows there are.
select count(*) from (select user_id,item_id,behavior_type,item_category,the_date,province from hadoop_course_design group by user_id,item_id,behavior_type,item_category,the_date,province)a;
#or
select count(distinct user_id,item_id,behavior_type,item_category,the_date,province) from hadoop_course_design;
(3) Count the total number of items sold over the two days 2014-12-11 and 2014-12-12.
select count(*) from hadoop_course_design where behavior_type='4' and the_date >= '2014-12-11' and the_date <= '2014-12-12';
(4) Using the day of the month as the unit, show how many items the site sold on each day.
select day(the_date),count(*) from hadoop_course_design where behavior_type='4' group by day(the_date);
#sorted version:
#select day(the_date) as mydate,count(*) from hadoop_course_design where behavior_type='4' group by day(the_date) order by mydate ASC;
(5) For a given date and location, count the goods shipped to that location on that day (e.g. the number shipped to 江西 on 2014-12-12).
select count(*) from hadoop_course_design where province = '江西' and the_date = '2014-12-12' and behavior_type='4';
(6) Count the purchases and page views on 2014-12-11 and derive the purchase rate.
SELECT count(if(behavior_type='4',1,NULL)),COUNT(if(behavior_type='1',1,null)), count(if(behavior_type='4',1,NULL))/COUNT(if(behavior_type='1',1,null))*100 from hadoop_course_design where the_date ='2014-12-11';
(7) On 2014-12-12, count user 10001082's click actions, all users' click actions, and the share of the former in the latter.
SELECT COUNT(if(user_id='10001082',1,NULL)),COUNT(user_id),COUNT(if(user_id='10001082',1,NULL))/COUNT(user_id) FROM hadoop_course_design WHERE behavior_type='1' and the_date ='2014-12-12';
(8) Find the user ids with more than 5 purchases on 2014-12-12 and their purchase counts, sorted by purchase count in descending order.
select user_id,count(behavior_type) as number from hadoop_course_design where behavior_type='4' and the_date='2014-12-12' group by user_id having count(behavior_type)>5 order by number DESC;
(9) Count the page views per region on 2014-12-12.
select province , count(behavior_type) from hadoop_course_design where the_date='2014-12-12' and behavior_type='1' group by province;
(10) Count the purchases per region on 2014-12-12.
select province , count(behavior_type) from hadoop_course_design where the_date='2014-12-12' and behavior_type='4' group by province;

Export Hive Data to MySQL

Create the database and grant privileges
create database hive_hcd;
grant all privileges on hive_hcd.* to 'hive'@'%'; 
flush privileges;    /*reload the grant tables*/
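If the hive account does not exist yet, the GRANT above may fail; on MySQL 5.7+ it has to be created explicitly first and the grant re-run (a sketch; the password 123456 is the one used by the sqoop commands below):
CREATE USER IF NOT EXISTS 'hive'@'%' IDENTIFIED BY '123456';
grant all privileges on hive_hcd.* to 'hive'@'%';
flush privileges;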
Create the table in the MySQL database
CREATE TABLE  table_01(id int(50),user_id varchar(50),item_id varchar(50),behavior_type int(50),item_category varchar(50),the_date DATE,province varchar(50)) ENGINE=MyISAM  DEFAULT CHARSET=utf8;
| Storage engine | Strengths | Weaknesses | Suitable for |
| --- | --- | --- | --- |
| ISAM | Fast reads; does not take much memory or storage | No support for transactions, foreign keys, or indexes | |
| MyISAM | Fast reads | No transaction support | Workloads with many COUNT queries; infrequent inserts and frequent reads; no transactions needed |
| HEAP | Very fast inserts, updates, and queries; uses hash indexes by default | Data lives only in memory, so it is volatile | Temporary tables |
| InnoDB | Transaction-safe engine with commit, rollback, and crash recovery | Slower reads than ISAM | Tables that are updated and queried frequently, and where table locking would otherwise be common |
| ARCHIVE | Very fast inserts | No index support; only INSERT and SELECT statements | Storing large volumes of standalone historical data, such as logs and data for aggregate analysis |
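To confirm which engine a table actually uses, or to switch it later, the standard MySQL statements are (a sketch):
SHOW TABLE STATUS WHERE Name = 'table_01';    #the Engine column shows the current storage engine
#ALTER TABLE table_01 ENGINE=InnoDB;          #switch the engine if transactions become necessary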
Import the preprocessed file into MySQL
show variables like 'character%';   #show the character-set settings
set character_set_database=utf8;
set character_set_server=utf8;
#Change the encoding before exporting, otherwise Chinese characters will not display correctly


sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table table_01 --export-dir '/hadoop_course_design/result.txt' --fields-terminated-by '\t'

#useUnicode=true&characterEncoding=utf-8 tells the JDBC connection which character encoding/decoding to use
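After the export it is worth confirming that the row count and the Chinese province names arrived intact (a sketch; user and password are the ones used above):
mysql -u hive -p123456 hive_hcd -e "select count(*) from table_01; select * from table_01 limit 5;"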
In Hive, filter out the final data for each question and create the corresponding internal tables
1. Shipments per province on Double Twelve (2014-12-12).
create table question1 as select province , count(behavior_type) as total_shipment from hadoop_course_design where the_date='2014-12-12' and behavior_type='4' group by province;
2. Page views and purchases per day.
create table question2 as SELECT the_date,count(if(behavior_type='1',1,null))as liulan_count,count(if(behavior_type='4',1,null))as goumai_count from hadoop_course_design  group by the_date ;
3. Purchase rate per day.
create table question3 as select the_date,round(goumai_count/liulan_count,7)*100 as rate from question2;
4. Top-10 users on Double Twelve (user_id, purchase count, province)
create table question4 as select user_id,province,count(if(behavior_type='4',1,null)) as goumai_count from hadoop_course_design where the_date='2014-12-12' group by user_id,province order by goumai_count desc limit 10;

Import the data needed for the four questions from Hive into MySQL
Question 1
#Create the MySQL table
CREATE TABLE q1(province varchar(50),total_shipment int(50));    #in MySQL
#Export from Hive to HDFS
hdfs dfs -mkdir -p /hadoop_course_design/result_data/question1   #create the directory

#Write the query result out to the directory
INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question1;
#Export from HDFS into MySQL
sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q1 --export-dir '/hadoop_course_design/result_data/question1/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 2
CREATE TABLE q2(data varchar(50),browse int(50),buy int(50));    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question2   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question2;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q2 --export-dir '/hadoop_course_design/result_data/question2/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 3
CREATE TABLE q3(the_date date,rate varchar(50));    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question3   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question3'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question3;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q3 --export-dir '/hadoop_course_design/result_data/question3/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 4
CREATE TABLE q4(user_id varchar(50),province varchar(50),buy_count int);    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question4   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question4'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question4;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q4 --export-dir '/hadoop_course_design/result_data/question4/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
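A quick check that all four result tables were populated (a sketch; q4 should contain exactly 10 rows):
mysql -u hive -p123456 hive_hcd -e "select 'q1' tbl, count(*) cnt from q1 union all select 'q2', count(*) from q2 union all select 'q3', count(*) from q3 union all select 'q4', count(*) from q4;"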
Export the MySQL Data to CSV Files
#Check the local secure-file-priv setting (files can only be written under that path)
mysql> show variables like '%secure%';

select * from q1 into outfile '/var/lib/mysql-files/q1.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q2 into outfile '/var/lib/mysql-files/q2.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q3 into outfile '/var/lib/mysql-files/q3.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q4 into outfile '/var/lib/mysql-files/q4.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;
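Finally, confirm that the CSV files landed under the secure-file-priv directory and look reasonable (a sketch):
ls -l /var/lib/mysql-files/
head -3 /var/lib/mysql-files/q1.csv    #comma-separated rows, with the province names intact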