Data Preprocessing Course Design: Operation Notes

Data Preprocessing

Change the file permissions
chmod 755 ./small_user.csv
Delete the first line (the header row)
sed -i '1d' small_user.csv
#-i edits the source file in place, so use it with caution.
#d  deletes the addressed line
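Since -i rewrites the source file directly, a safer variant (just a sketch, not part of the original steps; the .bak suffix is arbitrary) keeps a backup and then confirms the header is really gone:
sed -i.bak '1d' small_user.csv              #same edit, but keeps the original as small_user.csv.bak
head -1 small_user.csv                      #should now show the first data row, not the header
wc -l small_user.csv small_user.csv.bak     #the edited file should be exactly one line shorter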
Drop the fourth field of every input line, extract the effective date by truncating the timestamp string, and add a province column
#Create a script file named pretreatment.sh
vim pretreatment.sh
#!/bin/bash
infile=$1       #input CSV file
outfile=$2      #output file

awk -F "," 'BEGIN{
        srand();        #seed the random number generator
        id=0;           #running row id
        Province[0]="山东";Province[1]="山西";Province[2]="河南";Province[3]="河北";Province[4]="陕西";Province[5]="内蒙古";Province[6]="上海市";
        Province[7]="北京市";Province[8]="重庆市";Province[9]="天津市";Province[10]="福建";Province[11]="广东";Province[12]="广西";Province[13]="云南"; 
        Province[14]="浙江";Province[15]="贵州";Province[16]="新疆";Province[17]="西藏";Province[18]="江西";Province[19]="湖南";Province[20]="湖北";
        Province[21]="黑龙江";Province[22]="吉林";Province[23]="辽宁"; Province[24]="江苏";Province[25]="甘肃";Province[26]="青海";Province[27]="四川";
        Province[28]="安徽"; Province[29]="宁夏";Province[30]="海南";Province[31]="香港";Province[32]="澳门";Province[33]="台湾";
    }
    {
        id=id+1;                  #sequential id for each row
        value=int(rand()*34);     #random index into the Province array (0-33)
        #drop $4, keep only the first 10 characters (the date) of $6, and append a random province
        print id"\t"$1"\t"$2"\t"$3"\t"$5"\t"substr($6,1,10)"\t"Province[value]
    }' "$infile" > "$outfile"
#Run the pretreatment.sh script to preprocess the data
bash ./pretreatment.sh small_user.csv result.txt
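A quick sanity check on the output (a sketch; the field count of 7 matches the columns printed by the awk script above):
wc -l small_user.csv result.txt     #both files should have the same number of lines
head -3 result.txt                  #each line should contain 7 tab-separated fields
awk -F '\t' 'NF!=7{bad++} END{print bad+0, "malformed lines"}' result.txt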
Check the encoding of the result file and convert it if necessary
vim result.txt
:set fileencoding		#view the current file encoding
:set fileencoding=utf-8		#convert the file encoding to UTF-8
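The encoding can also be checked (and converted) from the shell without opening vim (a sketch; the exact output depends on the local file/iconv versions):
file -i result.txt      #prints the detected charset, e.g. charset=utf-8
#iconv -f GBK -t UTF-8 result.txt -o result_utf8.txt    #only needed if the file is not UTF-8 yet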

Upload to HDFS

#Append a _raw suffix to the name of the raw source data to tell it apart
#Create the target directory and upload the data
hdfs dfs -mkdir /hadoop_course_design
hdfs dfs -put result.txt /hadoop_course_design
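Verify the upload before building a table on top of it (a sketch):
hdfs dfs -ls /hadoop_course_design                          #result.txt should be listed with the expected size
hdfs dfs -cat /hadoop_course_design/result.txt | head -3    #spot-check the first few rows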

Build the Hive data warehouse on the file uploaded to HDFS

hive
create database db_hcd;
use db_hcd;

#Create an external table
#LOCATION below is the directory created on HDFS above
CREATE EXTERNAL TABLE hadoop_course_design(id INT, user_id STRING, item_id STRING, behavior_type INT, item_category STRING, the_date DATE, province STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/hadoop_course_design';

#View the table schema
desc hadoop_course_design;

#View the first 10 rows
select * from hadoop_course_design limit 10;

Data Analysis in Hive

(1) Count how many distinct rows there are by user_id.
#select count(*) from hadoop_course_design;            #check the total number of rows first
#select distinct user_id from hadoop_course_design;    #deduplicate the user_id column
select count(distinct user_id) from hadoop_course_design;
(2) Count how many fully distinct rows there are.
select count(*) from (select user_id,item_id,behavior_type,item_category,the_date,province from hadoop_course_design group by user_id,item_id,behavior_type,item_category,the_date,province)a;
#or
select count(distinct user_id,item_id,behavior_type,item_category,the_date,province) from hadoop_course_design;
(3) Count the total number of items sold over the two days 2014-12-11 and 2014-12-12.
select count(*) from hadoop_course_design where behavior_type='4' and the_date >= '2014-12-11' and the_date <= '2014-12-12';
(4) Using the day of the month as the unit, show how many items the site sold on each day.
select day(the_date),count(*) from hadoop_course_design where behavior_type='4' group by day(the_date);
#sorted version:
#select day(the_date) as mydate,count(*) from hadoop_course_design where behavior_type='4' group by day(the_date) order by mydate ASC;
(5) For a given date and location, count the goods shipped to that location on that day (e.g. the number shipped to 江西 on 2014-12-12).
select count(*) from hadoop_course_design where province = '江西' and the_date = '2014-12-12' and behavior_type='4';
(6) Count the purchases and page views on 2014-12-11 and derive the purchase rate.
SELECT count(if(behavior_type='4',1,NULL)),COUNT(if(behavior_type='1',1,null)), count(if(behavior_type='4',1,NULL))/COUNT(if(behavior_type='1',1,null))*100 from hadoop_course_design where the_date ='2014-12-11';
(7) On 2014-12-12, count user 10001082's click actions, all users' click actions, and the share of the former in the latter.
SELECT COUNT(if(user_id='10001082',1,NULL)),COUNT(user_id),COUNT(if(user_id='10001082',1,NULL))/COUNT(user_id) FROM hadoop_course_design WHERE behavior_type='1' and the_date ='2014-12-12';
(8) Find the user ids with more than 5 purchases on 2014-12-12 and their purchase counts, sorted by purchase count in descending order.
select user_id,count(behavior_type) as number from hadoop_course_design where behavior_type='4' and the_date='2014-12-12' group by user_id having count(behavior_type)>5 order by number DESC;
(9) Count the page views per region on 2014-12-12.
select province , count(behavior_type) from hadoop_course_design where the_date='2014-12-12' and behavior_type='1' group by province;
(10) Count the purchases per region on 2014-12-12.
select province , count(behavior_type) from hadoop_course_design where the_date='2014-12-12' and behavior_type='4' group by province;

Export Hive Data to MySQL

Create the database and grant privileges
create database hive_hcd;
grant all privileges on hive_hcd.* to 'hive'@'%'; 
flush privileges;    /*reload the grant tables*/
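If the hive account does not exist yet, the GRANT above may fail; on MySQL 5.7+ it has to be created explicitly first and the grant re-run (a sketch; the password 123456 is the one used by the sqoop commands below):
CREATE USER IF NOT EXISTS 'hive'@'%' IDENTIFIED BY '123456';
grant all privileges on hive_hcd.* to 'hive'@'%';
flush privileges;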
Create the table in the MySQL database
CREATE TABLE  table_01(id int(50),user_id varchar(50),item_id varchar(50),behavior_type int(50),item_category varchar(50),the_date DATE,province varchar(50)) ENGINE=MyISAM  DEFAULT CHARSET=utf8;
| Storage engine | Strengths | Weaknesses | Suitable for |
| --- | --- | --- | --- |
| ISAM | Fast reads; does not take much memory or storage | No support for transactions, foreign keys, or indexes | |
| MyISAM | Fast reads | No transaction support | Workloads with many COUNT queries; infrequent inserts and frequent reads; no transactions needed |
| HEAP | Very fast inserts, updates, and queries; uses hash indexes by default | Data lives only in memory, so it is volatile | Temporary tables |
| InnoDB | Transaction-safe engine with commit, rollback, and crash recovery | Slower reads than ISAM | Tables that are updated and queried frequently, and where table locking would otherwise be common |
| ARCHIVE | Very fast inserts | No index support; only INSERT and SELECT statements | Storing large volumes of standalone historical data, such as logs and data for aggregate analysis |
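To confirm which engine a table actually uses, or to switch it later, the standard MySQL statements are (a sketch):
SHOW TABLE STATUS WHERE Name = 'table_01';    #the Engine column shows the current storage engine
#ALTER TABLE table_01 ENGINE=InnoDB;          #switch the engine if transactions become necessary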
Import the preprocessed file into MySQL
show variables like 'character%';   #show the character-set settings
set character_set_database=utf8;
set character_set_server=utf8;
#Change the encoding before exporting, otherwise Chinese characters will not display correctly


sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table table_01 --export-dir '/hadoop_course_design/result.txt' --fields-terminated-by '\t'

#useUnicode=true&characterEncoding=utf-8 tells the JDBC connection which character encoding/decoding to use
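After the export it is worth confirming that the row count and the Chinese province names arrived intact (a sketch; user and password are the ones used above):
mysql -u hive -p123456 hive_hcd -e "select count(*) from table_01; select * from table_01 limit 5;"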
In Hive, filter out the final data for each question and create the corresponding internal tables
1. Shipments per province on Double Twelve (2014-12-12).
create table question1 as select province , count(behavior_type) as total_shipment from hadoop_course_design where the_date='2014-12-12' and behavior_type='4' group by province;
2. Page views and purchases per day.
create table question2 as SELECT the_date,count(if(behavior_type='1',1,null))as liulan_count,count(if(behavior_type='4',1,null))as goumai_count from hadoop_course_design  group by the_date ;
3. Purchase rate per day.
create table question3 as select the_date,round(goumai_count/liulan_count,7)*100 as rate from question2;
4. Top-10 users on Double Twelve (user_id, purchase count, province)
create table question4 as select user_id,province,count(if(behavior_type='4',1,null)) as goumai_count from hadoop_course_design where the_date='2014-12-12' group by user_id,province order by goumai_count desc limit 10;

Import the data needed for the four questions from Hive into MySQL
Question 1
#Create the MySQL table
CREATE TABLE q1(province varchar(50),total_shipment int(50));    #in MySQL
#Export from Hive to HDFS
hdfs dfs -mkdir -p /hadoop_course_design/result_data/question1   #create the directory

#Write the query result out to the directory
INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question1;
#Export from HDFS into MySQL
sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q1 --export-dir '/hadoop_course_design/result_data/question1/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 2
CREATE TABLE q2(data varchar(50),browse int(50),buy int(50));    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question2   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question2;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q2 --export-dir '/hadoop_course_design/result_data/question2/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 3
CREATE TABLE q3(the_date date,rate varchar(50));    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question3   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question3'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question3;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q3 --export-dir '/hadoop_course_design/result_data/question3/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
Question 4
CREATE TABLE q4(user_id varchar(50),province varchar(50),buy_count int);    #in MySQL

hdfs dfs -mkdir -p /hadoop_course_design/result_data/question4   #create the directory

INSERT OVERWRITE  DIRECTORY '/hadoop_course_design/result_data/question4'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from question4;

sqoop export --connect "jdbc:mysql://192.168.207.130:3306/hive_hcd?useUnicode=true&characterEncoding=utf-8" --username hive --password 123456 --table q4 --export-dir '/hadoop_course_design/result_data/question4/000000_0' --input-fields-terminated-by '\t' --driver com.mysql.jdbc.Driver
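A quick check that all four result tables were populated (a sketch; q4 should contain exactly 10 rows):
mysql -u hive -p123456 hive_hcd -e "select 'q1' tbl, count(*) cnt from q1 union all select 'q2', count(*) from q2 union all select 'q3', count(*) from q3 union all select 'q4', count(*) from q4;"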
Export the MySQL Data to CSV Files
#Check the local secure-file-priv setting (files can only be written under that path)
mysql> show variables like '%secure%';

select * from q1 into outfile '/var/lib/mysql-files/q1.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q2 into outfile '/var/lib/mysql-files/q2.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q3 into outfile '/var/lib/mysql-files/q3.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;

select * from q4 into outfile '/var/lib/mysql-files/q4.csv' fields terminated by ","  escaped by '' optionally enclosed  by ''   lines terminated by '\n' ;
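Finally, confirm that the CSV files landed under the secure-file-priv directory and look reasonable (a sketch):
ls -l /var/lib/mysql-files/
head -3 /var/lib/mysql-files/q1.csv    #comma-separated rows, with the province names intact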