create database hive;
use hive;
create table loan(
LoanStatus STRING,
BorrowerRate FLOAT,
ProsperScore FLOAT,
Occupation STRING,
EmploymentStatus STRING,
IsBorrowerHomeowner BOOLEAN,
CreditScorerangeLower INT,
CreditScorerangeUpper INT,
IncomeRange STRING
)row format serde
'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with SERDEPROPERTIES ("separatorChar"=",")STORED AS TEXTFILE;
--从本地上传到HDFS
hadoop fs -mkdir /college
hadoop fs -put /usr/data/loan.csv /college/
--从HDFS上加载数据到表中,如是本则在 inpath 前面加local
load data inpath '/college/loan.csv' into table loan;
select * from loan;
--以信用得分ProsperScore为变量,对借款进行计数统计(降序),结果写入本地/root/college001/中
insert overwrite local directory '/root/college001/' row format
delimited fields terminated by '\t'
select ProsperScore ,count(*)as sum from loan group by ProsperScore order by sum desc;
--重开一个会话,查看生成的数据
--cat /root/college001/*
-- 显示结果:
-- 29084
-- 4 12595
-- 6 12278
-- 8 12053
-- 7 10597
-- 5 9813
-- 3 7642
-- 9 6911
-- 2 5766
-- 10 4750
-- 11 1456
-- 1 992
-- 注意:
-- 解决方法:将hive设置成本地模式:
-- set hive.exec.mode.local.auto=true;
-- 再试试count()函数 就可以运行
-- 但是只在本次hive的任务有效,如果想长期有效 的话要修改hive的配置文件:hive-site.xml 位置在hive/conf下
-- 在slave1的机器上的hive-site.xml中的configuration里面添加一个property:
-- <property>
-- <name>hive.exec.mode.local.auto</name>
-- <value>true</value>
-- </property>
--给出借款较多的行业top5,结果写入本地/root/college002/中。
insert overwrite local directory '/root/college002/' row format
delimited fields terminated by '\t'
select Occupation,count(*)as cnt from loan group by
Occupation order by cnt desc limit 5;
--重开一个会话,查看生成的数据
--cat /root/college002/*
-- Other 28617
-- Professional 13628
-- Computer Programmer 4478
-- Executive 4311
-- Teacher 3759
--分析贷款状态为违约(Defaulted)的贷款人就业信息,将结果top3写入/root/college003/
insert overwrite local directory '/root/college003/' row format
delimited fields terminated by '\t'
select Occupation from loan where LoanStatus='Defaulted' limit 3
--查看生成的数据
--cat /root/college003/*
-- Administrative Assistant
-- Skilled Labor
-- Other
--对数据中收入范围进行分组统计(降序),查看贷款人收入情况,结果写入/root/college004/
insert overwrite local directory '/root/college004/' row format
delimited fields terminated by '\t'
select IncomeRange,count(*)as cnt from loan group by IncomeRange order by cnt desc;
--查看生成的数据
--cat /root/college004/*
-- .$25.000-49.999 32192
-- $50.000-74.999 31050
-- $100.000+ 17337
-- $75.000-99.999 16916
-- Not displayed 7741
-- $1-24.999 7274
-- Not employed 806
-- $0 621
--对信用得分进行中间数求值作为信用得分,计算各职业中的信用得分最高分。结果top5写入/root/college005/
--percentile产生中位数
--开启hive默认出于安全考虑,不允许使用union all. 如果你知道你在干什么,可以去设置允许使用。
set hive.strict.checks.cartesian.product=false;
set hive.mapred.mode=nonstrict;
insert overwrite local directory '/root/college005/' row format
delimited fields terminated by '\t'
select Occupation,max(score)as sc from (
select if(ProsperScore='',b.mid,ProsperScore) as score , Occupation from loan a join (
select percentile(cast(ProsperScore as INT),0.5) as mid from loan)b )c group by c.Occupation order by sc ;
--查看生成的数据
-- --cat /root/college005/*
-- Waiter/Waitress 9
-- Truck Driver 9
-- Tradesman - Plumber 9
-- Tradesman - Mechanic 9
-- Tradesman - Electrician 9
-- Tradesman - Carpenter 9
-- Teacher's Aide 9
-- Teacher 9
-- Student - Technical School 9
-- Student - Community College 9
-- Student - College Sophomore 9
-- Student - College Senior 9
-- Student - College Junior 9
-- Student - College Graduate Student 9
-- Student - College Freshman 9
--后面省略
--未考虑Occupation空值做中间数的处理
select Occupation,max(ProsperScore)as sc from loan group by Occupation order by sc ;
--(1) 支持度写到本地/root/college006/中(保留五位小数);贷款状态为违约的人数的最多的职业人数说明关联性最大
--除以总人数,结果为支持度
--开启hive默认出于安全考虑,不允许使用union all. 如果你知道你在干什么,可以去设置允许使用。
set hive.strict.checks.cartesian.product=false;
set hive.mapred.mode=nonstrict;
insert overwrite local directory '/root/college006/'
SELECT round((t2.s/t4.s),5)
FROM(select t1.occupation as c, count(*)as s from loan t1
where t1.LoanStatus='Defaulted'
group by t1.occupation
order by s desc, t1.occupation asc limit 1
)t2
JOIN(select count(*)as s from loan t3 )t4;
--查看生成的数据
-- cat /root/college006/*
--0.01124
--(2) 置信度写到本地/root/college007/中(保留五位小数)
insert overwrite local directory '/root/college007/'
SELECT round((t2.s/t4.s),5)
FROM(select t1.occupation as c, count(*)as s from loan t1
where t1.LoanStatus='Defaulted'
group by t1.occupation
order by s desc, t1.occupation asc limit 1
)t2
JOIN( select count(*)as s from loan t3
where occupation='Other')t4;
--查看生成的数据
-- cat /root/college007/*
--0.04476
--关于支持度、置信度要根据出题方的意思设计SQL,不能完全与关联规则中的概念做参照
贷款Hive分析
最新推荐文章于 2023-07-21 16:42:46 发布
该博客展示了使用Hive进行数据操作和分析的多个步骤,包括创建表、加载数据、统计信用得分分布、按职业分组计数、违约者就业信息分析以及收入范围统计。还探讨了如何设置Hive的本地模式以及计算支持度和置信度。
摘要由CSDN通过智能技术生成