贷款Hive分析

该博客展示了使用Hive进行数据操作和分析的多个步骤,包括创建表、加载数据、统计信用得分分布、按职业分组计数、违约者就业信息分析以及收入范围统计。还探讨了如何设置Hive的本地模式以及计算支持度和置信度。
摘要由CSDN通过智能技术生成
create database hive;
use hive;

create table loan(
    LoanStatus STRING,
    BorrowerRate FLOAT,
    ProsperScore FLOAT,
    Occupation STRING,
    EmploymentStatus  STRING,
    IsBorrowerHomeowner BOOLEAN,
    CreditScorerangeLower INT,
    CreditScorerangeUpper INT,
    IncomeRange STRING  
)row format serde
      'org.apache.hadoop.hive.serde2.OpenCSVSerde'
      with SERDEPROPERTIES ("separatorChar"=",")STORED AS TEXTFILE;

--从本地上传到HDFS
hadoop fs -mkdir /college
hadoop fs -put /usr/data/loan.csv  /college/

--从HDFS上加载数据到表中,如是本则在 inpath 前面加local
load data   inpath '/college/loan.csv' into table loan; 
select * from loan; 

--以信用得分ProsperScore为变量,对借款进行计数统计(降序),结果写入本地/root/college001/中

insert overwrite local directory '/root/college001/'  row format 
delimited fields terminated by '\t'
select ProsperScore ,count(*)as sum  from loan group by ProsperScore order by  sum  desc;

--重开一个会话,查看生成的数据
--cat /root/college001/*
-- 显示结果:
-- 	29084
-- 4	12595
-- 6	12278
-- 8	12053
-- 7	10597
-- 5	9813
-- 3	7642
-- 9	6911
-- 2	5766
-- 10	4750
-- 11	1456
-- 1	992 

-- 注意:
-- 解决方法:将hive设置成本地模式: 
-- set hive.exec.mode.local.auto=true;
-- 再试试count()函数 就可以运行 
-- 但是只在本次hive的任务有效,如果想长期有效 的话要修改hive的配置文件:hive-site.xml 位置在hive/conf下
-- 在slave1的机器上的hive-site.xml中的configuration里面添加一个property:
-- <property> 
--     <name>hive.exec.mode.local.auto</name> 
--     <value>true</value>  
-- </property>

--给出借款较多的行业top5,结果写入本地/root/college002/中。

insert overwrite local directory '/root/college002/'  row format 
delimited fields terminated by '\t'
select Occupation,count(*)as cnt from loan group by 
 Occupation order by cnt desc limit 5;

 --重开一个会话,查看生成的数据
--cat /root/college002/*
-- Other	28617
-- Professional	13628
-- Computer Programmer	4478
-- Executive	4311
-- Teacher	3759

--分析贷款状态为违约(Defaulted)的贷款人就业信息,将结果top3写入/root/college003/
insert overwrite local directory '/root/college003/'  row format 
delimited fields terminated by '\t'
select Occupation from loan where LoanStatus='Defaulted' limit 3

 --查看生成的数据
--cat /root/college003/*
-- Administrative Assistant
-- Skilled Labor
-- Other

--对数据中收入范围进行分组统计(降序),查看贷款人收入情况,结果写入/root/college004/
insert overwrite local directory '/root/college004/'  row format 
delimited fields terminated by '\t'
select IncomeRange,count(*)as cnt from loan group by  IncomeRange order by  cnt desc;
 --查看生成的数据
 --cat /root/college004/*
-- .$25.000-49.999	32192
-- $50.000-74.999	31050
-- $100.000+	17337
-- $75.000-99.999	16916
-- Not displayed	7741
-- $1-24.999	7274
-- Not employed	806
-- $0 	621

--对信用得分进行中间数求值作为信用得分,计算各职业中的信用得分最高分。结果top5写入/root/college005/
--percentile产生中位数
--开启hive默认出于安全考虑,不允许使用union all. 如果你知道你在干什么,可以去设置允许使用。
set hive.strict.checks.cartesian.product=false;
set hive.mapred.mode=nonstrict;

 insert overwrite local directory '/root/college005/'  row format 
delimited fields terminated by '\t'
select Occupation,max(score)as sc from (
select if(ProsperScore='',b.mid,ProsperScore) as score , Occupation from loan a join (
select percentile(cast(ProsperScore as INT),0.5) as mid from loan)b )c group by c.Occupation order by sc ;
 --查看生成的数据
--  --cat /root/college005/*
-- Waiter/Waitress	9
-- Truck Driver	9
-- Tradesman - Plumber	9
-- Tradesman - Mechanic	9
-- Tradesman - Electrician	9
-- Tradesman - Carpenter	9
-- Teacher's Aide	9
-- Teacher	9
-- Student - Technical School	9
-- Student - Community College	9
-- Student - College Sophomore	9
-- Student - College Senior	9
-- Student - College Junior	9
-- Student - College Graduate Student	9
-- Student - College Freshman	9
--后面省略


--未考虑Occupation空值做中间数的处理
select Occupation,max(ProsperScore)as sc from loan group by Occupation order by sc ; 

--(1) 支持度写到本地/root/college006/中(保留五位小数);贷款状态为违约的人数的最多的职业人数说明关联性最大
--除以总人数,结果为支持度
--开启hive默认出于安全考虑,不允许使用union all. 如果你知道你在干什么,可以去设置允许使用。
set hive.strict.checks.cartesian.product=false;
set hive.mapred.mode=nonstrict;

insert  overwrite  local   directory   '/root/college006/'
SELECT   round((t2.s/t4.s),5) 
FROM(select  t1.occupation  as  c,  count(*)as  s  from  loan  t1
where  t1.LoanStatus='Defaulted'
group  by  t1.occupation
order  by  s  desc,  t1.occupation  asc  limit  1
)t2
JOIN(select  count(*)as  s   from  loan  t3  )t4;
 --查看生成的数据
-- cat /root/college006/*
--0.01124


--(2) 置信度写到本地/root/college007/中(保留五位小数)

insert   overwrite  local   directory   '/root/college007/'
SELECT   round((t2.s/t4.s),5)
FROM(select  t1.occupation  as  c, count(*)as  s   from   loan  t1
where   t1.LoanStatus='Defaulted'
group  by   t1.occupation
order  by  s  desc,  t1.occupation   asc   limit 1
)t2
JOIN( select  count(*)as   s   from   loan t3
where   occupation='Other')t4;
 --查看生成的数据
-- cat /root/college007/*
--0.04476
--关于支持度、置信度要根据出题方的意思设计SQL,不能完全与关联规则中的概念做参照
  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值