数据:https://pan.baidu.com/s/15pV-S5vTVQrmoq5zmXDTyw
提取码:v9yp
1、首先通过jupyternotebook做下简单处理,并将数据做新的保存
import pandas as pd
#读取数据
data=pd.read_excel(r'C:/Users/Administrator/Downloads/20210115数据.xls')
#查看数据信息
data.info()
#将空的那几行数据暴力删除算了,并在原数据上进行修改
data.dropna(how='any',inplace=True)
#查看是否有第一次访问大于最近一次访问的记录(无)
#data[data['last_invest_date']<data['first_invest_date']]
#查看第一次投资金额和总投资金额数据是否有异常
#data['first_invest_money'].min()#最小值为92
#data['sum_invest_money'].min()#最小值为100
#data[data['sum_invest_money']<data['first_invest_money']] #无
#查看首次投资月份数
#data['first_invest_month'].value_counts()
#查看投资年纪
#data['age'].value_counts()
#查看性别数据
#data['sex'].value_counts()
#查看是否有异常点
#data.describe()
#将处理完的数据进行保存
data.to_excel(r'C:/Users/Administrator/Downloads/20210115pyend.xls')
2、使用navicat 做一些基础统计
-- 查看整表记录
select * from porder;
-- 查询总投资人数和总投资金额
select
count(distinct user_id) as '总投资人数'
,sum(sum_invest_money) as '总投资金额'
from porder;
-- 不同年份注册的投资用户人数,注册用户的总投资金额
select
date_format(create_time,'%Y') as create_year
,count(distinct user_id) as '注册人数'
,sum(sum_invest_money) as '投资金额'
from porder
group by date_format(create_time,'%Y');
-- 男性用户与女性用户投资人数
select
sex
,count(distinct user_id) as '投资人数'
from porder
group by sex;
-- 贡献投资金额最大的十个年龄
select
age
,sum(sum_invest_money) as '总投资金额'
from porder
group by age
order by 总投资金额 desc
limit 10;
-- 不同产品期限的投资金额与投资人数分布
select
first_invest_month
,count(distinct user_id) as '投资人数'
,sum(sum_invest_money) as '投资金额'
from porder
group by first_invest_month;
-- 不同产品利率的投资金额与投资人数分布
select
first_invest_rate
,count(distinct user_id) as '投资人数'
,sum(sum_invest_money) as '投资金额'
from porder
group by first_invest_rate;
-- 用户首投最喜欢的产品利率与用户首投最喜欢的产品期限
select
first_invest_rate
,count(distinct user_id) as num
from porder
group by first_invest_rate
order by num desc
limit 1;
select
first_invest_month
,count(distinct user_id) as num
from porder
group by first_invest_month
order by num desc
limit 1;
-- 用户注册到投资普遍的转化周期
select
datediff(first_invest_date,create_time) as changetime
,count(distinct user_id) as num
from porder
group by datediff(first_invest_date,create_time)
order by num desc;
-- 每个区域内,投资人数最多的10个城市
select * from
(
select
area
,city
,count(distinct user_id) as num
,rank() over(partition by area order by count(distinct user_id) desc) as `rank`
from porder
group by city
) t
where t.rank<=10;