接到了一个小需求,手上有某应用8月和9月的用户信息,希望能算出8月新注册用户的留存率。即计算在8月注册的用户有多少比例在一个月内又一次使用了该应用。并返回每个城市每天的用户留存率。
因为对pandas中dataframe的操作还不熟悉,dataframe的索引又比较鸡龟,所以程序里用了很多不必要的遍历和循环,代替了本来用dataframe可以快速完成的操作,因此程序效率不高,但是基本完成了需求,作为一个笔记暂且记录在这里吧。
import pandas as pd
import numpy as np
'''数据分开存在几个文件里,先读入数据,id是用户编号,pt记录日期,is_old标记是否是新用户的记录,city标记城市'''
f1=pd.read_csv('a1.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f2=pd.read_csv('a2.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f3=pd.read_csv('a3.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f4=pd.read_csv('a4.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f5=pd.read_csv('a5.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f6=pd.read_csv('a6.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f7=pd.read_csv('a7.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
f8=pd.read_csv('a8.csv',header=None,names=['pag_id','pt','is_old','city'],encoding='gbk')
df=pd.concat([f1,f2,f3,f4,f5,f6,f7,f8],ignore_index=True)
#清除城市标记为空的数据
city=df['city']
city_null=pd.isnull(city)
df=df[city_null==False].reset_index(drop=True)
#取出新用户在8月的所有记录
d_new=df[(df['is_old']==0) & (df['pt']<'2017-09-01')].sort_values(by='pt')
d_all=pd.merge(d_new,df,how='left',on='pag_id')
from collections import defaultdict
import datetime
starttime = datetime.datetime.now()
d_all[['pag_id','pt_y','is_old_y','city_y']]
'''
def sub_above(df):
li=list(pd.to_datetime(df['pt_y']))
'''
'''grouped['rank']=d_all['pt_y'].groupby(d_all['pag_id']).rank()
print(grouped['rank'])'''
'''d_all['pt']=pd.to_datetime(d_all['pt_y'])
d_all['days']=(d_all['pt']-d_all['pt'].min())<= datetime.timedelta(days=30)'''
#print (grouped['rank'])
#print(d_all['days'])
city_dic=defaultdict(int)
date_dic=defaultdict(int)
num = 0
num1 = 0
d_all['pt'] = pd.to_datetime(d_all['pt_y'])
reg_dates_back = {}#30天内有单用户
reg_dates_all = {}#新用户
reg_dates_rate= {}
reg_city1 = {}
reg_city2 = {}
reg_city3 = {}
l_date = list(d_all['pt'].drop_duplicates())#日期列表
l_city = list(d_all['city_y'].drop_duplicates())#城市列表
for i in l_city:
reg_city1[i]=0
for i in l_city:
reg_city2[i]=0
for i in l_city:
reg_city3[i]=0
for i in l_date:
reg_dates_back[i] = reg_city1
for i in l_date:
reg_dates_new[i] = reg_city2
for i in l_date:
reg_dates_rate[i] = reg_city3
for [pag,city],group in d_all.groupby(['pag_id','city_y']):
if 0 in list(group['is_old_y']):#判断是否是新用户
reg_date = group['pt'].min()
reg_dates_new[reg_date][city]+=1#某天的某城市的新用户加一
group['days']=(group['pt']-group['pt'].min())<= datetime.timedelta(days=30)
#print list(group['days'])#.count(True)
#print city
#group['news'] = 0 in list(d_all['is_old_y'])
if list(group['days']).count(True)>1:
reg_dates_back[reg_date][city]+=1#某天的某城市的回头用户加一
num+=1
if num-num1>100:
endtime = datetime.datetime.now()
print (endtime - starttime).seconds#运行时间
print(num)#运行数据数
num1 = num
for i in l_date:
for j in l_city:#计算结果
try:
reg_dates_rate[i][j] = (float(reg_dates_back[i][j])/reg_dates_new[i][j])
except: pass #防止零除