一、查看数据&读入json文件
1-4
Nowcoder = pd.read_csv('Nowcoder.csv',dtype=object,use_cols = ['user_id','date'])
Summary:
dataframe的具体行列查询用loc&iloc,区分 i为index
读入df时用use_cols可以筛选一下需要的列
读入json
import pandas as pd
import json
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
with open('Nowcoder.json', 'r') as f:
data = json.loads(f.read())
print(pd.DataFrame(data))
二、数据索引
查看数据每列是否有空值
import pandas as pd
file = pd.read_csv('Nowcoder.csv',dtype=object)
print(file.isna().any())
筛选数据 eg查看Language为python的用户
import pandas as pd
file = pd.read_csv('Nowcoder.csv',dtype=object)
print(file[file['Language']=='Python'])
print(file[file['Language']=='Python']['Achievement_value'])
print(file[file['Language']=='Python'].Achievement_value)
三、数据清洗
删除信息不全用户
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',', dtype=object)
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
Nowcoder.dropna()
print(Nowcoder)
dropna参数
axis:判断方向 0行1列 默认0
how:any 存在nan即删,all全为nan则删
thresh:instance1 = [1,nan,3] instance2 = [1,nan,nan],若thresh=2则删除2保留1
subset:如subset=['a','b'],则如果某数据列a和列b中存在缺失,则丢弃
inplace:是否在原df进行修改,显然为了安全默认False
数据替换
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
max_year = Nowcoder.Graduate_year.max()
mean_ach = Nowcoder.Achievement_value.mean()
values = {"Graduate_year":max_year,
"Language":'Python',
"Achievement_value":mean_ach
}
Nowcoder.fillna(value=values,inplace = True)
print(Nowcoder)
#method 插值方向 forward前向ffill backward后向bfill
3. 删除重复数据
只记drop_duplicates就好了
subset:默认为全列,如subset=['a','b']即使其余数据不同,ab相同则删除
inplacce:默认True
四、逻辑运算
print(Nowcoder.loc[Nowcoder['Num_of_exercise']>=500, ['Level','Achievement_value']])
print(Nowcoder.loc[(Nowcoder['Language']=='Java')&(Nowcoder['Graduate_year']==2020),:])
print(Noecoder[Nowcoder['Language'].isin(['CPP','C','C#'])])
print(Nowcoder.query('Language == "CPP" & Level >= 7 & Graduate_year != 2018'))
五、函数
所占比例
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
seven=Nowcoder[Nowcoder['Level']==7]['Achievement_value']#筛选出7级用户的成就值
total=Nowcoder['Achievement_value'].sum()#计算出所有用户的成就值之和
print(seven/total)
排序
import pandas as pd
sales = pd.read_csv('sales.csv')
data = sales.sort_values(by='monetary', ascending=False).reset_index(drop=True)[:3]
print(data)
用户特征评分
import pandas as pd
sales = pd.read_csv('sales.csv')
def rfm(x,y,data):
a = data[y].quantile(0.25)
b = data[y].quantile(0.5)
c = data[y].quantile(0.75)
if y == 'frequency':
if x <=a:
return 4
elif x<=b:
return 3
elif x <=c:
return 2
elif c<x:
return 1
else:
if x <=a:
return 1
elif x<=b:
return 2
elif x<=c:
return 3
elif c<x:
return 4
sales['R_Quartile'] = sales['recency'].apply(lambda x: rfm(x,'recency',sales))
sales['F_Quartile'] = sales['frequency'].apply(lambda x: rfm(x,'frequency',sales))
sales['M_Quartile'] = sales['monetary'].apply(lambda x: rfm(x,'monetary',sales))
print(sales.head(5))
import pandas as pd
sales = pd.read_csv('sales.csv')
R = pd.qcut(sales["recency"], 4,labels=["4", "3", "2", "1"]).astype("str")
F = pd.qcut(sales["frequency"], 4, labels=["1", "2", "3", "4"]).astype("str")
M = pd.qcut(sales["monetary"], 4, labels=["1", "2", "3", "4"]).astype("str")
sales['RFMClass'] = R+F+M
print(sales.head(5))
print('\n')
print(sales[sales['RFMClass'] == '444'].sort_values('monetary',ascending= False).head(5))
六、合并
各项目报名人数
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
temp = pd.merge(signup,items,how='inner',on='item_id')
test = temp.groupby('item_name')['employee_id'].count()
print(test)
#inner join 会忽略为0的
#left join 如果项目人数为零
#多项目 signup = pd.concat([1,2])
职能部分运动会某项目的报名信息
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
join = pd.merge(signup,items,how='inner',on='item_id')
temp = join[
(join['department'] == 'functional')
&
(join['item_name'] == 'javelin')
]
print(temp[['employee_id','name','sex']].reset_index(drop=True))
#reset_index作用 刚筛选完的行仍具有原来的index ,让他重新排一下
透视表
import pandas as pd
signup = pd.read_csv("signup.csv")
items = pd.read_csv("items.csv")
df_all = pd.merge(
signup, items, how="left", left_on=signup["item_id"], right_on=items["item_id"]
)
print(
pd.pivot_table(
df_all,
index=["sex", "department"],
columns=["item_name"],
values=["employee_id"],
aggfunc="count",
fill_value=0,
)
)
七、分组聚合
平均次日留存率
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv',usecols=['user_id','date'])
nowcoder.date = pd.to_datetime(nowcoder.date).dt.date
nowcoder.drop_duplicates(inplace=True)
total = nowcoder.user_id.count()
join = pd.merge(nowcoder,nowcoder,on= 'user_id',suffixes=['1','2'])
temp = join[ join.date1-join.date2 == '1 days']
cnt = len(temp.user_id.unique())
print(round(cnt/total,2))
连续三天登录
import pandas as pd
nowcoder=pd.read_csv('nowcoder.csv',usecols=['user_id','date'])
nowcoder.date=pd.to_datetime(nowcoder.date).dt.date
nowcoder=nowcoder.drop_duplicates()
nowcoder['newrank']=nowcoder.date-pd.to_datetime(nowcoder.groupby(['user_id'])['date'].rank(),unit='d').dt.date
res=nowcoder.groupby(['user_id','newrank'])['date'].count()
res = res[res>=3].reset_index('newrank',drop=True).groupby('user_id').max()
print(res)