Python数据分析-牛客网例题

一、查看数据&读入json文件

1-4

Nowcoder = pd.read_csv('Nowcoder.csv',dtype=object,use_cols = ['user_id','date'])

Summary:

  1. dataframe的具体行列查询用loc&iloc,区分 i为index

  1. 读入df时用use_cols可以筛选一下需要的列

读入json

import pandas as pd
import json

pd.set_option('display.width', 300)  # 设置字符显示宽度
pd.set_option('display.max_rows', None)  # 设置显示最大行
pd.set_option('display.max_columns', None)
with open('Nowcoder.json', 'r') as f:
    data = json.loads(f.read())
print(pd.DataFrame(data))

二、数据索引

  1. 查看数据每列是否有空值

import pandas as pd 
file = pd.read_csv('Nowcoder.csv',dtype=object)
print(file.isna().any())
  1. 筛选数据 eg查看Language为python的用户

import pandas as pd 
file = pd.read_csv('Nowcoder.csv',dtype=object)
print(file[file['Language']=='Python'])
print(file[file['Language']=='Python']['Achievement_value'])
print(file[file['Language']=='Python'].Achievement_value)

三、数据清洗

  1. 删除信息不全用户

import pandas as pd

Nowcoder = pd.read_csv('Nowcoder.csv', sep=',', dtype=object)
pd.set_option('display.width', 300)  # 设置字符显示宽度
pd.set_option('display.max_rows', None)  # 设置显示最大行
pd.set_option('display.max_columns', None)
Nowcoder.dropna()
print(Nowcoder)

dropna参数

axis:判断方向 0行1列 默认0

how:any 存在nan即删,all全为nan则删

thresh:instance1 = [1,nan,3] instance2 = [1,nan,nan],若thresh=2则删除2保留1

subset:如subset=['a','b'],则如果某数据列a和列b中存在缺失,则丢弃

inplace:是否在原df进行修改,显然为了安全默认False

  1. 数据替换

import pandas as pd

Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
pd.set_option('display.width', 300)  # 设置字符显示宽度
pd.set_option('display.max_rows', None)  # 设置显示最大行
pd.set_option('display.max_columns', None)

max_year = Nowcoder.Graduate_year.max()
mean_ach = Nowcoder.Achievement_value.mean()
values = {"Graduate_year":max_year,
          "Language":'Python',
          "Achievement_value":mean_ach
         }
Nowcoder.fillna(value=values,inplace = True)
print(Nowcoder)
#method 插值方向 forward前向ffill backward后向bfill

3. 删除重复数据

只记drop_duplicates就好了

subset:默认为全列,如subset=['a','b']即使其余数据不同,ab相同则删除

inplacce:默认True

四、逻辑运算


print(Nowcoder.loc[Nowcoder['Num_of_exercise']>=500, ['Level','Achievement_value']])
print(Nowcoder.loc[(Nowcoder['Language']=='Java')&(Nowcoder['Graduate_year']==2020),:])
print(Noecoder[Nowcoder['Language'].isin(['CPP','C','C#'])])
print(Nowcoder.query('Language == "CPP" & Level >= 7 & Graduate_year != 2018'))

五、函数

  1. 所占比例

import pandas as pd

Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
seven=Nowcoder[Nowcoder['Level']==7]['Achievement_value']#筛选出7级用户的成就值
total=Nowcoder['Achievement_value'].sum()#计算出所有用户的成就值之和
print(seven/total)
  1. 排序

import pandas as pd
sales = pd.read_csv('sales.csv')

data = sales.sort_values(by='monetary', ascending=False).reset_index(drop=True)[:3]
print(data)
  1. 用户特征评分

import pandas as pd
sales = pd.read_csv('sales.csv')

def rfm(x,y,data):

    a = data[y].quantile(0.25)
    b = data[y].quantile(0.5)
    c = data[y].quantile(0.75)

    if y == 'frequency':
        if x <=a:
            return 4
        elif x<=b:
            return 3
        elif x <=c:
            return 2
        elif c<x:
            return 1
    else:
        if x <=a:
            return 1
        elif x<=b:
            return 2
        elif x<=c:
            return 3
        elif c<x:
            return 4

sales['R_Quartile'] = sales['recency'].apply(lambda x: rfm(x,'recency',sales))
sales['F_Quartile'] = sales['frequency'].apply(lambda x: rfm(x,'frequency',sales))
sales['M_Quartile'] = sales['monetary'].apply(lambda x: rfm(x,'monetary',sales))
print(sales.head(5))
import pandas as pd
sales = pd.read_csv('sales.csv')
R = pd.qcut(sales["recency"], 4,labels=["4", "3", "2", "1"]).astype("str")
F = pd.qcut(sales["frequency"], 4, labels=["1", "2", "3", "4"]).astype("str")
M = pd.qcut(sales["monetary"], 4, labels=["1", "2", "3", "4"]).astype("str")
sales['RFMClass'] = R+F+M
print(sales.head(5))
print('\n')
print(sales[sales['RFMClass'] == '444'].sort_values('monetary',ascending= False).head(5))

六、合并

  1. 各项目报名人数

import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
temp = pd.merge(signup,items,how='inner',on='item_id')
test = temp.groupby('item_name')['employee_id'].count()
print(test)

#inner join 会忽略为0的
#left join 如果项目人数为零
#多项目 signup = pd.concat([1,2])
  1. 职能部分运动会某项目的报名信息

import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
join = pd.merge(signup,items,how='inner',on='item_id')
temp = join[
    (join['department'] == 'functional')
    &
    (join['item_name'] == 'javelin')
]
print(temp[['employee_id','name','sex']].reset_index(drop=True))

#reset_index作用 刚筛选完的行仍具有原来的index ,让他重新排一下
  1. 透视表

import pandas as pd

signup = pd.read_csv("signup.csv")
items = pd.read_csv("items.csv")

df_all = pd.merge(
    signup, items, how="left", left_on=signup["item_id"], right_on=items["item_id"]
)

print(
    pd.pivot_table(
        df_all,
        index=["sex", "department"],
        columns=["item_name"],
        values=["employee_id"],
        aggfunc="count",
        fill_value=0,
    )
)

七、分组聚合

  1. 平均次日留存率

import pandas as pd 
nowcoder = pd.read_csv('nowcoder.csv',usecols=['user_id','date'])
nowcoder.date = pd.to_datetime(nowcoder.date).dt.date
nowcoder.drop_duplicates(inplace=True)
total = nowcoder.user_id.count()
join = pd.merge(nowcoder,nowcoder,on= 'user_id',suffixes=['1','2'])
temp = join[ join.date1-join.date2 == '1 days']
cnt = len(temp.user_id.unique())
print(round(cnt/total,2))
  1. 连续三天登录

import pandas as pd

nowcoder=pd.read_csv('nowcoder.csv',usecols=['user_id','date'])
nowcoder.date=pd.to_datetime(nowcoder.date).dt.date
nowcoder=nowcoder.drop_duplicates()
nowcoder['newrank']=nowcoder.date-pd.to_datetime(nowcoder.groupby(['user_id'])['date'].rank(),unit='d').dt.date
res=nowcoder.groupby(['user_id','newrank'])['date'].count()
res = res[res>=3].reset_index('newrank',drop=True).groupby('user_id').max()
print(res)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值