【牛客编程题】python数据分析49题(pandas库使用)
做题链接:https://www.nowcoder.com/exam/oj?page=1&tab=Python%E7%AF%87&topicId=326
什么是pandas库
在 Python 自带的科学计算库中,Pandas 模块是最适于数据科学相关操作的工具。它与 Scikit-learn 两个模块几乎提供了数据科学家所需的全部工具。
Pandas 的核心是一个高效易用的数据类型:DataFrame。
其他适用的数据类型有:Series,CSV,JSON
参考资料:
参考1:https://www.runoob.com/pandas/pandas-tutorial.html
参考2:gairuo.com/p/pandas-selecting-data
部分常用函数
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv', parse_dates=True, index_col='date')
# 1、查看数据(行,列)
len(Nowcoder.columns) # 列数
Nowcoder[1:6] #1-6行
Nowcoder.loc[10,:] #第10行
Nowcoder.iloc[-5:,[0,1,2,5]] # 最后5行的指定列
Nowcoder.loc[Nowcoder.Language.isin(["CPP", "C", "C#"])]#筛选某个列的条件
Nowcoder.isnull().all() #某一列是否存在空值
Nowcoder.duplicated() # 某一行是否重复
Nowcoder[Nowcoder['Language']=="Python"] # 筛选某个条件
Nowcoder[Nowcoder['Language']=='Python']['Achievement_value'] #筛选某个条件
Nowcoder[(Nowcoder.Language=='Java')&(Nowcoder.Graduate_year==2020)]#筛选多个条件
Nowcoder[Nowcoder.Num_of_exercise>=500][['Level','Achievement_value']]#筛选某个条件
df['name'] # 会返回本列的 Series
df[['Q1', 'Q2']] # 选择两列,返回 DataFrame
df[:2] # 切片前两行数据
df.loc[[0,5,10]] # 指定索引 0,5,10 的行
# 2、数据特征(统计值)
Nowcoder['Language'].value_counts() #对某列进行数据透视(分组统计)
Nowcoder['Continuous_check_in_days'].max() # 某一列的最大值,最小值
Nowcoder[Nowcoder['Language'] == 'Python']['Number_of_submissions'].mean()#某一列满足特定条件的平均数
Nowcoder[Nowcoder['Num_of_exercise'] >=10]['Level'].median() #某一列满足特定条件的中位数
Nowcoder['Language'].nunique() # 某一列的种类数
Nowcoder['Language'].unique() # 某一列有哪些种类(list)
Nowcoder['Level'].mode() # 某一列的众数
Nowcoder[['Achievement_value','Continuous_check_in_days']].quantile(0.25) # 某一列的分位数,前1/4
Nowcoder['Num_of_exercise'].var().round(2) # 某一列的方差
Nowcoder['Number_of_submissions'].std().round(2) #某一列的标准差
Nowcoder[Nowcoder['Level']==7]['Achievement']/Nowcoder['Achievement'].sum() # 满足特定条件的值占总和的比例
Nowcoder['Name'].str.len() # 某一列的每个字符串的长度
# 3、数据清洗(缺失,填充,修改)
Nowcoder[Nowcoder.isna() == False] # 去掉有缺失数据的行
Nowcoder.drop_duplicates(0) # 去除重复的行
Nowcoder['Graduate_year'].fillna(value=Nowcoder['Graduate_year'].max(),inplace=True) #用最大值填充毕业年份
Nowcoder['Language'].fillna(value='Python',inplace=True) # 用Python填充缺失语言
Nowcoder['Achievement_value'].fillna(value=round(Nowcoder['Achievement_value'].mean(),0),inplace=True) # 用平均值填充缺失成就值
Nowcoder['Last_submission_time'] = pd.to_datetime(Nowcoder["Last_submission_time"],format="%Y-%m-%d") # 标准化年份数据
pd.DataFrame(json.loads("1.json")) # 将json转为表格
nowcoder.rename(columns={'date':'year-month-day'},inplace=True)#修改某一列的列名
# 4、数据操作(筛选,合并,排序)
nowcoder.groupby(by='date')['question_id'].count() # 按某一列分组,并对id计数
nowcoder.groupby(by=['result','year-month-day'])['question_id'].count() # 按多列分组筛选,并对id计数
nowcoder.groupby(['result']).result.count() #对某一列分组并计数
Nowcoder.groupby(['Level','Language']).Nowcoder_ID.count() # 对多列分组并计数
Nowcoder.groupby('Level')['Nowcoder_ID'].count() > 5 # 对多列分组计数并筛选
a=pd.merge(items,signup,on='item_id',how='left') #合并两个csv文件
a = pd.concat([signup,signup1],axis=0)
b = pd.merge(a,items,how='left',on='item_id') # 合并3个csv文件
df = pd.merge(signup,items,on="item_id",how="left")
pd.merge(Nowcoder1,Nowcoder2,on='Nowcoder_ID')
df=pd.merge(Nowcoder1, Nowcoder2, on='Nowcoder_ID', how='inner') #合并用户信息
res = sales.sort_values(by=['monetary'], ascending=False) # 对某一列排序
res = res.reset_index(drop=True) # 更新排序后的索引
Nowcoder.sort_values(by='Level', ascending = False) # 对某一列排序(升序)
sales['R_Quartile'] = pd.qcut(sales['recency'], [0, 0.25, 0.5, 0.75, 1],["4", "3", "2", "1"]).astype("int") # 分位数,更新标签,前1/4打4分,后1/4打1分
sales['M_Quartile'] = pd.qcut(sales['monetary'], [0, 0.25, 0.5, 0.75, 1],["1", "2", "3", "4"]).astype("int") # 反向打分
sales['R_Quartile'] = pd.qcut(sales['recency'],4,["4", "3", "2", "1"]).astype("int") # 分位数,更新标签
01 查看数据
DA1 用pandas查看牛客网用户数据
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
print(Nowcoder[1:6])
# print(Nowcoder.head(5))
#import numpy as np
#df1=np.read_csv('Nowcoder.csv')
#print(Nowcoder[1:6,:])
DA2 牛客网用户数据集的大小
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
a = len(Nowcoder)
b = len(Nowcoder.columns)
print((a,b))
DA3 牛客网的第10位用户
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
print(Nowcoder.loc[10,:])
# print(Nowcoder.iloc[10,:])
DA4 统计牛客网部分用户使用语言
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
# loc : Selection by Label ,按标签取数据
print(Nowcoder.iloc[10:20, 5])
02 数据索引
DA5 牛客网用户没有补全的信息
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
# 代用isnull判断,后跟条件any方法到条件
# print(Nowcoder.isnull().any(axis=0))
print(Nowcoder.isnull().all())
DA6 查看牛客网哪些用户使用Python
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
lan = Nowcoder[Nowcoder['Language']=="Python"]
print(lan)
DA7 牛客网Python用户的成就值
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',')
print(Nowcoder[Nowcoder['Language']=='Python']['Achievement_value'] )
DA8 文件最后用户的部分数据
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
#print(Nowcoder[['Nowcoder_ID','Level','Achievement_value','Language']].tail(5))
print(Nowcoder.iloc[-5:,[0,1,2,5]])
03 逻辑运算
DA9 2020年毕业的人中最喜欢用Java的用户
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',')
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
print(Nowcoder[(Nowcoder.Language=='Java')&(Nowcoder.Graduate_year==2020)])
DA9 牛客网C系用户们的信息
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',')
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
print(Nowcoder.loc[Nowcoder.Language.isin(["CPP", "C", "C#"])])
DA10 统计牛客网刷题数量500以上的大佬
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',')
print(Nowcoder[Nowcoder.Num_of_exercise>=500][['Level','Achievement_value']])
DA11 按照毕业年份与使用语言筛选牛客网7级用户
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
print(Nowcoder[(Nowcoder.Language=='CPP')&(Nowcoder.Level==7)&(Nowcoder.Graduate_year!=2018)])
04 中级函数
DA12 牛客网不同语言使用人数
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder['Language'].value_counts())
DA13 牛客网用户最近的最长与最短连续签到天数
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder['Continuous_check_in_days'].max())
print(Nowcoder['Continuous_check_in_days'].min())
DA14 Python用户的平均提交次数
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(round(Nowcoder[Nowcoder['Language'] == 'Python']['Number_of_submissions'].mean(),1))
DA15 牛客网用户等级的中位数
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(int(Nowcoder[Nowcoder['Num_of_exercise'] >=10]['Level'].median()))
DA16 用户常用语言有多少
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder['Language'].nunique())
print(list(Nowcoder['Language'].unique()))
DA17 牛客网最多的用户等级
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
res = Nowcoder['Level'].mode() #求众数
print(pd.DataFrame(res,columns = ['Level'])) #输出格式
DA18 用分位数分析牛客网用户活动
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder[['Achievement_value','Continuous_check_in_days']].quantile(0.25))
print(Nowcoder[['Num_of_exercise','Number_of_submissions']].quantile(0.75))
DA19 牛客网大佬之间的差距
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
mx = Nowcoder[Nowcoder['Level']== 7]['Achievement_value'].max()
mi = Nowcoder[Nowcoder['Level'] == 7]['Achievement_value'].min()
print(mx-mi)
DA20 牛客用户刷题量的方差与提交次数的标准差
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder['Num_of_exercise'].var().round(2)) #方差
print(Nowcoder['Number_of_submissions'].std().round(2)) #标准差
DA21 大佬用户成就值比例
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
total=Nowcoder['Achievement_value'].sum()
print(Nowcoder[Nowcoder['Level']==7]['Achievement_value']/total)
DA22 牛客网用户最高的正确率
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
# 刷题数>10的用户的刷题数,提交数
a = Nowcoder['Num_of_exercise'][Nowcoder['Num_of_exercise'] > 10]
b = Nowcoder['Number_of_submissions'][Nowcoder['Num_of_exercise'] > 10]
print((a/b).max().round(3))
DA23 统计牛客网用户的名字长度
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder['Name'].str.len())
05 数据清洗
DA24 去掉信息不全的用户
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',', dtype=object)
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
print(Nowcoder[Nowcoder.isna() == False])
DA25 修补缺失的用户数据
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
# 用当前的最大年份填充缺失的毕业年份(“Graduate_year”)
max_year = Nowcoder['Graduate_year'].max()
Nowcoder['Graduate_year'].fillna(value=max_year,inplace=True)
# 用Python填充缺失的常用语言(“Language”)
Nowcoder['Language'].fillna(value='Python',inplace=True)
# 用成就值的均值(四舍五入保留整数)填充缺失的成就值(“Achievement_value”)。
mean_value = round(Nowcoder['Achievement_value'].mean(),0)
Nowcoder['Achievement_value'].fillna(value=mean_value,inplace=True)
print(Nowcoder)
DA26 解决牛客网用户重复的数据
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',', dtype=object)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
print(Nowcoder.duplicated()) #输出每一行是否重复
print(Nowcoder.drop_duplicates(0))#输出去重后的文件
DA27 统一最后刷题日期的格式
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv',sep=',',dtype=object)
Nowcoder['Last_submission_time'] = pd.to_datetime(Nowcoder["Last_submission_time"],format="%Y-%m-%d")
print(Nowcoder[['Nowcoder_ID','Level','Last_submission_time']])
06 Json处理
DA28 将用户的json文件转换为表格形式
import pandas as pd
import json
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
with open('Nowcoder.json', 'r') as f:
data = json.loads(f.read())
print(pd.DataFrame(data))
07 分组聚合
DA29 牛客网的每日练题量
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv', parse_dates=True, index_col='date')
print(nowcoder.groupby(by='date')['question_id'].count())
DA30 牛客网用户练习的平均次日留存率
import pandas as pd
from datetime import timedelta
nowcoder = pd.read_csv('nowcoder.csv')
# 某天练习后第二天还会再来的用户数除以user_id总次数
df = pd.merge(nowcoder, nowcoder, on='user_id', suffixes=['_a','_b'])
df.date_a = pd.to_datetime(df.date_a).dt.date
df.date_b = pd.to_datetime(df.date_b).dt.date
df = df[(df.date_a+timedelta(days=1))==df.date_b]
all_num = nowcoder.user_id.count()
again_num = df.user_id.count()
print(round(again_num/all_num,2))
DA31 牛客网每日正确与错误的答题次数
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv')
# 2021年12月答题结果正确和错误的前提下每天的答题次数
nowcoder.rename(columns={'date':'year-month-day'},inplace=True)
print(nowcoder.groupby(by=['result','year-month-day'])['question_id'].count())
DA32 牛客网答题正误总数
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv')
print(nowcoder.groupby(['result']).result.count())
DA33 牛客网连续练习题目3天及以上的用户
import pandas as pd
from datetime import timedelta
nowcoder = pd.read_csv('nowcoder.csv')
df = nowcoder.groupby('user_id')["user_id"].count()
print(df[df>=3])
DA34 牛客网不同毕业年份的大佬
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder.groupby(by=['Graduate_year'])['Achievement_value'].max())
DA35 不同等级用户语言使用情况
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder.groupby(['Level','Language']).Nowcoder_ID.count())
DA36 总人数超过5的等级
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder.groupby('Level')['Nowcoder_ID'].count() > 5)
08 合并
DA37 统计运动会项目报名人数
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
print(items.groupby("item_name").size() > 0)
DA38 统计运动会项目报名人数(二)
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
a=pd.merge(items,signup,on='item_id',how='left')
print(a.groupby('item_name')['employee_id'].count())
DA39 多报名表的运动项目人数统计
import pandas as pd
signup = pd.read_csv('signup.csv')
signup1 = pd.read_csv('signup1.csv')
items = pd.read_csv('items.csv')
a = pd.concat([signup,signup1],axis=0)
b = pd.merge(a,items,how='left',on='item_id')
print(b.groupby('item_name')['employee_id'].count())
DA40 统计职能部分运动会某项目的报名信息
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
df = pd.merge(signup,items,on="item_id")
df1 = df[(df["department"] == "functional") & (df["item_name"] == "javelin")]
df2 = df1[["employee_id","name","sex"]]
print(df2)
DA41 运动会各项目报名透视表
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
df = pd.merge(signup,items,on="item_id",how="left")
print(df.groupby(['sex', 'department','item_name'])[['employee_id']].count().unstack().fillna(0))
DA42 合并用户信息表与用户活跃表
import pandas as pd
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
Nowcoder1 = pd.read_csv('Nowcoder1.csv', sep=',')
Nowcoder2 = pd.read_csv('Nowcoder2.csv', sep=',')
print(pd.merge(Nowcoder1,Nowcoder2,on='Nowcoder_ID'))
DA43 两份用户信息表格中的查找
import pandas as pd
Nowcoder1 = pd.read_csv('Nowcoder1.csv', sep=',')
Nowcoder2 = pd.read_csv('Nowcoder2.csv', sep=',')
df=pd.merge(Nowcoder1, Nowcoder2, on='Nowcoder_ID', how='inner')
print(df[['Name','Num_of_exercise','Number_of_submissions']])
09 排序
DA44 某店铺消费最多的前三名用户
import pandas as pd
sales = pd.read_csv('sales.csv')
res = sales.sort_values(by=['monetary'], ascending=False)
res = res.reset_index(drop=True)
print(res.head(3))
DA45 按照等级递增序查看牛客网用户信息
import pandas as pd
pd.set_option('display.width', 300) # 设置字符显示宽度
pd.set_option('display.max_rows', None) # 设置显示最大行
pd.set_option('display.max_columns', None)
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
print(Nowcoder.sort_values(by='Level', ascending = False))
010 函数
DA46 某店铺用户消费特征评分
import pandas as pd
sales = pd.read_csv('sales.csv')
sales['R_Quartile'] = pd.qcut(sales['recency'], [0, 0.25, 0.5, 0.75, 1],["4", "3", "2", "1"]).astype("int")
sales['F_Quartile'] = pd.qcut(sales['frequency'], [0, 0.25, 0.5, 0.75, 1],["1", "2", "3", "4"]).astype("int")
sales['M_Quartile'] = pd.qcut(sales['monetary'], [0, 0.25, 0.5, 0.75, 1],["1", "2", "3", "4"]).astype("int")
print(sales.head())
# import pandas as pd
# sales = pd.read_csv('sales.csv')
# sales['R_Quartile'] = pd.qcut(sales['recency'],4,["4", "3", "2", "1"]).astype("int")
# sales['F_Quartile'] = pd.qcut(sales['frequency'], 4,["1", "2", "3", "4"]).astype("int")
# sales['M_Quartile'] = pd.qcut(sales['monetary'], 4,["1", "2", "3", "4"]).astype("int")
# print(sales.head())
DA47 筛选某店铺最有价值用户中消费最多前5名
import pandas as pd
sales = pd.read_csv('sales.csv')
R = pd.qcut(sales["recency"], 4,labels=["4", "3", "2", "1"]).astype("str")
F = pd.qcut(sales["frequency"], 4, labels=["1", "2", "3", "4"]).astype("str")
M = pd.qcut(sales["monetary"], 4, labels=["1", "2", "3", "4"]).astype("str")
sales['RFMClass'] = R+F+M
print(sales.head(5))
print('\n')
print(sales[sales['RFMClass'] == '444'].sort_values('monetary',ascending= False).head(5))