Python 杂记与 DataFrame 常见操作（筛选、groupby、str.contains、时间差）

最新推荐文章于 2024-06-06 15:32:24 发布

贪狼切

最新推荐文章于 2024-06-06 15:32:24 发布

阅读量3.2k

点赞数

分类专栏： python数据分析与处理文章标签： python 数据分析数据处理

本文链接：https://blog.csdn.net/tanlangqie/article/details/119855444

版权

python数据分析与处理专栏收录该内容

24 篇文章 5 订阅

订阅专栏

black_list = df_filter.query("is_black_x == '1'")['open_id_x'].unique().tolist()

nan与none相关

#1 查看含有nan的行
df[df.isna().any(axis=1)]

删除有nan的行
df.drop(df[np.isnan(df.group_click_num)&np.isnan(df.individual_click_num)].index, inplace=True)

#判断每列是否有缺失值
data.isnull().any()

#填充空值（fillna 默认返回新对象，不会修改原数据，需要赋值回去才生效）
data['device_brand'] = data['device_brand'].fillna(0)

#2，查看不含有nan的行
df[~df.isna().any(axis=1)]

#过滤掉列表中的none值（注意：filter(None, lst) 会同时去掉 0、''、False 等所有假值）
new_lst_2 = list(filter(None,lst))

#筛选出有空值的行
kong = data[data.isnull().T.any()]
#无空值的行
data[data.isnull().T.any()==False]

# Keep only the rows that hold None in at least one column. None is detected
# through its string form 'None', so a literal 'None' string matches as well.
# NOTE: this filter and the next one are alternatives; running both in
# sequence on the same frame leaves it empty.
tt = tt[tt.astype(str).eq('None').any(axis=1)]
# Keep only the rows that hold no None in any column.
# axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
tt = tt[~tt.astype(str).eq('None').any(axis=1)]

#输出df
res.to_excel(‘wechatid拉黑openid.xlsx’,index=False,engine=‘xlsxwriter’)

np数组填充
raw_data[np.where(raw_data==None)] = 0

#df计数
df[‘a’].value_counts()

#重命名
black_fes.columns = [‘zw.openid’,‘city_equal’,‘ip_num’]

求时间差

from datetime import datetime, date
a_time = datetime.strptime(str(‘2020-11-18 23:59:52’), “%Y-%m-%d %H:%M:%S”)
b_time = datetime.strptime(str(‘2020-11-17 11:23:30’), “%Y-%m-%d %H:%M:%S”)
(a_time - b_time)/606021

上下/左右合并

# Merge side by side: join the two frames on the shared 'zw.openid' column.
black_fes = pd.merge(black_fes, tt, on='zw.openid')
# Stack vertically and renumber the resulting index from 0.
t = pd.concat([df1, df2], ignore_index=True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Without ignore_index the original row labels are kept, as append did.
result = pd.concat([df1, df2])

#列表对应位置相加
print(np.sum([a,b,c], axis = 0))

#array上下合并
c = np.vstack((a,b))

#左右合并
d = np.hstack((a,b))

#df按照某列值进行排序
data.sort_values(by=‘时间’, ascending=True,inplace=True)

包含 .str.contains

#某一列不包含某个字符串
groups_all[~groups_all[‘group_id’].str.contains(‘chatroom’)]

#包含某个列表
t = ['a','b']
tt = df['one'].str.contains('|'.join(t))
#得到True或False组成的Series

值筛选

筛选出openid列在id_list中的数据
train_seq_df.loc[train_seq_df[‘open_id’].isin(id_list)]

值筛选+值替换（将满足条件的行的push_type列设置为2）

data.loc[(data.task_type==2)|((data.task_type==3)&(data.risk_strategy==0)),'push_type']=2

#保存至csv
black_fes.to_csv(“black_fes.csv”,index = False)
print(black_fes.head())

#值筛选
print(white_fes.loc[white_fes[‘ip_num’]==95,‘zw.openid’])
#+产生新列
df.loc[df.two < 3,‘white’] = 1
df.fillna(0)

#打乱顺序
data = data.reindex(np.random.permutation(data.index)) #

#apply生成新列
data_r[‘risk’] = data_r.apply(lambda x : risk_fun(x[‘all_code_num’],x[‘code_num’],x[‘risk_code_num’],x[‘un_time_num’]),axis = 1)

#转换数据类型
data1[‘f2’] = data1[‘f2’].apply(lambda x : int(x))

聚合

#生成一列sum_age 对age 进行累加
df[‘sum_age’] = df[‘age’].cumsum()

#分组聚合
def fun_hour(x):
    """Count the entries of ``x`` whose string form is one of '0' to '6'.

    Each entry is converted with ``str()`` before comparison, so the
    integer ``3`` and the string ``'3'`` both count, while ``3.0``
    (string form ``'3.0'``) and ``'03'`` do not.

    Args:
        x: Any iterable of values, e.g. the per-group values handed to a
            groupby aggregation.

    Returns:
        The number of matching entries as an int; 0 for empty input.
    """
    wanted = {'0', '1', '2', '3', '4', '5', '6'}
    return sum(1 for v in x if str(v) in wanted)
m = tt[‘zw.hour’].groupby(tt[‘zw.openid’]).agg(fun_hour)

选取3列数据，按照’from_username’,'group_id’分组，对wechat_id聚合计数
badcase[[‘wechat_id’,‘from_username’,‘group_id’]].drop_duplicates().groupby([‘from_username’,‘group_id’],as_index=False)[‘wechat_id’].count()

#series 2 dataframe
t = t.reset_index()
t.columns=[‘group_id’,‘tt’]
t

group by

df1[‘近半年落地页操作数’] = df.groupby(by = ‘open_id’)[‘uid’].count().tolist()

g1 = data.groupby(['one']).size() 近似等价于 g1 = data.groupby(['one'])['two'].count()（区别：size() 统计每组的行数，包含 NaN；count() 只统计 'two' 列的非空值个数）

#分组求均值
test = raw_data.groupby(‘open_id’).mean()

#求历史特征值大于零的个数
tp = pd.DataFrame(df.groupby(‘uid’).apply(
lambda df:np.where(df[i]>0,1,0).sum()).reset_index())

分组后取每组中日期最大的那一条数据
find latest comment index
idx = df_cmt.groupby([‘sku_id’])[‘dt’].transform(max) == df_cmt[‘dt’]
df_cmt = df_cmt[idx]

分组取top-k

df.sort_values(‘C’, ascending=False).groupby(‘B’).head(2)

分组排序

https://blog.csdn.net/baidu_38409988/article/details/102668006
data[‘rank’] = data.groupby([‘Name_y’])[‘Salary’].rank(ascending=False,method=‘dense’)

#重命名
m.columns = [‘zw.openid’,‘op_count’]
m.rename(columns={‘op_count’:‘op_type_count’},inplace=True)

#数据去重
tt = data_b_frame.drop_duplicates(subset=[‘zw.openid’,‘zw.day’,‘zw.hour’],keep=‘first’)

# Create an empty DataFrame with the target columns, then add the rows of
# another DataFrame to it.
group_21 = pd.DataFrame(columns=['群', '总推码数', '21日推码数'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
group_21 = pd.concat([group_21, result_df21], ignore_index=True)

#删除某列
tt.drop(columns=[‘zw.net_type’],inplace= True)

#将 true false转换为 0 1
data[‘is_mobile’] = data[‘is_mobile’].astype(int)

#df 排序
result_df[result_df[‘耗时2s内’] < 0.6].sort_values(by=‘平均发言’, ascending=False)

#删除老的索引，生成新的索引
data_time.reset_index(drop=True,inplace=True)

#计数
from collections import Counter
hour_count = Counter(tt[‘zw.hour’])
print(hour_count)

#对计数结果（字典）进行排序从小到大
t = sorted(hour_count.items(),key=lambda x:x[1],reverse=False)

#输出数组形状
x.shape

#上周五
friday = tomorrow - datetime.timedelta(days=tomorrow.weekday()) + datetime.timedelta(days=4, weeks=-1)

上周五到本周四
week_first_day = (tomorrow - datetime.timedelta(days=tomorrow.weekday()+3)).strftime("%Y-%m-%d")
week_last_day = (tomorrow - datetime.timedelta(days=tomorrow.weekday()-3)).strftime("%Y-%m-%d")

datetime计算时间差

不同天的时间差
# Imported here so the snippet runs on its own.
from datetime import datetime

# Two timestamps that fall on different calendar days.
time_1 = '2020-03-02 15:00:00'
time_2 = '2020-03-03 16:00:00'

time_1_struct = datetime.strptime(time_1, "%Y-%m-%d %H:%M:%S")
time_2_struct = datetime.strptime(time_2, "%Y-%m-%d %H:%M:%S")

# Getting the seconds of the difference: timedelta.seconds only covers the
# hours/minutes/seconds part and leaves out whole days, whereas
# total_seconds() includes the days. total_seconds() is therefore correct
# both for same-day and for cross-day differences.
total_seconds = (time_2_struct - time_1_struct).total_seconds()
print('不同天的秒数为：')
print(int(total_seconds))

# Same difference expressed in minutes.
min_sub = total_seconds / 60
print('不同天的分钟数为：')
print(int(min_sub))

dataframe中求相差天数

test= (raw_data['punish_date'] - raw_data['join_time']).dt.days
#相差秒数（注意：.dt.seconds 不包含天数部分；时间差可能跨天时请改用 .dt.total_seconds()）
(data['brush_time'] - data['push_time']).dt.seconds

dic.get()

# Tally how often each element occurs. dict.get() supplies a default of 0
# the first time an element is seen, so no membership check is needed.
list1 = [1, 2, 3, 'a', 'a', 'c', 4, 6, 2, 3, 1, 1, 1, 1]
dict1 = {}
for item in list1:
    seen_so_far = dict1.get(item, 0)
    dict1[item] = seen_so_far + 1
print(dict1)

计算分位数

# Print the 10th, 20th, ..., 90th percentile of the last column of bb.
last_column = sorted(bb[:, -1])
for i in range(10, 100, 10):
    print(np.percentile(last_column, i))

df根据筛选的结果产生新的一列

result_group.loc[(result_group.create_time < result_group.operatortime), 'group_flag'] = 1

#df字典映射
browser_family 是一个字典
data[‘browser_family’] = data[‘browser_family’].map(browser_family)

df数据描述

下面是一些描述整体信息常用的方法：

df.info()方法：               可以查看（数据有多少行多少列；各个属性的类型object 、int32 、int64等）

df.head(number)方法：  可以查看（前number行的数据值）

df.sample(n)方法      ：   （随机的查看几个样本）

df.shape属性：                可以查看（数据有多少行多少列）

df.describe()方法：默认情况下只显示出来数值类型的数据情况，可以查看（数据的统计情况如均值、标准差、最大最小值、分位数）

df.describe(include='all')   加入include参数以后，可以显示所有数据的情况，显示出来的信息如下（count、unique、top、freq 还有上述默认情况下的信息）

df.isnull().sum()                可以查看（每个列中有多少个nan值）

one-hot编码

# One-hot encode every column of df. With handle_unknown='ignore', a
# category not seen during fit does not raise at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df)
# toarray() turns the encoder's output into a dense array for printing.
res = enc.transform(df).toarray()
print(res)
# Name of each output column. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
print(enc.get_feature_names_out())

多维数组求均值

import numpy as np

# A 2 x 2 x 3 array: two 2-D arrays of two rows and three columns each.
a = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
separator = '--------'

print(a)
print(a.shape)
print(separator)
# First row of each 2-D array (index -0 is the same as index 0).
print(a[:, 0, ...])

# Mean along each axis in turn:
#   axis 0 averages the two 2-D arrays position by position,
#   axis 1 averages the rows (top/bottom) inside each 2-D array,
#   axis 2 averages the columns (left/right) inside each 2-D array.
for axis in (0, 1, 2):
    print(separator)
    print(a.mean(axis))

tensorflow

tf.where(condition,x=None,y=None,name=None)
逐元素选择：condition 为 True 的位置返回 x 中对应的元素，否则返回 y 中对应的元素；若 x、y 都不提供，则返回 condition 中为 True 的元素的坐标


tf.greater(a,b)
功能：逐元素比较 a 是否大于 b（a > b）
返回值：一个布尔类型的张量，元素值为 True 或 False

reduce_sum( ) 是求和函数，在 tensorflow 里面，计算的都是 tensor，可以通过调整 axis =0,1 的维度来控制求和维度。

贪狼切

关注

0
点赞
踩
12

收藏

觉得还不错? 一键收藏
0
评论
python杂记与dataframe常见操作(筛选，groupby,str.contain,时间差）

black_list = df_filter.query(“is_black_x == ‘1’”)[‘open_id_x’].unique().tolist()nan与none相关#1 查看含有nan的行df[df.isna().any(axis=1)]删除有nan的行df.drop(df[np.isnan(df.group_click_num)&np.isnan(df.individual_click_num)].index, inplace=True)#判断每列是否有缺失指data
复制链接

扫一扫