python类型的操作

1、list

     list连接

    list = list1 +list2

# Flatten the per-row lists stored in the coreEmotions_list column into one list.
# Iterating the column values directly avoids a label lookup per row, which
# would return a Series (not a single list) if the index had duplicate labels.
sen_list = []
for emotions in tqdm(data['coreEmotions_list']):
    sen_list.extend(emotions)

# Frequency of each collected item. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.Series(sen_list).value_counts()

    list交集(实现见下文 pandas 部分的 list_intersection 函数)

2、dict

3、pandas

dataframe apply使用两列操作

def difflib_similarity(str1, str2):
    """Return difflib's quick similarity ratio between two sequences.

    Args:
        str1: First sequence (typically a string).
        str2: Second sequence (typically a string).

    Returns:
        The value of ``SequenceMatcher.quick_ratio()`` for the pair,
        a float in the range [0, 1].
    """
    matcher = difflib.SequenceMatcher(None, str1, str2)
    return matcher.quick_ratio()

# Row-wise similarity between the 'prefix' and 'title' columns.
# With raw=True each row reaches the lambda as a plain array, so the two
# values are read by position rather than by column label.
data['prefix_title_sim'] = data[['prefix', 'title']].apply(
    lambda pair: difflib_similarity(pair[0], pair[1]),
    axis=1,
    raw=True,
)

apply 设置 raw=True 后,每一行以 ndarray(而不是 Series)传入函数,速度会快很多;此时只能用位置下标(如 row[0]、row[1])取值。

def list_intersection(listA, listB):
    """Return the distinct elements that occur in both iterables.

    Args:
        listA: First iterable of hashable items.
        listB: Second iterable of hashable items.

    Returns:
        A list of the common elements, without duplicates. The order is
        whatever set iteration yields and should not be relied upon.
    """
    common = set(listA) & set(listB)
    return list(common)

# For every row, store the elements shared by its 'query' and 'title' values.
df['query_title'] = df.apply(
    lambda row: list_intersection(row['query'], row['title']),
    axis=1,
)
df

找出空值的行 

# Select every row that contains at least one missing value.
# Reducing the null mask with any(axis=1) yields one boolean per row, so each
# matching row appears exactly once; indexing with the full 2-D mask repeats
# a row once per missing cell.
press_train[press_train.isnull().any(axis=1)]

4、numpy

返回最大的 k 个值的下标(按值从大到小排列)

# Indices of the top_k largest entries in the first row of `result`, largest
# first: argsort() orders indices, [0] takes the first row, [::-1] reverses
# the order, and [:top_k] keeps the leading top_k indices.
# NOTE(review): assumes `result` is a 2-D NumPy-like array whose argsort()
# sorts ascending along the last axis — confirm against the caller.
arr = result.argsort()[0][::-1][:top_k]

5、groupby 操作

# Per-query summary statistics of title_length, broadcast back onto every row
# through a left merge on query_id.
df = data.groupby('query_id')['title_length'].agg(
    ['min', 'mean', 'max', 'sum', 'std', 'median', 'count']
)
data = pd.merge(data, df, how='left', on='query_id')

# Rank of title_length within each query. rank() returns a Series indexed like
# `data`, so assigning it directly keeps rows aligned without a
# reset_index/rename/concat round trip.
data['rank'] = data.groupby('query_id')['title_length'].rank()
# Feature columns that receive per-query statistics and a within-query rank.
cols = [
    'title_length',
    'query_title_length',
    'q_len/t_len',
    'qt_len/q_len',
    'qt_len/t_len',
    'tfidf_cos',
    'doc_cos_qt',
]

for c in cols:
    # Per-query summary statistics of column c, named "<c>_<stat>".
    df = (
        data.groupby('query_id')[c]
        .agg(['min', 'mean', 'max', 'sum', 'std', 'median', 'count'])
        .add_prefix(c + '_')
    )
    data = pd.merge(data, df, how='left', on='query_id')

    # Rank of column c within each query. The Series returned by rank() is
    # indexed like `data`, so direct assignment keeps the rows aligned.
    data[c + '_rank'] = data.groupby('query_id')[c].rank()
def dict_join(df):
    """Build an ``{itemID: rating}`` mapping from a ratings frame.

    Args:
        df: DataFrame containing at least the ``itemID`` and ``rating``
            columns; other columns are ignored.

    Returns:
        A dict keyed by the ``itemID`` values whose values are the matching
        ``rating`` entries.
    """
    ratings_by_item = df[['itemID', 'rating']].set_index(['itemID'])
    return ratings_by_item.to_dict(orient="dict")['rating']

# Collapse the ratings of each user (uid) into a single {itemID: rating} dict,
# then turn the uid group key back into a regular column.
df_train = (
    df_train
    .groupby('uid')
    .apply(dict_join)
    .reset_index()
)

# Toy data: three queries with binary labels and predicted scores.
d = {
    'query_id': [1, 1, 1, 2, 2, 3, 3],
    'y_true': [0, 1, 0, 1, 0, 1, 0],
    'y_pred': [0.2, 0.6, 0.6, 0.7, 0.4, 0.8, 0.3],
}
df = pd.DataFrame(d)

from sklearn.metrics import roc_auc_score

def q_auc(x):
    """Attach the ROC AUC of a group to every row of that group.

    Intended to be passed to ``DataFrame.groupby(...).apply``.

    Args:
        x: DataFrame with ``y_true`` and ``y_pred`` columns.

    Returns:
        A new DataFrame equal to ``x`` plus an ``auc_i`` column that holds
        ``roc_auc_score(x['y_true'], x['y_pred'])`` on every row.
    """
    # Return a new frame rather than writing into the argument: pandas does
    # not support functions that mutate the object handed to groupby.apply.
    return x.assign(auc_i=roc_auc_score(x['y_true'], x['y_pred']))

# One ROC AUC per query, then the mean over queries.
# Only the label/score columns are handed to apply, so the result does not
# depend on whether pandas passes the grouping column to the function;
# query_id comes back as a regular column through reset_index().
df = (
    df.groupby('query_id')[['y_true', 'y_pred']]
    .apply(lambda group: roc_auc_score(group['y_true'], group['y_pred']))
    .rename('auc_i')
    .reset_index()
)
np.mean(df['auc_i'])
# Cast every column of X whose dtype is object or category to 'category';
# all other columns are left as they are.
for c in X.columns:
    col_type = X[c].dtype
    if not (col_type == 'object' or col_type.name == 'category'):
        continue
    X[c] = X[c].astype('category')


# Cast each listed feature column to the 'category' dtype, one column per
# iteration, so the loop variable is actually used and no column is
# converted more than once.
for c in categorical_feats:
    train[c] = train[c].astype('category')

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值