pandas编程技巧

pandas编程技巧

  1. drop_duplicates 删除重复行

    from pandas import DataFrame, Series
    
    # Sample orders: order_id 1001 appears twice (first and last record).
    order_ids = [1001, 1002, 1003, 1001]
    names = ["wangwu", "zhangsan", "lisi", "wang wu"]
    info = [
        {"order_id": order_id, "name": name}
        for order_id, name in zip(order_ids, names)
    ]

    df = DataFrame(info)
    print(df)

    # drop_duplicates removes duplicated rows. Parameters:
    #   subset  : only look at this column when deciding what is a duplicate
    #   keep    : {'first', 'last', False}, default 'first' -- by default the
    #             first occurrence is kept and the later duplicates are dropped
    #   inplace : bool, default False -- modify the original data directly
    #             instead of working on a copy
    dedup_options = {"subset": "order_id", "keep": "first"}
    df = df.drop_duplicates(**dedup_options)
    print("删除重复行后:", "*" * 30)
    print(df)
    
    -------------------------------------------结果------------
       order_id      name
    0      1001    wangwu
    1      1002  zhangsan
    2      1003      lisi
    3      1001   wang wu
    删除重复行后: ******************************
       order_id      name
    0      1001    wangwu
    1      1002  zhangsan
    2      1003      lisi
    
  2. 将DataFrame转化为字典dict

    from pandas import DataFrame, Series
    
    # Sample records; order_id 1001 appears twice.
    info = [
        {"order_id":1001, "name":"wangwu"},
        {"order_id":1002, "name":"zhangsan"},
        {"order_id":1003, "name":"lisi"},
        {"order_id":1001, "name":"wang wu"}
        ]

    df = DataFrame(info)
    print(df)
    print("orient=index", "*"*20)
    # orient="index": {row_label: {column: value}}
    dic_1 = df.to_dict(orient="index")
    # orient="records": [{column: value}, ...] -- one dict per row
    dic_2 = df.to_dict(orient="records")
    print("orient=index:", "-"*10)
    print(dic_1)
    print("orient=records:", "-"*10)
    print(dic_2)

    # orient="dict": {column: {row_label: value}}
    dic_3 = df.to_dict(orient="dict")
    print("dic_3:", "-"*10)
    print(dic_3)

    # orient="series": {column: Series}
    dic_4 = df.to_dict(orient="series")
    print("series:", "-"*10)
    print(dic_4)
    print("+"*5)
    print(dic_4["order_id"])
    # Each value of the "series" form is a pandas Series.
    print(type(dic_4["order_id"]))

    # orient="list": {column: [values]}
    dic_5 = df.to_dict(orient="list")
    print("list:", "-"*10)
    print(dic_5)
    --------------------------------------结果-----------------
       
       order_id      name
    0      1001    wangwu
    1      1002  zhangsan
    2      1003      lisi
    3      1001   wang wu
    orient=index ********************
    orient=index: ----------
    {0: {'order_id': 1001, 'name': 'wangwu'}, 1: {'order_id': 1002, 'name': 'zhangsan'}, 2: {'order_id': 1003, 'name': 'lisi'}, 3: {'order_id': 1001, 'name': 'wang wu'}}
    orient=records: ----------
    [{'order_id': 1001, 'name': 'wangwu'}, {'order_id': 1002, 'name': 'zhangsan'}, {'order_id': 1003, 'name': 'lisi'}, {'order_id': 1001, 'name': 'wang wu'}]
    dic_3: ----------
    {'order_id': {0: 1001, 1: 1002, 2: 1003, 3: 1001}, 'name': {0: 'wangwu', 1: 'zhangsan', 2: 'lisi', 3: 'wang wu'}}
    series: ----------
    {'order_id': 0    1001
    1    1002
    2    1003
    3    1001
    Name: order_id, dtype: int64, 'name': 0      wangwu
    1    zhangsan
    2        lisi
    3     wang wu
    Name: name, dtype: object}
    +++++
    0    1001
    1    1002
    2    1003
    3    1001
    Name: order_id, dtype: int64
    <class 'pandas.core.series.Series'>
    list: ----------
    {'order_id': [1001, 1002, 1003, 1001], 'name': ['wangwu', 'zhangsan', 'lisi', 'wang wu']}
    
    
  3. apply函数

    axis 取 0 或 1, 默认为 0。axis=1: 逐行调用函数 (每次传入一行); axis=0: 逐列调用函数 (每次传入一列)。

    
    
    from pandas import DataFrame, Series
    
    # Four sample records, each carrying two scores.
    columns = ("order_id", "name", "score_1", "score_2")
    rows = [
        (1001, "wangwu", 90, 85),
        (1002, "zhangsan", 60, 75),
        (1003, "lisi", 96, 95),
        (1001, "wang wu", 89, 90),
    ]
    info = [dict(zip(columns, row)) for row in rows]

    def _flag_good(row, column):
        """Return 1 when the row's value in `column` is above 80, else 0."""
        if row[column] > 80:
            return 1
        return 0

    def _total(row):
        """Return the sum of the row's score_1 and score_2."""
        return row["score_1"] + row["score_2"]

    df = DataFrame(info)
    # axis=1 calls the function once per row.
    df["score1_is_good"] = df.apply(_flag_good, axis=1, args=("score_1",))
    df["score2_is_good"] = df.apply(_flag_good, axis=1, args=("score_2",))
    df["score_total"] = df.apply(_total, axis=1)
    print(df)
    
    -----------------------结果-----------------
       order_id      name  score_1  score_2  score1_is_good  score2_is_good  score_total
    0      1001    wangwu       90       85               1               1          175
    1      1002  zhangsan       60       75               0               0          135
    2      1003      lisi       96       95               1               1          191
    3      1001   wang wu       89       90               1               1          179
    

例子2:对特征进行归一化处理

import numpy as np
from pandas import Series, DataFrame
# Random 5x6 matrix used as the feature table; each column is one feature.
arr = np.random.randn(5,6)
df = DataFrame(arr)
print(df)

def feature_normal(arr):
    """Min-max scale `arr` so that its minimum maps to 0 and its maximum to 1.

    Args:
        arr: an object providing ``min()``, ``max()`` and element-wise
            arithmetic; below it receives one DataFrame column at a time.

    Returns:
        ``(arr - arr.min()) / (arr.max() - arr.min())``.

    Note:
        When all values are equal the denominator is 0, so a pandas column
        comes back as NaN (0/0) rather than as a constant.
    """
    return (arr - arr.min())/(arr.max() - arr.min())


# ASCII quotes here: the full-width quotes used before were a SyntaxError.
print("*"*20, "归一化后的结果")
# axis=0 applies the function to each column.
print(df.apply(feature_normal , axis = 0))

---------------------------------result------------------------
          0         1         2         3         4         5
0  1.079595 -0.351661  0.309417  2.155845 -2.012087  0.835636
1  1.594347  0.524309  0.642305  0.138198 -0.075382 -0.791217
2  1.849154  0.651603  0.574854  0.741077 -0.788106 -0.115997
3  0.686670 -0.787141  0.048679  0.520909  1.597393  1.850610
4 -0.391876 -0.536599 -1.522485 -0.463636  0.478463 -0.351089
******************** 归一化后的结果
          0         1         2         3         4         5
0  0.656605  0.302681  0.846226  1.000000  0.000000  0.615806
1  0.886299  0.911524  1.000000  0.229753  0.536561  0.000000
2  1.000000  1.000000  0.968842  0.459905  0.339102  0.255588
3  0.481272  0.000000  0.725781  0.375855  1.000000  1.000000
4  0.000000  0.174139  0.000000  0.000000  0.690003  0.166600
  1. groupby():根据某个字段进行分组

    agg():对分组后的结果进行聚合运算

    reset_index():对分组后的数据重新编排顺序

    merge():将两个dataframe根据某个字段进行列融合

    from pandas import DataFrame, Series
    import numpy as np
    
    # Two orders (1001, 1002), each with two scored records on different dates.
    info = [
        {"order_id":1001, "name":"wangwu", "score_1":90, "score_2":85, "created_time": "2019-10-10"},
        {"order_id":1002, "name":"zhangsan", "score_1":60, "score_2":75, "created_time": "2019-10-10"},
        {"order_id":1002, "name":"zhangsan", "score_1":96, "score_2":95, "created_time": "2019-08-10"},
        {"order_id":1001, "name":"wangwu", "score_1":89, "score_2":90, "created_time": "2019-08-10"}
        ]

    df = DataFrame(info)

    # Per-order statistics of score_1. The aggregations are given by name
    # ("mean" rather than np.mean): recent pandas releases emit a FutureWarning
    # when a NumPy callable is passed to groupby.agg.
    gp1 = df.groupby("order_id").agg({"score_1":["max", "min", "mean", "count"]})
    # reset_index() turns the group key back into a regular column.
    gp1 = gp1.reset_index()
    # Replace the two-level (column, aggregation) header with flat names.
    gp1.columns = ["order_id", "score_1_max", "score_1_min", "score_1_avg", "score_1_count"]
    print(gp1)

    # Same statistics for score_2.
    gp2 = df.groupby("order_id").agg({"score_2":["max", "min", "mean", "count"]})
    gp2 = gp2.reset_index()
    gp2.columns = ["order_id", "score_2_max", "score_2_min", "score_2_avg", "score_2_count"]
    print(gp2)

    # Attach both sets of statistics to every original row via order_id.
    df2 = df.merge(gp1, on="order_id", how="inner").merge(gp2, on="order_id", how="inner")
    print(df2)
    # Keep only the per-order columns, then collapse to one row per order.
    df3 = df2[["order_id", "name", "score_1_max", "score_1_min", "score_1_avg", "score_1_count", "score_2_max", "score_2_min", "score_2_avg", "score_2_count"]]
    df3 = df3.drop_duplicates(subset = "order_id", keep = "first")
    print(df3)
    
    --------------------------------------结果------------------
       order_id  score_1_max  score_1_min  score_1_avg  score_1_count
    0      1001           90           89         89.5              2
    1      1002           96           60         78.0              2
       order_id  score_2_max  score_2_min  score_2_avg  score_2_count
    0      1001           90           85         87.5              2
    1      1002           95           75         85.0              2
       order_id      name  score_1  score_2  ... score_2_max  score_2_min  score_2_avg  score_2_count
    0      1001    wangwu       90       85  ...          90           85         87.5              2
    1      1001    wangwu       89       90  ...          90           85         87.5              2
    2      1002  zhangsan       60       75  ...          95           75         85.0              2
    3      1002  zhangsan       96       95  ...          95           75         85.0              2
    
    [4 rows x 13 columns]
       order_id      name  score_1_max  score_1_min  ...  score_2_max  score_2_min  score_2_avg  score_2_count
    0      1001    wangwu           90           89  ...           90           85         87.5              2
    2      1002  zhangsan           96           60  ...           95           75         85.0              2
    
    [2 rows x 10 columns]
    
  2. 对Series进行去重unique()操作

    from pandas import DataFrame, Series
    
    # Two orders, each appearing twice.
    fields = ("order_id", "name", "score_1", "score_2", "created_time")
    rows = [
        (1001, "wangwu", 90, 85, "2019-10-10"),
        (1002, "zhangsan", 60, 75, "2019-10-10"),
        (1002, "zhangsan", 96, 95, "2019-08-10"),
        (1001, "wangwu", 89, 90, "2019-08-10"),
    ]
    info = [dict(zip(fields, row)) for row in rows]

    df = DataFrame(info)
    # Selecting a single column gives a Series that still holds the repeats.
    order_list = df.loc[:, "order_id"]
    print(order_list, type(order_list))
    # unique() keeps one copy of each value; the second print shows the new type.
    order_list = order_list.unique()
    print(order_list, type(order_list))
    
    -------------------------------result--------------------
    0    1001
    1    1002
    2    1002
    3    1001
    Name: order_id, dtype: int64 <class 'pandas.core.series.Series'>
    [1001 1002] <class 'numpy.ndarray'>
    
  3. concat()

    from pandas import DataFrame, Series
    import pandas as pd
    # Two frames with the same columns and the same 0..2 row labels.
    df_1 = DataFrame({
        "order_id": [1001, 1002, 1003],
        "name": ["wangwu", "lisi", "zhangsan"],
    })
    df_2 = DataFrame({
        "order_id": [1004, 1005, 1006],
        "name": ["wangqi", "wangfang", "zhouliu"],
    })

    frames = [df_1, df_2]
    # Row-wise concatenation (stacked vertically); original row labels are kept.
    df_new = pd.concat(frames, axis="index")
    # Column-wise concatenation (placed side by side), aligned on the row labels.
    df_new2 = pd.concat(frames, axis="columns")
    print(df_new)
    print(df_new2)
    ---------------------------result-------------------
       order_id      name
    0      1001    wangwu
    1      1002      lisi
    2      1003  zhangsan
    0      1004    wangqi
    1      1005  wangfang
    2      1006   zhouliu
       order_id      name  order_id      name
    0      1001    wangwu      1004    wangqi
    1      1002      lisi      1005  wangfang
    2      1003  zhangsan      1006   zhouliu
    
    
    
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值