pandas编程技巧2

  1. pandas 的axis=1和0的区别

axis = 1 沿着横轴(列标签)的方向操作:例如 drop(..., axis=1) 删除的是指定的列

axis = 0 沿着纵轴(行标签)的方向操作:例如 drop(..., axis=0) 删除的是指定的行,mean(axis=0) 则是沿着行的方向自上而下聚合

from pandas import DataFrame, Series
import pandas as pd

# Two small order tables with identical columns and disjoint order ids.
# The explicit index [0, 1, 2] matches the default positional labels.
df = DataFrame(
    {
        "order_id": [1001, 1002, 1003],
        "name": ["wangwu", "wangliu", "zhangsan"],
    },
    index=[0, 1, 2],
)
df_2 = DataFrame(
    {
        "order_id": [1004, 1005, 1006],
        "name": ["wangwu", "wangliu", "zhangsan"],
    }
)

print(df)

# axis=1 addresses column labels: the "name" column is removed.
df_01 = df.drop(labels="name", axis=1)
print("删除列 axis=1\n", df_01)

# axis=0 addresses row labels: the row labelled 0 is removed.
df_02 = df.drop(labels=0, axis=0)
print("删除行 axis=0\n", df_02)

# Left join on order_id. The clashing "name" columns receive the _x/_y
# suffixes, and name_y is all NaN because no order id of df occurs in df_2.
print("merge-----------,默认是列连接")
df_03 = pd.merge(df, df_2, how="left", on="order_id")
print(df_03)

# concat stacks the frames row-wise (axis=0) and keeps each frame's own index.
print("concat----------默认是行连接")
df_04 = pd.concat([df, df_2], axis=0)
print(df_04)

# Reduce the order_id column to a single scalar.
mean = df.loc[:, "order_id"].mean(axis=0)
print(mean, type(mean))

-----------------------------result--------------------
   order_id      name
0      1001    wangwu
1      1002   wangliu
2      1003  zhangsan
删除列 axis=1
    order_id
0      1001
1      1002
2      1003
删除行 axis=0
    order_id      name
1      1002   wangliu
2      1003  zhangsan
merge-----------,默认是列连接
   order_id    name_x name_y
0      1001    wangwu    NaN
1      1002   wangliu    NaN
2      1003  zhangsan    NaN
concat----------默认是行连接
   order_id      name
0      1001    wangwu
1      1002   wangliu
2      1003  zhangsan
0      1004    wangwu
1      1005   wangliu
2      1006  zhangsan
1002.0 <class 'numpy.float64'>
  1. astype()

    from pandas import DataFrame, Series
    
    # Sample table: order_id and score both start out as integer columns.
    df = DataFrame(
        {
            "order_id": [1001, 1002, 1003],
            "name": ["wangwu", "wangliu", "zhangsan"],
            "score": [90, 89, 70],
        }
    )
    print(df)
    print("*" * 30)
    # Convert both columns in one call: order_id becomes text, score float.
    df = df.astype({"order_id": str, "score": float})
    print(df)
    
    ------------------------result-------------------
       order_id      name  score
    0      1001    wangwu     90
    1      1002   wangliu     89
    2      1003  zhangsan     70
    ******************************
      order_id      name  score
    0     1001    wangwu   90.0
    1     1002   wangliu   89.0
    2     1003  zhangsan   70.0
    
  2. dataframe筛选样本或特定列

    from pandas import DataFrame, Series
    import numpy as np
    # Sample table; the first row has a missing address.
    df = DataFrame(
        {
            "order_id": [1001, 1002, 1003],
            "name": ["wangwu", "wangliu", "zhangsan"],
            "score": [90, 89, 70],
            "address": [np.nan, "nanjing", "beijing"],
        }
    )

    # Column selection by a list of labels.
    wanted_columns = ["order_id", "score"]
    df_1 = df.loc[:, wanted_columns]
    print("根据列名选取指定的列:\n", df_1)

    # Positional slice: the first two rows.
    df_2 = df.iloc[:2]
    print("选取指定的行:\n", df_2)

    # Boolean mask built from a comparison on the score column.
    high_score = df["score"] > 80
    df_3 = df.loc[high_score]
    print("根据比较运算符筛选指定的行:\n", df_3)

    # Boolean mask built from an equality test on the name column.
    is_wangwu = df["name"].eq("wangwu")
    df_4 = df.loc[is_wangwu]
    print(df_4)
    
    ------------------------result--------------------
    根据列名选取指定的列:
        order_id  score
    0      1001     90
    1      1002     89
    2      1003     70
    选取指定的行:
        order_id     name  score  address
    0      1001   wangwu     90      NaN
    1      1002  wangliu     89  nanjing
    根据比较运算符筛选指定的行:
        order_id     name  score  address
    0      1001   wangwu     90      NaN
    1      1002  wangliu     89  nanjing
    
       order_id    name  score address
    0      1001  wangwu     90     NaN
    
    
  3. 缺失值的处理

    from pandas import DataFrame, Series
    import numpy as np
    
    # Sample table with missing values: address is NaN in rows 0 and 3, and
    # order_id is NaN in row 3.
    df = DataFrame([
        {"order_id":1001, "name":"wangwu", "score":90, "address":np.nan},
        {"order_id":1002, "name":"wangliu", "score":89, "address":"nanjing"},
        {"order_id":1003, "name":"zhangsan", "score":70, "address":"beijing"},
        {"order_id":np.nan, "name":"zhangsan", "score":70, "address":np.nan},
        ])

    print("删除缺失值")
    # Drop only the rows whose order_id is missing.
    df_1 = df.dropna(axis=0, subset=["order_id"])
    print(df_1)
    print("留下特征值的数量>=n的行  (n是不为nan的值的数量)")
    # thresh=4 keeps a row only when it has at least 4 non-NaN values.
    df_2 = df.dropna(thresh=4)
    print(df_2)

    print("填充缺失值")
    print(df["address"].fillna("不详"))

    # Forward fill uses DataFrame.ffill rather than fillna(method='ffill'):
    # the `method` keyword of fillna is deprecated since pandas 2.1.
    print("横向用缺失值前面的值替换缺失值")
    # axis=1: each NaN takes the value of the column to its left.
    filled_across = df.ffill(axis=1)
    print(filled_across)

    print("纵向用缺失值上面的值替换缺失值")
    # axis=0: each NaN takes the value of the row above it; a NaN in the
    # first row has nothing above it and stays NaN.
    filled_down = df.ffill(axis=0)
    print(filled_down)

    print("使用默认值填充")
    # Per-column replacement values for missing entries.
    values = {"order_id":1000, "address":"China"}
    print(df.fillna(value=values))

    # Select the rows whose address is missing.
    print(df[df["address"].isna()])

    # Select the rows whose address is present.
    print(df[df["address"].notna()])
    
    ---------------------result-----------------
    删除缺失值
       order_id      name  score  address
    0    1001.0    wangwu     90      NaN
    1    1002.0   wangliu     89  nanjing
    2    1003.0  zhangsan     70  beijing
    留下特征值的数量>=n的行  (n是不为nan的值的数量)
       order_id      name  score  address
    1    1002.0   wangliu     89  nanjing
    2    1003.0  zhangsan     70  beijing
    填充缺失值
    0         不详
    1    nanjing
    2    beijing
    3         不详
    Name: address, dtype: object
    横向用缺失值前面的值替换缺失值
      order_id      name score  address
    0     1001    wangwu    90       90
    1     1002   wangliu    89  nanjing
    2     1003  zhangsan    70  beijing
    3      NaN  zhangsan    70       70
    纵向用缺失值上面的值替换缺失值
       order_id      name  score  address
    0    1001.0    wangwu     90      NaN
    1    1002.0   wangliu     89  nanjing
    2    1003.0  zhangsan     70  beijing
    3    1003.0  zhangsan     70  beijing
    使用默认值填充
       order_id      name  score  address
    0    1001.0    wangwu     90    China
    1    1002.0   wangliu     89  nanjing
    2    1003.0  zhangsan     70  beijing
    3    1000.0  zhangsan     70    China
    
       order_id      name  score address
    0    1001.0    wangwu     90     NaN
    3       NaN  zhangsan     70     NaN
    
       order_id      name  score  address
    1    1002.0   wangliu     89  nanjing
    2    1003.0  zhangsan     70  beijing
    
  4. 给dataframe添加列名和行序号

    from pandas import DataFrame, Series
    # Build the frame from bare rows and attach the column names and the row
    # labels directly in the constructor.
    rows = [
        [100, "a", 30],
        [101, "b", 28],
        [102, "c", 26],
    ]
    df = DataFrame(rows, columns=["id", "name", "age"], index=[0, 1, 2])
    print(df)

    # Show the row labels and the column labels as arrays.
    row_labels = df.index.values
    column_labels = df.columns.values
    print("获取df的index数组", row_labels)
    print("获取df的列名数组:", column_labels)
    -----------------------------result--------------------------------
        id name  age
    0  100    a   30
    1  101    b   28
    2  102    c   26
    获取df的index数组 [0 1 2]
    获取df的列名数组: ['id' 'name' 'age']
    
    1. 排序

      from pandas import DataFrame,Series
      
      import numpy as np
      
      # Four orders; rows 2 and 3 share the same name and score.
      orders = DataFrame(
          {
              "order_id": [1001, 1002, 1003, 1004],
              "name": ["wangwu", "wangliu", "zhangsan", "zhangsan"],
              "score": [90, 89, 70, 70],
              "address": [np.nan, "nanjing", "beijing", np.nan],
          }
      )

      # Descending by name, with equal names ordered by descending order_id.
      df = orders.sort_values(by=["name", "order_id"], ascending=[False, False])
      print(df)

      # Sort the already reordered frame by score in ascending order.
      by_score = df.sort_values(by="score", ascending=True)
      print(by_score)
      
      ------------------------------result-------------------
         order_id      name  score  address
      3      1004  zhangsan     70      NaN
      2      1003  zhangsan     70  beijing
      0      1001    wangwu     90      NaN
      1      1002   wangliu     89  nanjing
      
         order_id      name  score  address
      3      1004  zhangsan     70      NaN
      2      1003  zhangsan     70  beijing
      1      1002   wangliu     89  nanjing
      0      1001    wangwu     90      NaN
      
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值