- pandas 的axis=1和0的区别
axis = 1 沿着横轴(水平)方向操作,即跨列:drop 时删除的是列,mean 等聚合时对每一行计算一个结果
axis = 0 沿着纵轴(垂直)方向操作,即跨行:drop 时删除的是行,mean 等聚合时对每一列计算一个结果
from pandas import DataFrame, Series
import pandas as pd
# Two small order tables with the same columns but disjoint order ids.
df = pd.DataFrame(
    {
        "order_id": [1001, 1002, 1003],
        "name": ["wangwu", "wangliu", "zhangsan"],
    },
    index=[0, 1, 2],
)
df_2 = pd.DataFrame(
    {
        "order_id": [1004, 1005, 1006],
        "name": ["wangwu", "wangliu", "zhangsan"],
    }
)
print(df)

# axis=1 addresses column labels: remove the "name" column.
df_01 = df.drop(labels="name", axis=1)
print("删除列 axis=1\n", df_01)

# axis=0 addresses row labels: remove the row labelled 0.
df_02 = df.drop(labels=0, axis=0)
print("删除行 axis=0\n", df_02)

# merge places matching rows side by side; the two tables share no order_id,
# so every name_y value of this left join is NaN.
print("merge-----------,默认是列连接")
df_03 = pd.merge(df, df_2, how="left", on="order_id")
print(df_03)

# concat stacks the frames vertically by default and keeps each original index.
print("concat----------默认是行连接")
df_04 = pd.concat(objs=[df, df_2])
print(df_04)

# Aggregating a single column returns a NumPy scalar.
mean = df["order_id"].mean(axis=0)
print(mean, type(mean))
-----------------------------result--------------------
order_id name
0 1001 wangwu
1 1002 wangliu
2 1003 zhangsan
删除列 axis=1
order_id
0 1001
1 1002
2 1003
删除行 axis=0
order_id name
1 1002 wangliu
2 1003 zhangsan
merge-----------,默认是列连接
order_id name_x name_y
0 1001 wangwu NaN
1 1002 wangliu NaN
2 1003 zhangsan NaN
concat----------默认是行连接
order_id name
0 1001 wangwu
1 1002 wangliu
2 1003 zhangsan
0 1004 wangwu
1 1005 wangliu
2 1006 zhangsan
1002.0 <class 'numpy.float64'>
-
astype()
from pandas import DataFrame, Series

# Sample orders; order_id and score are built from Python ints.
df = DataFrame([
    {"order_id": 1001, "name": "wangwu", "score": 90},
    {"order_id": 1002, "name": "wangliu", "score": 89},
    {"order_id": 1003, "name": "zhangsan", "score": 70},
])
print(df)
print("*" * 30)

# astype() returns a converted copy, so the result is assigned back to the column.
df["order_id"] = df["order_id"].astype(str)
df["score"] = df["score"].astype(float)
print(df)

# ------------------------result-------------------
#    order_id      name  score
# 0      1001    wangwu     90
# 1      1002   wangliu     89
# 2      1003  zhangsan     70
# ******************************
#   order_id      name  score
# 0     1001    wangwu   90.0
# 1     1002   wangliu   89.0
# 2     1003  zhangsan   70.0
-
dataframe筛选样本或特定列
from pandas import DataFrame, Series
import numpy as np

# Sample orders; the first row has no address.
df = DataFrame([
    {"order_id": 1001, "name": "wangwu", "score": 90, "address": np.nan},
    {"order_id": 1002, "name": "wangliu", "score": 89, "address": "nanjing"},
    {"order_id": 1003, "name": "zhangsan", "score": 70, "address": "beijing"},
])

# Indexing with a list of column names selects those columns.
df_1 = df[["order_id", "score"]]
print("根据列名选取指定的列:\n", df_1)

# A plain slice selects rows by position (here the first two rows).
df_2 = df[:2]
print("选取指定的行:\n", df_2)

# A boolean mask keeps only the rows where the condition is True.
df_3 = df[df["score"] > 80]
print("根据比较运算符筛选指定的行:\n", df_3)

df_4 = df[df["name"] == "wangwu"]
print(df_4)

# ------------------------result--------------------
# 根据列名选取指定的列:
#     order_id  score
# 0      1001     90
# 1      1002     89
# 2      1003     70
# 选取指定的行:
#     order_id     name  score  address
# 0      1001   wangwu     90      NaN
# 1      1002  wangliu     89  nanjing
# 根据比较运算符筛选指定的行:
#     order_id     name  score  address
# 0      1001   wangwu     90      NaN
# 1      1002  wangliu     89  nanjing
#    order_id    name  score address
# 0      1001  wangwu     90     NaN
-
缺失值的处理
from pandas import DataFrame, Series
import numpy as np

# Sample orders with missing values in both order_id and address.
df = DataFrame([
    {"order_id": 1001, "name": "wangwu", "score": 90, "address": np.nan},
    {"order_id": 1002, "name": "wangliu", "score": 89, "address": "nanjing"},
    {"order_id": 1003, "name": "zhangsan", "score": 70, "address": "beijing"},
    {"order_id": np.nan, "name": "zhangsan", "score": 70, "address": np.nan},
])

# Drop rows whose order_id is missing; other columns may still hold NaN.
print("删除缺失值")
df_1 = df.dropna(axis=0, subset=["order_id"])
print(df_1)

# thresh=4 keeps only the rows that have at least 4 non-NaN values.
print("留下特征值的数量>=n的行 (n是不为nan的值的数量)")
df_2 = df.dropna(thresh=4)
print(df_2)

print("填充缺失值")
print(df["address"].fillna("不详"))

# DataFrame.ffill replaces fillna(method='ffill'), which recent pandas
# releases deprecate. axis=1 fills from the value to the left in the same row.
print("横向用缺失值前面的值替换缺失值")
row_filled = df.ffill(axis=1)
print(row_filled)

# axis=0 fills from the value above in the same column.
print("纵向用缺失值上面的值替换缺失值")
col_filled = df.ffill(axis=0)
print(col_filled)

# A dict supplies a separate default for each column.
print("使用默认值填充")
values = {"order_id": 1000, "address": "China"}
default_filled = df.fillna(value=values)
print(default_filled)

# Rows whose address is missing.
print(df[df["address"].isna()])
# Rows whose address is present.
print(df[df["address"].notnull()])

# ---------------------result-----------------
# (output as captured by the original author; exact formatting may vary
# with the pandas version)
# 删除缺失值
#    order_id      name  score  address
# 0    1001.0    wangwu     90      NaN
# 1    1002.0   wangliu     89  nanjing
# 2    1003.0  zhangsan     70  beijing
# 留下特征值的数量>=n的行 (n是不为nan的值的数量)
#    order_id      name  score  address
# 1    1002.0   wangliu     89  nanjing
# 2    1003.0  zhangsan     70  beijing
# 填充缺失值
# 0         不详
# 1    nanjing
# 2    beijing
# 3         不详
# Name: address, dtype: object
# 横向用缺失值前面的值替换缺失值
#   order_id      name score  address
# 0     1001    wangwu    90       90
# 1     1002   wangliu    89  nanjing
# 2     1003  zhangsan    70  beijing
# 3      NaN  zhangsan    70       70
# 纵向用缺失值上面的值替换缺失值
#    order_id      name  score  address
# 0    1001.0    wangwu     90      NaN
# 1    1002.0   wangliu     89  nanjing
# 2    1003.0  zhangsan     70  beijing
# 3    1003.0  zhangsan     70  beijing
# 使用默认值填充
#    order_id      name  score  address
# 0    1001.0    wangwu     90    China
# 1    1002.0   wangliu     89  nanjing
# 2    1003.0  zhangsan     70  beijing
# 3    1000.0  zhangsan     70    China
#    order_id      name  score address
# 0    1001.0    wangwu     90     NaN
# 3       NaN  zhangsan     70     NaN
#    order_id      name  score  address
# 1    1002.0   wangliu     89  nanjing
# 2    1003.0  zhangsan     70  beijing
-
给dataframe添加列名和行序号
from pandas import DataFrame, Series

# Built from bare lists, so the frame starts with default integer labels.
df = DataFrame([
    [100, "a", 30],
    [101, "b", 28],
    [102, "c", 26],
])

# Assigning to .columns / .index relabels the existing axes in place.
df.columns = ["id", "name", "age"]
df.index = [0, 1, 2]
print(df)
print("获取df的index数组", df.index.values)
print("获取df的列名数组:", df.columns.values)

# -----------------------------result--------------------------------
# (the two array lines are corrected to match this snippet; the originally
# recorded values did not correspond to this code)
#     id name  age
# 0  100    a   30
# 1  101    b   28
# 2  102    c   26
# 获取df的index数组 [0 1 2]
# 获取df的列名数组: ['id' 'name' 'age']
-
排序
from pandas import DataFrame, Series
import numpy as np

# Sample orders; the two "zhangsan" rows share the same score.
df = DataFrame([
    {"order_id": 1001, "name": "wangwu", "score": 90, "address": np.nan},
    {"order_id": 1002, "name": "wangliu", "score": 89, "address": "nanjing"},
    {"order_id": 1003, "name": "zhangsan", "score": 70, "address": "beijing"},
    {"order_id": 1004, "name": "zhangsan", "score": 70, "address": np.nan},
])

# Sort by name, then by order_id within equal names, both descending.
df = df.sort_values(by=["name", "order_id"], ascending=False)
print(df)

# The two score-70 rows tie; a stable algorithm guarantees they keep their
# current relative order (3 before 2), as shown in the recorded result.
by_score = df.sort_values(by="score", kind="mergesort")
print(by_score)

# ------------------------------result-------------------
#    order_id      name  score  address
# 3      1004  zhangsan     70      NaN
# 2      1003  zhangsan     70  beijing
# 0      1001    wangwu     90      NaN
# 1      1002   wangliu     89  nanjing
#    order_id      name  score  address
# 3      1004  zhangsan     70      NaN
# 2      1003  zhangsan     70  beijing
# 1      1002   wangliu     89  nanjing
# 0      1001    wangwu     90      NaN
-