pandas

最新推荐文章于 2021-02-07 19:58:16 发布

轻羽羽

最新推荐文章于 2021-02-07 19:58:16 发布

阅读量118

点赞数

分类专栏： Python库

本文链接：https://blog.csdn.net/qq_25841513/article/details/103108211

版权

Python库专栏收录该内容

7 篇文章 0 订阅

订阅专栏

import numpy as np
import pandas as pd

# Reading and writing CSV files
## Read
df = pd.read_csv('happiness_train_abbr.csv')

# Show the dtype of every column; the object dtype usually holds strings
print(df.dtypes)

# Show the first few rows
df.head()
# Show the last few rows
df.tail()

# Show the column labels
df.columns

# Show the row labelled 0 (df.loc selects ROWS by index label, not columns)
df.loc[0]
# Show other rows: a label slice (both ends included) or a list of labels
df.loc[3:6]
df.loc[[2, 5, 10]]
# Show labels 0 through 10 (11 values) of the 'age' column
# NOTE(review): assumes the CSV has an 'age' column -- confirm against the file
a = df['age']
a.loc[0:10]

# Show one column by name
# NOTE(review): 'a' looks like a placeholder; substitute a real column name
df['a']

# Show the table shape as (rows, columns)
df.shape

# Check a column for missing values (element-wise boolean mask).
# The original passed the undefined name `age`; the column itself is meant.
pd.isnull(df['age'])

# Shuffle the rows: sample 100% of the rows in random order, then renumber
# the index. The result is only displayed here; `df` itself is not modified.
df.sample(frac=1).reset_index(drop=True)

## Write
# Build a two-column table and save it as CSV.
# NOTE(review): `list` and `listPredict` are not defined in the visible code.
# As written, `list` resolves to the built-in type; presumably a sequence of
# ids was bound to that name earlier -- confirm, and prefer a name that does
# not shadow the built-in.
dataframe = pd.DataFrame({'list': list, 'happiness': listPredict})
# index=False leaves the row index out of the file; columns= fixes which
# columns are written and in what order.
dataframe.to_csv("happiness_submit.csv", index=False, columns=['list','happiness'])

# One-hot encode the colour column
# NOTE(review): assumes df has a string-valued 'Color' column -- confirm
df_colors = df['Color'].str.get_dummies().add_prefix('Color: ')

## Drop
# axis=1 drops whole columns; axis=0 drops whole rows
df = df.drop(['survey_time', 'province', 'city', 'gender', 'birth', 'nationality'], axis=1)
## Assignment: take the column at position 1 (zero-based, i.e. the second
## column) as rawLabels
rawLabels = df.iloc[:, 1]
# Convert to a list
list_rawLabels = list(rawLabels)
# Convert to a NumPy array
np_rawLabels = np.array(rawLabels)

# Series objects (one-dimensional)
## Series.values exposes the data as a NumPy array
population_dict = {'C': 12, 'B': 26}
population = pd.Series(population_dict)  # a Series is a special kind of array
print(population)
# Output:
#     C    12
#     B    26
#     dtype: int64

data = pd.Series([0.25, 0.5])
print('data:\n', data)
print('\ndata.values\n', data.values)  # same result as a plain array
print('\ndata[0]\n', data[0])
# Output:
#     data:
#     0    0.25
#     1    0.50
#     dtype: float64
#
#     data.values
#      [0.25 0.5 ]
#
#     data[0]
#      0.25

data.index
# -> RangeIndex(start=0, stop=2, step=1)

# Adding an element: assigning to a label that does not exist yet appends it
print(data)
data[3] = 6
print(data)
# Output:
#     0    0.25
#     1    0.50
#     dtype: float64
#     0    0.25
#     1    0.50
#     3    6.00
#     dtype: float64

# Custom index
c = pd.Series([0.25, 0.5], index=['a', 'b'])
print(c)
c.index  # Index(['a', 'b'], dtype='object') in the original run
# Output of print(c):
#     a    0.25
#     b    0.50
#     dtype: float64

# Access by position goes through .iloc; a plain c[0] on a string-labelled
# Series relies on a positional fallback that pandas has deprecated.
print(c.iloc[0])
print(c['a'])
# Output:
#     0.25
#     0.25

# Index labels do not have to be in order
c = pd.Series([0.25, 0.2, 0.1], index=[5, 7, 3])
print(c)
# Output:
#     5    0.25
#     7    0.20
#     3    0.10
#     dtype: float64
print(c[5])  # with an integer index, [] looks up the LABEL 5, not position 5
# Output:
#     0.25
type(c.values)
# -> numpy.ndarray
    
# DataFrame objects (two-dimensional)
population_dict = {'C': 12, 'B': 26}
population = pd.Series(population_dict)
print(population)
# Output:
#     C    12
#     B    26
#     dtype: int64
area_dict = {'C': 22, 'B': 33}
area = pd.Series(area_dict)
print(area)
# Output:
#     C    22
#     B    33
#     dtype: int64

# Each Series becomes one column; rows are aligned on the shared index
states = pd.DataFrame({'population': population, 'area': area})
print(states['area'])
states
# Output:
#     C    22
#     B    33
#     Name: area, dtype: int64
#         population  area
#     C   12          22
#     B   26          33

states.values  # two-dimensional array
# -> array([[12, 22],
#           [26, 33]], dtype=int64)
states.index
# -> Index(['C', 'B'], dtype='object')
states.columns
# -> Index(['population', 'area'], dtype='object')

data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data
# Output:
#     1    a
#     3    b
#     5    c
#     dtype: object

# Indexers: loc and iloc

## Explicit (label-based) indexing with loc
print(data.loc[1])
print(data.loc[3])
print(data.loc[5])
# Output:
#     a
#     b
#     c
data.loc[1:5]  # label slices include BOTH endpoints
# Output:
#     1    a
#     3    b
#     5    c
#     dtype: object

## Implicit (position-based) indexing with iloc
print(data.iloc[0])
print(data.iloc[1])
print(data.iloc[2])
# Output:
#     a
#     b
#     c

data.iloc[0:3]  # half-open like Python slices: start included, stop excluded
# Output:
#     1    a
#     3    b
#     5    c
#     dtype: object
    
# Handling missing values (null / NaN / NA)
import numpy as np
from math import isnan

a = pd.Series([1, np.nan, None])
print(a)
# Output (None is shown as NaN as well):
#     0    1.0
#     1    NaN
#     2    NaN
#     dtype: float64
a.isnull()
# Output:
#     0    False
#     1     True
#     2     True
#     dtype: bool


# Testing a single cell for NaN
def fill_nan_with_column_mean(data_train, df, i, j):
    """Replace a NaN cell with the rounded mean of the matching column.

    Args:
        data_train: Mutable two-level container indexed as
            ``data_train[i][j]``; it is modified in place.
        df: DataFrame whose column at position ``j`` supplies the mean.
        i: Row position of the cell to check.
        j: Column position of the cell to check.

    Returns:
        None. The cell is left untouched when it is not NaN.
    """
    if isnan(data_train[i][j]):
        # Round to the nearest integer. NOTE: Python's round() sends exact
        # halves to the nearest even integer, not always upwards.
        data_train[i][j] = int(round(np.mean(df.iloc[:, j])))


# Dropping rows or columns that contain missing values
b = a.dropna()  # a Series has only axis 0: the NaN entries are removed
# axis=1 is only valid on a DataFrame (Series.dropna(axis=1) raises
# ValueError), so the column-wise variants are shown on a small frame.
frame = pd.DataFrame({'x': [1, 2], 'y': [3, np.nan], 'z': [np.nan, np.nan]})
c = frame.dropna(axis=1)  # drop every column that contains any NaN
d = frame.dropna(axis=1, how='all')  # drop only columns that are entirely NaN
b
# Output:
#     0    1.0
#     dtype: float64

# Filling missing values
## Fill with a given value (returns a new Series; `a` is unchanged)
print(a)
a.fillna(3.1415)
# Output:
#     0    1.0
#     1    NaN
#     2    NaN
#     dtype: float64
#     0    1.0000
#     1    3.1415
#     2    3.1415
#     dtype: float64