import numpy as np
import pandas as pd
# Reading and writing CSV files
## Reading
df = pd.read_csv('happiness_train_abbr.csv')
# Show the dtype of every column; the object dtype is what pandas uses for strings
print(df.dtypes)
# Show the first few rows
df.head()
# Show the last few rows
df.tail()
# Show the column names
df.columns
# Show the row labelled 0 (.loc selects rows by index label, not columns)
df.loc[0]
# Show other rows: a label slice (both ends included) and an explicit list of labels
df.loc[3:6]
df.loc[[2,5,10]]
# Show rows 0 through 10 (11 rows in total) of the 'age' column
a = df['age']
a.loc[0:10]
# Show a single column by name
# NOTE(review): 'a' looks like a placeholder column name -- confirm it exists in the CSV
df['a']
# Show the table shape as (rows, columns)
df.shape
# Check whether a column has missing values (the 'age' column was bound to `a` above)
pd.isnull(a)
# Shuffle the order
# NOTE(review): no shuffling code follows this heading; df.sample(frac=1) would be one way to do it
## Writing
# NOTE(review): `list` is not assigned anywhere in the visible code, so it resolves to the
# built-in type here, and `listPredict` is not defined in the visible code either.
# Presumably both are placeholders for the id values and the predictions -- confirm.
dataframe = pd.DataFrame({'list': list, 'happiness': listPredict})
# Write the two named columns to CSV, leaving out the index column
dataframe.to_csv("happiness_submit.csv", index=False, columns=['list','happiness'])
# One-hot encode the colour column
# NOTE(review): assumes df has a string column named 'Color' -- confirm against the CSV
df_colors = df['Color'].str.get_dummies().add_prefix('Color: ')
## Dropping
df = df.drop(['survey_time','province','city','gender','birth','nationality'], axis=1)
# axis=1 drops whole columns; axis=0 drops whole rows
## Assignment: take the column at position 1 (positions are 0-based, so the second column) as rawLabels
rawLabels = df.iloc[:, 1]
# Convert to a list
list_rawLabels = list(rawLabels)
# Convert to a NumPy array
np_rawLabels = np.array(rawLabels)
# Series objects (one-dimensional)
## Series.values gives the data as a NumPy array
population_dict = {'C':12,'B':26}
population = pd.Series(population_dict) # a Series is a one-dimensional labelled array
print(population)
# Output:
# C    12
# B    26
# dtype: int64
data = pd.Series([0.25,0.5])
print('data:\n',data)
print('\ndata.values\n',data.values) # same contents as a NumPy array
print('\ndata[0]\n',data[0])
# Output:
# data:
#  0    0.25
# 1    0.50
# dtype: float64
#
# data.values
#  [0.25 0.5 ]
#
# data[0]
#  0.25
data.index
# -> RangeIndex(start=0, stop=2, step=1)
# Adding an element
print(data)
# Assigning to a label that does not exist yet enlarges the Series; .loc makes
# it explicit that 3 is a label and not a position.
data.loc[3] = 6
print(data)
# Output:
# 0    0.25
# 1    0.50
# dtype: float64
# 0    0.25
# 1    0.50
# 3    6.00
# dtype: float64
# Custom index
c = pd.Series([0.25,0.5],index=['a','b'])
print(c)
c.index # Index(['a', 'b'], dtype='object')
# Output:
# a    0.25
# b    0.50
# dtype: float64
# With a non-integer index, plain c[0] relies on a deprecated positional
# fallback; use .iloc for positions and [] / .loc for labels.
print(c.iloc[0])
print(c['a'])
# Output:
# 0.25
# 0.25
# Integer index that is not in sorted order
c = pd.Series([0.25,0.2,0.1],index=[5,7,3])
print(c)
# Output:
# 5    0.25
# 7    0.20
# 3    0.10
# dtype: float64
print(c[5])  # 5 is looked up as a label here, not as a position
# Output:
# 0.25
type(c.values)
# -> numpy.ndarray
# DataFrame objects (two-dimensional)
population_dict = {'C':12,'B':26}
population = pd.Series(population_dict)
print(population)
# Output:
# C    12
# B    26
# dtype: int64
area_dict = {'C':22,'B':33}
area = pd.Series(area_dict)
print(area)
# Output:
# C    22
# B    33
# dtype: int64
# Build a DataFrame from two Series; the dict keys become the column names
states = pd.DataFrame({'population':population,'area':area})
print(states['area'])
states
# Output of the print above:
# C    22
# B    33
# Name: area, dtype: int64
# Notebook display of `states`:
#    population  area
# C          12    22
# B          26    33
states.values # two-dimensional array
# -> array([[12, 22],
#           [26, 33]], dtype=int64)
states.index
# -> Index(['C', 'B'], dtype='object')
states.columns
# -> Index(['population', 'area'], dtype='object')
data = pd.Series(['a','b','c'],index=[1,3,5])
data
# -> 1    a
#    3    b
#    5    c
#    dtype: object
# Indexers: loc and iloc
## Explicit (label-based) indexing with loc
print(data.loc[1])
print(data.loc[3])
print(data.loc[5])
# Output:
# a
# b
# c
data.loc[1:5]  # label slice: both end labels are included
# -> 1    a
#    3    b
#    5    c
#    dtype: object
## Implicit (position-based) indexing with iloc
print(data.iloc[0])
print(data.iloc[1])
print(data.iloc[2])
# Output:
# a
# b
# c
data.iloc[0:3] # position slice is half-open: start included, stop excluded
# -> 1    a
#    3    b
#    5    c
#    dtype: object
# Handling missing values (null / NaN / NA)
import numpy as np  # repeated here so this section can be run on its own
a = pd.Series([1,np.nan,None])
print(a)
# Output:
# 0    1.0
# 1    NaN
# 2    NaN
# dtype: float64
a.isnull()
# -> 0    False
#    1     True
#    2     True
#    dtype: bool
# Dropping whole rows or columns that contain missing values
# Testing a single value for NaN
from math import isnan


def fill_nan_with_column_mean(data_train, df, i, j):
    """Replace ``data_train[i][j]`` in place when it is NaN.

    The replacement is the mean of the column at position ``j`` of ``df``,
    rounded to the nearest integer. Nothing is changed when the value is
    not NaN.

    Args:
        data_train: Doubly indexable, mutable container (for example a list
            of lists) holding float-like values.
        df: DataFrame whose column at position ``j`` supplies the mean.
        i: First index into ``data_train``.
        j: Second index into ``data_train`` and column position in ``df``.
    """
    if isnan(data_train[i][j]):
        # round() yields the nearest integer; int() guarantees a plain int
        data_train[i][j] = int(round(np.mean(df.iloc[:, j])))


b = a.dropna() # on a Series: drop the missing entries
# A Series has a single axis, so axis=1 raises ValueError on it; the
# column-wise variants are shown on a small DataFrame instead.
na_frame = pd.DataFrame({
    'x': [1.0, 2.0, 3.0],
    'y': [1.0, np.nan, 3.0],
    'z': [np.nan, np.nan, np.nan],
})
c = na_frame.dropna(axis=1) # drop every column that has a missing value (keeps 'x')
d = na_frame.dropna(axis=1,how='all') # drop only columns that are entirely missing (keeps 'x', 'y')
b
# -> 0    1.0
#    dtype: float64
# Filling missing values
## Fill with a given value
print(a)
a.fillna(3.1415)  # returns a new Series; `a` itself is left unchanged
# Output of print(a):
# 0    1.0
# 1    NaN
# 2    NaN
# dtype: float64
# Result of a.fillna(3.1415):
# 0    1.0000
# 1    3.1415
# 2    3.1415
# dtype: float64