一、DataFrame添加列
直接通过赋值为空,添加一列。
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row1', 'row2', 'row3'], columns=['col1', 'col2', 'col3', 'col4'])
>>> df
col1 col2 col3 col4
row1 0 1 2 3
row2 4 5 6 7
row3 8 9 10 11
>>>
>>> df['col5']=''
>>>
>>> df
col1 col2 col3 col4 col5
row1 0 1 2 3
row2 4 5 6 7
row3 8 9 10 11
通过一个list给新加的列赋值,添加一列。注意,list里的元素个数要跟dataframe的行数一致,否则会报长度对不齐的错误。
>>> df['col6']=[1,1,1]
>>>
>>> df
col1 col2 col3 col4 col5 col6
row1 0 1 2 3 1
row2 4 5 6 7 1
row3 8 9 10 11 1
>>> df['col7']=[]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3119, in __setitem__
self._set_item(key, value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3194, in _set_item
value = self._sanitize_column(key, value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3391, in _sanitize_column
value = _sanitize_index(value, self.index, copy=False)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/series.py", line 4001, in _sanitize_index
raise ValueError('Length of values does not match length of ' 'index')
ValueError: Length of values does not match length of index
>>>
>>> df['col7']=[1]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3119, in __setitem__
self._set_item(key, value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3194, in _set_item
value = self._sanitize_column(key, value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3391, in _sanitize_column
value = _sanitize_index(value, self.index, copy=False)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/series.py", line 4001, in _sanitize_index
raise ValueError('Length of values does not match length of ' 'index')
ValueError: Length of values does not match length of index
如果需要在指定位置处添加一列,用insert方法实现。DataFrame.insert(loc, column, value, allow_duplicates=False),loc-插入位置(从0开始的列序号),column-列名,value-元素值,allow_duplicates-是否允许列名重复。
>>> df
col1 col2 col3 col4 col5 col6
row1 0 1 2 3 1
row2 4 5 6 7 1
row3 8 9 10 11 1
>>>
>>> df.insert(1,'col7','')
>>>
>>> df
col1 col7 col2 col3 col4 col5 col6
row1 0 1 2 3 1
row2 4 5 6 7 1
row3 8 9 10 11 1
>>>
>>> df.insert(4,'col8',[2,2,2])
>>>
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
二、DataFrame取行列
1、按索引或者行列名选取DataFrame的具体行和列
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>>
>>> col1=df['col1']
>>>
>>> col1
row1 0
row2 4
row3 8
Name: col1, dtype: int64
>>>
>>> type(col1)
<class 'pandas.core.series.Series'>
注意,df[...]中直接写单个标签时只会按列名取列,因此不能用行名取行,也不能用数字索引取行或列,以下方式都会报错。
>>> row1=df['row1']
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'row1'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'row1'
>>>
>>> row1=df[0]
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
>>>
>>> row1=df[0,:]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 2487, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'slice'
>>>
>>> col1=df[:,0]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 2487, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type: 'slice'
取某些行可以用行号切片的方式(左闭右开,例如0:1只取第1行)。
>>> row1=df[0:1]
>>>
>>> row1
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
>>>
>>> row2=df[1:2]
>>>
>>> row2
col1 col7 col2 col3 col8 col4 col5 col6
row2 4 5 6 2 7 1
>>>
>>> row23=df[1:3]
>>>
>>> row23
col1 col7 col2 col3 col8 col4 col5 col6
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
2、使用iloc按行号和列号取相应的行列
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
# 取第一行
>>> row1=df.iloc[0]
>>>
>>> row1
col1 0
col7
col2 1
col3 2
col8 2
col4 3
col5
col6 1
Name: row1, dtype: object
# 取第1,2,3行
>>> row123=df.iloc[[0,1,2]]
>>>
>>> row123
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>>
>>> row12=df.iloc[[0,1]]
>>>
>>> row12
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
# 连续取多行
>>> row12=df.iloc[0:2]
>>>
>>> row12
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
>>>
>>> row12=df.iloc[0:2,:]
>>>
>>> row12
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
# 取行时可以不指定列,但取列时必须先用":"指定所有行,再指定列
>>> col1=df.iloc[:,0]
>>>
>>> col1
row1 0
row2 4
row3 8
Name: col1, dtype: int64
# 用list[0,2]指定取第1,3列
>>> col13=df.iloc[:,[0,2]]
>>>
>>> col13
col1 col2
row1 0 1
row2 4 5
row3 8 9
>>>
>>> col123=df.iloc[:,[0,1,2]]
>>>
>>> col123
col1 col7 col2
row1 0 1
row2 4 5
row3 8 9
# 用0:2指定连续的多列
>>> col12=df.iloc[:,0:2]
>>>
>>> col12
col1 col7
row1 0
row2 4
row3 8
3、使用loc按行名和列名取相应的行列
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>>
>>> row1=df.loc['row1']
>>>
>>> row1
col1 0
col7
col2 1
col3 2
col8 2
col4 3
col5
col6 1
Name: row1, dtype: object
>>>
>>> row2=df.loc['row2',:]
>>>
>>> row2
col1 4
col7
col2 5
col3 6
col8 2
col4 7
col5
col6 1
Name: row2, dtype: object
>>>
>>> row12=df.loc[['row1','row2']]
>>>
>>> row12
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
>>>
>>> row123=df.loc['row1':'row3',:]
>>>
>>> row123
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>>
>>> col1=df.loc[:,'col1']
>>>
>>> col1
row1 0
row2 4
row3 8
Name: col1, dtype: int64
>>> col128=df.loc[:,['col1','col2','col8']]
>>>
>>> col128
col1 col2 col8
row1 0 1 2
row2 4 5 2
row3 8 9 2
>>>
>>> col1723=df.loc[:,'col1':'col3']
>>>
>>> col1723
col1 col7 col2 col3
row1 0 1 2
row2 4 5 6
row3 8 9 10
4、使用条件取相应的行列
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
# 取出'col1'列中值为4的所有行
>>> df.loc[df['col1']==4]
col1 col7 col2 col3 col8 col4 col5 col6
row2 4 5 6 2 7 1
>>>
>>> df.loc[df['col8']==2]
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
>>> df.loc[~(df['col1']!=4)]
col1 col7 col2 col3 col8 col4 col5 col6
row2 4 5 6 2 7 1
>>>
>>> df.loc[~(df['col1']==4)]
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row3 8 9 10 2 11 1
5、使用条件替换行列的值
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 4 5 6 2 7 1
row3 8 9 10 2 11 1
# 把第一列中值为4的行选取出来,再选择第'col1'列的元素赋值成新的44
>>> df.loc[df['col1']==4,'col1']=44
>>>
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 44 5 6 2 7 1
row3 8 9 10 2 11 1
# 把第一列中值为44的行选取出来,再选择第'col4'列的元素赋值成新的44
>>> df.loc[df['col1']==44,'col4']=44
>>>
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 44 5 6 2 44 1
row3 8 9 10 2 11 1
# 选择'row3'行,其中有值为2的元素赋值成新的44
>>> df.loc['row3',df.loc['row3',:]==2]=44
>>>
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 44 5 6 2 44 1
row3 8 9 10 44 11 1
# 改变某行某列的元素值
>>> df
col1 col2 col3 col4
row1 0 1 2 3
row2 4 5 6 7
row3 8 9 10 11
>>>
>>> df.at['row1','col1']=100
>>>
>>> df
col1 col2 col3 col4
row1 100 1 2 3
row2 4 5 6 7
row3 8 9 10 11
6、使用条件替换整个矩阵的部分元素的值
>>> df
col1 col7 col2 col3 col8 col4 col5 col6
row1 0 1 2 2 3 1
row2 44 5 6 2 44 1
row3 8 9 10 44 11 1
# 将所有值大于40的元素赋值为新的40,注意DataFrame中包含混合类型的列(例如这里值为空字符串''的col7和col5)时,赋非np.nan的值会报错,需要先删除这些列或者给它们赋上数值
>>> df[df>40]=40
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3114, in __setitem__
self._setitem_frame(key, value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 3161, in _setitem_frame
self._check_inplace_setting(value)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 4503, in _check_inplace_setting
raise TypeError('Cannot do inplace boolean setting on '
TypeError: Cannot do inplace boolean setting on mixed-types with a non np.nan value
>>>
>>>
>>> df>40
col1 col7 col2 col3 col8 col4 col5 col6
row1 False True False False False False True False
row2 True True False False False True True False
row3 False True False False True False True False
>>> df=df.drop(['col7'], axis=1)
>>> df=df.drop(['col5'], axis=1)
>>>
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 44 5 6 2 44 1
row3 8 9 10 44 11 1
>>>
>>> df[df>40]=40
>>>
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
三、DataFrame取元素
1、选择列然后遍历获取元素的值
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
>>> for index in df['col1'].index:
... idx=df['col1'].get(index)
... print(idx)
...
0
40
8
2、使用pandas.DataFrame.at的行索引和列名获取元素的值
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
>>>
>>> df.at[4,'col1']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 2141, in __getitem__
key = self._convert_key(key)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 2227, in _convert_key
raise ValueError("At based indexing on an non-integer "
ValueError: At based indexing on an non-integer index can only have non-integer indexers
>>>
>>> df.at['row4','col1']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 2142, in __getitem__
return self.obj._get_value(*key, takeable=self._takeable)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/frame.py", line 2539, in _get_value
return engine.get_value(series._values, index)
File "pandas/_libs/index.pyx", line 106, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 114, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1492, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1500, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'row4'
>>>
>>> df.at['row2','col1']
40
>>> df.iloc[1].at['col1']
40
3、使用pandas.DataFrame.iat的行索引和列索引获取元素的值
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
>>>
>>> df.iat[1,2]
6
>>>
>>> df.iloc[3].iat[4]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 1478, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 2102, in _getitem_axis
self._validate_integer(key, axis)
File "/root/miniconda3/lib/python3.6/site-packages/pandas/core/indexing.py", line 2009, in _validate_integer
raise IndexError("single positional indexer is out-of-bounds")
IndexError: single positional indexer is out-of-bounds
>>>
>>> df.iloc[2].iat[4]
11
4、使用pandas.DataFrame.loc获取元素的值或者行
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
>>>
>>>
>>> df.loc['row1','col1']
0
>>>
>>> df.loc['row1']
col1 0
col2 1
col3 2
col8 2
col4 3
col6 1
Name: row1, dtype: int64
>>>
>>> df.loc[['row1','row3']]
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row3 8 9 10 40 11 1
5、使用pandas.DataFrame.iloc获取元素的值或者行
>>> df
col1 col2 col3 col8 col4 col6
row1 0 1 2 2 3 1
row2 40 5 6 2 40 1
row3 8 9 10 40 11 1
>>>
>>> df.iloc[0,2]
2
>>>
>>> df.iloc[2]
col1 8
col2 9
col3 10
col8 40
col4 11
col6 1
Name: row3, dtype: int64
引用
【1】https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.insert.html