数据分析常用库之【pandas】DataFrame方法操作

最新推荐文章于 2024-02-06 11:47:37 发布

薛定谔的猫1992

最新推荐文章于 2024-02-06 11:47:37 发布

阅读量474

点赞数

分类专栏： AI学习

本文链接：https://blog.csdn.net/weixin_42456166/article/details/103221034

版权

AI学习专栏收录该内容

13 篇文章 1 订阅

订阅专栏

#!/usr/bin/python
from __future__ import print_function
from __future__ import with_statement
import os    #获取当前工作路径
import numpy as np
import pandas as pd
from pandas import *  # Sereis, DataFrame
# Target CSV path inside the current working directory. os.path.join picks the
# separator for the host OS; a hard-coded '\\' only forms a valid path on Windows.
file = os.path.join(os.getcwd(), '1.csv')
print(file)
# Build a 4x4 table: the row index is a, b, c, d and the columns are w, x, y, z.
data = DataFrame(np.arange(16).reshape(4, 4), index=list('abcd'), columns=list('wxyz'))
print(data)
print(data['w'])  # select column 'w' with dict-style access; returns a Series
print(data.w)  # select column 'w' with attribute access; returns a Series
print(data[['w']])  # a one-element list of labels returns a DataFrame instead
print(data[['w', 'z']])  # select columns 'w' and 'z' together
# The frame that actually gets written to disk.
data = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
print(data)
# index=False keeps the row index out of the CSV file.
data.to_csv(file, index=False, encoding='utf-8')

import pandas as pd
import numpy as np
# Form 1: every dict value is list-like, so each key becomes a column.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data1 = {'A': range(3), 'B': list("abc"), 'C': ['red', np.nan, 'yellow']}
df1 = pd.DataFrame(data1)
# Form 2: every dict value is a scalar string, so index=[0] must be passed;
# pandas cannot infer the number of rows from scalars alone.
data2 = {'姓名': 'fuhang', '性别': '男', '昵称': '那时的吻真香'}
df2 = pd.DataFrame(data2, index=[0])
print(df2)
df1.to_csv('Result1.csv', index=False, encoding='utf-8')
df2.to_csv('Result2.csv', index=False, encoding='utf-8')
# Append df1 to Result1.csv a second time. header=False stops the column names
# from being written again as a data row in the middle of the file.
df1.to_csv('Result1.csv', index=False, header=False, mode='a', encoding='utf-8')

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import pandas as pd
import numpy as np
# Build a DatetimeIndex of six consecutive days starting at 2013-01-01.
dates = pd.date_range('20130101', periods=6)
print('1')
print(dates)
# np.random.randn(d0, d1, ..., dn) draws samples from the standard normal
# distribution; each argument is the size of one dimension, so (6, 4) yields
# a 6x4 array.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print('2')
print(df)
# Positional assignment: set the value in the third row, third column to 1111.
df.iloc[2, 2] = 1111
print('3')
print(df)
# Label-based assignment: set row '2013-01-03', column 'D' to 2222.
df.loc['2013-01-03', 'D'] = 2222
print('3')
print(df)
# Zero out every positive value in column A. A single .loc call writes straight
# into df; the chained form df.A[mask] = 0 goes through an intermediate Series
# and is not guaranteed to update df (it is a no-op under Copy-on-Write).
df.loc[df.A > 0, 'A'] = 0
print('4')
print(df)
# Add a new column F filled with NaN.
df['F'] = np.nan
print('5')
print(df)
# Add a new column G from a Series; values are aligned on the date index.
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print('6')
print(df)

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import pandas as pd
import numpy as np
# date_range builds a DatetimeIndex of six consecutive days.
dates = pd.date_range('20130101', periods=6)
# np.arange(24) reshaped to 6x4: rows are indexed by date, columns are A-D.
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print('1')
print(df)
# Expected output:
#              A   B   C   D
# 2013-01-01   0   1   2   3
# 2013-01-02   4   5   6   7
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15
# 2013-01-05  16  17  18  19
# 2013-01-06  20  21  22  23

# Set the first row, second column to NaN. Column B holds integers, and writing
# NaN into it would rely on implicit upcasting, which pandas has deprecated
# (PDEP-6); cast the column to float explicitly first. The printed result is
# the same as with the implicit upcast.
df['B'] = df['B'].astype('float64')
df.iloc[0, 1] = np.nan
print('2')
print(df)
# Expected output:
#              A     B   C   D
# 2013-01-01   0   NaN   2   3
# 2013-01-02   4   5.0   6   7
# 2013-01-03   8   9.0  10  11
# 2013-01-04  12  13.0  14  15
# 2013-01-05  16  17.0  18  19
# 2013-01-06  20  21.0  22  23

# Set the second row, third column to NaN (same explicit cast for column C).
df['C'] = df['C'].astype('float64')
df.iloc[1, 2] = np.nan
print('3')
print(df)
# Expected output:
#              A     B     C   D
# 2013-01-01   0   NaN   2.0   3
# 2013-01-02   4   5.0   NaN   7
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

print('4')
# dropna() by default drops every row that contains at least one missing value.
print(df.dropna())
# Expected output:
#              A     B     C   D
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

print('5')
# Replace every missing value with 0 (returns a new frame; df is unchanged).
print(df.fillna(value=0))
# Expected output:
#              A     B     C   D
# 2013-01-01   0   0.0   2.0   3
# 2013-01-02   4   5.0   0.0   7
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

print('6')
# Boolean mask of the same shape: True wherever the value is missing.
print(pd.isnull(df))
# Expected output:
#                 A      B      C      D
# 2013-01-01  False   True  False  False
# 2013-01-02  False  False   True  False
# 2013-01-03  False  False  False  False
# 2013-01-04  False  False  False  False
# 2013-01-05  False  False  False  False
# 2013-01-06  False  False  False  False

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import pandas as pd
import numpy as np

# Concatenation demo: three frames that share the columns a-d and are filled
# with the constants 0.0, 1.0 and 2.0 respectively.
df1 = pd.DataFrame(np.zeros((4, 4)), columns=list('abcd'))
print(df1)
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
print(df2)
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
print(df3)

# Stack along the row axis (outer join by default); ignore_index=True
# renumbers the rows 0..9 instead of keeping each frame's own labels.
res = pd.concat([df1, df2, df3], axis='index', ignore_index=True)
print("(1) test")
print(res)
# Expected output:
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  1.0  1.0  1.0  1.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0
# 9  2.0  2.0  2.0  2.0

# Place the frames side by side along the column axis (outer join by default);
# ignore_index=True renumbers the columns 0..11. Row 3 exists only in df1, so
# the cells contributed by df2 and df3 are NaN there.
res1 = pd.concat([df1, df2, df3], axis='columns', ignore_index=True)
print("(2) test")
print(res1)
# Expected output:
#    0    1    2    3    4    5    6    7    8    9    10   11
# 0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 3  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import pandas as pd

# Merge two frames on one or more key columns (comparable to a database join).

# --- Single key -----------------------------------------------------------
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# Inner join (the default) on the shared 'key' column.
res = left.merge(right, on='key')
print(res)
# Expected output:
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3

# --- Composite key (key1, key2) --------------------------------------------
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print('1')
print(left)
print('2')
print(right)
print('3')
# `how` accepts 'left', 'right', 'outer' or 'inner'; 'inner' is the default.
# Inner join: only (key1, key2) pairs present in both frames survive.
res = left.merge(right, how='inner', on=['key1', 'key2'])
print("test inner join")
print(res)
# Expected output:
# test inner join
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2
print('4')
# Left outer join: every row of `left` is kept; rows with no match in `right`
# get NaN in columns C and D.
res = left.merge(right, how='left', on=['key1', 'key2'])
print("test left join")
print(res)
# Expected output:
# test left join
#   key1 key2   A   B    C    D
# 0   K0   K0  A0  B0   C0   D0
# 1   K0   K1  A1  B1  NaN  NaN
# 2   K1   K0  A2  B2   C1   D1
# 3   K1   K0  A2  B2   C2   D2
# 4   K2   K1  A3  B3  NaN  NaN

#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Plotting demo.

# Series: 1000 standard-normal samples indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
print('1')
print(data)
# Running total: each element becomes the sum of itself and all earlier ones.
data = data.cumsum()
# (data.plot() is intentionally left disabled here.)
print('2')
print(data)

# DataFrame: 1000 x 4 standard-normal samples in columns A-D.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
print('3')
print(data)
# Column-wise running total: each row is added to the sum of the rows above it.
data = data.cumsum()
print('4')
print(data)

# Available plot kinds include:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# First scatter layer (A vs B); the returned Axes is kept so the second layer
# can be drawn onto the same figure.
ax = data.plot(kind='scatter', x='A', y='B', color='DarkBlue', label="Class 1")
print('5')
print(ax)
# Second scatter layer (A vs C), drawn on the Axes created above.
data.plot(kind='scatter', x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()