#!/usr/bin/python
from __future__ import print_function
from __future__ import with_statement
import os #获取当前工作路径
import numpy as np
import pandas as pd
from pandas import * # Sereis, DataFrame
# Target CSV path inside the current working directory.
# os.path.join picks the platform's separator; the previous hard-coded '\\'
# only worked on Windows (on POSIX it produced a file literally named
# '<cwd-basename>\1.csv' in the parent directory).
file = os.path.join(os.getcwd(), '1.csv')
print(file)

# Build a 4x4 table holding 0..15 with row labels a-d and column labels w-z.
data = DataFrame(np.arange(16).reshape(4, 4), index=list('abcd'), columns=list('wxyz'))
print(data)
print(data['w'])  # select column 'w' dict-style; the result is a Series
print(data.w)  # select column 'w' via attribute access; the result is a Series
print(data[['w']])  # select column 'w' with a list of labels; the result is a DataFrame
print(data[['w', 'z']])  # select columns 'w' and 'z'

data = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})  # the data to save
print(data)
# index=False is the documented way to leave the row index out of the file.
data.to_csv(file, index=False, encoding='utf-8')
import pandas as pd
import numpy as np
# Form 1: every dict value is list-like, so each key becomes one column.
# np.nan is used for the missing value; the np.NaN alias was removed in NumPy 2.0.
data1 = {'A': range(3), 'B': list("abc"), 'C': ['red', np.nan, 'yellow']}
df1 = pd.DataFrame(data1)

# Form 2: every dict value is a scalar string. pandas cannot infer the number
# of rows from scalars alone, so an explicit index=[0] is supplied.
data2 = {'姓名': 'fuhang', '性别': '男', '昵称': '那时的吻真香'}
df2 = pd.DataFrame(data2, index=[0])
print(df2)

# index=False is the documented way to leave the row index out of the file.
df1.to_csv('Result1.csv', index=False, encoding='utf-8')
df2.to_csv('Result2.csv', index=False, encoding='utf-8')
# Append the same rows to Result1.csv. header=False keeps the column names
# from being written a second time in the middle of the file, and the
# encoding matches the initial write.
df1.to_csv('Result1.csv', index=False, mode='a', header=False, encoding='utf-8')
# ---- Section: setting values in a DataFrame ----
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
# NOTE: this section's own `from __future__ import print_function` was removed.
# A __future__ import is only legal at the very top of a file (anywhere else it
# is a SyntaxError), and print_function is already enabled at the top of this file.
import pandas as pd
import numpy as np

# Build a DatetimeIndex of six consecutive days starting at 2013-01-01.
dates = pd.date_range('20130101', periods=6)
print('1')
print(dates)

# numpy.random.randn(d0, d1, ..., dn) draws samples from the standard normal
# distribution; each argument is the size of one dimension of the result.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print('2')
print(df)

# Set the value in the third row, third column (position 2, 2) to 1111.
df.iloc[2, 2] = 1111
print('3')
print(df)

# Set the value at row label 2013-01-03, column 'D' to 2222.
df.loc['2013-01-03', 'D'] = 2222
print('3')
print(df)

# Zero out every positive value in column A. A single .loc assignment is used
# because the chained form `df.A[mask] = 0` writes through an intermediate
# object: it triggers SettingWithCopyWarning and does not modify df at all
# when pandas Copy-on-Write is active.
df.loc[df.A > 0, 'A'] = 0
print('4')
print(df)

# Add a column F filled entirely with NaN.
df['F'] = np.nan
print('5')
print(df)

# Add a column G; the Series carries the same date index, so its values are
# aligned to the rows by date.
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print('6')
print(df)
# ---- Section: handling missing values (NaN) ----
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
# NOTE: this section's own `from __future__ import print_function` was removed.
# A __future__ import is only legal at the very top of a file (anywhere else it
# is a SyntaxError), and print_function is already enabled at the top of this file.
import pandas as pd
import numpy as np

# date_range builds the sequence of dates used as the row index.
dates = pd.date_range('20130101', periods=6)
# Fill a 6x4 frame with 0..23; rows are labelled by date, columns by A-D.
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D'])
print('1')
print(df)
'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23'''
# Set the value in the first row, second column to NaN.
df.iloc[0,1] = np.nan
print('2')
print(df)
'''
A B C D
2013-01-01 0 NaN 2 3
2013-01-02 4 5.0 6 7
2013-01-03 8 9.0 10 11
2013-01-04 12 13.0 14 15
2013-01-05 16 17.0 18 19
2013-01-06 20 21.0 22 23
'''
# Set the value in the second row, third column to NaN.
df.iloc[1,2] = np.nan
print('3')
print(df)
'''
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
'''
print('4')
# Print the frame without the rows that contain a missing value.
print(df.dropna())  # by default dropna drops every row containing any missing value
'''
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
'''
print('5')
# Print the frame with every missing value replaced by 0.
print(df.fillna(value=0))
'''
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
'''
print('6')
# Print a boolean mask that is True wherever a value is missing.
print(pd.isnull(df))
'''
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False
'''
# ---- Section: concatenating DataFrames ----
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
# NOTE: this section's own `from __future__ import print_function` was removed.
# A __future__ import is only legal at the very top of a file (anywhere else it
# is a SyntaxError), and print_function is already enabled at the top of this file.
import pandas as pd
import numpy as np

# concatenating
# ignore index
# Three frames with the same columns, filled with 0.0, 1.0 and 2.0 respectively.
df1 = pd.DataFrame(np.ones((4,4))*0, columns=['a','b','c','d'])
print(df1)
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
print(df2)
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
print(df3)
# axis=0 stacks the frames along the row dimension; ignore_index=True
# renumbers the rows 0..9 as shown below.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print("(1) test")
print(res)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 1.0 1.0 1.0 1.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
9 2.0 2.0 2.0 2.0
'''
# axis=1 places the frames side by side along the column dimension. Row 3
# exists only in df1, so its cells from df2/df3 are NaN; ignore_index=True
# renumbers the columns 0..11 as shown below.
res1 = pd.concat([df1, df2, df3], axis=1, ignore_index=True)
print("(2) test")
print(res1)
'''
0 1 2 3 4 5 6 7 8 9 10 11
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
3 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN
'''
# ---- Section: merging DataFrames on key columns ----
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
# NOTE: this section's own `from __future__ import print_function` was removed.
# A __future__ import is only legal at the very top of a file (anywhere else it
# is a SyntaxError), and print_function is already enabled at the top of this file.
import pandas as pd

# merging two df by key/keys. (may be used in database)
# simple example
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
# Inner join (the default) on the single column 'key'.
res = pd.merge(left, right, on='key')
print(res)
'''
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
'''
# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print('1')
print(left)
print('2')
print(right)
print('3')
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # default for how='inner'
# how = ['left', 'right', 'outer', 'inner']
print("test inner join")
# Inner join on the composite key (key1, key2): only key pairs present in
# both frames are kept.
print(res)
'''
test inner join
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
'''
print('4')
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print("test left join")
# Left outer join: every row of `left` is kept; C/D are NaN where `right`
# has no matching key pair.
print(res)
'''
test left join
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
'''
# ---- Section: plotting Series / DataFrame data ----
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
# NOTE: this section's own `from __future__ import print_function` was removed.
# A __future__ import is only legal at the very top of a file (anywhere else it
# is a SyntaxError), and print_function is already enabled at the top of this file.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data
# Series: 1000 standard-normal samples indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
print('1')
print(data)
# Replace each value with the running total of itself and all values before it.
data = data.cumsum()
##data.plot()
print('2')
print(data)

# DataFrame: 1000 rows of standard-normal samples in columns A-D.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
print('3')
print(data)
# Column-wise running total: each row becomes itself plus all rows above it.
data = data.cumsum()
print('4')
print(data)

# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# First scatter series (A vs B); the returned Axes is kept so the second
# series can be drawn onto the same plot.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
print('5')
print(ax)
# Second scatter series (A vs C), drawn on the same Axes via ax=ax.
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()