Pandas
pandas 是一个基于 Python 编程语言的开源工具,快速、强大、易用,用于数据分析和数据集操作。
Series 是带索引的一维数组,底层基于 NumPy 数组。
import pandas as pd
import numpy as np
# Population of each country, in millions.
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
g7_pop.name = 'GPopulation in millions'
print(g7_pop)
print(g7_pop.dtype)
print(g7_pop.values)
print(type(g7_pop.values))
# With the default integer index the labels are 0, 1, 2, ...
print(g7_pop[0])
print(g7_pop[1])
print(g7_pop.index)
# The index can be replaced with explicit labels.
# 'Canada' is capitalised so it matches the dict-based construction below
# (the original lowercase 'canada' made g7_pop['Canada'] a KeyError).
g7_pop.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
print(g7_pop)
# A labelled Series looks a lot like a dict, but it also supports positional
# access and slicing. (Plain dicts do keep insertion order since Python 3.7,
# but they cannot be sliced or indexed by position.)
# The index and the name can also be passed in at construction time.
pd.Series({
    'Canada': 35.467,
    'France': 63.951,
    'Germany': 80.940,
    'Italy': 60.665,
    'Japan': 127.061,
    'United Kingdom': 64.511,
    'United States': 318.523,
}, name='GPopulation in millions')
# Build a Series from an existing one, keeping only the requested labels.
# A label that does not exist in the source ('Spain') gets NaN. The stray
# leading space in ' Italy' was removed so that Italy keeps its real value.
pd.Series(g7_pop, index=['France', 'Germany', 'Italy', 'Spain'])
indexing 根据索引取值
print(g7_pop)
# Look values up by index label.
for country in ('Canada', 'Japan'):
    print(g7_pop[country])
# Positional access through iloc still works after custom labels are set.
for position in (0, -1):
    print(g7_pop.iloc[position])
# Selecting several elements at once: pass a list of labels or positions.
wanted_labels = ['Italy', 'France']
print(g7_pop[wanted_labels])
wanted_positions = [0, 1]
print(g7_pop.iloc[wanted_positions])
# Pay special attention to slicing.
l = ['a', 'b', 'c']
print(l[:2])  # a list slice excludes the stop position: no 'c'
print(g7_pop['Canada':'Italy'])  # a label slice includes the last label
operations and methods
print(g7_pop)
# Arithmetic with a scalar is applied to every element.
scaled = g7_pop * 1_000_000
print(scaled)
# Aggregations return a single value.
average = g7_pop.mean()
print(average)
# NumPy functions accept a Series directly.
logged = np.log(g7_pop)
print(logged)
# Aggregate over a label slice (both end labels are included).
france_to_italy = g7_pop['France':'Italy']
print(france_to_italy.mean())
conditional selection (boolean arrays)
print(g7_pop)
# Comparing a Series with a scalar gives a Series of True/False values.
over_70 = g7_pop > 70
print(over_70)
# Indexing with that boolean Series keeps only the True entries.
print(g7_pop[over_70])
average = g7_pop.mean()
print(average)
print(g7_pop[g7_pop > average])
spread = g7_pop.std()
print(spread)
# Combine conditions with element-wise operators:
#   ~  not
#   |  or
#   &  and
# Each comparison needs its own parentheses.
is_large = g7_pop > 80
print(g7_pop[is_large | (g7_pop < 40)])
print(g7_pop[is_large & (g7_pop < 200)])
half_spread = spread / 2
above_lower_bound = g7_pop > average - half_spread
above_upper_bound = g7_pop > average + half_spread
print(g7_pop[above_lower_bound | above_upper_bound])
modifying series
# Assign to a single element by label.
g7_pop.loc['Canada'] = 40.5
print(g7_pop)
# Assign to a single element by position (here: the last one).
g7_pop.iloc[-1] = 500
print(g7_pop)
# Assign to every element selected by a boolean mask.
below_70 = g7_pop < 70
g7_pop.loc[below_70] = 99.99
print(g7_pop)
DataFrames
# Build a DataFrame from a dict that maps column name -> list of values.
# `columns` fixes the column order explicitly.
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523],
    'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
    'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
    'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
    'Continent': [
        'America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America',
    ],
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])
# A DataFrame has rows and columns, like a table. Each column is a Series,
# so a DataFrame can be seen as a collection of Series.
print(df)
# As with a Series, explicit index labels can be assigned.
# 'Canada' is capitalised (it was 'canada') so that label lookups such as
# df.loc['Canada'] and df.drop('Canada') find the row.
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
print(df)
print(df.columns)
print(df.index)
# info() reports each column's dtype and non-null count, which helps when
# cleaning data.
print(df.info())
print(df.size)
print(df.shape)
# describe() gives summary statistics for the columns that support them.
print(df.describe())
print(df.dtypes)
print(df.dtypes.value_counts())
indexing, selecting, slicing
print(df)
print(df.loc['Canada'])  # select a whole row by label
print(df.iloc[-1])  # select the last row by position
print(df['Population'])  # select a whole column
# Selecting a single row or a single column returns a Series either way.
print(df['Population'].to_frame())
print(df[['Population', 'GDP']])
print(df[1:3])
print(df.loc['Italy'])
# A stray '1' in front of this print made the line a syntax error.
print(df.loc['France':'Italy'])
# loc can select on both dimensions at once: rows first, then columns.
print(df.loc['France':'Italy', 'Population'])
print(df.loc['France':'Italy', ['Population', 'GDP']])
# iloc works the same way, using integer positions.
print(df)
print(df.iloc[0])
print(df.iloc[-1])
print(df.iloc[[0, 1, -1]])
print(df.iloc[1:3])
print(df.iloc[1:3, 3])
print(df.iloc[1:3, [0, 3]])
print(df.iloc[1:3, 1:3])
conditional selection (boolean arrays)
print(df)
# Boolean Series: True for the rows whose Population exceeds 70.
crowded = df['Population'] > 70
print(crowded)
# Use it as the row selector of loc; an optional second argument picks columns.
print(df.loc[crowded])
print(df.loc[crowded, 'Population'])
print(df.loc[crowded, ['Population', 'GDP']])
丢弃数据
# Every drop() call below returns a new DataFrame; df is not modified.
row_labels = ['Italy', 'Canada']
column_labels = ['Population', 'HDI']
print(df)
# Drop rows by label (the default axis).
print(df.drop('Canada'))
print(df.drop(['Canada', 'Japan']))
# Drop columns through the `columns` keyword.
print(df.drop(columns=column_labels))
# The axis can be given as a number ...
print(df.drop(row_labels, axis=0))
print(df.drop(column_labels, axis=1))
# ... or as a name.
print(df.drop(row_labels, axis='rows'))
print(df.drop(column_labels, axis='columns'))
操作
print(df)
pop_and_gdp = df[['Population', 'GDP']]
print(pop_and_gdp)
print(pop_and_gdp / 100)
# Broadcasting: the Series is aligned on the column labels and added to
# every row.
crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})
print(crisis)
gdp_and_hdi = df[['GDP', 'HDI']]
print(gdp_and_hdi + crisis)
# The operations above produced new objects; df itself is printed again here.
print(df)
修改dataframe,之前的操作统统都会返回一个新的dataframe
# Add a new column from a Series; values are aligned on the index labels.
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language',
)
df['Language'] = langs
# Only three labels are present in langs; that is fine, the remaining rows
# get NaN, which means "missing".
print(df)
df['Language'] = 'English'  # a scalar is written to every row
print(df)
renaming columns
# Mappings of old name -> new name for columns and for index labels.
column_renames = {
    'HDI': 'Human Development Index',
    'Anual Popcorn Consumption': 'APC',
}
index_renames = {
    'United States': 'USA',
    'United Kingdom': 'UK',
    'Argentina': 'AR',
}
df.rename(columns=column_renames, index=index_renames)
# Names that do not exist are simply not renamed, and rename() returns a new
# DataFrame: the original df is unchanged.
print(df)
# A callable can be used instead of a mapping; it is applied to every label.
print(df.rename(index=lambda label: label.upper()))
print(df.rename(index=str.lower))
删除和添加
# Drop a column in place.
df.drop(columns='Language', inplace=True)
# Append a row; this returns a new DataFrame and leaves df untouched.
# DataFrame.append was removed in pandas 2.0, so the named Series is turned
# into a one-row frame and concatenated instead.
new_row = pd.Series({
    'Population': 3,
    'GDP': 5,
}, name='China')
pd.concat([df, new_row.to_frame().T])
print(df)
# Assigning to a new label through loc adds the row to df directly.
df.loc['China'] = pd.Series({'Population': 1_400_000_000, 'Continent': 'Asia'})
print(df)
# Drop a single row in place.
df.drop('China', inplace=True)
print(df)
# More radical index changes; both calls return new DataFrames.
df.reset_index()
df.set_index('Population')
通过其它列创建新的列
print(df)
print(df[['Population', 'GDP']])
# Column arithmetic is element-wise and returns a Series.
print(df['GDP'] / df['Population'])
# Store the result as a new column. The expression was previously split
# after '/' without a continuation, which is a syntax error.
df['GDP Per capita'] = df['GDP'] / df['Population']
print(df)
统计信息
print(df)
print(df.head())
print(df.describe())
# Work with a single column as a Series.
population = df['Population']
# Individual summary statistics (results are not stored or printed here).
population.min()
population.max()
population.mean()
population.std()
population.median()
# All of the common statistics in one call.
population.describe()
# A single quantile, then several at once.
population.quantile(0.25)
population.quantile([0.2, 0.4, 0.6, 0.8, 1])
reading external data
import numpy as np
import pandas as pd
# NOTE(review): the directory is spelled 'dta/' in all three calls; confirm
# it is not meant to be 'data/'.
# Default read: the first line of the file is used as the header.
df = pd.read_csv('dta/btc-market-price.csv')
print(df.head())
# header=None: the file has no header row, so the first line is data.
df = pd.read_csv('dta/btc-market-price.csv', header=None)
print(df.head())
df.columns = ['Timestamp', 'Price']
print(df.head())
print(df.shape)
print(df.info())
print(df.tail())
print(df.tail(3))
# The Timestamp column is read as object dtype, not as a datetime type.
print(df.dtypes)
print(pd.to_datetime(df['Timestamp']).head())
# The column name was previously split across two lines inside the string
# literal, which is a syntax error.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(df.head())
print(df.dtypes)
df.set_index('Timestamp', inplace=True)
print(df.head())
# With a datetime index, rows can conveniently be looked up by date.
print(df.loc['2017-09-29'])
# A better way: do all of the steps above in a single read_csv call.
df = pd.read_csv(
    'dta/btc-market-price.csv',
    header=None,
    names=['Timestamp', 'Price'],
    index_col=0,  # use the first column as the index
    parse_dates=True,
)
print(df.head())
pandas 绘图
import matplotlib.pyplot as plt

print(df)
print(df.plot())
# The same plot made with matplotlib directly.
plt.plot(df.index, df['Price'])
# Sample x values for the pure-matplotlib demo; `x` was previously undefined.
x = np.arange(-10, 11)
plt.plot(x, x**2)
plt.plot(x, -1*(x**2))
plt.figure(figsize=(12,6))
plt.plot(x, x**2)
plt.plot(x, -1*(x**2))
plt.title('My Nice Plot')
# Some of the arguments of plt.figure() and plt.plot() are also accepted by
# the pandas plot method.
df.plot(figsize=(16,9), title='Bitcoin Price2017-2022')
df.plot.hist() # also available: bar() pie() barh()
# Plot two DataFrame columns at the same time?
# NOTE(review): `prices` is not defined anywhere in the visible file;
# presumably a date-indexed DataFrame with one column per price series.
# Define it before running the two lines below.
# figsize must be passed as a keyword; it was written as a call, figsize(12,6).
prices.plot(figsize=(12,6))
# Plot only the data inside a date range?
prices.loc['2017-12-01':'2018-01-01'].plot(figsize=(12,6))
sorting and functions
# NOTE(review): these snippets assume df has 'Name' and 'Age' columns;
# confirm which DataFrame they are meant to run against.
# apply() calls the function on each column and returns a new object.
df.apply(np.sqrt)
df.apply(lambda x: x / 10)
# Iterating over a DataFrame yields its column labels.
for x in df:
    print(x)
# Series.iteritems() was removed in pandas 2.0; items() is the replacement.
# The column is spelled 'Age' to match the sort_values call below.
for key, value in df['Age'].items():
    print(f"{key}:{value}")
# iterrows() yields (index label, row as Series) tuples.
for row in df.iterrows():
    print(row)
df.sort_index(inplace=True)
print(df)
df.sort_values(by=['Name', 'Age'], inplace=True)
print(df)
# to_csv is a DataFrame method (the bare `to_csv()` was a NameError);
# without a path it returns the CSV text as a string.
df.to_csv()
df.T  # transpose
df.sum()  # column sums
df.prod()  # column products
join、merge
# Two small tables that share the key column 'SSN'.
names = {
    'SSN': [2, 5, 7, 8],
    'Name': ['Anna', 'Bob', 'John', 'Mike'],
}
ages = {
    'SSN': [1, 2, 3, 4],
    'Age': [28, 34, 45, 62],
}
# Each table needs its own variable; both were previously assigned to `df`,
# so the df1/df2 used by merge() never existed.
df1 = pd.DataFrame(names)
df2 = pd.DataFrame(ages)
# how='outer' keeps keys from both tables and fills the gaps with NaN;
# 'left' and 'right' keep only the keys of that one table.
df = pd.merge(df1, df2, on='SSN', how='outer')
# set_index is the method name (df.set does not exist).
df.set_index('SSN', inplace=True)
print(df)
处理缺失值
import numpy as np
import pandas as pd
# Boolean mask marking the missing cells.
df.isna()
# New DataFrame without the rows that contain missing values.
df.dropna()
# fillna() needs a fill value (calling it with no arguments raises
# ValueError); 0 is used here as an example replacement.
df.fillna(0)