python_Pandas介绍

1687F

已于 2024-03-08 15:28:23 修改

阅读量388

点赞数 8

文章标签： python pandas 开发语言

于 2024-03-08 11:03:11 首次发布

本文链接：https://blog.csdn.net/m0_61581389/article/details/136555603

版权

Pandas

pandas 是一个基于 Python 编程语言的开源工具，快速、强大且易用，用于数据分析和数据集操作。

Series，一维数组，背后基于numpy

import pandas as pd
import numpy as np
# Population of the G7 countries, in millions.
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
g7_pop.name = 'GPopulation in millions'
print(g7_pop)
print(g7_pop.dtype)
print(g7_pop.values)
print(type(g7_pop.values))
# With the default integer index, the labels 0, 1, ... coincide with positions.
print(g7_pop[0])
print(g7_pop[1])
print(g7_pop.index)

# Replace the default index with country names. The labels must be spelled
# exactly as later lookups expect them (e.g. g7_pop['Canada']).
g7_pop.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
print(g7_pop)
# A labelled Series now looks a lot like a dict (label -> value). Unlike a
# dict it also supports positional access, slicing and vectorised operations.

# The index and the name can also be supplied directly at construction time.
g7_pop_from_dict = pd.Series({
    'Canada': 35.467,
    'France': 63.951,
    'Germany': 80.940,
    'Italy': 60.665,
    'Japan': 127.061,
    'United Kingdom': 64.511,
    'United States': 318.523,
}, name='GPopulation in millions')
print(g7_pop_from_dict)

# Building a Series from another Series with an explicit index re-indexes it:
# labels that exist are kept, labels that do not exist ('Spain') become NaN.
g7_subset = pd.Series(g7_pop, index=['France', 'Germany', 'Italy', 'Spain'])
print(g7_subset)

indexing 根据索引取值

# Label-based access on the Series.
print(g7_pop)
print(g7_pop['Canada'])
print(g7_pop['Japan'])
# Even with a custom index, values can still be fetched by position via iloc.
print(g7_pop.iloc[0])
print(g7_pop.iloc[-1])
# Multi indexing: pass a list of labels (or positions) to get several values.
print(g7_pop[['Italy', 'France']])
print(g7_pop.iloc[[0,1]])
# Pay special attention to slicing.
l = ['a','b','c']
print(l[:2]) # a plain list slice excludes the upper bound, so 'c' is not returned
print(g7_pop['Canada':'Italy']) # a label slice on a Series DOES include the last label

operations and methods

# Arithmetic, aggregation and NumPy functions applied to the whole Series.
print(g7_pop)
# Multiply every element by one million.
print(g7_pop * 1_000_000)
print(g7_pop.mean())
# NumPy functions can be applied directly to a Series.
print(np.log(g7_pop))
# Mean of the label slice from 'France' to 'Italy'.
print(g7_pop['France':'Italy'].mean())

conditional selection (boolean arrays)

# Boolean (conditional) selection on a Series.
print(g7_pop)
print(g7_pop>70) # element-wise comparison: one True/False per element
print(g7_pop[g7_pop>70]) # use the boolean result to select matching elements
print(g7_pop.mean())
print(g7_pop[g7_pop>g7_pop.mean()])
print(g7_pop.std())
# Operators for combining boolean masks (each operand needs parentheses):
# ~ not
# | or
# & and
print(g7_pop[(g7_pop>80) | (g7_pop<40)])
print(g7_pop[(g7_pop>80) & (g7_pop<200)])

# NOTE(review): both sides of the `|` use `>`, and anything above
# mean + std/2 is also above mean - std/2, so the second condition adds
# nothing. Possibly `<` was intended on the first side (values outside the
# band around the mean) -- confirm the intent before changing.
print(g7_pop[(g7_pop>g7_pop.mean() -
g7_pop.std()/2) | (g7_pop>g7_pop.mean() +
g7_pop.std()/2)])

modifying series

# Modify the Series in place: by label, by position, and through a boolean mask.
g7_pop['Canada'] = 40.5
print(g7_pop)
# Overwrite the last element by position.
g7_pop.iloc[-1] = 500
print(g7_pop)
# Overwrite every element below 70 at once.
g7_pop[g7_pop<70] = 99.99
print(g7_pop)

DataFrames

# Build a DataFrame of G7 indicators: every dict key becomes a column, and
# `columns=` fixes the column order explicitly.
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075,
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067,
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915,
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America',
    ],
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])

# Rows and columns, like a table. Every column of a DataFrame is a Series, so
# a DataFrame can be seen as a collection of Series sharing one index.
print(df)

# As with a Series, the index can be replaced with meaningful labels. The
# spelling matters: later lookups use 'Canada', so the label must match exactly.
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
print(df)
print(df.columns)
print(df.index)
# Column dtypes and non-null counts; useful as a first step of data cleaning.
print(df.info())
print(df.size)
print(df.shape)
# Summary statistics for the columns that can be summarised.
print(df.describe())
print(df.dtypes)
print(df.dtypes.value_counts())

indexing, selecting, slicing

print(df)
print(df.loc['Canada'])  # select a whole row by label
print(df.iloc[-1])  # select the last row by position
print(df['Population'])  # select a whole column
# Whether one row or one column is selected, the result is a Series;
# to_frame() turns it back into a one-column DataFrame.
print(df['Population'].to_frame())
print(df[['Population', 'GDP']])
# A bare slice inside [] selects rows by position.
print(df[1:3])
print(df.loc['Italy'])
print(df.loc['France':'Italy'])

# Select on both dimensions at once: rows first, then columns.
print(df.loc['France':'Italy', 'Population'])
print(df.loc['France':'Italy', ['Population', 'GDP']])

# The same works positionally with iloc.
print(df)
print(df.iloc[0])
print(df.iloc[-1])
print(df.iloc[[0, 1, -1]])
print(df.iloc[1:3])
print(df.iloc[1:3, 3])
print(df.iloc[1:3, [0, 3]])
print(df.iloc[1:3, 1:3])

conditional selection (boolean arrays)

# Boolean (conditional) selection on a DataFrame.
print(df)
# Element-wise comparison on one column gives one True/False per row.
print(df['Population']>70)
# Use that mask inside loc to keep only the matching rows ...
print(df.loc[df['Population']>70])
# ... optionally restricting the result to one column ...
print(df.loc[df['Population']>70,
'Population'])
# ... or to a list of columns.
print(df.loc[df['Population']>70,
['Population', 'GDP']])

丢弃数据

# Dropping rows and columns. None of these calls uses inplace=True, so each
# result is only printed and `df` is not reassigned.
print(df)
# Drop rows by index label (a single label or a list of labels).
print(df.drop('Canada'))
print(df.drop(['Canada', 'Japan']))
# Drop columns by name.
print(df.drop(columns=['Population', 'HDI']))
# The axis argument chooses what the labels refer to: 0 / 'rows' for index
# labels, 1 / 'columns' for column names.
print(df.drop(['Italy', 'Canada'], axis=0))
print(df.drop(['Population', 'HDI'], axis=1))
print(df.drop(['Italy', 'Canada'],
axis='rows'))
print(df.drop(['Population', 'HDI'],
axis='columns'))

操作

# Arithmetic on DataFrames.
print(df)
print(df[['Population', 'GDP']])
# A scalar operation is applied to every selected cell.
print(df[['Population', 'GDP']] / 100)
# Broadcasting: a Series indexed by column name is combined with the matching
# columns of the DataFrame.
crisis = pd.Series([-1_000_000, -0.3], index=
['GDP', 'HDI'])
print(crisis)
print(df[['GDP','HDI']] + crisis)
# The addition above is only printed; df itself is printed again for comparison.
print(df)

修改dataframe，之前的操作统统都会返回一个新的dataframe

# Add a new column from a Series that covers only some of the rows.
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language'
)
df['Language'] = langs
# The Series only has values for a few countries, which is fine: rows without
# a matching index label get NaN, meaning "missing".
print(df)
df['Language'] = 'English'  # assigning a scalar overwrites every row
print(df)

renaming columns

# Rename columns and index labels through mappings. The result of this call
# is not assigned, so it is discarded.
df.rename(
columns={
       'HDI': 'Human Development Index',
       'Anual Popcorn Consumption': 'APC'
  }, index={
       'United States': 'USA',
       'United Kingdom': 'UK',
       'Argentina': 'AR'
  }
)
print(df)  # names that do not exist are simply not renamed; rename returns a new DataFrame, so df is unchanged
# A callable can be used instead of a mapping; it is applied to every index label.
print(df.rename(index=str.upper))
print(df.rename(index=lambda x: x.lower()))

删除和添加

# Drop a column in place.
df.drop(columns='Language', inplace=True)

# Appending a row returns a NEW DataFrame and leaves df untouched.
# DataFrame.append was removed in pandas 2.0, so the row is turned into a
# one-row frame and combined with pd.concat instead. The result is discarded
# on purpose; the print below shows that df itself did not change.
china_row = pd.Series({
    'Population': 3,
    'GDP': 5,
}, name='China')
pd.concat([df, china_row.to_frame().T])
print(df)

# Assigning to a new label with loc adds the row to df directly; columns that
# are missing from the Series are filled with NaN.
df.loc['China'] = pd.Series({'Population': 1_400_000_000, 'Continent': 'Asia'})
print(df)
# Remove a row in place.
df.drop('China', inplace=True)
print(df)

# More radical index changes. Both calls return new DataFrames, which are
# discarded here because the results are not assigned.
df.reset_index()
df.set_index('Population')

通过其它列创建新的列

# Create a new column from existing columns.
print(df)
print(df[['Population', 'GDP']])
# Column arithmetic is element-wise and aligned on the index.
print(df['GDP'] / df['Population'])
# The expression must stay on one logical line: a line break right after `/`
# without parentheses or a backslash is a syntax error.
df['GDP Per capita'] = df['GDP'] / df['Population']
print(df)

统计信息

# Summary statistics.
print(df)
print(df.head())
print(df.describe())
# The calls below are bare expressions (notebook style): their results are
# not printed when this runs as a plain script.
population = df['Population']
population.min()
population.max()
population.mean()
population.std()
population.median()
population.describe()
# A single quantile, then several quantiles at once.
population.quantile(.25)
population.quantile([.2, .4, .6, .8, 1])

reading external data

import numpy as np
import pandas as pd
# Default read: the first line of the file is used as the header row.
df = pd.read_csv('dta/btc-market-price.csv')
print(df.head())

# header=None treats the first line as data; the columns are named afterwards.
df = pd.read_csv('dta/btc-market-price.csv', header=None)
print(df.head())
df.columns = ['Timestamp', 'Price']
print(df.head())
print(df.shape)
print(df.info())
print(df.tail())
print(df.tail(3))

# The timestamp column is read as dtype object, not as a datetime type.
print(df.dtypes)
print(pd.to_datetime(df['Timestamp']).head())
# The column name must be a single intact string literal; splitting it across
# two lines is a syntax error.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(df.head())
print(df.dtypes)
df.set_index('Timestamp', inplace=True)
print(df.head())
# With a datetime index, rows can be looked up conveniently by date string.
print(df.loc['2017-09-29'])

# All of the steps above can be done in a single read_csv call.
df = pd.read_csv(
    'dta/btc-market-price.csv',
    header=None,
    names=['Timestamp', 'Price'],
    index_col=0,  # use the first column as the index
    parse_dates=True
)
print(df.head())

pandas 绘图

print(df)
print(df.plot())

import matplotlib.pyplot as plt

# Sample x values for the parabola examples below; the calls need some
# numeric array and none was defined before them.
x = np.arange(-10, 11)

plt.plot(df.index, df['Price'])
plt.plot(x, x**2)
plt.plot(x, -1*(x**2))

plt.figure(figsize=(12,6))
plt.plot(x, x**2)
plt.plot(x, -1*(x**2))
plt.title('My Nice Plot')

# Some arguments of plt.figure() and plt.plot() are also accepted by the
# pandas plot method.
df.plot(figsize=(16,9), title='Bitcoin Price2017-2022')
df.plot.hist() # other kinds: bar() pie() barh()

# NOTE(review): `prices` is not defined anywhere in the visible file --
# presumably a DataFrame with several price columns and a datetime index;
# confirm, or substitute `df`.
# Plot every column of a DataFrame at once (figsize is a keyword argument).
prices.plot(figsize=(12,6))
# Plot only a date range by slicing the datetime index first.
prices.loc['2017-12-01':'2018-01-01'].plot(figsize=(12,6))

sorting and functions

# Apply a function to the DataFrame; both results are discarded here.
df.apply(np.sqrt)
df.apply(lambda x: x/10)

# Iterating over a DataFrame yields its column names.
for x in df:
    print(x)

# Iterate over (index label, value) pairs of one column. Series.iteritems()
# was removed in pandas 2.0; items() is the replacement.
# NOTE(review): the column key was 'age' here but 'Age' in the sort below;
# 'Age' is used for consistency -- confirm against the actual frame.
for key, value in df['Age'].items():
    print("{}:{}".format(key, value))

# iterrows() yields (index label, row Series) tuples.
for row in df.iterrows():
    print(row)

df.sort_index(inplace=True)
print(df)
df.sort_values(by=['Name', 'Age'], inplace=True)
print(df)

# Other frequently used members; results are discarded here.
df.to_csv()  # with no path argument the CSV text is returned as a string
df.T
df.sum()
df.prod()

join、merge

# Two small tables sharing the key column 'SSN'; only SSN 2 appears in both.
names = {
    'SSN': [2, 5, 7, 8],
    'Name': ['Anna', 'Bob', 'John', 'Mike']
}

ages = {
    'SSN': [1, 2, 3, 4],
    'Age': [28, 34, 45, 62]
}

# Each table needs its own variable so both can be passed to merge.
df1 = pd.DataFrame(names)
df2 = pd.DataFrame(ages)

# how='outer' keeps every SSN from either side and fills the gaps with NaN;
# other options include 'left' and 'right'.
df = pd.merge(df1, df2, on='SSN', how='outer')
df.set_index('SSN', inplace=True)
print(df)

处理缺失值

import numpy as np
import pandas as pd

# Handling missing values. Each call returns a new object; the results are
# discarded here because they are not assigned.
df.isna()     # boolean mask marking the missing cells
df.dropna()   # drop rows that contain missing values
# fillna() needs a fill value; calling it with no argument raises ValueError.
df.fillna(0)

1687F

关注

8
点赞
踩
5

收藏

觉得还不错? 一键收藏
打赏
0
评论
python_Pandas介绍

pandas是一个基于python编程语言，快速的、强大的、易用的开源数据分析和操作数据集的工具。
复制链接

扫一扫