# Pandas 简介:
# Python 数据分析库 (library)
# 是基于 NumPy 构建的一个 library
# 有一种用 Python 做 Excel/SQL/R 的感觉
# 现在流行的机器学习框架 TensorFlow/PyTorch 等等,语法都与 NumPy 比较接近
# encoding=utf-8
import numpy as np
import pandas as pd
# NOTE: the "Sample output" comments below are illustrative only; exact
# spacing, dtype names and class paths depend on the installed pandas version.


def _show(obj):
    """Print *obj*, a newline separator, and the type of *obj*."""
    print(obj, "\n", type(obj))


def _build_frames():
    """Create the two demo frames (default index / labelled index) and print them.

    Returns:
        A ``(data_frame, data_frame_index)`` tuple.
    """
    # 1. Create a DataFrame; it can be built from a dictionary or a list.
    data = {
        "city": ["beijing", "shanghai", "guangzhou", "shenzhen", "wuhan"],
        "year": [2016, 2017, 2016, 2017, 2016],
        "population": [2100, 2300, 1000, 700, 500],
    }

    # Column names and their order can be given explicitly. A requested
    # column that does not exist in the data ("debt") is created and filled
    # with NaN.
    data_frame = pd.DataFrame(data, columns=["city", "year", "population", "debt"])
    # The assignment demo later overwrites an entire row (including "city")
    # with integers. Pin the column to object dtype so this keeps working on
    # pandas versions that may infer a dedicated string dtype, which is
    # expected to reject non-string values. On older versions this is a no-op.
    data_frame["city"] = data_frame["city"].astype(object)
    _show(data_frame)
    # Sample output:
    #         city  year  population debt
    # 0    beijing  2016        2100  NaN
    # 1   shanghai  2017        2300  NaN
    # 2  guangzhou  2016        1000  NaN
    # 3   shenzhen  2017         700  NaN
    # 4      wuhan  2016         500  NaN
    #  <class 'pandas.core.frame.DataFrame'>

    # Row labels (and their order) can be supplied through ``index``; passing
    # more labels than there are rows raises an error.
    data_frame_index = pd.DataFrame(
        data, index=["one", "two", "three", "four", "five"]
    )
    _show(data_frame_index)
    # Sample output (columns follow the dict's insertion order):
    #             city  year  population
    # one      beijing  2016        2100
    # two     shanghai  2017        2300
    # three  guangzhou  2016        1000
    # four    shenzhen  2017         700
    # five       wuhan  2016         500
    #  <class 'pandas.core.frame.DataFrame'>

    return data_frame, data_frame_index


def _demo_selection(data_frame, data_frame_index):
    """Print examples of column, row, slice, label and boolean selection."""
    # 2. Selecting data from a DataFrame.
    # Selecting a single column yields a Series.
    city_series = data_frame["city"]
    _show(city_series)
    # Sample output:
    # 0      beijing
    # 1     shanghai
    # 2    guangzhou
    # 3     shenzhen
    # 4        wuhan
    # Name: city, dtype: object
    #  <class 'pandas.core.series.Series'>

    # data_frame["city"] can also be written as data_frame.city.
    print(data_frame.city)

    # Reading a single row: ``.loc`` looks the row up by its index label
    # (``.ix`` no longer exists in current pandas).
    one_series = data_frame.loc[1]
    _show(one_series)
    # Sample output:
    # city          shanghai
    # year              2017
    # population        2300
    # debt               NaN
    # Name: 1, dtype: object
    #  <class 'pandas.core.series.Series'>

    # Rows have no attribute-style access (there is no ``data_frame.loc.1``);
    # use ``data_frame.loc[index_label]`` instead.

    # Positional slicing with ``.iloc`` is half-open: the stop is excluded.
    print(data_frame.iloc[2:-1])
    # Sample output:
    #         city  year  population debt
    # 2  guangzhou  2016        1000  NaN
    # 3   shenzhen  2017         700  NaN

    # Label slicing with ``.loc`` is closed: both end labels are included.
    print(data_frame_index.loc["one":"three"])
    # Sample output:
    #             city  year  population
    # one      beijing  2016        2100
    # two     shanghai  2017        2300
    # three  guangzhou  2016        1000

    # Passing lists of row and column labels behaves less like a slice and
    # more like picking coordinates: ["one", "city"], ["one", "year"], ...
    print(data_frame_index.loc[["one", "three"], ["city", "year"]])
    # Sample output:
    #             city  year
    # one      beijing  2016
    # three  guangzhou  2016

    # Label ranges work on both axes; for columns only, use
    # ``[:, "city":"year"]``.
    print(data_frame_index.loc["one":"three", "city":"year"])
    # Sample output (with column order city, year, population):
    #             city  year
    # one      beijing  2016
    # two     shanghai  2017
    # three  guangzhou  2016

    # Boolean indexing filters rows.
    print(data_frame_index[data_frame_index["population"] > 2000])
    # Sample output:
    #          city  year  population
    # one   beijing  2016        2100
    # two  shanghai  2017        2300

    print(data_frame_index[data_frame_index.index == "one"])
    # Sample output:
    #         city  year  population
    # one  beijing  2016        2100

    # Inspect the column labels.
    print(data_frame.columns)
    print(type(data_frame.columns))
    # Sample output:
    # Index(['city', 'year', 'population', 'debt'], dtype='object')
    # <class 'pandas.core.indexes.base.Index'>

    # Inspect the row index.
    print(data_frame.index)
    print(type(data_frame.index))
    # Sample output:
    # RangeIndex(start=0, stop=5, step=1)
    # <class 'pandas.core.indexes.range.RangeIndex'>


def _demo_assignment(data_frame):
    """Mutate *data_frame* in place to show cell, row and column assignment."""
    # 3. Assigning values into a DataFrame.
    # Assign a single cell. Use one ``.loc[row, column]`` call: chained forms
    # such as data_frame["population"][0] = ... or
    # data_frame.loc[4]["year"] = ... write to a temporary object and may
    # leave the frame unchanged.
    data_frame.loc[0, "population"] = 2222
    # Assign a whole column.
    data_frame["debt"] = -200000
    # Assign a whole row.
    data_frame.loc[4] = 0
    print(data_frame)
    # Sample output:
    #         city  year  population    debt
    # 0    beijing  2016        2222 -200000
    # 1   shanghai  2017        2300 -200000
    # 2  guangzhou  2016        1000 -200000
    # 3   shenzhen  2017         700 -200000
    # 4          0     0           0       0

    # An ndarray can also be assigned to a column or to a row.
    data_frame["debt"] = np.arange(0, 5)
    data_frame.loc[4] = np.arange(1, 5)
    print(data_frame)
    # Sample output:
    #         city  year  population  debt
    # 0    beijing  2016        2222     0
    # 1   shanghai  2017        2300     1
    # 2  guangzhou  2016        1000     2
    # 3   shenzhen  2017         700     3
    # 4          1     2           3     4

    # Assigning a Series aligns on the index: rows whose label is missing
    # from the Series become NaN. This also works with named index labels.
    value_series = pd.Series([100, 200, 300])
    data_frame.debt = value_series
    print(data_frame)
    # Sample output:
    #         city  year  population   debt
    # 0    beijing  2016        2222  100.0
    # 1   shanghai  2017        2300  200.0
    # 2  guangzhou  2016        1000  300.0
    # 3   shenzhen  2017         700    NaN
    # 4          1     2           3    NaN

    # Item assignment adds a new column. Attribute assignment
    # (data_frame.western = ...) cannot be used while the column does not
    # exist yet.
    data_frame["western"] = data_frame.city == "beijing"
    print(data_frame)
    # Sample output:
    #         city  year  population   debt  western
    # 0    beijing  2016        2222  100.0     True
    # 1   shanghai  2017        2300  200.0    False
    # 2  guangzhou  2016        1000  300.0    False
    # 3   shenzhen  2017         700    NaN    False
    # 4          1     2           3    NaN    False


def _demo_transpose_and_rebuild(data_frame):
    """Print a transpose of *data_frame*, then rebuild a frame from Series.

    Returns:
        The small city-by-year population frame assembled from two Series.
    """
    # 4. Transposing a DataFrame and reorganising data.
    print(data_frame.T)
    # Sample output:
    #                   0         1          2         3      4
    # city        beijing  shanghai  guangzhou  shenzhen      1
    # year           2016      2017       2016      2017      2
    # population     2222      2300       1000       700      3
    # debt          100.0     200.0      300.0       NaN    NaN
    # western        True     False      False     False  False

    # Reorganising data: a dict of dicts maps outer keys to columns and
    # inner keys to row labels.
    populations = {
        "beijing": {2016: 2100, 2017: 2200},
        "shanghai": {2015: 2400, 2016: 2500, 2017: 2600},
    }
    # Depending on the pandas version, the row labels may come out in
    # first-seen order (2016, 2017, 2015) rather than sorted. Sort explicitly
    # so the "drop the last year" slices below always drop 2017.
    pop_data_frame = pd.DataFrame(populations).sort_index()
    print(pop_data_frame)
    # Sample output:
    #       beijing  shanghai
    # 2015      NaN      2400
    # 2016   2100.0      2500
    # 2017   2200.0      2600

    print(pop_data_frame.T)
    # Sample output:
    #             2015    2016    2017
    # beijing      NaN  2100.0  2200.0
    # shanghai  2400.0  2500.0  2600.0

    # Take each city's population Series, leaving out the last row.
    beijing_series = pop_data_frame["beijing"].iloc[:-1]
    print(beijing_series)
    # Sample output:
    # 2015       NaN
    # 2016    2100.0
    # Name: beijing, dtype: float64

    shanghai_series = pop_data_frame["shanghai"].iloc[:-1]
    print(shanghai_series)
    # Sample output:
    # 2015    2400
    # 2016    2500
    # Name: shanghai, dtype: int64

    # Rebuild a DataFrame from the two Series.
    pdata_frame = pd.DataFrame(
        {"beijing": beijing_series, "shanghai": shanghai_series}
    )
    print(pdata_frame)
    # Sample output:
    #       beijing  shanghai
    # 2015      NaN      2400
    # 2016   2100.0      2500

    return pdata_frame


def _demo_axis_names_and_values(pdata_frame):
    """Name both axes of *pdata_frame* and print its underlying values."""
    # 5. A few commonly used features.
    # Give the column axis and the row index a name.
    pdata_frame.columns.name = "city"
    pdata_frame.index.name = "year"
    print(pdata_frame)
    # Sample output:
    # city  beijing  shanghai
    # year
    # 2015      NaN      2400
    # 2016   2100.0      2500

    # ``.values`` exposes the data as a NumPy array.
    print(pdata_frame["beijing"].values, "\n", type(pdata_frame.values))
    # Sample output:
    # [  nan 2100.]
    #  <class 'numpy.ndarray'>


def main():
    """Run the pandas DataFrame walkthrough, printing every intermediate result.

    A DataFrame is a table: where a Series is a one-dimensional array, a
    DataFrame is two-dimensional. It can be thought of as an Excel
    spreadsheet, as a database table, or as a collection of Series.

    The walkthrough covers, in order: creating frames, selecting data,
    assigning values, transposing/reorganising, and a few common attributes.
    """
    data_frame, data_frame_index = _build_frames()
    _demo_selection(data_frame, data_frame_index)
    # The assignment demo mutates data_frame in place; the transpose step
    # below prints that mutated frame.
    _demo_assignment(data_frame)
    pdata_frame = _demo_transpose_and_rebuild(data_frame)
    _demo_axis_names_and_values(pdata_frame)
# Run the walkthrough only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()