Pandas 简介:
Python 数据分析库(library)
是基于 NumPy 构建的一个库
用起来有一种用 Python 做 Excel/SQL/R 的感觉
现在流行的机器学习框架(TensorFlow/PyTorch 等)的语法都与 NumPy 比较接近
# encoding=utf-8
import numpy as np
import pandas as pd
def main():
    """Walk through the basics of ``pandas.Series`` and print each result.

    A Series is a one-dimensional labelled data structure. The demo covers
    construction, custom indexes, selection, slicing, boolean indexing,
    assignment, arithmetic with index alignment, and missing-value checks.

    The "Expected output" comments are illustrative; exact column spacing and
    number formatting can differ between pandas versions.
    """
    # A Series can be built from a list or from a dictionary.
    s = pd.Series(["chenkai", "man", 178.00, 65, -1])
    print(s, "\n", type(s))
    # 1. By default pandas labels the rows 0..n-1, but a custom index can be
    #    supplied instead.
    # Expected output:
    #   0  chenkai
    #   1  man
    #   2  178
    #   3  65
    #   4  -1
    #   dtype: object
    #   <class 'pandas.core.series.Series'>

    # The index must have exactly as many labels as there are values,
    # otherwise the constructor raises an error.
    s = pd.Series(
        ["chenkai", "man", 178.00, 65, -1],
        index=["one", "two", "three", "four", "five"],
    )
    print(s, "\n", type(s))
    # Expected output:
    #   one    chenkai
    #   two    man
    #   three  178
    #   four   65
    #   five   -1
    #   dtype: object
    #   <class 'pandas.core.series.Series'>

    # 2. Build a Series from a dictionary: a Series is essentially a set of
    #    key/value pairs. The None value becomes NaN.
    cities = {
        "beijing": 50000,
        "shanghai": 40000,
        "shenzhen": 40000,
        "hangzhou": 30000,
        "wuhan": 20000,
        "suzhou": None,
    }
    # A Series keeps whatever row order its index was built with, so slices
    # depend on that order. Sorting the index here makes the positional and
    # label slices below deterministic regardless of how the dict was ordered.
    apts = pd.Series(cities).sort_index()
    print(apts, "\n", type(apts))
    # Expected output:
    #   beijing   50000.0
    #   hangzhou  30000.0
    #   shanghai  40000.0
    #   shenzhen  40000.0
    #   suzhou    NaN
    #   wuhan     20000.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # 3. Selecting data: a single label returns a scalar, a list of labels
    #    returns a Series.
    print(apts["wuhan"], "\n", type(apts["wuhan"]))  # 20000.0 <class 'numpy.float64'>
    # Select several values at once.
    many_apts = apts[["beijing", "shanghai", "wuhan"]]
    print(many_apts, "\n", type(many_apts))
    # Expected output:
    #   beijing   50000.0
    #   shanghai  40000.0
    #   wuhan     20000.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # Positional slicing (.iloc) excludes the end position.
    print("=" * 50, apts.iloc[1:3])
    # Expected output:
    #   hangzhou  30000.0
    #   shanghai  40000.0
    #   dtype: float64

    # Label slicing (.loc) includes the end label.
    print(apts.loc["hangzhou":"wuhan"])
    # Expected output:
    #   hangzhou  30000.0
    #   shanghai  40000.0
    #   shenzhen  40000.0
    #   suzhou    NaN
    #   wuhan     20000.0
    #   dtype: float64

    # 4. NumPy-style boolean indexing works on a Series as well.
    less_than_30000 = apts < 30000
    print(less_than_30000, "\n", type(less_than_30000))
    # Expected output (NaN compares as False):
    #   beijing   False
    #   hangzhou  False
    #   shanghai  False
    #   shenzhen  False
    #   suzhou    False
    #   wuhan     True
    #   dtype: bool
    #   <class 'pandas.core.series.Series'>
    print(apts[less_than_30000], "\n", type(apts[less_than_30000]))
    # The two steps above can be combined into one expression:
    print(apts[apts < 30000], "\n", type(apts[apts < 30000]))
    # Expected output:
    #   wuhan  20000.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # 5. Assigning to Series elements.
    apts["wuhan"] = 25000
    print(apts["wuhan"], "\n", type(apts["wuhan"]))
    # Expected output:
    #   25000.0
    #   <class 'numpy.float64'>

    # Filter with a boolean mask, then assign one value to every match.
    apts[apts < 30000] = 15000
    print(apts, "\n", type(apts))
    # Expected output:
    #   beijing   50000.0
    #   hangzhou  30000.0
    #   shanghai  40000.0
    #   shenzhen  40000.0
    #   suzhou    NaN
    #   wuhan     15000.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # 6. Arithmetic on a Series: +, -, *, / and ** (e.g. apts ** 2) are all
    #    applied element-wise.
    apts = apts / 2
    print(apts, "\n", type(apts))
    # Expected output:
    #   beijing   25000.0
    #   hangzhou  15000.0
    #   shanghai  20000.0
    #   shenzhen  20000.0
    #   suzhou    NaN
    #   wuhan     7500.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # Build a second Series to demonstrate addition.
    cities = {
        "beijing": 200000,
        "shanghai": 200000,
        "shenzhen": 200000,
        "wuhan": 100000,
        "tianjin": 150000,
    }
    cars = pd.Series(cities)
    print(apts + cars, "\n", type(apts + cars))
    # Values are added only where a label exists on both sides; labels present
    # on one side only (or whose value is already NaN) yield NaN.
    # Expected output:
    #   beijing   225000.0
    #   hangzhou  NaN
    #   shanghai  220000.0
    #   shenzhen  220000.0
    #   suzhou    NaN
    #   tianjin   NaN
    #   wuhan     107500.0
    #   dtype: float64
    #   <class 'pandas.core.series.Series'>

    # 7. Missing data.
    # `in` tests whether a label exists in the index.
    print("wuhan" in apts)  # True
    print("wuhan" in cars)  # True
    print("tianjin" in apts)  # False
    # notnull() reports, per label, whether the value is present.
    print(apts.notnull(), "\n", type(apts))
    # Expected output:
    #   beijing   True
    #   hangzhou  True
    #   shanghai  True
    #   shenzhen  True
    #   suzhou    False
    #   wuhan     True
    #   dtype: bool
    #   <class 'pandas.core.series.Series'>

    # isnull() is the complementary check.
    print(apts.isnull())
    # Both results are boolean masks, so they can be used directly for
    # boolean indexing (no `== True` comparison is needed).
    print(apts[apts.isnull()])
    # Expected output:
    #   suzhou  NaN
    #   dtype: float64

    # And the inverse selection: keep only the non-missing values.
    print(apts[apts.notnull()])
    # Expected output:
    #   beijing   25000.0
    #   hangzhou  15000.0
    #   shanghai  20000.0
    #   shenzhen  20000.0
    #   wuhan     7500.0
    #   dtype: float64
# Run the Series walkthrough only when this file is executed as a script.
if __name__ == "__main__":
    main()