1. 导包
import numpy as np
import pandas as pd
2. Series的创建
# A Series is a one-dimensional array of indexed data; with no explicit index
# pandas labels the entries 0, 1, 2, ...
data = pd.Series(data=["skey", "syl", "earth"])
print(data)
0 skey
1 syl
2 earth
dtype: object
data.values # get the underlying values as a NumPy array
array(['skey', 'syl', 'earth'], dtype=object)
data.index # the index labels; a default RangeIndex when none was supplied
RangeIndex(start=0, stop=3, step=1)
data[:2] # slice a Series; overall, a Series behaves much like a one-dimensional array
0 skey
1 syl
dtype: object
# Pass `index` to label the entries explicitly instead of using 0..n-1.
info = pd.Series(data=["job", "15", "student"], index=["name", "age", "work"])
print(info)
name job
age 15
work student
dtype: object
info["age"] # look up a value by its index label
'15'
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
dic = {"name": "zha", "age": "15"}
dic2ser = pd.Series(dic)
print(dic2ser, type(dic2ser))
name zha
age 15
dtype: object
# Integer dict keys are used as index labels as well; the printed Series
# keeps the dict's insertion order (1, 3, 2).
print(pd.Series({1: 1, 3: 13, 2: 14}))
1 1
3 13
2 14
dtype: int64
3. DataFrame的创建与读取
# 2x5 frame holding 0..9, with string row labels and integer column labels.
df = pd.DataFrame(
    np.arange(10).reshape(2, 5),
    index=["one", "two"],
    columns=[1, 2, 3, 4, 5],
)
print(df)
1 2 3 4 5
one 0 1 2 3 4
two 5 6 7 8 9
# Show the row labels, then the column labels, separated by a newline.
print(df.index,"\n",df.columns)
Index(['one', 'two'], dtype='object')
Int64Index([1, 2, 3, 4, 5], dtype='int64')
print(df[1]) # select the column labelled 1
one 0
two 5
Name: 1, dtype: int32
# .ix is deprecated (and removed in pandas 1.0). Use
# .loc for label-based indexing or
# .iloc for positional indexing.
print(df.loc["one"]) # select a row, i.e. one record, by its label
print(df.iloc[0]) # select the same row by its integer position
1 0
2 1
3 2
4 3
5 4
Name: one, dtype: int32
1 0
2 1
3 2
4 3
5 4
Name: one, dtype: int32
4. Index as an ordered set: joins across datasets
indA = pd.Index([1, 2, 3, 4, 5, 6])
indB = pd.Index([1, 3, 5, 7])
# Intersection of the two indexes (analogous to an inner join in a database).
# The explicit method is used because `&` on Index objects was deprecated as a
# set operation and is an element-wise logical AND in pandas >= 2.0, which
# fails for indexes of different lengths.
indA.intersection(indB)
Int64Index([1, 3, 5], dtype='int64')
# Union of the two indexes (analogous to a full outer join in a database).
# `|` on Index objects is no longer a set operation in pandas >= 2.0, so the
# explicit method is used.
indA.union(indB)
Int64Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')
# Symmetric difference: the labels that appear in exactly one of the indexes.
# `^` on Index objects is no longer a set operation in pandas >= 2.0, so the
# explicit method is used.
indA.symmetric_difference(indB)
Int64Index([2, 4, 6, 7], dtype='int64')
Se = pd.Series(range(5), index=["one", "two", "three", "four", "five"])
print(Se)
# Positional slice: the end position is excluded, so this prints rows 0 and 1.
print(Se.iloc[0:2])
one 0
two 1
three 2
four 3
five 4
dtype: int64
one 0
two 1
dtype: int64
# Label-based slice: unlike a positional slice, the end label is included.
print(Se.loc["one":"three"])
one 0
two 1
three 2
dtype: int64
5. DataFrame数据读取
# Load the sales workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it to where
# the workbook lives before running.
df = pd.read_excel(
    "G:\\python3\\Python_notebook\\第3关:数据分析的基本过程\\朝阳医院2018年销售数据.xlsx"
)
print(df.head())  # head() defaults to the first five rows
购药时间 社保卡号 商品编码 商品名称 销售数量 应收金额 实收金额
0 2018-01-01 星期五 1.616528e+06 236701.0 强力VC银翘片 6.0 82.8 69.00
1 2018-01-02 星期六 1.616528e+06 236701.0 清热解毒口服液 1.0 28.0 24.64
2 2018-01-06 星期三 1.260283e+07 236701.0 感康 2.0 16.8 15.00
3 2018-01-11 星期一 1.007034e+10 236701.0 三九感冒灵 1.0 28.0 28.00
4 2018-01-15 星期五 1.015543e+08 236701.0 三九感冒灵 8.0 224.0 208.00
# List the column labels of the loaded sheet.
print(df.columns)
Index(['购药时间', '社保卡号', '商品编码', '商品名称', '销售数量', '应收金额', '实收金额'], dtype=