pandas基础:Series、DataFrame的生成、属性和方法

一、Series生成、属性、方法

pandas用于处理表格型或异质型数据,numpy用于处理同质型数值数据。pandas有两个核心数据结构:Series和DataFrame。

  • Series是一种一维的数组型对象,它包含了一个值序列(values)和数据标签(index)。
  • DataFrame表示的是矩阵数据表。每一列可以是不同的数据类型。既有行索引(index),也有列索引(columns)。
import pandas as pd
import numpy as np

(一)Series生成

# 1. Build a Series from a plain list: first with the default integer index,
#    then with explicit string labels.
obj = pd.Series(data=[4, 7, -5, 3])
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('abcd'))

# 2. Build a Series from a dict: the keys become the index labels.
obj_dict = dict(Leo=None, Tiger=78, Coco=91, Ada=86)
obj3 = pd.Series(data=obj_dict)
obj3
# Result (the None entry shows up as NaN):
# Leo       NaN
# Tiger    78.0
# Coco     91.0
# Ada      86.0
# dtype: float64

(二)Series属性

# Series attributes: index, values, name
obj_dict = dict(Leo=None, Tiger=78, Coco=91, Ada=86)
obj3 = pd.Series(data=obj_dict)

print(obj3.index)
# Index(['Leo', 'Tiger', 'Coco', 'Ada'], dtype='object')

print(obj3.values)
# [nan 78. 91. 86.]

# Name the index and the Series itself; both names appear in the printed output.
obj3.index.name = 'name'
obj3.name = 'score'
print(obj3)
# name
# Leo       NaN
# Tiger    78.0
# Coco     91.0
# Ada      86.0
# Name: score, dtype: float64

(三)Series方法

# 1. isnull / notnull: flag missing values (None and np.nan are both flagged)
obj_dict = dict(Leo=None, Tiger=78, Coco=91, Ada=np.nan)
obj = pd.Series(data=obj_dict)
print(obj.isnull(), '\n')
# print(obj.notnull(),'\n')

# Leo       True
# Tiger    False
# Coco     False
# Ada       True
# dtype: bool

# 2. fillna: fill the missing entries, here with the mean of the present values
obj.fillna(value=obj.mean())
# Result:
# Leo      84.5
# Tiger    78.0
# Coco     91.0
# Ada      84.5
# dtype: float64

# 3. dropna: drop the missing entries
obj.dropna(how='any')

# Tiger    78.0
# Coco     91.0
# dtype: float64

# 4. Removing duplicates
# Series.drop_duplicates() returns a Series; pd.unique returns an array.
obj = pd.Series(data=[1, 1, 2, 3, 5])
obj.duplicated()
obj.drop_duplicates()
pd.unique(obj)

## 5. map: translate every element through a lookup table
obj = pd.Series(data=[
    'bacon', 'pulled pork', 'bacon', 'pastrami', 'corned beef',
    'Bacon', 'pastrami', 'honey ham', 'nova lox',
])
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

# Lower-case first so that 'Bacon' matches the 'bacon' key.
lowercased = obj.str.lower()
lowercased.map(meat_to_animal)

#
# 0       pig
# 1       pig
# 2       pig
# 3       cow
# 4       cow
# 5       pig
# 6       cow
# 7       pig
# 8    salmon
# dtype: object

## 6. replace: the same translation expressed as a value replacement
obj = pd.Series(data=[
    'bacon', 'pulled pork', 'bacon', 'pastrami', 'corned beef',
    'Bacon', 'pastrami', 'honey ham', 'nova lox',
])
lowercased = obj.str.lower()
animal_by_meat = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])
lowercased.replace(animal_by_meat)
#
# 0       pig
# 1       pig
# 2       pig
# 3       cow
# 4       cow
# 5       pig
# 6       cow
# 7       pig
# 8    salmon
# dtype: object

## 7. rename: relabel index entries
obj_dict = dict(Leo=None, Tiger=78, Coco=91, Ada=86)
obj = pd.Series(data=obj_dict)
obj.rename(index={'Tiger': 'tiger'})


## 8. Sorting: sort_index, sort_values, rank
obj = pd.Series(data=range(4), index=list('dabc'))
obj.sort_index()
obj.sort_values()

## rank breaks ties by giving tied values their average rank
obj1 = pd.Series(data=[2, -1, 3, 4, 4, 5, 21, 14])
obj1.rank()

# 0    2.0
# 1    1.0
# 2    3.0
# 3    4.5
# 4    4.5
# 5    6.0
# 6    8.0
# 7    7.0
# dtype: float64

二、DataFrame生成、属性、方法

(一)DataFrame生成

# 1. Build a DataFrame from a 2-D NumPy array
arr = np.arange(10).reshape(5, 2)
df1 = pd.DataFrame(arr, columns=['col_1', 'col_2'])

# 2. Build a DataFrame from a dict of equal-length lists (one key per column)
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2001, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df2 = pd.DataFrame(data)

# `display` only exists inside IPython/Jupyter and is never imported in this
# file, so print the frames instead to keep the snippet runnable anywhere.
print(df1, df2, sep='\n\n')

(二)DataFrame属性

# DataFrame attributes: index, columns, values
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2001, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df2 = pd.DataFrame(data=data)

print(df2.index)
# RangeIndex(start=0, stop=6, step=1)

print(df2.columns)
# Index(['state', 'year', 'pop'], dtype='object')

print(df2.values)
# [['Ohio' 2000 1.5]
#  ['Ohio' 2001 1.7]
#  ['Ohio' 2001 3.6]
#  ['Nevada' 2001 2.4]
#  ['Nevada' 2002 2.9]
#  ['Nevada' 2003 3.2]]

# Name the row index and the column index; both labels show up when printing.
df2.index.name = '序号'
df2.columns.name = '人口统计'
print(df2)
# 人口统计   state  year  pop
# 序号
# 0       Ohio  2000  1.5
# 1       Ohio  2001  1.7
# 2       Ohio  2001  3.6
# 3     Nevada  2001  2.4
# 4     Nevada  2002  2.9
# 5     Nevada  2003  3.2

(三)DataFrame方法

## 1. Peeking at the data: head, tail, loc/iloc, sample, take
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2001, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df2 = pd.DataFrame(data)
df2.head()
df2.tail()
df2.loc[:, 'pop']      # a single column, selected by label
df2.iloc[:, 1]         # a single column, selected by position
df2.sample(n=3)        # three randomly chosen rows
# Shuffle every row position and keep three of them so that take() can return
# any row; permutation(3) would only ever reorder the first three rows.
sample_idx = np.random.permutation(len(df2))[:3]
df2.take(sample_idx)


## 2. drop: remove rows / columns
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# inplace=False leaves `data` untouched and returns a new object.
data.drop(index=['New York'], inplace=False)
# inplace=True modifies `data` itself and returns nothing.
data.drop(index=['Ohio'], inplace=True)
# Dropping a column instead of a row.
data.drop(columns=['one'], inplace=True)


## 3. Applying functions: applymap (element-wise) and apply (row/column-wise)
data = pd.DataFrame(np.random.randn(3, 4), columns=list('abcd'))
np.abs(data)  # NumPy ufuncs can be applied to a DataFrame directly


### 1) Apply a function to every element: keep two decimal places
def two_decimals(x):
    """Return *x* formatted as a string with two decimal places.

    Deliberately not named ``format`` so the builtin is not shadowed.
    """
    return '%.2f' % x


# DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and the old name
# is deprecated, so use whichever element-wise method this pandas provides.
elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
# `display` only exists inside IPython/Jupyter; print works everywhere.
print(elementwise(two_decimals))
print(data['a'].map(two_decimals))


### 2) Apply a function to each column (or row): the result per slice may be
###    a scalar or an array-like
def f(x):
    """Return the gap between the maximum and the mean of *x*."""
    return x.max() - x.mean()


data.apply(f, axis=0)


def f1(x):
    """Return the minimum and maximum of *x* as a Series labelled 'min'/'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


data.apply(f1)

三、pandas函数

(一)分箱操作

## 1. Binning with pandas: cut
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
cats
# Returns a special Categorical object:
# [Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
# Length: 12
# Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

cats.codes
cats.categories
# The top-level pd.value_counts is deprecated (pandas 2.1); wrapping the
# Categorical in a Series gives the same count-sorted result.
pd.Series(cats).value_counts()
# Youth         5
# MiddleAged    3
# YoungAdult    3
# Senior        1
# dtype: int64
# (the relative order of the two tied groups may differ between pandas versions)

# 2. qcut: bin by sample quantiles
data = np.random.randn(1000)
cats = pd.qcut(data, 4)                            # four equal-sized bins
cats.value_counts()

cats_1 = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])    # custom quantile edges
cats_1.value_counts()

# Sample output (the bin edges depend on the random draw):
# (-3.697, -1.326]      100
# (-1.326, -0.00874]    400
# (-0.00874, 1.273]     400
# (1.273, 3.275]        100
# dtype: int64

(二)虚拟变量

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
# Ask for integer indicators explicitly: since pandas 2.0 get_dummies returns
# booleans by default, which would print True/False instead of the 0/1 below.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
# Keep only the numeric column and attach one indicator column per key value.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

# Result:
#    data1  key_a  key_b  key_c
# 0      0      0      1      0
# 1      1      0      1      0
# 2      2      1      0      0
# 3      3      0      0      1
# 4      4      1      0      0
# 5      5      0      1      0

(三)生成日期序列

# 1. date_range: generate a fixed-frequency sequence of dates
# NOTE(review): newer pandas releases (2.2+) appear to deprecate the 'M', 'Q',
# 'A' and 'AS' aliases used below in favour of 'ME', 'QE', 'YE' and 'YS' —
# confirm against the pandas version in use before changing them.
pd.date_range('2022-01-01','2022-01-15')                 # daily by default (start and end given)
pd.date_range('2022-01-01',periods=5)                    # daily by default (start + number of periods)
pd.date_range(end='2022-01-05',periods=5)                # daily by default (end + number of periods)

pd.date_range('2022-01-01','2022-04-01',freq='W-SUN')    # weekly, anchored on Sunday
pd.date_range('2022-01-01','2022-04-01',freq='W-MON')    # weekly, anchored on Monday

pd.date_range('2022-01-01','2022-04-01',freq='M')        # monthly, month END (not month start)
pd.date_range('2022-01-01','2022-04-01',freq='MS')       # monthly, month start

pd.date_range('2022-01-02','2022-10-01',freq='Q')        # quarterly, quarter end
pd.date_range('2022-01-02','2022-10-01',freq='QS')       # quarterly, quarter start

pd.date_range('2022-01','2025-10',freq='A')              # yearly, year end
pd.date_range('2022-01','2025-10',freq='AS-JAN')         # yearly, year start (January)

# 2. infer_freq: infer the frequency of a date sequence
# The dates below are spaced exactly 7 days apart.
index_freq = ['2022-01-02', '2022-01-09', '2022-01-16', '2022-01-23',
               '2022-01-30', '2022-02-06', '2022-02-13', '2022-02-20',
               '2022-02-27', '2022-03-06', '2022-03-13', '2022-03-20',
               '2022-03-27']
pd.infer_freq(index_freq)

# 3. interval_range: generate a fixed-frequency sequence of intervals
#    (numeric bounds in the first three calls, timestamps in the last one)
pd.interval_range(start=0, end=6, periods=4)
pd.interval_range(start=0, end=6, freq=2)
pd.interval_range(start=0, freq=2, periods=4)
pd.interval_range(start=pd.Timestamp('2017-01-01'),periods=3, freq='MS')

参考文档

[1]利用Python进行数据分析

  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值