python汇总统计_【Python】pandas--汇总和计算描述统计、数据缺失处理

最新推荐文章于 2024-06-30 18:41:59 发布

weixin_39968640

最新推荐文章于 2024-06-30 18:41:59 发布

阅读量94

点赞数

文章标签： python汇总统计

这篇博客介绍了如何使用Pandas库进行数据操作，包括DataFrame的创建、加总、平均值计算、按行/列求和及求平均、唯一值统计、条件筛选等。同时，讲解了如何处理缺失值，如直接填充、使用特定值填充、前向填充等方法，展示了各种处理缺失数据的实用技巧。

摘要由CSDN通过智能技术生成

df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])

Out[155]:

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

df.sum()

Out[156]:

one 9.25

two -5.80

dtype: float64

df.sum(axis=1)

Out[157]:

a 1.40

b 2.60

c 0.00

d -0.55

dtype: float64

df.mean(axis=1)

Out[158]:

a 1.400

b 1.300

c NaN

d -0.275

dtype: float64

df.mean(axis=1,skipna=False)

Out[159]:

a NaN

b 1.300

c NaN

d -0.275

dtype: float64

df.idxmax()

Out[160]:

one b

two d

dtype: object

df.cumsum()

Out[161]:

    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8

df.describe()

Out[162]:

            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000

obj=Series(['a','a','b','c']*4)

obj.describe()

Out[165]:

count 16

unique 3

top a

freq 8

dtype: object

obj

Out[166]:

0 a

1 a

2 b

3 c

4 a

5 a

6 b

7 c

8 a

9 a

10 b

11 c

12 a

13 a

14 b

15 c

dtype: object

#相关系数和协方差

有待之后解决

#唯一值和统计值

obj=Series(['c','a','d','a','a','b','b','c','c'])

uniques=obj.unique()

uniques

Out[5]:

array(['c', 'a', 'd', 'b'], dtype=object)

a=obj.value_counts()

Out[8]:

c 3

a 3

b 2

d 1

dtype: int64

a.sort_index()

Out[11]:

a 3

b 2

c 3

d 1

dtype: int64

a.sort_index(ascending=False)

Out[12]:

d 1

c 3

b 2

a 3

dtype: int64

obj.value_counts()

Out[13]:

c 3

a 3

b 2

d 1

dtype: int64

mask=obj.isin(['b','c']) #判断

mask

Out[15]:

0 True

1 False

2 False

3 False

4 False

5 True

6 True

7 True

8 True

dtype: bool

obj[mask]

Out[16]:

0 c

5 b

6 b

7 c

8 c

dtype: object

data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})

data

Out[18]:

Qu1 Qu2 Qu3

0 1 2 1

1 3 3 5

2 4 1 2

3 3 2 4

4 4 3 4

result=data.apply(pd.value_counts).fillna(0)

result

Out[20]:

Qu1 Qu2 Qu3

1 1.0 1.0 1.0

2 0.0 2.0 1.0

3 2.0 2.0 0.0

4 2.0 0.0 2.0

5 0.0 0.0 1.0

#处理缺失数据

string_data=Series(['aardvark','artichoke',np.nan,'avocado'])

string_data

Out[22]:

0 aardvark

1 artichoke

2 NaN

3 avocado

dtype: object

string_data.isnull()

Out[23]:

0 False

1 False

2 True

3 False

dtype: bool

string_data[0]=None

string_data.isnull()

Out[25]:

0 True

1 False

2 True

3 False

dtype: bool

#滤除缺失数据

from numpy import nan as NA

data=Series([1,NA,3.5,NA,7])

data

Out[28]:

0 1.0

1 NaN

2 3.5

3 NaN

4 7.0

dtype: float64

data.dropna()

Out[29]:

0 1.0

2 3.5

4 7.0

dtype: float64

data[data.notnull()]

Out[30]:

0 1.0

2 3.5

4 7.0

dtype: float64

data=DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])

data

Out[32]:

0 1 2

0 1.0 6.5 3.0

1 1.0 NaN NaN

2 NaN NaN NaN

3 NaN 6.5 3.0

data.dropna()

Out[33]:

0 1 2

0 1.0 6.5 3.0

data.dropna(how='all') #丢弃全为NA的行

Out[34]:

0 1 2

0 1.0 6.5 3.0

1 1.0 NaN NaN

3 NaN 6.5 3.0

data.dropna(axis=1,how='all') #丢弃全为NA的列

Out[35]:

0 1 2

0 1.0 6.5 3.0

1 1.0 NaN NaN

2 NaN NaN NaN

3 NaN 6.5 3.0

df=DataFrame(np.random.randn(7,3))

Out[37]:

          0         1         2
0  0.329323 -0.020140 -1.418435
1 -0.801842 -0.327760  2.141541
2 -1.990021 -0.528645  1.976050
3  0.593861  0.877746  0.709352
4 -0.367444  0.363773 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

df.ix[:4,1]=NA;df.ix[:2,2]=NA #注：.ix已在pandas 1.0中移除，新版本请改用 df.loc[:4,1]=NA;df.loc[:2,2]=NA（标签切片，包含末端）

Out[39]:

          0         1         2
0  0.329323       NaN       NaN
1 -0.801842       NaN       NaN
2 -1.990021       NaN       NaN
3  0.593861       NaN  0.709352
4 -0.367444       NaN -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

df.dropna(thresh=3) #只保留至少含有3个非NA值的行

Out[40]:

          0         1         2
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

#填充缺失数据

df.fillna(0)

Out[41]:

          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

df.fillna({1:0.5,3:-1})

Out[42]:

          0         1         2
0  0.329323  0.500000       NaN
1 -0.801842  0.500000       NaN
2 -1.990021  0.500000       NaN
3  0.593861  0.500000  0.709352
4 -0.367444  0.500000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

_=df.fillna(0,inplace=True) #就地修改df本身；注意：在较新版本的pandas中，inplace=True时返回None，而不是被填充对象的引用

Out[45]:

          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

df.fillna(0,inplace=True)

Out[47]:

          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915

df=DataFrame(np.random.randn(6,3))

df.ix[2:,1]=NA;df.ix[4:,2]=NA #注：.ix已在pandas 1.0中移除，新版本请改用 df.loc[2:,1]=NA;df.loc[4:,2]=NA

Out[50]:

          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542       NaN -0.395398
3  1.231016       NaN -1.528668
4 -1.023311       NaN       NaN
5  1.817814       NaN       NaN

df.fillna(method='ffill')

Out[51]:

          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542 -0.321865 -0.395398
3  1.231016 -0.321865 -1.528668
4 -1.023311 -0.321865 -1.528668
5  1.817814 -0.321865 -1.528668

df.fillna(method='ffill',limit=2)

Out[52]:

          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542 -0.321865 -0.395398
3  1.231016 -0.321865 -1.528668
4 -1.023311       NaN -1.528668
5  1.817814       NaN -1.528668

data=Series([1,NA,3.5,NA,7])

data.fillna(data.mean())

Out[54]:

0 1.000000

1 3.833333

2 3.500000

3 3.833333

4 7.000000

dtype: float64

weixin_39968640

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫