pandas数据处理

最新推荐文章于 2024-06-10 15:53:13 发布

Mr_yuekitty

最新推荐文章于 2024-06-10 15:53:13 发布

阅读量197

点赞数

分类专栏： Others 文章标签： python

本文链接：https://blog.csdn.net/mr_yuekitty/article/details/105835461

版权

Others 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

import sys, time

class ShowProcess():
    """Render a single-line text progress bar on standard output.

    Call :meth:`show_process` once per completed step (or pass the absolute
    step number). Each call rewrites the same terminal line. When the last
    step is reached the bar is terminated with a newline, the completion
    message is printed, and the counter is reset to 0.
    """
    i = 0  # current step count
    max_steps = 0  # total number of steps to process
    max_arrow = 50  # width of the bar, in characters
    infoDone = 'done'  # message printed once the bar completes

    def __init__(self, max_steps, infoDone = 'Done'):
        """Create a progress bar.

        Args:
            max_steps: Total number of steps the bar represents. A value
                <= 0 is treated as "nothing to do": the first call to
                show_process() draws a full bar and finishes.
            infoDone: Message printed by close() when the bar completes.
        """
        self.max_steps = max_steps
        self.i = 0
        self.infoDone = infoDone

    def show_process(self, i=None):
        """Draw the bar for the current progress.

        The output looks like ``[#####-----]50.00%`` followed by a carriage
        return, so the next call overwrites the same line.

        Args:
            i: Absolute step number to display. When None, the internal
                counter is advanced by one instead.
        """
        if i is not None:
            self.i = i
        else:
            self.i += 1

        if self.max_steps > 0:
            num_arrow = int(self.i * self.max_arrow / self.max_steps)
            percent = self.i * 100.0 / self.max_steps
            # Clamp so an out-of-range step cannot draw a bar wider than
            # max_arrow or report a percentage outside 0..100.
            num_arrow = min(max(num_arrow, 0), self.max_arrow)
            percent = min(max(percent, 0.0), 100.0)
        else:
            # Nothing to process: avoid dividing by zero and show a
            # completed bar.
            num_arrow = self.max_arrow
            percent = 100.0
        num_line = self.max_arrow - num_arrow  # number of '-' fillers

        # '\r' returns the cursor to the start of the line without a newline.
        process_bar = ('[' + '#' * num_arrow + '-' * num_line + ']'
                       + '%.2f' % percent + '%' + '\r')
        sys.stdout.write(process_bar)
        sys.stdout.flush()
        if self.max_steps <= 0 or self.i >= self.max_steps:
            self.close()

    def close(self):
        """Finish the bar: end the line, print infoDone, reset the counter."""
        print('')
        print(self.infoDone)
        self.i = 0

if __name__ == '__main__':
    # Demo: drive the bar through four steps, pausing briefly between them
    # so the redraws are visible.
    max_steps = 4
    process_bar = ShowProcess(max_steps=max_steps, infoDone='OK')

    for _ in range(max_steps):
        process_bar.show_process()
        time.sleep(0.1)

[##################################################]100.00%
OK

import pandas as pd
# Load the Chipotle orders file; the first row of the file supplies the
# column names (order_id, quantity, ...), as the head() output shows.
orders=pd.read_table('./data/chipotle.tsv')

orders.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98


# header=None: no row is treated as a header, so the columns get the
# integer labels 0..4 and the file's own header line becomes data row 0.
users=pd.read_table('./data/chipotle.tsv',header=None)

users.head()

	0	1	2	3	4
0	order_id	quantity	item_name	choice_description	item_price
1	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
2	1	1	Izze	[Clementine]	$3.39
3	1	1	Nantucket Nectar	[Apple]	$3.39
4	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39

users.shape

(4623, 5)

type(users)

pandas.core.frame.DataFrame

users.describe(include=['object'])

	0	1	2	3	4
count	4623	4623	4623	3377	4623
unique	1835	10	51	1044	79
top	926	1	Chicken Bowl	[Diet Coke]	$8.75
freq	23	4355	726	134	730

users.describe()

	0	1	2	3	4
count	4623	4623	4623	3377	4623
unique	1835	10	51	1044	79
top	926	1	Chicken Bowl	[Diet Coke]	$8.75
freq	23	4355	726	134	730

users.columns

Int64Index([0, 1, 2, 3, 4], dtype='int64')

# NOTE: the column labels here are the integers 0..4, but the mapping keys
# below are the strings '0'..'4'. No key matches a label, so rename()
# changes nothing and raises no error -- users.columns is unchanged
# afterwards. Integer keys (0, 1, ...) would be needed for this to work.
users.rename(columns = {'0':'order_id','1':'quantity','2':'item_name','3':'choice_description','4':'item_price'},inplace=
            True)

users.columns

Int64Index([0, 1, 2, 3, 4], dtype='int64')

# Desired column names, in the same order as the existing columns.
users_cols=['order_id','quantity','item_name','choice_description','item_price']

# Assigning a list to .columns replaces all column labels positionally.
users.columns=users_cols

users.head()

	order_id	quantity	item_name	choice_description	item_price
0	order_id	quantity	item_name	choice_description	item_price
1	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
2	1	1	Izze	[Clementine]	$3.39
3	1	1	Nantucket Nectar	[Apple]	$3.39
4	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39

# names=users_cols sets the column labels; header=0 marks the file's first
# line as the header being replaced, so it does not show up as a data row.
use=pd.read_table('./data/chipotle.tsv',names=users_cols,header=0)

use.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98

use.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

# Replace spaces in column names with underscores. These names contain no
# spaces, so the labels come out unchanged here.
use.columns=use.columns.str.replace(' ','_')

use.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

# item_price holds strings such as '$2.39' (dtype object), so this sorts
# lexicographically, not numerically: '$9.39' ends up last even though
# larger prices such as '$16.98' exist in the data.
use.item_price.sort_values()

261     $1.09 
1805    $1.09 
1030    $1.09 
3020    $1.09 
3021    $1.09 
         ...  
4547    $9.39 
4391    $9.39 
2600    $9.39 
4241    $9.39 
4390    $9.39 
Name: item_price, Length: 4622, dtype: object

# Sort whole rows by item_price, descending. The column holds strings, so
# the order is lexicographic ('$9.39' sorts above '$16.98').
use.sort_values('item_price',ascending=False)

	order_id	quantity	item_name	choice_description	item_price
2624	1042	1	Steak Salad Bowl	[Fresh Tomato Salsa, [Black Beans, Sour Cream,...	$9.39
4419	1762	1	Steak Salad Bowl	[Roasted Chili Corn Salsa, [Fajita Vegetables,...	$9.39
4036	1615	1	Steak Salad Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Chees...	$9.39
1825	738	1	Barbacoa Salad Bowl	[Fresh Tomato Salsa, [Rice, Pinto Beans, Chees...	$9.39
3115	1243	1	Carnitas Salad Bowl	[Tomatillo Green Chili Salsa, [Rice, Pinto Bea...	$9.39
...	...	...	...	...	...
3145	1254	1	Canned Soda	[Diet Dr. Pepper]	$1.09
414	180	1	Canned Soda	[Dr. Pepper]	$1.09
3162	1262	1	Canned Soda	[Coca Cola]	$1.09
821	338	1	Canned Soda	[Coca Cola]	$1.09
1457	591	1	Canned Soda	[Sprite]	$1.09

4622 rows × 5 columns

use.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98

import pandas as pd 
import numpy as np

# A Series with explicit string labels as its index.
s2 = pd.Series(data=[4.0, 6.5, -0.5, 4.2], index=list('dbac'))
print(s2)

d    4.0
b    6.5
a   -0.5
c    4.2
dtype: float64

s2[['a','b','c']]

a   -0.5
b    6.5
c    4.2
dtype: float64

# Series: behaves like a fixed-length, ordered dict
dic1 = dict(apple=5, pen=3, applepen=10)
s3 = pd.Series(dic1)
print(s3)

apple        5
pen          3
applepen    10
dtype: int64

# DataFrame: built from a dict of equal-length columns plus row labels
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [10000, 30000, 50000, 60000],
    'pay': [5000, 20000, 30000, 30000],
}
df1 = pd.DataFrame(data, index=list('abcd'))
df1

	year	income	pay
a	2014	10000	5000
b	2015	30000	20000
c	2016	50000	30000
d	2017	60000	30000

# DataFrame from a 3x4 array; rows and columns get default integer labels.
df2 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
df2

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

# Same 3x4 data, this time with explicit (unsorted) row and column labels.
df3 = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=list('acb'),
    columns=[2, 33, 44, 5],
)
df3

	2	33	44	5
a	0	1	2	3
c	4	5	6	7
b	8	9	10	11

df1.columns

Index(['year', 'income', 'pay'], dtype='object')

df1.values

array([[ 2014, 10000,  5000],
       [ 2015, 30000, 20000],
       [ 2016, 50000, 30000],
       [ 2017, 60000, 30000]], dtype=int64)

df1.describe()

	year	income	pay
count	4.000000	4.000000	4.000000
mean	2015.500000	37500.000000	21250.000000
std	1.290994	22173.557826	11814.539066
min	2014.000000	10000.000000	5000.000000
25%	2014.750000	25000.000000	16250.000000
50%	2015.500000	40000.000000	25000.000000
75%	2016.250000	52500.000000	30000.000000
max	2017.000000	60000.000000	30000.000000

df1.T

	a	b	c	d
year	2014	2015	2016	2017
income	10000	30000	50000	60000
pay	5000	20000	30000	30000

df3.sort_index(axis=1)

	2	5	33	44
a	0	3	1	2
c	4	7	5	6
b	8	11	9	10

df3.sort_values(by=44)# sort the rows by the values in one column

	2	33	44	5
a	0	1	2	3
c	4	5	6	7
b	8	9	10	11

# Six consecutive dates starting 2019-01-01, used as the row index.
dates = pd.date_range(start='20190101', periods=6)
df1 = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('abcd'),
)
df1

	a	b	c	d
2019-01-01	0	1	2	3
2019-01-02	4	5	6	7
2019-01-03	8	9	10	11
2019-01-04	12	13	14	15
2019-01-05	16	17	18	19
2019-01-06	20	21	22	23

df1.a

2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: a, dtype: int32

# Select data by label: one row label and a list of column labels
df1.loc['20190101',['a','c']]

a    0
c    2
Name: 2019-01-01 00:00:00, dtype: int32

df1.loc[:,['a','c']]

	a	c
2019-01-01	0	2
2019-01-02	4	6
2019-01-03	8	10
2019-01-04	12	14
2019-01-05	16	18
2019-01-06	20	22

# Select data by position: the row at integer position 2
df1.iloc[2]

a     8
b     9
c    10
d    11
Name: 2019-01-03 00:00:00, dtype: int32

df1.iloc[[1,2,4],[1,3]]

	b	d
2019-01-02	5	7
2019-01-03	9	11
2019-01-05	17	19

df1.a > 6

2019-01-01    False
2019-01-02    False
2019-01-03     True
2019-01-04     True
2019-01-05     True
2019-01-06     True
Freq: D, Name: a, dtype: bool

df1[df1.a > 6]

	a	b	c	d
2019-01-03	8	9	10	11
2019-01-04	12	13	14	15
2019-01-05	16	17	18	19
2019-01-06	20	21	22	23

# Handling missing values
dates = np.arange(20190101, 20190105)
df1 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=dates,
    columns=list('abc'),
)
# Columns 'd' and 'e' do not exist in df1, so they come out as NaN.
df2 = pd.DataFrame(df1, index=dates, columns=list('abcde'))
df2

	a	b	c	d	e
20190101	0	1	2	NaN	NaN
20190102	3	4	5	NaN	NaN
20190103	6	7	8	NaN	NaN
20190104	9	10	11	NaN	NaN

# Each Series covers only part of the index; assigning it as a column
# aligns on the index labels, and rows without a matching label stay NaN.
s1 = pd.Series(data=[3, 4, 6], index=dates[:3])
s2 = pd.Series(data=[32, 5, 2], index=dates[1:])
df2['d'] = s1
df2['e'] = s2
df2

	a	b	c	d	e
20190101	0	1	2	3.0	NaN
20190102	3	4	5	4.0	32.0
20190103	6	7	8	6.0	5.0
20190104	9	10	11	NaN	2.0

# Drop the rows that contain missing values
df2.dropna(axis=0,how='any')# how is 'any' or 'all'; 'any' drops a row when one or more of its values are missing

	a	b	c	d	e
20190102	3	4	5	4.0	32.0
20190103	6	7	8	6.0	5.0

# Replace missing values with 0
df2.fillna(value=0)

	a	b	c	d	e
20190101	0	1	2	3.0	0.0
20190102	3	4	5	4.0	32.0
20190103	6	7	8	6.0	5.0
20190104	9	10	11	0.0	2.0

df2.isnull()

	a	b	c	d	e
20190101	False	False	False	False	True
20190102	False	False	False	False	False
20190103	False	False	False	False	False
20190104	False	False	False	True	False

Mr_yuekitty

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
pandas数据处理

pandas数据处理的一些方式
复制链接

扫一扫

专栏目录