pandas数据处理

import sys, time

class ShowProcess():
    """Render a single-line text progress bar on standard output.

    Create an instance with the total number of steps, then call
    ``show_process()`` once per completed step (or pass an explicit step
    number). The bar is redrawn in place and looks like
    ``[#########-----------------]35.00%``. When the last step is reached
    the completion message is printed and the counter is reset.
    """
    i = 0  # current progress, in completed steps
    max_steps = 0  # total number of steps to process
    max_arrow = 50  # width of the bar, in characters
    infoDone = 'done'  # message printed when the bar completes

    def __init__(self, max_steps, infoDone = 'Done'):
        """Store the total step count and the completion message.

        Args:
            max_steps: Total number of steps the task consists of.
            infoDone: Text printed by ``close()`` once the task is finished.
        """
        self.max_steps = max_steps
        self.i = 0
        self.infoDone = infoDone

    def show_process(self, i=None):
        """Advance the progress and redraw the bar.

        Args:
            i: Explicit current step. When omitted, the internal counter is
                incremented by one.
        """
        if i is not None:
            self.i = i
        else:
            self.i += 1
        if self.max_steps > 0:
            # Clamp the value used for drawing so the bar never grows beyond
            # max_arrow characters or reports a percentage outside 0-100.
            done = min(max(self.i, 0), self.max_steps)
            num_arrow = int(done * self.max_arrow / self.max_steps)  # number of '#' cells
            percent = done * 100.0 / self.max_steps  # completion, shown as xx.xx%
        else:
            # Nothing to process: show a full bar instead of dividing by zero.
            num_arrow = self.max_arrow
            percent = 100.0
        num_line = self.max_arrow - num_arrow  # number of '-' cells
        # The trailing '\r' returns the cursor to column 0 without a newline,
        # so the next call overwrites this bar.
        process_bar = '[' + '#' * num_arrow + '-' * num_line + ']'\
                      + '%.2f' % percent + '%' + '\r'
        sys.stdout.write(process_bar)
        sys.stdout.flush()  # make the bar visible immediately
        if self.i >= self.max_steps:
            self.close()

    def close(self):
        """Finish the bar: move to a new line, print the message, reset the counter."""
        print('')
        print(self.infoDone)
        self.i = 0

if __name__=='__main__':
    # Small demo: drive the bar through a handful of steps, pausing briefly
    # between updates so the redraw is visible.
    total = 4
    bar = ShowProcess(total, 'OK')
    for _ in range(total):
        bar.show_process()
        time.sleep(0.1)

[##################################################]100.00%
OK
import pandas as pd
orders=pd.read_table('./data/chipotle.tsv')
orders.head()
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98

# header=None tells pandas the file has no header row, so the real header
# line is loaded as data row 0 and the columns get integer labels 0..4
# (see the output below).
users=pd.read_table('./data/chipotle.tsv',header=None)
users.head()
01234
0order_idquantityitem_namechoice_descriptionitem_price
111Chips and Fresh Tomato SalsaNaN$2.39
211Izze[Clementine]$3.39
311Nantucket Nectar[Apple]$3.39
411Chips and Tomatillo-Green Chili SalsaNaN$2.39
users.shape
(4623, 5)
type(users)
pandas.core.frame.DataFrame
users.describe(include=['object'])
01234
count46234623462333774623
unique18351051104479
top9261Chicken Bowl[Diet Coke]$8.75
freq234355726134730
users.describe()
01234
count46234623462333774623
unique18351051104479
top9261Chicken Bowl[Diet Coke]$8.75
freq234355726134730
users.columns
Int64Index([0, 1, 2, 3, 4], dtype='int64')
# NOTE: the column labels are the integers 0..4 (Int64Index above), but the
# mapping keys here are the strings '0'..'4'. None of them match, so this
# rename changes nothing -- the next `users.columns` output is unchanged.
# Integer keys ({0: 'order_id', ...}) would be needed for it to take effect.
users.rename(columns = {'0':'order_id','1':'quantity','2':'item_name','3':'choice_description','4':'item_price'},inplace=
            True)
users.columns
Int64Index([0, 1, 2, 3, 4], dtype='int64')
users_cols=['order_id','quantity','item_name','choice_description','item_price']
users.columns=users_cols
users.head()
order_idquantityitem_namechoice_descriptionitem_price
0order_idquantityitem_namechoice_descriptionitem_price
111Chips and Fresh Tomato SalsaNaN$2.39
211Izze[Clementine]$3.39
311Nantucket Nectar[Apple]$3.39
411Chips and Tomatillo-Green Chili SalsaNaN$2.39
# header=0 marks the first line of the file as the header row, and names=
# supplies the column labels to use in its place, so the header line is not
# loaded as data (compare with the `users` frame read with header=None).
use=pd.read_table('./data/chipotle.tsv',names=users_cols,header=0)
use.head()
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
use.columns
Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')
# Replace spaces in column names with underscores. The names here contain
# no spaces, so the columns are the same before and after.
use.columns=use.columns.str.replace(' ','_')
use.columns
Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')
# NOTE: item_price has dtype object (strings such as '$2.39'), so the values
# are ordered as text, not as numbers: '$9.39' ends up last even though the
# data contains '$16.98'. Convert the column to a numeric type first for a
# numeric ordering.
use.item_price.sort_values()
261     $1.09 
1805    $1.09 
1030    $1.09 
3020    $1.09 
3021    $1.09 
         ...  
4547    $9.39 
4391    $9.39 
2600    $9.39 
4241    $9.39 
4390    $9.39 
Name: item_price, Length: 4622, dtype: object
# Sort the whole frame by item_price, largest first. The column holds
# strings, so "largest" is by text comparison ('$9.39' ranks above '$16.98').
use.sort_values('item_price',ascending=False)
order_idquantityitem_namechoice_descriptionitem_price
262410421Steak Salad Bowl[Fresh Tomato Salsa, [Black Beans, Sour Cream,...$9.39
441917621Steak Salad Bowl[Roasted Chili Corn Salsa, [Fajita Vegetables,...$9.39
403616151Steak Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Chees...$9.39
18257381Barbacoa Salad Bowl[Fresh Tomato Salsa, [Rice, Pinto Beans, Chees...$9.39
311512431Carnitas Salad Bowl[Tomatillo Green Chili Salsa, [Rice, Pinto Bea...$9.39
..................
314512541Canned Soda[Diet Dr. Pepper]$1.09
4141801Canned Soda[Dr. Pepper]$1.09
316212621Canned Soda[Coca Cola]$1.09
8213381Canned Soda[Coca Cola]$1.09
14575911Canned Soda[Sprite]$1.09

4622 rows × 5 columns

use.head()
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
import pandas as pd 
import numpy as np
s2=pd.Series([4.0,6.5,-0.5,4.2],index=['d','b','a','c'])
print(s2)
d    4.0
b    6.5
a   -0.5
c    4.2
dtype: float64
s2[['a','b','c']]
a   -0.5
b    6.5
c    4.2
dtype: float64
# Series: behaves like a fixed-length, ordered dict.
# Building one from a dict uses the keys as the index and the values as data.
dic1={'apple':5,'pen':3,'applepen':10}
s3=pd.Series(dic1)
print(s3)
apple        5
pen          3
applepen    10
dtype: int64
# DataFrame: built from a dict of equal-length lists; each key becomes a
# column and `index` supplies the row labels.
data={'year':[2014,2015,2016,2017],
     'income':[10000,30000,50000,60000],
     'pay':[5000,20000,30000,30000]
     }
df1=pd.DataFrame(data,index=['a','b','c','d'])
df1
yearincomepay
a2014100005000
b20153000020000
c20165000030000
d20176000030000
df2=pd.DataFrame(np.arange(12).reshape(3,4))
df2
0123
00123
14567
2891011
df3=pd.DataFrame(np.arange(12).reshape(3,4),index=['a','c','b'],columns=[2,33,44,5])
df3
233445
a0123
c4567
b891011
df1.columns
Index(['year', 'income', 'pay'], dtype='object')
df1.values
array([[ 2014, 10000,  5000],
       [ 2015, 30000, 20000],
       [ 2016, 50000, 30000],
       [ 2017, 60000, 30000]], dtype=int64)
df1.describe()
yearincomepay
count4.0000004.0000004.000000
mean2015.50000037500.00000021250.000000
std1.29099422173.55782611814.539066
min2014.00000010000.0000005000.000000
25%2014.75000025000.00000016250.000000
50%2015.50000040000.00000025000.000000
75%2016.25000052500.00000030000.000000
max2017.00000060000.00000030000.000000
df1.T
abcd
year2014201520162017
income10000300005000060000
pay5000200003000030000
df3.sort_index(axis=1)
253344
a0312
c4756
b811910
df3.sort_values(by=44)# sort the rows by the values in one column (the column labelled 44)
233445
a0123
c4567
b891011
dates=pd.date_range('20190101',periods=6)
df1=pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['a','b','c','d'])
df1
abcd
2019-01-010123
2019-01-024567
2019-01-03891011
2019-01-0412131415
2019-01-0516171819
2019-01-0620212223
df1.a
2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: a, dtype: int32
# Select data by label: row '20190101', columns 'a' and 'c'.
df1.loc['20190101',['a','c']]
a    0
c    2
Name: 2019-01-01 00:00:00, dtype: int32
df1.loc[:,['a','c']]
ac
2019-01-0102
2019-01-0246
2019-01-03810
2019-01-041214
2019-01-051618
2019-01-062022
# Select data by integer position: the row at position 2 (the third row).
df1.iloc[2]
a     8
b     9
c    10
d    11
Name: 2019-01-03 00:00:00, dtype: int32
df1.iloc[[1,2,4],[1,3]]
bd
2019-01-0257
2019-01-03911
2019-01-051719
df1.a > 6
2019-01-01    False
2019-01-02    False
2019-01-03     True
2019-01-04     True
2019-01-05     True
2019-01-06     True
Freq: D, Name: a, dtype: bool
df1[df1.a > 6]
abcd
2019-01-03891011
2019-01-0412131415
2019-01-0516171819
2019-01-0620212223
# Handling missing values.
# The row labels are plain integers that merely look like dates.
dates=np.arange(20190101,20190105)
df1=pd.DataFrame(np.arange(12).reshape(4,3),index=dates,columns=['a','b','c'])
# Re-build from df1 while asking for two extra columns; 'd' and 'e' have no
# source data, so they come out as NaN (see the output below).
df2=pd.DataFrame(df1,index=dates,columns=['a','b','c','d','e'])
df2
abcde
20190101012NaNNaN
20190102345NaNNaN
20190103678NaNNaN
2019010491011NaNNaN
# s1 covers the first three labels, s2 the last three.
s1=pd.Series([3,4,6],index=dates[:3])
s2=pd.Series([32,5,2],index=dates[1:])
# Column assignment aligns on the index: rows whose label is absent from the
# Series stay NaN (last row of 'd', first row of 'e' in the output below).
df2['d']=s1
df2['e']=s2
df2
abcde
201901010123.0NaN
201901023454.032.0
201901036786.05.0
2019010491011NaN2.0
# Drop the rows that contain missing values.
df2.dropna(axis=0,how='any')# how is 'any' or 'all'; 'any' drops a row when one or more of its values are missing
abcde
201901023454.032.0
201901036786.05.0
# Replace missing values with 0.
df2.fillna(value=0)
abcde
201901010123.00.0
201901023454.032.0
201901036786.05.0
20190104910110.02.0
df2.isnull()
abcde
20190101FalseFalseFalseFalseTrue
20190102FalseFalseFalseFalseFalse
20190103FalseFalseFalseFalseFalse
20190104FalseFalseFalseTrueFalse

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值