pandas数据处理--结合r

21 篇文章 1 订阅
import numpy as np
import pandas as pd

pandas数据整理

flights = pd.read_csv('flights.csv',index_col=0)

# filter
# filter(flights, month == 1, day == 1)
flights.loc[(flights['month']==1) & (flights['day']==1)]

筛选行

# nov_dec <- filter(flights, month %in% c(11, 12))
flights.loc[(flights['month']==11) | (flights['month']==12)]

排序

# arrange(flights, year, month, day)
flights.sort_values(by=['year','month','day'])
# arrange(flights, desc(dep_delay))
flights.sort_values(by=['year','month','day'],ascending = False)

选取列

# 选取列
# select(flights, year, month, day)

flights.filter(items= ['year','month','day'])
# select(flights, year:day)??

# select(flights, -(year:day))



#     starts_with("abc"): matches names that begin with “abc”.

#     ends_with("xyz"): matches names that end with “xyz”.

#     contains("ijk"): matches names that contain “ijk”.

#     matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.

#     num_range("x", 1:3): matches x1, x2 and x3.


# df.filter(regex='Q', axis=1) # 列名包含Q的?

# select(flights, time_hour, air_time, everything())

创建新变量

# flights_sml <- select(flights, 
#   year:day, 
#   ends_with("delay"), 
#   distance, 
#   air_time
# )
# mutate(flights_sml,
#   gain = dep_delay - arr_delay,
#   speed = distance / air_time * 60
# )



flights_sml = flights.filter(items = ['year','month','day','distance','airtime','dep_delay','arr_delay'])
flights_sml['gain'] = flights_sml['dep_delay']-flights_sml['arr_delay']
# by_day <- group_by(flights, year, month, day)
# summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))

flights.groupby(['year','month','day'])['dep_delay'].agg(np.mean)

数据清洗

长短表转换

t1 = pd.read_csv('t1.csv',index_col=0)
t2 = pd.read_csv('t2.csv',index_col=0)
t3 = pd.read_csv('t3.csv',index_col=0)
t4a = pd.read_csv('t4a.csv',index_col=0)
t4b = pd.read_csv('t4b.csv',index_col=0)

# table4a %>% 
#   pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
t4a
t4a.melt(id_vars='country',var_name='year',value_name='cases')
# table2 %>%
#     pivot_wider(names_from = type, values_from = count)
t2
t2.pivot_table(index=['country','year'],columns='type',values= 'count')

seprate 和 unite

# table3 %>% 
#   separate(rate, into = c("cases", "population"))
t5  = t3['rate'].str.split('/',1,expand=True)
t5
t5.iloc[:,0]+'/'+t5.iloc[:,1]

数据表合并

# pd.merge
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值