import numpy as np
import pandas as pd
pandas数据整理
flights = pd.read_csv('flights.csv',index_col=0)
# filter
# filter(flights, month == 1, day == 1)
flights.loc[(flights['month']==1) & (flights['day']==1)]
筛选行
# nov_dec <- filter(flights, month %in% c(11, 12))
flights.loc[(flights['month']==11) | (flights['month']==12)]
排序
# arrange(flights, year, month, day)
flights.sort_values(by=['year','month','day'])
# arrange(flights, desc(dep_delay))
flights.sort_values(by=['year','month','day'],ascending = False)
选取列
# 选取列
# select(flights, year, month, day)
flights.filter(items= ['year','month','day'])
# select(flights, year:day)??
# select(flights, -(year:day))
# starts_with("abc"): matches names that begin with “abc”.
# ends_with("xyz"): matches names that end with “xyz”.
# contains("ijk"): matches names that contain “ijk”.
# matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.
# num_range("x", 1:3): matches x1, x2 and x3.
# df.filter(regex='Q', axis=1) # 列名包含Q的?
# select(flights, time_hour, air_time, everything())
创建新变量
# flights_sml <- select(flights,
# year:day,
# ends_with("delay"),
# distance,
# air_time
# )
# mutate(flights_sml,
# gain = dep_delay - arr_delay,
# speed = distance / air_time * 60
# )
flights_sml = flights.filter(items = ['year','month','day','distance','airtime','dep_delay','arr_delay'])
flights_sml['gain'] = flights_sml['dep_delay']-flights_sml['arr_delay']
# by_day <- group_by(flights, year, month, day)
# summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
flights.groupby(['year','month','day'])['dep_delay'].agg(np.mean)
数据清洗
长短表转换
t1 = pd.read_csv('t1.csv',index_col=0)
t2 = pd.read_csv('t2.csv',index_col=0)
t3 = pd.read_csv('t3.csv',index_col=0)
t4a = pd.read_csv('t4a.csv',index_col=0)
t4b = pd.read_csv('t4b.csv',index_col=0)
# table4a %>%
# pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
t4a
t4a.melt(id_vars='country',var_name='year',value_name='cases')
# table2 %>%
# pivot_wider(names_from = type, values_from = count)
t2
t2.pivot_table(index=['country','year'],columns='type',values= 'count')
seprate 和 unite
# table3 %>%
# separate(rate, into = c("cases", "population"))
t5 = t3['rate'].str.split('/',1,expand=True)
t5
t5.iloc[:,0]+'/'+t5.iloc[:,1]
数据表合并
# pd.merge