😐 详见 pandas官网
数据建立
DateFrame
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})
# 创建方法1
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'],
'Sue': ['Pretty good.', 'Bland.']},
index=['Product A', 'Product B'])
# 创建方法2
pd.DataFrame([[35, 21], [41, 34]],
columns=['Apples', 'Bananas'],
index=['2017 Sales', '2018 Sales'])
Series
pd.Series([1, 2, 3, 4, 5])
pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')
重命名和组合(行列)
重命名
reviews.rename(columns={'points': 'score'})
reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'})
reviews.rename_axis("wines", axis='rows').rename_axis("fields", axis='columns')
组合
concat
canadian_youtube = pd.read_csv("../input/youtube-new/CAvideos.csv")
british_youtube = pd.read_csv("../input/youtube-new/GBvideos.csv")
pd.concat([canadian_youtube, british_youtube])
join
left = canadian_youtube.set_index(['title', 'trending_date'])
right = british_youtube.set_index(['title', 'trending_date'])
left.join(right, lsuffix='_CAN', rsuffix='_UK')
merge
.csv数据导入
melbourne_file_path = '../***/***.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
导入数据查看
整体查看
# 查看所有
melbourne_data.describe()
# 查看前5行
melbourne_data.head()
或'查看一行'
reviews.head(1)
# 查看列名
melbourne_data.columns
# 查看数据多大
melbourne_data.shape
查看特定
查看指定列
reviews.country
或
reviews['country']
或
reviews.iloc[:, 0] # 也可查看行,[]内与python行列顺序相反
查看指定行
# 第一行
reviews.iloc[0]
# 倒数五行
reviews.iloc[-5:]
查看指定(范围)格
# 查看一格
reviews['country'][0]
iloc
# 查看指定范围格子
reviews.iloc[:3, 0]
# 查看分散范围格
reviews.iloc[[0, 1, 2], 0]
loc
# 指定格
reviews.loc[0, 'country']
# 保留需要列
reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']]
iloc和loc之间需要注意
查看指定类型
# include:选择的数据类型, exclude:排除的数据类型
DataFrame.select_dtypes(include['int'], exclude=None)
查看特定-设置索引查看
reviews.set_index("title")
查看特定-逻辑查看(true&flase)
reviews.country == 'Italy'
条件查看
# 和loc结合
reviews.loc[reviews.country == 'Italy']
条件结合
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)]
同理
reviews.loc[(reviews.country == 'Italy') | (reviews.points >= 90)]
reviews.loc[reviews.country.isin(['Italy', 'France'])]
计算相关
均值
reviews.points.mean()
中值
reviews.points.median()
最大值查找
bargain_idx = (reviews.points / reviews.price).idxmax()
print(reviews.loc[bargain_idx])
唯一值
# 这里是taster_name这列的唯一值
reviews.taster_name.unique()
出现频次
reviews.taster_name.value_counts()
注:组分析的count / size 也可计算频次
改变、替换值
reviews['critic'] = 'everyone'
reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino")
值迭代
reviews['index_backwards'] = range(len(reviews), 0, -1)
map
review_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - review_points_mean)
或
review_points_mean = reviews.points.mean()
reviews.points - review_points_mean
两者区别:
apply
# 批量修改points数据
def remean_points(row):
row.points = row.points - review_points_mean
return row
reviews.apply(remean_points, axis='columns')
map和apply需注意
“国家”、“地区”合并:标签合并?
reviews.country + " - " + reviews.region_1
groupby组分析(实用复杂索引和排序)
频次count / size
'''
和value_counts()功能一致,
区别:value_counts()是groupby的快捷方式。
'''
reviews.groupby('points').points.count()
或
reviews.groupby('points').size()
区别:
min
# 找到每个points里的最小值
reviews.groupby('points').price.min()
apply
reviews.groupby('winery').apply(lambda df: df.title.iloc[0])
多标签分组
reviews.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])
agg运行一组函数
reviews.groupby(['country']).price.agg([len, min, max])
多索引
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed
mi = countries_reviewed.index
type(mi)
多索引详见: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html
转回常规索引
countries_reviewed.reset_index()
排序
值的排序
# 转常规索引
countries_reviewed = countries_reviewed.reset_index()
# 按标签'len'升序(需存在len,这里countries_reviewed是组)
countries_reviewed.sort_values(by='len')
# 降序
# countries_reviewed.sort_values(by='len', ascending=False)
索引的排序
countries_reviewed.sort_index()
多条件排序(值和索引)
countries_reviewed.sort_values(by=['country', 'len'])
数据类型和丢失数据
数据类型
reviews.price.dtype
reviews.dtypes
类型转换
reviews.points.astype('float64')
DataFram、Series Index数据类型
reviews.index.dtype
丢失数据(na)设为不可用
melbourne_data = melbourne_data.dropna(axis=0)
空数据
# 非空,过滤空数据
reviews.loc[reviews.price.notnull()]
# 显示空数据
reviews[pd.isnull(reviews.country)]
替换缺失
# 将nan替换成unknown
reviews.region_2.fillna("Unknown")
删除数据
.drop()
# 删除列 'Price'
melb_predictors = data.drop(['Price'], axis=1)
数据保存
导出为.csv
melbourne_data .to_csv(“melbourne_data .csv”)
注意:保存前要确认数据类型是pandas,如果是其他类型,请往下看 😃
numpy 类型
md = pd.DataFrame(melbourne_data ) # 转换为pandas类型
md.to_csv("melbourne_data .csv")