三、Pandas小结（1）—Series & DataFrame & 索引

最新推荐文章于 2021-10-15 21:46:40 发布

爱看球的领带

最新推荐文章于 2021-10-15 21:46:40 发布

阅读量429

点赞数

本文链接：https://blog.csdn.net/jzlixiao/article/details/79557103

版权

# Part 1. Series

import pandas as pd

# 1. Creating a Series
# From a list of strings.
countries = ['中国','美国','澳大利亚']
countries_s = pd.Series(countries)
print(type(countries_s))
print(countries_s)

# From a list of integers.
numbers = [4,5,6]
print(pd.Series(numbers))

# From a dict: the keys become the index labels, the values become the data.
country_dicts = {'CH': '中国',
'US': '美国',
'AU': '澳大利亚'}

country_dict_s = pd.Series(country_dicts)
# Give the index a name
country_dict_s.index.name = 'Code'
# Give the data (the Series itself) a name
country_dict_s.name = 'Country'

print(country_dict_s)
print(country_dict_s.values)
print(country_dict_s.index)

# 2. Handling missing data
# A None entry among strings marks a missing value.
countries = ['中国', '美国', '澳大利亚', None]
print(pd.Series(countries))

# A None entry among numbers is converted to NaN, which makes the Series float64.
numbers = [4, 5, 6, None]
print(pd.Series(numbers))

# 3. Indexing a Series
country_dicts = {'CH': '中国',
'US': '美国',
'AU': '澳大利亚'}

country_dict_s = pd.Series(country_dicts)
print(country_dict_s)

# Use the index to check whether an entry exists.
# A Series can also be seen as a fixed-length, ordered dict, so `in` tests the index labels.
print('CH' in country_dict_s)
print('NZ' in country_dict_s)

print('iloc: ', country_dict_s.iloc[2]) # position-based indexing
print('loc: ', country_dict_s.loc['CH']) # label-based indexing
print('[]:', country_dict_s['US'])

# Passing a list of positions or labels selects several entries at once.
print('iloc:\n', country_dict_s.iloc[[0, 2]])
print()
print('loc:\n', country_dict_s.loc[['US', 'AU']])

# 4. Vectorized operations
import numpy as np

# 10,000 random integers drawn from [0, 1000)
s = pd.Series(np.random.randint(0, 1000, 10000))
print(s.head()) # with no argument, head() defaults to the first 5 rows
print(s.head(10))
print(len(s))

%%timeit -n 100
# Sum the Series one element at a time with a Python-level loop; this is the
# slow baseline that the np.sum() cell is timed against.
total = 0
for item in s:
    total += item

%%timeit -n 100
# Same sum, computed by a single NumPy call instead of a Python loop.
total = np.sum(s)

%%timeit -n 10
# Add 2 to every element one label at a time through .loc; this is the slow
# baseline that the vectorized `s += 2` cell is timed against.
s = pd.Series(np.random.randint(0, 1000, 10000))
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs.
for label, value in s.items():
    s.loc[label] = value + 2

%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
# Vectorized: one expression adds 2 to every element, with no Python loop.
s += 2

# Part 2. DataFrame

# 1. Creating a DataFrame
import pandas as pd

# One Series per country; all three use the same keys.
country1 = pd.Series({'Name': '中国',
'Language': 'Chinese',
'Area': '9.597M km2',
'Happiness Rank': 79})

country2 = pd.Series({'Name': '美国',
'Language': 'English (US)',
'Area': '9.834M km2',
'Happiness Rank': 14})

country3 = pd.Series({'Name': '澳大利亚',
'Language': 'English (AU)',
'Area': '7.692M km2',
'Happiness Rank': 9})

# Each Series becomes one row, labelled by the matching entry of `index`;
# the Series keys become the columns.
df = pd.DataFrame([country1, country2, country3], index = ['CH', 'US', 'AU'])
# Note the difference in Jupyter between print(df) and evaluating a bare `df`
print(df)
df

# Adding data (new columns)
# A scalar value is broadcast to every row.
# A list must contain exactly one item per row; a shorter or longer list raises ValueError.
df['Location'] = '地球'
print(df)

df['Region'] = ['亚洲', '北美洲', '大洋洲']
# print(df)
df

# 2. Indexing a DataFrame
# Label-based row indexing; the type() call shows what kind of object one row is
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))
# Position-based row indexing
print('iloc:')
print(df.iloc[1])
# Column indexing; the type() call shows what kind of object one column is
print(df['Area'])
print(type(df['Area']))

# Select several non-adjacent columns by passing a list of column names
print(df[['Language', 'Location']])

# Mixed indexing
# Note how the two forms below are written differently
print('**************************')
# Column first, then row
print('先取出列，再取行：')
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

# Row first, then column
print('先取出行，再取列：')
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])

# Transpose: swap rows and columns
print(df.T)

# 3. Deleting data
print(df.drop(['CH']))
# Note: drop() returns a modified copy and leaves the original data unchanged
print(df)

print(df.drop(['CH'], inplace=True))
# With inplace=True the original data is modified and no copy is returned,
# so the print() above shows None
print(df)

# To delete a column, pass axis=1
print(df.drop(['Area'], axis=1))
print(df)

# The del keyword can be used directly as well; it removes the column in place
del df['Name']
print(df)

# 4. Operating on and loading a DataFrame
df['Happiness Rank']

# Note: modifying a column taken out of a DataFrame in place can also change the original DataFrame
# NOTE(review): with Copy-on-Write (the default from pandas 3.0) df is no longer affected — confirm against the pandas version in use
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)

# The safe approach is to work on an explicit copy():
# changes to the copy do not reach the original DataFrame
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)

# Load data from a CSV file
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览：')
#print(reprot_2015_df.head())
reprot_2015_df.head()

# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
reprot_2015_df.info()

# Part 3. Indexes

# index_col selects the column used as the row index
# usecols restricts which columns are read
reprot_2016_df = pd.read_csv('./2016.csv',
index_col='Country',
usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# Preview the data
reprot_2016_df.head()

# Show the column labels and the row labels
print('列名(column)：', reprot_2016_df.columns)
print('行名(index)：', reprot_2016_df.index)

# Note: an Index is immutable, so item assignment raises TypeError.
# The error is caught and printed so the demonstration does not abort
# everything after it when the code runs as a plain script.
try:
    reprot_2016_df.index[0] = '丹麦'
except TypeError as err:
    print(err)

# Reset the index
# Note the difference between passing inplace=True and omitting it:
# here the DataFrame itself is modified rather than a new one being returned
reprot_2016_df.reset_index(inplace=True)

reprot_2016_df.head()

# Rename the columns; again note the use of inplace
reprot_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'},
inplace=True)

reprot_2016_df.head()

# Part 4. Boolean mask

reprot_2016_df.head()

# Each comparison yields a boolean Series; & combines them element-wise.
# The parentheses are required because & binds more tightly than == and >.
# The mask is True for 'Western Europe' rows whose rank is greater than 10.
only_western_europe_10 = (reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)
only_western_europe_10
# Apply the combined boolean mask to get the final result
reprot_2016_df[only_western_europe_10]
# The same selection written as a single expression
reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)]

# Part 5. Hierarchical index (MultiIndex)

reprot_2015_df.head()

# Build a two-level index: 'Region' is level 0, 'Country' is level 1
report_2015_df2 = reprot_2015_df.set_index(['Region', 'Country'])
report_2015_df2.head(20)

# Select by the level-0 label
report_2015_df2.loc['Western Europe']

# Select by both index levels
report_2015_df2.loc['Western Europe', 'Switzerland']

# Swap the order of the index levels (returns a new DataFrame; the result is not stored here)
report_2015_df2.swaplevel()

# Sort by index level 0 (returns a new DataFrame; the result is not stored here)
report_2015_df2.sort_index(level=0)

爱看球的领带

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
三、Pandas小结（1）—Series & DataFrame & 索引

# 一. Seriesimport pandas as pd#1、创建Seriescountries = ['中国','美国','澳大利亚']countries_s = pd.Series(countries)print(type(countries_s))print(countries_s)numbers = [4,5,6]print(pd.Series(numbers))country_dic...
复制链接

扫一扫