Pandas

  • pandas基本操作:
# pandas:关于数据处理的库;基于numpy
import pandas as pd
import numpy as np

# Load a CSV file into a DataFrame (a 2-D table of rows and columns).
# Column dtypes: object is what pandas reports for strings; others are int / float / bool.
# NOTE(review): the original note said the file name is case-insensitive; that
# depends on the OS/filesystem, so match the on-disk name exactly.
data = pd.read_csv("ex1data1.txt")
print(data.dtypes)   # dtype of each column
print(type(data))   # DataFrame
print(data.head())   # first 5 rows by default
print(data.tail())   # last 5 rows by default
print(data.columns)   # column labels
print(data.shape)   # (number of rows, number of columns)

# Reading rows with .loc (label-based indexing).
# A label slice loc[start:end] INCLUDES the end label, unlike ordinary Python slices.
print(data.loc[0])
print(data.loc[0:3])

# Small self-contained frame to show row lookup vs. single-cell lookup.
test = pd.DataFrame({'a':np.arange(1,8),'b':np.arange(8,1,-1)})
print(test.loc[1])
print(test.loc[1,'a'])
# Output (the integer dtype shown depends on the platform's default int):
# a    2
# b    7
# Name: 1, dtype: int32
# 2

# Selecting several columns at once with a list of column labels.
# NOTE(review): assumes the loaded file has columns named 'a' and 'b' — confirm.
columns = ['a','b']
print(data[columns])

# Find columns by name.
col_names = data.columns.tolist()
print(col_names)
# Compare with == (value equality). `is` tests object identity, which is not
# guaranteed to hold for two equal strings and raises a SyntaxWarning when
# used against a literal on Python 3.8+.
temp = [c for c in col_names if c == 'a']
print(temp)

# Arithmetic on columns is applied element-wise.
print(data['a'] * 100)
print(data.shape)
c = data["a"] + data['b']
data['c'] = c   # assigning to a new label adds a column
print(data.shape)

# Column-wise reductions.
print(data['a'].min())
print(data['b'].max())
print(data['b'].mean())

# Sorting: sort_values is ascending by default; ascending=False reverses that.
data.sort_values('a',inplace = True,ascending=False)   # inplace=True sorts `data` itself instead of returning a new frame (default is False)
print(data['a'])
  • 泰坦尼克号数据集
# Titanic passenger data set.
titanic = pd.read_csv("titanic_train.csv")
print(titanic.dtypes)
print(titanic.head())

# Boolean mask over the Age column: True where the value is missing.
age = titanic["Age"]
age_is_null = age.isnull()
print(age_is_null)
age_null_true = age.loc[age_is_null]
print(len(age_null_true))   # number of passengers with no recorded age

# The built-in sum() does not skip NaN, so averaging the raw column yields
# nan whenever any age is missing.
mean_age = sum(age) / len(age)
print(mean_age)

# Averaging only the non-missing ages gives a usable mean; Series.mean() is
# printed first for comparison with the hand-computed value.
good_age = age[~age_is_null]
correct_mean_age = sum(good_age) / len(good_age)
print(good_age.mean())
print(correct_mean_age)

# Mean ticket fare per passenger class, computed by hand (compare with pivot_table).
passenger_classes = [1, 2, 3]
fares_by_class = {}   # maps passenger class -> mean fare
for this_class in passenger_classes:
    # Select the Fare values of the rows belonging to this class, then average them.
    in_class = titanic["Pclass"] == this_class
    fares_by_class[this_class] = titanic.loc[in_class, "Fare"].mean()
print(fares_by_class)

# pivot_table builds a spreadsheet-style summary relating one column to others.
#   index:   the column whose distinct values become the groups (rows)
#   values:  the column(s) to aggregate within each group
#   aggfunc: how to aggregate (mean by default)
# Aggregations are named by string: passing np.mean / np.sum emits a
# FutureWarning on pandas >= 2.1, and the string form keeps the same result.
passenger_survival = titanic.pivot_table(index="Pclass", values="Survived", aggfunc="mean")   # mean of Survived per class (the survival rate if Survived is 0/1)
print(passenger_survival)

port_stats = titanic.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")   # per embarkation port: summed Fare and summed Survived
print(port_stats)

# Dropping missing values.
drop_na = titanic.dropna(axis=1)   # axis=1: drop columns that contain missing values; axis=0: drop rows that do
new_titanic = titanic.dropna(axis=0,subset=["Age","Sex"])   # only the Age and Sex columns are checked for missing values

# Look up a single cell by row label and column label.
row_index_83_age = titanic.loc[83,"Age"]
print(row_index_83_age)

# NOTE: this rebinds new_titanic, replacing the dropna result assigned above.
new_titanic = titanic.sort_values("Age",ascending=False)
print(new_titanic.head())
print(new_titanic.columns)
titanic_reindexed = new_titanic.reset_index(drop=True)   # renumber the rows after sorting; drop=True discards the old index
print(titanic_reindexed.head())

# 函数
def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    The lookup is label-based (``.loc``), so this is the hundredth item only
    when the index is the default 0, 1, 2, ... numbering.

    Raises:
        KeyError: if *column* has no label 99.
    """
    return column.loc[99]
# With the default axis=0, apply() calls the function once per column, so the
# result holds one value per column: the entry at row label 99.
hundredth = titanic.apply(hundredth_row)
print(hundredth)

def not_null_count(column):
    """Return how many entries of *column* are missing (null/NaN).

    Note: despite its name, this counts the NULL entries, not the non-null
    ones. The name is kept because callers refer to it.
    """
    return len(column[pd.isnull(column)])
# One result per column: the number of missing values found in that column.
column_null_count = titanic.apply(not_null_count)
print(column_null_count)

def which_class(row):
    """Translate the ``Pclass`` value of *row* into a readable label.

    Returns "UnKnown" when the value is missing, the matching
    "First class" / "Second class" / "Third class" label for 1 / 2 / 3,
    and None for any other value.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "UnKnown"

    # Checked in order with ==, so numeric values such as 1.0 match as well.
    labels = (
        (1, "First class"),
        (2, "Second class"),
        (3, "Third class"),
    )
    for number, label in labels:
        if pclass == number:
            return label
    return None
# axis=1 makes apply() call the function once per row instead of once per column.
classes = titanic.apply(which_class,axis=1)
print(classes)

def is_minor(row):
    """Return True when the ``Age`` value of *row* is below 18, else False.

    A missing age stored as NaN compares as not-less-than 18, so such rows
    are reported as False.
    """
    return bool(row["Age"] < 18)
# Row-wise apply (axis=1): one boolean per passenger.
minors = titanic.apply(is_minor,axis=1)

def generate_age_label(row):
    """Classify *row* by its ``Age`` value.

    Returns "unknown" when the age is missing, "minor" when it is below 18,
    and "adult" otherwise.
    """
    age = row["Age"]
    # Handle the missing case first: NaN would otherwise fall through to "adult".
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Label every passenger row-wise, store the labels as a new column, then
# average Survived within each label.
age_label = titanic.apply(generate_age_label, axis=1)
titanic['age_labels'] = age_label
# The aggregation is named by string: passing np.mean emits a FutureWarning
# on pandas >= 2.1, and "mean" produces the same table.
age_group_survival = titanic.pivot_table(index="age_labels", values="Survived", aggfunc="mean")
print(age_group_survival)
  • pandas补充:
# value_counts() is a shortcut for tallying how often each distinct value occurs in a column.
# It is a Series method, so with a DataFrame you normally select one column (or row) first.
# The result is itself a Series: its index holds the distinct values, its values hold their counts.

# NOTE(review): norm_reviews is not defined in this section — presumably a DataFrame loaded elsewhere; confirm.
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
# Reorder the counts by rating value (the index) instead of by frequency.
fandango_distribution = fandango_distribution.sort_index()
  • series
# DataFrame里面的结构series,series里面的结构ndarray

import pandas as pd
import numpy as np
from pandas import Series

# Data: movie review scores.
fandango = pd.read_csv("fandango_score_comparison.csv")
series_film = fandango['FILM']   # a single column is a Series; selecting several (fandango[["a","b"]]) gives a DataFrame
print(type(series_film))
print(series_film.dtypes)
series_rt = fandango['RottenTomatoes']

# .values exposes the underlying data of a Series as a NumPy array.
film_names = series_film.values
rt_scores = series_rt.values
print(type(film_names))   # <class 'numpy.ndarray'>
series_custom = Series(rt_scores,index=film_names)   # Series(values, index): a Series may use strings as its index labels
print(series_custom[5:10])   # slices by position: entries 5 through 9

# Sorting works essentially the same as for a DataFrame; it is not commonly done on a Series.

# NumPy functions can be called directly on a Series.
print(np.add(series_custom,series_custom))   # operates on the values of the Series
print(np.sin(series_custom))
print(np.max(series_custom))

# Boolean filtering with two conditions combined by &.
series_greater = series_custom[(series_custom > 50) & (series_custom <= 100)]   # parentheses are required: & binds tighter than the comparisons
print(series_greater)

# set_index(): turn a DataFrame column into the row index.
# drop parameter: by default the column is removed once it becomes the index;
# pass drop=False to keep it as a regular column as well.
print(np.arange(7))
test = pd.DataFrame({'a':np.arange(1,8),'b':np.arange(8,1,-1)})
test_set_index = test.set_index('a')
test_set_index1 = test.set_index('a',drop=False)
print(test)
print(test_set_index)
print(test_set_index1)
# Output of the three frames, for comparison:
#    a  b
# 0  1  8
# 1  2  7
# 2  3  6
# 3  4  5
# 4  5  4
# 5  6  3
# 6  7  2
#    b
# a
# 1  8
# 2  7
# 3  6
# 4  5
# 5  4
# 6  3
# 7  2
#    a  b
# a
# 1  1  8
# 2  2  7
# 3  3  6
# 4  4  5
# 5  5  4
# 6  6  3
# 7  7  2


# Index the films by title while keeping FILM available as a regular column.
fandango_films = fandango.set_index('FILM', drop=False)

# Keep only the columns whose dtype is float64.
# `types` is a Series mapping column name -> dtype (types.index is the same as types.keys()).
types = fandango.dtypes
float_columns = types.index[types.values == 'float64']
float_df = fandango[float_columns]
print(float_df)

# std(): standard deviation. By default NumPy divides by the sample count N,
# whereas pandas divides by N-1.
# lambda: an anonymous function, handy for replacing a small one-off function
# so the code reads more compactly.
# f = lambda x:pow(x,2)
# print(f(2))
# NOTE(review): each x passed by apply() here is a pandas Series, not a bare
# ndarray — confirm the result uses the divisor (N vs. N-1) you intend.
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值