# pandas:关于数据处理的库;基于numpy
import pandas as pd
import numpy as np
# Load the CSV into a DataFrame (a labelled table of rows and columns).
# Column dtypes are typically object (strings), int, float or bool.
# NOTE(review): the file name is only case-insensitive on case-insensitive
# filesystems -- confirm before relying on it.
data = pd.read_csv("ex1data1.txt")

# Quick overview, printed in this order: per-column dtypes, the container
# type (DataFrame), the first five rows, the last five rows, the column
# labels and the (rows, columns) shape.
for overview in (
    data.dtypes,
    type(data),
    data.head(),
    data.tail(),
    data.columns,
    data.shape,
):
    print(overview)

# Label-based row access: .loc[label] returns a single row, and a
# .loc[start:end] slice INCLUDES the end label.
first_row = data.loc[0]
rows_zero_to_three = data.loc[0:3]
print(first_row)
print(rows_zero_to_three)
# Small demo frame: column 'a' counts 1..7, column 'b' counts 8 down to 2.
ascending_values = np.arange(1, 8)
descending_values = np.arange(8, 1, -1)
test = pd.DataFrame({'a': ascending_values, 'b': descending_values})

# .loc[row] yields the whole row as a Series; .loc[row, column] yields one cell.
print(test.loc[1])
print(test.loc[1, 'a'])
# Expected output (the integer dtype shown depends on the platform):
# a    2
# b    7
# Name: 1, dtype: int32
# 2
# Read several columns at once by passing a list of column labels.
# NOTE(review): assumes `data` has columns named 'a' and 'b' -- confirm
# against the header row of ex1data1.txt.
columns = ['a', 'b']
print(data[columns])

# Search the column labels for a given name.
col_names = data.columns.tolist()
print(col_names)
# Compare with ==, not `is`: `is` tests object identity, which for strings
# depends on interning and is not a reliable equality check.
temp = [name for name in col_names if name == 'a']
print(temp)
# Element-wise arithmetic on a column.
scaled_a = data['a'] * 100
print(scaled_a)

# Adding a column: the shape is printed before and after to show the change.
print(data.shape)
c = data['a'] + data['b']
data['c'] = c
print(data.shape)

# Simple column statistics, printed in order: min of 'a', max of 'b', mean of 'b'.
for statistic in (data['a'].min(), data['b'].max(), data['b'].mean()):
    print(statistic)

# Sorting is ascending by default; ascending=False reverses that here.
# inplace=True reorders `data` itself instead of returning a sorted copy
# (the default is inplace=False).
data.sort_values(by='a', ascending=False, inplace=True)
print(data['a'])
# Titanic data set.
titanic = pd.read_csv("titanic_train.csv")
print(titanic.dtypes)
print(titanic.head())

# Find missing ages: isnull() gives a boolean mask, True where the value is missing.
age = titanic["Age"]
age_is_null = age.isnull()
print(age_is_null)
age_null_true = age[age_is_null]
print(len(age_null_true))  # number of missing ages

# Naive mean with the built-in sum(): a single NaN in the column makes the
# whole result NaN.
mean_age = sum(age) / len(age)
print(mean_age)

# Drop the missing entries first, then the hand-computed mean is meaningful.
good_age = age[~age_is_null]
correct_mean_age = sum(good_age) / len(good_age)
print(good_age.mean())  # pandas' own mean of the filtered ages
print(correct_mean_age)
# Average fare per passenger class, computed by hand (pivot_table can
# produce the same kind of summary in one call).
passenger_classes = [1, 2, 3]
fares_by_class = {}  # maps passenger class -> mean fare
for this_class in passenger_classes:
    # Keep only the rows of the current class, then average their fares.
    pclass_rows = titanic[titanic["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
# pivot_table builds a spreadsheet-style summary of how one set of columns
# relates to another:
#   index   - the column whose distinct values form the groups
#   values  - the column(s) summarised within each group
#   aggfunc - how to summarise them (defaults to the mean)
# The aggregations are named by string ("mean", "sum"); passing the NumPy
# callables np.mean / np.sum is deprecated in recent pandas releases.
passenger_survival = titanic.pivot_table(index="Pclass", values="Survived", aggfunc="mean")  # survival rate per class
print(passenger_survival)
port_stats = titanic.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")  # fare total and survivor count per embarkation port
print(port_stats)
# Dropping missing values.
# axis=1 drops every COLUMN that contains a missing value;
# axis=0 drops every ROW that does (here only 'Age' and 'Sex' are checked).
drop_na = titanic.dropna(axis=1)
new_titanic = titanic.dropna(subset=["Age", "Sex"], axis=0)

# Locate a single cell by row label and column label.
row_index_83_age = titanic.loc[83, "Age"]
print(row_index_83_age)

# Sort by age, oldest first. This rebinds new_titanic, so the dropna result
# assigned above is discarded.
new_titanic = titanic.sort_values(by="Age", ascending=False)
print(new_titanic.head())
print(new_titanic.columns)

# After sorting, the old row labels are out of order; reset_index renumbers
# them from 0, and drop=True throws the old labels away instead of keeping
# them as a column.
titanic_reindexed = new_titanic.reset_index(drop=True)
print(titanic_reindexed.head())
# Custom functions used with DataFrame.apply.
def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    With a default 0-based index this is the hundredth item. The lookup is
    label-based (``.loc``), so a KeyError is raised when label 99 is absent.
    """
    return column.loc[99]
# apply() works column by column by default (axis=0), so this collects the
# label-99 entry of every column.
hundredth = titanic.apply(hundredth_row, axis=0)
print(hundredth)
def not_null_count(column):
    """Return how many entries of *column* are missing (null/NaN).

    NOTE: despite its name, this function counts the NULL entries, not the
    non-null ones. The name is kept so existing callers keep working.
    """
    # Summing a boolean mask counts its True values, i.e. the missing entries.
    return int(pd.isnull(column).sum())
# Number of missing values in each column (apply runs column-wise by default).
column_null_count = titanic.apply(not_null_count, axis=0)
print(column_null_count)
def which_class(row):
    """Translate the ``Pclass`` value of *row* into a readable label.

    Returns "UnKnown" when the value is missing, "First class" /
    "Second class" / "Third class" for 1 / 2 / 3, and None for any other
    value.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "UnKnown"
    labels = {
        1: "First class",
        2: "Second class",
        3: "Third class",
    }
    # .get() returns None for an unrecognised class, which makes explicit
    # the implicit fall-through of the original if/elif chain.
    return labels.get(pclass)
# axis="columns" (the same as axis=1) hands each ROW to the function.
classes = titanic.apply(which_class, axis="columns")
print(classes)
def is_minor(row):
    """Return True when the ``Age`` of *row* is below 18, otherwise False.

    A missing age (NaN) compares as not-less-than 18, so it yields False.
    """
    # bool() keeps the return type a plain Python bool, as before.
    return bool(row["Age"] < 18)
# Row-wise flag: True for passengers younger than 18.
minors = titanic.apply(is_minor, axis="columns")
def generate_age_label(row):
    """Classify *row* by its ``Age`` value.

    Returns "unknown" when the age is missing, "minor" when it is below 18
    and "adult" otherwise.
    """
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"
# Label every passenger row, store the labels as a new column, then compare
# the survival rate of each age group.
age_label = titanic.apply(generate_age_label, axis=1)
titanic['age_labels'] = age_label
# "mean" is passed by name; handing np.mean to aggfunc is deprecated in
# recent pandas releases.
age_group_survival = titanic.pivot_table(index="age_labels", values="Survived", aggfunc="mean")
print(age_group_survival)
# value_counts() is a shortcut for listing the distinct values of a column
# together with how many times each one occurs.
# It is a Series method, so on a DataFrame you first pick the column to
# apply it to.
# The result is again a Series: its index holds the distinct values and its
# values hold their counts.
# NOTE(review): norm_reviews was never defined in this script, so the lookup
# raised NameError. It is loaded here from the Fandango comparison CSV that
# the script reads further down -- confirm this is the intended source and
# that it has a 'Fandango_Ratingvalue' column.
norm_reviews = pd.read_csv("fandango_score_comparison.csv")
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
# Order the counts by rating value rather than by frequency.
fandango_distribution = fandango_distribution.sort_index()
# DataFrame里面的结构series,series里面的结构ndarray
import pandas as pd
import numpy as np
from pandas import Series
# Data: film review scores.
fandango = pd.read_csv("fandango_score_comparison.csv")

# Selecting one column gives a Series; selecting a list of columns
# (e.g. fandango[["a", "b"]]) gives a DataFrame.
series_film = fandango['FILM']
print(type(series_film))
print(series_film.dtype)
series_rt = fandango['RottenTomatoes']

# .values exposes the data underneath a Series.
film_names = series_film.values
rt_scores = series_rt.values
print(type(film_names))  # <class 'numpy.ndarray'>

# Series(values, index): the index may consist of strings.
series_custom = Series(data=rt_scores, index=film_names)
print(series_custom[5:10])

# Sorting works much as it does for a DataFrame, but is rarely needed on a
# Series.

# NumPy functions operate on the values of the Series.
doubled_scores = np.add(series_custom, series_custom)
print(doubled_scores)
print(np.sin(series_custom))
print(np.max(series_custom))

# Boolean filtering: each comparison must be parenthesised because & binds
# more tightly than the comparison operators.
above_fifty = series_custom > 50
at_most_hundred = series_custom <= 100
series_greater = series_custom[above_fifty & at_most_hundred]
print(series_greater)
# set_index() turns a column of the DataFrame into its row index. It returns
# a new frame, so `test` itself keeps its original index.
# By default the column disappears once it becomes the index; pass
# drop=False to keep it as a regular column as well.
print(np.arange(7))
ascending_values = np.arange(1, 8)
descending_values = np.arange(8, 1, -1)
test = pd.DataFrame({'a': ascending_values, 'b': descending_values})
test_set_index = test.set_index(keys='a')
test_set_index1 = test.set_index(keys='a', drop=False)
for frame in (test, test_set_index, test_set_index1):
    print(frame)
# Output for comparison
# --- test ---
#    a  b
# 0  1  8
# 1  2  7
# 2  3  6
# 3  4  5
# 4  5  4
# 5  6  3
# 6  7  2
# --- test_set_index ---
#    b
# a
# 1  8
# 2  7
# 3  6
# 4  5
# 5  4
# 6  3
# 7  2
# --- test_set_index1 ---
#    a  b
# a
# 1  1  8
# 2  2  7
# 3  3  6
# 4  4  5
# 5  5  4
# 6  6  3
# 7  7  2
# Index the films by title while keeping 'FILM' available as a column.
fandango_films = fandango.set_index(keys='FILM', drop=False)

# `types` is a Series of dtypes labelled by column name
# (types.index is equivalent to types.keys()).
types = fandango.dtypes

# Keep only the columns whose dtype is float64.
is_float64 = types.values == 'float64'
float_columns = types[is_float64].index
float_df = fandango[float_columns]
print(float_df)
# Standard deviation per column. NumPy's std divides by the number of
# samples N by default, whereas pandas' std divides by N - 1.
# A lambda is an anonymous function, handy for replacing a trivial named
# function, e.g.:
#     f = lambda x: pow(x, 2)
#     print(f(2))
# NOTE(review): keep the lambda wrapper -- handing np.std to apply() directly
# may be dispatched to pandas' own std in some pandas versions; confirm
# before simplifying.
deviations = float_df.apply(lambda column: np.std(column))
print(deviations)