数据类型
object - 字符串
int - 整型
float - 浮点型
datetime - 时间类型
bool - 布尔型
name,age,slary
zhangsan,20,1000
lisi,30,2000.34
jim,25,1500
wang,34,3000.0
kate,32,9000.0
liming,25,1200.0
import pandas
# Load the CSV file into a DataFrame.
test = pandas.read_csv("test.csv")
# Type of the object returned by read_csv.
print(type(test))
print("---------------------------------------------")
# Data type (dtype) of every column.
print(test.dtypes)
#print("---------------------------------------------")
#print(help(pandas.read_csv))
运行结果
<class 'pandas.core.frame.DataFrame'> --------------------------------------------- name object age int64 slary float64 dtype: object
显示读取的头部数据
# test.head() shows only the first five rows by default, not the whole table.
# Pass a row count to change that, e.g. head(3) for the first three rows.
test.head(2)
运行结果
# Show the last two rows.
test.tail(2)
运行结果
显示每一列的指标
# Column labels, i.e. the names taken from the CSV header line.
print(test.columns)
# Shape as (rows, columns): 6 samples with 3 fields each.
print(test.shape)
运行结果
Index(['name', 'age', 'slary'], dtype='object') (6, 3) # 6 个样本,每个样本3个指标
读取数据
# Fetch the row whose index label is 0.
print(test.loc[0])
运行结果
name zhangsan age 20 slary 1000 Name: 0, dtype: object
读取数据片
# Rows labelled 3, 4 and 5; labels start at 0 and a .loc slice includes both endpoints.
print(test.loc[3:5])
运行结果
name age slary 3 wang 34 3000.0 4 kate 32 9000.0 5 liming 25 1200.0
读取任意数据
# Select rows 0, 2 and 4.
# Approach 1: keep the labels in a named list.
test_index = [0,2,4]
print(test.loc[test_index])
运行结果
name age slary 0 zhangsan 20 1000.0 2 jim 25 1500.0 4 kate 32 9000.0
# Select rows 0, 2 and 4.
# Approach 2: pass the list literal directly.
print(test.loc[[0,2,4]])
运行结果
name age slary 0 zhangsan 20 1000.0 2 jim 25 1500.0 4 kate 32 9000.0
取一列数据
# Select a single column by name.
print(test["name"])
运行结果
0 zhangsan 1 lisi 2 jim 3 wang 4 kate 5 liming Name: name, dtype: object
取多列数据
# Select two columns at once by passing a list of names.
col = ["name","age"]
print(test[col])
运行结果
name age 0 zhangsan 20 1 lisi 30 2 jim 25 3 wang 34 4 kate 32 5 liming 25
---------------------------------------------------------------------------------------------------------------------
test.csv
NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Iron_(mg)
1001,BUTTER WITH SALT,15.87,717,0.02
1002,BUTTER WHIPPED WITH SALT,15.87,717,0.16
1003,BUTTER OIL ANHYDROUS,0.24,876,0
1004,CHEESE BLUE,42.41,353,0.31
1005,CHEESE BRICK,41.11,371,0.43
import pandas
# Load the nutrition table and list its column names.
food_info = pandas.read_csv("test.csv")
col_names = food_info.columns.tolist()
print(col_names)
print("---------------------------------------------------------------------------")
# Keep only the columns whose name ends with "(g)", i.e. values given in grams.
gram_columns = [c for c in col_names if c.endswith("(g)")]
print(food_info[gram_columns].columns)
运行结果
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Iron_(mg)'] --------------------------------------------------------------------------- Index(['Water_(g)'], dtype='object')
# Unit conversion: turn the milligram column into grams.
print(food_info["Iron_(mg)"])
print("--------------------------------------------------------------------------")
# Dividing a column by a scalar divides every element.
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
print("--------------------------------------------------------------------------")
# Add the result as a new column; only the DataFrame changes, nothing is written to the file.
food_info["Iron_(g)"] = div_1000
food_info.head()
运行结果
0 0.02 1 0.16 2 0.00 3 0.31 4 0.43 Name: Iron_(mg), dtype: float64 -------------------------------------------------------------------------- 0 0.00002 1 0.00016 2 0.00000 3 0.00031 4 0.00043 Name: Iron_(mg), dtype: float64 --------------------------------------------------------------------------
# Combine two columns: values at the same position are multiplied together.
water_energy = food_info["Water_(g)"]*food_info["Energ_Kcal"]
print(water_energy)
运行结果
0 11378.79 1 11378.79 2 210.24 3 14970.73 4 15251.81 dtype: float64
归一化操作
# Largest value in the column.
max_calories = food_info["Energ_Kcal"].max()
print(max_calories)
print("--------------------------------------------------------------------------")
# Normalise the column by dividing every value by that maximum.
normalized_calories = food_info["Energ_Kcal"] / max_calories
print(normalized_calories)
运行结果
876 -------------------------------------------------------------------------- 0 0.818493 1 0.818493 2 1.000000 3 0.402968 4 0.423516 Name: Energ_Kcal, dtype: float64
排序操作
# Sort ascending (smallest first); inplace=True reorders food_info itself.
food_info.sort_values("Iron_(g)", inplace=True)
print(food_info["Iron_(g)"])
print("--------------------------------------------------------------------------")
# Sort descending (largest first).
food_info.sort_values("Iron_(g)", inplace=True, ascending=False)
print(food_info["Iron_(g)"])
运行结果
2 0.00000 0 0.00002 1 0.00016 3 0.00031 4 0.00043 Name: Iron_(g), dtype: float64 -------------------------------------------------------------------------- 4 0.00043 3 0.00031 1 0.00016 0 0.00002 2 0.00000 Name: Iron_(g), dtype: float64
----------------------------------------------------------------------------------------------------------------------------
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S
13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S
import pandas as pd
import numpy as np
# Load the passenger table into a DataFrame and preview its first rows.
titanic_survival = pd.read_csv("test.csv")
titanic_survival.head()
运行结果
# Inspect the "Age" column.
age = titanic_survival["Age"]
print(age.loc[0:10])
print("-------------------------------------")
# Flag missing values: False means present, True means missing.
age_is_null = pd.isnull(age)
print(age_is_null)
print("-------------------------------------")
# Select only the missing entries.
age_null_true = age[age_is_null]
print(age_null_true)
print("-------------------------------------")
# Number of missing entries.
age_null_count = len(age_null_true)
print(age_null_count)
运行结果
0 22.0 1 38.0 2 26.0 3 35.0 4 35.0 5 NaN 6 54.0 7 2.0 8 27.0 9 14.0 10 4.0 Name: Age, dtype: float64 ------------------------------------- 0 False 1 False 2 False 3 False 4 False 5 True 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False Name: Age, dtype: bool ------------------------------------- 5 NaN Name: Age, dtype: float64 ------------------------------------- 1
# Naive mean: the missing age makes sum() return NaN, so this prints nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
print("-------------------------------------")
# Keep only the ages that are present, then average them by hand.
good_ages = titanic_survival["Age"][~age_is_null]
corrent_mean_age = sum(good_ages) / len(good_ages)
print(corrent_mean_age)
print("-------------------------------------")
# The built-in Series.mean() ignores missing values and gives the same result.
corrent_mean_age = titanic_survival["Age"].mean()
print(corrent_mean_age)
运行结果
nan ------------------------------------- 27.714285714285715 ------------------------------------- 27.714285714285715
# Average fare paid in each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
print("------------------------------------------------------")
# Survival rate for first, second and third class.
# index   - the column to group by (here Pclass)
# values  - the column being summarised for each group
# aggfunc - how to summarise it; the mean of the 0/1 "Survived" flag is the
#           share of passengers in that class who survived.
# The string "mean" is used because passing np.mean is deprecated in recent pandas.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
{1: 50.698949999999996, 2: 30.0708, 3: 12.77708} ------------------------------------------------------ Survived Pclass 1 0.75 2 1.00 3 0.30
# Average age within each passenger class; pivot_table aggregates with the mean by default.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_survival)
Age Pclass 1 46.25 2 14.00 3 21.00
# Relate one column to two others: total fare and number of survivors per embarkation port.
# The string "sum" is used because passing np.sum is deprecated in recent pandas.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
Fare Survived Embarked C 101.3541 2 Q 8.4583 0 S 250.8250 5
# Drop missing values.
# axis=1 drops every column that contains a missing value.
drop_na_colums = titanic_survival.dropna(axis = 1)
# axis=0 with subset drops the rows where "Age" or "Sex" is missing.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic_survival)
PassengerId Survived Pclass \ 0 1 0 3 1 2 1 1 2 3 1 3 3 4 1 1 4 5 0 3 6 7 0 1 7 8 0 3 8 9 1 3 9 10 1 2 10 11 1 3 11 12 1 1 12 13 0 3 13 14 0 3 14 15 0 3 Name Sex Age SibSp \ 0 Braund, Mr. Owen Harris male 22.0 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 2 Heikkinen, Miss. Laina female 26.0 0 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 4 Allen, Mr. William Henry male 35.0 0 6 McCarthy, Mr. Timothy J male 54.0 0 7 Palsson, Master. Gosta Leonard male 2.0 3 8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 10 Sandstrom, Miss. Marguerite Rut female 4.0 1 11 Bonnell, Miss. Elizabeth female 58.0 0 12 Saundercock, Mr. William Henry male 20.0 0 13 Andersson, Mr. Anders Johan male 39.0 1 14 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 Parch Ticket Fare Cabin Embarked 0 0 A/5 21171 7.2500 NaN S 1 0 PC 17599 71.2833 C85 C 2 0 STON/O2. 3101282 7.9250 NaN S 3 0 113803 53.1000 C123 S 4 0 373450 8.0500 NaN S 6 0 17463 51.8625 E46 S 7 1 349909 21.0750 NaN S 8 2 347742 11.1333 NaN S 9 0 237736 30.0708 NaN C 10 1 PP 9549 16.7000 G6 S 11 0 113783 26.5500 C103 S 12 0 A/5. 2151 8.0500 NaN S 13 5 347082 31.2750 NaN S 14 0 350406 7.8542 NaN S
# Look up single cells by row label and column name.
row_index_4_age = titanic_survival.loc[4, "Age"]
print(row_index_4_age)
# The variable is named after the row label that is actually read (5).
row_index_5_pclass = titanic_survival.loc[5, "Pclass"]
print(row_index_5_pclass)
35.0 3
# Sort by age in descending order; the rows keep their original index labels.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival)
# Renumber the index from 0 in the new order; drop=True discards the old labels.
print("-----------------------------------------------------------------------------")
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex)
PassengerId Survived Pclass \ 11 12 1 1 6 7 0 1 13 14 0 3 1 2 1 1 3 4 1 1 4 5 0 3 8 9 1 3 2 3 1 3 0 1 0 3 12 13 0 3 9 10 1 2 14 15 0 3 10 11 1 3 7 8 0 3 5 6 0 3 Name Sex Age SibSp \ 11 Bonnell, Miss. Elizabeth female 58.0 0 6 McCarthy, Mr. Timothy J male 54.0 0 13 Andersson, Mr. Anders Johan male 39.0 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 4 Allen, Mr. William Henry male 35.0 0 8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 Heikkinen, Miss. Laina female 26.0 0 0 Braund, Mr. Owen Harris male 22.0 1 12 Saundercock, Mr. William Henry male 20.0 0 9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 14 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 10 Sandstrom, Miss. Marguerite Rut female 4.0 1 7 Palsson, Master. Gosta Leonard male 2.0 3 5 Moran, Mr. James male NaN 0 Parch Ticket Fare Cabin Embarked 11 0 113783 26.5500 C103 S 6 0 17463 51.8625 E46 S 13 5 347082 31.2750 NaN S 1 0 PC 17599 71.2833 C85 C 3 0 113803 53.1000 C123 S 4 0 373450 8.0500 NaN S 8 2 347742 11.1333 NaN S 2 0 STON/O2. 3101282 7.9250 NaN S 0 0 A/5 21171 7.2500 NaN S 12 0 A/5. 2151 8.0500 NaN S 9 0 237736 30.0708 NaN C 14 0 350406 7.8542 NaN S 10 1 PP 9549 16.7000 G6 S 7 1 349909 21.0750 NaN S 5 0 330877 8.4583 NaN Q ----------------------------------------------------------------------------- PassengerId Survived Pclass \ 0 12 1 1 1 7 0 1 2 14 0 3 3 2 1 1 4 4 1 1 5 5 0 3 6 9 1 3 7 3 1 3 8 1 0 3 9 13 0 3 10 10 1 2 11 15 0 3 12 11 1 3 13 8 0 3 14 6 0 3 Name Sex Age SibSp \ 0 Bonnell, Miss. Elizabeth female 58.0 0 1 McCarthy, Mr. Timothy J male 54.0 0 2 Andersson, Mr. Anders Johan male 39.0 1 3 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 5 Allen, Mr. William Henry male 35.0 0 6 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 7 Heikkinen, Miss. Laina female 26.0 0 8 Braund, Mr. 
Owen Harris male 22.0 1 9 Saundercock, Mr. William Henry male 20.0 0 10 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 11 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 12 Sandstrom, Miss. Marguerite Rut female 4.0 1 13 Palsson, Master. Gosta Leonard male 2.0 3 14 Moran, Mr. James male NaN 0 Parch Ticket Fare Cabin Embarked 0 0 113783 26.5500 C103 S 1 0 17463 51.8625 E46 S 2 5 347082 31.2750 NaN S 3 0 PC 17599 71.2833 C85 C 4 0 113803 53.1000 C123 S 5 0 373450 8.0500 NaN S 6 2 347742 11.1333 NaN S 7 0 STON/O2. 3101282 7.9250 NaN S 8 0 A/5 21171 7.2500 NaN S 9 0 A/5. 2151 8.0500 NaN S 10 0 237736 30.0708 NaN C 11 0 350406 7.8542 NaN S 12 1 PP 9549 16.7000 G6 S 13 1 349909 21.0750 NaN S 14 0 330877 8.4583 NaN Q
函数操作
def ten_row(column):
    """Return the entry of ``column`` stored under index label 9.

    With a default 0-based index this is the tenth item.

    Args:
        column: A pandas Series (for example one DataFrame column).

    Returns:
        The value found at label 9.

    Raises:
        KeyError: If ``column`` has no label 9.
    """
    return column.loc[9]
# Run ten_row on every column (apply's default axis) to collect the tenth row's fields.
# The result has its own name so that the ten_row function is not overwritten
# and this snippet can be run again.
tenth_row = titanic_survival.apply(ten_row)
print(tenth_row)
PassengerId 10 Survived 1 Pclass 2 Name Nasser, Mrs. Nicholas (Adele Achem) Sex female Age 14 SibSp 1 Parch 0 Ticket 237736 Fare 30.0708 Cabin NaN Embarked C dtype: object
def not_null_count(column):
    """Return the number of missing entries in ``column``.

    Note that, despite its name, this counts the null values, not the
    non-null ones.

    Args:
        column: A pandas Series (for example one DataFrame column).

    Returns:
        The count of null entries as a plain ``int``.
    """
    # Each True in the null mask counts as 1 when summed.
    return int(pd.isnull(column).sum())
# Run not_null_count on every column to get the number of missing values per column.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 1 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 10 Embarked 0 dtype: int64
def which_class(row):
    """Translate a row's numeric ``Pclass`` value into a readable label.

    Args:
        row: A mapping or Series that has a ``'Pclass'`` entry.

    Returns:
        ``"Unknown"`` when the value is missing, ``"First Class"``,
        ``"Second Class"`` or ``"Third Class"`` for 1, 2 and 3, and
        ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"
    # Any other class value has no label.
    return None
# axis=1 passes each row to which_class, giving one label per passenger.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
0 Third Class 1 First Class 2 Third Class 3 First Class 4 Third Class 5 Third Class 6 First Class 7 Third Class 8 Third Class 9 Second Class 10 Third Class 11 First Class 12 Third Class 13 Third Class 14 Third Class dtype: object
def is_minor(row):
    """Tell whether the row's ``"Age"`` is below 18.

    Args:
        row: A mapping or Series that has an ``"Age"`` entry.

    Returns:
        ``True`` when the age is less than 18, otherwise ``False``. A NaN
        age also gives ``False`` because comparisons with NaN are false.
    """
    return bool(row["Age"] < 18)
# Evaluate is_minor on every row (axis=1) to get one boolean per passenger.
minors = titanic_survival.apply(is_minor, axis=1)
def generate_age_label(row):
    """Classify a row by its ``"Age"`` value.

    Args:
        row: A mapping or Series that has an ``"Age"`` entry.

    Returns:
        ``"unknown"`` when the age is missing, ``"minor"`` when it is below
        18, and ``"adult"`` otherwise.
    """
    age = row["Age"]
    # Missing ages are handled first so they are not counted as adults.
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Label every passenger (axis=1 applies the function row by row).
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
0 adult 1 adult 2 adult 3 adult 4 adult 5 unknown 6 adult 7 minor 8 adult 9 minor 10 minor 11 adult 12 adult 13 adult 14 minor dtype: object
# Attach the labels as a new column, then summarise "Survived" for each age group.
titanic_survival['age_labels']=age_labels
# No aggfunc is given, so pivot_table uses its default, the mean.
age_group_survial = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survial)
Survived age_labels adult 0.5 minor 0.5 unknown 0.0