1:读取+抽取
import pandas
# Load the CSV file and extract the sub-frame made of every column whose
# name ends with "ame".
data_info = pandas.read_csv("1.csv")
col_name = data_info.columns.tolist()
# Keep only the matching column labels, in their original order.
gram_columns = [c for c in col_name if c.endswith("ame")]
# Indexing with a list of labels returns a DataFrame of just those columns.
gram_df = data_info[gram_columns]
2:抽取单列的缺失值+计算单列平均值
import pandas as pd
# Count the missing entries of the "Age" column and compute the column mean.
# NOTE(review): `tiT` is not defined in this section -- presumably a DataFrame
# loaded elsewhere; confirm before running.
age = tiT["Age"]
age_is_null = pd.isnull(age)        # boolean mask, True where the value is missing
age_null_true = age[age_is_null]    # only the missing entries
age_null_count = len(age_null_true)
# Series.mean() skips missing values by default (skipna=True), so no manual
# filtering is needed before averaging.
correct_mean_age = tiT["Age"].mean()
3:二维对应+计算平均值list
# Compute the mean fare for each passenger class.
passenger_class = [1, 2, 3]
# A dict is used here to map each class to its mean fare.
fares_by_class = {}
for this_class in passenger_class:
    # Rows belonging to the current class.
    pclass_rows = tiT[tiT["Pclass"] == this_class]
    # NOTE(review): the column label "Fares" is kept as written; verify it
    # against the CSV header (it may be spelled "Fare").
    pclass_fares = pclass_rows["Fares"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
4:pivot_table(没写aggfunc也会自动计算均值*函数重载)
# Survival rate per passenger class via a pivot table.
# index:   the column whose values define the groups (rows of the result)
# values:  the column being aggregated for each group
# aggfunc: the aggregation applied to `values`; "mean" is also the default
#          when aggfunc is omitted
p_s = tiT.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
5:dropna:删除缺失值对应行列(subset为检索对象)
# Specifying axis=1 or axis='columns' will drop any columns that have null values.
drop_na_columns = titanic_survival.dropna(axis=1)
# axis=0 drops rows; `subset` restricts the null check to the listed columns.
# NOTE(review): the second label was written as "sex"; column labels are
# case-sensitive, so it is capitalised here to match the other labels used in
# this file -- confirm against the CSV header.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
# print(new_titanic_survival)
6:定位loc
# .loc[row_label, column_label] returns a single cell, addressed by label.
# The original printed `row_index_83_age` without ever defining it; the lookup
# below is reconstructed from that variable's name (row label 83, column "Age").
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the variable name says 1000 but the row label used is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
7:特定列排序+reset index
# Order the rows by "Age", largest first; the original row labels travel with
# their rows.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
# First ten rows by position.
print(new_titanic_survival.head(10))
# Discard the old labels and renumber the rows 0..n-1.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('-------------')
# .loc slices by label and includes the end label, so this shows labels 0..10.
print(titanic_reindexed.loc[:10])
8:自定义函数(基本变换)+column/row格式(需要apply)
def not_null_count(column):
    """Return the number of null entries in ``column``.

    Despite its name, this counts the values that ARE null: it builds a
    boolean mask with ``pd.isnull`` and measures the length of the selection.

    Args:
        column: A pandas Series (e.g. one column handed over by
            ``DataFrame.apply``).

    Returns:
        int: how many entries of ``column`` are null.
    """
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
# With the default axis, DataFrame.apply calls the function once per column,
# so this yields one null count per column.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
def is_minor(row):
    """Return True when the row's "Age" value is below 18, else False.

    Args:
        row: Any mapping-like object indexable with "Age" (e.g. a DataFrame
            row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        bool: True if ``row["Age"] < 18``. A NaN age compares False, so rows
        with a missing age are reported as not minor.
    """
    # bool() keeps the return type a plain Python bool even when the
    # comparison yields a NumPy boolean.
    return bool(row["Age"] < 18)
# axis="columns" (the same as axis=1) hands each row to the function in turn,
# producing one boolean per row.
minors = titanic_survival.apply(is_minor, axis="columns")
# print(minors)
def generate_age_label(row):
    """Classify a row by its "Age" value.

    Args:
        row: Any mapping-like object indexable with "Age" (e.g. a DataFrame
            row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        str: "unknown" when the age is null, "minor" when it is below 18,
        otherwise "adult".
    """
    age = row["Age"]
    # The null check must come first: comparing NaN with 18 is always False,
    # which would otherwise label a missing age as "adult".
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"
# Label every row (axis=1 passes rows to the function), then inspect the result.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
# Append the labels to the original DataFrame as a new column, then build a
# pivot table of "Survived" grouped by that column (aggfunc is omitted, so the
# default -- the mean -- is used).
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
9:series处理
import pandas
from pandas import Series
# Series handling: build a Series of file types labelled by global ID, then
# slice and sort it.
data_info = pandas.read_csv("1.csv")

# Pull the two columns out as Series ...
series_ID = data_info['GlobalID']
series_File = data_info['FileType']

# ... and take their underlying arrays.
File_ID = series_ID.values
File_type = series_File.values

# Values come from 'FileType'; the index labels come from 'GlobalID'.
series_custom = Series(data=File_type, index=File_ID)
temp = series_custom[5:10]

# Index labels as a plain list, plus a sorted copy of that list.
original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
# Disabled alternative kept from the notes:
# sorted_by_index = series_custom.reindex(sorted_index)

sc2 = series_custom.sort_index()     # ordered by index label
sc3 = series_custom.sort_values()    # ordered by value