# Use the Titanic data set; `data` is presumably a pandas DataFrame loaded
# earlier (the loading step is not shown here).
# Preview the first rows to see which columns are available.
data.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
构建list:
# Gather every column label of `data` together with the string form of that
# column's dtype, as two parallel lists.
cols = data.columns
colname = list(cols)
typename = [str(data[label].dtype) for label in cols]
print(colname)
print(typename)
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['int64', 'int64', 'int64', 'object', 'object', 'float64', 'int64', 'int64', 'object', 'float64', 'object', 'object']
如果报错: Unnamed: 0
解决方案一: 最有效
pd.read_csv(path, index_col=0)
或
df.to_csv(path, index=False)
解决方案二:未测试
用 pandas 处理数据时,产生了 Unnamed: 0 列,解决方案如下:
df.loc[ : , ~df.columns.str.contains("^Unnamed")]
构建表:
# Build a two-column table with one row per feature: its name and the string
# form of its dtype.
# DataFrame is imported from the public pandas namespace; `pandas.core` is a
# private module path and may change between releases.
from pandas import DataFrame

c = {
    "feature_name": colname,
    "dtype": typename,
}
datatype = DataFrame(c)  # convert the dict of lists into a DataFrame
datatype.head()
dtype feature_name
0 int64 PassengerId
1 int64 Survived
2 int64 Pclass
3 object Name
4 object Sex
结束!
拓展:
和空值统计表合并
各列空值统计
参考博客:https://blog.csdn.net/zgcr654321/article/details/93188454
# Summarize the missing values of every feature (column).
def draw_missing_data_table(data):
    """Return a per-column summary of missing values.

    Args:
        data: The pandas DataFrame to inspect.

    Returns:
        A new DataFrame with one row per column of ``data``, ordered by
        descending number of missing values, with these columns:

        * ``feature_name`` -- the column label from ``data``.
        * ``Total`` -- number of null entries in that column.
        * ``Percent`` -- ``Total`` divided by the number of rows, i.e. a
          fraction between 0 and 1 (not multiplied by 100).
    """
    # Count the nulls once; the fraction is derived from the already sorted
    # counts so both columns are guaranteed to share the same row order.
    total = data.isnull().sum().sort_values(ascending=False)
    percent = total / data.shape[0]
    missing_data = pd.concat([total, percent], axis=1, keys=["Total", "Percent"])
    # Name the index explicitly before resetting it. Relying on the default
    # "index" label breaks when data.columns carries a name, because
    # reset_index() would then emit that name instead of "index".
    return missing_data.rename_axis("feature_name").reset_index()
# Build the missing-value summary for `data` and preview its first five rows.
caldata = draw_missing_data_table(data)
caldata.head(5)
feature_name Total Percent
0 Cabin 36 0.800000
1 Age 10 0.222222
2 Embarked 0 0.000000
3 Fare 0 0.000000
4 Ticket 0 0.000000
Total 为该列空值的总数,Percent 为空值数占总行数的比例(0~1 之间的小数,未乘以 100)
合并两表:
# Left-join the dtype table onto the missing-value summary by feature name,
# so every row of `caldata` is kept and gains its matching "dtype" value.
fina = caldata.merge(datatype, on="feature_name", how="left")
fina.head()
结果:
feature_name Total Percent dtype
0 Cabin 36 0.800000 object
1 Age 10 0.222222 float64
2 Embarked 0 0.000000 object
3 Fare 0 0.000000 float64
4 Ticket 0 0.000000 object
这回真的结束啦!!!