导包
# 导包
import pandas as pd
import numpy as np
import random as rnd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
读文件
# Directory holding the Titanic CSV files (raw string: Windows backslashes).
filename = r'F:\数据集\titanic'
# Read train.csv first, then test.csv, from that directory.
train_df, test_df = [
    pd.read_csv(filename + suffix) for suffix in ('\\train.csv', '\\test.csv')
]
# Both frames gathered in a single list.
combine = [train_df, test_df]
查看信息
# Quick-look commands for inspecting a DataFrame.
# NOTE(review): `df` is not defined in the visible part of this file;
# presumably it stands for whichever frame is being inspected (e.g. train_df)
# — confirm before running these lines as-is.
df.columns.values # get the column labels
# (number of rows, number of columns)
df.shape
# Per-column dtype and non-null count summary.
df.info()
# Summary statistics.
df.describe()
# By default describe() covers only the numeric columns; include='O' (or
# ['O']) describes the object-dtype columns instead, and include='all'
# describes every column.
df.describe(include = 'all')
df.describe(include=['O'])
# Five randomly sampled rows.
df.sample(5)
# First rows / last rows of the frame.
df.head()
df.tail()
# Select the rows that satisfy a condition. '列名' is a placeholder that
# literally means "column name"; substitute a real column label.
df.loc[df['列名'] > 500]
# Keep the first rows (head() default) that contain at least one missing value.
# NOTE(review): `housing` is a different frame that is not defined in the
# visible part of this file — presumably this line was copied from another
# notebook; confirm which DataFrame is intended.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
查看不同种类的个数和占比
# Print the number of rows per 'Sex' value, then a blank line, then the same
# counts expressed as fractions of the total (normalize=True).
print(
    train['Sex'].value_counts(),
    train['Sex'].value_counts(normalize=True),
    sep='\n\n',
)
得到结果:
male 577
female 314
Name: Sex, dtype: int64
male 0.647587
female 0.352413
Name: Sex, dtype: float64
透视表
columns = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch