# 1-1 数据载入及初步观察
import os

import numpy as np
import pandas as pd
# Load the training data through a relative path, which is resolved against
# the current working directory.
df = pd.read_csv('train.csv')
df.head(3)

# Load the same file through an absolute path.  The path is derived from the
# working directory at run time rather than hard-coded to one user's home
# directory, so this step works wherever the relative load above works.
df = pd.read_csv(os.path.abspath('train.csv'))
df.head(3)

# Chunked read: with chunksize set, read_csv returns an iterator that yields
# DataFrames of up to 1000 rows each instead of one complete frame.
# NOTE(review): chunker is neither iterated nor closed in the visible code —
# confirm whether a later section consumes it.
chunker = pd.read_csv('train.csv', chunksize=1000)
# Replacement column labels, listed in the order the columns appear in the file.
chinese_column_names = [
    '乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄',
    '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口',
]

# header=0 marks the first row of the file as the header that `names`
# replaces; the passenger-ID column becomes the row index.
df = pd.read_csv(
    'train.csv',
    header=0,
    names=chinese_column_names,
    index_col='乘客ID',
)

# Quick inspection: default preview, schema summary, first 10 and last 15 rows.
df.head()
df.info()
df.head(10)
df.tail(15)

# Boolean mask of missing values, previewed for the first rows only.
df.isnull().head()

# Persist the relabelled table for later use.
df.to_csv('train_chinese.csv')
# 1-2 Pandas基础
import numpy as np
import pandas as pd
# A Series built from a dict: the keys become the index labels.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
example_1 = pd.Series(data=sdata)
example_1

# A DataFrame built from a dict of equal-length lists: one column per key.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
example_2 = pd.DataFrame(data=data)
example_2
# Load the training data from its original location, an absolute path that
# exists only on one particular machine.  When that file is missing, fall
# back to train.csv in the working directory instead of failing outright.
# NOTE(review): the fallback assumes the local train.csv is the same data set
# (with the English column labels used below, e.g. 'Cabin') — confirm.
try:
    df = pd.read_csv('/Users/chenandong/Documents/datawhale数据分析每个人题目设计/titanic/train.csv')
except FileNotFoundError:
    df = pd.read_csv('train.csv')
df.head(3)

# Show the column labels of the loaded table.
df.columns

# Two ways to select the same column: by key and by attribute access.
df['Cabin'].head(3)
df.Cabin.head(3)
# Load the practice file and preview it before and after removing column 'a'.
test_1 = pd.read_csv('test_1.csv')
test_1.head(3)
del test_1['a']  # removes the column from test_1 in place
test_1.head(3)

# drop() returns a new frame without the listed columns and leaves df itself
# unchanged, which the second preview makes visible.
hidden_columns = ['PassengerId', 'Name', 'Age', 'Ticket']
df.drop(columns=hidden_columns).head(3)
df.head(3)
# Rows whose Age is below 10.
under_ten = df["Age"] < 10
df[under_ten].head(3)

# Rows whose Age lies strictly between 10 and 50.
over_ten = df["Age"] > 10
under_fifty = df["Age"] < 50
midage = df[over_ten & under_fifty]
midage.head(3)

# Discard the old row labels and renumber from 0, so that the label-based
# lookups below refer to consecutive row numbers.
midage = midage.reset_index(drop=True)
midage.head(3)

# Label-based selection of single and multiple rows with named columns.
midage.loc[[100], ['Pclass', 'Sex']]
midage.loc[[100, 105, 108], ['Pclass', 'Name', 'Sex']]

# Position-based selection of the same rows, taking column positions 2, 3, 4.
# NOTE(review): presumably those positions are Pclass, Name and Sex — verify
# against the column order of the loaded file.
midage.iloc[[100, 105, 108], [2, 3, 4]]
# 1-3 探索性数据分析
# Read train_chinese.csv into `text` and preview the first five rows.
text = pd.read_csv('train_chinese.csv')
text.head(n=5)
# A 2x4 demo frame whose row and column labels are deliberately out of order.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['2', '1'],
    columns=['d', 'a', 'b', 'c'],
)
frame

# Each call below returns a new sorted frame; `frame` itself is not modified.
# Rows ordered by the values in column 'c', ascending.
frame.sort_values(by=['c'], ascending=True)
# Rows ordered by index label.
frame.sort_index(axis=0)
# Columns ordered by column label, ascending and then descending.
frame.sort_index(axis='columns')
frame.sort_index(axis='columns', ascending=False)
# Rows ordered by column 'a', then column 'c', both descending.
frame.sort_values(by=['a', 'c'], ascending=[False, False])
# Order rows by the '票价' column, then by '年龄', both descending, and show
# the top three.
sorted_by_fare_and_age = text.sort_values(by=['票价', '年龄'], ascending=False)
sorted_by_fare_and_age.head(3)
# Two float frames whose row and column labels overlap only partially.
frame1_a = pd.DataFrame(
    np.arange(9.0).reshape((3, 3)),
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c'],
)
frame1_b = pd.DataFrame(
    np.arange(12.0).reshape((4, 3)),
    index=['first', 'one', 'two', 'second'],
    columns=['a', 'e', 'c'],
)
frame1_a
frame1_b

# Addition aligns on both row and column labels; any cell that lacks a
# partner in the other frame comes out as NaN.
frame1_a + frame1_b
# Largest combined value of the sibling-count and parent/child-count columns.
# Series.max() is computed by pandas and skips missing values, whereas the
# builtin max() iterates element by element and gives an order-dependent
# result if any NaN is present.
(text['兄弟姐妹个数'] + text['父母子女个数']).max()
# A small frame with missing values in both columns.
frame2 = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
frame2

# Per-column summary statistics.
frame2.describe()
# Summary statistics for the '票价' column and the '父母子女个数' column.
fare_column = text['票价']
fare_column.describe()
parent_child_column = text['父母子女个数']
parent_child_column.describe()