1. Vectors, Matrices, and Arrays
import numpy as np
vector_row = np.array([1,2,3])
vector_column = np.array([[1],[2],[3]])
matrix = np.array([[1,2],[1,2],[1,2]])
from scipy import sparse
matrix = np.array([[0,0],[0,1],[3,0]])
matrix_sparse = sparse.csr_matrix(matrix)
print(matrix_sparse)
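# printing a CSR matrix lists only the stored nonzero entries as (row, column) value pairs:
#   (1, 1)    1
#   (2, 0)    3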
vector = np.array([1,2,3,4,5,6])  # example vector for the selection snippets below (not defined in the source)
vector[2]
vector[:]
vector[:3]
vector[3:]
vector[-1]
matrix[1,1]
matrix[:2,:]
matrix[:,1:2]
matrix.shape
matrix.size
matrix.ndim
add_100 = lambda i: i + 100
vectorized_add_100 = np.vectorize(add_100)
vectorized_add_100(matrix)
matrix + 100
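# matrix + 100 broadcasts the scalar across the array, giving the same result as the
# vectorized function above without the Python-level loop that np.vectorize implies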
np.max(matrix)
np.min(matrix)
np.max(matrix,axis=0)
np.max(matrix,axis=1)
np.mean(matrix)
np.var(matrix)
np.std(matrix)
np.mean(matrix,axis=0)
matrix.reshape(2,3)
matrix.reshape(2,-1)
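# -1 tells reshape to infer that dimension from the array's size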
matrix.T
matrix.flatten()
np.linalg.matrix_rank(matrix)
matrix = np.array([[1,-1,3],[1,1,6],[3,8,9]])
np.linalg.det(matrix)
matrix.diagonal()
matrix.diagonal(offset=1)
matrix.diagonal(offset=-1)
matrix.trace()
eigenvalues, eigenvectors = np.linalg.eig(matrix)
eigenvalues
eigenvectors
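As a quick sanity check (a minimal sketch using the variables above), each eigenpair should satisfy A·v = λ·v up to floating-point error:
for i in range(len(eigenvalues)):
    # column i of eigenvectors pairs with eigenvalues[i]
    assert np.allclose(matrix @ eigenvectors[:, i], eigenvalues[i] * eigenvectors[:, i])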
a = np.array([1,2,3])
b = np.array([4,5,6])
np.dot(a,b)
np.add(a,b)
np.subtract(a,b)
np.dot(a,b)
a @ b
a * b
np.linalg.inv(matrix)
np.random.seed(0)
np.random.random(3)
np.random.randint(0,11,3)
np.random.normal(0,1,3)
np.random.uniform(1,2,3)
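NumPy now recommends the Generator API over the legacy np.random functions; a minimal equivalent sketch:
rng = np.random.default_rng(0)
rng.random(3)           # floats in [0.0, 1.0)
rng.integers(0,11,3)    # integers in [0, 11)
rng.normal(0.0,1.0,3)   # draws from N(0, 1)
rng.uniform(1.0,2.0,3)  # draws from Uniform(1, 2)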
2. Loading Data
from sklearn import datasets
digits = datasets.load_digits()
features = digits.data
target = digits.target
features[0]
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
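These three generators are imported but not called here; a minimal usage sketch for one of them (parameter values are illustrative):
features, target = make_classification(n_samples=100, n_features=3, n_informative=3,
                                        n_redundant=0, n_classes=2,
                                        weights=[.25,.75], random_state=1)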
import pandas as pd
url = 'https://tinyurl.com/simulated_data'
data = pd.read_csv(url)
data.head(2)
url = 'https://tinyurl.com/simulated_excel'
data = pd.read_excel(url,sheet_name=0,header=1)
data.head(2)
data = pd.read_json(url,orient='columns')
from sqlalchemy import create_engine
database_connection = create_engine('sqlite:///sample.db')
dataframe = pd.read_sql_query('SELECT * FROM data',database_connection)
3. Data Wrangling
data = pd.DataFrame()
data['name'] = ['aaa','bbb']
data['age'] = [38,25]
data['driver'] = [True,False]
new_person = pd.Series(['ccc',40,True],index=['name','age','driver'])
pd.concat([data,new_person.to_frame().T],ignore_index=True)  # DataFrame.append was removed in pandas 2.0
data.head(2)
data.shape
data.describe()
data.iloc[0]
data.iloc[1:4]
data.iloc[:4]
data.loc[:,'name']
data[data['age'] == 38].head(1)
data[(data['age'] <= 38) & (data['driver'] == False)]
data['age'].replace(38,40)
data['age'].replace([38,40],[25,50])
data.replace(1,'one')
data.replace(r'1st','First',regex=True)
data.rename(columns={'age':'Age'})
data.rename(columns={'age':'Age','name':'Name'})
print('max:',data['age'].max())
print('min:',data['age'].min())
print('mean:',data['age'].mean())
print('sum:',data['age'].sum())
print('count:',data['age'].count())
data['age'].unique()
data['age'].value_counts()
data['age'].nunique()
data[data['age'].isnull()]
data = pd.read_csv(url,na_values=[np.nan,'NONE',-999])
data.drop('age',axis=1)
data.drop(['age','name'],axis=1)
data[data['age'] != 38]
data.drop_duplicates()
data.drop_duplicates(subset=['age'])
data.drop_duplicates(subset=['age'],keep='last')
data.groupby('age').mean()
data.groupby(['name','age'])['driver'].mean()
import pandas as pd
import numpy as np
time = pd.date_range('06/06/2017',periods=100000,freq='30s')
data = pd.DataFrame(index=time)
data['count'] = np.random.randint(1,10,100000)
data.resample('W').sum()
data.resample('W',label='left').sum()
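# weekly buckets are labeled with the right bin edge by default; label='left' stamps them with the left edge instead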
for name in data['name'][0:2]:
    print(name.upper())
def uppercase(x):
    return x.upper()
data['name'].apply(uppercase)[0:2]
data.groupby('age').apply(lambda x: x.count())
data_a = pd.DataFrame({'id':[1,2,3],'name':['aaa','bbb','ccc']})  # illustrative frames (not defined in the source)
data_b = pd.DataFrame({'id':[2,3,4],'name':['ddd','eee','fff']})
pd.concat([data_a,data_b],axis=0)
pd.concat([data_a,data_b],axis=1)
pd.merge(data_a,data_b,on='id')
pd.merge(data_a,data_b,on='id',how='outer')
pd.merge(data_a,data_b,on='id',how='left')
4. Handling Numerical Data
import numpy as np
from sklearn import preprocessing
feature = np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
scaled_feature = minmax_scale.fit_transform(feature)
scaled_feature
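# MinMaxScaler rescales each feature as x' = (x - min) / (max - min), mapping it onto [0, 1]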
x = np.array([[-1000.1],[-200.2],[500.5],[600.6],[9000.9]])
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(x)
standardized
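# StandardScaler transforms each feature to z = (x - mean) / std, giving zero mean and unit variance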
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit_transform(x)
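# RobustScaler centers on the median and scales by the interquartile range, so an extreme value like 9000.9 has far less influence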
from sklearn.preprocessing import Normalizer
features = np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.4],[10.9,3.3]])
normalizer = Normalizer(norm='l2')
normalizer.transform(features)
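# Normalizer(norm='l2') rescales each row (observation) to unit Euclidean length, unlike the scalers above, which work per column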
from sklearn.preprocessing import PolynomialFeatures
features = np.array([[2,3],[2,3],[2,3]])
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)
polynomial_interaction.fit_transform(features)
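# with degree=2 and inputs x1=2, x2=3, the output columns are x1, x2, x1^2, x1*x2, x2^2 -> [2, 3, 4, 6, 9];
# pass interaction_only=True to keep only the interaction term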
from sklearn.preprocessing import FunctionTransformer
def add_ten(x):
    return x+10
ten_transformer = FunctionTransformer(add_ten)
ten_transformer.transform(features)
import pandas as pd
df = pd.DataFrame(features,columns=['feature_1','feature_2'])
df.apply(add_ten)
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 10,n_features = 2,centers = 1,random_state = 1)
features[0,0] = 10000
features[0,1] = 10000
outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector.fit(features)
outlier_detector.predict(features)
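# predict returns -1 for observations flagged as outliers and 1 for inliers; the extreme first row should come back as -1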
houses = pd.DataFrame()
houses['Price'] = [534433,392333,293222,4322032]
houses['Bathrooms'] = [2,3.5,2,116]
houses['Square_Feet'] = [1500,2500,1500,48000]
houses[houses['Bathrooms'] < 20]
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)
houses
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]
houses
from sklearn.preprocessing import Binarizer
age = np.array([[6],[12],[20],[36],[65]])
binarizer = Binarizer(threshold=18)
binarizer.fit_transform(age)
np.digitize(age,bins=[20,30,64])
np.digitize(age,bins=[20,30,64],right=True)
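# np.digitize returns the index of the bin each value falls into; with right=True, values equal to a bin edge fall into the lower bin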
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
features,_ = make_blobs(n_samples = 50,n_features = 2,centers = 3,random_state = 1)
data = pd.DataFrame(features,columns=['feature_1','feature_2'])
clusterer = KMeans(n_clusters=3,random_state=0)
clusterer.fit(features)
data['group'] = clusterer.predict(features)
data.head(5)
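# the predicted cluster labels can now serve as a categorical feature for a downstream model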
features = np.array([[1.1,11.1],[2.2,22.2],[3.3,33.3],[4.4,44.4],[np.nan,55]])
features[~np.isnan(features).any(axis=1)]
data = pd.DataFrame(features,columns=['feature_1','feature_2'])
data.dropna()
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 1000,n_features = 2,random_state = 1)
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan
features_knn_imputed = KNN(k=5,verbose=0).fit_transform(standardized_features)  # older fancyimpute releases exposed this as .complete()
print('TRUE:',true_value)
print('Imputed:',features_knn_imputed[0,0])
from sklearn.impute import SimpleImputer  # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
mean_imputer = SimpleImputer(strategy='mean')
features_mean_imputed = mean_imputer.fit_transform(standardized_features)
print('TRUE:',true_value)
print('Imputed:',features_mean_imputed[0,0])
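# the KNN-imputed value is typically much closer to the true value than the column mean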
5. Handling Categorical Data
import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer
feature = np.array([['a'],['b'],['a'],['c'],['a']])
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)
one_hot.classes_
one_hot.inverse_transform(one_hot.transform(feature))
one_hot_multiclass = MultiLabelBinarizer()
multiclass_feature = [('a','b'),('b','c'),('a','c')]  # illustrative multilabel data (not defined in the source)
one_hot_multiclass.fit_transform(multiclass_feature)
one_hot_multiclass.classes_
import pandas as pd
pd.get_dummies(feature[:,0])
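# pd.get_dummies returns one indicator column per class ('a', 'b', 'c')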
data = pd.DataFrame({'score':['low','low','medium','medium','high']})
scale_mapper = {'low':1,'medium':2,'high':3}
data['score'].replace(scale_mapper)
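# mapping to integers preserves the ordering low < medium < high, which plain one-hot encoding would discard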
from sklearn.feature_extraction import DictVectorizer
data = [{'red':2,'blue':4}]  # the source is truncated here; list closed minimally
dictvectorizer = DictVectorizer(sparse=False)
dictvectorizer.fit_transform(data)