import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer
# One categorical column of U.S. state names, shape (5, 1), dtype '<U10'.
features = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)
# LabelBinarizer one-hot encodes a single categorical column:
# one output column per distinct class.
one_hot=LabelBinarizer()
one_hot.fit_transform(features)
# Pasted output — columns follow the sorted class order shown in classes_ below.
array([[0, 0, 1],
[1, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 0, 1]])
# The classes learned during fit, in column order.
one_hot.classes_
array(['California', 'Delaware', 'Texas'], dtype='<U10')
# inverse_transform maps the one-hot rows back to the original string labels.
one_hot.inverse_transform(one_hot.fit_transform(features))
array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')
import pandas as pd
# pandas alternative: get_dummies builds the same one-hot columns as a DataFrame.
pd.get_dummies(features[:,0])
|   | California | Delaware | Texas |
|---|------------|----------|-------|
| 0 | 0          | 0        | 1     |
| 1 | 1          | 0        | 0     |
| 2 | 0          | 0        | 1     |
| 3 | 0          | 1        | 0     |
| 4 | 0          | 0        | 1     |
# Multilabel data: every observation carries two state labels.
# NOTE(review): "Delware" (sic) is in the original data; the outputs below echo it.
multiclass_features = [
    ["Texas", "Florida"],
    ["California", "Alabama"],
    ["Texas", "Florida"],
    ["Delware", "Florida"],
    ["Texas", "Alabama"],
]
# MultiLabelBinarizer encodes multilabel rows: each row gets a 1 in the
# column of every label it carries.
muti_one_hot=MultiLabelBinarizer()  # NOTE(review): "muti" looks like a typo for "multi"; kept as-is
muti_one_hot.fit_transform(multiclass_features)
array([[0, 0, 0, 1, 1],
[1, 1, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 0],
[1, 0, 0, 0, 1]])
# Column order is the sorted union of all labels seen (the "Delware" typo comes from the data).
muti_one_hot.classes_
array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
dtype=object)
import pandas as pd
# Ordinal feature: "score" has a natural order low < medium < high.
dataframe = pd.DataFrame({"score": "low low medium medium high".split()})
dataframe
|   | score  |
|---|--------|
| 0 | low    |
| 1 | low    |
| 2 | medium |
| 3 | medium |
| 4 | high   |
# Map each ordinal level to an integer rank (equal spacing between levels).
mapper = {"low": 1, "medium": 2, "high": 3}
# Encode the ordinal strings as their integer ranks; result dtype is int64.
dataframe["score"].replace(mapper)
0 1
1 1
2 2
3 2
4 3
Name: score, dtype: int64
# Alternative mapping with non-uniform spacing between the ordinal levels.
mapper1 = {"low": 1, "medium": 2.5, "high": 5}
# Same encoding with the float mapping; the mixed int/float values upcast
# the result to float64.
dataframe["score"].replace(mapper1)
0 1.0
1 1.0
2 2.5
3 2.5
4 5.0
Name: score, dtype: float64
from sklearn.feature_extraction import DictVectorizer
# Each dict is one observation: key = feature name, value = count.
data_dict=[{"red":2,"blue":4},
{"red":4,"blue":3},
{"red":1,"yellow":2},
{"red":2,"yellow":2}]
# sparse=False returns a dense ndarray; keys absent from a row become 0.
dictvectorizer=DictVectorizer(sparse=False)
# NOTE(review): this rebinds `features` from the earlier state-name array.
features=dictvectorizer.fit_transform(data_dict)
features
array([[4., 2., 0.],
[3., 4., 0.],
[0., 1., 2.],
[0., 2., 2.]])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement (returns an ndarray, not a list).
names=dictvectorizer.get_feature_names_out()
names
array(['blue', 'red', 'yellow'], dtype=object)
# Rebuild the feature matrix as a labeled DataFrame.
df=pd.DataFrame(features,columns=names)
df
|   | blue | red | yellow |
|---|------|-----|--------|
| 0 | 4.0  | 2.0 | 0.0    |
| 1 | 3.0  | 4.0 | 0.0    |
| 2 | 0.0  | 1.0 | 2.0    |
| 3 | 0.0  | 2.0 | 2.0    |
# .values is a legacy accessor; the pandas docs recommend DataFrame.to_numpy()
# for extracting the underlying array.
df.to_numpy()
array([[4., 2., 0.],
[3., 4., 0.],
[0., 1., 2.],
[0., 2., 2.]])
from sklearn.neighbors import KNeighborsClassifier
# Complete rows: column 0 is a binary class value, columns 1-2 are numeric features.
X = np.array([[0, 2.1, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])
# Rows whose class value (column 0) is missing.
X_nan = np.array([[np.nan, 0.87, 1.31],
                  [np.nan, -0.67, -0.22]])
# Predict the missing value with a distance-weighted 3-NN fit on the complete rows.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance")
trained = knn.fit(X[:, 1:], X[:, 0])
impute_value = trained.predict(X_nan[:, 1:])
# Glue the predicted column back onto the numeric columns, then stack with X.
X_nan_impute = np.hstack((impute_value.reshape(-1, 1), X_nan[:, 1:]))
np.vstack((X_nan_impute, X))
array([[ 0. , 0.87, 1.31],
[ 1. , -0.67, -0.22],
[ 0. , 2.1 , 1.45],
[ 1. , 1.18, 1.33],
[ 0. , 1.22, 1.27],
[ 1. , -0.21, -1.19]])
# The classes predicted for the two incomplete rows.
impute_value
array([0., 1.])
# The imputed rows: predicted class + the original numeric columns.
X_nan_impute
array([[ 0. , 0.87, 1.31],
[ 1. , -0.67, -0.22]])
from sklearn.impute import SimpleImputer
# Rebuild the full matrix with the NaN rows on top.
X_complete=np.vstack((X_nan,X))
X_complete
array([[ nan, 0.87, 1.31],
[ nan, -0.67, -0.22],
[ 0. , 2.1 , 1.45],
[ 1. , 1.18, 1.33],
[ 0. , 1.22, 1.27],
[ 1. , -0.21, -1.19]])
# Fill each column's NaNs with that column's most frequent value.
# NOTE(review): column 0 has two 0s and two 1s; SimpleImputer documents that
# ties resolve to the smallest value, hence the 0s below — confirm if intended.
imputer=SimpleImputer(strategy="most_frequent")
imputer.fit_transform(X_complete)
array([[ 0. , 0.87, 1.31],
[ 0. , -0.67, -0.22],
[ 0. , 2.1 , 1.45],
[ 1. , 1.18, 1.33],
[ 0. , 1.22, 1.27],
[ 1. , -0.21, -1.19]])
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Build a deliberately imbalanced binary problem from iris: dropping the first
# 40 rows leaves 10 samples of class 0 vs 100 of the other classes.
iris=load_iris()
# NOTE(review): rebinds `features` again (previously the DictVectorizer matrix).
features=iris.data
target=iris.target
features=features[40:,:]
target=target[40:]
# Collapse to binary: species 0 stays 0, species 1 and 2 become 1.
target=np.where((target==0),0,1)
print(target)
print(features[:5,:])
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
# Manually up-weight the minority class 0 (0.9 vs 0.1).
weight={0:0.9,1:0.1}
rfc=RandomForestClassifier(class_weight=weight)
rfc.fit(features,target)
rfc.predict(features[:5,:])
array([0, 0, 0, 0, 0])
# "balanced" weights classes inversely proportional to their frequencies.
rfc1=RandomForestClassifier(class_weight="balanced")
rfc1.fit(features,target)
rfc1.predict(features[:5,:])
# NOTE(review): no random_state is set, so these predictions may not reproduce exactly.
array([0, 0, 0, 0, 0])
# Downsample the majority class (1) to the size of the minority class (0).
minority_idx = np.where(target == 0)[0]
majority_idx = np.where(target == 1)[0]
n_minority, n_majority = len(minority_idx), len(majority_idx)
# Sample (without replacement) as many majority rows as there are minority rows.
majority_downsampled = np.random.choice(majority_idx, size=n_minority, replace=False)
# Balanced target vector: all minority labels followed by the sampled majority labels.
np.hstack((target[minority_idx], target[majority_downsampled]))
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])