import numpy as np
from numpy import random as rnd
np.random.seed(10) // 设置种子能够每次得到相同的随机数,相同的种子,相同的随机数,种子随便设值,保持一致即可.每次调用函数都要设置一次。
rnd.rand(d1,d2,...dn),每一维的数据数
rnd.rand(100,1) //第一维有100个数,第二维有1个数,生成 (包含100个 一个数的数组 的)数组
数的范围是[0,1)
绘图
import matplotlib.pyplot as plt
plt.plot(X,y,".b",markersize=16,color="black",label="") //.b用点绘图,默认蓝色,其他"*"
plt.xlabel("$x_1$",fontsize=18)
plt.ylabel("$y$")
plt.axis([0,2,0,5]),分别x0,2,y0,5的范围
plt.savefig("XXX.png")
plt.legend(["","",""],fontsize=16,loc = 'upper left') //图注解
plt.title("XXX")
plt.show()
data[col].hist(bins=20) // 画某列直方图,有20个bar
data.boxplot() // 箱型图
// 分类图
import matplotlib.pyplot as plt
groups = data.groupby("class")
for name, group in groups:
plt.scatter(group['petal length'],group['petal width'], marker="o", label=name)
plt.legend()
plt.show()
// 平行类别图
from pandas.plotting import parallel_coordinates
parallel_coordinates(data,"tag")
//特征重要性
import seaborn as sns
sns.pairplot(data,hue="class")
线性回归
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)
lin_reg.predict(X_new)
print(lin_reg.coef_)
print(lin_reg.intercept_)
from sklearn.linear_model import Ridge
lin_reg = Ridge(alpha=0.1)
//避免过拟合
数据读取查看
import pandas as pd
pd.read_csv("",header=None/"infer", delimiter="",names=["",""])
display(data) // 查看表 (Jupyter 环境下)
data.shape
data.head() // 查看前几个
data.iloc[0:2,2:3] // 按位置切片需用iloc
预处理
data.drop_duplicates()
// 类别值OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(drop = "first") //按字母排序,删第一个
enc_f = enc.fit(data[["column_name"]])
print(enc_f.categories_)
enc_df=pd.DataFrame(enc_f.transform(data[["column_name"]]).toarray())
enc_df.columns = ["","",""]
// 标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)
scaler.fit_transform(data)
// 缺失值
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values = np.nan,strategy="mean")
imp.fit_transform(data[["",""]])
数据选择selection和降维
from sklearn.feature_selection import SelectKBest,chi2
select = SelectKBest(chi2,k=2).fit_transform(X,y)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators = 50)
clf.fit(X,y) // 有监督模型,需要标签y
clf.feature_importances_
select = SelectFromModel(clf,prefit=True)
select.transform(X)
from sklearn.feature_selection import RFE
clf = ExtraTreesClassifier(n_estimators=50)
selection = RFE(estimator=clf, n_features_to_select=2, step=1)
selection.fit(X,y)
X_new=selection.transform(X)
// PCA降维
from sklearn.decomposition import PCA
PCA(n_components =2 ).fit_transform(X)
简单随机取样(放回和不放回),stratified
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3,random_state=0)
// random_state控制每次得到相同的数据集切分结果
数据分析
data[col].mean()
data[col].quantile(0.25)
data[col].max()
data[col].min()
data[col].value_counts()
data.describe() // 上面的基本都能看
DataFrame操作
A.join(B)//表的相加
A.drop("",axis=1)//丢掉某列数据
A[""]//读一列
A[["",""]] // 读多列
聚类
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters = 2, max_iter = 50,random_state=1)
k_means.cluster_centers_
from scipy.cluster import hierarchy
Z = hierarchy.linkage(X.values, 'single') #hierarchy clustering using 'single' link
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right') #plot dendrogram
Z = hierarchy.linkage(X.values, 'complete') #change to 'complete' link
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')
Z = hierarchy.linkage(X.values, 'average') #change to 'average' link
dn = hierarchy.dendrogram(Z,labels=names.tolist(),orientation='right')
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=15.5, min_samples=5).fit(data)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = pd.DataFrame(db.labels_,columns=['Cluster ID'])
result = pd.concat((data,labels), axis=1)
result.plot.scatter(x='x',y='y',c='Cluster ID', colormap='jet')
// 解决不规则形状
from sklearn.cluster import SpectralClustering
import pandas as pd
spectral = SpectralClustering(n_clusters=2,random_state=1,affinity='rbf',gamma=5000)
分类
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
clf2 = BaggingClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=50) # number estimators is the key parameter
// imbalanced class
weights = {0:10.0, 1:1.0} #set the weight for class "0" to 10 and for class "1" to 1, so the "0" class is more important
clf = DecisionTreeClassifier(max_depth=2,class_weight=weights)
时间序列
// 移动平均
train['Sales'].rolling(5).mean()
from statsmodels.tsa.exponential_smoothing.ets import ETSModel
ses = ETSModel(train['Sales']) // 简单指数平滑
hw = ETSModel(cvd_train['cvd'], trend='add', seasonal = 'add', seasonal_periods = 4)
from scipy.stats import boxcox
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot as plt
kpss_stat, pval, lags, crit = kpss(heater['demand'], regression = 'c', nlags = 'auto')
print('p Value to the KPSS test is: ',pval)
hw = ETSModel(train['demand_BC'], trend='add', seasonal = 'add', seasonal_periods = 12)
评估模型结果
from sklearn.metrics import mean_absolute_error
MAE = mean_absolute_error(Y,Y_pre) //or mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
disp = plot_confusion_matrix(clf, X_test, Y_test,display_labels=['+','-'],values_format = '.0f',cmap=plt.cm.Blues)
np.set_printoptions(precision=1)
print(disp.confusion_matrix)