机器学习~代码部分_startangle=90-CSDN博客

本文链接：https://blog.csdn.net/m0_57621770/article/details/132313942

Python

Numpy

1.创建数组：
import numpy as np

# Create a one-dimensional array from a flat Python list
arr1 = np.array([1, 2, 3, 4, 5])

# Create a two-dimensional array from nested lists (2 rows of 3 values)
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

2.数组属性：
# Shape of the array: the length of each axis, as a tuple
print(arr2.shape)

# Number of dimensions (axes) of the array
print(arr2.ndim)

# Data type of the array's elements
print(arr2.dtype)

3.数组运算：
# Element-wise addition of two arrays
arr_sum = arr1 + arr1

# Multiply every element by a scalar
arr_product = arr1 * 2

# Element-wise square of the array
arr_squared = np.square(arr1)

4.数组切片和索引：
# Fetch a single element by index
print(arr1[0])  # first element

# Fetch a sub-array with a slice
print(arr1[1:4])  # indices 1, 2 and 3 (second through fourth element); the stop index 4 is excluded

# Index a two-dimensional array with (row, column)
print(arr2[0, 1])  # element in the first row, second column

5.数组操作：
# Transpose: swap the rows and columns of arr2
arr2_transposed = arr2.T

# Reshape the five elements of arr1 into 5 rows by 1 column
arr1_reshaped = arr1.reshape(5, 1)

# Concatenate arr1 with itself end to end
arr_concatenated = np.concatenate((arr1, arr1))

6.数组统计和数学函数：
# Sum, mean and standard deviation of the array
# (arr_sum rebinds the name used earlier for arr1 + arr1)
arr_sum = np.sum(arr1)
arr_mean = np.mean(arr1)
arr_std = np.std(arr1)

# Element-wise math functions: exponential and square root
arr_exp = np.exp(arr1)
arr_sqrt = np.sqrt(arr1)

7.矩阵运算：
# Matrix multiplication: for two 2-D arrays np.dot computes the matrix product
matrix1 = np.array([[1, 2], [3, 4]])
matrix2 = np.array([[5, 6], [7, 8]])
matrix_product = np.dot(matrix1, matrix2)

8.随机数生成：
# Draw 5 random integers from [low, high): 1 is included, 10 is excluded
rand_ints = np.random.randint(low=1, high=10, size=5)

# Draw 5 samples from a normal distribution with mean 0 and standard deviation 1
rand_normal = np.random.normal(loc=0, scale=1, size=5)

9.向量运算 使用NumPy的内置函数来避免for循环
# Example vector; each call below works element-wise on the whole array,
# which is what replaces an explicit Python for loop.
v = np.array([-2.0, 0.5, 3.0])

v_exp = np.exp(v)  # e**x for every element x of v
v_abs = np.abs(v)  # absolute value of every element
v_log = np.log(v_abs)  # natural log; applied to |v| because log is undefined for negative inputs
v_relu = np.maximum(v, 0)  # compare every element with 0 and keep the larger of the two

Matplotlib

1.绘制简单的折线图：
import matplotlib.pyplot as plt

# Data: x coordinates and the matching y values
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

# Draw the line plot
plt.plot(x, y)

# Add a title and axis labels
plt.title("Simple Line Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")

# Display the figure
plt.show()

2.绘制散点图：
# Data: x coordinates and the matching y values
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

# Draw the scatter plot (one marker per (x, y) pair)
plt.scatter(x, y)

# Add a title and axis labels
plt.title("Scatter Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")

# Display the figure
plt.show()

3.绘制柱状图：
# Data: category names and the bar height for each category
categories = ['A', 'B', 'C', 'D', 'E']
values = [15, 30, 10, 25, 20]

# Draw the bar chart
plt.bar(categories, values)

# Add a title and axis labels
plt.title("Bar Chart")
plt.xlabel("Categories")
plt.ylabel("Values")

# Display the figure
plt.show()

4.绘制直方图：
# Generate 1000 random samples from the standard normal distribution
data = np.random.randn(1000)

# Draw the histogram, grouping the samples into 20 bins
plt.hist(data, bins=20)

# Add a title and axis labels
plt.title("Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")

# Display the figure
plt.show()

5.绘制饼图：
# Data: wedge labels and the size of each wedge
labels = ['Apples', 'Bananas', 'Grapes', 'Oranges']
sizes = [35, 20, 30, 15]

# Draw the pie chart; autopct prints each wedge's percentage with one decimal,
# startangle=90 rotates the start of the first wedge by 90 degrees
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)

# Add a title
plt.title("Pie Chart")

# Display the figure
plt.show()

6.自定义样式和布局：
# Select the style sheet BEFORE drawing so it applies to the artists created
# below. Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*"
# and later releases removed the old names, so try the new name first.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')  # older Matplotlib only knows the original name

# Data: 100 evenly spaced points on [0, 10] and their sine
x = np.linspace(0, 10, 100)
y = np.sin(x)

# Draw the curve; the label is what the legend displays
plt.plot(x, y, label='sin(x)')

# Add a title, axis labels and the legend
plt.title("Customized Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()

# Adjust the padding so labels and title fit inside the figure
plt.tight_layout()

# Display the figure
plt.show()

Pandas

Pandas适合具有不同列的表格数据（如SQL表，Excel数据表）

1.创建 DataFrame：
import pandas as pd

# Build a DataFrame from a dict: each key becomes a column name and each list
# holds that column's values
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 22]}
df = pd.DataFrame(data)
print(df)

2.读取数据：
# Read data from a CSV file
csv_path = 'data.csv'
df = pd.read_csv(csv_path)

# Read data from an Excel file
# (this rebinds df, so the CSV data loaded above is replaced)
excel_path = 'data.xlsx'
df = pd.read_excel(excel_path)

3.基本信息查看：
# Show the first few rows
print(df.head())

# Show the last few rows
print(df.tail())

# Show the shape as (number of rows, number of columns)
print(df.shape)

# Show the column names
print(df.columns)

# Show the row index
print(df.index)

# Show basic summary statistics
print(df.describe())

4.选择和过滤数据：
# Select a single column
age_column = df['Age']

# Select several columns by passing a list of column names
subset = df[['Name', 'Age']]

# Keep only the rows whose 'Age' is greater than 25 (boolean-mask filtering)
filtered_data = df[df['Age'] > 25]

5.添加和删除数据：
# Add a new column; the list must supply one value per existing row of df
df['Salary'] = [50000, 60000, 45000]

# Drop the column again, modifying df in place (axis=1 selects columns)
df.drop('Salary', axis=1, inplace=True)

# Append a new row. DataFrame.append was removed in pandas 2.0, so wrap the
# row in a one-row DataFrame and concatenate it; ignore_index=True renumbers
# the resulting rows 0..n-1.
new_row = {'Name': 'David', 'Age': 28}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

6.数据排序：
# Sort by the 'Age' column in ascending order (returns a new DataFrame)
df_sorted = df.sort_values(by='Age')

# Sort by the 'Age' column in descending order
df_sorted_desc = df.sort_values(by='Age', ascending=False)

7.数据聚合和分组：
# Group the rows by 'Age' and compute the mean 'Salary' of each group
# NOTE(review): requires df to have a 'Salary' column; the add/remove snippet
# earlier in this file drops it again — confirm which df is meant here
grouped = df.groupby('Age')['Salary'].mean()

# Group by several columns and compute a different statistic per column:
# the mean of 'Salary' and the count of 'Name'
# NOTE(review): a 'Gender' column is never created in this file — confirm the data source
grouped_multi = df.groupby(['Age', 'Gender']).agg({'Salary': 'mean', 'Name': 'count'})

8.数据清洗：
# Handle missing values. dropna and fillna return a NEW DataFrame and leave
# df untouched, so the result has to be kept to have any effect.
df_no_missing = df.dropna()  # drop every row that contains a missing value
fill_value = 0  # placeholder; choose whatever replacement suits the data
df_filled = df.fillna(fill_value)  # replace missing values with fill_value

# Handle duplicated data (also returns a new DataFrame)
df_unique = df.drop_duplicates()  # drop repeated rows

sklearn

1.数据集加载和处理：
from sklearn.datasets import load_iris

# Load the classic iris dataset
data = load_iris()

# Feature matrix
X = data.data

# Target vector
y = data.target

2.数据预处理：
from sklearn.preprocessing import StandardScaler

# Create the standardizer
scaler = StandardScaler()

# Fit the scaler on the feature matrix and return the standardized copy
X_scaled = scaler.fit_transform(X)

3.拆分数据集为训练集和测试集：
from sklearn.model_selection import train_test_split

# Split the dataset: test_size=0.2 holds out 20% of the samples for testing and
# random_state=42 fixes the shuffle so the split is reproducible
# NOTE(review): this splits the unscaled X; the X_scaled computed in the
# preprocessing step is not used — confirm whether that is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

4.训练一个机器学习模型（以分类器为例）：
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default settings
classifier = LogisticRegression()

# Fit the model on the training set
classifier.fit(X_train, y_train)

5.模型预测和评估：
# Predict labels for the test set
y_pred = classifier.predict(X_test)

# Compute the accuracy: the fraction of test labels predicted correctly
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

6.交叉验证：
from sklearn.model_selection import cross_val_score

# Evaluate the model with 5-fold cross-validation (cv=5); scores holds one score per fold
scores = cross_val_score(classifier, X, y, cv=5)

7.特征选择：
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Create a transformer that keeps the k=2 best features, ranked by f_classif
feature_selector = SelectKBest(score_func=f_classif, k=2)

# Fit the selector on (X, y) and return X reduced to the selected columns
X_selected = feature_selector.fit_transform(X, y)

线性回归（ linear regression）

代价函数
def compute_cost(x, y, w, b):
    """Return the squared-error cost of the one-feature linear model w*x + b.

    Args:
        x: Array of inputs; its first dimension gives the sample count m.
        y: Targets, indexed in step with x.
        w: Slope of the model.
        b: Intercept of the model.

    Returns:
        The sum of squared residuals over all samples, divided by 2*m.
    """
    n_samples = x.shape[0]

    # Add up the squared residuals one sample at a time, in index order.
    squared_error_total = sum(
        (w * x[idx] + b - y[idx]) ** 2 for idx in range(n_samples)
    )

    return (1 / (2 * n_samples)) * squared_error_total

计算梯度
def compute_gradient(x, y, w, b):
    """Return the gradient of the squared-error cost for the model w*x + b.

    Args:
        x: Array of inputs; its first dimension gives the sample count m.
        y: Targets, indexed in step with x.
        w: Slope of the model.
        b: Intercept of the model.

    Returns:
        A tuple (dj_dw, dj_db): the cost derivative with respect to w and
        with respect to b, each averaged over the m samples.
    """
    n_samples = x.shape[0]
    grad_w = 0
    grad_b = 0

    for idx in range(n_samples):
        # Prediction error of this sample; it feeds both partial derivatives.
        residual = (w * x[idx] + b) - y[idx]
        grad_b += residual
        grad_w += residual * x[idx]

    return grad_w / n_samples, grad_b / n_samples

计算梯度下降
def gradient_descent(x, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    """Fit w and b with batch gradient descent.

    Args:
        x: Training inputs, passed through unchanged to the two callbacks.
        y: Training targets, passed through unchanged to the two callbacks.
        w_in: Initial value of w; deep-copied, so the caller's object is never modified.
        b_in: Initial value of b.
        cost_function: Called as cost_function(x, y, w, b); returns the cost.
        gradient_function: Called as gradient_function(x, y, w, b); returns (dj_dw, dj_db).
        alpha: Learning rate (the step size of each update).
        num_iters: Number of gradient-descent iterations to run.

    Returns:
        A tuple (w, b, J_history, w_history): the fitted parameters, the cost
        after each iteration, and snapshots of w taken at about ten evenly
        spaced iterations (kept mainly for plotting).
    """
    # Imported locally because the surrounding file never imports these modules.
    import copy
    import math

    J_history = []
    w_history = []
    w = copy.deepcopy(w_in)
    b = b_in

    # Report progress roughly ten times over the whole run. The value is only
    # used inside the loop, so num_iters == 0 never reaches a modulo by zero.
    report_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w, b)

        # Step against the gradient.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # Cap the stored history so very long runs do not exhaust memory.
        if i < 100000:
            J_history.append(cost_function(x, y, w, b))

        if i % report_every == 0:
            w_history.append(w)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")

    return w, b, J_history, w_history

运行上面的梯度下降算法来学习数据集的参数。

# Initialize the fitting parameters; in this one-feature example both w and b are scalars
initial_w = 0.
initial_b = 0.

# Gradient descent settings: number of iterations and learning rate
iterations = 1500
alpha = 0.01

# x_train / y_train are not defined in this file — presumably loaded earlier; verify against the original notebook
w,b,_,_ = gradient_descent(x_train ,y_train, initial_w, initial_b, 
                     compute_cost, compute_gradient, alpha, iterations)
print("w,b found by gradient descent:", w, b)

->
Iteration    0: Cost     6.74   
Iteration  150: Cost     5.31   
Iteration  300: Cost     4.96   
Iteration  450: Cost     4.76   
Iteration  600: Cost     4.64   
Iteration  750: Cost     4.57   
Iteration  900: Cost     4.53   
Iteration 1050: Cost     4.51   
Iteration 1200: Cost     4.50   
Iteration 1350: Cost     4.49   
w,b found by gradient descent: 1.166362350335582 -3.63029143940436

逻辑回归

def compute_cost(X, y, w, b, *argv):
    """Return the mean logistic (cross-entropy) loss over all examples.

    Args:
        X: 2-D array; X.shape is unpacked into (m, n) = (examples, features).
        y: Targets, one entry per example.
        w: Weights, one entry per feature.
        b: Bias term added to every linear score.
        *argv: Accepted for signature compatibility and ignored.

    Returns:
        The per-example losses summed and divided by m.
    """
    m, n = X.shape
    running_loss = 0

    for row in range(m):
        # Linear score of this example: w[0]*X[row][0] + ... + w[n-1]*X[row][n-1] + b
        score = sum(w[col] * X[row][col] for col in range(n)) + b
        prob = sigmoid(score)

        running_loss += -y[row] * np.log(prob) - (1 - y[row]) * np.log(1 - prob)

    return (1 / m) * running_loss

def compute_gradient(X, y, w, b, *argv):
    """Return the gradient of the logistic cost with respect to b and w.

    Args:
        X: 2-D array; X.shape is unpacked into (m, n) = (examples, features).
        y: Targets, one entry per example.
        w: Weights array; the returned weight gradient has the same shape.
        b: Bias term added to every linear score.
        *argv: Accepted for signature compatibility and ignored.

    Returns:
        A tuple (dj_db, dj_dw) -- bias gradient first, then the weight
        gradient -- each averaged over the m examples.
    """
    m, n = X.shape
    grad_w = np.zeros(w.shape)
    grad_b = 0.

    for row in range(m):
        # Model output for this example.
        score = sum(X[row, col] * w[col] for col in range(n)) + b
        error = sigmoid(score) - y[row]

        grad_b += error

        # Each feature contributes error * feature value to its own weight.
        for col in range(n):
            grad_w[col] += error * X[row][col]

    mean_grad_w = grad_w / m
    mean_grad_b = grad_b / m

    return mean_grad_b, mean_grad_w

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    """Fit w and b with batch gradient descent.

    Args:
        X: Training data, passed through unchanged to the two callbacks.
        y: Training targets, passed through unchanged to the two callbacks.
        w_in: Initial weights. Rebound (not modified in place) on each update.
        b_in: Initial bias.
        cost_function: Called as cost_function(X, y, w, b, lambda_); returns the cost.
        gradient_function: Called as gradient_function(X, y, w, b, lambda_);
            returns (dj_db, dj_dw) -- bias gradient first.
        alpha: Learning rate (the step size of each update).
        num_iters: Number of gradient-descent iterations to run.
        lambda_: Regularization constant, forwarded to both callbacks.

    Returns:
        A tuple (w, b, J_history, w_history): the fitted parameters, the cost
        after each iteration, and snapshots of w taken at about ten evenly
        spaced iterations plus the final one (kept mainly for plotting).
    """
    # Imported locally because the surrounding file never imports math.
    import math

    J_history = []
    w_history = []

    # Report progress roughly ten times over the whole run. The value is only
    # used inside the loop, so num_iters == 0 never reaches a modulo by zero.
    report_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)

        # Step against the gradient.
        w_in = w_in - alpha * dj_dw
        b_in = b_in - alpha * dj_db

        # Cap the stored history so very long runs do not exhaust memory.
        if i < 100000:
            J_history.append(cost_function(X, y, w_in, b_in, lambda_))

        if i % report_every == 0 or i == (num_iters - 1):
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")

    return w_in, b_in, J_history, w_history

# Seed NumPy's random generator so the initial weights are reproducible
np.random.seed(1)
# Two small random initial weights: rand(2) lies in [0, 1), so after the shift
# and scale each weight lies in [-0.005, 0.005)
initial_w = 0.01 * (np.random.rand(2) - 0.5)
initial_b = -8

# Some gradient descent settings
iterations = 10000
alpha = 0.001

# The trailing 0 is lambda_ (the regularization constant).
# X_train / y_train are not defined in this file — presumably loaded earlier; verify against the original notebook
w,b, J_history,_ = gradient_descent(X_train ,y_train, initial_w, initial_b, 
                                   compute_cost, compute_gradient, alpha, iterations, 0)

->
Iteration    0: Cost     0.96   
Iteration 1000: Cost     0.31   
Iteration 2000: Cost     0.30   
Iteration 3000: Cost     0.30   
Iteration 4000: Cost     0.30   
Iteration 5000: Cost     0.30   
Iteration 6000: Cost     0.30   
Iteration 7000: Cost     0.30   
Iteration 8000: Cost     0.30   
Iteration 9000: Cost     0.30   
Iteration 9999: Cost     0.30

逻辑回归（矢量化）

def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) to z.

    Args:
        z: A scalar or NumPy array; arrays are processed element-wise.

    Returns:
        The sigmoid of z, with the same shape as z.
    """
    return 1 / (1 + np.exp(-z))

初始化(w,b)
这个函数为w创建一个形状为(dim, 1)的0向量，并将b初始化为0。
dim -- 我们想要的w向量的大小(或例子中参数的数量)

def initialize_with_zeros(dim):
    """Create an all-zero weight column vector and a zero bias.

    Args:
        dim: Number of rows wanted in the weight vector.

    Returns:
        A tuple (w, b): w is a zero array of shape (dim, 1) and b is the integer 0.
    """
    b = 0
    w = np.zeros((dim, 1))

    # Sanity checks on the values just built.
    assert w.shape == (dim, 1)
    assert isinstance(b, (float, int))

    return w, b

前向传播和反向传播：优化学习参数(w,b)：计算代价函数和梯度
def propagate(w, b, X, Y):
    """Compute the logistic-regression cost and its gradients in one pass.

    Args:
        w: Weights; the returned dw is asserted to have the same shape.
        b: Bias, added to every linear score.
        X: Data with one example per column (X.shape[1] is the example count).
        Y: Labels, combined element-wise with the activations.

    Returns:
        A tuple (grads, cost): grads is a dict with keys "dw" and "db", and
        cost is the mean cross-entropy loss as a zero-dimensional value.
    """
    n_examples = X.shape[1]

    # Forward pass: activations, then the cross-entropy averaged over examples.
    activations = sigmoid(np.dot(w.T, X) + b)
    per_example_loss = (-np.log(activations)) * Y + (-np.log(1 - activations)) * (1 - Y)
    cost = np.sum(per_example_loss) / n_examples

    # Backward pass: gradients of the cost with respect to w and b.
    residual = activations - Y
    dw = np.dot(X, residual.T) / n_examples
    db = np.sum(residual) / n_examples

    assert dw.shape == w.shape
    assert db.dtype == float
    cost = np.squeeze(cost)
    assert cost.shape == ()

    return {"dw": dw, "db": db}, cost

使用梯度下降更新参数：
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """Learn w and b by running gradient descent on the logistic cost.

    Args:
        w: Initial weights.
        b: Initial bias.
        X: Training data, forwarded to propagate().
        Y: Training labels, forwarded to propagate().
        num_iterations: Number of iterations of the optimization loop.
        learning_rate: Step size of each gradient-descent update.
        print_cost: When true, print the cost every 100 iterations.

    Returns:
        A tuple (params, grads, costs):
        params -- dict holding the final weights "w" and bias "b";
        grads -- dict holding the last gradients "dw" and "db" (both None
            when num_iterations is 0, because no gradient was computed);
        costs -- the cost recorded every 100 iterations, for plotting the
            learning curve.
    """
    costs = []

    # Defined up front so the returned grads dict is well-formed even when the
    # loop body never runs.
    dw, db = None, None

    for i in range(num_iterations):
        # Cost and gradients at the current parameters.
        grads, cost = propagate(w, b, X, Y)

        dw = grads["dw"]
        db = grads["db"]

        # Step against the gradient.
        w = w - (learning_rate * dw)
        b = b - (learning_rate * db)

        # Record (and optionally report) the cost once every 100 iterations.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

def predict(w, b, X):
    """Predict a 0/1 label for every example with learned parameters (w, b).

    Args:
        w: Weights; reshaped to a column vector with X.shape[0] rows.
        b: Bias, added to every linear score.
        X: Data with one example per column (X.shape[1] is the example count).

    Returns:
        Y_prediction: Array of shape (1, number of examples) holding 1.0 where
        the predicted probability is at least 0.5 and 0.0 elsewhere.
    """
    n_examples = X.shape[1]
    weights = w.reshape(X.shape[0], 1)

    # Predicted probability of the positive class for each example.
    probabilities = sigmoid(np.dot(weights.T, X) + b)

    # Threshold every probability at once; multiplying by 1.0 turns the
    # boolean mask into floats.
    Y_prediction = (probabilities >= 0.5) * 1.0

    assert Y_prediction.shape == (1, n_examples)

    return Y_prediction

整合：
def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """Train a logistic-regression model and report its train/test accuracy.

    Args:
        X_train: Training data; X_train.shape[0] sets the number of weights.
        Y_train: Training labels.
        X_test: Test data.
        Y_test: Test labels.
        num_iterations: Number of gradient-descent iterations passed to optimize().
        learning_rate: Learning rate passed to optimize().
        print_cost: Passed to optimize(); when true the cost is printed every 100 iterations.

    Returns:
        d: Dict with the recorded costs, the test and train predictions, the
        learned w and b, and the learning_rate / num_iterations that were used.
    """
    # Start from all-zero parameters, one weight per row of X_train.
    initial_w, initial_b = initialize_with_zeros(X_train.shape[0])

    # Learn the parameters with gradient descent.
    learned, _last_grads, cost_trace = optimize(
        initial_w, initial_b, X_train, Y_train,
        num_iterations, learning_rate, print_cost,
    )
    w, b = learned["w"], learned["b"]

    # Label both splits with the learned parameters.
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Accuracy = 100 minus the mean absolute prediction error, in percent.
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    return {
        "costs": cost_trace,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

测试：
# Train and evaluate the model for 2000 iterations; learning_rate=0.005 overrides model()'s default of 0.5.
# train_set_x / train_set_y / test_set_x / test_set_y are not defined in this file — presumably loaded earlier; verify
d = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations = 2000, learning_rate = 0.005, print_cost = False)

->

train accuracy: 91.38755980861244 %
test accuracy: 34.0 %