Artificial Intelligence Experiment for Jingdezhen Ceramic University - 2023

This article was put together from my own write-ups, AI-generated output, and excerpts contributed by several fellow students.
If you spot an error, contact me at 1478928343@qq.com or halibaduoxiansheng@gmail.com and I will fix it as soon as possible.


First Steps with NumPy


Level 1: Creating Arrays with NumPy

# Import the numpy library
import numpy as np
# Define the cnmda function
def cnmda(m, n):
    '''
    Create a numpy array.
    Parameters:
        m: length of the first dimension
        n: length of the second dimension
    Returns:
        ret: a numpy array
    '''

    ret = 0

    # Add the code that creates the multidimensional array and assigns it to ret
    #********** Begin *********#
    ret = np.zeros((m, n))
    #********** End **********#

    return ret
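
A quick sanity check of the function above (a sketch; the grader drives it with its own inputs):

print(cnmda(2, 3))
# Expected: a 2x3 array of zeros
# [[0. 0. 0.]
#  [0. 0. 0.]]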

Level 2: Basic NumPy Array Operations

# Import the numpy library
import numpy as np

# Define the opeadd function
def opeadd(m, b, n):
    '''
    Parameters:
    m: an array
    b: a list
    n: an index into the list
    Compute m + b[n]
    Returns:
    ret: a numpy array
    '''
    ret = m + b[n]  # add a scalar to the array (broadcasting)
    return ret

# Define the opemul function
def opemul(m, b, n):
    '''
    Parameters:
    m: an array
    b: a list
    n: an index into the list
    Compute m * b[n]
    Returns:
    ret: a numpy array
    '''
    ret = m * b[n]  # multiply the array by a scalar (broadcasting)
    return ret

Level 3: NumPy Array Slicing and Indexing

# Import the numpy library
import numpy as np

# Define the ce function
def ce(a, m, n):
    '''
    Parameters:
    a: a Numpy array
    m: index of the m-th sub-array
    n: take the first n elements of that sub-array
    Returns:
    ret: a numpy array
    '''
    ret = a[m][:n]  # slice out the requested elements
    return ret

Level 4: Stacking NumPy Arrays

# Import the numpy library
import numpy as np

# Define the varray function
def varray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a numpy array
    '''
    ret = np.vstack((m, n))  # stack vertically (row-wise)
    return ret

# Define the darray function
def darray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a numpy array
    '''
    ret = np.dstack((m, n))  # stack depth-wise (along a third axis)
    return ret

# Define the harray function
def harray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a numpy array
    '''
    ret = np.hstack((m, n))  # stack horizontally (column-wise)
    return ret
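
For intuition, the three stacking functions differ only in the axis they join along; a small shape check (illustrative, not part of the graded code):

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(varray(a, b).shape)  # (4, 2) - vstack joins along rows
print(harray(a, b).shape)  # (2, 4) - hstack joins along columns
print(darray(a, b).shape)  # (2, 2, 2) - dstack adds a third axis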

Level 5: Splitting NumPy Arrays

# Import the numpy library
import numpy as np

# Define the vsarray function
def vsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections
    Returns:
    ret: a list of numpy arrays
    '''
    ret = np.vsplit(m, n)  # split vertically (row-wise)
    return ret

# Define the dsarray function
def dsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections
    Returns:
    ret: a list of numpy arrays
    '''
    ret = np.dsplit(m, n)  # split depth-wise (along the third axis)
    return ret

# Define the hsarray function
def hsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections
    Returns:
    ret: a list of numpy arrays
    '''
    ret = np.hsplit(m, n)  # split horizontally (column-wise)
    return ret
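
Note that the split functions return a list of sub-arrays rather than a single array; a small check (illustrative values):

m = np.arange(16).reshape(4, 4)
parts = vsarray(m, 2)
print(len(parts))       # 2
print(parts[0].shape)   # (2, 4) - the top half of m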

First Steps with Pandas


Level 1: Getting to Know the Series Data Object

from pandas import Series, DataFrame
import pandas as pd

def create_series():
    '''
    Returns:
    series_a: a Series
    series_b: a Series
    dict_a: a dict
    '''
    series_a = pd.Series([1, 2, 5, 7], index=['nu', 'li', 'xue', 'xi'])
    dict_a = {'ting': 1, 'shuo': 2, 'du': 32, 'xie': 44}
    series_b = pd.Series(dict_a)

    # Return series_a, dict_a, series_b
    return series_a, dict_a, series_b

Level 2: Getting to Know the DataFrame Data Object

from pandas import Series, DataFrame
import pandas as pd

def create_dataframe():
    '''
    Returns:
    df1: a DataFrame
    '''
    df1 = pd.DataFrame(index=['one', 'two', 'three', 'four', 'five'], columns=['states', 'years', 'pops'])
    df1['new_add'] = [7, 4, 5, 8, 2]

    # Return df1
    return df1

Level 3: Reading CSV Data

from pandas import Series, DataFrame
import pandas as pd

def read_csv_data():
    '''
    Returns:
    df1: a DataFrame
    length1: an int
    '''
    df1 = pd.read_csv('test3/uk_rain_2014.csv')
    df1.columns = ['water_year', 'rain_octsep', 'outflow_octsep', 'rain_decfeb', 'outflow_decfeb', 'rain_junaug', 'outflow_junaug']
    length1 = df1.shape[0]

    # Return df1, length1
    return df1, length1

Level 4: Basic Data Operations: Sorting

from pandas import Series, DataFrame
import pandas as pd

def sort_gate():
    '''
    Returns:
    s2: a Series
    d2: a DataFrame
    '''

    # s1 is a Series, d1 is a DataFrame
    s1 = Series([4, 3, 7, 2, 8], index=['z', 'y', 'j', 'i', 'e'])
    d1 = DataFrame({'e': [4, 2, 6, 1], 'f': [0, 5, 4, 2]})

    s2 = s1.sort_index()         # sort by index labels
    d2 = d1.sort_values(by='f')  # sort by the values of column 'f'

    # Return s2, d2
    return s2, d2

Level 5: Basic Data Operations: Deleting

from pandas import Series, DataFrame
import numpy as np
import pandas as pd

def delete_data():
    '''
    Returns:
    s2: a Series
    d2: a DataFrame
    '''

    # s1 is a Series, d1 is a DataFrame
    s1 = Series([5, 2, 4, 1], index=['v', 'x', 'y', 'z'])
    d1 = DataFrame(np.arange(9).reshape(3, 3), columns=['xx', 'yy', 'zz'])

    s2 = s1.drop('z')           # drop the row labeled 'z'
    d2 = d1.drop('yy', axis=1)  # drop the column 'yy'

    # Return s2, d2
    return s2, d2

Level 6: Basic Data Operations: Arithmetic

from pandas import Series, DataFrame
import numpy as np
import pandas as pd

def add_way():
    '''
    Returns:
    df3: a DataFrame
    '''

    # df1 and df2 are DataFrames
    df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
    df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

    df3 = df1.add(df2, fill_value=4)

    # Return df3
    return df3
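
df1.add(df2, fill_value=4) aligns the two frames on both index and columns; a position present in only one frame is treated as 4 before adding, and positions present in neither stay NaN. A minimal illustration (separate from the exercise data):

left = pd.DataFrame({'a': [1.0]})
right = pd.DataFrame({'a': [2.0], 'b': [3.0]})
print(left.add(right, fill_value=4))
#      a    b
# 0  3.0  7.0   <- 'b' is missing from left, so 4 + 3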

Level 7: Basic Data Operations: Deduplication

from pandas import Series, DataFrame
import pandas as pd

def delete_duplicated():
    '''
    Returns:
    df2: a DataFrame
    '''

    # df1 is a DataFrame
    df1 = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})

    df2 = df1.drop_duplicates()

    # Return df2
    return df2

Level 8: Reshaping Data

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

def suoying():
    '''
    Returns:
    d1: a DataFrame
    '''
    # s1 is a Series with a two-level index
    s1 = Series(np.random.randn(10),
                index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

    d1 = s1.unstack()

    # Return d1
    return d1


suoying()
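
The unstack() call pivots the inner index level into columns; a mini example of the same mechanics (illustrative values):

s = Series([10.0, 20.0, 30.0], index=[['a', 'a', 'b'], [1, 2, 1]])
print(s.unstack())
#       1     2
# a  10.0  20.0
# b  30.0   NaN   <- the pair ('b', 2) is absent, so the cell is NaN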

Learning and Using Scikit-Learn, the Python Machine Learning Package


Level 1: Loading Datasets with scikit-learn

from sklearn import datasets
def getIrisData():

    '''
    Load the Iris dataset.

    Returns:
    X - first 5 rows of the training feature data
    y - classes of the first 5 training rows
    X_shape - shape of the 2-D training feature array
    '''
    # Initialization
    X = []
    y = []
    X_shape = ()

    #   Add your implementation here   #
    #********** Begin *********#
    iris = datasets.load_iris()
    X = iris.data[:5]
    y = iris.target[:5]
    X_shape = iris.data.shape
    #********** End **********#
    return X, y, X_shape

Level 2: Data Preprocessing: Standardization

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
'''
Data description:
The data contains 20,640 observations on 9 variables.
This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.
dataset : dict-like object with the following attributes:
    dataset.data : ndarray, shape [20640, 8]
        Each row corresponding to the 8 feature values in order.
    dataset.target : numpy array of shape (20640,)
        Each value corresponds to the average house value in units of 100,000.
    dataset.feature_names : array of length 8
        Array of ordered feature names used in the dataset.
    dataset.DESCR : string
        Description of the California housing dataset.
'''
dataset = fetch_california_housing("./step4/")
X_full, y = dataset.data, dataset.target
# Extract two of the feature columns
X = X_full[:, [0, 5]]
def getMinMaxScalerValue():
    '''
    Apply a MinMaxScaler transform to the feature data X and return the first 5 transformed rows.
    Returns:
    X_first5 - list of data
    '''
    X_first5 = []
    #   Add your implementation here   #
    # ********** Begin *********#
    min_max = MinMaxScaler()
    X_first5 = min_max.fit_transform(X)[:5]
    # ********** End **********#
    return X_first5
def getScaleValue():
    '''
    Apply a simple scale() standardization to the target data y and return the first 5 transformed values.
    Returns:
    y_first5 - list of data
    '''
    y_first5 = []
    #   Add your implementation here   #
    # ********** Begin *********#
    y_first5 = scale(y)[:5]
    # ********** End **********#
    return y_first5
def getStandardScalerValue():
    '''
    Fit a StandardScaler to the feature data X and return the fitted mean and scale factor.
    Returns:
    X_mean - the mean of each feature
    X_scale - the scaling factor of each feature
    '''
    X_mean = None
    X_scale = None
    #   Add your implementation here   #
    #********** Begin *********#
    scaler = StandardScaler().fit(X)
    X_mean = scaler.mean_
    X_scale = scaler.scale_
    #********** End **********#
    return X_mean, X_scale
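
To see how the three transformers differ, a toy single-column comparison (a sketch, independent of the housing data):

import numpy as np
col = np.array([[1.0], [2.0], [3.0]])
print(MinMaxScaler().fit_transform(col).ravel())  # [0.  0.5 1. ] - rescaled into [0, 1]
print(scale(col).ravel())                         # zero mean, unit variance: [-1.22...  0.  1.22...]
ss = StandardScaler().fit(col)
print(ss.mean_, ss.scale_)                        # [2.] [0.8164...] - the mean and std the transform uses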
    

Level 3: Extracting Features from Text Data

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


categories = [
    'alt.atheism',
    'talk.religion.misc',
]

# Load the news data for these categories; it contains 857 documents
data = fetch_20newsgroups("./step5/", subset='train', categories=categories)
X = data.data

def transfer2CountVector():
    '''
    Extract feature vectors with CountVectorizer; return the vocabulary size and the tokenized test string.

    Returns:
    vocab_len - scalar, size of the vocabulary
    tokenizer_list - list, the result of tokenizing the test string test_str
    '''

    vocab_len = 0

    test_str = "what's your favorite programming language?"
    tokenizer_list = []
    #   Add your implementation here   #
    # ********** Begin *********#
    v = CountVectorizer()
    v.fit(X)
    vocab_len = len(v.vocabulary_)
    analyzer = v.build_analyzer()  # the same preprocess + tokenize pipeline the vectorizer uses
    tokenizer_list = analyzer(test_str)
    # ********** End **********#

    return vocab_len, tokenizer_list

def transfer2TfidfVector():
    '''
    Extract feature vectors with TfidfVectorizer and apply the fitted vectorizer to new test data.

    TfidfVectorizer() parameter settings:
    min_df = 2, stop_words="english"

    test_data - the raw data to transform

    Returns:
    transfer_test_data - a 2-D ndarray
    '''
    test_data = ['Once again, to not believe in God is different than saying\n>I BELIEVE that God does not exist. I still maintain the position, even\n>after reading the FAQs, that strong atheism requires faith.\n>\n \nNo it in the way it is usually used. In my view, you are saying here that\ndriving a car requires faith that the car drives.\n \nFor me it is a conclusion, and I have no more faith in it than I have in the\npremises and the argument used.\n \n \n>But first let me say the following.\n>We might have a language problem here - in regards to "faith" and\n>"existence". I, as a Christian, maintain that God does not exist.\n>To exist means to have being in space and time. God does not HAVE\n>being - God IS Being. Kierkegaard once said that God does not\n>exist, He is eternal. With this said, I feel it\'s rather pointless\n>to debate the so called "existence" of God - and that is not what\n>I\'m doing here. I believe that God is the source and ground of\n>being. When you say that "god does not exist", I also accept this\n>statement - but we obviously mean two different things by it. However,\n>in what follows I will use the phrase "the existence of God" in it\'s\n>\'usual sense\' - and this is the sense that I think you are using it.\n>I would like a clarification upon what you mean by "the existence of\n>God".\n>\n \nNo, that\'s a word game. The term god is used in a different way usually.\nWhen you use a different definition it is your thing, but until it is\ncommonly accepted you would have to say the way I define god is ... and\nthat does not exist, it is existence itself, so I say it does not exist.\n \nInterestingly, there are those who say that "existence exists" is one of\nthe indubitable statements possible.\n \nFurther, saying god is existence is either a waste of time, existence is\nalready used and there is no need to replace it by god, or you are implying\nmore with it, in which case your definition and your argument so far\nare incomplete, making it a fallacy.\n \n \n(Deletion)\n>One can never prove that God does or does not exist. When you say\n>that you believe God does not exist, and that this is an opinion\n>"based upon observation", I will have to ask "what observtions are\n>you refering to?" There are NO observations - pro or con - that\n>are valid here in establishing a POSITIVE belief.\n(Deletion)\n \nWhere does that follow? Aren\'t observations based on the assumption\nthat something exists?\n \nAnd wouldn\'t you say there is a level of definition that the assumption\n"god is" is meaningful. If not, I would reject that concept anyway.\n \nSo, where is your evidence for that "god is" is meaningful at some level?\n   Benedikt\n']
    transfer_test_data = None

    #   Add your implementation here   #
    # ********** Begin *********#
    v = TfidfVectorizer(min_df=2, stop_words="english")
    v.fit(X)  # learn the vocabulary and IDF weights from the training documents
    transfer_test_data = v.transform(test_data).toarray()
    # ********** End **********#
    return transfer_test_data


Level 4: Classifying the digits Dataset with the scikit-learn SVM Classifier

import matplotlib.pyplot as plt

# Import the dataset and classifier packages
from sklearn import datasets, svm, metrics

# Load the digits dataset
digits = datasets.load_digits()
n_samples = len(digits.data)
data = digits.data

# Use the first half of the dataset for training and the second half for testing
train_data, train_target = data[:n_samples // 2], digits.target[:n_samples // 2]
test_data, test_target = data[n_samples // 2:], digits.target[n_samples // 2:]


def createModelandPredict():
    '''
    Build the classification model and predict on the test data.

    Returns:
    predicted - predicted classes for the test data
    '''
    predicted = None
    #   Add your implementation here   #
    #********** Begin *********#
    classifier = svm.SVC()
    classifier.fit(train_data, train_target)
    predicted = classifier.predict(test_data)
    #********** End **********#

    return predicted
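
The predictions can then be scored with the metrics module that is already imported (a usage sketch, not required by the grader):

predicted = createModelandPredict()
print(metrics.accuracy_score(test_target, predicted))
print(metrics.classification_report(test_target, predicted))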


Level 5: Model Persistence

# Import the dataset and classifier packages
from sklearn import datasets, svm, metrics
import pickle

# Load the digits dataset
digits = datasets.load_digits()
n_samples = len(digits.data)
data = digits.data

# Use the first half of the dataset for training and the second half for testing
train_data, train_target = data[:n_samples // 2], digits.target[:n_samples // 2]
test_data, test_target = data[n_samples // 2:], digits.target[n_samples // 2:]


def createModel():
    classifier = svm.SVC()
    classifier.fit(train_data, train_target)
    return classifier

local_file = 'dumpfile'
def dumpModel():
    '''
    Persist the classification model.
    '''
    clf = createModel()
    # Complete the model-saving statement here #
    #********** Begin *********#
    with open(local_file, 'wb') as f_model:
        pickle.dump(clf, f_model)
    #********** End **********#

def loadModel():
    '''
    Load the model, predict on the test data with it, and return the predictions.
    Returns:
    predicted - the model's predictions
    '''
    predicted = None
    # Complete the model-loading statement here, then classify the test data and return the predictions #
    #********** Begin *********#
    with open(local_file, 'rb') as fw:
        classifier = pickle.load(fw)
    predicted = classifier.predict(test_data)
    #********** End **********#

    return predicted
 
 
 

Level 6: Model Evaluation: Quantifying Prediction Quality

from sklearn.metrics import accuracy_score, precision_score, f1_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC, SVC
def bin_evaluation(X_train, y_train, X_test, y_test):
    '''
    Evaluate a binary classification model.
    :param X_train: training data
    :param y_train: training labels
    :param X_test: test data
    :param y_test: true labels of the test data
    :return:
    correct_num - number of correctly classified samples
    prec - precision on the positive class
    recall - recall on the positive class
    f_score - F-score on the positive class
    '''
    classifier = LinearSVC()
    correct_num, prec, recall, fscore = None, None, None, None
    #   Add your implementation here   #
    # ********** Begin *********#
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # normalize=False makes accuracy_score return a count instead of a ratio
    correct_num = accuracy_score(y_test, y_pred, normalize=False)
    prec, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)

    return correct_num, prec, recall, fscore
    # ********** End **********#
def multi_evaluation(X_train, y_train, X_test, y_test):
    '''
    Evaluate a multi-class classification model.
    :param X_train: training data
    :param y_train: training labels
    :param X_test: test data
    :param y_test: true labels of the test data
    :return:
    acc - model accuracy
    prec - precision
    f_score - F-score
    '''
    # Initialization
    acc, prec, f_score = None, None, None
    classifier = SVC(kernel='linear')
    #   Add your implementation here   #
    # ********** Begin *********#
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # macro averaging gives each class equal weight
    prec, recall, f_score, support = precision_recall_fscore_support(y_test, y_pred, average='macro')

    return acc, prec, f_score
    # ********** End **********#

Understanding Basic Machine Learning Concepts: Starting from Movie Rating Prediction


Level 1: Computing Basic Statistics of the Dataset

def stat_data(train_data):
    """Compute the number of users and movies, the number of ratings, and the mean, maximum, and minimum rating.
    Parameters:
        train_data - pandas DataFrame with four columns 'user', 'movie', 'rating', 'timestamp'; the training set
    Returns:
        num_user - int, number of users
        num_movie - int, number of movies
        num_rating - int, number of ratings
        avg_rating - float, mean rating
        max_rating - float, maximum rating
        min_rating - float, minimum rating
    """
    num_user = 0
    num_movie = 0
    num_rating = 0
    avg_rating = 0
    max_rating = 0
    min_rating = 0

    # Number of users and movies
    num_user = train_data['user'].nunique()
    num_movie = train_data['movie'].nunique()

    # Number of ratings
    num_rating = train_data.shape[0]

    # Mean rating
    avg_rating = train_data['rating'].mean()

    # Maximum and minimum rating
    max_rating = train_data['rating'].max()
    min_rating = train_data['rating'].min()

    return num_user, num_movie, num_rating, avg_rating, max_rating, min_rating
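
A hedged usage sketch on a tiny hand-built frame (column names as in the docstring):

import pandas as pd
toy = pd.DataFrame({'user': [1, 1, 2], 'movie': [10, 20, 10],
                    'rating': [3.0, 5.0, 4.0], 'timestamp': [0, 1, 2]})
print(stat_data(toy))  # (2, 2, 3, 4.0, 5.0, 3.0)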

Level 2: Exploring the Dataset Further

# -*- coding:utf-8 -*-

def avg_rating_of_users_movies(data):
    """Compute the mean rating of every user and of every movie.
    Parameters:
        data - pandas DataFrame with four columns 'user', 'movie', 'rating', 'timestamp'; the training set
    Returns:
        user2avg_r - pandas DataFrame with one column 'rating'
        movie2avg_r - pandas DataFrame with one column 'rating'
    """
    user2avg_r = ''
    movie2avg_r = ''
    # Add your implementation here
    #********** Begin *********#
    user2avg_r = data.groupby('user')['rating'].mean().reset_index(name='rating')
    movie2avg_r = data.groupby('movie')['rating'].mean().reset_index(name='rating')
    #**********  End  *********#
    return user2avg_r, movie2avg_r

def top_10_user_movie_on_avg_rating(user2avg_r, movie2avg_r):
    """Find the 10 users and the 10 movies with the highest mean rating.
    Parameters:
        user2avg_r - pandas DataFrame with one column 'rating'
        movie2avg_r - pandas DataFrame with one column 'rating'
    Returns:
        top10_users - list of ints, user IDs; e.g. [3, 4, 5, 6] means the top 4 user accounts are 3, 4, 5, 6
        top10_movies - list of ints, movie IDs; e.g. [30, 40, 50, 60] means the top 4 movies are 30, 40, 50, 60
    """
    top10_users = []
    top10_movies = []
    # Add your implementation here
    #********** Begin *********#
    top10_users = user2avg_r.sort_values(by='rating', ascending=False).head(10)['user'].tolist()
    top10_movies = movie2avg_r.sort_values(by='rating', ascending=False).head(10)['movie'].tolist()
    #**********  End  *********#
    return top10_users, top10_movies

Level 3: Implementing the Baseline Rating Prediction Algorithm

def learn(train_data, N, M):
    """Learn the model from the training data.
    Parameters:
        train_data - pandas DataFrame with four columns 'user', 'movie', 'rating', 'timestamp'; the training set
        N - int, number of users
        M - int, number of movies
    Returns:
        g - the mean rating over all users and movies in the dataset
        alpha - float array of user rating biases; e.g. alpha[9] is the bias of user 9
        beta - float array of movie rating biases; e.g. beta[90] is the bias of movie 90
    """
    # Import the module from Level 2
    from stat_rating import avg_rating_of_users_movies
    import numpy as np

    # Model parameters
    g = 0  # model parameter: mean rating over all users and movies
    alpha = np.zeros(N)  # model parameter: per-user rating bias
    beta = np.zeros(M)   # model parameter: per-movie rating bias

    # Mean rating parameter g
    g = np.mean(train_data['rating'])

    # User bias parameters alpha
    for i in range(N):
        user_ratings = train_data[train_data['user'] == i]['rating']

        # Check whether this user has any ratings
        if len(user_ratings) == 0:
            alpha[i] = 0 - g
        else:
            alpha[i] = np.mean(user_ratings) - g

    # Movie bias parameters beta
    for j in range(M):
        movie_ratings = train_data[train_data['movie'] == j]['rating']

        # Check whether this movie has any ratings
        if len(movie_ratings) == 0:
            beta[j] = 0 - g
        else:
            beta[j] = np.mean(movie_ratings) - g

    return g, alpha, beta
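
The learned parameters define the baseline score g + alpha[u] + beta[m] for a (user, movie) pair; a worked instance with illustrative numbers:

g, alpha, beta = 3.5, {9: 0.4}, {90: -0.2}
print(g + alpha[9] + beta[90])  # 3.7 - predicted rating of user 9 for movie 90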


Level 4: Making Predictions with the Model

def predict(g, alpha, beta, test_data):
    """Predict users' ratings of movies.
    Parameters:
        g - float, the mean rating model parameter
        alpha - float array of user rating biases
        beta - float array of movie rating biases
        test_data - pandas DataFrame with two columns 'user', 'movie'; the test set
    Returns:
        ret - list of floats, the predicted ratings, one per row of test_data
    """
    ret = []
    N = len(alpha)
    M = len(beta)

    # Predict a rating for each test row
    for index, row in test_data.iterrows():
        user = row['user']
        movie = row['movie']

        # Guard against out-of-range indices
        if user >= N or movie >= M:
            ret.append(g)
        else:
            # Prediction formula: predicted rating = g + user bias alpha + movie bias beta
            prediction = g + alpha[user] + beta[movie]
            ret.append(prediction)

    return ret


Level 5: Evaluating the Machine Learning Model

import numpy as np

def RMSE(predicted_rating, true_rating):
    """Compute the RMSE.
    Parameters:
        predicted_rating - list, the predicted ratings
        true_rating - list, the true ratings
    Returns:
        rmse - float, the RMSE value
    """
    rmse = 0

    # Convert the lists to numpy arrays
    predicted_rating = np.array(predicted_rating)
    true_rating = np.array(true_rating)

    # Sum of squared errors
    squared_error = np.square(predicted_rating - true_rating).sum()

    # Root of the mean squared error
    mean_squared_error = squared_error / len(predicted_rating)
    rmse = np.sqrt(mean_squared_error)

    return rmse
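
A quick check against a hand-computed value (assuming RMSE is called with plain lists, as documented):

print(RMSE([3.0, 4.0], [3.0, 2.0]))  # sqrt((0**2 + 2**2) / 2) = sqrt(2) ≈ 1.4142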


Level 6: Estimating Model Parameters with Gradient Descent

# -*- coding:utf-8 -*-

def gradient(u, m, r, g, alpha, beta):
    """Compute the gradients of the bias parameters for one rating.
    Parameters:
        u - int, user ID
        m - int, movie ID
        r - int, the actual rating
        g - float, the mean rating parameter
        alpha - float array of user rating biases
        beta - float array of movie rating biases
    Returns:
        grad_alpha - float, gradient of the squared error with respect to user u's bias
        grad_beta - float, gradient of the squared error with respect to movie m's bias
    """
    grad_alpha = 0
    grad_beta = 0
    # Add your implementation here
    #********** Begin *********#
    # Predicted rating of user u for movie m
    predict_rating = g + alpha[u] + beta[m]

    # Gradients of the squared error (r - predict_rating)**2
    grad_alpha = -2 * (r - predict_rating)
    grad_beta = -2 * (r - predict_rating)
    #**********  End  *********#

    return grad_alpha, grad_beta
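
Both gradients coincide because the squared error (r - (g + alpha[u] + beta[m]))**2 depends on alpha[u] and beta[m] only through the prediction. A numeric spot check (illustrative values):

import numpy as np
alpha, beta = np.zeros(3), np.zeros(3)
print(gradient(0, 1, 4.0, 3.5, alpha, beta))  # (-1.0, -1.0): residual 0.5, gradient -2 * 0.5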
    
def learn(train_data, N, M, steps, tao, g):
    """Learn the model.
    Parameters:
        train_data - pandas DataFrame with four columns 'user', 'movie', 'rating', 'timestamp'; the training set
        N - int, number of users
        M - int, number of movies
        steps - int, number of iterations
        tao - float, the learning rate
        g - float, the mean movie rating
    Returns:
        alpha - float array of user rating biases; e.g. alpha[9] is the bias of user 9
        beta - float array of movie rating biases; e.g. beta[90] is the bias of movie 90
    """
    import numpy as np

    # Initialize the model parameters alpha and beta to a small constant
    alpha = np.zeros(N) + 0.01
    beta = np.zeros(M) + 0.01

    # Iterate over the training data
    for step in range(steps):
        for row in train_data.itertuples():
            u = row.user
            m = row.movie
            r = row.rating
            # Add your implementation here
            #********** Begin *********#
            # Compute the gradients
            grad_alpha, grad_beta = gradient(u, m, r, g, alpha, beta)

            # Update the parameters (step against the gradient)
            alpha[u] -= tao * grad_alpha
            beta[m] -= tao * grad_beta
            #**********  End  *********#

    return alpha, beta

First Steps with TensorFlow


Level 1: Creating and Initializing Variables

# -*- coding: utf-8 -*-
import tensorflow as tf

def variables_create():
    '''
    Returns:
    weights: a Tensor variable
    '''
    # Add your code here to complete this task
    # ********** Begin *********#

    weights = tf.Variable(initial_value=tf.constant(0.0, shape=[1]), name='big_weights')
    # Alternatively:
    # weights = tf.Variable(initial_value=tf.zeros([1]), name='big_weights')

    # ********** End **********#

    # Return weights
    return weights

Level 2: Saving Variables

# -*- coding: utf-8 -*-

import tensorflow as tf
import time


def variables_save():
    '''
    Variables:
    weights: Tensor variable
    biase: Tensor variable
    const123: Tensor variable
    '''
    weights = tf.Variable(tf.random_normal([784, 200]), name="weights")
    biase = tf.Variable(tf.zeros([1]), name='biases')
    const123 = tf.Variable([[3]], name="jjdaren")
    init_op = tf.global_variables_initializer()
    sess = tf.Session()
    # Add your code here to complete this task
    # ********** Begin *********#

    saver = tf.train.Saver()
    sess.run(init_op)
    saver.save(sess, "save_dir/store.ckpt")

    # ********** End **********#

    sess.close()

Level 3: Restoring Variables

# -*- coding: utf-8 -*-

import tensorflow as tf
import time

def variable_restore():
    '''
    Variables:
    const123: Tensor variable

    Returns:
    final_result: the value of const123 after restoring
    '''
    const123 = tf.Variable([[2]], name="jjdaren")
    sess = tf.Session()
    # Add your code here to complete this task
    # ********** Begin *********#

    saver = tf.train.Saver()
    saver.restore(sess, "src/step3/save_dir1/store.ckpt")

    # ********** End **********#
    final_result = sess.run(const123)
    sess.close()
    # Return final_result
    return final_result

Level 4: Using Placeholders: A Simple Multiplication

import tensorflow as tf

def placer_holders():
    '''
    Variables:
    input1: placeholder
    input2: placeholder
    input1_value: matrix
    input2_value: matrix
    Returns:
    result: a Tensor value
    '''
    input1 = tf.placeholder(dtype="float32", shape=[1, 2])
    input2 = tf.placeholder(dtype="float32", shape=[2, 1])
    input1_value = [[2, 4]]
    input2_value = [[1], [2]]

    # Define the matrix multiplication op
    result = tf.matmul(input1, input2)

    # Create a TensorFlow session
    with tf.Session() as sess:
        # Run the multiplication op, feeding in the input values
        output = sess.run(result, feed_dict={input1: input1_value, input2: input2_value})

    # Return the result
    return output
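
Calling the function should produce the 1x1 matrix product (a sketch of the expected output):

print(placer_holders())  # [[10.]] since [[2, 4]] @ [[1], [2]] = [[2*1 + 4*2]]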

Linear Regression with PyTorch


Level 1: Initializing Parameters

import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms

# Download MNIST dataset
train_dataset = dsets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)

# Create DataLoader with batch_size=100, shuffle=True
batch_size = 100
shuffle = True
data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=shuffle)

# Get the data type of the loaded dataset
data_type = type(data_loader.dataset)

# Output the data type
print(data_type)
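
A hedged sketch of pulling one batch from the loader (shapes follow from batch_size=100 and MNIST's 28x28 grayscale images):

images, labels = next(iter(data_loader))
print(images.shape)  # torch.Size([100, 1, 28, 28])
print(labels.shape)  # torch.Size([100])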

Level 2: Building the Model and Defining the Loss Function and Optimizer

import torch.nn as nn
#/********** Begin *********/
# Linear regression model
class LinearRegression(nn.Module):
    def __init__(self):
        # Call Module's initializer
        super(LinearRegression, self).__init__()
        # Input and output are both one-dimensional
        self.linear = nn.Linear(1, 1)
    # Module dispatches to forward, which defines the forward pass and builds the network
    def forward(self, x):
        out = self.linear(x)
        return out
# Instantiate the model as the variable model
model = LinearRegression()
# Print the model's '.parameters' attribute
print(model.parameters)
#/********** End *********/


Level 3: Training the Model


import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable

import os
import sys
path = os.path.split(os.path.abspath(os.path.realpath(sys.argv[0])))[0] + os.path.sep

print(path)

# Hyperparameters
input_size = 1
output_size = 1
num_epochs = 60
learning_rate = 0.001

# Dataset
x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
                    [9.779], [6.182], [7.59], [2.167], [7.042],
                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)

y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
                    [3.366], [2.596], [2.53], [1.221], [2.827],
                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)

# Linear regression model
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
    
    def forward(self, x):
        out = self.linear(x)
        return out

model = LinearRegression(input_size, output_size)


# Create the output file output.txt
f = open(path + 'output.txt', 'w')
f.seek(0)
f.truncate()   # clear the file

#/********** Begin *********/
# Create the MSELoss loss function
criterion = nn.MSELoss()
# Create the SGD optimizer with learning rate lr=0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Train the model
for epoch in range(num_epochs):
    # Wrap the x_train and y_train data as Variables
    inputs = Variable(torch.from_numpy(x_train))
    targets = Variable(torch.from_numpy(y_train))
    # Forward pass
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    # Backward pass
    loss.backward()
    # Optimize
    optimizer.step()
    # Train for 60 epochs, writing the loss to stdout and the output file every 10 epochs
    # (loss.data[0] is the pre-0.4 PyTorch idiom; on newer versions use loss.item())
    if (epoch + 1) % 10 == 0:
        f.writelines('Epoch [%d/%d], Loss: %.4f \n' % (epoch + 1, num_epochs, loss.data[0]))
        print('Epoch [%d/%d], Loss: %.4f'
              % (epoch + 1, num_epochs, loss.data[0]))
f.close()

#/********** End *********/

# Save the model
torch.save(model, path + 'model.pkl')


Level 4: Validation

import torch
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from torch.autograd import Variable
import torch.nn as nn

import warnings
warnings.filterwarnings('ignore')

import os,sys
path = os.path.split(os.path.abspath(os.path.realpath(sys.argv[0])))[0] + os.path.sep
path = path[:-6]
print("validation path:", path)

# Linear Regression Model
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
    
    def forward(self, x):
        out = self.linear(x)
        return out

model = LinearRegression(1, 1)


x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
                    [9.779], [6.182], [7.59], [2.167], [7.042],
                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)

y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
                    [3.366], [2.596], [2.53], [1.221], [2.827],
                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)

# Load the entire saved model
model = torch.load(path + 'step3/model.pkl')

#/********** Begin *********/
# Switch the model to evaluation mode
model.eval()
# Compute predictions with the model
predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
print(predicted)
# Plot the original data and the fitted line
plt.plot(x_train, y_train, 'ro', label='Original data')
plt.plot(x_train, predicted, label='Fitted line')
plt.legend()
plt.savefig(path + "step4/outputimages/mylossTest.png")

#/********** End *********/



The K-means Clustering Algorithm


Level 1: Computing the Euclidean Distance

import numpy as np

def euclid_distance(x1, x2):
    """Compute the Euclidean distance.
    Parameters:
        x1 - numpy array
        x2 - numpy array
    Returns:
        distance - float, the Euclidean distance
    """
    distance = np.sqrt(np.sum(np.square(x1 - x2)))
    return distance
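
A quick sanity check against the 3-4-5 right triangle:

print(euclid_distance(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # 5.0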

Level 2: Finding the Nearest Cluster Center for a Sample

# -*- coding: utf-8 -*-
import numpy as np

def nearest_cluster_center(x, centers):
    """Find the cluster center nearest to the input sample.
    Parameters:
        x - numpy array
        centers - 2-D numpy array
    Returns:
        cindex - int, index of the nearest center; e.g. 3 means x is assigned to cluster 3
    """
    cindex = -1
    from distance import euclid_distance

    # Distance from the sample to every cluster center
    distances = [euclid_distance(x, center) for center in centers]

    # Index of the center with the smallest distance
    cindex = np.argmin(distances)

    return cindex

Level 3: Recomputing the Cluster Centers

import numpy as np

def estimate_centers(X, y_estimated, n_clusters):
    """Recompute each cluster center.
    Parameters:
        X - 2-D numpy array, the sample feature matrix
        y_estimated - numpy array, the estimated cluster index of each sample
        n_clusters - int, the configured number of clusters
    Returns:
        centers - 2-D numpy array, the cluster centers
    """
    centers = np.zeros((n_clusters, X.shape[1]))

    for i in range(n_clusters):
        cluster_indices = np.where(y_estimated == i)  # indices of the samples assigned to cluster i
        cluster_samples = X[cluster_indices]          # features of those samples
        centers[i] = np.mean(cluster_samples, axis=0) # mean of the cluster's samples

    return centers

Level 4: Evaluating the Clustering Result

# -*- coding: utf-8 -*-
import numpy as np

def acc(x1, x2):
    """Compute the accuracy.
    Parameters:
        x1 - numpy array
        x2 - numpy array
    Returns:
        value - float, the accuracy
    """
    value = np.mean(x1 == x2)
    return value

Level 5: Combining the Implemented Functions into the Full K-means Algorithm

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

from distance import euclid_distance
from estimate import estimate_centers
from loss import acc
from near import nearest_cluster_center

# The random seed affects the clustering result; fix it to make testing reproducible
np.random.seed(5)

# Read the dataset
dataset = pd.read_csv('./data/iris.csv')

# Extract the sample feature matrix (the CSV header line holds these literal column names)
# .as_matrix() is the legacy pandas API; on pandas >= 1.0 use .values instead
X = dataset[['150', '4', 'setosa', 'versicolor']].as_matrix()
y = np.array(dataset['virginica'])

# Read the run parameters
n_clusters, n_iteration = input().split(',')
n_clusters = int(n_clusters)    # number of cluster centers
n_iteration = int(n_iteration)  # number of iterations

# Pick random points as the initial cluster centers
point_index_lst = np.arange(len(y))
np.random.shuffle(point_index_lst)
cluster_centers = X[point_index_lst[:n_clusters]]

# Main algorithm loop
y_estimated = np.zeros(len(y))
for _ in range(n_iteration):
    # Assign each sample to its nearest cluster center
    for i, x in enumerate(X):
        y_estimated[i] = nearest_cluster_center(x, cluster_centers)

    # Recompute the cluster centers
    cluster_centers = estimate_centers(X, y_estimated, n_clusters)

print('%.3f' % acc(y_estimated, y))

Data Processing with pandas

Level 1: Filtering the Supermarket Sales Excel File by Product Category

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx", dtype={"商品编码": str, "商品条码": str})
writer = pd.ExcelWriter("xlscl/step1/类别销售.xlsx")
# Code begins

df_ = df["类别"].unique()  # the distinct product categories
for x in df_:
    dfdata = df.loc[df["类别"] == x]
    dfdata.to_excel(writer, sheet_name=x, index=False)

writer.save()

# Code ends

Level 2: Filtering the Bank Information Excel File by City

import pandas
writer = pandas.ExcelWriter('test/银行一线城市.xlsx')
data = pandas.read_excel("test/银行信息.xlsx", dtype={"银行编号": str})
# Code begins

dfdata1 = data.loc[data["城市"] == "北京市", ["银行编号", "名称"]]
dfdata1 = dfdata1.sort_values("银行编号")
dfdata1.to_excel(writer, sheet_name="北京市", index=False)

dfdata2 = data.loc[data["城市"] == "上海市", ["银行编号", "名称"]]
dfdata2 = dfdata2.sort_values("银行编号")
dfdata2.to_excel(writer, sheet_name="上海市", index=False)

dfdata3 = data.loc[data["城市"] == "广州市", ["银行编号", "名称"]]
dfdata3 = dfdata3.sort_values("银行编号")
dfdata3.to_excel(writer, sheet_name="广州市", index=False)

dfdata4 = data.loc[data["城市"] == "深圳市", ["银行编号", "名称"]]
dfdata4 = dfdata4.sort_values("银行编号")
dfdata4.to_excel(writer, sheet_name="深圳市", index=False)
'''
Equivalent loop-based version:
list_ = ["北京市", "上海市", "广州市", "深圳市"]
for x in list_:
    df = data.loc[data["城市"] == x, ["银行编号", "名称"]]
    df = df.sort_values("银行编号")
    df.to_excel(writer, sheet_name=x, index=False)
'''
# Code ends
writer.save()

Level 3: Filtering the GDP Excel File by Year

import pandas
writer = pandas.ExcelWriter('test/GDP分年份.xlsx')
data = pandas.read_excel("test/各省GDP.xlsx", dtype={"年份": str})
# Code begins

for i in range(2000, 2017):
    df = data.loc[data["年份"] == str(i), ["省份", "GDP"]]
    df = df.sort_values(by='GDP', ascending=False)
    df.to_excel(writer, sheet_name=str(i), index=False)
# Code ends
writer.save()

Level 4: Aggregating the Supermarket Sales Data by Category and by Day into a New Workbook

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx")
writer = pd.ExcelWriter('xlscl/step2/统计数据.xlsx')
# Code begins

df_type = df.groupby(["类别"])["合计金额"].sum()
df_date = df.groupby(["日期"])["合计金额"].sum()

df_type.sort_values(ascending=False, inplace=True)

df_type.to_excel(writer, sheet_name="类别统计")
df_date.to_excel(writer, sheet_name="日期统计")

writer.save()

# Code ends

Level 5: Splitting the Supermarket Sales Data into Per-Date Workbooks with Per-Category Sheets

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx", dtype={"商品编码": str, "商品条码": str})
# Code begins

df_dates = df["日期"].unique()  # list of dates
for date in df_dates:
    file_name = str(date).replace('-', '')[:8]
    writer = pd.ExcelWriter('./xlscl/step3/rq/' + file_name + '.xlsx')  # one workbook per date

    df_data = df.loc[df["日期"] == date]  # rows for this date
    df_types = df_data["类别"].unique()   # categories present on this date

    for cat in df_types:
        df_type0 = df_data.loc[df_data["类别"] == cat]  # rows of a single category
        df_type0.to_excel(writer, sheet_name=cat)

    df_type1 = df_data.groupby(["类别"])["合计金额"].sum()
    df_type1.sort_values(ascending=False, inplace=True)
    df_type1.to_excel(writer, sheet_name="类别统计", index_label="类别")
    writer.save()
# Code ends

😶‍🌫️
thanks
