Python from Zero to Mastery: 头歌 (EduCoder) Practical Training

For study and exchange only.

Lab 1: First Steps with NumPy

Level 1: Creating Arrays with NumPy
Code:

# import the numpy library
import numpy as np
# define the cnmda function
def cnmda(m, n):
    '''
    Create a NumPy array.
    Parameters:
        m: length of the first dimension
        n: length of the second dimension
    Returns:
        ret: a NumPy array
    '''

    ret = 0

    # Add code here to create the multi-dimensional array and assign it to ret
    #********** Begin *********#
    b = np.arange(n)          # one row: 0, 1, ..., n-1
    ret = np.array([b] * m)   # repeat the row m times -> shape (m, n)
    #********** End **********#
    return ret
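
A quick sanity check (a hypothetical call, not part of the graded answer):

print(cnmda(2, 3))
# [[0 1 2]
#  [0 1 2]]
print(cnmda(2, 3).shape)  # (2, 3)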

Level 2: Basic Operations on NumPy Arrays

# import the numpy library
import numpy as np
# define the opeadd function
def opeadd(m, b, n):
    '''
    Parameters:
    m: a NumPy array
    b: a list
    n: an index into the list
    Your task is to compute m + b[n]
    Returns:
    ret: a NumPy array
    '''
    ret = 0

    #********** Begin *********#
    ret = m + b[n]
    #********** End **********#

    return ret
# define the opemul function
def opemul(m, b, n):
    '''
    Parameters:
    m: a NumPy array
    b: a list
    n: an index into the list
    Your task is to compute m * b[n]
    Returns:
    ret: a NumPy array
    '''
    ret = 0

    #********** Begin *********#
    ret = m * b[n]
    #********** End **********#
    return ret
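
Both operations broadcast the scalar b[n] over the whole array (a hypothetical check):

m = np.array([1, 2, 3])
b = [10, 20]
print(opeadd(m, b, 1))  # [21 22 23]
print(opemul(m, b, 0))  # [10 20 30]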

Level 3: Slicing and Indexing NumPy Arrays

# import the numpy library
import numpy as np
# define the ce function
def ce(a, m, n):
    '''
    Parameters:
    a: a NumPy array
    m: index of the m-th sub-array
    n: index bounding the first n elements of the m-th sub-array
    Returns:
    ret: a NumPy array
    '''
    ret = 0
    # Add slicing code here: select the first n elements of the m-th sub-array of a, and assign the result to ret
    #********** Begin *********#
    ret = a[m, :n]
    #********** End **********#
    return ret
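
For example (a hypothetical call):

a = np.arange(12).reshape(3, 4)
print(ce(a, 1, 2))  # [4 5] -- row 1, first 2 elements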

Level 4: Stacking NumPy Arrays

# import the numpy library
import numpy as np
# define the varray function
def varray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a NumPy array
    '''
    ret = 0
    # Add code here to stack the arrays vertically and assign the result to ret
    #********** Begin *********#
    ret = np.vstack((m, n))
    #********** End **********#
    return ret
# define the darray function
def darray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a NumPy array
    '''
    ret = 0
    # Add code here to stack the arrays depth-wise and assign the result to ret
    #********** Begin *********#
    ret = np.dstack((m, n))
    #********** End **********#
    return ret
# define the harray function
def harray(m, n):
    '''
    Parameters:
    m: the first array
    n: the second array
    Returns:
    ret: a NumPy array
    '''
    ret = 0
    # Add code here to stack the arrays horizontally and assign the result to ret
    #********** Begin *********#
    ret = np.hstack((m, n))
    #********** End **********#
    return ret
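
The three stacking modes differ only in which axis grows (hypothetical shapes):

m = np.zeros((2, 3))
n = np.ones((2, 3))
print(varray(m, n).shape)  # (4, 3)    -- rows stacked
print(harray(m, n).shape)  # (2, 6)    -- columns stacked
print(darray(m, n).shape)  # (2, 3, 2) -- a new depth axis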

Level 5: Splitting NumPy Arrays

# import the numpy library
import numpy as np
# define the vsarray function
def vsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections to split into
    Returns:
    ret: a list of NumPy arrays
    '''
    ret = 0
    # Add code here to split the array vertically and assign the result to ret
    #********** Begin *********#
    ret = np.vsplit(m, n)
    #********** End **********#
    return ret
# define the dsarray function
def dsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections to split into
    Returns:
    ret: a list of NumPy arrays
    '''
    ret = 0
    # Add code here to split the array depth-wise and assign the result to ret
    #********** Begin *********#
    ret = np.dsplit(m, n)
    #********** End **********#
    return ret
# define the hsarray function
def hsarray(m, n):
    '''
    Parameters:
    m: the array to split
    n: the number of equal sections to split into
    Returns:
    ret: a list of NumPy arrays
    '''
    ret = 0
    # Add code here to split the array horizontally and assign the result to ret
    #********** Begin *********#
    ret = np.hsplit(m, n)
    #********** End **********#
    return ret
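
Splitting is the inverse of stacking; note that np.dsplit requires an array with at least 3 dimensions. A hypothetical check:

a = np.arange(16).reshape(4, 4)
print([p.shape for p in vsarray(a, 2)])  # [(2, 4), (2, 4)]
print([p.shape for p in hsarray(a, 2)])  # [(4, 2), (4, 2)]
d = np.arange(8).reshape(2, 2, 2)
print([p.shape for p in dsarray(d, 2)])  # [(2, 2, 1), (2, 2, 1)]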

Lab 2: First Steps with Pandas

Level 1: Getting to Know the Series Object

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

def create_series():
    '''
    Returns:
    series_a: a Series
    series_b: a Series
    dict_a:  a dict
    '''
    # Add code here to complete the task
    # ********** Begin *********#
    series_a = Series([1, 2, 5, 7], index=['nu', 'li', 'xue', 'xi'])
    dict_a = {'ting': 1, 'shuo': 2, 'du': 32, 'xie': 44}
    series_b = Series(dict_a)  # a Series can be built directly from a dict
    # ********** End **********#

    # return series_a, dict_a, series_b
    return series_a, dict_a, series_b
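
A Series behaves like a fixed-order dict: values are retrieved by index label (a hypothetical check):

series_a, dict_a, series_b = create_series()
print(series_a['xue'])  # 5
print(series_b['du'])   # 32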

Level 2: Getting to Know the DataFrame Object

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

def create_dataframe():
    '''
    Returns:
    df1: a DataFrame
    '''
    # Add code here to complete the task
    # ********** Begin *********#
    dictionary = {'states': ['0hio', '0hio', '0hio', 'Nevada', 'Nevada'],
                  'years': [2000, 2001, 2002, 2001, 2002],
                  'pops': [1.5, 1.7, 3.6, 2.4, 2.9]}
    df1 = DataFrame(dictionary, index=['one', 'two', 'three', 'four', 'five'])
    df1['new_add'] = [7, 4, 5, 8, 2]  # assigning a list adds a new column
    # ********** End **********#

    # return df1
    return df1

Level 3: Reading CSV Data

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd
def read_csv_data():
    '''
    Returns:
    df1: a DataFrame
    length1: an int
    '''
    # Add code here to complete the task
    # ********** Begin *********#
    df1 = pd.read_csv('test3/uk_rain_2014.csv', header=0)
    # rename the columns to cleaner names
    df1.columns = ['water_year', 'rain_octsep', 'outflow_octsep', 'rain_decfeb', 'outflow_decfeb', 'rain_junaug', 'outflow_junaug']
    length1 = len(df1)
    # ********** End **********#
    # return df1, length1
    return df1, length1

Level 4: Basic Operations: Sorting

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd
def sort_gate():
    '''
    Returns:
    s2: a Series
    d2: a DataFrame
    '''

    # s1 is a Series, d1 is a DataFrame
    s1 = Series([4, 3, 7, 2, 8], index=['z', 'y', 'j', 'i', 'e'])
    d1 = DataFrame({'e': [4, 2, 6, 1], 'f': [0, 5, 4, 2]})

    # Add code here to complete the task
    # ********** Begin *********#
    s2 = s1.sort_index()         # sort the Series by its index labels
    d2 = d1.sort_values(by='f')  # sort the DataFrame by column 'f'
    # ********** End **********#

    # return s2, d2
    return s2, d2

Level 5: Basic Operations: Deletion

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import numpy as np
import pandas as pd

def delete_data():
    '''
    Returns:
    s2: a Series
    d2: a DataFrame
    '''

    # s1 is a Series, d1 is a DataFrame
    s1 = Series([5, 2, 4, 1], index=['v', 'x', 'y', 'z'])
    d1 = DataFrame(np.arange(9).reshape(3, 3), columns=['xx', 'yy', 'zz'])
    # Add code here to complete the task
    # ********** Begin *********#
    s2 = s1.drop('z')             # drop the entry labelled 'z'
    d2 = d1.drop(['yy'], axis=1)  # drop the column 'yy'
    # ********** End **********#

    # return s2, d2
    return s2, d2

Level 6: Basic Operations: Arithmetic

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import numpy as np
import pandas as pd

def add_way():
    '''
    Returns:
    df3: a DataFrame
    '''

    # df1 and df2 are DataFrames
    df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
    df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

    # Add code here to complete the task
    # ********** Begin *********#
    df3 = df1.add(df2, fill_value=4)  # cells missing on one side are treated as 4
    # ********** End **********#

    # return df3
    return df3
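
fill_value substitutes for cells that exist in only one of the two frames; cells missing from both stay NaN. For instance, df1 has no column 'e', so df3.loc[0, 'e'] is df2.loc[0, 'e'] + 4 = 4.0 + 4 = 8.0 (a hypothetical check):

df3 = add_way()
print(df3.loc[0, 'e'])  # 8.0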

Level 7: Basic Operations: Removing Duplicates

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

def delete_duplicated():
    '''
    Returns:
    df2: a DataFrame
    '''

    # df1 is a DataFrame
    df1 = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})
    # Add code here to complete the task
    # ********** Begin *********#
    df2 = df1.drop_duplicates()  # keep only the first occurrence of each row
    # ********** End **********#

    # return df2
    return df2

Level 8: Hierarchical Indexing

# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
def suoying():
    '''
    Returns:
    d1: a DataFrame
    '''
    # s1 is a Series with a two-level (hierarchical) index
    s1 = Series(np.random.randn(10),
           index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
    # Add code here to complete the task
    # ********** Begin *********#
    d1 = s1.unstack()  # pivot the inner index level into columns
    # ********** End **********#

    # return d1
    return d1

suoying()
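
unstack() turns the inner index level into columns, filling index combinations that never appeared with NaN: here d1 has shape (4, 3) with rows a-d and columns 1-3, and positions ('c', 3) and ('d', 1) are NaN (a hypothetical check):

print(suoying().shape)  # (4, 3)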

Lab 3: Learning and Using the Python Machine-Learning Package Scikit-Learn

Level 1: Importing a Dataset with scikit-learn

from sklearn import datasets
def getIrisData():

    '''
    Load the Iris dataset.

    Returns:
    X - the first 5 rows of training features
    y - the class labels of those 5 rows
    X_shape - the 2-D shape of the full feature matrix
    '''
    # initialization
    X = []
    y = []
    X_shape = ()

    #   Add implementation code here   #
    #********** Begin *********#
    iris = datasets.load_iris()
    X = iris.data[:5]
    y = iris.target[:5]
    X_shape = iris.data.shape
    #********** End **********#

    return X, y, X_shape

Level 2: Data Preprocessing: Standardization

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
'''
Data description:
The data contains 20,640 observations on 9 variables.
This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.
dataset : dict-like object with the following attributes:
    dataset.data : ndarray, shape [20640, 8]
        Each row corresponding to the 8 feature values in order.
    dataset.target : numpy array of shape (20640,)
        Each value corresponds to the average house value in units of 100,000.
    dataset.feature_names : array of length 8
        Array of ordered feature names used in the dataset.
    dataset.DESCR : string
        Description of the California housing dataset.
'''
dataset = fetch_california_housing("./step4/")
X_full, y = dataset.data, dataset.target
# extract two of the feature columns
X = X_full[:, [0, 5]]
def getMinMaxScalerValue():
    '''
    Apply a MinMaxScaler transform to the feature data X and return the first 5 rows.
    Returns:
    X_first5 - the transformed data
    '''
    X_first5 = []
    #   Add implementation code here   #
    # ********** Begin *********#
    X_first5 = MinMaxScaler().fit_transform(X)
    X_first5 = X_first5[:5]

    # ********** End **********#
    return X_first5
def getScaleValue():
    '''
        Apply a plain scale() standardization to the target data y and return the first 5 values.
        Returns:
        y_first5 - the transformed data
        '''
    y_first5 = []
    #   Add implementation code here   #
    # ********** Begin *********#
    y_first5 = scale(y)
    y_first5 = y_first5[:5]
    # ********** End **********#
    return y_first5
def getStandardScalerValue():
    '''
    Fit a StandardScaler on the feature data X and return the fitted mean and scale.
    Returns:
    X_mean - the per-feature mean
    X_scale - the per-feature scaling factor
    '''
    X_mean = None
    X_scale = None
    #   Add implementation code here   #
    #********** Begin *********#
    scaler = StandardScaler().fit(X)  # renamed from `scale` to avoid shadowing the imported function
    X_mean = scaler.mean_
    X_scale = scaler.scale_

    #********** End **********#
    return X_mean, X_scale
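
StandardScaler transforms each feature as (x - mean_) / scale_, so the transformed data should have mean ~0 and standard deviation ~1 per column (a hypothetical check):

X_mean, X_scale = getStandardScalerValue()
X_std = (X - X_mean) / X_scale
print(X_std.mean(axis=0))  # approximately [0. 0.]
print(X_std.std(axis=0))   # approximately [1. 1.]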

Level 3: Text Feature Extraction

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


categories = [
    'alt.atheism',
    'talk.religion.misc',
]

# load the news data for these categories; 857 documents in total
data = fetch_20newsgroups("./step5/",subset='train', categories=categories)
X = data.data

def transfer2CountVector():
    '''
    Use CountVectorizer to build the feature vocabulary; return the vocabulary size
    and the tokenization of a test string.

    Returns:
    vocab_len - scalar, vocabulary size
    tokenizer_list - list, the result of tokenizing test_str
    '''

    vocab_len = 0

    test_str = "what's your favorite programming language?"
    tokenizer_list = []

    #   Add implementation code here   #
    # ********** Begin *********#
    vectorizer = CountVectorizer()
    vectorizer.fit(X)
    vocab_len = len(vectorizer.vocabulary_)

    # build_analyzer() returns the preprocessing + tokenization callable
    analyze = vectorizer.build_analyzer()
    tokenizer_list = analyze(test_str)

    # ********** End **********#

    return vocab_len, tokenizer_list

def transfer2TfidfVector():
    '''
        Use TfidfVectorizer to fit a vectorizer on X, then apply it to new test data.

        TfidfVectorizer() parameter settings:
        min_df = 2, stop_words="english"

        test_data - the raw data to transform

        Returns:
        transfer_test_data - a 2-D ndarray
        '''

    test_data = ['Once again, to not believe in God is different than saying\n>I BELIEVE that God does not exist. I still maintain the position, even\n>after reading the FAQs, that strong atheism requires faith.\n>\n \nNo it in the way it is usually used. In my view, you are saying here that\ndriving a car requires faith that the car drives.\n \nFor me it is a conclusion, and I have no more faith in it than I have in the\npremises and the argument used.\n \n \n>But first let me say the following.\n>We might have a language problem here - in regards to "faith" and\n>"existence". I, as a Christian, maintain that God does not exist.\n>To exist means to have being in space and time. God does not HAVE\n>being - God IS Being. Kierkegaard once said that God does not\n>exist, He is eternal. With this said, I feel it\'s rather pointless\n>to debate the so called "existence" of God - and that is not what\n>I\'m doing here. I believe that God is the source and ground of\n>being. When you say that "god does not exist", I also accept this\n>statement - but we obviously mean two different things by it. However,\n>in what follows I will use the phrase "the existence of God" in it\'s\n>\'usual sense\' - and this is the sense that I think you are using it.\n>I would like a clarification upon what you mean by "the existence of\n>God".\n>\n \nNo, that\'s a word game. The term god is used in a different way usually.\nWhen you use a different definition it is your thing, but until it is\ncommonly accepted you would have to say the way I define god is ... and\nthat does not exist, it is existence itself, so I say it does not exist.\n \nInterestingly, there are those who say that "existence exists" is one of\nthe indubitable statements possible.\n \nFurther, saying god is existence is either a waste of time, existence is\nalready used and there is no need to replace it by god, or you are implying\nmore with it, in which case your definition and your argument so far\nare incomplete, making it a fallacy.\n \n \n(Deletion)\n>One can never prove that God does or does not exist. When you say\n>that you believe God does not exist, and that this is an opinion\n>"based upon observation", I will have to ask "what observtions are\n>you refering to?" There are NO observations - pro or con - that\n>are valid here in establishing a POSITIVE belief.\n(Deletion)\n \nWhere does that follow? Aren\'t observations based on the assumption\nthat something exists?\n \nAnd wouldn\'t you say there is a level of definition that the assumption\n"god is" is meaningful. If not, I would reject that concept anyway.\n \nSo, where is your evidence for that "god is" is meaningful at some level?\n   Benedikt\n']
    transfer_test_data = None

    #   Add implementation code here   #
    # ********** Begin *********#
    tfidf_vector = TfidfVectorizer(min_df=2, stop_words="english")
    tfidf_vector.fit(X)
    transfer_test_data = tfidf_vector.transform(test_data).toarray()
    # ********** End **********#

    return transfer_test_data

Level 4: Classifying the digits Dataset with scikit-learn's SVM Classifier

import matplotlib.pyplot as plt

# import the dataset and classifier packages
from sklearn import datasets, svm, metrics

# load the digits dataset
digits = datasets.load_digits()
n_samples = len(digits.data)
data = digits.data

# use the first half as training data, the second half as test data
train_data,train_target = data[:n_samples // 2],digits.target[:n_samples // 2]
test_data,test_target = data[n_samples // 2:],digits.target[n_samples // 2:]


def createModelandPredict():
    '''
    Build the classification model and predict on the test data.

    Returns:
    predicted - predicted class labels for the test data
    '''
    predicted = None
    #   Add implementation code here   #
    #********** Begin *********#
    classifier = svm.SVC()
    classifier.fit(train_data, train_target)
    predicted = classifier.predict(test_data)

    #********** End **********#

    return predicted
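
Since metrics is already imported, the prediction quality can be inspected with a hypothetical follow-up:

predicted = createModelandPredict()
print(metrics.accuracy_score(test_target, predicted))
print(metrics.classification_report(test_target, predicted))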

Level 5: Model Persistence

# import the dataset and classifier packages
from sklearn import datasets, svm, metrics
import pickle

# load the digits dataset
digits = datasets.load_digits()
n_samples = len(digits.data)
data = digits.data

# use the first half as training data, the second half as test data
train_data,train_target = data[:n_samples // 2],digits.target[:n_samples // 2]
test_data,test_target = data[n_samples // 2:],digits.target[n_samples // 2:]


def createModel():
    classifier = svm.SVC()
    classifier.fit(train_data,train_target)
    return classifier

local_file = 'dumpfile'
def dumpModel():
    '''
    Persist the classification model.

    '''
    clf = createModel()
    # Complete the model-saving statements here #
    #********** Begin *********#
    f_model = open(local_file, 'wb')

    pickle.dump(clf, f_model)  # serialize the fitted model to disk
    f_model.close()

    #********** End **********#

def loadModel():
    '''
    Load the model, use it to predict on the test data, and return the predictions.

    Returns:
    predicted - the model's predictions
    '''
    predicted = None
    # Complete the model-loading statements here, then predict on the test data #
    #********** Begin *********#
    fw = open(local_file, 'rb')
    classifier = pickle.loads(fw.read())  # deserialize the model
    fw.close()
    predicted = classifier.predict(test_data)
    #********** End **********#

    return predicted
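
Equivalently, a sketch with safer file handling: the open/close pairs can be replaced with context managers, and pickle.load reads straight from the file object:

with open(local_file, 'wb') as f_model:
    pickle.dump(clf, f_model)

with open(local_file, 'rb') as fw:
    classifier = pickle.load(fw)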

Level 6: Model Evaluation: Quantifying Prediction Quality

from sklearn.metrics import accuracy_score,precision_score,f1_score,precision_recall_fscore_support
from sklearn.svm import LinearSVC,SVC
def bin_evaluation(X_train, y_train, X_test, y_test):
    '''
    Evaluate a binary classifier.
    :param X_train: training data
    :param y_train: training labels
    :param X_test: test data
    :param y_test: true labels of the test data
    :return:
    correct_num - number of correctly classified samples
    prec - precision for the positive class
    recall - recall for the positive class
    f_score - F-score for the positive class
    '''
    classifier = LinearSVC()
    correct_num, prec, recall, fscore = None, None, None, None
    #   Add implementation code here   #
    # ********** Begin *********#
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # normalize=False makes accuracy_score return a count instead of a ratio
    correct_num = accuracy_score(y_test, y_pred, normalize=False)
    prec, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)

    return correct_num, prec, recall, fscore
    # ********** End **********#
def multi_evaluation(X_train,y_train,X_test,y_test):
    '''
    Evaluate a multi-class classifier.
    :param X_train: training data
    :param y_train: training labels
    :param X_test: test data
    :param y_test: true labels of the test data
    :return:
    acc - model accuracy
    prec - precision
    f_score - F-score
    '''
    # initialization
    acc, prec, f_score = None, None, None
    classifier = SVC(kernel='linear')
    #   Add implementation code here   #
    # ********** Begin *********#
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # macro averaging: unweighted mean of the per-class scores
    prec, recall, f_score, support = precision_recall_fscore_support(y_test, y_pred, average='macro')

    return acc, prec, f_score
    # ********** End **********#

Lab 4: Understanding Basic Machine-Learning Concepts: Starting from Movie-Rating Prediction

Level 1: Computing Basic Statistics of the Dataset

# -*- coding:utf-8 -*-

def stat_data(train_data):
    """Compute the number of users and movies, the number of ratings, and the mean, max, and min rating.
    Parameters:
        train_data - a Pandas DataFrame with four columns 'user','movie','rating','timestamp'; the training set
    Returns:
        num_user - int, number of users
        num_movie - int, number of movies
        num_rating - int, number of ratings
        avg_rating - float, mean rating
        max_rating - float, maximum rating
        min_rating - float, minimum rating
    """
    num_user = 0
    num_movie = 0
    num_rating = 0
    avg_rating = 0
    max_rating = 0
    min_rating = 0
    # Add implementation code here
    #********** Begin *********#
    num_user = train_data['user'].nunique()    # count of distinct users
    num_movie = train_data['movie'].nunique()  # count of distinct movies
    num_rating = train_data['rating'].size
    avg_rating = train_data['rating'].mean()
    max_rating = train_data['rating'].max()
    min_rating = train_data['rating'].min()
    #**********  End  *********#
    return num_user, num_movie, num_rating, avg_rating, max_rating, min_rating
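
A hypothetical check on a tiny hand-made frame:

import pandas as pd
demo = pd.DataFrame({'user': [1, 1, 2], 'movie': [10, 20, 10],
                     'rating': [4.0, 3.0, 5.0], 'timestamp': [0, 0, 0]})
print(stat_data(demo))  # (2, 2, 3, 4.0, 5.0, 3.0)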

Level 2: Exploring the Dataset Further

# -*- coding:utf-8 -*-

def avg_rating_of_users_movies(data):
    """Compute the average rating of each user and of each movie.
    Parameters:
        data - a Pandas DataFrame with four columns 'user','movie','rating','timestamp'; the training set
    Returns:
        user2avg_r - a Pandas DataFrame with a 'rating' column
        movie2avg_r - a Pandas DataFrame with a 'rating' column
    """
    user2avg_r = ''
    movie2avg_r = ''
    # Add implementation code here
    #********** Begin *********#
    user2avg_r = data.groupby('user')['rating'].mean().reset_index(name='rating')
    movie2avg_r = data.groupby('movie')['rating'].mean().reset_index(name='rating')
    #**********  End  *********#
    return user2avg_r, movie2avg_r

def top_10_user_movie_on_avg_rating(user2avg_r, movie2avg_r):
    """Find the 10 users and the 10 movies with the highest average ratings.
    Parameters:
        user2avg_r - a Pandas DataFrame with a 'rating' column
        movie2avg_r - a Pandas DataFrame with a 'rating' column
    Returns:
        top10_users - list of ints, user IDs; e.g. [3,4,5,6] means the top-4 user IDs are 3,4,5,6
        top10_movies - list of ints, movie IDs; e.g. [30,40,50,60] means the top-4 movie IDs are 30,40,50,60
    """
    top10_users = []
    top10_movies = []
    # Add implementation code here
    #********** Begin *********#
    top10_users = user2avg_r.sort_values(by='rating', ascending=False).head(10)['user'].tolist()
    top10_movies = movie2avg_r.sort_values(by='rating', ascending=False).head(10)['movie'].tolist()
    #**********  End  *********#
    return top10_users, top10_movies

Level 3: Implementing the Baseline Rating-Prediction Algorithm

def learn(train_data, N, M):
    """Learn the model from the training data.
    Parameters:
        train_data - a Pandas DataFrame with four columns 'user','movie','rating','timestamp'; the training set
        N - int, number of users
        M - int, number of movies
    Returns:
        g - the global average rating over all users and movies
        alpha - float array of user rating-bias parameters; e.g. alpha[9] is user 9's bias
        beta - float array of movie rating-bias parameters; e.g. beta[90] is movie 90's bias
    """
    # import the module from Step 2
    from stat_rating import avg_rating_of_users_movies
    import numpy as np

    # model parameters
    g = 0  # model parameter: global average rating over all users and movies
    alpha = np.zeros(N)  # model parameter: per-user rating bias
    beta = np.zeros(M)   # model parameter: per-movie rating bias

    # compute the global average rating g
    g = np.mean(train_data['rating'])

    # compute the user bias parameters alpha
    for i in range(N):
        user_ratings = train_data[train_data['user'] == i]['rating']

        # handle users with no ratings
        if len(user_ratings) == 0:
            alpha[i] = 0 - g
        else:
            alpha[i] = np.mean(user_ratings) - g

    # compute the movie bias parameters beta
    for j in range(M):
        movie_ratings = train_data[train_data['movie'] == j]['rating']

        # handle movies with no ratings
        if len(movie_ratings) == 0:
            beta[j] = 0 - g
        else:
            beta[j] = np.mean(movie_ratings) - g

    return g, alpha, beta
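
The two Python loops scan the whole frame once per user and per movie. A hypothetical vectorized sketch with groupby gives the same biases for IDs that appear in the data (assuming IDs run 0..N-1 and 0..M-1; absent users/movies would keep their initial 0 instead of -g):

user_means = train_data.groupby('user')['rating'].mean()
alpha[user_means.index] = user_means.values - g
movie_means = train_data.groupby('movie')['rating'].mean()
beta[movie_means.index] = movie_means.values - g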


Level 4: Making Predictions with the Model

def predict(g, alpha, beta, test_data):
    """Predict users' ratings of movies.
    Parameters:
        g - float, the global average-rating parameter
        alpha - float array of user bias parameters
        beta - float array of movie bias parameters
        test_data - a Pandas DataFrame with two columns 'user','movie'; the test set
    Returns:
        ret - list of floats, one predicted rating per row of test_data
    """
    ret = []
    N = len(alpha)
    M = len(beta)

    # predict a rating for each test row
    for index, row in test_data.iterrows():
        user = row['user']
        movie = row['movie']

        # guard against out-of-range IDs
        if user >= N or movie >= M:
            ret.append(g)
        else:
            # prediction formula: rating = global mean g + user bias alpha + movie bias beta
            prediction = g + alpha[user] + beta[movie]
            ret.append(prediction)

    return ret


Level 5: Evaluating the Machine-Learning Model

import numpy as np

def RMSE(predicted_rating, true_rating):
    """Compute the RMSE.
    Parameters:
        predicted_rating - list, predicted ratings
        true_rating - list, true ratings
    Returns:
        rmse - float, the RMSE
    """
    rmse = 0

    # convert the lists to numpy arrays
    predicted_rating = np.array(predicted_rating)
    true_rating = np.array(true_rating)

    # sum of squared errors
    squared_error = np.square(predicted_rating - true_rating).sum()

    # root of the mean squared error
    mean_squared_error = squared_error / len(predicted_rating)
    rmse = np.sqrt(mean_squared_error)

    return rmse
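
A hypothetical spot check: with predictions [3.0, 4.0] against truth [3.0, 2.0], the squared errors are 0 and 4, so RMSE = sqrt(4/2) ≈ 1.414:

print(RMSE([3.0, 4.0], [3.0, 2.0]))  # 1.4142135623730951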


Level 6: Estimating Model Parameters with Gradient Descent

# -*- coding:utf-8 -*-

def gradient(u, m, r, g, alpha, beta):
    """Compute the gradients of the bias parameters for one rating.
    Parameters:
        u - int, user ID
        m - int, movie ID
        r - int, the observed rating
        g - float, the global average-rating parameter
        alpha - float array of user bias parameters
        beta - float array of movie bias parameters
    Returns:
        grad_alpha - float, the gradient with respect to user u's bias alpha[u]
        grad_beta - float, the gradient with respect to movie m's bias beta[m]
    """
    grad_alpha = 0
    grad_beta = 0
    # Add implementation code here
    #********** Begin *********#
    # predicted rating of user u for movie m
    predict_rating = g + alpha[u] + beta[m]

    # gradients of the squared error (r - predict_rating)**2
    grad_alpha = -2 * (r - predict_rating)
    grad_beta = -2 * (r - predict_rating)
    #**********  End  *********#

    return grad_alpha, grad_beta
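
The squared error for one observation is e = (r - (g + alpha[u] + beta[m]))**2, so de/d alpha[u] = -2 * (r - prediction); beta[m] enters the prediction the same way, so its gradient is identical. A hypothetical numeric check:

import numpy as np
alpha, beta = np.array([0.5]), np.array([-0.2])
print(gradient(0, 0, 4, 3.0, alpha, beta))
# approximately (-1.4, -1.4): prediction 3.3, error 0.7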
    
def learn(train_data, N, M, steps, tao, g):
    """Learn the model.
    Parameters:
        train_data - a Pandas DataFrame with four columns 'user','movie','rating','timestamp'; the training set
        N - int, number of users
        M - int, number of movies
        steps - int, number of iterations
        tao - float, learning rate
        g - float, the global average movie rating
    Returns:
        alpha - float array of user bias parameters; e.g. alpha[9] is user 9's bias
        beta - float array of movie bias parameters; e.g. beta[90] is movie 90's bias
    """
    import numpy as np

    # initialize the bias parameters alpha and beta to a small constant
    alpha = np.zeros(N) + 0.01
    beta = np.zeros(M) + 0.01

    # iteration loop
    for step in range(steps):
        for row in train_data.itertuples():
            u = row.user
            m = row.movie
            r = row.rating
            # Add implementation code here
            #********** Begin *********#
            # compute the gradients
            grad_alpha, grad_beta = gradient(u, m, r, g, alpha, beta)

            # take a gradient-descent step
            alpha[u] -= tao * grad_alpha
            beta[m] -= tao * grad_beta
            #**********  End  *********#

    return alpha, beta

Lab 5: First Steps with TensorFlow

Level 1: Creating and Initializing Variables

import tensorflow as tf

def variables_create():
    '''
    Returns:
    weights: a Tensor variable
    '''
    # Add code here to complete the task
    # ********** Begin *********#
    weights = tf.Variable(tf.random_normal([784, 200]), name="big_weights")
    init_op = tf.global_variables_initializer()  # op that initializes all variables when run
    # ********** End **********#
    # return weights
    return weights
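
Under TensorFlow 1.x (which this exercise assumes), a variable has no value until the initializer op is run inside a session (a hypothetical check):

weights = variables_create()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(weights).shape)  # (784, 200)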

Level 2: Saving Variables

import tensorflow as tf
import time

def variables_save():
    '''
    Variables:
    weights: Tensor variable
    biase:   Tensor variable
    const123: Tensor variable
    '''
    weights = tf.Variable(tf.random_normal([784, 200]), name="weights")
    biase = tf.Variable(tf.zeros([1]), name='biases')
    const123 = tf.Variable(([[3]]), name="jjdaren")
    init_op = tf.global_variables_initializer()
    sess = tf.Session()
    # Add code here to complete the task
    # ********** Begin *********#
    saver = tf.train.Saver()  # create the Saver object
    sess.run(init_op)         # initialize the variables
    save_dir = 'save_dir/'
    saver.save(sess, save_dir + 'store.ckpt')  # save the session's variables to a checkpoint

    # ********** End **********#

    sess.close()

Level 3: Restoring Variables

import tensorflow as tf
import time

def variable_restore():
    '''
    Variables:
    const123: Tensor variable

    Returns:
    final_result: a Tensor value
    '''
    const123 = tf.Variable(([[2]]), name="jjdaren")
    # Add code here to complete the task
    # ********** Begin *********#
    saver = tf.train.Saver()  # create the Saver object
    with tf.Session() as sess:
        saver.restore(sess, "src/step3/save_dir1/store.ckpt")  # restore the checkpointed value

        final_result = sess.run(const123)  # read back the restored variable

    # ********** End **********#
    # return final_result
    return final_result

Level 4: Using Placeholders: A Simple Multiplication

import tensorflow as tf
import time

def placer_holders():
    '''
    Variables:
    input1: placeholder
    input2: placeholder
    input1_value: matrix
    input2_value: matrix
    Returns:
    result: a Tensor value
    '''
    input1 = tf.placeholder(dtype="float32", shape=[1, 2])
    input2 = tf.placeholder(dtype="float32", shape=[2, 1])
    input1_value = [[2, 4]]
    input2_value = [[1], [2]]
    # Add code here to complete the task
    # ********** Begin *********#
    result = tf.matmul(input1, input2)  # build the matrix-multiplication op

    with tf.Session() as sess:
        # run the graph, feeding concrete values into the placeholders
        result_value = sess.run(result, feed_dict={input1: input1_value, input2: input2_value})

    # ********** End **********#
    # return the result
    return result_value
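
Here the 1x2 by 2x1 product collapses to a single number: 2*1 + 4*2 = 10, so placer_holders() returns [[10.]].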

Lab 6: Linear Regression with PyTorch

Level 1: Initializing Parameters

import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms

#/********** Begin *********/
# download the MNIST dataset
Mnist_dataset = dsets.MNIST(root='./data',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)
# create a DataLoader variable data_loader with batch_size=100, shuffle=True
data_loader = torch.utils.data.DataLoader(dataset=Mnist_dataset,
                                          batch_size=100,
                                          shuffle=True)
# print the type of the data held by data_loader
print(type(data_loader.dataset))
#/********** End *********/
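
A hypothetical way to peek at one batch and confirm the loader's shapes:

images, labels = next(iter(data_loader))
print(images.size())  # torch.Size([100, 1, 28, 28])
print(labels.size())  # torch.Size([100])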


Level 2: Building the Model, Defining the Loss and Optimizer

import torch.nn as nn
#/********** Begin *********/
# linear regression model
class LinearRegression(nn.Module):
    def __init__(self):
        # call Module's initializer
        super(LinearRegression, self).__init__()
        # both input and output are one-dimensional
        self.linear = nn.Linear(1, 1)
    # calling the module invokes forward, which runs the forward pass and builds the network
    def forward(self, x):
        out = self.linear(x)
        return out
# instantiate the model as the variable model
model = LinearRegression()
# print the model's .parameters attribute
print(model.parameters)
#/********** End *********/


Level 3: Training the Model


import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable

import os
import sys
path = os.path.split(os.path.abspath(os.path.realpath(sys.argv[0])))[0] + os.path.sep

print(path)

# hyper-parameters
input_size = 1
output_size = 1
num_epochs = 60
learning_rate = 0.001

# dataset
x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
                    [9.779], [6.182], [7.59], [2.167], [7.042],
                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)

y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
                    [3.366], [2.596], [2.53], [1.221], [2.827],
                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)

# linear regression model
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

model = LinearRegression(input_size, output_size)


# create the output file output.txt
f = open(path + 'output.txt', 'w')
f.seek(0)
f.truncate()   # empty the file

#/********** Begin *********/
# create the MSELoss loss function
criterion = nn.MSELoss()
# create an SGD optimizer with learning rate lr = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# train the model
for epoch in range(num_epochs):
    # wrap the x_train, y_train data as Variables
    inputs = Variable(torch.from_numpy(x_train))
    targets = Variable(torch.from_numpy(y_train))
    # forward pass
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    # backward pass
    loss.backward()
    # optimize
    optimizer.step()
    # train for 60 epochs, logging the loss every 10 epochs to stdout and to the file
    # (loss.data[0] is the pre-0.4 PyTorch idiom; on PyTorch >= 0.4 use loss.item())
    if (epoch+1) % 10 == 0:
        f.writelines('Epoch [%d/%d], Loss: %.4f \n' % (epoch+1, num_epochs, loss.data[0]))
        print('Epoch [%d/%d], Loss: %.4f'
              % (epoch+1, num_epochs, loss.data[0]))
f.close()

#/********** End *********/

# save the model
torch.save(model, path + 'model.pkl')


Level 4: Validation

import torch
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from torch.autograd import Variable
import torch.nn as nn

import warnings
warnings.filterwarnings('ignore')

import os,sys
path = os.path.split(os.path.abspath(os.path.realpath(sys.argv[0])))[0] + os.path.sep
path = path[:-6]
print("validation path:", path)

# Linear Regression Model
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

model = LinearRegression(1, 1)


x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168],
                    [9.779], [6.182], [7.59], [2.167], [7.042],
                    [10.791], [5.313], [7.997], [3.1]], dtype=np.float32)

y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573],
                    [3.366], [2.596], [2.53], [1.221], [2.827],
                    [3.465], [1.65], [2.904], [1.3]], dtype=np.float32)

# load the whole saved model
model = torch.load(path + 'step3/model.pkl')

#/********** Begin *********/
# switch the model to evaluation mode
model.eval()
# compute predictions with model
predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
print(predicted)
# plot the data and the fitted line
plt.plot(x_train, y_train, 'ro', label='Original data')
plt.plot(x_train, predicted, label='Fitted line')
plt.legend()
plt.savefig(path + "step4/outputimages/mylossTest.png")

#/********** End *********/



Lab 7: The K-means Clustering Algorithm

Level 1: Computing the Euclidean Distance

# -*- coding: utf-8 -*-
import numpy as np
def euclid_distance(x1, x2):
    """Compute the Euclidean distance.
    Parameters:
        x1 - numpy array
        x2 - numpy array
    Returns:
        distance - float, the Euclidean distance
    """
    distance = 0
    #   Add implementation code here     #
    #********** Begin *********#
    distance = np.sqrt(np.sum((x1 - x2) ** 2))  # np is already imported at module level
    #********** End ***********#
    return distance
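
A hypothetical check with the classic 3-4-5 triangle:

print(euclid_distance(np.array([0, 0]), np.array([3, 4])))  # 5.0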


Level 2: Finding the Nearest Cluster Center for a Sample

# -*- coding: utf-8 -*-
def nearest_cluster_center(x, centers):
    """Find the cluster center nearest to the input sample.
    Parameters:
        x - numpy array
        centers - 2-D numpy array
    Returns:
        cindex - int, index of the nearest center; e.g. 3 means x is assigned to cluster 3
    """
    cindex = -1
    from distance import euclid_distance
    #   Add implementation code here     #
    #********** Begin *********#
    # compute the distance from the point to every center
    n_clusters = len(centers)
    distance_list = []
    for cluster_index in range(n_clusters):
        distance_list.append((cluster_index, euclid_distance(x, centers[cluster_index])))
    # pick the cluster with the smallest distance
    distance_list = sorted(distance_list, key=lambda s: s[1])
    cindex = distance_list[0][0]
    #********** End ***********#
    return cindex
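
Sorting works, but only the minimum is needed; a hypothetical equivalent using np.argmin is shorter (assuming euclid_distance is importable at module level):

import numpy as np
from distance import euclid_distance
def nearest_cluster_center_argmin(x, centers):
    # index of the smallest distance among all centers
    return int(np.argmin([euclid_distance(x, c) for c in centers]))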


Level 3: Recomputing the Cluster Centers

# -*- coding: utf-8 -*-
def estimate_centers(X, y_estimated, n_clusters):
    """Recompute each cluster center.
    Parameters:
        X - 2-D numpy array, the feature matrix of the dataset
        y_estimated - numpy array, the estimated cluster index of each sample
        n_clusters - int, the configured number of clusters
    Returns:
        centers - 2-D numpy array, the cluster centers
    """
    import numpy as np
    centers = np.zeros((n_clusters, X.shape[1]))
    #   Add implementation code here     #
    #********** Begin *********#
    for i in range(n_clusters):
        # new center = mean of all samples currently assigned to cluster i
        centers[i] = np.mean(X[y_estimated == i], 0)
    #********** End ***********#
    return centers


Level 4: Evaluating the Clustering

# -*- coding: utf-8 -*-
def acc(x1, x2):
    """Compute the accuracy.
    Parameters:
        x1 - numpy array
        x2 - numpy array
    Returns:
        value - float, the accuracy
    """
    value = 0
    #   Add implementation code here     #
    #********** Begin *********#
    import numpy as np
    value = float(np.sum(x1 == x2)) / len(x1)  # fraction of positions where the two arrays agree
    #********** End ***********#
    return value


Level 5: Assembling the Implemented Functions into the Full K-means Algorithm

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from distance import euclid_distance
from estimate import estimate_centers
from loss import acc
from near import nearest_cluster_center
# the random seed affects the clustering result; fix it to make testing reproducible
np.random.seed(5)
# read in the dataset
dataset = pd.read_csv('./data/iris.csv')
# extract the sample feature matrix
X = dataset[['150', '4', 'setosa', 'versicolor']].values  # .as_matrix() on the platform's older pandas
y = np.array(dataset['virginica'])
# read in the run parameters
n_clusters, n_iteration = input().split(',')
n_clusters = int(n_clusters)    # number of cluster centers
n_iteration = int(n_iteration)  # number of iterations
# pick random points as the initial cluster centers
point_index_lst = np.arange(len(y))
np.random.shuffle(point_index_lst)
cluster_centers = X[point_index_lst[:n_clusters]]
# main algorithm loop
y_estimated = np.zeros(len(y))
#   Add implementation code here     #
#********** Begin *********#
for iteration in range(n_iteration):
    # assignment step: attach each point to its nearest cluster center
    for xx_index in range(len(X)):
        y_estimated[xx_index] = nearest_cluster_center(X[xx_index], cluster_centers)
    # update step: recompute each cluster center
    cluster_centers = estimate_centers(X, y_estimated, n_clusters)
#********** End ***********#
print('%.3f' % acc(y_estimated, y))



Lab 8: Data Processing with pandas

Level 1: Splitting a supermarket-sales Excel file into sheets by product category

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx", dtype={"商品编码": str, "商品条码": str})
writer = pd.ExcelWriter("xlscl/step1/类别销售.xlsx")
# code start

df_ = df["类别"].unique()  # the distinct product categories
for x in df_:
    dfdata = df.loc[df["类别"] == x]                    # rows belonging to this category
    dfdata.to_excel(writer, sheet_name=x, index=False)  # one sheet per category

writer.save()

# code end

Level 2: Filtering a bank-information Excel file by city

import pandas
writer = pandas.ExcelWriter('test/银行一线城市.xlsx')
data = pandas.read_excel("test/银行信息.xlsx", dtype={"银行编号": str})
# code start

dfdata1 = data.loc[data["城市"] == "北京市", ["银行编号", "名称"]]
dfdata1 = dfdata1.sort_values("银行编号")
dfdata1.to_excel(writer, sheet_name="北京市", index=False)

dfdata2 = data.loc[data["城市"] == "上海市", ["银行编号", "名称"]]
dfdata2 = dfdata2.sort_values("银行编号")
dfdata2.to_excel(writer, sheet_name="上海市", index=False)

dfdata3 = data.loc[data["城市"] == "广州市", ["银行编号", "名称"]]
dfdata3 = dfdata3.sort_values("银行编号")
dfdata3.to_excel(writer, sheet_name="广州市", index=False)

dfdata4 = data.loc[data["城市"] == "深圳市", ["银行编号", "名称"]]
dfdata4 = dfdata4.sort_values("银行编号")
dfdata4.to_excel(writer, sheet_name="深圳市", index=False)

# Alternatively, the four blocks above can be replaced by just these five lines:
# list_ = ["北京市", "上海市", "广州市", "深圳市"]
# for x in list_:
#     df = data.loc[data["城市"] == x, ["银行编号", "名称"]]
#     df = df.sort_values("银行编号")
#     df.to_excel(writer, sheet_name=x, index=False)

# code end
writer.save()
 

Level 3: Splitting a GDP Excel file into sheets by year

import pandas
writer = pandas.ExcelWriter('test/GDP分年份.xlsx')
data = pandas.read_excel("test/各省GDP.xlsx", dtype={"年份": str})
# code start

for i in range(2000, 2017):
    df = data.loc[data["年份"] == str(i), ["省份", "GDP"]]
    df = df.sort_values(by='GDP', ascending=False)  # provinces ranked by GDP, highest first
    df.to_excel(writer, sheet_name=str(i), index=False)
# code end
writer.save()

Level 4: Aggregating the supermarket-sales file by category and by date, and saving the statistics to a new workbook

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx")
writer = pd.ExcelWriter('xlscl/step2/统计数据.xlsx')
# code start

df_type = df.groupby(["类别"])["合计金额"].sum()  # total sales per category
df_date = df.groupby(["日期"])["合计金额"].sum()  # total sales per date

df_type.sort_values(ascending=False, inplace=True)


df_type.to_excel(writer, sheet_name="类别统计")
df_date.to_excel(writer, sheet_name="日期统计")

writer.save()


# code end

Level 5: Splitting the supermarket-sales file into one workbook per date, with one sheet per category

import pandas as pd
df = pd.read_excel("xlscl/step1/超市销售数据.xlsx", dtype={"商品编码": str, "商品条码": str})
# code start

df_dates = df["日期"].unique()  # list of distinct dates
for date in df_dates:
    file_name = str(date).replace('-', "")[:8]
    writer = pd.ExcelWriter('./xlscl/step3/rq/' + file_name + '.xlsx')  # one workbook per date

    df_data = df.loc[df["日期"] == date]  # rows for this date
    df_types = df_data["类别"].unique()   # categories appearing on this date

    for category in df_types:
        # filter against df_data, not df, so the boolean mask aligns with the subset
        df_type0 = df_data.loc[df_data["类别"] == category]
        df_type0.to_excel(writer, sheet_name=category)

    df_type1 = df_data.groupby(["类别"])["合计金额"].sum()
    df_type1.sort_values(ascending=False, inplace=True)
    df_type1.to_excel(writer, sheet_name="类别统计", index_label="类别")
    writer.save()
# code end