Questions 17-20, Homework 2 of Machine Learning Foundations (机器学习基石): Python code

With N points on a one-dimensional line split into two classes, and a hypothesis set of positive or negative rays, the number of distinct dichotomies the rays can produce is 2N (this is the effective number of hypotheses; the VC dimension of this hypothesis set is 2, not 2N). Picking any of the N-1 intervals between adjacent points as the ray's threshold gives 2(N-1) hypotheses, and labeling all points o or all points x adds two more. (Note that a positive ray whose threshold lies to the right of every point, x > max(x[i]), classifies every point the same way as a negative ray whose threshold lies to the left of every point, x < min(x[i]), so the pair counts as a single dichotomy; the symmetric pair collapses the same way.)
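
As a quick sanity check on the 2N count (a minimal sketch, not part of the homework code; the point positions and names here are arbitrary), one can enumerate every (direction, threshold) pair on a few random points and count the distinct labelings they produce:

import numpy as np

def count_ray_dichotomies(points):
    points = np.sort(points)
    # candidate thresholds: below all points, in each of the N-1 gaps, and above all points
    thresholds = np.concatenate(([points[0] - 1.0],
                                 (points[:-1] + points[1:]) / 2.0,
                                 [points[-1] + 1.0]))
    labelings = set()
    for s in (-1, 1):  # negative ray / positive ray
        for theta in thresholds:
            labelings.add(tuple(s * np.sign(points - theta)))
    return len(labelings)

np.random.seed(0)
for n in (3, 5, 8):
    print(count_ray_dichotomies(np.random.uniform(-1, 1, n)), 2 * n)  # the two numbers should match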

Questions 17-18 require enumerating all 2N rays (in my implementation the ray's threshold is placed at the midpoint between each pair of adjacent points) and taking the one with the smallest Ein as the classifier.

Questions 19-20 extend this to multi-dimensional points: when x[i] has d dimensions, fit a one-dimensional stump on each dimension, pick the dimension whose stump achieves the smallest Ein, and use only that dimension to make predictions; the features in the other dimensions are ignored.

import numpy as np
import math
import sys

def pro3():
    # Evaluate the VC bound 4 * (2N)^dvc * exp(-eps^2 * N / 8) for several sample sizes N,
    # to see which N first brings it below delta.
    epsilon = 0.05
    delta = 0.05
    dvc = 10
    N = [420000, 440000, 460000, 480000, 500000]
    for i in N:
        res = 4 * ((2 * i) ** dvc) * math.exp(-0.125 * (epsilon ** 2) * i)
        print(i, res)

def quadratic(a, b, c):
    # Solve a*x^2 + b*x + c = 0: returns both real roots (or the single root when a == 0),
    # or None when there is no real solution.
    temp = b * b - 4 * a * c
    if temp < 0:
        return None
    if a == 0:
        return - c / b
    m = (- b - math.sqrt(temp)) / (2.0 * a)
    n = (- b + math.sqrt(temp)) / (2.0 * a)
    return m, n

def pro4():
    delta = 0.05
    dvc = 50
    N = 5  # or 10000
    def mh(n):
        # polynomial used in place of the growth function m_H(n)
        return (2 * n) ** dvc
    # Original VC bound
    epsilon = math.sqrt(8.0 / N * math.log(4.0 * mh(2 * N) / delta))
    print(epsilon)
    # Rademacher penalty bound
    epsilon = math.sqrt(2.0 / N * math.log(2 * N * mh(N))) + math.sqrt(2.0 / N * math.log(1.0 / delta)) + 1.0 / N
    print(epsilon)
    # Variant VC bound
    epsilon = math.sqrt(16.0 / N * math.log(2 * mh(N) / math.sqrt(delta)))
    print(epsilon)
    # Parrondo and Van den Broek: epsilon <= sqrt((2*epsilon + log(6*mh(2N)/delta)) / N).
    # epsilon appears on both sides; squaring gives N*eps^2 - 2*eps - log(6*mh(2N)/delta) = 0,
    # so take the positive root. (Fixed-point iteration on epsilon gives the same answer.)
    print(quadratic(N, - 2.0, - math.log(6.0 * mh(2 * N) / delta)))
    # Devroye: epsilon <= sqrt((4*eps*(1+eps) + log(4*mh(N^2)/delta)) / (2N)),
    # which rearranges to (2N - 4)*eps^2 - 4*eps - log(4*mh(N^2)/delta) = 0.
    print(quadratic(2 * N - 4, - 4, - math.log(4) - dvc * math.log(2 * N ** 2) + math.log(delta)))

# Questions 17-18: 1-D decision stump on generated data
def h(x, theta, s):
    # decision stump hypothesis: h(x) = s * sign(x - theta)
    return np.sign(x - theta) * s

def decision_stump():
    min_s = 0
    min_theta = 0

    N = 20
    ave_Ein = []
    ave_Eout = []
    repeat = 5000
    np.random.seed(1234)
    noise_prob = 0.2
    for rep in range(0, repeat):
        train_x = np.random.uniform(-1, 1, N)
        train_y = np.zeros(train_x.shape)
        train_xy = []
        min_ein = 1.0
        for i in range(0, len(train_x)):
            if train_x[i] > 0:
                sgn = 1
            else:
                sgn = -1
            if np.random.random() < noise_prob:
                # print "noise", train_x[i]
                train_y[i] = - sgn
            else:
                train_y[i] = sgn
            train_xy.append((train_x[i], train_y[i]))
        train_xy.sort(key=lambda z: z[0])
        x = np.array([train_xy[i][0] for i in range(0, len(train_xy))])
        y = np.array([train_xy[i][1] for i in range(0, len(train_xy))])
        # print train_xy
        for s in [-1, 1]:
            for i in range(0, len(train_xy)):
                if i == len(train_xy) - 1:
                    theta = -2  # all is -1 when s = -1, all is +1 when s = 1
                else:
                    theta = (train_xy[i][0] + train_xy[i + 1][0]) / 2.0
                predicted_y = h(x, theta, s)
                diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
                error = 1.0 * np.count_nonzero(diff) / N
                # print error, min_ein
                if error < min_ein:
                    min_ein = error
                    min_s = s
                    min_theta = theta
        # all is -1 or all is +1
        ave_Ein.append(min_ein)
        # closed-form E_out for the target sign(x) with 20% label noise:
        # E_out = 0.5 + 0.3 * s * (|theta| - 1), valid for theta in [-1, 1]
        eout = 0.5 + 0.3 * min_s * (np.abs(min_theta) - 1)
        ave_Eout.append(eout)
    print "ave Ein", np.average(ave_Ein)
    print "ave Eout", np.average(ave_Eout)
    # 0.1729
    # 0.259461300445

# Questions 19-20: multi-dimensional decision stump on the hw2 data sets
def load_data(filename):
    # Each line holds the feature values followed by the label, separated by whitespace.
    x = []
    y = []
    try:
        fin = open(filename, 'r')
    except IOError:
        return
    for data in fin:
        items = data.split()
        if not items:
            continue
        y.append(float(items[-1]))
        x.append([float(v) for v in items[:-1]])
    fin.close()
    return x, y

# the multi-dimensional case reuses h() on the single selected feature column

def multi_dim_decision_stump():
    min_s = 0
    min_theta = 0
    train_file = "hw2_train.dat"
    test_file = "hw2_test.dat"
    train_x, train_y = load_data(train_file)
    test_x, test_y = load_data(test_file)
    dim = len(train_x[0])
    np.random.seed(1234)
    final_s = np.zeros(dim)
    final_theta = np.zeros(dim)
    final_ein = np.zeros(dim)  # E in in each dimension
    for d in range(0, dim):  # try each dimension
        train_xy = []
        min_ein = 1.0  # best of the best
        for i in range(0, len(train_x)):
            train_xy.append((train_x[i][d], train_y[i]))
        train_xy.sort(key=lambda z: z[0])
        x = np.array([train_xy[i][0] for i in range(0, len(train_xy))])
        y = np.array([train_xy[i][1] for i in range(0, len(train_xy))])
        # print train_xy
        for s in [-1, 1]:
            for i in range(0, len(train_xy)):
                if i == len(train_xy) - 1:
                    theta = -100  # all is -1 when s = -1, all is +1 when s = 1
                else:
                    theta = (train_xy[i][0] + train_xy[i + 1][0]) / 2.0
                predicted_y = h(x, theta, s)
                diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
                error = 1.0 * np.count_nonzero(diff) / len(train_x)
                # print error, min_ein
                if error < min_ein:
                    min_ein = error
                    min_s = s
                    min_theta = theta
        # all is -1 or all is +1
        final_s[d] = min_s
        final_theta[d] = min_theta
        final_ein[d] = min_ein
        # choose one dimension only as the input to predict
    idx = np.argmin(final_ein)
    print "best Ein", np.average(final_ein[idx])
    x = np.array([test_x[i][idx] for i in range(0, len(test_x))])
    y = np.array([test_y[i] for i in range(0, len(test_y))])
    predicted_y = h(x, final_theta[idx], final_s[idx])
    diff = [predicted_y[i] != y[i] for i in range(0, len(y))]
    error = 1.0 * np.count_nonzero(diff) / len(test_x)
    print "best Eout", np.average(error)
if __name__ == '__main__':
    # pro4()
    # pro3()
    # decision_stump()
    multi_dim_decision_stump()
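
As a sanity check on the closed-form E_out = 0.5 + 0.3 * s * (|theta| - 1) used in decision_stump() above (a minimal sketch with a made-up example stump; it assumes, as in the simulation, the target sign(x) on uniform x in [-1, 1] with labels flipped with probability 0.2), the same quantity can be estimated by Monte Carlo:

import numpy as np

def mc_eout(s, theta, trials=1000000):
    x = np.random.uniform(-1, 1, trials)   # test points from the same distribution
    y = np.sign(x)                          # noiseless target
    flip = np.random.random(trials) < 0.2   # 20% of the labels are flipped
    y[flip] = -y[flip]
    return np.mean(np.sign(x - theta) * s != y)

print(mc_eout(1, 0.3), 0.5 + 0.3 * 1 * (abs(0.3) - 1))  # both values should be close to 0.29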

 
