Machine Learning - Basic Algorithms - Day 1

import numpy as np
from time import time
import math


def is_prime(x):
    # Trial division by every integer up to sqrt(x); assumes x >= 2.
    return 0 not in [x % i for i in range(2, int(math.sqrt(x)) + 1)]


def is_prime3(x):
    # Trial division against previously found primes only; relies on the
    # global list p_list2 being filled in ascending order.
    flag = True
    for p in p_list2:
        if p > math.sqrt(x):
            break
        if x % p == 0:
            flag = False
            break
    if flag:
        p_list2.append(x)
    return flag


if __name__ == "__main__":
    a = 2
    b = 100000

    # Method 1: direct computation
    t = time()
    p = [p for p in range(a, b) if 0 not in [p % d for d in range(2, int(math.sqrt(p)) + 1)]]
    print(time() - t)
    print(p)

    # Method 2: using filter
    t = time()
    p = list(filter(is_prime, range(a, b)))
    print(time() - t)
    print(p)

    # Method 3: filter with a lambda
    t = time()
    is_prime2 = (lambda x: 0 not in [x % i for i in range(2, int(math.sqrt(x)) + 1)])
    p = list(filter(is_prime2, range(a, b)))
    print(time() - t)
    print(p)

    # Method 4: build the prime list incrementally
    t = time()
    p_list = []
    for i in range(2, b):
        flag = True
        for p in p_list:
            if p > math.sqrt(i):
                break
            if i % p == 0:
                flag = False
                break
        if flag:
            p_list.append(i)
    print(time() - t)
    print(p_list)

    # Method 5: incremental list via filter
    p_list2 = []
    t = time()
    list(filter(is_prime3, range(2, b)))    # list() forces evaluation; filter is lazy in Python 3
    print(time() - t)
    print(p_list2)

    print('---------------------')
    # a, b = 1180, 1230     # an alternative interval
    a = 1600
    b = 1700
    p_list2 = []
    p = np.array(list(filter(is_prime3, range(2, b + 1))))
    p = p[p >= a]
    print(p)
    p_rate = len(p) / (b - a + 1)
    print('Probability of a prime:', p_rate, '\tfair odds:', 1 / p_rate)
    print('Probability of a composite:', 1 - p_rate, '\tfair odds:', 1 / (1 - p_rate))
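
For comparison, a sieve avoids per-number trial division entirely. A minimal sketch of the Sieve of Eratosthenes (not one of the original five methods, and not part of the timing comparison above):

def primes_sieve(n):
    # Sieve of Eratosthenes: cross out multiples of each prime up to sqrt(n).
    is_p = np.ones(n, dtype=bool)
    is_p[:2] = False                    # 0 and 1 are not prime
    for i in range(2, int(n ** 0.5) + 1):
        if is_p[i]:
            is_p[i * i::i] = False      # first uncrossed multiple is i*i
    return np.nonzero(is_p)[0]

# Usage: primes_sieve(100000) returns all primes below 100000.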

# Ring-road traffic-jam simulation (a cellular model in the style of Nagel-Schreckenberg)
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt


def clip(x, path):
    # Wrap positions back onto the ring road of length `path`
    # (equivalent to x %= path for these non-negative positions).
    for i in range(len(x)):
        if x[i] >= path:
            x[i] %= path


if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    path = 5000     # length of the ring road
    n = 100         # number of vehicles on the road
    v0 = 50         # initial speed of each vehicle
    p = 0.3         # probability of random deceleration
    Times = 3000    # number of simulation steps

    np.random.seed(0)
    x = np.random.rand(n) * path
    x.sort()
    v = np.tile([v0], n).astype(float)    # np.float was removed from NumPy

    plt.figure(figsize=(10, 8), facecolor='w')
    for t in range(Times):
        plt.scatter(x, [t]*n, s=1, c='k', alpha=0.05)
        for i in range(n):
            if x[(i+1) % n] > x[i]:
                d = x[(i+1) % n] - x[i]             # gap to the car ahead
            else:
                d = path - x[i] + x[(i+1) % n]      # wrap-around gap
            if v[i] < d:
                if np.random.rand() > p:
                    v[i] += 1
                else:
                    v[i] -= 1
            else:
                v[i] = d - 1
        v = v.clip(0, 150)
        x += v
        clip(x, path)
    plt.xlim(0, path)
    plt.ylim(0, Times)
    plt.xlabel('Vehicle position', fontsize=16)
    plt.ylabel('Simulation time', fontsize=16)
    plt.title('Traffic-jam simulation on a ring road', fontsize=20)
    plt.tight_layout(pad=2)
    plt.show()
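
The per-vehicle inner loop can be vectorized with numpy; a sketch assuming the same x, v, path and p as above (np.roll supplies the position of the car ahead; positions are assumed distinct, as in the simulation):

def step(x, v, path, p, v_max=150):
    # Gap to the car ahead; the modulo handles the wrap-around case.
    d = (np.roll(x, -1) - x) % path
    slow = np.random.rand(len(x)) <= p              # random deceleration
    v = np.where(v < d, v + np.where(slow, -1, 1), d - 1)
    v = v.clip(0, v_max)
    return (x + v) % path, v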

# Cleaning a sales spreadsheet with pandas and fuzzy string matching (fuzzywuzzy)
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def enum_row(row):
    print(row['state'])


def find_state_code(row):
    if row['state'] != 0:
        print(process.extractOne(row['state'], states, score_cutoff=80))


def capital(s):     # renamed parameter to avoid shadowing the built-in str
    return s.capitalize()


def correct_state(row):
    if row['state'] != 0:
        state = process.extractOne(row['state'], states, score_cutoff=80)
        if state:
            state_name = state[0]
            return ' '.join(map(capital, state_name.split(' ')))
    return row['state']


def fill_state_code(row):
    if row['state'] != 0:
        state = process.extractOne(row['state'], states, score_cutoff=80)
        if state:
            state_name = state[0]
            return state_to_code[state_name]
    return ''


if __name__ == "__main__":
    pd.set_option('display.width', 200)
    data = pd.read_excel('sales.xlsx', sheet_name='sheet1', header=0)
    print('data.head() = \n', data.head())
    print('data.tail() = \n', data.tail())
    print('data.dtypes = \n', data.dtypes)
    print('data.columns = \n', data.columns)
    for c in data.columns:
        print(c, end=' ')
    print()
    data['total'] = data['Jan'] + data['Feb'] + data['Mar']
    print(data.head())
    print(data['Jan'].sum())
    print(data['Jan'].min())
    print(data['Jan'].max())
    print(data['Jan'].mean())

    print('=============')
    # Append a totals row
    s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
    print(s1)
    s2 = pd.DataFrame(data=s1)
    print(s2)
    print(s2.T)
    print(s2.T.reindex(columns=data.columns))
    # Equivalently:
    s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
    s = s.reindex(columns=data.columns, fill_value=0)
    print(s)
    data = pd.concat([data, s], ignore_index=True)      # DataFrame.append was removed in pandas 2.0
    data = data.rename(index={len(data) - 1: 'Total'})  # label the appended row
    print(data.tail())
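
    # A hedged alternative (not in the original): assign the totals row by
    # label in one step instead of building and concatenating a frame.
    # Note the non-summed columns become NaN rather than fill_value=0.
    # data.loc['Total'] = data[['Jan', 'Feb', 'Mar', 'total']].sum()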

    # Using apply
    print('============== using apply ==========')
    data.apply(enum_row, axis=1)

    state_to_code = {"VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP", "GUAM": "GU",
                     "KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS", "NORTH CAROLINA": "NC", "HAWAII": "HI",
                     "NEW YORK": "NY", "CALIFORNIA": "CA", "ALABAMA": "AL", "IDAHO": "ID",
                     "FEDERATED STATES OF MICRONESIA": "FM",
                     "Armed Forces Americas": "AA", "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
                     "Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT", "MONTANA": "MT",
                     "MASSACHUSETTS": "MA",
                     "PUERTO RICO": "PR", "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
                     "NEW MEXICO": "NM",
                     "MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW", "COLORADO": "CO",
                     "Armed Forces Middle East": "AE",
                     "NEW JERSEY": "NJ", "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
                     "MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
                     "MARSHALL ISLANDS": "MH",
                     "WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC", "INDIANA": "IN", "NEVADA": "NV",
                     "LOUISIANA": "LA",
                     "NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ", "WISCONSIN": "WI",
                     "NORTH DAKOTA": "ND",
                     "Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY",
                     "RHODE ISLAND": "RI",
                     "DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME"}
    states = list(state_to_code.keys())
    print(fuzz.ratio('Python Package', 'PythonPackage'))
    print(process.extract('Mississippi', states))
    print(process.extract('Mississipi', states, limit=1))
    print(process.extractOne('Mississipi', states))
    data.apply(find_state_code, axis=1)

    print('Before Correct State:\n', data['state'])
    data['state'] = data.apply(correct_state, axis=1)
    print('After Correct State:\n', data['state'])
    data.insert(5, 'State Code', np.nan)
    data['State Code'] = data.apply(fill_state_code, axis=1)
    print(data)

    # group by
    print('============== group by ================')
    print(data.groupby('State Code'))
    print('All Columns:\n')
    print(data.groupby('State Code').sum())
    print('Short Columns:\n')
    print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())

    # Write the result to file (.xlsx: writing .xls needs the retired xlwt engine)
    data.to_excel('sales_result.xlsx', sheet_name='Sheet1', index=False)
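
A minimal, self-contained sketch of the fuzzy-matching behaviour relied on above (the scores shown are illustrative, not exact):

from fuzzywuzzy import process

candidates = ['MISSISSIPPI', 'MISSOURI', 'MINNESOTA']
# extractOne returns a (best_match, score) pair, or None when nothing
# reaches score_cutoff.
print(process.extractOne('Mississipi', candidates, score_cutoff=80))    # ('MISSISSIPPI', ~95)
print(process.extractOne('Texass', candidates, score_cutoff=80))        # None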

# PCA dimensionality reduction and logistic regression on the iris data
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures


def extend(a, b):
    # Widen the interval [a, b] by 5% on each side (for plot limits).
    return 1.05*a - 0.05*b, 1.05*b - 0.05*a


if __name__ == '__main__':
    pd.set_option('display.width', 200)
    data = pd.read_csv('iris.data', header=None)
    columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
    data.rename(columns=dict(zip(np.arange(5), columns)), inplace=True)
    data['type'] = pd.Categorical(data['type']).codes
    print(data.head(5))
    x = data.loc[:, columns[:-1]]
    y = data['type']

    pca = PCA(n_components=2, whiten=True, random_state=0)
    x = pca.fit_transform(x)
    print('Variance along each component:', pca.explained_variance_)
    print('Explained variance ratio:', pca.explained_variance_ratio_)
    print(x[:5])
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
    plt.grid(True, ls=':')
    plt.xlabel('Component 1', fontsize=14)
    plt.ylabel('Component 2', fontsize=14)
    plt.title('PCA projection of the iris data', fontsize=18)
    # plt.savefig('1.png')
    plt.show()

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=True)),
        ('lr', LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5, fit_intercept=False))
    ])
    model.fit(x, y)
    print('Best C:', model.named_steps['lr'].C_)
    y_hat = model.predict(x)
    print('Training accuracy:', metrics.accuracy_score(y, y_hat))
    y_test_hat = model.predict(x_test)
    print('Test accuracy:', metrics.accuracy_score(y_test, y_test_hat))

    N, M = 500, 500     # number of grid samples along each axis
    x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())   # range of component 0
    x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())   # range of component 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                    # grid sampling points
    x_show = np.stack((x1.flat, x2.flat), axis=1)   # points to evaluate
    y_hat = model.predict(x_show)                   # predicted class per grid point
    y_hat = y_hat.reshape(x1.shape)                 # back to the grid's shape
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)    # decision regions
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, edgecolors='k', cmap=cm_dark)  # samples
    plt.xlabel('Component 1', fontsize=14)
    plt.ylabel('Component 2', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True, ls=':')
    patches = [mpatches.Patch(color='#77E0A0', label='Iris-setosa'),
               mpatches.Patch(color='#FF8080', label='Iris-versicolor'),
               mpatches.Patch(color='#A0A0FF', label='Iris-virginica')]
    plt.legend(handles=patches, fancybox=True, framealpha=0.8, loc='lower right')
    plt.title('Logistic regression on PCA-reduced iris data', fontsize=17)
    # plt.savefig('2.png')
    plt.show()
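
A convenient way to choose the number of components is by explained-variance fraction; a small sketch assuming the iris frame loaded above (passing a float in (0, 1) as n_components is standard scikit-learn behaviour):

pca95 = PCA(n_components=0.95, whiten=True, random_state=0)
x95 = pca95.fit_transform(data[columns[:-1]])
# Keeps as many components as needed to reach 95% of the total variance.
print(pca95.n_components_, np.cumsum(pca95.explained_variance_ratio_))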

# Outlier detection and correction in emission time-series data
import numpy as np
import matplotlib as mpl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor


def read_data():
    plt.figure(figsize=(13, 7), facecolor='w')
    plt.subplot(121)
    data = pd.read_csv('C0904.csv', header=0)
    x = data['H2O'].values
    plt.plot(x, 'r-', lw=1, label='C0904')
    plt.title('Raw emission data 0904', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.subplot(122)
    data = pd.read_csv('C0911.csv', header=0)
    x = data['H2O'].values
    plt.plot(x, 'r-', lw=1, label='C0911')
    plt.title('Raw emission data 0911', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.tight_layout(pad=2, rect=(0, 0, 1, 0.95))
    plt.suptitle('How to find the outliers in these plots?', fontsize=20)
    plt.show()


if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # read_data()
    data = pd.read_csv('C0904.csv', header=0)   # C0911.csv, C0904.csv
    x = data['H2O'].values
    print(x)

    width = 500     # sliding-window length
    delta = 10      # window step
    eps = 0.15      # peak-to-peak threshold for flagging a window
    N = len(x)
    p = []
    abnormal = []
    for i in np.arange(0, N - width, delta):
        s = x[i:i + width]
        p.append(np.ptp(s))
        if np.ptp(s) > eps:
            abnormal.extend(range(i, i + width))
    abnormal = np.unique(abnormal)
    # plt.plot(p, lw=1)
    # plt.grid(b=True)
    # plt.show()

    plt.figure(figsize=(18, 7), facecolor='w')
    plt.subplot(131)
    plt.plot(x, 'r-', lw=1, label='raw data')
    plt.title('Raw emission data', fontsize=18)
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.subplot(132)
    t = np.arange(N)
    plt.plot(t, x, 'r-', lw=1, label='raw data')
    plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label='outliers')
    plt.legend(loc='upper right')
    plt.title('Outlier detection', fontsize=18)
    plt.grid(True)

    # Correction: regress on the non-outlier samples, predict the rest
    plt.subplot(133)
    select = np.ones(N, dtype=bool)
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=10)    # 'mse' in older scikit-learn
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label='raw values')
    plt.plot(y, 'r-', lw=1, label='corrected values')
    plt.legend(loc='upper right')
    plt.title('Outlier correction', fontsize=18)
    plt.grid(True)

    plt.tight_layout(pad=1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle('Outlier detection and correction for emission data', fontsize=22)
    plt.show()
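
The sliding-window scan above can also be written with pandas rolling windows; a sketch assuming the same x, width and eps (note a rolling window is labelled by its right edge, so the index convention differs slightly from the loop above):

s = pd.Series(x)
# Peak-to-peak range of each window of `width` samples.
ptp = s.rolling(width).max() - s.rolling(width).min()
hits = np.where(ptp.values > eps)[0]    # window end positions exceeding the threshold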

# One-hot encoding, logistic regression, and ROC/AUC on the car evaluation data
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import matplotlib as mpl
import matplotlib.pyplot as plt


if __name__ == '__main__':
    pd.set_option('display.width', 300)
    pd.set_option('display.max_columns', 300)

    data = pd.read_csv('car.data', header=None)
    n_columns = len(data.columns)
    columns = ['buy', 'maintain', 'doors', 'persons', 'boot', 'safety', 'accept']
    new_columns = dict(zip(np.arange(n_columns), columns))
    data.rename(columns=new_columns, inplace=True)
    print(data.head(10))

    # One-hot encoding
    x = pd.DataFrame()
    for col in columns[:-1]:
        t = pd.get_dummies(data[col])
        t = t.rename(columns=lambda v: col + '_' + str(v))
        x = pd.concat((x, t), axis=1)
    print(x.head(10))
    # print(x.columns)
    y = pd.Categorical(data['accept']).codes

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
    clf = LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5)
    clf.fit(x, y)
    print(clf.C_)
    y_hat = clf.predict(x)
    print('Training accuracy:', metrics.accuracy_score(y, y_hat))
    y_test_hat = clf.predict(x_test)
    print('Test accuracy:', metrics.accuracy_score(y_test, y_test_hat))
    n_class = len(data['accept'].unique())
    y_test_one_hot = label_binarize(y_test, classes=np.arange(n_class))
    y_test_one_hot_hat = clf.predict_proba(x_test)
    fpr, tpr, _ = metrics.roc_curve(y_test_one_hot.ravel(), y_test_one_hot_hat.ravel())
    print('Micro AUC:\t', metrics.auc(fpr, tpr))
    print('Micro AUC (sklearn):\t', metrics.roc_auc_score(y_test_one_hot, y_test_one_hot_hat, average='micro'))
    auc = metrics.roc_auc_score(y_test_one_hot, y_test_one_hot_hat, average='macro')
    print('Macro AUC:\t', auc)

    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 7), dpi=80, facecolor='w')
    plt.plot(fpr, tpr, 'r-', lw=2, label='AUC=%.4f' % auc)
    plt.legend(loc='lower right')
    plt.xlim((-0.01, 1.02))
    plt.ylim((-0.01, 1.02))
    plt.xticks(np.arange(0, 1.1, 0.1))
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.grid(True, ls=':')
    plt.title('ROC curve and AUC', fontsize=18)
    plt.show()
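
The figure shows only the micro-averaged curve; per-class curves come from the columns of the binarized labels. A sketch assuming the same y_test_one_hot and y_test_one_hot_hat computed above:

plt.figure(figsize=(8, 7), facecolor='w')
for k in range(n_class):
    fpr_k, tpr_k, _ = metrics.roc_curve(y_test_one_hot[:, k], y_test_one_hot_hat[:, k])
    plt.plot(fpr_k, tpr_k, lw=1, label='class %d, AUC=%.4f' % (k, metrics.auc(fpr_k, tpr_k)))
plt.legend(loc='lower right')
plt.show()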