机器学习-估算收入阶层

import numpy as np
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

try:
    # scikit-learn >= 0.18 provides train_test_split here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only ship the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split

# Load the data set
input_file = 'adult.txt'

# Read the data. Every kept row holds the raw string fields of one record,
# with the income label as the last field.
X = []
y = []
# Keep at most the same number of rows from each income class so that the
# classifier is trained on balanced data.
count_lessthan50k = 0
count_morethan50k = 0
num_images_threshold = 10000        # maximum number of rows kept per class

with open(input_file, 'r') as f:
    # Iterate the file lazily instead of materialising every line at once.
    for line in f:
        # Rows containing '?' have missing values; skip them.
        if '?' in line:
            continue
        # strip() removes the line terminator without cutting a real character
        # when the final line of the file has no trailing newline.
        data = line.strip().split(', ')
        label = data[-1]
        if label == '<=50K' and count_lessthan50k < num_images_threshold:
            X.append(data)
            count_lessthan50k = count_lessthan50k + 1
        # The Adult data set spells the high-income label '>50K'; the previous
        # spelling '>=50K' is still accepted for files that use it.
        elif label in ('>50K', '>=50K') and count_morethan50k < num_images_threshold:
            X.append(data)
            count_morethan50k = count_morethan50k + 1
        # Stop reading as soon as both classes have reached their quota.
        if count_morethan50k >= num_images_threshold and count_lessthan50k >= num_images_threshold:
            break

if not X:
    raise ValueError('no usable rows were read from %r' % input_file)

# 2-D array of strings: one row per record, one column per raw field.
X = np.array(X)

# Convert the string attributes into numeric data.
# Example of a raw row (all fields are still strings at this point):
# ['39' 'State-gov' '77516' 'Bachelors' '13' 'Never-married' 'Adm-clerical'
#  'Not-in-family' 'White' 'Male' '2174' '0' '40' 'United-States' '<=50K']
label_encoder = []              # one fitted encoder per non-numeric column, in column order
X_encoded = np.empty(X.shape)   # numeric copy of X with the same shape

# The first row decides, column by column, whether a field is numeric.
for col, sample_value in enumerate(X[0]):
    column = X[:, col]
    if not sample_value.isdigit():
        # Categorical column: fit a dedicated encoder and store its integer codes.
        encoder = preprocessing.LabelEncoder()
        X_encoded[:, col] = encoder.fit_transform(column)
        label_encoder.append(encoder)
    else:
        # Numeric column: parse the digit strings as numbers.
        X_encoded[:, col] = column.astype(float)

# The last column is the encoded income label; everything before it is a feature.
y = X_encoded[:, -1].astype(int)
X = X_encoded[:, :-1].astype(int)

# Build the classifier
classifier_gaussiannb = GaussianNB()

# Hold-out validation: train on 75% of the rows and keep 25% for testing.
# The classifier is fitted on the training split only; a preliminary fit on
# the full data set would be discarded by this fit anyway.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
classifier_gaussiannb.fit(X_train, y_train)
y_test_predict = classifier_gaussiannb.predict(X_test)

# Performance evaluation
# Use the class names learned by the label-column encoder (the last encoder)
# so the report has exactly one name per class actually present.
targets_name = list(label_encoder[-1].classes_)
# Uncomment to print the accuracy and the per-class report:
#print(classifier_gaussiannb.score(X_test, y_test))
#print(classification_report(y_test, y_test_predict, target_names=targets_name))

# Classify a single data point
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married', 'Adm-clerical',
              'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States']
count = 0                                           # index of the next categorical encoder to use
input_data_encoded = [-1] * len(input_data)         # every element starts as -1
for i, item in enumerate(input_data):
    if item.isdigit():
        # Numeric field: use the value as is.
        input_data_encoded[i] = int(item)
    else:
        # Categorical field: encoders are stored in column order, so the
        # count-th non-numeric field uses the count-th encoder.
        # transform() expects a 1-D array-like, hence the one-element list.
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count = count + 1
input_data_encoded = np.array(input_data_encoded)

# Predict and print the class of this data point.
# predict() expects a 2-D (n_samples, n_features) array, so the single sample
# is reshaped into one row.
output_class = classifier_gaussiannb.predict(input_data_encoded.reshape(1, -1))
# The last encoder belongs to the label column; map the code back to its name.
print(label_encoder[-1].inverse_transform(output_class)[0])

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值