背景介绍:
本节将根据14个属性建立分类器来预测收入阶层:高于'50K'的为一类,低于等于'50K'的为另一类。主要难点是数据的读取:数据放在txt文件中,并且标签是字符串。我最后的准确率并不是很高,大家可以调调参,或者对数据再进行处理。数据下载地址:https://archive.ics.uci.edu/ml/datasets/Census+Income
第一步:数据的读取
def read_data(path):
    """Read the Census-Income (Adult) data file and return a balanced sample.

    Rows containing '?' (the dataset's missing-value marker) are skipped.
    At most ``num_images_threshold`` rows are kept for each of the two
    income classes (' <=50K' and ' >50K') to avoid class imbalance.

    Args:
        path: Path to the comma-separated data file.

    Returns:
        2-D numpy array of strings, one row per kept record; the last
        column is the income label.
    """
    X = []
    count_least50k = 0      # rows collected so far with income <= 50K
    count_morethan50k = 0   # rows collected so far with income > 50K
    num_images_threshold = 10000  # per-class cap, keeps the sample balanced
    with open(path, 'r') as f:
        for line in f:
            if '?' in line:
                continue  # row has a missing value; dataset is large, so drop it
            data = line.strip().split(',')
            if data[-1] == ' <=50K' and count_least50k < num_images_threshold:
                X.append(data)
                count_least50k += 1
            elif data[-1] == ' >50K' and count_morethan50k < num_images_threshold:
                X.append(data)
                count_morethan50k += 1
            # BUG FIX: the counters never exceed the threshold (the '<' guards
            # above cap them at exactly num_images_threshold), so the original
            # '>' comparison was never true and the whole file was always
            # scanned.  '>=' stops reading as soon as both quotas are filled.
            if count_least50k >= num_images_threshold and count_morethan50k >= num_images_threshold:
                break
    return np.array(X)
这里我们读取数据,并且设了一个阈值:>50K和<=50K的数据分别只读取10000条,主要是为了防止数据倾斜。
第二步: 数据的预处理
很明显,数据中存在一些字符串数据,它在一列中的取值是有限个,跟标签很像,所以我们用sklearn中的LabelEncoder将字符串转化为数字。
def process_data(data):
    """Encode the string-valued columns of the raw census data as integers.

    Numeric columns are copied through unchanged; categorical (string)
    columns each get their own LabelEncoder.

    Args:
        data: 2-D string array from read_data(); last column is the label.

    Returns:
        (X, y): integer feature matrix (all columns but the last) and
        integer label vector (the last column).
    """
    label_encoders = []
    x_encoded = np.empty(data.shape)
    # Probe the FIRST row to decide which columns are numeric.
    # BUG FIX: the original used data[1], which raises IndexError when the
    # input has only one row; data[0] is equivalent for this dataset (every
    # row has the same column types) and safe for any non-empty input.
    for i, item in enumerate(data[0]):
        if item.isdigit():
            x_encoded[:, i] = data[:, i]  # already numeric: copy through
        else:
            label_encoders.append(LabelEncoder())  # one encoder per categorical column
            x_encoded[:, i] = label_encoders[-1].fit_transform(data[:, i])
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement.
    X = x_encoded[:, :-1].astype(int)
    y = x_encoded[:, -1].astype(int)
    return X, y
处理后的数据:
第三步:建立模型,对数据进行预测
这里我们分别用了贝叶斯和随机森林
def model(X, y):
    """Train and report two classifiers on the encoded census data.

    Splits (X, y) 80/20, fits a Gaussian naive-Bayes model and a random
    forest on the training split, and prints each model's accuracy on
    both splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    bayes = GaussianNB().fit(X_train, y_train)
    forest = RandomForestClassifier(
        n_estimators=100, criterion="gini", max_depth=8).fit(X_train, y_train)

    print("贝叶斯训练集准确率:{}, 测试集准确率:{}".format(
        bayes.score(X_train, y_train), bayes.score(X_test, y_test)))
    print("随机森林训练集准确率:{}, 测试集准确率:{}".format(
        forest.score(X_train, y_train), forest.score(X_test, y_test)))
输出结果:
源代码:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
def read_data(path):
    """Read the Census-Income (Adult) data file and return a balanced sample.

    Rows containing '?' (the dataset's missing-value marker) are skipped.
    At most ``num_images_threshold`` rows are kept for each of the two
    income classes (' <=50K' and ' >50K') to avoid class imbalance.

    Args:
        path: Path to the comma-separated data file.

    Returns:
        2-D numpy array of strings, one row per kept record; the last
        column is the income label.
    """
    X = []
    count_least50k = 0      # rows collected so far with income <= 50K
    count_morethan50k = 0   # rows collected so far with income > 50K
    num_images_threshold = 10000  # per-class cap, keeps the sample balanced
    with open(path, 'r') as f:
        for line in f:
            if '?' in line:
                continue  # row has a missing value; dataset is large, so drop it
            data = line.strip().split(',')
            if data[-1] == ' <=50K' and count_least50k < num_images_threshold:
                X.append(data)
                count_least50k += 1
            elif data[-1] == ' >50K' and count_morethan50k < num_images_threshold:
                X.append(data)
                count_morethan50k += 1
            # BUG FIX: the counters never exceed the threshold (the '<' guards
            # above cap them at exactly num_images_threshold), so the original
            # '>' comparison was never true and the whole file was always
            # scanned.  '>=' stops reading as soon as both quotas are filled.
            if count_least50k >= num_images_threshold and count_morethan50k >= num_images_threshold:
                break
    return np.array(X)
def process_data(data):
    """Encode the string-valued columns of the raw census data as integers.

    Numeric columns are copied through unchanged; categorical (string)
    columns each get their own LabelEncoder.

    Args:
        data: 2-D string array from read_data(); last column is the label.

    Returns:
        (X, y): integer feature matrix (all columns but the last) and
        integer label vector (the last column).
    """
    label_encoders = []
    x_encoded = np.empty(data.shape)
    # Probe the FIRST row to decide which columns are numeric.
    # BUG FIX: the original used data[1], which raises IndexError when the
    # input has only one row; data[0] is equivalent for this dataset (every
    # row has the same column types) and safe for any non-empty input.
    for i, item in enumerate(data[0]):
        if item.isdigit():
            x_encoded[:, i] = data[:, i]  # already numeric: copy through
        else:
            label_encoders.append(LabelEncoder())  # one encoder per categorical column
            x_encoded[:, i] = label_encoders[-1].fit_transform(data[:, i])
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement.
    X = x_encoded[:, :-1].astype(int)
    y = x_encoded[:, -1].astype(int)
    return X, y
def model(X, y):
    """Train and report two classifiers on the encoded census data.

    Splits (X, y) 80/20, fits a Gaussian naive-Bayes model and a random
    forest on the training split, and prints each model's accuracy on
    both splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    bayes = GaussianNB().fit(X_train, y_train)
    forest = RandomForestClassifier(
        n_estimators=100, criterion="gini", max_depth=8).fit(X_train, y_train)

    print("贝叶斯训练集准确率:{}, 测试集准确率:{}".format(
        bayes.score(X_train, y_train), bayes.score(X_test, y_test)))
    print("随机森林训练集准确率:{}, 测试集准确率:{}".format(
        forest.score(X_train, y_train), forest.score(X_test, y_test)))
if __name__ == '__main__':
    # Pipeline: load a balanced sample of raw rows, encode the string
    # columns as integers, inspect the result, then train/evaluate.
    data_path = './data/adult.data.txt'
    raw_rows = read_data(data_path)
    X, y = process_data(raw_rows)
    print(X)
    print(y)
    print(X.shape)
    print(y.shape)
    model(X, y)