1. Handwritten Digit Recognition
from os import listdir
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# 1. Read the training set
trainingFileList = listdir('train')             # file names under the train directory
m = len(trainingFileList)                       # number of files in the directory
x_train = np.zeros((m, 1024))                   # training matrix, one 1x1024 row per sample
y_train = []                                    # training labels
for i in range(m):
    fileNameStr = trainingFileList[i]           # file name
    classNumber = int(fileNameStr.split('_')[0])   # class label parsed from the file name
    y_train.append(classNumber)                 # append the label
    fileContent = ''
    fr = open('train/' + fileNameStr)
    for line in fr.readlines():
        fileContent = fileContent + line.strip('\n')   # read line by line and drop the trailing newline
    x_train[i, :] = np.array(list(fileContent))        # turn the digit string into a 1-D array as one training sample
    # e.g. np.array(list('110111001'))

# 2. Read the test set
testsFileList = listdir('test')                 # file names under the test directory
m = len(testsFileList)                          # number of files in the directory
x_test = np.zeros((m, 1024))                    # test matrix, one 1x1024 row per sample
id = []                                         # sample ids of the test set
for i in range(m):
    fileNameStr = testsFileList[i]              # file name
    classNumber = fileNameStr.split('.')[0]     # id parsed from the file name
    id.append(classNumber)                      # append the id
    fileContent = ''
    fr = open('test/' + fileNameStr)
    for line in fr.readlines():
        fileContent = fileContent + line.strip('\n')   # read line by line and drop the trailing newline
    x_test[i, :] = np.array(list(fileContent))         # turn the digit string into a 1-D array as one test sample

# 3. Predict and write the result
clf = KNeighborsClassifier()                    # defaults: n_neighbors=5, Euclidean distance (p=2)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
result = pd.DataFrame(id, columns=['id'])
result['label'] = y_predict
result.to_csv('result.csv', index=False)
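Since the labels of the test files are not available locally, a quick sanity check of the classifier can be done with a hold-out split of the training data before writing result.csv. This is a minimal optional sketch, not part of the submission script, assuming x_train and y_train have been built as above:

from sklearn.model_selection import train_test_split

# hold out 20% of the training samples and measure accuracy on them
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
check = KNeighborsClassifier()
check.fit(x_tr, y_tr)
print('hold-out accuracy:', check.score(x_val, y_val))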
2. Handwritten Digit Recognition Sample Code
import numpy as np
import operator
from os import listdir

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                         # number of known samples in dataSet
    diffMat = np.tile(inX, (m, 1)) - dataSet     # repeat inX m times along the rows and subtract dataSet
    sqDiffMat = diffMat ** 2                     # square each difference
    sqDistances = sqDiffMat.sum(axis=1)          # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5               # take the square root to get the Euclidean distances
    sortedDistIndices = distances.argsort()      # indices that sort the distances in ascending order
    classCount = {}                              # dict of class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]                    # label of the i-th nearest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1   # accumulate the vote
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)   # sort by vote count, descending
    return sortedClassCount[0][0]                # return the most frequent class among the k nearest neighbours
def img2vector(filename):
    returnVect = np.zeros((1, 1024))             # create a 1x1024 zero vector
    fr = open(filename)                          # open the file
    for i in range(32):                          # read 32 lines
        lineStr = fr.readline()                  # read one line
        for j in range(32):                      # copy the first 32 characters of the line into returnVect
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect                            # return the 1x1024 vector

np.set_printoptions(threshold=np.inf)
# print(img2vector("kNN_hand_writing/trainingDigits/1_1.txt"))
def loadTrainData():
    hwLabels = []                                # training labels
    trainingFileList = listdir('train')          # file names under the train directory
    m = len(trainingFileList)                    # number of training files
    trainingMat = np.zeros((m, 1024))            # training matrix, one 1x1024 row per sample
    for i in range(m):                           # parse the class from each file name
        fileNameStr = trainingFileList[i]        # file name
        classNumber = int(fileNameStr.split('_')[0])   # class label
        hwLabels.append(classNumber)             # append the label
        # store the 1x1024 vector of each file as one row of trainingMat
        trainingMat[i, :] = img2vector('train/%s' % (fileNameStr))
    return hwLabels, trainingMat
def handwritingClassTest():
    # misclassification counter (only usable when the true labels are known)
    errorCount = 0.0
    hwLabels, trainingMat = loadTrainData()
    # file names under the test directory
    testFileList = listdir('test')
    # number of test samples
    mTest = len(testFileList)
    # classify each test file and write the predictions
    f = open('result.csv', 'w', encoding='utf-8')
    f.write("id,label\n")
    for i in range(mTest):
        # file name
        fileNameStr = testFileList[i]
        # the id is the file name without its extension
        # classNumber = int(fileNameStr.split('_')[0])
        id = fileNameStr.split('.')[0]
        # build the 1x1024 vector of the test sample
        vectorUnderTest = img2vector('test/%s' % (fileNameStr))
        # predict with k = 3
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        f.write(f"{id},{classifierResult}\n")
        # print(fileNameStr + " predicted %d, true label %d" % (classifierResult, classNumber))
        # if classifierResult != classNumber:
        #     errorCount += 1.0
    # print("misclassified %d samples\nerror rate %f%%" % (errorCount, errorCount / mTest))
    f.close()

handwritingClassTest()
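img2vector reads the 32x32 grid character by character with two nested loops. As a side note, the same conversion can be written with a single NumPy call; this is an optional sketch under the assumption that every file really contains 32 lines of 32 '0'/'1' characters:

import numpy as np

def img2vector_np(filename):
    # read the 32x32 character grid and flatten it into a 1x1024 float vector
    with open(filename) as fr:
        rows = [list(line.strip()[:32]) for line in fr.readlines()[:32]]
    return np.array(rows, dtype=float).reshape(1, 1024)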
3. Dating Site Matching Improvement Sample Code
import operator
import numpy as np
from os import listdir

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                         # number of known samples in dataSet
    diffMat = np.tile(inX, (m, 1)) - dataSet     # repeat inX m times along the rows and subtract dataSet
    sqDiffMat = diffMat ** 2                     # square each difference
    sqDistances = sqDiffMat.sum(axis=1)          # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5               # take the square root to get the Euclidean distances
    sortedDistIndices = distances.argsort()      # indices that sort the distances in ascending order
    classCount = {}                              # dict of class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]                    # label of the i-th nearest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1   # accumulate the vote
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)   # sort by vote count, descending
    return sortedClassCount[0][0]                # return the most frequent class among the k nearest neighbours
def file2matrix(filename):
    # open the file
    fr = open(filename)
    # read all lines and skip the header row
    arrayOLines = fr.readlines()
    arrayOLines = arrayOLines[1:]
    # number of data rows
    numberOfLines = len(arrayOLines)
    # feature matrix: numberOfLines rows, 3 columns
    returnMat = np.zeros((numberOfLines, 3))
    # class label vector
    classLabelVector = []
    # row index
    index = 0
    for line in arrayOLines:
        # strip() with no argument removes surrounding whitespace ('\n', '\r', '\t', ' ')
        line = line.strip()
        # split the line on commas
        listFromLine = line.split(',')
        # the first three columns are the features
        returnMat[index, :] = listFromLine[0:3]
        # the last column is the label: 0 = dislike, 1 = mildly attractive, 2 = very attractive
        classLabelVector.append(int(listFromLine[-1]))
        # if listFromLine[-1] == '0':
        #     classLabelVector.append(1)
        # elif listFromLine[-1] == '1':
        #     classLabelVector.append(2)
        # elif listFromLine[-1] == '2':
        #     classLabelVector.append(3)
        index += 1
    return returnMat, classLabelVector
def testfile2matrix(filename):
    # open the file
    fr = open(filename)
    # read all lines and skip the header row
    arrayOLines = fr.readlines()
    arrayOLines = arrayOLines[1:]
    # number of data rows
    numberOfLines = len(arrayOLines)
    # feature matrix: numberOfLines rows, 3 columns
    returnMat = np.zeros((numberOfLines, 3))
    # row index and list of sample ids
    index = 0
    idlst = []
    for line in arrayOLines:
        # strip surrounding whitespace
        line = line.strip()
        # split the line on commas
        listFromLine = line.split(',')
        # column 0 is the sample id, columns 1-3 are the features
        returnMat[index, :] = listFromLine[1:4]
        idlst.append(listFromLine[0])
        index += 1
    return returnMat, idlst
def autoNorm(dataSet):
    # per-column minimum and maximum
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # range of each column
    ranges = maxVals - minVals
    # matrix with the same shape as dataSet
    normDataSet = np.zeros(np.shape(dataSet))
    # number of rows
    m = dataSet.shape[0]
    # subtract the column minimums
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    # divide by the ranges to normalize each feature to [0, 1]
    normDataSet = normDataSet / np.tile(ranges, (m, 1))
    # return the normalized data, the ranges and the minimums
    return normDataSet, ranges, minVals
def datingClassTest():
    # training file
    filename = "train/train.csv"
    # feature matrix and label vector of the training set
    datingDataMat, datingLabels = file2matrix(filename)
    # hold-out ratio used in the original book example (not used in this submission script)
    hoRatio = 0.10
    # normalize the training features
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # read and normalize the test features
    testDataMat, idlst = testfile2matrix("test/test.csv")
    testDataMat, _, _ = autoNorm(testDataMat)
    # number of training rows
    m = normMat.shape[0]
    # number of test samples
    numTestVecs = testDataMat.shape[0]
    # misclassification counter (unused, the test labels are unknown)
    errorCount = 0.0
    with open('result.csv', 'w') as f:
        f.write("id,label\n")
        for i in range(numTestVecs):
            # classify each test sample against the whole training set with k = 4
            classifierResult = classify0(testDataMat[i, :], normMat, datingLabels, 4)
            f.write(f"{idlst[i]},{classifierResult}\n")

if __name__ == '__main__':
    datingClassTest()
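Note that datingClassTest normalizes the test matrix with its own minimums and ranges. A common refinement is to reuse the statistics computed from the training set so both sets share exactly the same scale; a minimal sketch, assuming the variables already defined inside datingClassTest:

# reuse the training-set ranges and minimums instead of re-normalizing the test set independently
testDataMat, idlst = testfile2matrix("test/test.csv")
testNorm = (testDataMat - minVals) / ranges
classifierResult = classify0(testNorm[0, :], normMat, datingLabels, 4)   # first test sample, for illustration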
4. Contact Lens Selection Sample Code
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
# import pydotplus
# from sklearn.externals.six import StringIO

if __name__ == '__main__':
    with open('train/train.csv', 'r') as fr:                             # load the training file
        lenses = [inst.strip().split(',') for inst in fr.readlines()[1:]]   # split each row, skipping the header
    lenses_target = []                                                   # class of each training sample
    for each in lenses:
        lenses_target.append(each[-1])
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']        # feature names
    lenses_list = []                                                     # temporary list for one feature column
    lenses_dict = {}                                                     # dict of feature name -> column, used to build the DataFrame
    for each_label in lensesLabels:                                      # collect each feature column
        for each in lenses:
            lenses_list.append(each[lensesLabels.index(each_label)])
        lenses_dict[each_label] = lenses_list
        lenses_list = []
    print(lenses_dict)                                                   # print the collected columns
    lenses_pd = pd.DataFrame(lenses_dict)                                # build the pandas DataFrame
    print(lenses_pd)
    le = LabelEncoder()                                                  # LabelEncoder turns the string categories into integers
    for col in lenses_pd.columns:                                        # encode each column
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    print(lenses_pd)
    clf = tree.DecisionTreeClassifier(max_depth=4)                       # create the DecisionTreeClassifier
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)              # fit the decision tree
    test_csv = pd.read_csv("test/test.csv")
    test_pd = pd.DataFrame(test_csv, columns=["age", "prescript", "astigmatic", "tearRate"])
    for col in test_pd.columns:                                          # encode each test column (note: fit_transform re-fits the encoder on the test data)
        test_pd[col] = le.fit_transform(test_pd[col])
    print(test_pd)
    result = clf.predict(test_pd)
    with open("result.csv", 'w') as f:                                   # map the predicted class names to the submission ids
        f.write('id,class\n')
        for i, x in enumerate(result):
            if x == "no lenses":
                f.write(f"{i+1},1\n")
            elif x == "soft":
                f.write(f"{i+1},2\n")
            elif x == "hard":
                f.write(f"{i+1},3\n")
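To inspect what the fitted tree actually learned (the commented-out pydotplus import hints at visualization), newer scikit-learn versions can print a text rendering directly; a small optional sketch, assuming clf and lensesLabels from the script above:

from sklearn import tree

# print the decision rules using the original feature names
print(tree.export_text(clf, feature_names=lensesLabels))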
5. Spam Email Classification Sample Code
# -*- coding: UTF-8 -*-
import numpy as np
import random
import re

"""
Function: vectorize inputSet according to the vocabulary vocabList; each element of the vector is 1 or 0
Parameters:
    vocabList - the list returned by createVocabList
    inputSet - a tokenized document (list of words)
Returns:
    returnVec - the document vector (set-of-words model)
"""
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)                     # create a vector of zeros
    for word in inputSet:                                # loop over the words of the document
        if word in vocabList:                            # if the word is in the vocabulary, set its slot to 1
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec                                     # return the document vector
"""
函数说明:根据vocabList词汇表,构建词袋模型
Parameters:
vocabList - createVocabList返回的列表
inputSet - 切分的词条列表
Returns:
returnVec - 文档向量,词袋模型
"""
defbagOfWords2VecMN(vocabList, inputSet):
returnVec = [0]*len(vocabList) #创建一个其中所含元素都为0的向量
forwordininputSet: #遍历每个词条
• ifwordinvocabList: #如果词条存在于词汇表中,则计数加一
• returnVec[vocabList.index(word)] += 1
returnreturnVec #返回词袋模型
"""
函数说明:朴素贝叶斯分类器训练函数
Parameters:
trainMatrix - 训练文档矩阵,即setOfWords2Vec返回的returnVec构成的矩阵
trainCategory - 训练类别标签向量,即loadDataSet返回的classVec
Returns:
p0Vect - 非侮辱类的条件概率数组
p1Vect - 侮辱类的条件概率数组
pAbusive - 文档属于侮辱类的概率
"""
deftrainNB0(trainMatrix,trainCategory):
numTrainDocs = len(trainMatrix) #计算训练的文档数目
numWords = len(trainMatrix[0]) #计算每篇文档的词条数
pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率
p0Num = np.ones(numWords); p1Num = np.ones(numWords) #创建numpy.ones数组,词条出现数初始化为1,拉普拉斯平滑
p0Denom = 2.0; p1Denom = 2.0 #分母初始化为2,拉普拉斯平滑
foriinrange(numTrainDocs):
• iftrainCategory[i] == 1: #统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)···
• p1Num += trainMatrix[i]
• p1Denom += sum(trainMatrix[i])
• else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)···
• p0Num += trainMatrix[i]
• p0Denom += sum(trainMatrix[i])
p1Vect = np.log(p1Num/p1Denom) #取对数,防止下溢出
p0Vect = np.log(p0Num/p0Denom)
returnp0Vect,p1Vect,pAbusive #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率
"""
函数说明:朴素贝叶斯分类器分类函数
Parameters:
vec2Classify - 待分类的词条数组
p0Vec - 非侮辱类的条件概率数组
p1Vec -侮辱类的条件概率数组
pClass1 - 文档属于侮辱类的概率
Returns:
0 - 属于非侮辱类
1 - 属于侮辱类
"""
defclassifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
p1 = sum(vec2Classify*p1Vec) +np.log(pClass1) #对应元素相乘。logA * B = logA + logB,所以这里加上log(pClass1)
p0 = sum(vec2Classify*p0Vec) +np.log(1.0-pClass1)
ifp1>p0:
• return1
else:
• return0
"""
函数说明:接收一个大字符串并将其解析为字符串列表
Parameters:
无
Returns:
无
"""
deftextParse(bigString): #将字符串转换为字符列表
listOfTokens = re.split(r'\W+', bigString) #将特殊符号作为切分标志进行字符串切分,即非字母、非数字
return [tok.lower() fortokinlistOfTokensiflen(tok) >2] #除了单个字母,例如大写的I,其它单词变成小写
defcreateVocabList(dataSet):
vocabSet = set([]) #创建一个空的不重复列表
fordocumentindataSet:
• vocabSet = vocabSet|set(document) #取并集
returnlist(vocabSet)
"""
函数说明:测试朴素贝叶斯分类器
Parameters:
无
Returns:
无
"""
defspamTest():
docList = []; classList = []; fullText = []
foriinrange(1, 21): #遍历25个txt文件
• wordList = textParse(open('train/%d_spam.txt'%i, encoding='ISO-8859-1').read()) #读取每个垃圾邮件,并字符串转换成字符串列表
• docList.append(wordList)
• fullText.append(wordList)
• classList.append(1) #标记垃圾邮件,1表示垃圾文件
• wordList = textParse(open('train/%d_ham.txt'%i, encoding='ISO-8859-1').read()) #读取每个非垃圾邮件,并字符串转换成字符串列表
• docList.append(wordList)
• fullText.append(wordList)
• classList.append(0) #标记非垃圾邮件,1表示垃圾文件
foriinrange(1, 11): #遍历25个txt文件
• wordList = textParse(open('test/%d.txt'%i, encoding='ISO-8859-1').read()) #读取每个垃圾邮件,并字符串转换成字符串列表
• docList.append(wordList)
• fullText.append(wordList)
• classList.append(-1) #标记垃圾邮件,1表示垃圾文件
vocabList = createVocabList(docList) #创建词汇表,不重复
trainingSet = list(range(40))
testSet = list(range(40, 50)) #创建存储训练集的索引值的列表和测试集的索引值的列表
trainMat = []; trainClasses = [] #创建训练集矩阵和训练集类别标签系向量
fordocIndexintrainingSet: #遍历训练集
• trainMat.append(setOfWords2Vec(vocabList, docList[docIndex])) #将生成的词集模型添加到训练矩阵中
• trainClasses.append(classList[docIndex]) #将类别添加到训练集类别标签系向量中
p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses)) #训练朴素贝叶斯模型
errorCount = 0 #错误分类计数
withopen("result.csv", "w") asf:
• f.write("id,label\n")
• fordocIndexintestSet: #遍历测试集
• wordVector = setOfWords2Vec(vocabList, docList[docIndex]) #测试集的词集模型
• r = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
• f.write(f"{docIndex-39},{r}\n")
if__name__ == '__main__':
spamTest()
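The hand-rolled trainNB0/classifyNB pair above is essentially a Naive Bayes classifier with Laplace smoothing over word-occurrence vectors. For comparison, a rough scikit-learn equivalent looks like the following sketch; it assumes the docList and classList variables built inside spamTest (the first 40 documents are labeled training data, the last 10 are the test emails):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = [' '.join(doc) for doc in docList]        # re-join the token lists into plain strings
vec = CountVectorizer()
X_train = vec.fit_transform(texts[:40])           # bag-of-words counts for the training emails
nb = MultinomialNB()                              # multinomial NB with Laplace smoothing (alpha=1.0 by default)
nb.fit(X_train, classList[:40])
print(nb.predict(vec.transform(texts[40:])))      # predictions for the 10 test emails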
6. Horse Colic Mortality Prediction
from sklearn.linear_model import LogisticRegression

"""
Function: build a logistic regression classifier with sklearn
Parameters:
    none
Returns:
    none
"""
def colicSklearn():
    frTrain = open('train/horseColicTraining.txt')            # open the training set
    frTest = open('test/horseColicTest.txt')                  # open the test set
    trainingSet = []; trainingLabels = []
    testSet = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):                    # all columns except the last are features
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))            # the last column is the label
    for line in frTest.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine)):                        # the test file has no label column
            lineArr.append(float(currLine[i]))
        testSet.append(lineArr)
    classifier = LogisticRegression(solver='sag', max_iter=5000).fit(trainingSet, trainingLabels)
    predictions = classifier.predict(testSet)                 # predict the test set
    with open("result.csv", "w") as f:
        f.write("predict\n")
        f.write('\n'.join([str(x) for x in predictions]))

if __name__ == '__main__':
    colicSklearn()
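The 'sag' solver can be slow to converge on unscaled features, which is why max_iter is raised to 5000. An optional variant standardizes the features first; a minimal sketch, assuming trainingSet, trainingLabels and testSet as built inside colicSklearn:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# standardize each feature, then fit the same logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='sag', max_iter=5000))
pipe.fit(trainingSet, trainingLabels)
print(pipe.predict(testSet))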
7. Fruit Classifier Sample Code
import numpy as np
import operator
from os import listdir
import pandas as pd

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                          # number of known samples in dataSet
    diffMat = np.tile(inX, (m, 1)) - dataSet      # repeat inX m times along the rows and subtract dataSet
    sqDiffMat = diffMat ** 2                      # square each difference
    sqDistances = sqDiffMat.sum(axis=1)           # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5                # take the square root to get the Euclidean distances
    sortedDistIndices = distances.argsort()       # indices that sort the distances in ascending order
    classCount = {}                               # dict of class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]                    # label of the i-th nearest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1   # accumulate the vote
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)   # sort by vote count, descending
    return sortedClassCount[0][0]                 # return the most frequent class among the k nearest neighbours

train = pd.read_csv('train/train.csv')
# dataSet = np.array([[250,100],[270,120],[111,230],[130,260],[200,80],[70,190]])
dataSet = np.array(list(zip(train['x'], train['y'])))
# print(dataSet)
labels = np.array(list(train['class']))
# inX = [105, 210]
test = pd.read_csv('test/test.csv')
testSet = zip(test['x'], test['y'])
with open('result.csv', 'w', encoding='utf-8') as f:
    f.write('class\n')
    for tx in testSet:
        f.write(str(classify0(tx, dataSet, labels, 3)) + '\n')
8. Breast Cancer Prediction
import pandas as pd
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

test_df = pd.read_csv('test/test.csv')
train_df = pd.read_csv('train/train.csv')
X, y = train_df.iloc[:, 2:], train_df.iloc[:, 1]       # features start at column 2, the label is column 1
test_X = test_df.iloc[:, 1:]                           # the test file has no label column
scaler = MinMaxScaler()                                # scale every feature to [0, 1]
scaler.fit(X)
X = scaler.transform(X)
test_X = scaler.transform(test_X)
knn = neighbors.KNeighborsClassifier()
# fit on the training data
knn.fit(X, y)
# predict the test set
predict = knn.predict(test_X)
result = pd.DataFrame(data={'ID': range(1, len(predict) + 1), 'Diagnosis': predict})
result.to_csv('result.csv', index=False)
ans_df = pd.read_csv('answer.csv')
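The script reads answer.csv but never uses it. If that file really holds the true diagnoses in the same ID/Diagnosis layout as result.csv (an assumption), the already-imported accuracy_score can close the loop:

# compare the predictions against the answer file, assuming its second column is the true Diagnosis
print('accuracy:', accuracy_score(ans_df.iloc[:, 1], predict))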
9. SVM-Based Handwritten Digit Recognition
Problem:
import numpy as np
from os import listdir
from sklearn.svm import SVC
import pandas as pd

def img2vector(filename):
    """
    Convert a 32x32 binary image file into a 1x1024 vector.
    Parameters:
        filename - the file name
    Returns:
        returnVect - the 1x1024 vector of the binary image
    """
    # create a 1x1024 zero vector
    returnVect = np.zeros((1, 1024))
    # open the file
    fr = open(filename)
    # read 32 lines
    for i in range(32):
        # read one line
        lineStr = fr.readline()
        # copy the first 32 characters of the line into returnVect
        for j in range(32):
            returnVect[0, 32*i+j] = int(lineStr[j])
    # return the 1x1024 vector
    return returnVect
def handwritingClassTest():
    """
    Handwritten digit classification test.
    Parameters:
        none
    Returns:
        none
    """
    # training labels
    hwLabels = []
    # file names under the train directory
    trainingFileList = listdir('train')
    # number of training files
    m = len(trainingFileList)
    # training matrix, one 1x1024 row per sample
    trainingMat = np.zeros((m, 1024))
    # parse the class from each training file name
    for i in range(m):
        # file name
        fileNameStr = trainingFileList[i]
        # class label
        classNumber = int(fileNameStr.split('_')[0])
        # append the label
        hwLabels.append(classNumber)
        # store the 1x1024 vector of each file as one row of trainingMat
        trainingMat[i,:] = img2vector('train/%s' % (fileNameStr))
    clf = SVC(C=200, kernel='rbf', gamma='auto')
    clf.fit(trainingMat, hwLabels)
    # file names under the test directory
    testFileList = listdir('test')
    # misclassification counter (only usable when the true labels are known)
    errorCount = 0.0
    # number of test samples
    mTest = len(testFileList)
    # list of predictions
    predicts = []
    # classify each test file
    for i in range(mTest):
        # file name
        fileNameStr = testFileList[i]
        # class label (unknown for the test set)
        # classNumber = int(fileNameStr.split('_')[0])
        # build the 1x1024 vector of the test sample
        vectorUnderTest = img2vector('test/%s' % (fileNameStr))
        # predict
        # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        classifierResult = clf.predict(vectorUnderTest)
        predicts.append(classifierResult[0])
        # print("predicted %d, true label %d" % (classifierResult, classNumber))
        # if classifierResult != classNumber:
        #     errorCount += 1.0
    # print("misclassified %d samples\nerror rate %f%%" % (errorCount, errorCount/mTest * 100))
    pd.DataFrame(data={'num': predicts}).to_csv('result.csv', index=False)

if __name__ == '__main__':
    handwritingClassTest()
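C=200 and the RBF kernel are fixed by hand here. A small grid search is a common way to pick them instead; this is an optional sketch, assuming trainingMat and hwLabels as built inside handwritingClassTest:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# try a few C/gamma combinations with 3-fold cross-validation
param_grid = {'C': [1, 10, 100, 200], 'gamma': ['scale', 'auto']}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=3)
search.fit(trainingMat, hwLabels)
print(search.best_params_, search.best_score_)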
10. Flower Prediction
import pandas as pd
# from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

train_data = pd.read_csv('train/train.csv')
test_data = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
# lr = LogisticRegression()
lr = KNeighborsClassifier(n_neighbors=3)
lr.fit(X, y)
predict = lr.predict(test_data)
pd.DataFrame(data={'class': predict}).to_csv('result.csv', index=False)
10.1 Flower Classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=3)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'Id': range(1, len(predict) + 1), 'Species': predict}).to_csv('result.csv', index=False)
11. Heart Disease Prediction
# import the libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# read the training and test data
train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=2)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'ID': range(1, len(predict) + 1), 'target': predict}).to_csv('result.csv', index=False)
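The heart-disease features mix very different numeric ranges, and KNN is purely distance based, so the columns with the largest values dominate the vote. Scaling usually helps; a minimal sketch, assuming X, y and test from the script above:

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# scale every feature before the distance computation so no single column dominates
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=2))
model.fit(X, y)
predict = model.predict(test)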
12. Predicting Customer Churn for a Telecom Service Provider
# import the libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# read the training and test data
train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=3)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'ID': range(1, len(predict) + 1), 'churn': predict}).to_csv('result.csv', index=False)
13. Diabetes Prediction
Problem:
import math
import numpy as np
import random
import warnings
warnings.filterwarnings("ignore")

def load_diabetes():
    # read comma-separated rows from standard input until a blank line
    X = []
    y = []
    line = input()
    while line:
        data = [l for l in line.strip().split(',')]
        X.append(np.array([float(d) for d in data[:-1]]))
        y.append(float(data[-1]))
        line = input()
    return np.array(X), np.array(y)

def train_test_split(X, Y, test_size=0.2, random_state=2333):
    # randomly split the samples into a training part and a test part
    random.seed(random_state)
    n_samples = len(X)
    indices = np.arange(n_samples)
    train_indexs = list(set(random.sample(indices.tolist(), int(n_samples * (1 - test_size)))))
    test_indexs = [k for k in indices if k not in train_indexs]
    return X[train_indexs], X[test_indexs], Y[train_indexs], Y[test_indexs]
X, y = load_diabetes()

class LinearRegression:
    def __init__(self):
        '''Initialize the model'''
        self.coef_ = None
        self.interception_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        '''Fit the model on X_train, y_train with the normal equation'''
        assert X_train.shape[0] == y_train.shape[0], 'the number of X_train must equal to the number of y_train'
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        assert self._theta is not None, 'must fit before predict'
        assert X_predict.shape[1] == len(self.coef_), 'the feature number of X_predict must equal to X_train'
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    # mean squared error
    def mse(self, y, y_pre):
        sq_error = (y_pre - y) ** 2
        sum_sq_error = np.sum(sq_error)
        mse = sum_sq_error / y.size
        return mse

    # root mean squared error
    def rmse(self, y, y_pre):
        sq_error = (y_pre - y) ** 2
        total_sq_error = np.sum(sq_error)
        mse = total_sq_error / y.size
        rmse = math.sqrt(mse)
        return rmse

    def r2_score(self, y, y_pre):
        return 1 - (self.mse(y, y_pre) / np.var(y))

    def score(self, X_test, y_test):
        '''Evaluate the model on the test data: return R^2 and RMSE'''
        y_predict = self.predict(X_test)
        return self.r2_score(y_test, y_predict), self.rmse(y_test, y_predict)

    def __repr__(self):
        return 'LinearRegression()'
# call train_test_split to split the data set; the arguments with defaults can be omitted
x_train, x_test, y_train, y_test = train_test_split(X, y)
reg = LinearRegression()
reg.fit_normal(x_train, y_train)
r2, rmse = reg.score(x_test, y_test)
print("debug_begin")

def test(rmse, r2):
    if rmse > 50 or r2 > 0.5:
        print(True)
    else:
        print(False)

print("debug_end")
test(rmse, r2)
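fit_normal solves the normal equation by explicitly inverting X_b.T.dot(X_b), which fails when that matrix is singular and is numerically fragile. np.linalg.lstsq solves the same least-squares problem more robustly; a minimal sketch, assuming x_train and y_train from the split above:

import numpy as np

# least-squares solution of X_b * theta = y_train without forming an explicit inverse
X_b = np.hstack([np.ones((len(x_train), 1)), x_train])
theta, *_ = np.linalg.lstsq(X_b, y_train, rcond=None)
print(theta)   # theta[0] is the intercept, theta[1:] are the coefficients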
14. KNN Algorithm Implementation
Problem:
import numpy as np
import operator
from os import listdir
import sys
import codecs
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())   # make sure the Chinese labels print as UTF-8

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                          # number of known samples in dataSet
    diffMat = np.tile(inX, (m, 1)) - dataSet      # repeat inX m times along the rows and subtract dataSet
    sqDiffMat = diffMat ** 2                      # square each difference
    # sum per row (axis=0 sums columns, axis=1 sums rows)
    sqDistances = sqDiffMat.sum(axis=1)
    # take the square root to get the Euclidean distances
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()       # indices that sort the distances in ascending order
    classCount = {}                               # dict of class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]                    # label of the i-th nearest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1   # accumulate the vote
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)   # sort by vote count, descending
    return sortedClassCount[0][0]                 # return the most frequent class among the k nearest neighbours

dataSet = np.array([[250, 100], [270, 120], [111, 230], [130, 260], [200, 80], [70, 190]])
labels = ["理科生", "理科生", "文科生", "文科生", "理科生", "文科生"]
f = open("in1-1.txt", 'r')
for line in f.readlines():
    inX = line.split(" ")
    inX = [int(x) for x in inX]
    print(classify0(inX, dataSet, labels, 3))
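For a quick check without the in1-1.txt input file, classify0 can also be called directly on a single point; a small example using the hard-coded dataSet and labels above:

# (105, 210) lies closest to the three points labeled 文科生, so k=3 voting returns 文科生
print(classify0(np.array([105, 210]), dataSet, labels, 3))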