Untitled

1. Handwritten digit recognition

from os import listdir
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the training set
trainingFileList = listdir('train')          # file names under the train directory
m = len(trainingFileList)                    # number of files in the directory
x_train = np.zeros((m, 1024))                # training matrix: one 1x1024 row per sample
y_train = []                                 # training labels
for i in range(m):
    fileNameStr = trainingFileList[i]        # file name, e.g. '3_12.txt'
    classNumber = int(fileNameStr.split('_')[0])  # digit label parsed from the file name
    y_train.append(classNumber)              # collect the label
    fileContent = ''
    fr = open('train/' + fileNameStr)
    for line in fr.readlines():
        fileContent += line.strip('\n')      # read line by line, dropping the trailing newline
    x_train[i, :] = np.array(list(fileContent))  # flatten the digit string into one 1x1024 sample

# 2. Load the test set
testsFileList = listdir('test')              # file names under the test directory
m = len(testsFileList)                       # number of files in the directory
x_test = np.zeros((m, 1024))                 # test matrix
id = []                                      # sample ids of the test set
for i in range(m):
    fileNameStr = testsFileList[i]           # file name
    classNumber = fileNameStr.split('.')[0]  # id parsed from the file name
    id.append(classNumber)                   # collect the id
    fileContent = ''
    fr = open('test/' + fileNameStr)
    for line in fr.readlines():
        fileContent += line.strip('\n')      # read line by line, dropping the trailing newline
    x_test[i, :] = np.array(list(fileContent))  # flatten into one 1x1024 test sample

# 3. Predict and write the result
clf = KNeighborsClassifier()                 # defaults: n_neighbors=5, Euclidean distance (p=2)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
result = pd.DataFrame(id, columns=['id'])
result['label'] = y_predict
result.to_csv('result.csv', index=False)
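Before submitting, it can help to estimate accuracy locally. A minimal sketch, assuming x_train and y_train have been built as above (the 80/20 split ratio is an arbitrary choice, not part of the original task):

from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled data to estimate the classifier's accuracy.
xa, xb, ya, yb = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
check = KNeighborsClassifier().fit(xa, ya)
print('hold-out accuracy:', check.score(xb, yb))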

2. Handwritten digit recognition sample code

import numpy as np
import operator
from os import listdir


def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                     # number of rows in dataSet, i.e. known samples
    diffMat = np.tile(inX, (m, 1)) - dataSet # replicate inX m times along rows, subtract dataSet
    sqDiffMat = diffMat ** 2                 # square each difference
    sqDistances = sqDiffMat.sum(axis=1)      # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5           # square root gives the Euclidean distance
    sortedDistIndices = distances.argsort()  # argsort returns the indices that sort the distances ascending
    classCount = {}                          # dictionary class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # class of the i-th nearest neighbour
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1  # accumulate votes per class
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort votes descending
    return sortedClassCount[0][0]            # the most voted class is the prediction
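As a quick illustration of classify0's calling convention, here is a toy call (the points and labels are invented for this example):

# Hypothetical 2-D data: two clusters labeled 'A' and 'B'.
demo_data = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
demo_labels = ['A', 'A', 'B', 'B']
print(classify0(np.array([0.1, 0.1]), demo_data, demo_labels, 3))  # prints 'B'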
def img2vector(filename):
    returnVect = np.zeros((1, 1024))         # create a 1x1024 zero vector
    fr = open(filename)                      # open the file
    for i in range(32):                      # read 32 lines
        lineStr = fr.readline()              # read one line
        for j in range(32):                  # copy the first 32 characters of the line
            returnVect[0, 32*i+j] = int(lineStr[j])
    return returnVect                        # the flattened 1x1024 vector


np.set_printoptions(threshold=np.inf)
# print(img2vector("kNN_hand_writing/trainingDigits/1_1.txt"))
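The character-by-character loop above is easy to follow but slow on large folders. A hedged vectorized alternative, assuming the same 32x32 text format (img2vector_fast is a name introduced here, not from the original):

def img2vector_fast(filename):
    # Hypothetical variant: read the whole file at once, drop newlines,
    # and let numpy cast the 1024 digit characters in one shot.
    with open(filename) as fr:
        flat = fr.read().replace('\n', '')[:1024]
    return np.array(list(flat), dtype=float).reshape(1, 1024)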
def loadTrainData():
    hwLabels = []                            # training labels
    trainingFileList = listdir('train')      # file names under the train directory
    m = len(trainingFileList)                # number of files in the directory
    trainingMat = np.zeros((m, 1024))        # training matrix
    for i in range(m):                       # parse the class label from each file name
        fileNameStr = trainingFileList[i]    # file name
        classNumber = int(fileNameStr.split('_')[0])  # digit label
        hwLabels.append(classNumber)         # collect the label
        # store each file's 1x1024 vector as one row of trainingMat
        trainingMat[i, :] = img2vector('train/%s' % (fileNameStr))
    return hwLabels, trainingMat
def handwritingClassTest():
    errorCount = 0.0                         # error counter (unused: test labels are unknown here)
    hwLabels, trainingMat = loadTrainData()
    testFileList = listdir('test')           # file names under the test directory
    mTest = len(testFileList)                # number of test samples
    # parse each test file, classify it, and write the prediction
    f = open('result.csv', 'w', encoding='utf-8')
    f.write("id,label\n")
    for i in range(mTest):
        fileNameStr = testFileList[i]        # file name
        # classNumber = int(fileNameStr.split('_')[0])
        id = fileNameStr.split('.')[0]       # sample id parsed from the file name
        vectorUnderTest = img2vector('test/%s' % (fileNameStr))  # 1x1024 test vector
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)  # kNN prediction
        f.write(f"{id},{classifierResult}\n")
        # print(fileNameStr + " predicted %d\t true %d" % (classifierResult, classNumber))
        # if classifierResult != classNumber:
        #     errorCount += 1.0
    # print("misclassified %d samples\nerror rate %f%%" % (errorCount, errorCount/mTest))
    f.close()


handwritingClassTest()

3. Improving dating-site matching: sample code

import operator
import numpy as np
from os import listdir


def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                     # number of known samples
    diffMat = np.tile(inX, (m, 1)) - dataSet # replicate inX m times along rows, subtract dataSet
    sqDiffMat = diffMat ** 2                 # square each difference
    sqDistances = sqDiffMat.sum(axis=1)      # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5           # square root gives the Euclidean distance
    sortedDistIndices = distances.argsort()  # indices that sort the distances ascending
    classCount = {}                          # dictionary class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # class of the i-th nearest neighbour
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1  # accumulate votes
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort votes descending
    return sortedClassCount[0][0]            # the most voted class is the prediction
def file2matrix(filename):
    fr = open(filename)                      # open the file
    arrayOLines = fr.readlines()             # read all lines
    arrayOLines = arrayOLines[1:]            # skip the header row
    numberOfLines = len(arrayOLines)         # number of data rows
    returnMat = np.zeros((numberOfLines, 3)) # feature matrix: numberOfLines rows, 3 columns
    classLabelVector = []                    # class label vector
    index = 0                                # row index
    for line in arrayOLines:
        line = line.strip()                  # strip surrounding whitespace ('\n', '\r', '\t', ' ')
        listFromLine = line.split(',')       # split the CSV row on commas
        returnMat[index, :] = listFromLine[0:3]  # the first three columns are the features
        # the last column encodes preference: 0 dislike, 1 average charm, 2 very attractive
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector
def testfile2matrix(filename):
    fr = open(filename)                      # open the file
    arrayOLines = fr.readlines()             # read all lines
    arrayOLines = arrayOLines[1:]            # skip the header row
    numberOfLines = len(arrayOLines)         # number of data rows
    returnMat = np.zeros((numberOfLines, 3)) # feature matrix
    index = 0                                # row index
    idlst = []                               # sample ids
    for line in arrayOLines:
        line = line.strip()                  # strip surrounding whitespace
        listFromLine = line.split(',')       # split the CSV row on commas
        returnMat[index, :] = listFromLine[1:4]  # columns 1..3 are the features
        idlst.append(listFromLine[0])        # column 0 is the sample id
        index += 1
    return returnMat, idlst
def autoNorm(dataSet):
    minVals = dataSet.min(0)                 # column-wise minimum
    maxVals = dataSet.max(0)                 # column-wise maximum
    ranges = maxVals - minVals               # per-column range
    normDataSet = np.zeros(np.shape(dataSet))  # np.shape(dataSet) gives the matrix dimensions
    m = dataSet.shape[0]                     # number of rows
    normDataSet = dataSet - np.tile(minVals, (m, 1))    # subtract the minimum
    normDataSet = normDataSet / np.tile(ranges, (m, 1)) # divide by the range: min-max normalization
    return normDataSet, ranges, minVals      # normalized data, per-column ranges, per-column minimums
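In formula terms, autoNorm applies min-max normalization per column: newValue = (oldValue - min) / (max - min), which maps each column into [0, 1]. For example, if a column holds the values 10, 20, and 40, then 20 is normalized to (20 - 10) / (40 - 10) = 1/3.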
def datingClassTest():
    filename = "train/train.csv"             # training file
    # feature matrix and label vector of the training data
    datingDataMat, datingLabels = file2matrix(filename)
    # min-max normalize the training features
    normMat, ranges, minVals = autoNorm(datingDataMat)
    testDataMat, idlst = testfile2matrix("test/test.csv")
    # normalize the test features with the training set's minimums and ranges,
    # so both sets share the same scale (normalizing the test set by its own
    # min/max would put the two sets on inconsistent scales)
    testDataMat = (testDataMat - minVals) / ranges
    numTestVecs = testDataMat.shape[0]       # number of test samples

    with open('result.csv', 'w') as f:
        f.write("id,label\n")
        for i in range(numTestVecs):
            classifierResult = classify0(testDataMat[i, :], normMat, datingLabels, 4)
            f.write(f"{idlst[i]},{classifierResult}\n")


if __name__ == '__main__':
    datingClassTest()

4. Contact lens selection sample code

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
# import pydotplus
# from sklearn.externals.six import StringIO


if __name__ == '__main__':
    with open('train/train.csv', 'r') as fr:                        # load the file
        lenses = [inst.strip().split(',') for inst in fr.readlines()[1:]]  # parse rows, skipping the header
    lenses_target = []                                              # class of each row, kept in a list
    for each in lenses:
        lenses_target.append(each[-1])
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']   # feature names
    lenses_list = []                                                # temporary buffer for one column
    lenses_dict = {}                                                # column name -> values, for pandas
    for each_label in lensesLabels:                                 # build the per-column dictionary
        for each in lenses:
            lenses_list.append(each[lensesLabels.index(each_label)])
        lenses_dict[each_label] = lenses_list
        lenses_list = []
    print(lenses_dict)                                              # inspect the dictionary
    lenses_pd = pd.DataFrame(lenses_dict)                           # build the pandas.DataFrame
    print(lenses_pd)
    encoders = {}                                                   # keep one LabelEncoder per column
    for col in lenses_pd.columns:                                   # encode each column as integers
        encoders[col] = LabelEncoder()
        lenses_pd[col] = encoders[col].fit_transform(lenses_pd[col])
    print(lenses_pd)
    clf = tree.DecisionTreeClassifier(max_depth=4)                  # create the DecisionTreeClassifier
    clf = clf.fit(lenses_pd.values.tolist(), lenses_target)         # build the decision tree
    test_csv = pd.read_csv("test/test.csv")
    test_pd = pd.DataFrame(test_csv, columns=["age", "prescript", "astigmatic", "tearRate"])
    for col in test_pd.columns:                                     # reuse the training encoders:
        # transform (not fit_transform), so test categories map to the same codes as in training
        test_pd[col] = encoders[col].transform(test_pd[col])
    print(test_pd)

    result = clf.predict(test_pd)
    with open("result.csv", 'w') as f:
        f.write('id,class\n')
        for i, x in enumerate(result):
            if x == "no lenses":
                f.write(f"{i+1},1\n")
            elif x == "soft":
                f.write(f"{i+1},2\n")
            elif x == "hard":
                f.write(f"{i+1},3\n")

5. Spam email classification sample code

# -*- coding: UTF-8 -*-
import numpy as np
import random
import re

"""
Function: vectorize inputSet against the vocabulary vocabList;
each element of the output vector is 1 or 0.

Parameters:
    vocabList - the list returned by createVocabList
    inputSet - a tokenized document
Returns:
    returnVec - document vector, set-of-words model
"""
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)         # vector of zeros, one slot per vocabulary word
    for word in inputSet:                    # for each token
        if word in vocabList:                # set the slot to 1 if the token is in the vocabulary
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec                         # the document vector

"""

函数说明:根据vocabList词汇表,构建词袋模型

Parameters:

vocabList - createVocabList返回的列表

inputSet - 切分的词条列表

Returns:

returnVec - 文档向量,词袋模型

"""

defbagOfWords2VecMN(vocabList, inputSet):

returnVec = [0]*len(vocabList) #创建一个其中所含元素都为0的向量

forwordininputSet: #遍历每个词条

• ifwordinvocabList: #如果词条存在于词汇表中,则计数加一

• returnVec[vocabList.index(word)] += 1

returnreturnVec #返回词袋模型

"""

函数说明:朴素贝叶斯分类器训练函数

Parameters:

trainMatrix - 训练文档矩阵,即setOfWords2Vec返回的returnVec构成的矩阵

trainCategory - 训练类别标签向量,即loadDataSet返回的classVec

Returns:

p0Vect - 非侮辱类的条件概率数组

p1Vect - 侮辱类的条件概率数组

pAbusive - 文档属于侮辱类的概率

"""

deftrainNB0(trainMatrix,trainCategory):

numTrainDocs = len(trainMatrix) #计算训练的文档数目

numWords = len(trainMatrix[0]) #计算每篇文档的词条数

pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率

p0Num = np.ones(numWords); p1Num = np.ones(numWords) #创建numpy.ones数组,词条出现数初始化为1,拉普拉斯平滑

p0Denom = 2.0; p1Denom = 2.0 #分母初始化为2,拉普拉斯平滑

foriinrange(numTrainDocs):

• iftrainCategory[i] == 1: #统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)···

• p1Num += trainMatrix[i]

• p1Denom += sum(trainMatrix[i])

• else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)···

• p0Num += trainMatrix[i]

• p0Denom += sum(trainMatrix[i])

p1Vect = np.log(p1Num/p1Denom) #取对数,防止下溢出

p0Vect = np.log(p0Num/p0Denom)

returnp0Vect,p1Vect,pAbusive #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率

"""

函数说明:朴素贝叶斯分类器分类函数

Parameters:

vec2Classify - 待分类的词条数组

p0Vec - 非侮辱类的条件概率数组

p1Vec -侮辱类的条件概率数组

pClass1 - 文档属于侮辱类的概率

Returns:

0 - 属于非侮辱类

1 - 属于侮辱类

"""

defclassifyNB(vec2Classify, p0Vec, p1Vec, pClass1):

p1 = sum(vec2Classify*p1Vec) +np.log(pClass1) #对应元素相乘。logA * B = logA + logB,所以这里加上log(pClass1)

p0 = sum(vec2Classify*p0Vec) +np.log(1.0-pClass1)

ifp1>p0:

• return1

else:

• return0
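In formula form, classifyNB picks the class c that maximizes the log posterior

log P(c) + sum_i x_i * log P(w_i | c)

where x_i is the i-th entry of the document vector and P(w_i | c) is the Laplace-smoothed per-word probability estimated by trainNB0. This is just a restatement of the two lines computing p1 and p0.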

"""

函数说明:接收一个大字符串并将其解析为字符串列表

Parameters:

Returns:

"""

deftextParse(bigString): #将字符串转换为字符列表

listOfTokens = re.split(r'\W+', bigString) #将特殊符号作为切分标志进行字符串切分,即非字母、非数字

return [tok.lower() fortokinlistOfTokensiflen(tok) >2] #除了单个字母,例如大写的I,其它单词变成小写

defcreateVocabList(dataSet):

vocabSet = set([]) #创建一个空的不重复列表

fordocumentindataSet:

• vocabSet = vocabSet|set(document) #取并集

returnlist(vocabSet)

"""

函数说明:测试朴素贝叶斯分类器

Parameters:

Returns:

"""

defspamTest():

docList = []; classList = []; fullText = []

foriinrange(1, 21): #遍历25个txt文件

• wordList = textParse(open('train/%d_spam.txt'%i, encoding='ISO-8859-1').read()) #读取每个垃圾邮件,并字符串转换成字符串列表

• docList.append(wordList)

• fullText.append(wordList)

• classList.append(1) #标记垃圾邮件,1表示垃圾文件

• wordList = textParse(open('train/%d_ham.txt'%i, encoding='ISO-8859-1').read()) #读取每个非垃圾邮件,并字符串转换成字符串列表

• docList.append(wordList)

• fullText.append(wordList)

• classList.append(0) #标记非垃圾邮件,1表示垃圾文件

foriinrange(1, 11): #遍历25个txt文件

• wordList = textParse(open('test/%d.txt'%i, encoding='ISO-8859-1').read()) #读取每个垃圾邮件,并字符串转换成字符串列表

• docList.append(wordList)

• fullText.append(wordList)

• classList.append(-1) #标记垃圾邮件,1表示垃圾文件

vocabList = createVocabList(docList) #创建词汇表,不重复

trainingSet = list(range(40))

testSet = list(range(40, 50)) #创建存储训练集的索引值的列表和测试集的索引值的列表

trainMat = []; trainClasses = [] #创建训练集矩阵和训练集类别标签系向量

fordocIndexintrainingSet: #遍历训练集

• trainMat.append(setOfWords2Vec(vocabList, docList[docIndex])) #将生成的词集模型添加到训练矩阵中

• trainClasses.append(classList[docIndex]) #将类别添加到训练集类别标签系向量中

p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses)) #训练朴素贝叶斯模型

errorCount = 0 #错误分类计数

withopen("result.csv", "w") asf:

• f.write("id,label\n")

• fordocIndexintestSet: #遍历测试集

• wordVector = setOfWords2Vec(vocabList, docList[docIndex]) #测试集的词集模型

• r = classifyNB(np.array(wordVector), p0V, p1V, pSpam)

• f.write(f"{docIndex-39},{r}\n")

if__name__ == '__main__':

spamTest()
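If token frequency should count rather than mere presence, bagOfWords2VecMN is a drop-in replacement for setOfWords2Vec inside spamTest, applied to both vectorization calls:

# e.g. in spamTest, swap both calls:
trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])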

6. Horse colic mortality prediction

from sklearn.linear_model import LogisticRegression

"""
Function: build a logistic regression classifier with sklearn.
"""
def colicSklearn():
    frTrain = open('train/horseColicTraining.txt')   # open the training set
    frTest = open('test/horseColicTest.txt')         # open the test set
    trainingSet = []; trainingLabels = []
    testSet = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):           # all columns except the last are features
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))   # the last column is the label
    for line in frTest.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine)):               # test rows carry no label
            lineArr.append(float(currLine[i]))
        testSet.append(lineArr)
    classifier = LogisticRegression(solver='sag', max_iter=5000).fit(trainingSet, trainingLabels)
    predictions = classifier.predict(testSet)        # predicted labels for the test set
    with open("result.csv", "w") as f:
        f.write("predict\n")
        f.write('\n'.join([str(x) for x in predictions]))

if __name__ == '__main__':
    colicSklearn()
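The test rows here carry no labels, so the fit can only be sanity-checked on the training data. A minimal addition inside colicSklearn, before the result is written (training accuracy is optimistic, but it catches gross failures):

# Rough sanity check on the fit (not a true generalization estimate).
print('training accuracy:', classifier.score(trainingSet, trainingLabels))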

7. Fruit classifier sample code

import numpy as np
import operator
from os import listdir
import pandas as pd

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                     # number of known samples
    diffMat = np.tile(inX, (m, 1)) - dataSet # replicate inX m times along rows, subtract dataSet
    sqDiffMat = diffMat ** 2                 # square each difference
    sqDistances = sqDiffMat.sum(axis=1)      # sum per row (axis=0 sums columns, axis=1 sums rows)
    distances = sqDistances ** 0.5           # square root gives the Euclidean distance
    sortedDistIndices = distances.argsort()  # indices that sort the distances ascending
    classCount = {}                          # dictionary class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # class of the i-th nearest neighbour
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1  # accumulate votes
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort votes descending
    return sortedClassCount[0][0]            # the most voted class is the prediction

train = pd.read_csv('train/train.csv')
# dataSet = np.array([[250,100],[270,120],[111,230],[130,260],[200,80],[70,190]])
dataSet = np.array(list(zip(train['x'], train['y'])))
# print(dataSet)
labels = np.array(list(train['class']))
# inX = [105,210]
test = pd.read_csv('test/test.csv')
testSet = zip(test['x'], test['y'])
with open('result.csv', 'w', encoding='utf-8') as f:
    f.write('class\n')
    for tx in testSet:
        f.write(str(classify0(tx, dataSet, labels, 3)) + '\n')

8. Breast cancer prediction

import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler

test_df = pd.read_csv('test/test.csv')
train_df = pd.read_csv('train/train.csv')
X, y = train_df.iloc[:, 2:], train_df.iloc[:, 1]     # features from column 2 on, label in column 1
test_X = test_df.iloc[:, 1:]                         # test features (column 0 is the id)
scaler = MinMaxScaler()
scaler.fit(X)                                        # fit the scaler on the training features only
X = scaler.transform(X)
test_X = scaler.transform(test_X)                    # apply the same scaling to the test features
knn = neighbors.KNeighborsClassifier()
# train the model
knn.fit(X, y)
# predict the test set
predict = knn.predict(test_X)
result = pd.DataFrame(data={'ID': range(1, len(predict)+1), 'Diagnosis': predict})
result.to_csv('result.csv', index=False)
# ans_df = pd.read_csv('answer.csv')                 # unused: leftover from optional local scoring

9. SVM-based handwritten digit recognition

Problem:

import numpy as np
from os import listdir
from sklearn.svm import SVC
import pandas as pd

def img2vector(filename):
    """
    Convert a 32x32 binary image file into a 1x1024 vector.

    Parameters:
        filename - the file name
    Returns:
        returnVect - the image as a 1x1024 vector
    """
    returnVect = np.zeros((1, 1024))         # create a 1x1024 zero vector
    fr = open(filename)                      # open the file
    for i in range(32):                      # read 32 lines
        lineStr = fr.readline()              # read one line
        for j in range(32):                  # copy the first 32 characters of the line
            returnVect[0, 32*i+j] = int(lineStr[j])
    return returnVect                        # the flattened 1x1024 vector

def handwritingClassTest():
    """
    Handwritten digit classification test.
    """
    hwLabels = []                            # training labels
    trainingFileList = listdir('train')      # file names under the train directory
    m = len(trainingFileList)                # number of training files
    trainingMat = np.zeros((m, 1024))        # training matrix
    for i in range(m):                       # parse the class label from each file name
        fileNameStr = trainingFileList[i]    # file name
        classNumber = int(fileNameStr.split('_')[0])  # digit label
        hwLabels.append(classNumber)         # collect the label
        # store each file's 1x1024 vector as one row of trainingMat
        trainingMat[i, :] = img2vector('train/%s' % (fileNameStr))
    clf = SVC(C=200, kernel='rbf', gamma='auto')
    clf.fit(trainingMat, hwLabels)
    testFileList = listdir('test')           # file names under the test directory
    errorCount = 0.0                         # error counter (unused: test labels are unknown)
    mTest = len(testFileList)                # number of test samples
    predicts = []                            # prediction list
    for i in range(mTest):                   # classify each test file
        fileNameStr = testFileList[i]        # file name
        # classNumber = int(fileNameStr.split('_')[0])
        vectorUnderTest = img2vector('test/%s' % (fileNameStr))  # 1x1024 test vector
        # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        classifierResult = clf.predict(vectorUnderTest)
        predicts.append(classifierResult[0])
        # print("predicted %d\t true %d" % (classifierResult, classNumber))
        # if classifierResult != classNumber:
        #     errorCount += 1.0
    # print("misclassified %d samples\nerror rate %f%%" % (errorCount, errorCount/mTest * 100))
    pd.DataFrame(data={'num': predicts}).to_csv('result.csv', index=False)

if __name__ == '__main__':
    handwritingClassTest()
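Predicting one 1x1024 row at a time works, but SVC.predict also accepts a matrix. A hedged alternative for the test loop inside handwritingClassTest that predicts all files in one call:

# Hypothetical batch variant: stack all test vectors, then predict once.
testMat = np.zeros((mTest, 1024))
for i in range(mTest):
    testMat[i, :] = img2vector('test/%s' % testFileList[i])
predicts = list(clf.predict(testMat))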

10. Flower prediction

import pandas as pd
# from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

train_data = pd.read_csv('train/train.csv')
test_data = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
# lr = LogisticRegression()
lr = KNeighborsClassifier(n_neighbors=3)
lr.fit(X, y)
predict = lr.predict(test_data)
pd.DataFrame(data={'class': predict}).to_csv('result.csv', index=False)

10.1 Flower classification

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=3)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'Id': range(1, len(predict) + 1), 'Species': predict}).to_csv('result.csv', index=False)

11. Heart disease prediction

# import the libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# load the training and test data
train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=2)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'ID': range(1, len(predict) + 1), 'target': predict}).to_csv('result.csv', index=False)

12. Predicting customer churn for a telecom service provider

# import the libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# load the training and test data
train_data = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]
LRKNN = KNeighborsClassifier(n_neighbors=3)
LRKNN.fit(X, y)
predict = LRKNN.predict(test)
pd.DataFrame(data={'ID': range(1, len(predict) + 1), 'churn': predict}).to_csv('result.csv', index=False)
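KNN is distance-based, so a feature measured on a large scale can dominate the vote. A hedged refinement of the same pipeline with min-max scaling (mirroring the approach used in section 8):

from sklearn.preprocessing import MinMaxScaler

# Scale features before KNN so no single column dominates the distance.
scaler = MinMaxScaler().fit(X)
LRKNN.fit(scaler.transform(X), y)
predict = LRKNN.predict(scaler.transform(test))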

13. Diabetes prediction

Problem:

import math
import numpy as np
import random
import warnings

warnings.filterwarnings("ignore")

def load_diabetes():
    # read comma-separated rows from stdin until an empty line
    X = []
    y = []
    line = input()
    while line:
        data = [l for l in line.strip().split(',')]
        X.append(np.array([float(d) for d in data[:-1]]))  # all but the last field are features
        y.append(float(data[-1]))                          # the last field is the target
        line = input()
    return np.array(X), np.array(y)

def train_test_split(X, Y, test_size=0.2, random_state=2333):
    # randomly split the samples into train and test index sets
    random.seed(random_state)
    n_samples = len(X)
    indices = np.arange(n_samples)
    train_indexs = list(set(random.sample(indices.tolist(), int(n_samples*(1-test_size)))))
    test_indexs = [k for k in indices if k not in train_indexs]
    return X[train_indexs], X[test_indexs], Y[train_indexs], Y[test_indexs]

X, y = load_diabetes()

class LinearRegression:

    def __init__(self):
        '''initialize the model'''
        self.coef_ = None
        self.interception_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        '''fit the model to X_train, y_train via the normal equation'''
        assert X_train.shape[0] == y_train.shape[0], 'the number of X_train must equal to the number of y_train'
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])  # prepend a bias column of ones
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        assert self._theta is not None, 'must fit before predict'
        assert X_predict.shape[1] == len(self.coef_), 'the feature number of X_predict must equal to X_train'
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def mse(self, y, y_pre):
        # mean squared error
        sq_error = (y_pre - y) ** 2
        sum_sq_error = np.sum(sq_error)
        mse = sum_sq_error / y.size
        return mse

    def rmse(self, y, y_pre):
        # root mean squared error
        sq_error = (y_pre - y) ** 2
        total_sq_error = np.sum(sq_error)
        mse = total_sq_error / y.size
        rmse = math.sqrt(mse)
        return rmse

    def r2_score(self, y, y_pre):
        return 1 - (self.mse(y, y_pre) / np.var(y))

    def score(self, X_test, y_test):
        '''evaluate the model on a test set: returns (R^2, RMSE)'''
        y_predict = self.predict(X_test)
        return self.r2_score(y_test, y_predict), self.rmse(y_test, y_predict)

    def __repr__(self):
        return 'LinearRegression()'
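fit_normal solves the least-squares problem in closed form. With the bias column folded into X_b, the fitted parameter vector is

theta = (X_b^T X_b)^(-1) X_b^T y

which is exactly what the np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train) line computes; the first entry of theta is the intercept and the remaining entries are the coefficients.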

# split the data with train_test_split (the default keyword arguments can be left as-is)
x_train, x_test, y_train, y_test = train_test_split(X, y)
reg = LinearRegression()
reg.fit_normal(x_train, y_train)
r2, rmse = reg.score(x_test, y_test)
print("debug_begin")

def test(rmse, r2):
    if rmse > 50 or r2 > 0.5:
        print(True)
    else:
        print(False)

print("debug_end")
test(rmse, r2)

14. Implementing the kNN algorithm

Problem:

import numpy as np
import operator
from os import listdir
import sys
import codecs

sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())

def classify0(inX, dataSet, labels, k):
    m = dataSet.shape[0]                     # number of known samples
    diffMat = np.tile(inX, (m, 1)) - dataSet # replicate inX m times along rows, subtract dataSet
    sqDiffMat = diffMat ** 2                 # square each difference
    # sum per row (axis=0 sums columns, axis=1 sums rows)
    sqDistances = sqDiffMat.sum(axis=1)
    # square root gives the Euclidean distance
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()  # indices that sort the distances ascending
    classCount = {}                          # dictionary class -> vote count
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # class of the i-th nearest neighbour
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1  # accumulate votes
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort votes descending
    return sortedClassCount[0][0]            # the most voted class is the prediction

dataSet = np.array([[250,100],[270,120],[111,230],[130,260],[200,80],[70,190]])
labels = ["理科生","理科生","文科生","文科生","理科生","文科生"]  # "science student" / "arts student"
f = open("in1-1.txt", 'r')
for line in f.readlines():
    inX = line.split(" ")
    inX = [int(x) for x in inX]
    print(classify0(inX, dataSet, labels, 3))
