使用kNN算法的手写识别系统
1 2 3 4 5 6
| 1. 收集数据:提供文本文件 2. 准备数据:编写函数img2vector(),将图像格式转换为分类器使用的向量格式 3. 分析数据:在Python命令提示符中检查数据,确保它符合要求 4. 训练算法:此步骤不适用于kNN 5. 测试算法:编写函数,使用提供的一部分数据集作为训练样本,另一部分作为测试样本 6. 使用算法:
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
import numpy as np
import operator as op
import matplotlib
import matplotlib.pyplot as plt
import os


def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 lines of ``filename`` are read, and the first 32 characters
    of each line are converted with ``int`` and stored row by row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape (1, 1024); element ``32 * i + j`` holds the
        value found at line ``i``, column ``j``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character cannot be converted with ``int``.
    """
    returnVec = np.zeros((1, 1024))
    # The context manager closes the file even if parsing raises part-way.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVec[0, 32 * i + j] = int(lineStr[j])
    return returnVec
|
1 2 3
# Load one sample test image as a 1x1024 vector.
fileName = '0_0.txt'
# os.path.join keeps the path valid on every platform; on Windows it yields
# the same backslash-separated string as before.
testVec = img2vector(os.path.join('AnacondaProjects', 'digits', 'testDigits', fileName))
# Notebook-style bare expression: displays the first 31 values of the vector.
testVec[0, 0:31]
|
array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0.])
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean. If several labels share the highest vote count,
    the one whose first vote came from the closest neighbour wins.

    Args:
        inX: Feature vector to classify; must broadcast against the rows of
            ``dataSet`` (e.g. shape ``(d,)`` or ``(1, d)``).
        dataSet: 2-D NumPy array with one training sample per row.
        labels: Sequence of labels, indexed like the rows of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The label with the most votes among the selected neighbours.

    Raises:
        IndexError: If no neighbour votes (``k <= 0`` or an empty data set).
    """
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts every training row from inX without building
    # the tiled copy of inX first.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}
    # Never request more neighbours than there are training rows.
    for i in range(min(k, dataSetSize)):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so labels with equal counts keep insertion order.
    sortedClassCount = sorted(classCount.items(), key=op.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
def handwritingClassTest(trainingDir=None, testDir=None, k=3):
    """Run the kNN digit classifier over a test directory and print the results.

    Every file in ``trainingDir`` is loaded with ``img2vector`` and used as a
    training row. Every file in ``testDir`` is then classified with
    ``classify0`` and compared with the label taken from its file name.
    The per-file result, the error count and the error rate are printed.

    File names are parsed as ``<label>_<anything>.<ext>``: the text before
    the first ``.`` is split on ``_`` and its first field is converted to
    ``int`` to obtain the label.

    Args:
        trainingDir: Directory with the training images. Defaults to
            ``AnacondaProjects/digits/trainingDigits`` relative to the
            current working directory.
        testDir: Directory with the test images. Defaults to
            ``AnacondaProjects/digits/testDigits`` relative to the current
            working directory.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are reported on standard output only.
    """
    if trainingDir is None:
        trainingDir = os.path.join('AnacondaProjects', 'digits', 'trainingDigits')
    if testDir is None:
        testDir = os.path.join('AnacondaProjects', 'digits', 'testDigits')

    # Build the training matrix (one 1024-value row per file) and its labels.
    trainingFileList = os.listdir(trainingDir)
    trainingMat = np.zeros((len(trainingFileList), 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    # Classify each test file and count the mismatches.
    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    # An empty test directory reports a rate of 0.0 instead of dividing by zero.
    errorRate = float(errorCount) / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)
|