#!/usr/bin/python
# -*- coding:utf-8 -*-
#author: by Tong
"""Convert a dense, delimiter-separated data table into sparse libsvm format."""
import csv
import io
import sys

# Python 2 only: make UTF-8 the implicit str/unicode conversion codec.
# ``reload`` is not a builtin on Python 3, where this hack is unnecessary.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


def _to_sparse_line(line, delimiter=u' '):
    """Return one dense row rewritten as ``label index:value ...``.

    The first field is the label and is copied unchanged.  Every following
    field is emitted as ``index:value`` (1-based position), except fields
    equal to ``'0'`` and empty fields, which are omitted.
    """
    fields = line.split(delimiter)
    parts = [fields[0]]
    for index, value in enumerate(fields[1:], 1):
        # Empty fields come from doubled or trailing delimiters; emitting
        # them would produce a malformed "index:" entry, so skip them too.
        if value and value != u'0':
            parts.append(u'{0}:{1}'.format(index, value))
    return u' '.join(parts)


def setlibsvm(input_path='data.txt', output_path='lib_to_svm.txt',
              delimiter=' '):
    """Rewrite a dense data file as a sparse libsvm-style file.

    The first line of ``input_path`` is treated as a header and copied to
    ``output_path`` verbatim.  Each remaining line is split on
    ``delimiter``; the first field is the label and the other fields are
    written as ``index:value`` pairs separated by single spaces, dropping
    fields whose text is exactly ``'0'``.

    Args:
        input_path: Path of the dense input file (read as UTF-8).
        output_path: Path of the file to create or overwrite (UTF-8, ``\\n``
            line endings).
        delimiter: Field separator used in the input file.

    Raises:
        IOError / OSError: If either file cannot be opened.
    """
    # ``io.open`` behaves the same on Python 2 and 3, and the ``with``
    # blocks guarantee that BOTH files are closed, even on error.
    with io.open(input_path, 'r', encoding='utf-8') as readin:
        with io.open(output_path, 'w', encoding='utf-8',
                     newline='\n') as output:
            lines = iter(readin)
            # Header row is passed through untouched (empty file -> nothing).
            output.write(next(lines, u''))
            for line in lines:
                row = _to_sparse_line(line.rstrip(u'\n'), delimiter)
                output.write(row + u'\n')


if __name__ == "__main__":
    setlibsvm()
数据的格式为:
label feature1 feature2 feature3
其中各字段以空格作为分隔符,分隔符可以在代码中修改。
label feature1 feature2 feature3 feature4
1 1:434 2:32 3:2
0 1:323 3:3 4:3
0 2:3 4:3
0 1:23 2:3 3:2 4:1
1 1:3 2:12 4:55
以上是最终的数据格式。好处是对于稀疏的数据能够节省空间。原数据格式为:
label feature1 feature2 feature3 feature4
1 434 32 2 0
0 323 0 3 3
0 0 3 0 3
0 23 3 2 1
1 3 12 0 55