# -*- coding: utf-8 -*-
import math
import re
import csv
def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a list of rows.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed straight to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # ``open`` exists on both Python 2 and 3 (the ``file`` builtin is
    # Python 2 only), and the ``with`` block closes the handle even when
    # parsing raises part-way through the file.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))
def getLine(inList, Line):
    """Return the row stored at index ``Line`` of the matrix ``inList``.

    The row object itself is returned, not a copy.
    """
    selected = inList[Line]
    return selected
def getRow(inList, Row):
    """Return column ``Row`` of the matrix as a new list.

    One element is taken from every row of ``inList``, in row order.
    """
    return [record[Row] for record in inList]
def setLine(inList, childList, Line):
    """Replace the row at index ``Line`` of ``inList`` with ``childList``.

    The matrix is modified in place; nothing is returned.
    """
    inList[Line] = childList
def setRow(inList, chikdList, Row):
    """Overwrite column ``Row`` of the matrix with the values of ``chikdList``.

    Element ``k`` of ``chikdList`` is stored in row ``k``; the matrix is
    modified in place and nothing is returned.
    """
    for row_index, value in enumerate(chikdList):
        inList[row_index][Row] = value
def addLine(inList, childLine):
    """Append ``childLine`` as a new last row of the matrix ``inList``.

    The matrix is modified in place; nothing is returned.
    """
    inList.append(childLine)
def addRow(inList, childRow):
    """Append a new column to the matrix.

    Element ``k`` of ``childRow`` is appended to row ``k`` of ``inList``.
    The matrix is modified in place; nothing is returned.
    """
    for position, record in enumerate(inList):
        record.append(childRow[position])
def getAVG(inList):
    """Return the arithmetic mean of the numeric cells of a column.

    Cells that are not complete decimal numbers (headers, blanks, dates,
    text) are ignored.

    Args:
        inList: Column values, normally strings read from a CSV file.

    Returns:
        The mean as a float, or the message string "当前特征无平均值" when the
        column contains no numeric cell.
    """
    # The whole cell must be a number. A prefix-only test such as
    # r'[0-9]+' skips negative values and lets cells like "1a" through to
    # float(), which then raises.
    numeric = re.compile(
        r'\s*[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\s*$')
    # str() lets already-converted numeric cells (ints/floats) take part too.
    values = [float(cell) for cell in inList if numeric.match(str(cell))]
    if not values:
        return "当前特征无平均值"
    return sum(values) / len(values)
def getAVE(inList):
    """Return the population variance of the numeric cells of a column.

    The variance is the mean of the squared deviations from the mean
    (divisor ``n``). Cells that are not complete decimal numbers are
    ignored.

    Args:
        inList: Column values, normally strings read from a CSV file.

    Returns:
        The variance as a float, or the message string "当前特征无方差" when
        the column contains no numeric cell.
    """
    # The whole cell must be a number; a prefix-only digit test would skip
    # negative values and crash on cells such as "1a".
    numeric = re.compile(
        r'\s*[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\s*$')
    values = [float(cell) for cell in inList if numeric.match(str(cell))]
    if not values:
        return "当前特征无方差"
    mean = sum(values) / len(values)
    # Divide the summed squared deviations by n: callers take the square
    # root of this result to obtain the standard deviation.
    return sum((value - mean) ** 2 for value in values) / len(values)
def average(seq, total=0.0):
    """Return the mean of the items of ``seq``.

    ``total`` is the starting value of the running sum; the final sum is
    divided by the number of items in ``seq``. An empty ``seq`` therefore
    ends in a division by zero.
    """
    count = 0
    for count, value in enumerate(seq, 1):
        total += value
    return total / count
def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric cells of a column.

    Cells that are not complete decimal numbers are ignored. The median
    (``inlocaltion == 0.5``) averages the two middle values when the count
    is even; every other quantile is the sorted value at index
    ``int(count * inlocaltion)``.

    Args:
        inList: Column values, normally strings read from a CSV file.
        inlocaltion: Requested quantile, with ``0 <= inlocaltion < 1``.

    Returns:
        The quantile as a float; the message string "输入的分位数数值错误" when
        ``inlocaltion`` is outside ``[0, 1)``; or the message string
        "当前特征不可求中位数" when the column contains no numeric cell.
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= inlocaltion < 1:
        return "输入的分位数数值错误"
    # The whole cell must be a number; a prefix-only digit test would skip
    # negative values and crash on cells such as "1a".
    numeric = re.compile(
        r'\s*[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\s*$')
    values = sorted(float(cell) for cell in inList if numeric.match(str(cell)))
    if not values:
        return "当前特征不可求中位数"
    count = len(values)
    if inlocaltion == 0.5:
        middle = count // 2
        if count % 2 == 1:
            return values[middle]
        return (values[middle - 1] + values[middle]) / 2.0
    # inlocaltion < 1 keeps the index strictly below count.
    return values[int(count * inlocaltion)]
def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a list of rows.

    NOTE(review): this redefines the identical ``fileREAD`` declared near
    the top of the file; one of the two definitions can be deleted.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed straight to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # ``open`` exists on both Python 2 and 3 (the ``file`` builtin is
    # Python 2 only), and the ``with`` block closes the handle even when
    # parsing raises part-way through the file.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))
def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range.

    A cell is an outlier when it lies more than 1.5 times the interquartile
    range above the third quartile or below the first quartile. Outliers
    are replaced by the empty string. Element 0 (the column header) is
    never modified, and cells that cannot be parsed as numbers are left
    untouched.

    Args:
        inList: Column values with the header at index 0; modified in place.

    Returns:
        The same list object, for convenience.
    """
    Q3 = getQUANTILE(inList, 0.75)
    Q1 = getQUANTILE(inList, 0.25)
    # getQUANTILE reports "no numeric data" with a message string; there is
    # nothing to filter in that case.
    if isinstance(Q1, str) or isinstance(Q3, str):
        return inList
    fence = 1.5 * (Q3 - Q1)
    for i in range(1, len(inList)):
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            # Blank or non-numeric cell: not a candidate for removal.
            continue
        if value - Q3 > fence or Q1 - value > fence:
            inList[i] = ''
    return inList
def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out every cell whose value is below ``inThresholdMin``.

    Offending cells are replaced by the empty string. Element 0 (the
    column header) is never modified, and cells that cannot be parsed as
    numbers - including cells already blanked by an earlier filter - are
    left untouched.

    Args:
        inList: Column values with the header at index 0; modified in place.
        inThresholdMin: Smallest value that is kept.

    Returns:
        The same list object, for convenience.
    """
    for i in range(1, len(inList)):
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            # Blank or non-numeric cell: nothing to compare.
            continue
        if value < inThresholdMin:
            inList[i] = ''
    return inList
def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out every cell whose value is above ``inThresholdMax``.

    Offending cells are replaced by the empty string. Element 0 (the
    column header) is never modified, and cells that cannot be parsed as
    numbers - including cells already blanked by an earlier filter - are
    left untouched.

    Args:
        inList: Column values with the header at index 0; modified in place.
        inThresholdMax: Largest value that is kept.

    Returns:
        The same list object, for convenience.
    """
    for i in range(1, len(inList)):
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            # Blank or non-numeric cell: nothing to compare.
            continue
        if value > inThresholdMax:
            inList[i] = ''
    return inList
def autoPaddingByAVG(inList):
    """Fill the blank cells of a column with the column mean.

    Every cell equal to the empty string is replaced by ``str(mean)``,
    where the mean comes from ``getAVG``. Element 0 (the column header) is
    never modified.

    Args:
        inList: Column values with the header at index 0; modified in place.

    Returns:
        The same list object, for convenience.
    """
    avg = getAVG(inList)
    # getAVG reports "no numeric data" with a message string; writing that
    # message into the data would corrupt the column, so leave it as is.
    if isinstance(avg, str):
        return inList
    filler = str(avg)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList
def autoPaddingByMedian(inList):
    """Fill the blank cells of a column with the column median.

    Every cell equal to the empty string is replaced by ``str(median)``,
    where the median comes from ``getQUANTILE(inList, 0.5)``. Element 0
    (the column header) is never modified.

    Args:
        inList: Column values with the header at index 0; modified in place.

    Returns:
        The same list object, for convenience.
    """
    median = getQUANTILE(inList, 0.5)
    # getQUANTILE reports "no numeric data" with a message string; writing
    # that message into the data would corrupt the column.
    if isinstance(median, str):
        return inList
    filler = str(median)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList
def binningWidth(inList, width):
    """Discretise a column with equal-width bins, smoothing by bin mean.

    The data cells (everything after the header at index 0) are ordered by
    value. A bin starts at the smallest unassigned value and grows while
    the next value is at most ``width`` above that starting value. Every
    cell is then replaced by the mean of its bin, as a float.

    Args:
        inList: Column values with the header at index 0; modified in place.
        width: Largest allowed spread inside one bin; expected to be
            non-negative.

    Returns:
        The same list object, for convenience.

    Raises:
        ValueError: If a data cell cannot be converted with ``float``.
    """
    # (original index, value) pairs ordered by value, so bins are contiguous
    # runs and each mean can be written straight back to its source cell.
    ordered = sorted(
        ((index, float(inList[index])) for index in range(1, len(inList))),
        key=lambda pair: pair[1])
    start = 0
    for pos in range(1, len(ordered) + 1):
        # Close the current bin at the end of the data or as soon as the
        # next value is more than `width` above the bin's first value.
        if pos == len(ordered) or ordered[pos][1] - ordered[start][1] > width:
            members = ordered[start:pos]
            mean = sum(value for _, value in members) / len(members)
            for index, _ in members:
                inList[index] = mean
            start = pos
    return inList
def binningDeep(inList, deep1):
    """Discretise a column with equal-frequency bins, smoothing by bin mean.

    The data cells (everything after the header at index 0) are ordered by
    value and cut into consecutive bins of ``deep1 - 1`` cells; the last
    bin may be shorter. Every cell is replaced by the mean of its bin, as
    a float.

    NOTE(review): the bin size is ``deep1 - 1``, not ``deep1``. That
    convention is kept for backward compatibility - confirm it is intended.

    Args:
        inList: Column values with the header at index 0; modified in place.
        deep1: Bin size plus one; must be at least 2.

    Returns:
        The same list object, for convenience.

    Raises:
        ValueError: If ``deep1`` is below 2, or if a data cell cannot be
            converted with ``float``.
    """
    deep = deep1 - 1
    if deep < 1:
        raise ValueError("deep1 must be at least 2")
    # (original index, value) pairs ordered by value, so each bin is a
    # contiguous slice and its mean can be written back to the source cells.
    ordered = sorted(
        ((index, float(inList[index])) for index in range(1, len(inList))),
        key=lambda pair: pair[1])
    # Slicing in steps of `deep` covers every element, including a final
    # bin that is exactly full or only partially filled.
    for start in range(0, len(ordered), deep):
        members = ordered[start:start + deep]
        mean = sum(value for _, value in members) / len(members)
        for index, _ in members:
            inList[index] = mean
    return inList
def oneHot(inList, Row):
    """One-hot encode column ``Row`` of the matrix, in place.

    Row 0 of ``inList`` is treated as the header. For every distinct value
    found in column ``Row`` of the data rows, a new column is appended to
    the matrix: its header cell is the value itself and each data cell is
    ``'1'`` when that row holds the value and ``'0'`` otherwise. New
    columns are added in order of first appearance, so the result is
    deterministic. The resulting rows are printed, one per line.

    Args:
        inList: Matrix (list of rows) with the header in row 0.
        Row: Index of the column to encode.

    Returns:
        None.
    """
    data_rows = inList[1:]
    labels = [record[Row] for record in data_rows]
    # Distinct labels in first-seen order; the set only speeds up the
    # membership test.
    categories = []
    seen = set()
    for label in labels:
        if label not in seen:
            seen.add(label)
            categories.append(label)
    for category in categories:
        inList[0].append(category)
        for record, label in zip(data_rows, labels):
            record.append('1' if label == category else '0')
    for record in inList:
        # Single-argument call form prints identically on Python 2 and 3.
        print(record)
def minMax(inList):
    """Rescale the numeric cells of a column to the range [0, 1].

    Each numeric cell ``x`` is replaced by ``(x - min) / (max - min)``
    formatted with four decimals. Element 0 (the column header) and cells
    that are not complete decimal numbers are left untouched. When all
    numeric cells are equal, every one of them becomes ``"0.0000"``.

    Args:
        inList: Column values with the header at index 0; modified in place.

    Returns:
        The same list object, for convenience.
    """
    # The whole cell must be a number; a prefix-only digit test would skip
    # negative values and crash on cells such as "1a".
    numeric = re.compile(
        r'\s*[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\s*$')
    positions = [i for i in range(1, len(inList))
                 if numeric.match(str(inList[i]))]
    if not positions:
        # No numeric data: nothing to rescale.
        return inList
    values = [float(inList[i]) for i in positions]
    lowest = min(values)
    span = max(values) - lowest
    for i, value in zip(positions, values):
        # A constant column has zero span; map it to 0 rather than divide.
        scaled = (value - lowest) / span if span else 0.0
        inList[i] = "%.4f" % scaled
    return inList
def zScore(inList):
    """Standardise the numeric cells of a column (z-score).

    Each numeric data cell ``x`` is replaced by ``(x - mean) / std``
    formatted with four decimals, where ``std`` is the population standard
    deviation (divisor ``n``). The mean and deviation are computed locally
    from the numeric data cells, header excluded. Element 0 (the column
    header) and cells that are not complete decimal numbers are left
    untouched. When the deviation is zero, every numeric cell becomes
    ``"0.0000"``.

    Args:
        inList: Column values with the header at index 0; modified in place.

    Returns:
        The same list object, for convenience.
    """
    # The whole cell must be a number; a prefix-only digit test would skip
    # negative values and crash on cells such as "1a".
    numeric = re.compile(
        r'\s*[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][+-]?[0-9]+)?\s*$')
    positions = [i for i in range(1, len(inList))
                 if numeric.match(str(inList[i]))]
    if not positions:
        # No numeric data: nothing to standardise.
        return inList
    values = [float(inList[i]) for i in positions]
    mean = sum(values) / len(values)
    variance = sum((value - mean) ** 2 for value in values) / len(values)
    stand = math.sqrt(variance)
    for i, value in zip(positions, values):
        # A constant column has zero deviation; map it to 0 rather than divide.
        score = (value - mean) / stand if stand else 0.0
        inList[i] = "%.4f" % score
    return inList
def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    The result is ``(sum(|a - b| ** n)) ** (1 / n)`` over the data cells:
    ``n == 1`` gives the Manhattan distance and ``n == 2`` the Euclidean
    distance. Element 0 of each list (the column header) is skipped.

    Args:
        inList1: First column, header at index 0.
        inList2: Second column; must be at least as long as ``inList1``.
        n: Order of the distance; must be positive.

    Returns:
        The distance as a float.

    Raises:
        ValueError: If ``n`` is not positive, or if a data cell cannot be
            converted with ``float``.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0.0
    for i in range(1, len(inList1)):
        total += abs(float(inList1[i]) - float(inList2[i])) ** n
    # The root must match the power used above (1/n, not a fixed 1/2).
    return pow(total, 1.0 / n)
def similaritySim(inList1, inList2):
    """Return the cosine similarity between two columns.

    The result is the dot product of the two vectors divided by the
    product of their Euclidean norms. Element 0 of each list (the column
    header) is skipped. When either vector has zero norm the similarity is
    undefined and ``0.0`` is returned instead of dividing by zero.

    Args:
        inList1: First column, header at index 0.
        inList2: Second column; must be at least as long as ``inList1``.

    Returns:
        The cosine similarity as a float.

    Raises:
        ValueError: If a data cell cannot be converted with ``float``.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    dot = 0.0
    for i in range(1, len(inList1)):
        dot += float(inList1[i]) * float(inList2[i])
    norm1 = math.sqrt(sum(float(cell) ** 2 for cell in inList1[1:]))
    norm2 = math.sqrt(sum(float(cell) ** 2 for cell in inList2[1:]))
    denominator = norm1 * norm2
    if denominator == 0:
        # An all-zero vector has no direction.
        return 0.0
    return dot / denominator
# Load the sample data set used by the commented-out usage examples below.
# NOTE(review): this runs at import time and reads a hard-coded,
# machine-specific Windows path, so importing the module fails when that
# file is not present.
fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv","r")
# #获得某一行数据
# print getLine(fileInput,1)
#
# #获得某一列数据
# print getRow(fileInput,0)
# #设置某一行数据
# print "设置前:"
# print getLine(fileInput,1)
# setLine(fileInput,getLine(fileInput,2),1)
# print "设置后:"
# print getLine(fileInput,1)
# #设置某一列数据
# print "设置前:"
# print getRow(fileInput,1)
# setRow(fileInput,getRow(fileInput,2),1)
# print "设置后:"
# print getRow(fileInput,1)
# #均值
# print getAVG(getRow(fileInput,9))
# #方差
# print getAVE(getRow(fileInput,9))
# #分位数
# print getQUANTILE(getRow(fileInput,9),0.5)
# #噪声数据过滤1
# print removeNoiseAuto(getRow(fileInput,1))
#
# #噪声数据过滤2
# print removeNoiseByThresholdMin(getRow(fileInput,0),10)
#
# #噪声数据过滤3
# print removeNoiseByThresholdMax(getRow(fileInput,0),10)
# #缺失值补全1
# print autoPaddingByAVG(getRow(fileInput,0))
#
# #缺失值补全2
# print autoPaddingByMedian(getRow(fileInput,0))
# #等宽分箱
# print binningWidth(getRow(fileInput,0),3)
#
# #等频分箱
# print binningDeep(getRow(fileInput,0),3)
# #ONE-HOT编码
# oneHot(fileInput,1)
# for i in fileInput:
# print i
# #最大最小归一化
# print minMax(getRow(fileInput,0))
#
# #zScore归一化
# print zScore(getRow(fileInput,0))
# #距离相似度
# print similarityDistance(getRow(fileInput,0),getRow(fileInput,0),2)
# # 余弦相似度计算
# print similaritySim(getRow(fileInput,0),getRow(fileInput,1))