# python 数据预处理_python数据预处理练习

# -*- coding: utf-8 -*-

import math

import re

import csv

def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        fields produced by ``csv.reader`` for that record.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # ``open`` exists on both Python 2 and 3 (``file`` is Python 2 only) and
    # the ``with`` block closes the handle even when parsing raises.
    with open(fileURL, access) as csvfile:
        return [line for line in csv.reader(csvfile)]

def getLine(inList, Line):
    """Return one row of the data matrix.

    Args:
        inList: Two-dimensional list (list of rows).
        Line: Index of the row to return.

    Returns:
        The row object stored at ``inList[Line]`` (not a copy).

    Raises:
        IndexError: If ``Line`` is outside the matrix.
    """
    return inList[Line]

def getRow(inList, Row):
    """Return one column of the data matrix as a new list.

    Args:
        inList: Two-dimensional list (list of rows).
        Row: Index of the column to extract from every row.

    Returns:
        A new list holding ``row[Row]`` for each row, in row order.

    Raises:
        IndexError: If any row is too short to have index ``Row``.
    """
    return [row[Row] for row in inList]

def setLine(inList, childList, Line):
    """Replace one row of the data matrix in place.

    Args:
        inList: Two-dimensional list (list of rows); modified in place.
        childList: The new row. It is stored by reference, not copied.
        Line: Index of the row to overwrite.

    Raises:
        IndexError: If ``Line`` is outside the matrix.
    """
    inList[Line] = childList

def setRow(inList, chikdList, Row):
    """Overwrite one column of the data matrix in place.

    Args:
        inList: Two-dimensional list (list of rows); modified in place.
        chikdList: New column values; element ``k`` is written to row ``k``.
        Row: Index of the column to overwrite.

    Raises:
        IndexError: If ``chikdList`` has more entries than the matrix has
            rows, or a row has no index ``Row``.
    """
    # Only the first len(chikdList) rows are touched.
    for position, value in enumerate(chikdList):
        inList[position][Row] = value

def addLine(inList, childLine):
    """Append one row to the end of the data matrix in place.

    Args:
        inList: Two-dimensional list (list of rows); modified in place.
        childLine: The row to append. It is stored by reference, not copied.
    """
    inList.append(childLine)

def addRow(inList, childRow):
    """Append one column to the data matrix in place.

    Args:
        inList: Two-dimensional list (list of rows); every row is extended
            by one cell.
        childRow: New column values; element ``k`` is appended to row ``k``.
            Entries beyond the number of rows are ignored.

    Raises:
        IndexError: If ``childRow`` has fewer entries than the matrix has
            rows (rows before the failing one have already been extended).
    """
    for position, row in enumerate(inList):
        row.append(childRow[position])

def getAVG(inList):
    """Return the arithmetic mean of the numeric cells of a column.

    Every cell that ``float()`` can parse to a finite number contributes to
    the mean; header text, blank cells and other non-numeric cells are
    skipped. (The previous digit-prefix regex dropped negative numbers and
    raised ValueError on cells such as ``"12abc"``.)

    Args:
        inList: Iterable of cells, typically strings read from a CSV column.

    Returns:
        The mean as a float, or the message string ``"当前特征无平均值"``
        when the column contains no numeric cell.
    """
    total = 0.0
    count = 0
    for cell in inList:
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        # "nan"/"inf" parse as floats but are not usable measurements.
        if math.isnan(value) or math.isinf(value):
            continue
        total += value
        count += 1
    if count == 0:
        return "当前特征无平均值"
    return total / count

def getAVE(inList):
    """Return the population variance of the numeric cells of a column.

    The variance is the mean of the squared deviations from the mean
    (divisor ``n``). The previous implementation returned the square root
    of the *sum* of squared deviations, which is neither the variance nor
    the standard deviation.

    Every cell that ``float()`` can parse to a finite number is used;
    header text, blank cells and other non-numeric cells are skipped.

    Args:
        inList: Iterable of cells, typically strings read from a CSV column.

    Returns:
        The variance as a float, or the message string ``"当前特征无方差"``
        when the column contains no numeric cell.
    """
    values = []
    for cell in inList:
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征无方差"
    mean = sum(values) / len(values)
    return sum((value - mean) ** 2 for value in values) / len(values)

def average(seq, total=0.0):
    """Return the mean of the items of ``seq``.

    Args:
        seq: Iterable of numbers (any iterable, not only sequences).
        total: Starting value of the running sum; it is added to the sum
            but does not count as an item.

    Returns:
        ``(total + sum of items) / number of items``.

    Raises:
        ZeroDivisionError: If ``seq`` is empty.
    """
    count = 0
    for item in seq:
        total += item
        count += 1
    return total / count

def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric cells of a column.

    Every cell that ``float()`` can parse to a finite number is used;
    header text, blank cells and other non-numeric cells are skipped. (The
    previous digit-prefix regex dropped negative numbers and raised
    ValueError on cells such as ``"12abc"``.)

    Args:
        inList: Iterable of cells, typically strings read from a CSV column.
        inlocaltion: Requested quantile, ``0 <= inlocaltion < 1``.

    Returns:
        * For ``0.5``: the median (mean of the two middle values when the
          count is even).
        * For any other valid quantile: the sorted value at index
          ``int(count * inlocaltion)``.
        * ``"输入的分位数数值错误"`` when ``inlocaltion`` is out of range.
        * ``"当前特征不可求中位数"`` when there is no numeric cell.
    """
    if not 0 <= inlocaltion < 1:
        return "输入的分位数数值错误"

    values = []
    for cell in inList:
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征不可求中位数"

    values.sort()
    size = len(values)
    if inlocaltion == 0.5:
        middle = size // 2
        if size % 2 == 1:
            return values[middle]
        return (values[middle - 1] + values[middle]) / 2.0
    # Clamp so floating-point rounding can never index past the last value.
    return values[min(int(size * inlocaltion), size - 1)]

def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    NOTE(review): this is a second definition of ``fileREAD``; an identical
    function is defined earlier in the file and is shadowed by this one.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (for example ``"r"``).

    Returns:
        A list with one entry per CSV record; each entry is the list of
        fields produced by ``csv.reader`` for that record.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # ``open`` exists on both Python 2 and 3 (``file`` is Python 2 only) and
    # the ``with`` block closes the handle even when parsing raises.
    with open(fileURL, access) as csvfile:
        return [line for line in csv.reader(csvfile)]

def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range (IQR).

    A cell is treated as noise when it lies more than ``1.5 * IQR`` above
    the third quartile or below the first quartile; such cells are replaced
    by the empty string. Index 0 is never examined (presumably the column
    header). Cells that cannot be parsed as numbers (for example blanks)
    are left untouched instead of raising ValueError.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object. It is returned unchanged when ``getQUANTILE``
        cannot provide numeric quartiles (it then returns a message string).
    """
    q3 = getQUANTILE(inList, 0.75)
    q1 = getQUANTILE(inList, 0.25)
    try:
        fence = 1.5 * (q3 - q1)
    except TypeError:
        # getQUANTILE answered with a message string: nothing to filter on.
        return inList

    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if value - q3 > fence or q1 - value > fence:
            inList[position] = ''
    return inList

def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out every cell of a column that is below a minimum threshold.

    Index 0 is never examined (presumably the column header). Cells that
    cannot be parsed as numbers (for example blanks left by an earlier
    filter) are left untouched instead of raising ValueError.

    Args:
        inList: Column as a list of cells; modified in place.
        inThresholdMin: Smallest value that is kept.

    Returns:
        The same list object, with too-small cells replaced by ``''``.
    """
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if value < inThresholdMin:
            inList[position] = ''
    return inList

def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out every cell of a column that is above a maximum threshold.

    Index 0 is never examined (presumably the column header). Cells that
    cannot be parsed as numbers (for example blanks left by an earlier
    filter) are left untouched instead of raising ValueError.

    Args:
        inList: Column as a list of cells; modified in place.
        inThresholdMax: Largest value that is kept.

    Returns:
        The same list object, with too-large cells replaced by ``''``.
    """
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if value > inThresholdMax:
            inList[position] = ''
    return inList

def autoPaddingByAVG(inList):
    """Fill the blank cells of a column with the column mean.

    Index 0 is never examined (presumably the column header). The mean is
    obtained from ``getAVG`` and written as ``str(mean)``.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object. It is returned unchanged when ``getAVG``
        reports that no mean exists (it then returns a message string);
        previously that message text was written into the blank cells.
    """
    mean = getAVG(inList)
    if isinstance(mean, str):
        # No numeric data: there is nothing meaningful to pad with.
        return inList

    filler = str(mean)
    for position in range(1, len(inList)):
        if inList[position] == '':
            inList[position] = filler
    return inList

def autoPaddingByMedian(inList):
    """Fill the blank cells of a column with the column median.

    Index 0 is never examined (presumably the column header). The median is
    obtained from ``getQUANTILE(inList, 0.5)`` and written as
    ``str(median)``.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object. It is returned unchanged when ``getQUANTILE``
        reports that no median exists (it then returns a message string);
        previously that message text was written into the blank cells.
    """
    median = getQUANTILE(inList, 0.5)
    if isinstance(median, str):
        # No numeric data: there is nothing meaningful to pad with.
        return inList

    filler = str(median)
    for position in range(1, len(inList)):
        if inList[position] == '':
            inList[position] = filler
    return inList

def binningWidth(inList, width):
    """Discretise a column with equal-width bins, smoothing by bin mean.

    The numeric cells (index 0 is skipped, presumably the column header)
    are sorted by value. A bin starts at the smallest value not yet binned
    and takes every following value whose distance to that start is at
    most ``width``. Each binned cell is then replaced by the mean of its
    bin.

    Cells that cannot be parsed as numbers (for example blanks) are left
    untouched instead of raising ValueError.

    Args:
        inList: Column as a list of cells; modified in place.
        width: Maximum distance between a bin's first value and any other
            value in the same bin.

    Returns:
        The same list object; binned cells hold floats.
    """
    values = {}
    for position in range(1, len(inList)):
        try:
            values[position] = float(inList[position])
        except (TypeError, ValueError):
            continue

    # Each bin is a list of positions; its first entry is the bin's anchor.
    bins = []
    for position in sorted(values, key=values.get):
        if bins and values[position] - values[bins[-1][0]] <= width:
            bins[-1].append(position)
        else:
            bins.append([position])

    for members in bins:
        mean = sum(values[p] for p in members) / len(members)
        for p in members:
            inList[p] = mean
    return inList

def binningDeep(inList, deep1):
    """Discretise a column with equal-frequency bins, smoothing by bin mean.

    The numeric cells (index 0 is skipped, presumably the column header)
    are sorted by value and cut into consecutive bins of ``deep1 - 1``
    values; the last bin may be shorter. Each binned cell is replaced by
    the mean of its bin.

    NOTE(review): the bin size is ``deep1 - 1`` exactly as in the original
    implementation; confirm whether that offset is intended.

    Fixes over the original: the final bin is now smoothed even when the
    number of values is an exact multiple of the bin size, columns shorter
    than one bin no longer raise IndexError, and no float is passed to
    ``range`` on Python 3. Cells that cannot be parsed as numbers are left
    untouched.

    Args:
        inList: Column as a list of cells; modified in place.
        deep1: Bin depth parameter; must be at least 2.

    Returns:
        The same list object; binned cells hold floats.

    Raises:
        ValueError: If ``deep1`` is smaller than 2 (empty bins).
    """
    size = deep1 - 1
    if size < 1:
        raise ValueError("deep1 must be at least 2")

    values = {}
    for position in range(1, len(inList)):
        try:
            values[position] = float(inList[position])
        except (TypeError, ValueError):
            continue

    ordered = sorted(values, key=values.get)
    for start in range(0, len(ordered), size):
        members = ordered[start:start + size]
        mean = sum(values[p] for p in members) / len(members)
        for p in members:
            inList[p] = mean
    return inList

def oneHot(inList, Row):
    """One-hot encode one column of the data matrix in place.

    For every distinct value found in column ``Row`` (row 0 is treated as
    the header and is not a value), a new column is appended to the
    matrix: its header cell is the value itself and each data cell is
    ``'1'`` where the row holds that value and ``'0'`` otherwise. New
    columns are added in order of first appearance of the value.

    After encoding, every row of the matrix is printed.

    Args:
        inList: Two-dimensional list (list of rows); modified in place.
        Row: Index of the column to encode.

    Returns:
        None.
    """
    # Snapshot the column first so appended cells can never be re-read.
    column = [row[Row] for row in inList]
    labels = column[1:]

    categories = []
    seen = set()
    for label in labels:
        if label not in seen:
            seen.add(label)
            categories.append(label)

    for category in categories:
        new_column = [category]
        new_column.extend('1' if label == category else '0' for label in labels)
        for row, cell in zip(inList, new_column):
            row.append(cell)

    for row in inList:
        print(row)

def minMax(inList):
    """Rescale the numeric cells of a column to the range [0, 1] in place.

    Each numeric cell ``x`` becomes ``(x - min) / (max - min)`` formatted
    with four decimals. Index 0 is skipped (presumably the column header)
    and cells that cannot be parsed as finite numbers are left untouched.

    Fixes over the original: negative values are no longer ignored, cells
    such as ``"12abc"`` no longer raise ValueError, a column without
    numeric cells is returned unchanged instead of raising, and a constant
    column maps to ``"0.0000"`` instead of dividing by zero.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object; rescaled cells hold strings such as
        ``"0.5000"``.
    """
    numeric = {}
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric[position] = value
    if not numeric:
        return inList

    lowest = min(numeric.values())
    span = max(numeric.values()) - lowest
    for position, value in numeric.items():
        scaled = (value - lowest) / span if span else 0.0
        inList[position] = "%.4f" % scaled
    return inList

def zScore(inList):
    """Standardise the numeric cells of a column (z-score) in place.

    Each numeric cell ``x`` becomes ``(x - mean) / std`` formatted with
    four decimals, where ``std`` is the population standard deviation of
    the numeric cells at index 1 and above. Index 0 is skipped (presumably
    the column header) and cells that cannot be parsed as finite numbers
    are left untouched. The incoming column is printed before processing.

    Fixes over the original: the standard deviation is computed directly
    (the original took the square root of a value that was not a
    variance), a column without numeric cells is returned unchanged, and a
    constant column maps to ``"0.0000"`` instead of dividing by zero.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object; standardised cells hold strings such as
        ``"-1.0000"``.
    """
    print(inList)

    numeric = {}
    for position in range(1, len(inList)):
        try:
            value = float(inList[position])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric[position] = value
    if not numeric:
        return inList

    count = len(numeric)
    mean = sum(numeric.values()) / count
    variance = sum((value - mean) ** 2 for value in numeric.values()) / count
    deviation = math.sqrt(variance)

    for position, value in numeric.items():
        score = (value - mean) / deviation if deviation else 0.0
        inList[position] = "%.4f" % score
    return inList

def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    The distance is ``(sum(|a_i - b_i| ** n)) ** (1 / n)`` over the cells
    at index 1 and above (index 0 is skipped, presumably the column
    header). The original always took the square root regardless of
    ``n``, which is only correct for ``n == 2``.

    Args:
        inList1: First column; its length decides how many cells are used.
        inList2: Second column; must have at least as many cells.
        n: Order of the distance (1 = Manhattan, 2 = Euclidean); positive.

    Returns:
        The distance as a float.

    Raises:
        ValueError: If ``n`` is not positive, or a cell is not numeric.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = 0.0
    for position in range(1, len(inList1)):
        total += abs(float(inList1[position]) - float(inList2[position])) ** n
    return pow(total, 1.0 / n)

def similaritySim(inList1, inList2):
    """Return the cosine similarity between two columns.

    Cells at index 1 and above are used (index 0 is skipped, presumably
    the column header). The dot product runs over the positions of
    ``inList1``; each norm is taken over the whole of its own column.

    Args:
        inList1: First column; its length decides the dot-product range.
        inList2: Second column; must have at least as many cells.

    Returns:
        ``dot / (|a| * |b|)`` as a float, or ``0.0`` when either column is
        all zeros or empty (the original raised ZeroDivisionError there).

    Raises:
        ValueError: If a cell is not numeric.
        IndexError: If ``inList2`` is shorter than ``inList1``.
    """
    dot = 0.0
    for position in range(1, len(inList1)):
        dot += float(inList1[position]) * float(inList2[position])

    norm1 = math.sqrt(sum(float(cell) ** 2 for cell in inList1[1:]))
    norm2 = math.sqrt(sum(float(cell) ** 2 for cell in inList2[1:]))
    if norm1 == 0 or norm2 == 0:
        # The angle to a zero vector is undefined; report "no similarity".
        return 0.0
    return dot / (norm1 * norm2)

# Load the training CSV into ``fileInput`` (a list of rows) when the module runs.
# NOTE(review): the path is a hard-coded absolute Windows location - confirm it
# exists on the target machine before running the script.
fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv","r")

# #获得某一行数据

# print getLine(fileInput,1)

#

# #获得某一列数据

# print getRow(fileInput,0)

# #设置某一行数据

# print "设置前:"

# print getLine(fileInput,1)

# setLine(fileInput,getLine(fileInput,2),1)

# print "设置后:"

# print getLine(fileInput,1)

# #设置某一列数据

# print "设置前:"

# print getRow(fileInput,1)

# setRow(fileInput,getRow(fileInput,2),1)

# print "设置后:"

# print getRow(fileInput,1)

# #均值

# print getAVG(getRow(fileInput,9))

# #方差

# print getAVE(getRow(fileInput,9))

# #分位数

# print getQUANTILE(getRow(fileInput,9),0.5)

# #噪声数据过滤1

# print removeNoiseAuto(getRow(fileInput,1))

#

# #噪声数据过滤2

# print removeNoiseByThresholdMin(getRow(fileInput,0),10)

#

# #噪声数据过滤3

# print removeNoiseByThresholdMax(getRow(fileInput,0),10)

# #缺失值补全1

# print autoPaddingByAVG(getRow(fileInput,0))

#

# #缺失值补全2

# print autoPaddingByMedian(getRow(fileInput,0))

# #等宽分箱

# print binningWidth(getRow(fileInput,0),3)

#

# #等频分箱

# print binningDeep(getRow(fileInput,0),3)

# #ONE-HOT编码

# oneHot(fileInput,1)

# for i in fileInput:

#     print i

# #最大最小归一化

# print minMax(getRow(fileInput,0))

#

# #zScore归一化

# print zScore(getRow(fileInput,0))

# #距离相似度

# print similarityDistance(getRow(fileInput,0),getRow(fileInput,0),2)

# # 余弦相似度计算

# print similaritySim(getRow(fileInput,0),getRow(fileInput,1))

# [Web-page residue removed: comment box and red-packet payment widget text from the blog page; not part of the script.]