import sys
from math import sqrt
def pearson_distance(vector1, vector2) :
if len(vector1) == 1:
return abs(vector1[0]-vector2[0])
"""
Calculate distance between two vectors using pearson method
See more : http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
"""
'''
print('vector1')
print(vector1)
print('vector2')
print(vector2)
'''
sum1 = sum(vector1)
sum2 = sum(vector2)
sum1Sq = sum([pow(v,2) for v in vector1])
sum2Sq = sum([pow(v,2) for v in vector2])
pSum = sum([vector1[i] * vector2[i] for i in range(len(vector1))])
num = pSum - (sum1*sum2/len(vector1))
den = sqrt((sum1Sq - pow(sum1,2)/len(vector1)) * (sum2Sq - pow(sum2,2)/len(vector1)))
if den == 0 : return 0.0
return 1.0 - num/den
def pick_outlier(vlist,vdict):
vlen = len(vdict[vlist[0]])
avg = [0.0 for i in range(vlen) ]
#print('avg init')
#print(avg)
for x in vdict:
vec = vdict[x]
for i in range(len(vec)) :
avg[i] += vec[i]
avg = [ item / len(vlist) for item in avg]
#print('avg done')
#print(avg)
max1=0
max1no=0
max2=0
max2no=0
for i in range(1,len(vlist)-1):
#print(i+1,abs(vlist[i]-vlist[i-1])+abs(vlist[i]-vlist[i+1]),abs(vlist[i]-median(vlist)),round(abs(vlist[i]-avg(vlist)),2))
pd1 = pearson_distance(vdict[vlist[i]],vdict[vlist[i-1]])
pd2 = pearson_distance(vdict[vlist[i]],vdict[vlist[i+1]])
#print('pd1 pd2:',pd1,pd2)
if pd1+pd2>max1:
max1=pd1+pd2
max1no=i+1
avgpd = pearson_distance(vdict[vlist[i]],avg)
#print('avgpd:',avgpd)
if avgpd>max2:
max2=avgpd
max2no=i+1
print('pick max1 max2:',max1no,max2no,vlist[max1no-1],vlist[max2no-1])
if max1no == max2no:
return max1no
else:
return -1
fd = open(sys.argv[1])
vlist=[]
vdict={}
for v in fd:
v = v.replace('\r','').replace('\n','')
tmpv = v.split()
vlist.append(tmpv[0])
vdict[str(tmpv[0])]=[]
for i in range(1,len(tmpv)):
vdict[str(tmpv[0])].append(float(tmpv[i]))
fd.close()
roundno = 1
pickno = pick_outlier(vlist,vdict)
print(len(vlist),len(vdict))
while pickno != -1:
print('round:',roundno,' pick outlier:',pickno)
roundno+=1
'''
if roundno>1:
break
'''
vdict.pop(vlist[pickno-1])
vlist.pop(pickno-1)
print(len(vlist),len(vdict))
pickno = pick_outlier(vlist,vdict)