# Module-level state shared by the functions in this script.
feature = []  # one list of numeric feature values per record
result = []  # class label per record: 1 for 'won', 0 otherwise
theta = []  # model parameters; index 0 is the intercept term
tempfeature = []  # not read by any code visible in this file
test_result = []  # predicted labels appended by getTestResult()
def getDataFromFile():
    """Load the data set into the module-level ``feature`` and ``result`` lists.

    Reads ``kr-vs-kp.data.txt`` (path relative to the current working
    directory).  Every non-blank line is a comma-separated record: all fields
    except the last are converted to their code points with ``ord`` and
    appended to ``feature`` as one list; the last field is the class label,
    stored in ``result`` as 1 for ``'won'`` and 0 for anything else.

    Blank lines are skipped, so ``feature`` and ``result`` always have the
    same length.  (Previously blank lines produced empty feature rows that
    were removed afterwards by two hard-coded ``pop()`` calls, while the
    matching labels were left behind in ``result``.)

    Raises:
        OSError: if the data file cannot be opened.
        TypeError: if a feature field is not exactly one character long.
    """
    with open('kr-vs-kp.data.txt') as fileData:
        for line in fileData:
            record = line.rstrip('\r\n')
            if not record.strip():
                # Blank line (e.g. trailing newlines at end of file).
                continue
            fields = record.split(',')
            # Transform the single-character feature codes to integers.
            feature.append([ord(item) for item in fields[:-1]])
            # Strip the label instead of chopping one character, so a final
            # line without a trailing newline is still recognised.
            result.append(1 if fields[-1].strip() == 'won' else 0)
def feature_scale(feature_list):
    """Mean-normalize every inner list of *feature_list* in place.

    Each inner list is replaced by a new list whose values are
    ``(value - mean) / (max - min + 1)``, with mean, max and min taken over
    that same inner list.  The ``+ 1`` keeps the denominator non-zero when
    all values of a list are equal.

    Empty inner lists are left untouched; previously they caused a
    ``ZeroDivisionError`` when computing the mean.

    NOTE(review): the statistics are computed per inner list, not per column
    across lists -- confirm this is the intended kind of scaling.

    Args:
        feature_list: list of lists of numbers; modified in place.

    Returns:
        None.
    """
    for index, values in enumerate(feature_list):
        if not values:
            continue
        mean = sum(values) / len(values)
        spread = max(values) - min(values) + 1
        feature_list[index] = [(value - mean) / spread for value in values]
def inittheta(size=37):
    """Reset the module-level ``theta`` list to *size* entries of ``1.0``.

    The list object is updated in place, so any other name bound to the same
    list sees the new values.  Calling the function again re-initialises the
    parameters instead of appending further entries.

    Args:
        size: number of parameters to create.  The default of 37 is the
            value that was previously hard-coded (one intercept plus one
            weight for each of 36 features).
    """
    theta[:] = [1.0] * size
def hypothesisOfLogisticRegression(theta_list,feature_record):
    """Return the logistic (sigmoid) hypothesis for a single record.

    Computes ``sigmoid(theta_list[0] + sum(theta_list[k + 1] * feature_record[k]))``
    where ``theta_list[0]`` is the intercept and ``theta_list[k + 1]`` is the
    weight of ``feature_record[k]``.  Weights and features are paired up to
    the shorter of the two sequences, so the record length is no longer
    hard-coded to 36.

    Args:
        theta_list: sequence of parameters, intercept first.
        feature_record: sequence of numeric feature values for one record.

    Returns:
        A float in the closed interval [0.0, 1.0].

    Raises:
        IndexError: if *theta_list* is empty.
    """
    import math  # local import so the function works without a module-level one

    z = 0.0
    for weight, value in zip(theta_list[1:], feature_record):
        z += weight * value
    z += theta_list[0]

    # Evaluate the sigmoid in the form whose exponent is never positive:
    # math.exp(-z) raises OverflowError for large negative z.
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)
def calculteCostFunction(thetaList,featureList,resultList):
    """Return the mean log-loss of the model over the given records.

    For every (record, label) pair the term
    ``label * log10(h) + (1 - label) * log10(1 - h)`` is accumulated, where
    ``h`` is ``hypothesisOfLogisticRegression(thetaList, record)``; the
    negated sum divided by the number of pairs is returned.  Base-10
    logarithms are kept from the original implementation.

    Fixes over the previous version:
      * the accumulator was reset inside the loop, so only the last record
        contributed to the returned cost;
      * the record count was hard-coded to 3196;
      * a hypothesis of exactly 0 or 1 caused a ``log10(0)`` domain error.

    Args:
        thetaList: model parameters, intercept first.
        featureList: sequence of feature records.
        resultList: sequence of 0/1 labels; paired with *featureList* up to
            the shorter of the two.

    Returns:
        The mean cost as a float, or 0.0 when there are no records.
    """
    import math  # local import so the function works without a module-level one

    epsilon = 1e-15  # keeps both log10 arguments strictly positive
    total = 0.0
    count = 0
    for record, label in zip(featureList, resultList):
        hypothesis = hypothesisOfLogisticRegression(thetaList, record)
        hypothesis = min(max(hypothesis, epsilon), 1 - epsilon)
        total += (label * math.log10(hypothesis)
                  + (1 - label) * math.log10(1 - hypothesis))
        count += 1
    if count == 0:
        return 0.0
    return -total / count
def SGD(learning_rate=0.01, tolerance=0.0000000001):
    """Fit the module-level ``theta`` in place by stochastic gradient descent.

    Records of ``feature`` / ``result`` are visited in order, repeatedly.
    After each single-record update the full cost is re-evaluated with
    ``calculteCostFunction``; training stops as soon as one update lowers the
    cost by no more than *tolerance* (this includes the cost going up).

    Fixes over the previous version:
      * the prediction error is computed once per record, before any
        parameter is changed, so all parameters are updated simultaneously
        (``temptheta = theta`` was an alias, not a copy);
      * the loop stops at convergence instead of printing the final message
        once for every remaining record;
      * record and parameter counts are taken from the data instead of the
        hard-coded 3196 and 37.

    Args:
        learning_rate: step size of each update (previously hard-coded 0.01).
        tolerance: minimum cost decrease required to keep training
            (previously hard-coded 1e-10).
    """
    if not feature:
        return

    cost = calculteCostFunction(theta, feature, result)
    while True:
        for record, label in zip(feature, result):
            print('the value of cost function is:',cost)
            error = hypothesisOfLogisticRegression(theta, record) - label
            theta[0] -= learning_rate * error
            for j, value in zip(range(1, len(theta)), record):
                theta[j] -= learning_rate * error * value
            new_cost = calculteCostFunction(theta, feature, result)
            print('new cost is ',new_cost)
            if cost - new_cost <= tolerance:
                print("Find the optimal theta")
                return
            cost = new_cost
def GD(learning_rate=0.01, tolerance=0.00001):
    """Fit the module-level ``theta`` in place by batch gradient descent.

    Each iteration sums ``(hypothesis - label) * feature_value`` over all
    records of ``feature`` / ``result`` (the sum is not averaged), then moves
    every parameter by ``learning_rate`` times its gradient component.
    Iteration stops when one full step lowers the cost reported by
    ``calculteCostFunction`` by no more than *tolerance*; at least one step
    is always taken.

    Fixes over the previous version:
      * the gradient accumulator is rebuilt every iteration (it used to keep
        growing and carried the sums of all earlier iterations);
      * prediction errors are computed once per record from the parameters
        at the start of the iteration, so the update is simultaneous
        (``temp2theta = theta`` was an alias, not a copy) and the hypothesis
        is no longer re-evaluated for every parameter;
      * record and parameter counts are taken from the data instead of the
        hard-coded 3196 and 37.

    Args:
        learning_rate: step size (previously hard-coded 0.01).
        tolerance: minimum cost decrease required to keep iterating
            (previously hard-coded 0.00001).
    """
    cost = calculteCostFunction(theta, feature, result)
    while True:
        print('Cost is',cost)

        gradient = [0.0] * len(theta)
        for record, label in zip(feature, result):
            error = hypothesisOfLogisticRegression(theta, record) - label
            gradient[0] += error
            for j, value in zip(range(1, len(theta)), record):
                gradient[j] += error * value

        for j, component in enumerate(gradient):
            theta[j] -= learning_rate * component

        new_cost = calculteCostFunction(theta, feature, result)
        print('new cost is ',new_cost)
        if cost - new_cost <= tolerance:
            break
        cost = new_cost

    print('Find optimal theta ',theta)
    print('cost is',new_cost)
def getTestResult():
    """Append one predicted label per record of ``feature`` to ``test_result``.

    A record is labelled 1 when the hypothesis computed with the current
    ``theta`` is at least 0.5, and 0 otherwise.
    """
    for record in feature:
        probability = hypothesisOfLogisticRegression(theta, record)
        test_result.append(1 if probability >= 0.5 else 0)
def TestLRClassifier():
    """Print the share of correct and wrong predictions.

    Compares ``test_result`` with ``result`` position by position over the
    records available in ``feature`` and prints both proportions.  The number
    of records is taken from the data; the previous hard-coded 3196 raised
    ``IndexError`` for any other data size, and an empty ``feature`` list
    caused a ``ZeroDivisionError``.
    """
    total = min(len(feature), len(test_result), len(result))
    if total == 0:
        print('no records to evaluate')
        return

    correct_count = sum(
        1
        for predicted, actual in zip(test_result[:total], result[:total])
        if predicted == actual
    )
    wrong_count = total - correct_count
    print('the correct proportion is ',correct_count/total)
    print('the wrong proportion is ',wrong_count/total)
# Script driver: runs at import time, in this exact order.
# 1. Fill the module-level `feature` and `result` lists from the data file.
getDataFromFile()
# 2. Populate the module-level `theta` list.
inittheta()
#print(theta)
# 3. Rescale the loaded feature records in place.
feature_scale(feature)
# 4. The next two calls discard their return values; they only exercise the
#    hypothesis and cost functions once before training starts.
hypothesisOfLogisticRegression(theta, feature[0])
calculteCostFunction(theta, feature, result)
# 5. Train theta with batch gradient descent (SGD() is defined but not called).
GD()
#print('finally',theta)
# 6. Predict labels for the same records used for training and print the
#    proportion of correct and wrong predictions.
getTestResult()
TestLRClassifier()