import codecs
import json

import numpy as np


# Compute the edit distance.
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1. The dynamic
    programming table is a NumPy array of floats, so the result comes back
    as a NumPy floating-point scalar (e.g. ``3.0``) rather than an ``int``.

    Args:
        word1: First sequence; must support ``len()`` and iteration.
        word2: Second sequence; must support ``len()`` and iteration.

    Returns:
        The minimum number of single-element edits turning ``word1`` into
        ``word2``, as a NumPy float.
    """
    rows = len(word1) + 1
    cols = len(word2) + 1

    # table[r, c] holds the distance between word1[:r] and word2[:c].
    table = np.zeros((rows, cols))
    # Against an empty prefix, the distance is simply the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, ch1 in enumerate(word1, start=1):
        for c, ch2 in enumerate(word2, start=1):
            cost = 0 if ch1 == ch2 else 1
            table[r, c] = min(
                table[r - 1, c - 1] + cost,  # match / substitution
                table[r - 1, c] + 1,         # deletion from word1
                table[r, c - 1] + 1,         # insertion into word1
            )

    return table[rows - 1, cols - 1]


# 190801
# Compute a similarity score from the edit distance.
def simility(word1, word2):
    """Return a similarity score derived from the edit distance.

    The score is ``1 - edit_distance(word1, word2) / max(len(word1), len(word2))``,
    so identical sequences score 1.0 and the score drops as more edits are
    needed relative to the longer input.

    Args:
        word1: First sequence (must support ``len()``).
        word2: Second sequence (must support ``len()``).

    Returns:
        The similarity as a float. Two empty sequences are treated as
        identical and yield ``1.0``.
    """
    max_len = max(len(word1), len(word2))
    if max_len == 0:
        # Both inputs are empty. Without this guard the formula evaluates
        # 0 / 0, which produced nan (plus a runtime warning) instead of a
        # usable score; two empty sequences are identical, so report 1.0.
        return 1.0
    distance = edit_distance(word1, word2)
    # float() keeps this a true division whatever numeric type the distance has.
    return 1 - distance / float(max_len)
# Script section: load the sentences, group them greedily by similarity and
# dump the resulting cluster ids to JSON.

bianhaos = []  # declared but never filled in the active code below
sub_sens = []  # raw input lines, one entry per line of the file

with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write_sub.txt', 'r', 'utf8') as f:
    for line in f:
        # Each line is stored as read, line terminator included. A disabled
        # earlier variant split the line on '<->' into an id and a sentence,
        # stripped the sentence and also collected the ids into `bianhaos`.
        sub_sens.append(line)

count = len(sub_sens)
leibie = [-1] * count  # cluster id per line; -1 means "not assigned yet"
cla = 0                # id handed to the next cluster that gets opened
print(count)

# Greedy single pass: the first unassigned line opens a new cluster and pulls
# in every other still-unassigned line whose similarity to it is >= 0.5.
# NOTE(review): the original indentation was lost; the id increment and the
# progress print are placed at the end of each outer iteration, the placement
# that yields one id per cluster — confirm against the original file.
for i, sub1 in enumerate(sub_sens):
    if leibie[i] != -1:
        continue
    leibie[i] = cla
    for j, sub2 in enumerate(sub_sens):
        if leibie[j] == -1 and simility(sub1, sub2) >= 0.5:
            leibie[j] = cla
    cla += 1
    print(i)

print(leibie)

with open('leibie05.json', 'w') as f:
    json.dump(leibie, f)