1.python(Numpy实现)
具体公式见reference中的论文。
# -*- coding: utf-8 -*-
"""
Created on Mon May 19 09:32:00 2018
@author: wangyao
"""
import jieba
#simhash值直接用包计算,pip install simhash
from simhash import Simhash
import re
import numpy as np
import pandas as pd
#停用词
# Load the stop-word list once at import time, one entry per line with
# surrounding whitespace stripped. The context manager closes the file
# handle as soon as the list has been built.
with open('Stopwords.txt', 'r', encoding='utf-8') as _stopwords_file:
    stopwords = [line.strip() for line in _stopwords_file]
#文本预处理+特征提取
def get_features(s, width=3):
    """Cut the cleaned text of *s* into overlapping character shingles.

    Non-word characters are removed, the remainder is segmented with
    ``jieba.lcut``, and only tokens that are longer than one character,
    pass the ``'\\u4e00'``..``'\\u9fa5'`` range comparison and are not in
    ``stopwords`` are kept. The kept tokens are concatenated and sliced
    with a sliding window.

    Args:
        s: Raw text to process.
        width: Window length in characters. Defaults to 3, the value that
            used to be hard-coded. Must be at least 1.

    Returns:
        A list of substrings taken at every start offset of the window; when
        the cleaned text is shorter than ``width`` the list holds the whole
        text as its single element. ``None`` is returned (after printing a
        hint) when no text survives the filtering.

    Raises:
        ValueError: If ``width`` is smaller than 1.
    """
    if width < 1:
        raise ValueError('width must be a positive integer')

    cleaned = re.sub(r'[^\w]+', '', s)
    lower, upper = '\u4e00', '\u9fa5'
    # NOTE: Python compares strings lexicographically, so for these
    # multi-character tokens the range test is effectively decided by the
    # token's leading character.
    kept = [
        token for token in jieba.lcut(cleaned)
        if len(token) > 1 and lower <= token <= upper and token not in stopwords
    ]
    text = ''.join(kept)
    if not text:
        print("请输入中文文档")
        return None

    # Always emit at least one window so short texts still yield a feature.
    window_count = max(len(text) - width + 1, 1)
    return [text[start:start + width] for start in range(window_count)]
#list1 = df.content.apply(lambda x: isinstance(x, str))
#文本预处理
def Pre_Processing(s):
    """Return the filtered jieba tokens of *s* joined into a single string.

    Non-word characters are stripped first, then the text is segmented with
    ``jieba.lcut``. A token is kept only when it is longer than one
    character, passes the ``'\\u4e00'``..``'\\u9fa5'`` range comparison and
    is not listed in ``stopwords``.

    Returns:
        The concatenation of the kept tokens, or ``None`` (after printing a
        hint) when nothing is left.
    """
    lo, hi = '\u4e00', '\u9fa5'
    kept = []
    for token in jieba.lcut(re.sub(r'[^\w]+', '', s)):
        if len(token) <= 1:
            continue
        # Lexicographic string comparison against the two boundary characters.
        if not (lo <= token <= hi):
            continue
        if token in stopwords:
            continue
        kept.append(token)

    joined = ''.join(kept)
    if not joined:
        print('请勿输入空字符串或者完全由停用词组成的无意义的句子')
        return None
    return joined
#simhash包自带的汉明距离
def hanming_simhash(s1, s2):
    """Return a similarity score for two texts based on their Simhash values.

    Each input is passed through ``Pre_Processing`` and wrapped in a
    ``Simhash``; the distance reported by ``Simhash.distance`` is then mapped
    to ``1 - distance / 64``.
    """
    first = Simhash(Pre_Processing(s1))
    second = Simhash(Pre_Processing(s2))
    bit_difference = first.distance(second)
    # NOTE(review): 64 presumably matches the fingerprint width used by
    # Simhash by default -- confirm against the simhash package.
    return 1 - bit_difference / 64
#将字符串转化为hashcode
def ToSimhashcode(s):
    """Return the Simhash value computed from the features of string *s*.

    Args:
        s: Text to fingerprint. Must be a ``str`` or a subclass of it.

    Returns:
        ``Simhash(get_features(s)).value`` for string input; ``None`` (after
        printing a hint) for any other input type.
    """
    # isinstance() also accepts str subclasses (for example numpy.str_
    # values), which an exact type() comparison would reject.
    if not isinstance(s, str):
        print('输入的句子格式需要是字符串')
        return None
    return Simhash(get_features(s)).value
#自己写的汉明距离
def hanming_distance(s1,s2):
if type(s1) == str and type(s2) == str:
hanmingdistance = bin(int(hex(Simhash(get_features(s1)).value),16)^int(hex(Simhash(get_features(s2)).value),16)).count('1')
elif type(s1) == int and type(s2) == int:
hanmingdistance = bin(int(hex(s1),16)^int(hex(s2),16)).