将python算法转为scala_海量文本去重simhash算法(python&scala)

1.python(Numpy实现)

具体公式见reference中的论文。

# -*- coding: utf-8 -*-

"""

Created on Mon May 19 09:32:00 2018

@author: wangyao

"""

import jieba

#simhash值直接用包计算,pip install simhash

from simhash import Simhash

import re

import numpy as np

import pandas as pd

# Stop words: one entry per line of Stopwords.txt, with surrounding
# whitespace stripped. The context manager closes the file handle as soon
# as the list has been built.
with open('Stopwords.txt', 'r', encoding='utf-8') as _stopwords_file:
    stopwords = [line.strip() for line in _stopwords_file]

def get_features(s):
    """Preprocess text and extract character n-gram features.

    Removes every non-word character from ``s``, segments the remainder with
    ``jieba.lcut``, keeps the tokens that are longer than one character, are
    made up solely of characters in the range U+4E00..U+9FA5 and are not in
    ``stopwords``, concatenates them, and slides a 3-character window over
    the concatenated string.

    Args:
        s: Raw input text.

    Returns:
        A list of overlapping 3-character substrings of the filtered text
        (a single, shorter element when fewer than 3 characters remain).
        When no token survives filtering, a hint is printed and ``None`` is
        returned.
    """
    width = 3
    lo, hi = '\u4e00', '\u9fa5'
    cleaned = re.sub(r'[^\w]+', '', s)
    # Range-check every character of a token. Comparing the whole token
    # against the bounds is a lexicographic comparison, which effectively
    # tests only the first character and rejects multi-character tokens
    # that begin with U+9FA5.
    tokens = [
        tok for tok in jieba.lcut(cleaned)
        if len(tok) > 1
        and all(lo <= ch <= hi for ch in tok)
        and tok not in stopwords
    ]
    string = ''.join(tokens)
    if not string:
        print("请输入中文文档")
        return None
    return [string[i:i + width] for i in range(max(len(string) - width + 1, 1))]

#list1 = df.content.apply(lambda x: isinstance(x, str))

def Pre_Processing(s):
    """Preprocess text into a single string of filtered tokens.

    Removes every non-word character from ``s``, segments the remainder with
    ``jieba.lcut``, keeps the tokens that are longer than one character, are
    made up solely of characters in the range U+4E00..U+9FA5 and are not in
    ``stopwords``, and joins them without a separator.

    Args:
        s: Raw input text.

    Returns:
        The concatenated filtered tokens. When nothing survives filtering, a
        hint is printed and ``None`` is returned.
    """
    lo, hi = '\u4e00', '\u9fa5'
    cleaned = re.sub(r'[^\w]+', '', s)
    # Range-check every character of a token. Comparing the whole token
    # against the bounds is a lexicographic comparison, which effectively
    # tests only the first character and rejects multi-character tokens
    # that begin with U+9FA5.
    kept = [
        tok for tok in jieba.lcut(cleaned)
        if len(tok) > 1
        and all(lo <= ch <= hi for ch in tok)
        and tok not in stopwords
    ]
    string = ''.join(kept)
    if not string:
        print('请勿输入空字符串或者完全由停用词组成的无意义的句子')
        return None
    return string

def hanming_simhash(s1, s2):
    """Score two texts using the simhash package's own Hamming distance.

    Each input is passed through ``Pre_Processing`` and hashed with
    ``Simhash``; the distance reported by ``Simhash.distance`` is divided by
    64 and subtracted from 1.

    Args:
        s1: First raw text.
        s2: Second raw text.

    Returns:
        ``1 - distance / 64``. Note that this is a similarity-style score,
        not the raw distance.
    """
    fingerprint_a = Simhash(Pre_Processing(s1))
    fingerprint_b = Simhash(Pre_Processing(s2))
    differing_bits = fingerprint_a.distance(fingerprint_b)
    # NOTE(review): 64 is presumably the fingerprint width used by Simhash
    # here -- confirm against the simhash package defaults.
    return 1 - differing_bits / 64

def ToSimhashcode(s):
    """Convert a string into its simhash value.

    The string is turned into features by ``get_features`` and the ``value``
    attribute of the resulting ``Simhash`` object is returned.

    Args:
        s: Text to hash. Must be a ``str`` (subclasses are accepted).

    Returns:
        The ``value`` of ``Simhash(get_features(s))``. When ``s`` is not a
        string, a hint is printed and ``None`` is returned.
    """
    # isinstance instead of an exact type comparison so that str subclasses
    # are treated as strings too.
    if not isinstance(s, str):
        print('输入的句子格式需要是字符串')
        return None
    return Simhash(get_features(s)).value

#自己写的汉明距离

def hanming_distance(s1,s2):

if type(s1) == str and type(s2) == str:

hanmingdistance = bin(int(hex(Simhash(get_features(s1)).value),16)^int(hex(Simhash(get_features(s2)).value),16)).count('1')

elif type(s1) == int and type(s2) == int:

hanmingdistance = bin(int(hex(s1),16)^int(hex(s2),16)).

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值