#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 3 16:35:27 2017
@author: vicky
"""
class simhash:
    # Constructor
    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    # String representation
    def __str__(self):
        return str(self.hash)

    # Build the simhash fingerprint from a sequence of tokens
    def simhash(self, tokens):
        v = [0] * self.hashbits
        for t in [self._string_hash(x) for x in tokens]:  # t is the ordinary hash of one token
            for i in range(self.hashbits):
                bitmask = 1 << i
                if t & bitmask:
                    v[i] += 1  # if the current bit is 1, increment this position
                else:
                    v[i] -= 1  # otherwise decrement it
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint  # the document fingerprint sets every bit whose running count is >= 0

    # Hamming distance between two fingerprints
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1
        return tot

    # Similarity as the ratio of the two fingerprint values
    def similarity(self, other):
        a = float(self.hash)
        b = float(other.hash)
        if a > b:
            return b / a
        else:
            return a / b

    # Hash a single token (a variable-length version of Python's built-in string hash)
    def _string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x
if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())

    s = 'nai nai ge xiong cao'
    hash3 = simhash(s.split())

    print(hash1.hamming_distance(hash2), " ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), " ", hash1.similarity(hash3))
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
# Load the "stopwords" token list (administrative-division suffixes)
stopwords = ['省','市','区','镇','村','路','街','号']
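# NOTE: `t`, the list of raw address strings used throughout the rest of this
# script, is never defined here; it is assumed to be supplied externally.
# The three entries below are a purely hypothetical sample, added only so the
# remaining snippets can be exercised end to end.
t = [u'浙江省杭州市西湖区文一路1号',
     u'浙江省杭州市西湖区文一路2号',
     u'北京市海淀区中关村大街3号']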
t[1].split(u'省')   # split the second address at the province suffix
t[1].find(u'省')    # position of the '省' character, -1 if absent
s = u'中文截取'
s[0:3]  # in Python 3, str slicing handles Chinese characters directly; no decode/encode needed
import jieba
seg_list = jieba.cut("我来到北京清华大学")
print("Default Mode:", ' '.join(seg_list))
def tokenize_only(text):
    # Split into sentences first, then into words; punctuation survives as tokens
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Drop tokens that contain no letters (e.g. numbers, bare punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
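# Quick check of tokenize_only on an English sentence (assumes the NLTK 'punkt'
# tokenizer data has already been downloaded, e.g. via nltk.download('punkt')).
print(tokenize_only('This is a test string, for testing!'))
# expected: ['this', 'is', 'a', 'test', 'string', 'for', 'testing']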
import re

n = len(t)  # input address data t, n = number of samples

t2 = list(t)  # will hold the addresses with the separator characters removed
for i in range(n):
    province = ''
    city = ''
    district = ''
    # Split each address on administrative suffixes: province/city/district/town/village/road/street
    m = re.split(u'省|市|区|镇|村|路|街', t[i])
    t2[i] = ''.join(m)  # address text with the suffix characters stripped out
    if not m:
        print(t[i] + '|||')
        continue
    # If the address contains '省', the first segment is the province name
    if t[i].rfind(u'省') >= 0:
        province = m[0]
        if len(m) >= 2:
            city = m[1]
        if len(m) >= 3:
            district = m[2]
    out = '%s|%s|%s' % (province, city, district)
    print(out)
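# A closing sketch, again under the assumption that `t` holds the address strings:
# tokenize each cleaned address in t2 with jieba, fingerprint it with the simhash
# class defined above, and compare every pair by Hamming distance; small distances
# suggest two addresses are near-duplicates.
hashes = [simhash(list(jieba.cut(addr))) for addr in t2]
for i in range(len(hashes)):
    for j in range(i + 1, len(hashes)):
        print(i, j, hashes[i].hamming_distance(hashes[j]))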