#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 3 16:35:27 2017
@author: vicky
"""
class simhash:
    # Constructor
    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    # String representation
    def __str__(self):
        return str(self.hash)

    # Build the simhash fingerprint from a sequence of tokens
    def simhash(self, tokens):
        v = [0] * self.hashbits
        for t in [self._string_hash(x) for x in tokens]:  # t is the ordinary hash of one token
            for i in range(self.hashbits):
                bitmask = 1 << i
                if t & bitmask:
                    v[i] += 1  # if the current bit is 1, increment this position
                else:
                    v[i] -= 1  # otherwise decrement it
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint  # the document fingerprint sets every bit whose running count is >= 0

    # Hamming distance between two fingerprints
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1
        return tot

    # Similarity as the ratio of the two fingerprint values
    def similarity(self, other):
        a = float(self.hash)
        b = float(other.hash)
        if a > b:
            return b / a
        else:
            return a / b

    # Hash a single token (a variable-length version of Python's built-in string hash)
    def _string_hash(self, source):
        if source == "":
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x
if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())

    s = 'nai nai ge xiong cao'
    hash3 = simhash(s.split())

    print(hash1.hamming_distance(hash2), " ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), " ", hash1.similarity(hash3))
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
# Load the "stopwords" token list (administrative-division suffixes)
stopwords = ['省','市','区','镇','村','路','街','号']
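# NOTE: `t`, the list of raw address strings used throughout the rest of this
# script, is never defined here; it is assumed to be supplied externally.
# The three entries below are a purely hypothetical sample, added only so the
# remaining snippets can be exercised end to end.
t = [u'浙江省杭州市西湖区文一路1号',
     u'浙江省杭州市西湖区文一路2号',
     u'北京市海淀区中关村大街3号']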
t[1].split(u'省')   # split the second address at the province suffix
t[1].find(u'省')    # position of the '省' character, -1 if absent
s = u'中文截取'
s[0:3]  # in Python 3, str slicing handles Chinese characters directly; no decode/encode needed
import jieba
seg_list = jieba.cut("我来到北京清华大学")
print("Default Mode:", ' '.join(seg_list))
def tokenize_only(text):
    # Split into sentences first, then into words; punctuation survives as tokens
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Drop tokens that contain no letters (e.g. numbers, bare punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
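# Quick check of tokenize_only on an English sentence (assumes the NLTK 'punkt'
# tokenizer data has already been downloaded, e.g. via nltk.download('punkt')).
print(tokenize_only('This is a test string, for testing!'))
# expected: ['this', 'is', 'a', 'test', 'string', 'for', 'testing']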
import re

n = len(t)  # input address data t, n = number of samples

t2 = list(t)  # will hold the addresses with the separator characters removed
for i in range(n):
    province = ''
    city = ''
    district = ''
    # Split each address on administrative suffixes: province/city/district/town/village/road/street
    m = re.split(u'省|市|区|镇|村|路|街', t[i])
    t2[i] = ''.join(m)  # address text with the suffix characters stripped out
    if not m:
        print(t[i] + '|||')
        continue
    # If the address contains '省', the first segment is the province name
    if t[i].rfind(u'省') >= 0:
        province = m[0]
        if len(m) >= 2:
            city = m[1]
        if len(m) >= 3:
            district = m[2]
    out = '%s|%s|%s' % (province, city, district)
    print(out)
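# A closing sketch, again under the assumption that `t` holds the address strings:
# tokenize each cleaned address in t2 with jieba, fingerprint it with the simhash
# class defined above, and compare every pair by Hamming distance; small distances
# suggest two addresses are near-duplicates.
hashes = [simhash(list(jieba.cut(addr))) for addr in t2]
for i in range(len(hashes)):
    for j in range(i + 1, len(hashes)):
        print(i, j, hashes[i].hamming_distance(hashes[j]))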