address.py-20170903

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  3 16:35:27 2017

@author: vicky
"""

#!/usr/bin/python  
# coding=utf-8  
class simhash:
    """SimHash fingerprint of a sequence of string tokens.

    The fingerprint is an integer of ``hashbits`` bits. Bit *i* is set when at
    least as many token hashes have bit *i* set as have it clear (ties, and
    therefore empty input, produce a set bit).

    Attributes:
        hashbits: Width of the fingerprint in bits.
        hash: The fingerprint as a non-negative int.
    """

    def __init__(self, tokens='', hashbits=128):
        """Compute and store the fingerprint of *tokens*.

        Args:
            tokens: Iterable of string tokens. Passing a plain string iterates
                over its characters.
            hashbits: Number of bits in the fingerprint.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Return the ``hashbits``-wide fingerprint of *tokens*.

        Each token votes +1 on every bit position that is set in its hash and
        -1 on every position that is clear; positions whose total is >= 0 are
        set in the result.
        """
        votes = [0] * self.hashbits
        for token in tokens:
            token_hash = self._string_hash(token)
            for i in range(self.hashbits):
                votes[i] += 1 if token_hash & (1 << i) else -1

        fingerprint = 0
        for i, vote in enumerate(votes):
            if vote >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of bit positions in which the two fingerprints differ.

        Only the low ``self.hashbits`` bits are compared.
        """
        differing = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        return bin(differing).count('1')

    def similarity(self, other):
        """Return the ratio of the smaller fingerprint value to the larger.

        The result lies in ``[0.0, 1.0]``. Equal fingerprints, including the
        case where both are 0, give 1.0; previously two zero fingerprints
        raised ZeroDivisionError.
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            return 1.0
        # Fingerprints are non-negative, so the larger one is > 0 here.
        return min(a, b) / max(a, b)

    def _string_hash(self, source):
        """Hash the string *source* with a width bounded by ``hashbits``.

        This is a variable-width version of the classic Python string hash:
        multiply by 1000003, XOR in each character, and finally XOR in the
        length. The empty string hashes to 0.
        """
        if source == "":
            return 0
        multiplier = 1000003
        mask = 2 ** self.hashbits - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        # The length is XOR-ed in after masking; simhash() only reads the low
        # ``hashbits`` bits, so any higher bits this sets are ignored there.
        x ^= len(source)
        # x is never negative at this point, so no -1 sentinel remap is needed.
        return x
              
if __name__ == '__main__':
    # Small self-contained demo: fingerprint three sample sentences and print
    # the Hamming distance and value-ratio similarity of the first one against
    # the other two. Only the literal samples are used, because no address
    # data is defined at this point in the file.
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())

    s = 'nai nai ge xiong cao'
    hash3 = simhash(s.split())

    print(hash1.hamming_distance(hash2), "   ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "   ", hash1.similarity(hash3))
    
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

# Administrative-division suffix characters treated as "stopwords" when
# normalising addresses.
stopwords = ['省', '市', '区', '镇', '村', '路', '街', '号']

# NOTE(review): `t` is not defined in this file; it is presumably a sequence
# of address strings supplied by the interactive session -- confirm.
# Split one address at the province marker (str.split, since np.split cannot
# split a string on a separator).
province_parts = t[1].split('省')

# Position of the province marker in the address, or -1 when it is absent.
# NOTE(review): the original used an R-style `which(...)`; locating the
# character is the assumed intent -- confirm.
province_pos = t[1].find('省')

# Python 3 strings are already Unicode, so slicing takes whole characters
# without any decode/encode round trip.
s = u'中文截取'
prefix = s[0:3]

import jieba
seg_list = jieba.cut("我来到北京清华大学")
print("Default Mode:", ' '.join(seg_list))

def tokenize_only(text):
    """Return the lower-cased word tokens of *text* that contain an ASCII letter."""
    # Split into sentences first, then into words; punctuation marks come back
    # as tokens of their own at this stage.
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    # Discard every token without a letter (e.g. numbers, pure punctuation).
    return [tok for tok in lowered if re.search('[a-zA-Z]', tok)]

import re  
import sys  

# NOTE(review): `t` is not defined anywhere in this file. Per the original
# comment it is the input address data -- presumably a 1-D sequence of address
# strings supplied by the calling session. Confirm before running as a script.
n = len(t)  # number of address samples


def _split_off(text, marker):
    """Return ``(field, rest)`` where *field* precedes the first *marker*.

    When *marker* does not occur, *field* is '' and *rest* is *text* unchanged.
    """
    field, found, rest = text.partition(marker)
    return (field, rest) if found else ('', text)


# Stripped addresses go into a separate list so the raw entries in `t` are
# not overwritten while they are still being read.
t2 = list(t)
for i in range(n):
    # Drop the administrative-suffix characters from the address.
    m = re.split(u'省|市|区|镇|村|路|街', t[i])
    t2[i] = ''.join(m)

    # Peel province, city and district off the front of the raw address, in
    # that order; a level whose marker is missing is left as ''.
    province, rest = _split_off(t[i], '省')
    city, rest = _split_off(rest, '市')
    district, rest = _split_off(rest, '区')

    # One placeholder per value (the country field is not produced here).
    out = '%s|%s|%s' % (province, city, district)
    print(out)

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值