Another classic piece of Python code:

#!/usr/bin/env python
#encoding=utf-8
# Python 2 script; force utf-8 as the default string encoding
import redis,codecs,sys,time,datetime,doctest
reload(sys)
sys.setdefaultencoding('utf-8')
# Stream wrapper that flushes after every write, so progress output
# shows up immediately even when stdout is redirected to a file.
class Unbuffered:
    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        return getattr(self.stream, attr)

sys.stdout = Unbuffered(sys.stdout)

def read_keys():
    # connect first, then fetch all keys from db 6
    r=redis.Redis(host='localhost',db=6)
    keys=r.keys()
    print len(keys)
    f=codecs.open("query_keys.txt","w","utf-8")
    #print r.info()
    for key in keys:
        print key
        #print type(key)
        f.write("%s\n"%(key,))
    f.close()

def read_relevent_words():
    # connect first, then fetch all keys from db 6
    r=redis.Redis(host='localhost',db=6)
    keys=r.keys()
    print len(keys)
    f=codecs.open("query_relevent_words.txt","w","utf-8")
    for key in keys:
#        print r.get(key)
        f.write("%s\n"%(r.get(key),))
    f.close()

def parser_one_line_one_words():
    ff=codecs.open("parser_one_line_one_words.txt","w","utf-8")
    f=codecs.open("query_relevent_words.txt","r","utf-8")
    for line in f.readlines():
        li=line.strip().split("*")
        for elem in li:
            ff.write("%s\n"%(elem,))
    ff.close()


def parser_one_line_one_words2():
    s=set()
    ff=codecs.open("parser_one_line_one_words.txt","w","utf-8")
    f=codecs.open("query_relevent_words.txt","r","utf-8")
    for line in f.readlines():
        li=line.strip().split("*")
        for elem in li:
            s.add(elem.strip())
            ff.write("%s\n"%(elem,))
    ff.close()
    print len(s)

def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
    f1=codecs.open("parser_one_line_one_words_uniq.txt","r","utf-8")
    f2=codecs.open("parser_one_line_one_words_uniq_result.txt","r","utf-8")
    count=0
    for a,b in zip(f1.readlines(),f2.readlines()):
        count+=1
        if a.strip()!=b.replace(" ","").strip():
            print count,a,b
            time.sleep(5)   

def build_invert_index():
    """
    对wname建倒排索引
    以set结构存放倒排数据
    """
    r=redis.Redis(db=1)
    p=r.pipeline()
    count=0
    #for line in codecs.open("../result_text.txt","r","utf-8").readlines():
    for line in codecs.open("../output_result_process","r","utf-8").readlines():
        count+=1
        #if count<2553148:
        #    continue
        #print count
        #print line,
        #print line.strip().split(" ").__len__()
        for elem in line.strip().split(" "):
            p.sadd(elem.strip(),count)
        if count%10000==0:
            print count
            print "batch insert to redis ..."
            s=datetime.datetime.now()
            p.execute()
            e=datetime.datetime.now()
            print "done:%s"%((e-s).seconds)
    p.execute()
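
# A minimal query sketch for the inverted index above (my addition, not
# part of the original pipeline): assuming db=1 has been populated by
# build_invert_index(), SINTER over the posting sets yields the ids of
# all lines that contain every given token.
def query_invert_index(tokens):
    r=redis.Redis(db=1)
    # each token's set holds the line numbers where it occurred;
    # intersecting the sets finds lines containing all tokens at once
    return r.sinter(*[t.strip() for t in tokens if t.strip()!=""])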



def is_chinese(uchar):
    """判断一个unicode是否是汉字"""
    if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
        return True
    else:
        return False
   
def is_number(uchar):
    """判断一个unicode是否是数字"""
    if uchar >= u'\u0030' and uchar<=u'\u0039':
        return True
    else:
        return False
   
def is_alphabet(uchar):
    """判断一个unicode是否是英文字母"""
    if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
        return True
    else:
        return False

def is_other(uchar):
    """Check whether a character is neither Chinese, a digit, nor a letter."""
    if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
        return True
    else:
        return False
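
# Quick sanity checks for the character classifiers above (my addition;
# the sample characters are arbitrary):
assert is_chinese(u'中') and not is_chinese(u'a')
assert is_number(u'7') and not is_number(u'中')
assert is_alphabet(u'Q') and not is_alphabet(u'7')
assert is_other(u'*') and not is_other(u'中')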

def _filter(line):
    """
    Drop tokens from a segmented wname line that contain characters
    other than Chinese characters, letters, and digits.
    """
    r=[]
    for elem in line.strip().split(" "):
        element=elem.strip()
        if not isinstance(element,unicode):
            element=element.decode("utf-8","ignore")
        # the classifiers above look at single characters, so test each
        # character of the token rather than the token as a whole
        if element!="" and not any(is_other(ch) for ch in element):
            r.append(element)
    return " ".join(r)

def post_process_wname_segments_illegal_characters():
    f=codecs.open("../output_result_process","w","utf-8")
    for line in codecs.open("../output_result","r","utf-8").readlines():
        s=_filter(line)
        print s
        f.write(s+"\n")
    f.close()

def build_word_segments_hash_map():
    """
    给查询词和相关词建立原词-分词结果之间的hashmap
    """
    r2=redis.Redis(db=2)
    p=r2.pipeline()
    f1=codecs.open("parser_one_line_one_words_uniq.txt","r","utf-8")
    #f2=codecs.open("parser_one_line_one_words_uniq_result.txt","r","utf-8")
    f2=codecs.open("parser_one_line_one_words_uniq_result_pku.txt","r","utf-8")
    count=0
    for a,b in zip(f1.readlines(),f2.readlines()):
        count+=1
        p.set(a.strip(),b.strip())
        if count%10000==0:
            print count
            print "batch insert to redis ..."
            s=datetime.datetime.now()
            p.execute()
            e=datetime.datetime.now()
            print "done:%s"%((e-s).seconds)
    p.execute()

    f1=codecs.open("query_keys.txt","r","utf-8")
    #f2=codecs.open("query_keys_result.txt","r","utf-8")
    f2=codecs.open("query_keys_result_pku.txt","r","utf-8")
    count=0
    for a,b in zip(f1.readlines(),f2.readlines()):
        count+=1
        p.set(a.strip(),b.strip())
        if count%10000==0:
            print count
            print "batch insert to redis ..."
            s=datetime.datetime.now()
            p.execute()
            e=datetime.datetime.now()
            print "done:%s"%((e-s).seconds)
    p.execute()
    r2.bgsave()
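
# A minimal lookup sketch for the hash map above (my addition): assuming
# db=2 has been filled by build_word_segments_hash_map(), each original
# word maps to its space-separated segmentation.
def lookup_segments(word):
    r2=redis.Redis(db=2)
    # returns the segmentation string, or None if the word is unknown
    return r2.get(word.strip())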

def _build_list_for_inter_args(s1,s2):
    """
    Combine two segmented strings into one token list to pass to
    r.sinter, dropping empty tokens.
    """
    r=[]
    r.extend(s1.split(" "))
    r.extend(s2.split(" "))
    return [elem.strip() for elem in r if elem.strip()!=""]

def final_find_synomns_out():
    """
    For each query and each of its related words, look up both
    segmentations in db=2 and intersect the posting sets of all their
    tokens in db=1; pairs with a non-empty intersection are written
    out as synonym candidates.
    """
    #f=codecs.open("synomns.txt","w","utf-8")
    f=codecs.open("synomns_pku.txt","w","utf-8")
    r1=redis.Redis(db=1)
    r2=redis.Redis(db=2)
    f1=codecs.open("query_keys.txt","r","utf-8")
    f2=codecs.open("query_relevent_words.txt","r","utf-8")
    count=0
    validateCount=0
    for a,b in zip(f1.readlines(),f2.readlines()):
        count+=1
        #print count
        query_segments=r2.get(a.strip())
        for elem in b.split("*"):
            if elem.strip()=="":
                continue
            if len(r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(elem.strip()))))>0:
                validateCount+=1
                if validateCount%1000==0:
                    print "validateCount:%s\n"%validateCount
                f.write("%s|||%s\n"%(a.strip(),elem.strip()))
                f.flush()
    f.close()
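
# The core test of final_find_synomns_out() as a standalone sketch (my
# addition): two words are synonym candidates when at least one indexed
# wname line contains every token of both segmentations.
def are_synonym_candidates(word_a,word_b):
    r1=redis.Redis(db=1)   # inverted index: token -> set of line ids
    r2=redis.Redis(db=2)   # hash map: word -> segmentation result
    seg_a=r2.get(word_a.strip())
    seg_b=r2.get(word_b.strip())
    if seg_a is None or seg_b is None:
        return False
    return len(r1.sinter(*_build_list_for_inter_args(seg_a,seg_b)))>0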

def interactive_mode():
    # connect once, outside the prompt loop
    r1=redis.Redis(db=1)
    r2=redis.Redis(db=2)
    while True:
        line=raw_input("input query|||relevent_word:\n")
        a,b=line.strip().split("|||")
        query_segments=r2.get(a.strip())
        print a.strip(),"==>",query_segments
        print b.strip(),"==>",r2.get(b.strip())
        print r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b.strip())))
        print "========="

def c1(line):
    """
    True if either side of the pair is split by spaces.
    >>> c1("执手|||把手")
    False
    """
    a,b=line.strip().split("|||")
    return len(a.split(" "))>1 or len(b.split(" "))>1

def c2(line):
    """
    True if one side is a substring of the other.
    >>> c2("执手|||把手")
    False
   
    >>> c2("浓缩咖啡|||咖啡")
    True
    """
    a,b=line.strip().split("|||")
    return (a in b) or (b in a)

def filter_synonym_result():
    """
    Filter synomns_pku.txt (query/relevent_word pairs whose pku
    segmentations share at least one indexed line), dropping pairs
    that are split by spaces or where one side is a substring of
    the other.
    """
   
    f=codecs.open("synomns_pku_filter.txt","w","utf-8")
    for line in codecs.open("synomns_pku.txt","r","utf-8").readlines():
        if c1(line)==False and c2(line)==False:
            f.write(line)
    f.close()
   
           
       
if __name__=="__main__":
#    doctest.testmod()
#    read_relevent_words()
#    parser_one_line_one_words2()
#    compare_pareser_one_line_one_words_result_lost_line_for_tmp()
#    build_invert_index()
#    build_word_segments_hash_map()
#    final_find_synomns_out()   
#    interactive_mode()
#    print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
#    print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
#    post_process_wname_segments_illegal_characters()
    filter_synonym_result()   

posted on 2012-11-09 16:04 by lexus

Reposted from: https://www.cnblogs.com/lexus/archive/2012/11/09/2762808.html
