一段Python代码——又是一段经典的Python代码

#!/usr/bin/env python

#encoding=utf-8

import redis,codecs,sys,time,datetime,doctest

reload(sys)

sys.setdefaultencoding('utf-8')

class Unbuffered:
    """Proxy that flushes the wrapped stream after every write.

    Every attribute not defined here is served by the underlying stream,
    so the proxy can stand in for the stream transparently.
    """

    def __init__(self, stream):
        # Keep a handle on the real stream so writes can be delegated.
        self.stream = stream

    def write(self, data):
        # Forward the payload, then force it out immediately.
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Fall through to the wrapped stream for everything else.
        return getattr(self.stream, attr)

sys.stdout = Unbuffered(sys.stdout)

def read_keys():
    """Dump every key of redis db 6 to query_keys.txt, one key per line.

    NOTE(review): the original read ``keys = r.keys()`` *before* creating
    ``r`` — a guaranteed NameError; the connection is now created first.
    """
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    with codecs.open("query_keys.txt", "w", "utf-8") as f:
        for key in keys:
            print(key)  # progress echo, one key per line
            f.write("%s\n" % (key,))

def read_relevent_words():
    """Dump the value of every key in redis db 6 to
    query_relevent_words.txt, one value per line (same order as the keys).

    NOTE(review): the original read ``keys = r.keys()`` *before* creating
    ``r`` — a guaranteed NameError; the connection is now created first.
    """
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    with codecs.open("query_relevent_words.txt", "w", "utf-8") as f:
        for key in keys:
            f.write("%s\n" % (r.get(key),))

def parser_one_line_one_words():
    """Explode each '*'-separated line of query_relevent_words.txt into
    one word per line, written to parser_one_line_one_words.txt.

    Fixes of the original: the input file is now closed (with-block) and
    the file is streamed line by line instead of readlines() loading it
    all into memory.
    """
    with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
        with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
            for line in f:
                for elem in line.strip().split("*"):
                    ff.write("%s\n" % (elem,))

def parser_one_line_one_words2():
    """Like parser_one_line_one_words, but also counts the distinct
    (stripped) words and prints that count at the end.

    Fixes of the original: the input file is now closed (with-block) and
    streamed instead of readlines().  NOTE(review): the *unstripped*
    ``elem`` is written to the output file while the stripped form goes
    into the set — original behaviour, kept as-is.
    """
    distinct = set()
    with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
        with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
            for line in f:
                for elem in line.strip().split("*"):
                    distinct.add(elem.strip())
                    ff.write("%s\n" % (elem,))
    print(len(distinct))

def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
    """Line-by-line diff of parser_one_line_one_words_uniq.txt against
    parser_one_line_one_words_uniq_result.txt (ignoring spaces in the
    second file); prints each mismatch and pauses 5s so it can be read.

    Fixes of the original: files are now closed (with-blocks) and the
    Python-2-only ``<>`` operator is replaced by ``!=``.
    """
    with codecs.open("parser_one_line_one_words_uniq.txt", "r", "utf-8") as f1:
        with codecs.open("parser_one_line_one_words_uniq_result.txt", "r", "utf-8") as f2:
            count = 0
            for a, b in zip(f1.readlines(), f2.readlines()):
                count += 1
                if a.strip() != b.replace(" ", "").strip():
                    print("%s %s %s" % (count, a, b))
                    time.sleep(5)  # give the operator time to read the hit

def build_invert_index():
    """Build an inverted index over wname tokens in redis db 1.

    Each whitespace-separated token of ../output_result_process maps to a
    redis SET of the (1-based) line numbers it occurs on.  Commands are
    pipelined and flushed every 10000 lines, with one final flush for the
    remainder.

    Fixes of the original: the input file is closed (with-block) and
    streamed line by line instead of readlines() loading ~2.5M lines into
    memory at once; stale commented-out debug code removed.
    """
    r = redis.Redis(db=1)
    p = r.pipeline()
    count = 0
    with codecs.open("../output_result_process", "r", "utf-8") as src:
        for line in src:
            count += 1
            for elem in line.strip().split(" "):
                p.sadd(elem.strip(), count)
            if count % 10000 == 0:
                print(count)
                print("batch insert to redis ...")
                s = datetime.datetime.now()
                p.execute()
                e = datetime.datetime.now()
                print("done:%s" % ((e - s).seconds))
    # Flush whatever is still queued in the pipeline.
    p.execute()

def is_chinese(uchar):
    """Return True when the unicode character is a CJK ideograph
    (U+4E00..U+9FA5)."""
    return u'\u4e00' <= uchar <= u'\u9fa5'

def is_number(uchar):
    """Return True when the unicode character is an ASCII digit
    (U+0030..U+0039)."""
    return u'\u0030' <= uchar <= u'\u0039'

def is_alphabet(uchar):
    """Return True when the unicode character is an ASCII letter
    (A-Z or a-z)."""
    upper = u'\u0041' <= uchar <= u'\u005a'
    lower = u'\u0061' <= uchar <= u'\u007a'
    return upper or lower

def is_other(uchar):
    """Return True when the character is none of: Chinese ideograph,
    ASCII digit, ASCII letter (per the sibling predicates)."""
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))

def _filter(line):
    """Keep only the space-separated tokens of *line* that look like
    Chinese / digits / ASCII letters; drop the rest and re-join with
    single spaces.

    Fixes of the original: Python-2-only ``<>`` replaced by ``!=``.
    NOTE(review): is_other() is written for a *single* character but is
    called here with a whole token, so the range test happens
    lexicographically on the first differing character.  That original
    behaviour is preserved — confirm intent before changing to a
    per-character check.
    """
    kept = []
    for elem in line.strip().split(" "):
        element = elem.strip()
        # Python 2: decode byte strings to unicode before comparing
        # against unicode code-point bounds.  No-op on Python 3 strings.
        if type(element) != type(u""):
            element = element.decode("utf-8", "ignore")
        if is_other(element) == False:
            kept.append(element)
    return " ".join(kept)

def post_process_wname_segments_illegal_characters():
    """Run _filter over every line of ../output_result and write the
    cleaned lines to ../output_result_process.

    Fixes of the original: _filter was invoked twice per line (once for
    the echo, once for the write) — now computed once; the input file is
    closed (with-block) and streamed instead of readlines().
    """
    with codecs.open("../output_result", "r", "utf-8") as src:
        with codecs.open("../output_result_process", "w", "utf-8") as f:
            for line in src:
                cleaned = _filter(line)
                print(cleaned)  # progress echo
                f.write(cleaned + "\n")

def _pipeline_set_key_segment_pairs(p, keys_path, segments_path):
    """Queue SET commands pairing line i of *keys_path* with line i of
    *segments_path*; flush pipeline *p* every 10000 pairs plus once at
    the end."""
    count = 0
    with codecs.open(keys_path, "r", "utf-8") as f1:
        with codecs.open(segments_path, "r", "utf-8") as f2:
            for a, b in zip(f1.readlines(), f2.readlines()):
                count += 1
                p.set(a.strip(), b.strip())
                if count % 10000 == 0:
                    print(count)
                    print("batch insert to redis ...")
                    s = datetime.datetime.now()
                    p.execute()
                    e = datetime.datetime.now()
                    print("done:%s" % ((e - s).seconds))
    p.execute()


def build_word_segments_hash_map():
    """Build the original-word -> segmented-result hashmap in redis db 2
    for both the relevant-word list and the query-key list, then trigger
    a background save.

    Fixes of the original: the two verbatim-duplicated load loops are
    factored into _pipeline_set_key_segment_pairs, and all four input
    files are now closed.
    """
    r2 = redis.Redis(db=2)
    p = r2.pipeline()
    _pipeline_set_key_segment_pairs(
        p,
        "parser_one_line_one_words_uniq.txt",
        "parser_one_line_one_words_uniq_result_pku.txt")
    _pipeline_set_key_segment_pairs(
        p,
        "query_keys.txt",
        "query_keys_result_pku.txt")
    r2.bgsave()  # persist the new map in the background

def _build_list_for_inter_args(s1,s2):

"""

将分词后的字符串组合成一个list形式反加给r.sinter使用,去除无用的东西

"""

r=[]

r.extend(s1.split(" "))

r.extend(s2.split(" "))

return [elem.strip() for elem in r if elem.strip()<>""]

def final_find_synomns_out():
    """For every (query, relevant-word) pair, write 'query|||word' to
    synomns_pku.txt when the segments of both words share at least one
    inverted-index posting in redis db 1 (i.e. some document line
    contains tokens of both).

    Fixes of the original: f1/f2 were never closed (now closed in a
    finally); ``.__len__()`` replaced by ``len()``.
    """
    r1 = redis.Redis(db=1)  # inverted index: token -> set of line ids
    r2 = redis.Redis(db=2)  # word -> segmented form
    f = codecs.open("synomns_pku.txt", "w", "utf-8")
    f1 = codecs.open("query_keys.txt", "r", "utf-8")
    f2 = codecs.open("query_relevent_words.txt", "r", "utf-8")
    count = 0
    validateCount = 0
    try:
        for a, b in zip(f1.readlines(), f2.readlines()):
            count += 1
            query_segments = r2.get(a.strip())
            for elem in b.split("*"):
                if elem.strip() == "":
                    continue
                args = _build_list_for_inter_args(query_segments,
                                                  r2.get(elem.strip()))
                if len(r1.sinter(*args)) > 0:
                    validateCount += 1
                    if validateCount % 1000 == 0:
                        print("validateCount:%s\n" % validateCount)
                    f.write("%s|||%s\n" % (a.strip(), elem.strip()))
                    f.flush()  # long-running job: keep output durable
    finally:
        f.close()
        f1.close()
        f2.close()

def interactive_mode():
    """REPL: read 'query|||relevent_word' from stdin, show both words'
    segmented forms (redis db 2) and the intersection of their
    inverted-index postings (redis db 1).

    Fixes of the original: the two redis connections were re-created on
    every loop iteration — now opened once; local no longer shadows the
    ``input`` builtin.
    """
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    while True:
        line = raw_input("input query|||relevent_word:\n")
        a, b = line.strip().split("|||")
        query_segments = r2.get(a.strip())
        print("%s ==> %s" % (a.strip(), query_segments))
        print("%s ==> %s" % (b.strip(), r2.get(b.strip())))
        print(r1.sinter(*_build_list_for_inter_args(query_segments,
                                                    r2.get(b.strip()))))
        print("=========")

def c1(line):
    """Predicate: does either side of an 'a|||b' pair contain a space
    split (i.e. multiple segments)?

    >>> c1("执手|||把手")
    False
    """
    left, right = line.strip().split("|||")
    return len(left.split(" ")) > 1 or len(right.split(" ")) > 1

def c2(line):
    """Predicate: is one side of an 'a|||b' pair a substring of the
    other?

    >>> c2("执手|||把手")
    False

    >>> c2("浓缩咖啡|||咖啡")
    True
    """
    left, right = line.strip().split("|||")
    return left in right or right in left

def filter_synonym_result():
    """Filter synomns_pku.txt (query/relevant-word pairs whose pku
    segmentations intersect) down to pairs that contain neither a space
    split (c1) nor a substring relation (c2); write the survivors to
    synomns_pku_filter.txt.

    Fixes of the original: the input file is now closed (with-block) and
    streamed instead of readlines().
    """
    with codecs.open("synomns_pku.txt", "r", "utf-8") as src:
        with codecs.open("synomns_pku_filter.txt", "w", "utf-8") as f:
            for line in src:
                if c1(line) == False and c2(line) == False:
                    f.write(line)

if __name__=="__main__":

#    doctest.testmod()

#    read_relevent_words()

#    parser_one_line_one_words2()

#    compare_pareser_one_line_one_words_result_lost_line_for_tmp()

#    build_invert_index()

#    build_word_segments_hash_map()

#    final_find_synomns_out()

#    interactive_mode()

#    print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")

#    print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")

#    post_process_wname_segments_illegal_characters()

filter_synonym_result()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值