#!/usr/bin/env python
# encoding=utf-8
import codecs
import datetime
import doctest
import sys
import time

import redis

reload(sys)
sys.setdefaultencoding('utf-8')
class Unbuffered:
    """Wrap a stream so that every write is flushed immediately."""
    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        # Delegate everything else to the wrapped stream.
        return getattr(self.stream, attr)

sys.stdout = Unbuffered(sys.stdout)
def read_keys():
    """Dump every key in redis db 6 to query_keys.txt, one per line."""
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print len(keys)
    f = codecs.open("query_keys.txt", "w", "utf-8")
    for key in keys:
        print key
        f.write("%s\n" % (key,))
    f.close()
def read_relevent_words():
    """Dump the value (relevant words) of every key in redis db 6 to query_relevent_words.txt."""
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print len(keys)
    f = codecs.open("query_relevent_words.txt", "w", "utf-8")
    for key in keys:
        f.write("%s\n" % (r.get(key),))
    f.close()
def parser_one_line_one_words():
    """Split each '*'-separated line of relevant words into one word per line."""
    ff = codecs.open("parser_one_line_one_words.txt", "w", "utf-8")
    f = codecs.open("query_relevent_words.txt", "r", "utf-8")
    for line in f.readlines():
        li = line.strip().split("*")
        for elem in li:
            ff.write("%s\n" % (elem,))
    ff.close()
def parser_one_line_one_words2():
    """Same as parser_one_line_one_words, but also counts the distinct words."""
    s = set()
    ff = codecs.open("parser_one_line_one_words.txt", "w", "utf-8")
    f = codecs.open("query_relevent_words.txt", "r", "utf-8")
    for line in f.readlines():
        li = line.strip().split("*")
        for elem in li:
            s.add(elem.strip())
            ff.write("%s\n" % (elem,))
    ff.close()
    print len(s)
def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
    """Line-by-line diff of the unique word list against the segmented result, pausing on each mismatch."""
    f1 = codecs.open("parser_one_line_one_words_uniq.txt", "r", "utf-8")
    f2 = codecs.open("parser_one_line_one_words_uniq_result.txt", "r", "utf-8")
    count = 0
    for a, b in zip(f1.readlines(), f2.readlines()):
        count += 1
        if a.strip() != b.replace(" ", "").strip():
            print count, a, b
            time.sleep(5)
def build_invert_index():
    """
    Build an inverted index over wname (product names).
    The postings of each term are stored as a redis set of line numbers.
    """
    r = redis.Redis(db=1)
    p = r.pipeline()
    count = 0
    for line in codecs.open("../output_result_process", "r", "utf-8").readlines():
        count += 1
        for elem in line.strip().split(" "):
            p.sadd(elem.strip(), count)
        if count % 10000 == 0:
            print count
            print "batch insert to redis ..."
            s = datetime.datetime.now()
            p.execute()
            e = datetime.datetime.now()
            print "done:%s" % ((e - s).seconds)
    # Flush whatever is left in the pipeline after the last full batch.
    p.execute()
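
# Minimal sketch of querying the inverted index built above (not part of the
# original script): each term's redis set holds the numbers of the wname lines
# containing it, so an intersection yields the lines containing all terms.
# The example terms are hypothetical.
def query_invert_index_example():
    r = redis.Redis(db=1)
    common = r.sinter(u"咖啡", u"浓缩")
    print "lines containing both terms:", len(common)
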
def is_chinese(uchar):
    """Return True if the unicode character is a Chinese character."""
    return u'\u4e00' <= uchar <= u'\u9fa5'

def is_number(uchar):
    """Return True if the unicode character is an ASCII digit."""
    return u'\u0030' <= uchar <= u'\u0039'

def is_alphabet(uchar):
    """Return True if the unicode character is an ASCII letter."""
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

def is_other(uchar):
    """Return True if the character is not a Chinese character, digit, or ASCII letter."""
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))
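
# Illustrative sanity checks for the classifiers above (assumed examples,
# not in the original):
#   is_chinese(u'\u5496') -> True    is_chinese(u'a') -> False
#   is_number(u'7')       -> True    is_alphabet(u'Q') -> True
#   is_other(u'-')        -> True
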
def _filter(line):
    """
    Drop tokens of a segmented wname line that are not Chinese characters,
    letters, or digits.
    """
    r = []
    for elem in line.strip().split(" "):
        element = elem.strip()
        if type(element) != type(u""):
            element = element.decode("utf-8", "ignore")
        # Note: is_other() is written for single characters; applied to a whole
        # token, the lexicographic comparison effectively tests its first character.
        if not is_other(element):
            r.append(element)
    return " ".join(r)
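
# Expected behavior of _filter on the sample line from __main__ below: the
# bare punctuation tokens "(" and ")" are dropped, everything else is kept:
#   _filter(u"龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
#   -> u"龟 牌 turtle 硬壳 防水 全效 蜡 g-223r"
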
def post_process_wname_segments_illegal_characters():
    """Run _filter over every segmented wname line and write the cleaned file."""
    f = codecs.open("../output_result_process", "w", "utf-8")
    for line in codecs.open("../output_result", "r", "utf-8").readlines():
        s = _filter(line)
        print s
        f.write(s + "\n")
    f.close()
def build_word_segments_hash_map():
    """
    Build a hash map from each query word and relevant word to its
    segmentation result.
    """
    r2 = redis.Redis(db=2)
    p = r2.pipeline()
    f1 = codecs.open("parser_one_line_one_words_uniq.txt", "r", "utf-8")
    f2 = codecs.open("parser_one_line_one_words_uniq_result_pku.txt", "r", "utf-8")
    count = 0
    for a, b in zip(f1.readlines(), f2.readlines()):
        count += 1
        p.set(a.strip(), b.strip())
        if count % 10000 == 0:
            print count
            print "batch insert to redis ..."
            s = datetime.datetime.now()
            p.execute()
            e = datetime.datetime.now()
            print "done:%s" % ((e - s).seconds)
    p.execute()
    f1 = codecs.open("query_keys.txt", "r", "utf-8")
    f2 = codecs.open("query_keys_result_pku.txt", "r", "utf-8")
    count = 0
    for a, b in zip(f1.readlines(), f2.readlines()):
        count += 1
        p.set(a.strip(), b.strip())
        if count % 10000 == 0:
            print count
            print "batch insert to redis ..."
            s = datetime.datetime.now()
            p.execute()
            e = datetime.datetime.now()
            print "done:%s" % ((e - s).seconds)
    p.execute()
    r2.bgsave()
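
# Minimal sketch of reading the word -> segmentation map built above
# (the lookup key is hypothetical):
def lookup_segments_example():
    r2 = redis.Redis(db=2)
    print r2.get(u"浓缩咖啡")  # e.g. u"浓缩 咖啡" if the pku segmenter split it
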
def _build_list_for_inter_args(s1, s2):
    """
    Merge two segmented strings into one flat token list to pass to r.sinter,
    dropping empty tokens.
    """
    r = []
    r.extend(s1.split(" "))
    r.extend(s2.split(" "))
    return [elem.strip() for elem in r if elem.strip() != ""]
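
# Illustrative call (assumed inputs): duplicates are kept, blanks are dropped.
#   _build_list_for_inter_args(u"浓缩 咖啡", u"咖啡  机")
#   -> [u'浓缩', u'咖啡', u'咖啡', u'机']
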
def final_find_synomns_out():
    """
    For every (query, relevant word) pair, write the pair out as a synonym
    candidate when the segments of the two words share at least one wname
    posting in the inverted index.
    """
    f = codecs.open("synomns_pku.txt", "w", "utf-8")
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    f1 = codecs.open("query_keys.txt", "r", "utf-8")
    f2 = codecs.open("query_relevent_words.txt", "r", "utf-8")
    count = 0
    validateCount = 0
    for a, b in zip(f1.readlines(), f2.readlines()):
        count += 1
        query_segments = r2.get(a.strip())
        for elem in b.split("*"):
            if elem.strip() == "":
                continue
            if len(r1.sinter(*_build_list_for_inter_args(query_segments, r2.get(elem.strip())))) > 0:
                validateCount += 1
                if validateCount % 1000 == 0:
                    print "validateCount:%s\n" % validateCount
                f.write("%s|||%s\n" % (a.strip(), elem.strip()))
                f.flush()
    f.close()
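
# The core test of final_find_synomns_out restated as a standalone helper
# (a sketch, not in the original): two words are synonym candidates when the
# postings of all their segments intersect, i.e. some wname line contains
# segments of both.
def _is_synonym_candidate(r1, r2, query, word):
    segs = _build_list_for_inter_args(r2.get(query), r2.get(word))
    return len(r1.sinter(*segs)) > 0
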
def interactive_mode():
    """Read query|||relevent_word pairs from stdin and show their segments and shared postings."""
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    while True:
        line = raw_input("input query|||relevent_word:\n")
        a, b = line.strip().split("|||")
        query_segments = r2.get(a.strip())
        print a.strip(), "==>", query_segments
        print b.strip(), "==>", r2.get(b.strip())
        print r1.sinter(*_build_list_for_inter_args(query_segments, r2.get(b.strip())))
        print "========="
def c1(line):
    """
    True if either side of the pair is space-segmented.
    >>> c1("执手|||把手")
    False
    """
    a, b = line.strip().split("|||")
    return len(a.split(" ")) > 1 or len(b.split(" ")) > 1
def c2(line):
    """
    True if one side of the pair is a substring of the other.
    >>> c2("执手|||把手")
    False
    >>> c2("浓缩咖啡|||咖啡")
    True
    """
    a, b = line.strip().split("|||")
    return (a in b) or (b in a)
def filter_synonym_result():
    """
    Filter synomns_pku.txt (the query/relevant-word pairs whose pku
    segmentations share a posting), dropping pairs that:
      - are space-segmented on either side, or
      - contain one another as substrings.
    """
    f = codecs.open("synomns_pku_filter.txt", "w", "utf-8")
    for line in codecs.open("synomns_pku.txt", "r", "utf-8").readlines():
        if not c1(line) and not c2(line):
            f.write(line)
    f.close()
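
# What filter_synonym_result keeps and drops, following the c1/c2 doctests:
#   u"执手|||把手"       kept    (no spaces, no substring relation)
#   u"浓缩咖啡|||咖啡"   dropped (c2: one side contains the other)
#   u"执 手|||把手"      dropped (c1: space-segmented side)
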
if __name__ == "__main__":
    # doctest.testmod()
    # read_relevent_words()
    # parser_one_line_one_words2()
    # compare_pareser_one_line_one_words_result_lost_line_for_tmp()
    # build_invert_index()
    # build_word_segments_hash_map()
    # final_find_synomns_out()
    # interactive_mode()
    # print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
    # print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
    # post_process_wname_segments_illegal_characters()
    filter_synonym_result()
# Reposted from: https://www.cnblogs.com/lexus/archive/2012/11/09/2762808.html