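# Convert the feature data to key-value (kv) format and store it as a txt dictionary file, so it can be compiled online conveniently.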
#!/usr/bin/bash
import os
import hashlib
from io import open
# table = o.get_table('table_name')
# print(table.schema)
def out_md5(src):
    """Return the hexadecimal MD5 digest of ``src``.

    Args:
        src: Text to hash; it is encoded as UTF-8 before hashing. A
            ``bytes`` value is also accepted and is hashed as-is.

    Returns:
        The lowercase hexadecimal digest produced by ``hashlib.md5``.
    """
    # Only text needs encoding; raw bytes can be fed to the hash directly.
    data = src if isinstance(src, bytes) else src.encode('utf-8')
    return hashlib.md5(data).hexdigest()
# Make sure the local output directory exists before any file is written.
# os.makedirs raises on failure, whereas the exit status of a shell
# `mkdir -p` launched through os.system would be silently ignored.
if not os.path.isdir('doc_name'):
    os.makedirs('doc_name')
# Load the vocabulary data into a temporary table: drop any previous copy,
# then rebuild it with one row per `k`, keeping max(v) for each key.
# NOTE(review): `o` is not defined in the visible part of this file; it is
# presumably an entry object injected by the execution environment - confirm.
instance1 = o.execute_sql('drop table if exists tmp_table_name;')
instance2 = o.execute_sql(
    'create table if not exists tmp_table_name as '
    'select k,max(v)as v from table_name group by k;'
)
# Write the key/value data to ./doc_name/doc_name.txt, one "k<TAB>v" line
# per record read from the temporary table.
try:
    # The encoding is given explicitly so the bytes on disk do not depend on
    # the locale (the file is read back as UTF-8 further down), and `with`
    # closes the file even when reading from the table fails midway.
    with open('./doc_name/doc_name.txt', 'w', encoding='utf-8') as sql_file:
        with o.execute_sql('select k,v from tmp_table_name;').open_reader(tunnel=True, limit=False) as reader:
            for record in reader:
                # Normalise both fields to text: byte strings are decoded as
                # UTF-8, text is used unchanged. This avoids the
                # encode -> str() -> decode round trip, which only works on
                # Python 2.
                fields = [
                    field.decode('utf-8') if isinstance(field, bytes) else field
                    for field in (record['k'], record['v'])
                ]
                strs = fields[0] + '\t' + fields[1]
                sql_file.write(strs + '\n')
except Exception:
    # Announce which stage failed, then let the original error propagate.
    print('sql file output error: ')
    raise
# Validate the size of the dictionary txt file before it is published.
# NOTE(review): `threshold` is not defined in the visible part of this file;
# it is presumably supplied elsewhere (e.g. by the job configuration) - confirm.
if os.path.getsize('./doc_name/doc_name.txt') <= threshold:
    print('error: dict size is less than threshold')
    # Abort with a non-zero status so the caller can see the run failed.
    # A bare exit() reports success (status 0) and is only available when
    # the `site` module has installed it.
    raise SystemExit(1)
# Write the md5 checksum of the dictionary to ./doc_name/doc_name.txt.md5
# in the form "<hex digest><TAB>doc_name.txt".
try:
    # The digest is computed before the .md5 file is opened, so a failure
    # while hashing does not leave a truncated checksum file behind.
    # newline='' turns off newline translation so the hashed text matches
    # the characters actually stored in the file.
    with open('./doc_name/doc_name.txt', 'r', encoding='utf-8', newline='') as f:
        m = out_md5(f.read())
    # hexdigest() is a byte string on Python 2 and text on Python 3; the
    # text-mode file below needs text in both cases.
    if isinstance(m, bytes):
        m = m.decode('utf-8')
    m_str = m + '\t' + 'doc_name.txt'
    with open('./doc_name/doc_name.txt.md5', 'w', encoding='utf-8') as md5_file:
        md5_file.write(m_str + '\n')
except Exception:
    # Announce which stage failed, then let the original error propagate.
    print('md5 output error: ')
    raise
# Upload the dictionary file and its md5 file to the online auto-update
# directory.
try:
    for upload_cmd in (
        'upload ./doc_name/doc_name.txt target_address/doc_name/doc_name.txt',
        'upload ./doc_name/doc_name.txt.md5 target_address/doc_name/doc_name.txt.md5',
    ):
        # os.system does not raise when the command fails; it only returns
        # the exit status, so the status has to be checked explicitly for
        # the error handling below to ever trigger.
        status = os.system(upload_cmd)
        if status != 0:
            raise RuntimeError(
                'upload command failed with status %s: %s' % (status, upload_cmd)
            )
except Exception:
    # Announce which stage failed, then let the original error propagate.
    print('oss output error: ')
    raise