# MySQL connection for the add_intcls table (trademark image records).
# NOTE(review): host/database are blank and credentials are hardcoded —
# presumably filled in before running; move to env vars / config.
eng = pymysql.connect(host='', user='root', password='mysql', database='')
# Second MySQL connection, used for the graphics_elements_two lookup table.
eng_gul = pymysql.connect(host='', user='root', password='mysql', database='')
# Target Elasticsearch host (blank here — TODO confirm before running).
es_host_target = ""
es_target = Elasticsearch([es_host_target], http_auth=('name', 'pswd'), port=9200)
# ES index name queried by find_in_es / find_two_lv_in_es below.
indexName = 'tp'
# The following are some interface helpers (self-contained)
def find_two(index):
    """Fetch up to 5000 random REGNO values whose IMAGETYPE starts with *index*.

    Args:
        index: image-type category prefix; rows match when
            ``IMAGETYPE LIKE '<index>.%'``.

    Returns:
        pandas.DataFrame with a single ``REGNO`` column.
    """
    # Parameterized query: the original concatenated *index* into the SQL
    # string, which is vulnerable to SQL injection and breaks on quotes.
    sql_find_two = (
        "SELECT REGNO FROM add_intcls "
        "WHERE IMAGETYPE LIKE %s ORDER BY RAND() LIMIT 5000"
    )
    # The literal '%' in the LIKE pattern is safe inside a bound parameter.
    return pd.read_sql(sql_find_two, eng, params=[index + ".%"])
def find_in_es(nums, indexName):
    """Exact-match lookup of *nums* against regis_number in the given ES index.

    Returns the raw Elasticsearch search response (up to 300 hits, with only
    the regis_number and imageType_second fields in each _source).
    """
    query_body = {
        "query": {"term": {"regis_number.keyword": {"value": nums}}},
        "size": 300,
        "_source": ["regis_number", "imageType_second"],
    }
    return es_target.search(body=query_body, index=indexName)
def find_the_two_index(tow_lv_dict, two_lv):
    """Map a second-level category code to its numeric index.

    Args:
        tow_lv_dict: mapping of category code -> numeric index.
        two_lv: category code to look up.

    Returns:
        The mapped index, or 0 when *two_lv* is unknown (or maps to None).
    """
    # Single lookup — the original called .get() twice for a found key.
    res = tow_lv_dict.get(two_lv)
    return 0 if res is None else res
# Find the regno in ES and collect the distinct second-level indices.
def find_two_lv_in_es(tow_dict, reg):
    """Return the category index of every ES hit for registration number *reg*.

    Args:
        tow_dict: mapping of second-level category code -> numeric index.
        reg: registration number to search for.

    Returns:
        list[int] — one index per hit (0 for unknown categories); order
        follows the ES hit order.
    """
    res = find_in_es(reg, indexName)
    hits = res['hits']['hits']
    # Iterate hits directly instead of indexing by range(len(...)).
    return [
        find_the_two_index(tow_dict, hit['_source']['imageType_second'])
        for hit in hits
    ]
# ---- The processing below gathers 5000 images per category
# sql_two = 'SELECT ZLBM,ID FROM graphics_elements_two'
# res_two = pd.read_sql(sql_two, eng_gul)
#
# res_two_ls = res_two.values.tolist()
#
# json_file = 'short.json'
# with open(json_file, 'r', encoding='utf8')as fp:
# ls_json_data = json.load(fp)
#
# toal_ls = []
# for ind_two, id in tqdm(res_two_ls):
# if ind_two not in ls_json_data:
# res = find_two(ind_two)
# res['two'] = id
# ls_res = res.values.tolist()
# toal_ls.extend(ls_res)
# else:
# print('{} is in json'.format(ind_two))
#
#
# dd_csv = DataFrame(toal_ls)
# csv_file = '144_5000.csv'
# dd_csv.to_csv(csv_file, index=False)
# # --get the dict of two lv
# sql_two = 'SELECT ZLBM,ID FROM graphics_elements_two'
# res_two = pd.read_sql(sql_two, eng_gul)
#
# res_two_ls = res_two.values.tolist()
# tow_dict = {}
# for ind_two, id in res_two_ls:
# tow_dict[ind_two] = id
#
# # ---- The processing below arranges the label data into a fixed format
# csv_file = '144_5000.csv'
# df_csv = pd.read_csv(csv_file)
# ls = ['REGNO', 'index']
# for i in range(144):
# ls.append(i)
# df_csv.columns = ['REGNO', 'index']
# df_res = df_csv.reindex(columns=ls, fill_value=0)
# ls_res = df_res.values.tolist()
#
# for i in tqdm(range(len(ls_res))):
# index = ls_res[i][1]
# REGNO = ls_res[i][0]
# res_ls = find_two_lv_in_es(tow_dict, REGNO)
# for ind in res_ls:
# ls_res[i][1+ind] = 1
#
# dd_frame = DataFrame(ls_res)
# dd_frame.columns = ls
# csv_file_s = '144_group_5000.csv'
# dd_frame.to_csv(csv_file_s)
# -- Convert the CSV to TXT. Required format: no header row, no index
# column; each line is "<REGNO>.jpg<TAB><comma-joined label vector>".
csv_file = 'S_144_group_5000.csv'
txt_file = 'S_144_group_5000.txt'
df = pd.read_csv(csv_file)
df_ls = df.values.tolist()
print('start------------')
with open(txt_file, 'w', encoding='utf-8') as f:
    for row in tqdm(df_ls):
        # row[0] is the registration number -> image file name;
        # row[1:] holds the label columns. Reuse the already-materialized
        # list instead of re-indexing the DataFrame with df.iloc per row.
        label_str = ','.join('%s' % val for val in row[1:])
        f.write(row[0] + '.jpg' + '\t' + label_str + '\n')
print('finished------------')