1
去网站上找数据。比如说我做缅甸语可以去找缅甸语的网站,比如政府网站或者字典或者去github找相关的数据。
如果数据是分散的多个txt文件,可以使用在线的txt合并工具,把它们合并成一个文件。
整理下数据。我们收集到的有可能是长句子,中间有tab之类的,要把这些句子拆开,一个单词一行。然后,我们可以规定一个最大的单词长度,如果超过这个长度就丢掉该单词。最后把单词去重。
def shorted_word(srcdir, filename, destdir, destname, length=32):
    """Split a text corpus into unique words and write them one per line.

    Every line of ``srcdir/filename`` is split on whitespace (spaces, tabs,
    newlines).  Words longer than ``length`` characters are discarded; the
    remaining words are de-duplicated and written to ``destdir/destname``,
    one word per line, in sorted order so the output is reproducible.

    Args:
        srcdir: Directory containing the input file.
        filename: Name of the UTF-8 input file.
        destdir: Directory for the output file.
        destname: Name of the UTF-8 output file.
        length: Maximum number of characters a word may have to be kept.

    Prints the length of the longest discarded word as ``max len`` (0 when
    nothing was discarded).
    """
    import os

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic, so fall back to plain iteration.
        def tqdm(iterable):
            return iterable

    with open(os.path.join(srcdir, filename), "r", encoding="utf-8") as fr:
        lines = set(fr)

    words_all = set()
    maxlen = 0
    print("read : ")
    for line in tqdm(lines):
        # split() without arguments also swallows the trailing newline.
        for word in line.split():
            if len(word) > length:
                maxlen = max(maxlen, len(word))
            else:
                words_all.add(word)
    print("max len : ", maxlen)

    print("write : ")
    # The context manager guarantees the output is flushed and closed.
    with open(os.path.join(destdir, destname), "w", encoding="utf-8") as fw:
        for word in tqdm(sorted(words_all)):
            fw.write(word + "\n")
刚才的方法,生成的数据是没有空格的,是一个个的单词,我们也可以不去掉空格。也就是将长句子分割成几个短语,每行是短语。
def shorted_centence(srcdir, filename, destdir, destname, length=32):
    """Break long lines of a corpus into phrases of at most ``length`` chars.

    Lines of ``srcdir/filename`` are de-duplicated.  A line no longer than
    ``length`` is copied unchanged.  A longer line is first cut at its first
    tab (only the text before the tab is kept), then split on whitespace and
    re-joined with single spaces into consecutive phrases that each fit in
    ``length`` characters, spaces included.  Each phrase is written on its
    own line of ``destdir/destname``.

    A single word longer than ``length`` cannot fit in any phrase and is
    dropped.  Empty lines and a phrase consisting only of the Burmese
    section mark are never written.

    Args:
        srcdir: Directory containing the input file.
        filename: Name of the UTF-8 input file.
        destdir: Directory for the output file.
        destname: Name of the UTF-8 output file.
        length: Maximum number of characters per output phrase.
    """
    import os

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic, so fall back to plain iteration.
        def tqdm(iterable):
            return iterable

    with open(os.path.join(srcdir, filename), "r", encoding="utf-8") as fr:
        # Strip the newline before de-duplicating so that a final line
        # without a trailing newline is not treated as a different entry.
        lines = {line.strip("\n") for line in fr}

    skipped = ("", "။")
    with open(os.path.join(destdir, destname), "w", encoding="utf-8") as fw:
        # Sorted only to make the output order reproducible.
        for text in tqdm(sorted(lines)):
            if len(text) <= length:
                if text:
                    fw.write(text + "\n")
                continue

            if "\t" in text:
                text = text.split("\t")[0]

            phrase = ""
            for word in text.split():
                joined = phrase + " " + word if phrase else word
                if len(joined) <= length:
                    phrase = joined
                    continue
                # The word does not fit: emit what we have and start the
                # next phrase with this word instead of losing it.
                if phrase not in skipped:
                    fw.write(phrase + "\n")
                phrase = word if len(word) <= length else ""
            if phrase not in skipped:
                fw.write(phrase + "\n")
2
准备字典。字典的意思就是一个单词中可能出现的字符。比如英语就是26个大写字母26个小写字母,如果不考虑数字和特殊符号的话。缅甸语去百度搜缅甸语unicode,有160个字符。
如果不知道字典是什么,可以从刚刚的数据集中生成。也是利用集合。
python str转 char list 直接使用 * 比如 [*line] line是一个str
生成完之后注意检查,把其他语言的去掉。
3
将1准备的数据清洗一下。去掉数据库中超纲单词。比如英语中夹杂着日文中文。中文和日文是2中的字典所不具备的。
这里我给一个清洗超纲单词清洗方法。
参数依次为:字典所在目录,字典文件名,超纲字符的输出文件名(保存在字典所在目录下),数据集所在目录,数据集文件名,清洗后的文件名(保存在数据集所在目录下)。
def testchars(chars_dir, chars_name, nochar, label_dir, label_name, newlabel):
fchars = open(os.path.join(chars_dir, chars_name), "r", encoding="UTF-8-sig")
flabels = open(os.path.join(label_dir, label_name), "r", encoding="utf-8")
fnewlabels = open(os.path.join(label_dir, newlabel), "w", encoding="utf-8")
char_set = set()
charslines = fchars.readlines()
for line in charslines:
a = line.strip("\n")
char_set.add(a)
labelines = flabels.readlines()
labelines = set(labelines)
fnochar = open(os.path.join(chars_dir, nochar), "w", encoding="UTF-8-sig")
nochar = set()
for line in tqdm(labelines):
line = line.replace("","")
line = line.replace("", "")
line = line.replace('', "")
line = line.replace('', "")
newline = line
line = line.strip("\n")
words = line.split()
linenew = ""
for word in words:
linenew = linenew + " " + word
list_char = [*linenew]
flag = set([*linenew]).issubset(char_set)
if not flag :
for char_ in list_char :
if char_ not in char_set :
nochar.add(char_)
else:
fnewlabels.write(newline)
for i in nochar:
fnochar.write(i + "\n")
注意一下,如果准备的字典文件不是utf-8 no bom(无BOM)编码,而又把代码中的UTF-8-sig改成了UTF-8,那么读到的第一行最开头会多出一个\ufeff字符。参考:【Python问题解决】利用Python读取文件时出现\ufeff的原因及解决办法_奋斗中的编程菜鸟的博客-CSDN博客_python ufeff
要解决这个问题,要么使用UTF-8-sig,要么使用高级的编辑工具,比如editplus或者notepad,将文件编码格式改为utf-8 no bom。推荐使用第二种方式,因为你也不知道你用的训练工具是以什么方式读文件的。
4
分测试集和训练集。我这里随机抽取全部数据的0.1(即10%)作为测试集,其余的作为训练集。
def sample(srcdir, filename, train_filename, testname, rate=0.1):
    """Randomly split the labels of a file into a training and a test file.

    The distinct lines of ``srcdir/filename`` are collected; a random
    ``int(rate * count)`` of them are written to ``srcdir/testname`` and all
    remaining ones to ``srcdir/train_filename``, one label per line.

    Args:
        srcdir: Directory holding the input file and both output files.
        filename: Name of the UTF-8 input file.
        train_filename: Name of the training-set output file.
        testname: Name of the test-set output file.
        rate: Fraction of the distinct labels that goes to the test set.

    Raises:
        ValueError: Raised by ``random.sample`` when ``rate`` yields a
            negative sample size or one larger than the number of labels.
    """
    import os
    import random

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic, so fall back to plain iteration.
        def tqdm(iterable):
            return iterable

    with open(os.path.join(srcdir, filename), "r", encoding="utf-8") as fcrop:
        # Strip newlines before de-duplicating: a last line without "\n"
        # would otherwise be glued to another label when written back.
        # random.sample() needs a sequence (a set raises TypeError since
        # Python 3.11); sorting also makes a seeded run reproducible.
        labels = sorted({line.rstrip("\n") for line in fcrop})

    test_labels = random.sample(labels, int(rate * len(labels)))
    held_out = set(test_labels)
    train_labels = [label for label in labels if label not in held_out]

    with open(os.path.join(srcdir, train_filename), "w", encoding="utf-8") as ftrain:
        for label in tqdm(train_labels):
            ftrain.write(label + "\n")

    with open(os.path.join(srcdir, testname), "w", encoding="utf-8") as ftest:
        for label in tqdm(test_labels):
            ftest.write(label + "\n")
5.
准备字体和背景图。没有背景图也行。
使用github 的 trdg生成图片和标签。这是我的参数。这一步就不细说了。
-c
100000
-w
1
-i
./dicts/shortest.txt
--output_dir
outtest/
--fit
-na
2
-ft
./fonts/my/Pyidaungsu.ttf
-tc
#000000,#888888
-f
64
-k
5
-rk
-do
-b
3
-id
./bg
-bl
1
-rbl
-t
32
6
trdg如果使用背景图会有个问题:如果字的颜色和背景颜色相近,它不会生成图片,但是仍然会写出标签。所以我们得把图片不存在的标签去掉。
def fileexist(srcdir, txt_dir, img_dir, destdir, destname):
    """Keep only the label lines whose image file actually exists.

    Each line of ``srcdir/txt_dir`` is expected to start with an image path,
    optionally followed by a tab and further text.  The part of that path
    after the last ``/`` is looked up in ``img_dir``; the line is copied
    unchanged to ``destdir/destname`` only when such a file exists.

    Args:
        srcdir: Directory containing the label file.
        txt_dir: File name of the label file (despite the parameter name,
            it is joined to ``srcdir`` as a file name).
        img_dir: Directory in which the images are looked up.
        destdir: Directory for the filtered label file.
        destname: Name of the filtered label file.
    """
    import os

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic, so fall back to plain iteration.
        def tqdm(iterable):
            return iterable

    with open(os.path.join(srcdir, txt_dir), "r", encoding="utf-8") as fr:
        lines = fr.readlines()

    with open(os.path.join(destdir, destname), "w", encoding="utf-8") as fw:
        for line in tqdm(lines):
            # Drop the newline first: on a line without a tab it would
            # otherwise become part of the file name and never match.
            image_path = line.rstrip("\n").split("\t")[0]
            image_name = image_path.split("/")[-1]
            if os.path.isfile(os.path.join(img_dir, image_name)):
                fw.write(line)