记录做项目时对txt文本的常用处理,每个例子只需几行Python代码,供大家参考。
读取文本
# Load the entire contents of 1.txt (decoded as UTF-8) into the string `s`.
with open('1.txt', mode='r', encoding='utf8') as f:
    s = f.read()
# print(s)  # uncomment to print the text that was read
写入文本
横排写入
将竖排文本转横排:先把各行合并,再按固定宽度(示例中为30个字符)自动换行写入
import textwrap
# Turn a one-item-per-line file into running text, then re-wrap it at a
# width of 30 characters and save the result to a new file.
with open(r'en.txt', mode='r', encoding='utf8') as f:
    # Reading the file in one go yields the same string as joining its lines.
    r = f.read()
# print(r)
with open(r'en_new.txt', mode='w+', encoding='utf8') as f:
    text = textwrap.fill(r, width=30)
    f.write(text)
竖排写入
将横排文本内容转竖排:先去掉原有的换行,再每满一定字符数(示例中为200)换行写入
# Re-flow ch.txt into fixed-width lines: drop every existing line break,
# then write the text back out CHUNK_SIZE characters per line.
CHUNK_SIZE = 200
with open(r'ch.txt', 'r', encoding='utf8') as f:
    r = f.read().replace("\n", "")
with open(r'ch_new.txt', 'w+', encoding='utf8') as f:
    # Slice by offset instead of collecting characters one at a time. The
    # final slice may be shorter than CHUNK_SIZE and is still written, so the
    # tail of the text is not lost.
    for start in range(0, len(r), CHUNK_SIZE):
        f.write(r[start:start + CHUNK_SIZE] + "\n")
打乱文本行顺序
打乱文本行的顺序,便于训练;文件较大时耗时较久
# 打乱文本中的行顺序
import os
import random
# Shuffle the lines of train.txt and write them to train_labels.txt.
# NOTE: no encoding is passed to open(), so both files use the platform's
# default text encoding.
root = "D:/train/"
with open(root + "train.txt", 'r') as f:
    # Make sure every line ends with "\n"; otherwise an unterminated last
    # line would be glued to whichever line follows it after shuffling.
    lines = [line if line.endswith("\n") else line + "\n" for line in f]
random.shuffle(lines)
# The context manager guarantees the output file is flushed and closed.
with open(root + "train_labels.txt", "w") as fw:
    for line in lines:
        print(line)
        fw.write(line)
删除文本行
删除相同行
# 删除相同行内容
import shutil
# Copy old.txt to new.txt, keeping only the first occurrence of each line.
readPath = 'old.txt'
writePath = 'new.txt'
lines_seen = set()
# NOTE: mode 'a+' appends, so anything already in new.txt is kept.
# Both files are closed by the with-statement even if an error occurs.
with open(readPath, 'r', encoding='utf-8') as f, \
        open(writePath, 'a+', encoding='utf-8') as outfile:
    for line in f:
        # Compare without the trailing newline so that an unterminated last
        # line still matches an earlier, newline-terminated copy of itself.
        key = line.rstrip("\n")
        if key in lines_seen:
            continue
        lines_seen.add(key)
        outfile.write(line)
删除一些行
# Copy out.txt to new.txt, leaving out the first SKIP_LINES lines.
readPath = 'out.txt'
writePath = 'new.txt'
SKIP_LINES = 20000
# NOTE: mode 'a+' appends, so anything already in new.txt is kept.
# Both files are closed by the with-statement even if an error occurs.
with open(readPath, 'r', encoding='utf-8') as f, \
        open(writePath, 'a+', encoding='utf-8') as outfile:
    # Line numbers start at 1, so line SKIP_LINES + 1 is the first one kept.
    for i, line in enumerate(f, start=1):
        if i > SKIP_LINES:
            outfile.write(line)
提取某些行
#-*- coding: utf-8 -*-
# 获取文本文件指定行内容
import linecache
def get_line(file, nums_line):
    """Return line ``nums_line`` of ``file`` with surrounding whitespace removed.

    The line is fetched with ``linecache.getline`` and then stripped.
    An empty string comes back when the stripped line is empty.

    Args:
        file: Path of the text file to read from.
        nums_line: Line number to fetch, counted from 1.

    Returns:
        The requested line without leading or trailing whitespace.
    """
    raw_line = linecache.getline(file, nums_line)
    return raw_line.strip()
# Copy lines 1-199 of labels.txt into new.txt, one stripped line per row.
open_file = 'labels.txt'
# The context manager closes the output file when the loop is done, so the
# written lines are flushed to disk.
with open('new.txt', "w", encoding="utf-8") as save_file:
    for line in range(1, 200):  # range end is exclusive: stops after line 199
        current_context = get_line(open_file, line)
        # print(current_context)
        save_file.write(current_context + "\n")
将xml转为txt
处理OCR标签时,需要将标注转换为对应的格式以获得标签:先读取xml内容,再提取其中的bndbox坐标
# -*- coding: utf-8 -*-
import xml.dom.minidom
import os
import glob
def strip_jpg_suffix(name):
    """Return ``name`` without a trailing ".jpg"; other names are unchanged.

    ``str.strip('.jpg')`` is not suitable here: it removes any of the
    characters ".", "j", "p", "g" from both ends of the string.
    """
    suffix = ".jpg"
    return name[:-len(suffix)] if name.endswith(suffix) else name


def bndbox_to_polygon(obj):
    """Return the four corners of the first <bndbox> under ``obj`` as text.

    The corners are emitted as "x,y" pairs joined by commas, in the order
    (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin).
    """
    bbox = obj.getElementsByTagName("bndbox")[0]
    xmin, ymin, xmax, ymax = (
        bbox.getElementsByTagName(tag)[0].childNodes[0].data
        for tag in ("xmin", "ymin", "xmax", "ymax")
    )
    return ",".join((xmin, ymax, xmax, ymax, xmax, ymin, xmin, ymin))


# For every XML file in xml/, write gt_1/gt_<image name>.txt containing one
# polygon line per <object> element.
xml_file = glob.glob(r"xml/*.xml")
for xmlfile in xml_file:
    annotation = xml.dom.minidom.parse(xmlfile).documentElement
    # The text of the first <filename> element names the image.
    imgname = annotation.getElementsByTagName("filename")[0].childNodes[0].data
    loc = "".join(
        bndbox_to_polygon(obj) + "\n"
        for obj in annotation.getElementsByTagName("object")
    )
    txtfile = strip_jpg_suffix(imgname) + ".txt"
    with open(r"gt_1/gt_" + txtfile, "w", encoding="utf-8") as txt_file:
        txt_file.write(loc)
将文件名存入txt
import os
# Write the name of every entry in file_dir to train.txt, one per line,
# with a trailing ".xml" extension removed.
file_dir = "D:/annotations/"
# List the directory before opening the output, so a missing directory does
# not leave behind an empty train.txt.
dirs = os.listdir(file_dir)
# print(len(dirs))  # number of entries in the folder
with open("train.txt", "w", encoding="utf-8") as f:
    for file in dirs:
        # Drop only a trailing ".xml"; str.replace would also delete ".xml"
        # appearing in the middle of a name.
        stem = file[:-len(".xml")] if file.endswith(".xml") else file
        f.write(stem + "\n")
希望能帮助大家,此文章不定期更新。