python处理txt

最新推荐文章于 2024-05-30 15:12:55 发布

置顶 §汤(^_^）园§

最新推荐文章于 2024-05-30 15:12:55 发布

阅读量393

点赞数

文章标签： python

本文链接：https://blog.csdn.net/t_123_y/article/details/107315567

版权

python处理txt

记录做项目时对txt的处理，简单几行python代码，以供大家参考。

读取文本

# Load the entire contents of the file into one string.
with open('1.txt', 'r', encoding='utf8') as f:
    s = f.read()
    # print(s)  # uncomment to print the text that was read

写入文本

横排写入

将竖排文本（每行一个片段）合并为横排，每行超过一定字符数（示例中为 30）时自动换行写入

import textwrap

# Turn one-item-per-line text into running text, wrapped so that no output
# line is longer than 30 characters.
with open(r'en.txt', 'r', encoding='utf8') as f:
    # Reading the file in one go yields the same string as joining every
    # line returned by readlines().
    r = f.read()
    # print(r)

with open(r'en_new.txt', 'w+', encoding='utf8') as f:
    # textwrap.fill replaces the original newlines with spaces before it
    # re-wraps the text at the requested width.
    text = textwrap.fill(r, 30)
    f.write(text)

竖排写入

将横排文本内容转竖排：先去掉原有的换行，再按固定字符数（示例中为每行 200 个字符）重新换行写入

# Re-wrap a text file so that every output line holds a fixed number of
# characters (the last line holds whatever is left over).
LINE_WIDTH = 200  # characters per output line

with open(r'ch.txt', 'r', encoding='utf8') as f:
    # Drop the existing line breaks so the text becomes one long string.
    r = f.read().replace("\n", "")

with open(r'ch_new.txt', 'w', encoding='utf8') as f:
    # Slice the string in steps of LINE_WIDTH.  The final slice may be
    # shorter than LINE_WIDTH; it is still written, so no trailing text is
    # lost when the length is not an exact multiple of LINE_WIDTH.
    for start in range(0, len(r), LINE_WIDTH):
        f.write(r[start:start + LINE_WIDTH] + "\n")

打乱文本行顺序

打乱文本行顺序，便于训练，耗时久

# Shuffle the order of the lines in a text file.
import os
import random

root = "D:/train/"

# Read the input first, so the output file is not created or truncated when
# the input cannot be opened.  No encoding is passed, so both files use the
# platform default encoding.
with open(root + "train.txt", 'r') as f:
    lines = f.readlines()

# A last line without a terminator would be glued to whichever line follows
# it after shuffling, so make sure every line ends with a newline.
if lines and not lines[-1].endswith("\n"):
    lines[-1] += "\n"

random.shuffle(lines)

# The with-block guarantees the output is flushed and closed.
with open(root + "train_labels.txt", "w") as fw:
    for line in lines:
        print(line)
        fw.write(line)

删除文本行

删除相同行

# Remove duplicate lines, keeping the first occurrence of each line.
import shutil
readPath='old.txt'
writePath='new.txt'
lines_seen=set()

# NOTE: the output is opened in append mode ('a+'), so anything already in
# new.txt is kept and the de-duplicated lines are added after it.
# Both files are closed by the with-block even if an error occurs.
with open(readPath,'r',encoding='utf-8') as f, \
        open(writePath,'a+',encoding='utf-8') as outfile:
    for line in f:
        # Compare the content without its terminator, so that a last line
        # lacking a trailing newline still matches an earlier identical line.
        content = line.rstrip("\n")
        if content not in lines_seen:
            outfile.write(content + "\n")
            lines_seen.add(content)

删除一些行

# Drop the first lines of a file and copy the rest to another file.
readPath='out.txt'
writePath='new.txt'
skip_count = 20000  # number of leading lines to leave out

# NOTE: the output is opened in append mode ('a+'), so anything already in
# new.txt is kept and the copied lines are added after it.
# Both files are closed by the with-block even if an error occurs.
with open(readPath,'r',encoding='utf-8') as f, \
        open(writePath,'a+',encoding='utf-8') as outfile:
    # Line numbers start at 1, so lines 1..skip_count are skipped.
    for i, line in enumerate(f, start=1):
        if i > skip_count:
            outfile.write(line)

提取某些行

#-*- coding: utf-8 -*-
# Copy selected lines of a text file into a new file.
import linecache


def get_line(file, nums_line):
    """Return line ``nums_line`` of ``file`` with surrounding whitespace removed.

    Line numbers start at 1.  ``linecache.getline`` returns an empty string
    when the line (or the file) does not exist, so this function returns
    ``""`` in that case instead of raising.
    """
    return linecache.getline(file, nums_line).strip()


open_file = 'labels.txt'

# The with-block makes sure the output is flushed and closed when done.
with open('new.txt', "w", encoding="utf-8") as save_file:
    # range(1, 200) covers line numbers 1 through 199.
    for line in range(1, 200):
        current_context = get_line(open_file, line)
        # print(current_context)
        save_file.write(current_context + "\n")

将xml转为txt

处理 OCR 标签时，需要将 xml 标注转换为对应格式的 txt 标签文件。先读取 xml 内容，再取出每个目标的 box 坐标，每个 box 写成一行。

# -*- coding: utf-8 -*-

import xml.dom.minidom
import os
import glob


def box_to_points(xmin, ymin, xmax, ymax):
    """Return the four corners of a box as one comma-separated string.

    The arguments are strings.  The corners are emitted in the order
    (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin).
    """
    return ",".join([xmin, ymax, xmax, ymax, xmax, ymin, xmin, ymin])


def label_file_name(image_name):
    """Return the label file name for an image: its name with ``.txt``.

    Only the file extension is replaced.  ``str.strip('.jpg')`` must not be
    used here: it strips any of the characters '.', 'j', 'p', 'g' from both
    ends, so e.g. "pig.jpg" would be reduced to "i".
    """
    return os.path.splitext(image_name)[0] + ".txt"


def first_text(parent, tag):
    """Return the text of the first ``tag`` element found under ``parent``."""
    return parent.getElementsByTagName(tag)[0].childNodes[0].data


def convert_annotation(xmlfile):
    """Read one annotation file and return ``(label_file_name, box_lines)``.

    ``box_lines`` holds one line per ``object`` element, each line being the
    corner string built by ``box_to_points`` from that object's ``bndbox``.
    """
    annotation = xml.dom.minidom.parse(xmlfile).documentElement
    imgname = first_text(annotation, "filename")

    box_lines = []
    for obj in annotation.getElementsByTagName("object"):
        bbox = obj.getElementsByTagName("bndbox")[0]
        point = box_to_points(
            first_text(bbox, "xmin"),
            first_text(bbox, "ymin"),
            first_text(bbox, "xmax"),
            first_text(bbox, "ymax"),
        )
        box_lines.append(point + "\n")

    return label_file_name(imgname), "".join(box_lines)


xml_file = glob.glob(r"xml/*.xml")  # every xml file in the folder
for xmlfile in xml_file:
    txtfile, loc = convert_annotation(xmlfile)
    # The with-block closes the label file even if the write fails.
    with open(r"gt_1/gt_" + txtfile, "w", encoding="utf-8") as txt_file:
        txt_file.write(loc)

将文件名存入txt

import os

file_dir = "D:/annotations/"
xml_suffix = ".xml"

# List the directory before opening the output, so train.txt is not created
# or truncated when the directory cannot be read.
dirs = os.listdir(file_dir)
# print(len(dirs))  # number of entries in the folder

# Write one name per line; the with-block closes the file when done.
with open("train.txt", "w", encoding = "utf-8") as f:
    for file in dirs:
        # Remove only a trailing ".xml".  str.replace would also delete
        # ".xml" occurring in the middle of a name.
        if file.endswith(xml_suffix):
            file = file[:-len(xml_suffix)]
        f.write(file + "\n")