# -------------------- Remove irregular data: when "/data" is not at the start of a line, delete "/data" and everything before it on that line
import re
def process_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file``, trimming misplaced ``/data`` prefixes.

    For each line, the first occurrence of ``/data`` that is not at the very
    start of the line is located. That occurrence and everything before it
    are removed, and the remainder of the line is written out. Lines without
    such an occurrence are copied unchanged.

    NOTE(review): the kept text starts *after* ``/data``, so a trimmed line no
    longer begins with ``/data``; confirm this is the intended output.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (truncated if it exists).
    """
    # Compiled once instead of on every iteration. ``(?<!^)`` rejects a match
    # at index 0, so a line that merely starts with "/data" is left untouched.
    misplaced_data = re.compile(r'(?<!^)/data')

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            match = misplaced_data.search(line)
            if match:
                # Drop the leading junk together with the "/data" token itself.
                line = line[match.end():]
            outfile.write(line)
if __name__ == "__main__":
    # Script entry point: read the raw label file and write the trimmed copy.
    input_path, output_path = (
        "/data/lh123/lh/train.txt",
        "/data/lh123/lh/train1.txt",
    )
    process_file(input_file=input_path, output_file=output_path)
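# The previous step is run first so that as much data as possible can be kept.
# ---------- Drop every line that is not in the standard format (aggressive filtering)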
import json
_TRAIN_DATA_PREFIX = (
    '/data/lh123/lh/verification_code/PaddleOCR-release-2.6/train_data/train/'
)


def is_valid_line(line, prefix=_TRAIN_DATA_PREFIX):
    """Return True if ``line`` is a well-formed ``<path>\\t<json>`` label line.

    A line is accepted when all of the following hold:

    * it starts with ``prefix``;
    * it has a second tab-separated field that parses as JSON;
    * the parsed value is a list, and every element is an object containing
      both the ``"transcription"`` and ``"points"`` keys (an empty list is
      accepted).

    Any other input yields False; no exception is raised for malformed lines.

    Args:
        line: One line of the label file.
        prefix: Required leading path. Defaults to the training-image
            directory this script was written for.

    Returns:
        bool: Whether the line passed every check.
    """
    # Check that the line starts with the required path.
    if not isinstance(line, str) or not line.startswith(prefix):
        return False

    # The JSON payload is the second tab-separated field.
    fields = line.split('\t')
    if len(fields) < 2:
        return False

    try:
        annotations = json.loads(fields[1])
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError guards
        # against pathologically nested input so a bad line never crashes.
        return False

    # Without these type checks, ``"key" in annotation`` would silently turn
    # into a substring test (for strings) or an element test (for lists).
    if not isinstance(annotations, list):
        return False
    return all(
        isinstance(annotation, dict)
        and "transcription" in annotation
        and "points" in annotation
        for annotation in annotations
    )
def process_file(input_file, output_file):
    """Copy to ``output_file`` only the lines of ``input_file`` that validate.

    Each line is passed to ``is_valid_line``; lines for which it returns a
    true value are written unchanged, all others are dropped.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (truncated if it exists).
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line for line in infile if is_valid_line(line))
if __name__ == "__main__":
    # Script entry point: filter the label file down to well-formed lines.
    input_path, output_path = (
        "/data/lh123/lh/train2.txt",
        "/data/lh123/lh/train3.txt",
    )
    process_file(input_file=input_path, output_file=output_path)
# [Click-select CAPTCHA] Cleaning up erroneous data in the generated txt file
# Latest recommended article published on 2024-06-29 16:57:48