import sys
import pandas as pd
import os
import time
import csv
# Purpose: convert every CSV file in a folder into a tab-separated (TSV) file.
# Usage: python transform.py   or   python3 transform.py
target_dir = "./"
chunkSize = 1000

# Conversion failures are appended to this file (relative to the working dir).
_LOG_PATH = "tranform-log.log"

# A quote-less TSV cannot represent tabs or line breaks inside a field, so
# each of them is flattened to a single space.
_FIELD_BREAKS_TO_SPACE = str.maketrans({"\t": " ", "\r": " ", "\n": " "})


def _clean_value(value):
    """Return *value* in a form that is safe to write as one TSV field.

    Strings get tabs/line breaks replaced by spaces and surrounding double
    quotes stripped; missing values become an empty string instead of the
    literal text "nan"; anything else is returned unchanged.
    """
    if isinstance(value, str):
        flattened = value.replace("\r\n", " ").translate(_FIELD_BREAKS_TO_SPACE)
        return flattened.strip('"')
    if pd.isna(value):
        return ""
    return value


def _log_error(file_path, detail):
    """Append one failure record for *file_path* to the conversion log."""
    with open(_LOG_PATH, mode="a", encoding="utf-8") as log_file:
        log_file.write("------------------------------------------\n")
        log_file.write("转换文件名称:" + file_path + "\n")
        log_file.write("错误结果:" + detail + "\n")


def _convert_file(file_path, new_file_path, chunk_size):
    """Stream *file_path* (CSV) into *new_file_path* (TSV) chunk by chunk.

    The output is first written to a ``.part`` file and only renamed to
    *new_file_path* after every chunk was written, so an aborted run never
    leaves a truncated file that a later run would mistake for a finished one.

    Raises:
        Exception: whatever pandas, the csv writer, or the OS raise; the
            partial output is removed before the exception propagates.
    """
    partial_path = new_file_path + ".part"
    # dtype=str + na_filter=False keep every cell exactly as written in the
    # source: no "007" -> 7, no "1.50" -> 1.5, and empty cells stay empty.
    reader = pd.read_csv(
        file_path,
        chunksize=chunk_size,
        quotechar='"',
        header=None,
        dtype=str,
        na_filter=False,
    )
    try:
        with open(partial_path, "w", newline="", encoding="utf-8") as out_file:
            writer = csv.writer(
                out_file,
                delimiter="\t",
                quotechar=None,
                quoting=csv.QUOTE_NONE,
            )
            for chunk in reader:
                writer.writerows(
                    [_clean_value(value) for value in row]
                    for row in chunk.values
                )
        os.replace(partial_path, new_file_path)
    finally:
        reader.close()
        # Still present only when the conversion failed before the rename.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def convert_directory(directory=target_dir, chunk_size=chunkSize):
    """Convert every ``*.csv`` file directly inside *directory* to TSV.

    Each ``name.csv`` produces ``name.csv_new`` next to it. Files whose
    output already exists are skipped. A failure in one file is logged and
    printed, and processing continues with the next file.

    Args:
        directory: folder to scan (not recursive).
        chunk_size: number of CSV rows read and written per batch.
    """
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        # Match on the extension only: outputs end in ".csv_new" and are
        # therefore excluded without rejecting inputs such as "x_new.csv".
        if not (os.path.isfile(file_path) and file_name.endswith(".csv")):
            continue

        start_time = time.time()
        print(file_name + " 开始转换")
        new_file_path = file_path + "_new"
        if os.path.exists(new_file_path):
            print(file_name + " 已完成转换\n")
            continue

        try:
            _convert_file(file_path, new_file_path, chunk_size)
        except Exception:
            # Record the failure and move on to the next file.
            _log_error(file_path, str(sys.exc_info()))
            print("loop is stoped! 有错误 {}".format(sys.exc_info()))
            continue

        print(file_name + "完成时间:" + str(time.time() - start_time) + "\n")


if __name__ == "__main__":
    convert_directory(target_dir, chunkSize)
# python 将文件夹下的所有CSV文本转成TSV文本
# 最新推荐文章于 2023-02-20 09:08:57 发布