博客已迁移, 新文章 地址
==================
需求:在进行hadoop测试时,需要造大量数据。例如某个表存在56列,但实际程序逻辑只使用到其中某几列,我们造的数据也只需要这几列
构造几列数据,转化为对应数据表格式
涉及模块:os,getopt,sys
输入:源格式,文本文件
输出:目标格式,文本文件
#!/usr/bin/python
# -*- coding: utf-8 -*-
#dataformat.py
#this script change data from your source to the dest data format
#2011-08-05 created version0.1
#2011-10-29 added column-to-column mapping and default column values; rebuilt all functions. version0.2
#next: auto-generate data from regular expressions
import os,getopt,sys
# Read a file and return all of its lines.
def read_file(path):
    """Return every line of the text file at ``path`` as a list.

    The lines are returned exactly as ``readlines()`` yields them, so each
    one still carries its line terminator.
    """
    # ``with`` guarantees the handle is closed even if reading raises.
    with open(path, "r") as f:
        return f.readlines()
# Convert one split source line into a line of the target layout.
def one_line_proc(parts, total, ft_map, outsp, empty_fill):
    """Build one output line in the target format.

    Args:
        parts: fields of the source line, already split on the input
            separator.
        total: number of columns in the target line.
        ft_map: dict mapping a 1-based target column number to a fill
            spec string. A spec starting with "d" is a literal default
            (the text after the "d" is emitted); any other spec is parsed
            with ``int()`` as a 1-based source column number in ``parts``.
        outsp: separator written between target columns.
        empty_fill: text emitted for target columns missing from
            ``ft_map``.

    Returns:
        The target line without a trailing newline; "" when ``total`` is
        less than 1.

    Raises:
        IndexError: a spec points past the end of ``parts``.
        ValueError: a non-default spec is not an integer.
    """
    fields = []
    for col in range(1, total + 1):
        if col not in ft_map:
            fields.append(empty_fill)
            continue
        spec = ft_map[col]
        if spec.startswith("d"):
            # Literal default value: drop the "d" marker.
            fields.append(spec[1:])
        else:
            # Specs are 1-based, ``parts`` is 0-based.
            fields.append(parts[int(spec) - 1])
    # join() puts the separator between columns only, never after the last.
    return outsp.join(fields)
# Conversion entry point: read the input file, convert every line, write the
# result. Both the input and the output separator default to a tab.
def _split_spec(entry, sep):
    """Return ``[left, right]`` if ``entry`` is a single ``left<sep>right`` pair.

    Returns None when the entry contains a backslash-escaped separator or
    does not split into exactly two pieces.
    """
    text = str(entry)
    if ("\\" + sep) in text:
        return None
    pieces = text.split(sep)
    if len(pieces) != 2:
        return None
    return pieces


def _build_field_map(to):
    """Translate the ``to`` entries into ``(ft_map, min_fields)``.

    ``ft_map`` maps target column -> fill spec as consumed by
    ``one_line_proc``; ``min_fields`` is the number of fields a source line
    must have to be converted.
    """
    # Pass 1: collect source columns claimed by explicit "n:m" entries so
    # that auto-assignment can avoid them. "n=v" defaults are checked first,
    # so a default value that itself contains ":" is never mistaken for a
    # column mapping.
    used = set()
    in_count = 0
    for entry in to:
        if _split_spec(entry, "=") is not None:
            continue  # constant column: consumes no source field
        in_count += 1
        explicit = _split_spec(entry, ":")
        if explicit is not None:
            used.add(int(explicit[1]))

    # Pass 2: build the map in entry order (later entries for the same
    # target column overwrite earlier ones).
    ft_map = {}
    next_free = 1
    for entry in to:
        default = _split_spec(entry, "=")
        if default is not None:
            ft_map[int(default[0])] = "d" + default[1]
            continue
        explicit = _split_spec(entry, ":")
        if explicit is not None:
            ft_map[int(explicit[0])] = explicit[1]
            continue
        # Plain "n": take the lowest source column nobody has claimed yet.
        # ``used`` only grows, so the scan can resume where it stopped; there
        # is no upper bound on the number of columns.
        while next_free in used:
            next_free += 1
        ft_map[int(entry)] = str(next_free)
        used.add(next_free)

    # A line needs at least as many fields as there are source-backed
    # entries, and enough fields to reach the highest source column used.
    min_fields = max([in_count] + list(used))
    return ft_map, min_fields


def process(inpath, total, to, outpath, insp="\t", outsp="\t", empty_fill=""):
    """Convert the file at ``inpath`` to the target layout at ``outpath``.

    Args:
        inpath: path of the source text file.
        total: number of columns in every output line.
        to: list of target-column entries, each one of
            "n"   - target column n takes the lowest-numbered source column
                    not claimed by another entry;
            "n:m" - target column n takes source column m;
            "n=v" - target column n is filled with the constant v.
            Column numbers are 1-based. An entry containing a
            backslash-escaped ":" or "=" is not treated as that kind of
            pair.
        outpath: path of the file to write.
        insp: field separator of the source file.
        outsp: field separator of the output file.
        empty_fill: text for target columns not named in ``to``.

    Source lines with too few fields for the requested mapping are skipped.
    The input is read completely before the output is opened, so ``inpath``
    and ``outpath`` may be the same file.

    Raises:
        ValueError: an entry or a column number is not an integer.
    """
    ft_map, min_fields = _build_field_map(to)
    result = []
    for line in read_file(inpath):
        parts = line.strip("\n").split(insp)
        if len(parts) >= min_fields:
            outline = one_line_proc(parts, total, ft_map, outsp, empty_fill)
            result.append(outline + "\n")
    # Open the output only after conversion succeeded, and let ``with``
    # close it even if writing fails.
    with open(outpath, "w") as f:
        f.writelines(result)
# Print the usage text and exit.
def help_msg():
    """Print the command-line usage text to stdout and exit with status 0.

    Raises:
        SystemExit: always, via ``sys.exit(0)``.
    """
    print("功能:原数据文件转为目标数据格式")
    print("选项:")
    print("\t -i inputfilepath [必输,原文件路径]")
    print("\t -t n [必输,n为数字,目标数据总的域个数]")
    print("\t -a '1,3,4' [必输,域编号字符串,逗号分隔。指定域用原数据字段填充,未指定用'0'填充]")
    # The mapping forms below are the ones the -a entries are parsed as.
    print("\t\t 域也可写成 'n:m'(目标第n域取原数据第m域)或 'n=v'(目标第n域填固定值v)")
    print("\t -o outputfilepath [可选,默认为 inputfilepath.dist ]")
    print("\t -F 'FS' [可选,原文件域分隔符,默认为\\t ]")
    print("\t -P 'OFS' [可选,输出文件的域分隔符,默认为\\t ]")
    # -f and -h are accepted by the option parser but were not listed.
    print("\t -f 'FILL' [可选,未指定域的填充字符串 ]")
    print("\t -h [打印本帮助信息并退出 ]")
    sys.exit(0)
#程序入口,读入参数,执行
def main():
try:
opts,args = getopt.getopt(sys.argv[1:],"F:P:t:a:i:o:f:h")
for op,value in opts:
if op in ("-h","-H","--help"):
help_msg()
if op == "-i":
inpath = value
elif op == "-o":
outpath = value
elif op == "-t":
total = int(value)
elif op == "-a":
to = value.split(",")
elif op == "-F":
insp = value.decode("string_escape")
elif op == "-P":
outsp = value.decode("string_escape")
elif op == "-f":
empty_fill = value
#考虑下这边放在神马地方合适
if len(opts) < 3:
print(sys.argv[0]+" : the amount of params must great equal than 3")
sys.exit(1)
except getop