Hive处理文本数据时需要指定分隔符,一般来说都是用逗号来做分隔。当某个字段的内容是字符串时,特别是用双引号括起来的JSON(形如"{}")那种,Hive处理时会直接将该字段字符串内容中的逗号也当成分隔符来处理,造成Hive表字段内容的异常,这里就需要将字符串中的逗号替换掉。代码如下:
# -*- coding: utf-8 -*-
import re,os,sys
def alter(file_path, new_str):
    """Replace commas inside regex-matched quoted fragments of a file, in place.

    Each line of ``file_path`` is searched for double-quoted fragments that
    contain at least one comma.  Inside every matched fragment, each comma is
    replaced with ``new_str``; text outside the matches is left untouched.
    The rewritten lines go to ``<file_path>.bak``, which then replaces the
    original file.  Every matched fragment is printed before and after the
    substitution.

    Args:
        file_path: Path of the text file to rewrite.
        new_str: Text substituted for each comma inside a matched fragment.
    """
    pat = re.compile(r"\"[a-zA-Z0-9_\-\+\s/\"\(\);\.]+,+[a-zA-Z0-9_\-\+\s,/\"\(\);\.]*\"")
    bak_path = "%s.bak" % file_path

    def _rewrite_fragment(match):
        # A callable replacement rewrites each fragment from its own text
        # (several fragments on one line no longer overwrite each other) and
        # keeps backslashes in new_str from being read as regex escapes.
        found = match.group(0)
        fixed = found.replace(',', new_str)
        print(found)
        print(fixed)
        return fixed

    # Open the backup with "w": appending would keep the content of a stale
    # .bak file left behind by an earlier, interrupted run.
    with open(file_path, 'r') as f, open(bak_path, "w") as f2:
        for line in f:
            # Exactly one output line per input line, matched or not.
            f2.write(pat.sub(_rewrite_fragment, line))
    # Remove first so the rename also succeeds where rename cannot overwrite.
    os.remove(file_path)
    os.rename(bak_path, file_path)
def _replace_protected_commas(line, new_str):
    """Return ``line`` with commas inside ``{...}`` or ``"..."`` spans replaced."""
    if ',' not in line:
        return line

    protected = [False] * len(line)

    # Pass 1: mark every balanced {...} span (nested spans included).
    # An unmatched brace marks nothing, so such a line is left alone there.
    open_braces = []
    for pos, ch in enumerate(line):
        if ch == '{':
            open_braces.append(pos)
        elif ch == '}' and open_braces:
            start = open_braces.pop()
            protected[start:pos + 1] = [True] * (pos + 1 - start)

    # Pass 2: pair up the double quotes that are not part of a brace span
    # (1st with 2nd, 3rd with 4th, ...) and mark what lies between them.
    # A trailing unpaired quote marks nothing.
    quote_start = None
    for pos, ch in enumerate(line):
        if ch != '"' or protected[pos]:
            continue
        if quote_start is None:
            quote_start = pos
        else:
            protected[quote_start:pos + 1] = [True] * (pos + 1 - quote_start)
            quote_start = None

    return "".join(
        new_str if ch == ',' and protected[pos] else ch
        for pos, ch in enumerate(line)
    )


def alter_new(file_path, new_str, expected_commas=80):
    """Replace commas inside quoted strings and JSON objects of a file, in place.

    A line whose comma count equals ``expected_commas`` is copied unchanged.
    In any other line, every comma that lies inside a balanced ``{...}`` span
    or between a pair of double quotes is replaced with ``new_str``; all
    other commas are kept.  The result is written to ``<file_path>.bak``,
    which then replaces the original file.

    Args:
        file_path: Path of the text file to rewrite.
        new_str: Text substituted for each comma inside a quoted or JSON span.
        expected_commas: Number of commas a well-formed line contains.
    """
    bak_path = "%s.bak" % file_path
    # Open the backup with "w": appending would keep the content of a stale
    # .bak file left behind by an earlier, interrupted run.
    with open(file_path, 'r') as f, open(bak_path, "w") as f2:
        for line in f:
            if line.count(',') != expected_commas:
                line = _replace_protected_commas(line, new_str)
            f2.write(line)
    # Remove first so the rename also succeeds where rename cannot overwrite.
    os.remove(file_path)
    os.rename(bak_path, file_path)
if __name__ == '__main__':
    # Two arguments are required: the file to rewrite and the text that
    # replaces each comma found inside a quoted or JSON span.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <file_path> <new_str>" % sys.argv[0])
    alter_new(sys.argv[1], sys.argv[2])
第一种方法alter比较简单,直接用正则表达式精准匹配到需要替换的字符串,然后将字符串中的逗号替换掉。但这种方法覆盖不够全面,如果字符串不规律的话,很有可能会因为没有匹配上而导致替换不成功。
第二种方法alter_new思路也比较简单,就是逐个字符去遍历,将两个双引号中间的逗号替换掉。代码中的80可以替换成每行应有的分隔符(逗号)的个数,某一行的逗号个数不等于这个数字(通常是大于),说明字符串中有多出的逗号需要替换掉。首先要找出"{}"这一段数据,并将里面的逗号替换掉,然后再将替换完的字符串合入到原先的那一行内容并保存,这种情况在一行中可能会出现多次。而且对于json和非json分开进行处理。
最后附上验证检测替换结果是否正确的代码:
import sys
def count_comma(file_path, expected_commas=80):
    """Print every line of a file whose comma count is not the expected one.

    For each such line the comma count is printed first, followed by the
    line itself.  Lines with exactly ``expected_commas`` commas produce no
    output.

    Args:
        file_path: Path of the text file to check.
        expected_commas: Number of commas a well-formed line contains.
    """
    with open(file_path, 'r') as f:
        for line in f:
            num = line.count(',')
            if num != expected_commas:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(num)
                print(line)
if __name__ == '__main__':
    # One argument is required: the file whose comma counts are checked.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <file_path>" % sys.argv[0])
    count_comma(sys.argv[1])
80可以根据具体情况替换掉;如果某一行替换不成功(逗号个数不等于80),会打印出该行逗号的个数和该行的具体内容。