从hue导入器进入
页面调用 /indexer/api/indexer/guess_field_types 接口,根据页面上选择的参数,把 format 传递到后端:
{
...
"format":
{
"type": "csv",
"fieldSeparator": ",",
"recordSeparator": "\\n",
"quoteChar": "\"",
"hasHeader": true,
"status": 0
},
...
}
hue/desktop/libs/indexer/src/indexer/api3.py
def guess_field_types(request):
    """Guess column names and field types for the source described in the request.

    The JSON-encoded ``fileFormat`` POST parameter is parsed and the work is
    dispatched on its ``inputFormat`` value. The resulting ``format_`` dict is
    returned wrapped in a ``JsonResponse``.

    Raises:
        KeyError: if ``inputFormat`` (or another key a branch reads) is missing.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'localfile':
        from itertools import islice

        path = urllib_unquote(file_format['path'])

        # At most one header row plus four sample rows are used below, so stop
        # after five rows instead of materialising the whole file.
        with open(path, 'r') as local_file:
            csv_data = list(islice(csv.reader(local_file), 5))

        if file_format['format']['hasHeader']:
            sample = csv_data[1:5]
            # An empty file has no header row; fall back to "no columns".
            header = csv_data[0] if csv_data else []
            column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in header]
        else:
            sample = csv_data[:4]
            first_row = sample[0] if sample else []
            column_row = ['field_' + str(position) for position in range(1, len(first_row) + 1)]

        columns = []
        for count, column_name in enumerate(column_row):
            # Short rows simply contribute no sample for the missing columns.
            column_samples = [sample_row[count] for sample_row in sample if len(sample_row) > count]
            field_type_guess = guess_field_type_from_samples(column_samples)
            columns.append(Field(column_name, field_type_guess).to_dict())

        format_ = {
            'columns': columns,
            'sample': sample
        }
    elif file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        # NOTE(review): this stream is never closed in this function -- confirm
        # whether the filesystem layer or the indexer takes care of it.
        stream = request.fs.open(path)
        # Sniff the encoding from the first 10000 bytes, then rewind.
        encoding = check_encoding(stream.read(10000))
        LOG.debug('File %s encoding is %s', path, encoding)
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
    elif file_format['inputFormat'] == 'table':
        ...
    elif file_format['inputFormat'] == 'query':
        ...
    elif file_format['inputFormat'] == 'rdbms':
        ...
    elif file_format['inputFormat'] == 'stream':
        ...
    elif file_format['inputFormat'] == 'connector':
        ...

    # NOTE(review): an inputFormat matching none of the branches above leaves
    # format_ unassigned, so this line would raise UnboundLocalError.
    return JsonResponse(format_)
分析文件读取,数据类型处理:
- 根据数据类型,读取data
- file_format['format']['hasHeader'] 判断是否有表头;有表头时取第一行作为表头,其后的行作为样例数据
循环 column_row,获取每一列的数据类型 - from indexer.fields import Field, guess_field_type_from_samples
- guess_field_type_from_samples 函数是入口,_guess_field_type 函数猜测单个值的字段类型,内部通过预先设定好的 FIELD_TYPES 判断
fields.py文件如下:
class FieldType(object):
    """A named field type recognised through regular expressions.

    ``regex`` is the pattern associated with the type. ``heuristic_regex`` is
    an optional alternative pattern that ``heuristic_match`` uses instead.
    """

    def __init__(self, name, regex, heuristic_regex=None):
        self._name = name
        self._regex = regex
        self._heuristic_regex = heuristic_regex

    @property
    def name(self):
        """Name of the field type."""
        return self._name

    @property
    def regex(self):
        """Pattern associated with the field type."""
        return self._regex

    @property
    def heuristic_regex(self):
        """Heuristic pattern, or ``regex`` when no (or an empty) heuristic was given."""
        return self._heuristic_regex or self.regex

    def heuristic_match(self, field):
        """Match ``field`` from its start against the heuristic pattern, ignoring case.

        Returns the match object, or ``None`` when the value does not match.
        """
        return re.match(self.heuristic_regex, field, flags=re.IGNORECASE)
class Field(object):
    """One column/field together with the attributes it is serialised with."""

    def __init__(self, name="new_field", field_type_name="string", operations=None, multi_valued=False, unique=False):
        # Values supplied by the caller.
        self.name = name
        self.field_type_name = field_type_name
        # A falsy ``operations`` (None or empty) becomes a fresh list per instance.
        self.operations = operations or []
        self.multi_valued = multi_valued
        self.unique = unique
        # Flags that always start out with the same value.
        self.keep = True
        self.required = False
        self.show_properties = False

    def to_dict(self):
        """Return the field as a plain dict.

        The first eight entries mirror the instance attributes; the remaining
        ones are fixed defaults that are the same for every field.
        """
        serialized = {
            'name': self.name,
            'type': self.field_type_name,
            'unique': self.unique,
            'keep': self.keep,
            'operations': self.operations,
            'required': self.required,
            'multiValued': self.multi_valued,
            'showProperties': self.show_properties,
        }
        serialized.update({
            'nested': [],
            'level': 0,
            'length': 100,
            'keyType': 'string',
            'isPartition': False,
            'partitionValue': '',
            'comment': '',
            'scale': 0,
            'precision': 10,
        })
        return serialized
# Known field types. Order matters: _guess_field_type scans this list from the
# end, while _pick_best_field scans it from the start.
# Patterns are raw strings so that regex escapes such as \s and \. are not
# treated as (invalid) Python string escapes; the pattern text is unchanged.
FIELD_TYPES = [
    FieldType('text_general', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{101,}$"),
    FieldType('string', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{1,100}$"),
    FieldType('double', r"^([+-]?[0-9]+(\.[0-9]+)?(E[+-]?[0-9]+)?)$"),
    FieldType('long', r"^(?:[+-]?(?:[0-9]+))$"),
    FieldType('date', r"^([0-9]+-[0-9]+-[0-9]+(\s+|T)[0-9]+:[0-9]+:[0-9]+(\.[0-9]*)?Z?)$"),
    FieldType('boolean', r"^(true|false|t|f|0|1)$"),
]
def get_field_type(type_name):
    """Return the first FIELD_TYPES entry whose name is contained in ``type_name``.

    Containment uses the ``in`` operator, so for a string ``type_name`` this is
    a substring test.

    Raises:
        IndexError: if no entry matches.
    """
    for candidate in FIELD_TYPES:
        if candidate.name in type_name:
            return candidate
    raise IndexError('list index out of range')
def guess_field_type_from_samples(samples):
    """Guess a type for every sample value and reduce the guesses to one type name."""
    per_sample_guesses = list(map(_guess_field_type, samples))
    return _pick_best_field(per_sample_guesses)
def _guess_field_type(field_val):
    """Return the name of the last FIELD_TYPES entry that heuristically matches.

    An empty string yields ``None``, as does a value that no entry matches.
    """
    if field_val == "":
        return None
    # Walk the list back to front and stop at the first hit.
    matching_names = (
        field_type.name
        for field_type in reversed(FIELD_TYPES)
        if field_type.heuristic_match(field_val)
    )
    return next(matching_names, None)
def _pick_best_field(types):
    """Return the first FIELD_TYPES name present in ``types``, else ``"string"``.

    ``None`` entries (and any unknown names) in ``types`` never match an entry
    and are therefore ignored.
    """
    guessed = set(types)
    in_list_order = (field.name for field in FIELD_TYPES if field.name in guessed)
    return next(in_list_order, "string")
- 最后返回数据
# Keep only the first four sample rows.
sample_rows = sample['rows'][:4]
# One Field dict per table column; column types without an entry in
# HiveFormat.FIELD_TYPE_TRANSLATE fall back to 'string'.
column_dicts = []
for col in table_metadata.cols:
    translated_type = HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')
    column_dicts.append(Field(col.name, translated_type).to_dict())
format_ = {
    "sample": sample_rows,
    "columns": column_dicts
}
分析预览格式:分隔符、换行符、引用处理
1. 调用转换函数,传参format_
-
默认:
# Default CSV format description; recordSeparator holds the two characters
# backslash + n, not a real newline.
format_ = {
    'quoteChar': '"',
    'recordSeparator': '\\n',
    'type': 'csv',
    'hasHeader': True,
    'fieldSeparator': ',',
}
-
转换函数
def _convert_format(format_dict, inverse=False):
    """Run every string value of ``format_dict`` through the whitespace escaper, in place.

    Non-string values are left untouched. ``inverse`` is handed straight to
    ``_escape_white_space_characters`` (presumably selecting the reverse
    mapping -- confirm against that helper). Nothing is returned.
    """
    for key in format_dict:
        current = format_dict[key]
        if not isinstance(current, basestring):
            continue
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        format_dict[key] = _escape_white_space_characters(current, inverse)
2. 转义空白字符:按照 MAPPINGS 中的对应关系,逐项调用 s.replace() 进行替换
```python
# Whitespace character (key) -> the literal escape text it maps to (value).
# Each value is two characters: a backslash followed by a letter.
MAPPINGS = {
    '\n': '\\n',  # newline
    '\t': '\\t',  # tab
    '\r': '\\r',  # carriage return
    ' ': '\\s',   # single space
}
```