Hue源码解析之-数据预览逻辑实现

17 篇文章 0 订阅
12 篇文章 0 订阅

从hue导入器进入

在这里插入图片描述

页面调用 /indexer/api/indexer/guess_field_types 接口,根据页面上选择的参数,把 format 放在 fileFormat 参数中传递到后端:

{
...

"format": 
	{
		"type": "csv",
		"fieldSeparator": ",",
		"recordSeparator": "\\n",
		"quoteChar": "\"",
		"hasHeader": true,
		"status": 0
	},
...
}

hue/desktop/libs/indexer/src/indexer/api3.py

def guess_field_types(request):
  """Guess column names and types for the importer preview and return them as JSON.

  The request's POST parameter 'fileFormat' is a JSON document. Its
  'inputFormat' key selects the branch below; the 'localfile' and 'file'
  branches additionally read 'path' and 'format' from it.

  Returns a JsonResponse wrapping a dict with a 'columns' list and, where
  available, a 'sample' of rows.

  NOTE(review): if 'inputFormat' matches none of the branches, `format_` is
  never assigned and the final return raises UnboundLocalError.
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'localfile':
    # The path arrives URL-quoted.
    path = urllib_unquote(file_format['path'])

    with open(path, 'r') as local_file:
      # The whole file is parsed into memory even though only a few rows are used.
      reader = csv.reader(local_file)
      csv_data = list(reader)

      # NOTE(review): an empty file raises IndexError in either branch below
      # (csv_data[0] with a header, sample[0] without one).
      if file_format['format']['hasHeader']:
        # Row 0 is the header; up to four rows after it form the sample.
        sample = csv_data[1:5]
        # Collapse every run of non-alphanumeric characters in a header into '_'.
        column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0]]
      else:
        sample = csv_data[:4]
        # No header: synthesize field_1, field_2, ... from the width of the first row.
        column_row = ['field_' + str(count+1) for count, col in enumerate(sample[0])]

      field_type_guesses = []
      for count, col in enumerate(column_row):
        # Collect this column's values, skipping rows too short to have it.
        column_samples = [sample_row[count] for sample_row in sample if len(sample_row) > count]
        field_type_guess = guess_field_type_from_samples(column_samples)
        field_type_guesses.append(field_type_guess)

      columns = [
        Field(column_row[count], field_type_guesses[count]).to_dict()
        for count, col in enumerate(column_row)
      ]

      format_ = {
        'columns': columns,
        'sample': sample
      }

  elif file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    # Detect the encoding from the first 10000 units read, then rewind so the
    # indexer sees the stream from the start.
    encoding = check_encoding(stream.read(10000))
    LOG.debug('File %s encoding is %s' % (path, encoding))
    stream.seek(0)
    # Called for its side effect on file_format["format"]; the return value is unused.
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": path
        },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    # Convert column names using the detected encoding, replacing undecodable input.
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  # The remaining input types are elided in this excerpt.
  elif file_format['inputFormat'] == 'table':
    ...
  elif file_format['inputFormat'] == 'query':
    ...
  elif file_format['inputFormat'] == 'rdbms':
    ...
  elif file_format['inputFormat'] == 'stream':
   ...
  elif file_format['inputFormat'] == 'connector':
   ...
  return JsonResponse(format_)

分析文件读取,数据类型处理:

  • 根据输入类型(file_format['inputFormat']),读取对应的数据
  • 通过 file_format['format']['hasHeader'] 判断是否有表头:有表头时,第一行作为表头,其后最多 4 行作为样例数据;没有表头时,取前 4 行作为样例数据,列名自动生成为 field_1、field_2……
    然后循环 column_row,根据样例数据猜测每一列的数据类型
  • from indexer.fields import Field, guess_field_type_from_samples
    • guess_field_type_from_samples函数是入口,_guess_field_type函数猜测单个值的字段类型(内部通过设定好的FIELD_TYPES做正则匹配),再由_pick_best_field选出该列最终的类型

fields.py文件如下:


class FieldType(object):
  """A named field type together with the regexes used to recognise it.

  `regex` is the type's pattern. `heuristic_regex` is an optional pattern
  used when guessing a type from a value; when it is not supplied (or is
  falsy) the plain `regex` is used for guessing as well.
  """

  def __init__(self, name, regex, heuristic_regex=None):
    self._name = name
    self._regex = regex
    self._heuristic_regex = heuristic_regex

  @property
  def name(self):
    """The type's name."""
    return self._name

  @property
  def regex(self):
    """The pattern string given at construction."""
    return self._regex

  @property
  def heuristic_regex(self):
    """The pattern used for guessing; falls back to `regex` when unset."""
    return self._heuristic_regex or self.regex

  def heuristic_match(self, field):
    """Match `field` case-insensitively against the heuristic pattern.

    Returns the match object, or None when the value does not match.
    """
    return re.match(self.heuristic_regex, field, flags=re.IGNORECASE)


class Field(object):
  """Description of a single column/field.

  `keep`, `required` and `show_properties` are not constructor arguments;
  they always start as True, False and False respectively.
  """

  def __init__(self, name="new_field", field_type_name="string", operations=None, multi_valued=False, unique=False):
    self.name = name
    self.field_type_name = field_type_name
    # A falsy `operations` (None or empty) is replaced by a fresh list.
    self.operations = operations or []
    self.multi_valued = multi_valued
    self.unique = unique
    self.keep = True
    self.required = False
    self.show_properties = False

  def to_dict(self):
    """Return the field as a plain dict, with keys in a fixed order."""
    # Values taken from the instance.
    serialized = {
      'name': self.name,
      'type': self.field_type_name,
      'unique': self.unique,
      'keep': self.keep,
      'operations': self.operations,
      'required': self.required,
      'multiValued': self.multi_valued,
      'showProperties': self.show_properties,
    }
    # Constant entries that are the same for every field.
    serialized.update({
      'nested': [],
      'level': 0,
      'length': 100,
      'keyType': 'string',
      'isPartition': False,
      'partitionValue': '',
      'comment': '',
      'scale': 0,
      'precision': 10,
    })
    return serialized

# Field types known to the guessing heuristics. The order matters: the
# guessing code in this file walks the list in reverse to classify a single
# value and walks it forward to pick one type out of several guesses.
#
# All patterns are raw strings so that regex escapes such as \s and \. are
# not treated as (invalid) Python string escapes; the pattern text itself is
# unchanged.
FIELD_TYPES = [
  FieldType('text_general', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{101,}$"),
  FieldType('string', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{1,100}$"),
  FieldType('double', r"^([+-]?[0-9]+(\.[0-9]+)?(E[+-]?[0-9]+)?)$"),
  FieldType('long', r"^(?:[+-]?(?:[0-9]+))$"),
  FieldType('date', r"^([0-9]+-[0-9]+-[0-9]+(\s+|T)[0-9]+:[0-9]+:[0-9]+(\.[0-9]*)?Z?)$"),
  FieldType('boolean', r"^(true|false|t|f|0|1)$")
]

def get_field_type(type_name):
  """Return the first entry of FIELD_TYPES whose name is contained in `type_name`.

  Raises IndexError when no entry qualifies.
  """
  candidates = []
  for field_type in FIELD_TYPES:
    # Containment test, not equality: the type's name only has to occur in type_name.
    if field_type.name in type_name:
      candidates.append(field_type)
  return candidates[0]

def guess_field_type_from_samples(samples):
  """Guess one field type name for a column from its sample values.

  Each sample is classified on its own, then a single type is picked from
  the individual guesses.
  """
  per_sample_guesses = list(map(_guess_field_type, samples))
  return _pick_best_field(per_sample_guesses)

def _guess_field_type(field_val):
  """Return the name of the last FIELD_TYPES entry matching `field_val`.

  Returns None for an empty string, or when no entry matches.
  """
  if field_val == "":
    return None

  # Walk the list back to front so later entries are tried first.
  for candidate in reversed(FIELD_TYPES):
    if candidate.heuristic_match(field_val):
      return candidate.name

  return None

def _pick_best_field(types):
  """Pick one type name out of a collection of per-sample guesses.

  Returns the name of the first FIELD_TYPES entry (in list order) that
  appears among `types`, or "string" when none does.
  """
  guessed = set(types)
  return next(
    (field_type.name for field_type in FIELD_TYPES if field_type.name in guessed),
    "string"
  )

  • 最后返回数据
# Keep at most the first four sample rows for the preview.
preview_rows = sample['rows'][:4]

# One Field dict per column; column types without a translation become 'string'.
column_dicts = [
  Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
  for col in table_metadata.cols
]

format_ = {
  "sample": preview_rows,
  "columns": column_dicts
}

分析预览格式:分隔符、换行符、引用处理

1. 调用转换函数,传参format_
  • 默认:

    # Default format settings for a CSV preview.
    format_ = {
     "quoteChar": "\"",
     # Backslash followed by 'n': the escaped, printable form of a newline.
     "recordSeparator": '\\n',
     "type": "csv",
     "hasHeader": True,
     "fieldSeparator": ","
    }
    
  • 转换函数

    def _convert_format(format_dict, inverse=False):
      """Run every string value of `format_dict` through the white-space escaper, in place.

      Non-string values are left untouched. `inverse` is forwarded unchanged
      to `_escape_white_space_characters`. Nothing is returned.
      """
      for key, value in format_dict.items():
        if isinstance(value, basestring):
          format_dict[key] = _escape_white_space_characters(value, inverse)
    
    
2. 转义空白字符:按照MAPPINGS,用s.replace()逐个替换(换行、制表符、回车、空格)
```python
# Each white-space character mapped to its escaped, printable spelling
# (a backslash followed by a letter).
MAPPINGS = {
  "\n": "\\n",
  "\t": "\\t",
  "\r": "\\r",
  " ": "\\s",
}
```
最终:将数据返回给前端,前端按照这些参数处理后显示在页面上。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值