Hue源码解析之-数据预览逻辑实现

最新推荐文章于 2024-06-05 01:37:21 发布

羊羊羊羊吃青草

最新推荐文章于 2024-06-05 01:37:21 发布

阅读量860

点赞数

分类专栏：后端大数据工具文章标签： python hue hue集群

本文链接：https://blog.csdn.net/qq_18874531/article/details/121753029

版权

Hue导入器 CSV解析字段类型猜测数据格式转换 MorphlineIndexer

关键词由CSDN通过智能技术生成

后端同时被 3 个专栏收录

29 篇文章 0 订阅

订阅专栏

工具

17 篇文章 0 订阅

订阅专栏

大数据

12 篇文章 0 订阅

订阅专栏

从hue导入器进入

在这里插入图片描述

页面调用 /indexer/api/indexer/guess_field_types 接口，并根据页面上选择的参数，把 format 传递到后端：

{
...

"format": 
	{
		"type": "csv",
		"fieldSeparator": ",",
		"recordSeparator": "\\n",
		"quoteChar": "\"",
		"hasHeader": true,
		"status": 0
	},
...
}

hue/desktop/libs/indexer/src/indexer/api3.py

def guess_field_types(request):
  """Guess column names and types for the import source described in the request.

  Reads the JSON-encoded 'fileFormat' POST parameter and dispatches on its
  'inputFormat' value. Only the 'localfile' and 'file' branches are shown in
  full here; the remaining branches are elided in this excerpt.

  Returns a JsonResponse wrapping `format_`; for the 'localfile' branch this
  is a dict with 'columns' (list of Field dicts) and 'sample' (list of rows).

  Raises KeyError if 'inputFormat' is missing from the decoded payload
  (the default payload is an empty dict).
  """
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'localfile':
    path = urllib_unquote(file_format['path'])

    # The whole CSV is read into memory before the sample rows are sliced off.
    with open(path, 'r') as local_file:
      reader = csv.reader(local_file)
      csv_data = list(reader)

      # NOTE(review): an empty file makes csv_data[0] (header case) or
      # sample[0] (no-header case) raise IndexError.
      if file_format['format']['hasHeader']:
        # Up to four data rows following the header row.
        sample = csv_data[1:5]
        # Column names: header cells with each run of non-alphanumerics replaced by '_'.
        column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0]]
      else:
        sample = csv_data[:4]
        # No header: synthesize field_1..field_N from the width of the first sample row.
        column_row = ['field_' + str(count+1) for count, col in enumerate(sample[0])]

      field_type_guesses = []
      for count, col in enumerate(column_row):
        # Rows too short to have this column are skipped rather than padded.
        column_samples = [sample_row[count] for sample_row in sample if len(sample_row) > count]
        field_type_guess = guess_field_type_from_samples(column_samples)
        field_type_guesses.append(field_type_guess)

      columns = [
        Field(column_row[count], field_type_guesses[count]).to_dict()
        for count, col in enumerate(column_row)
      ]

      format_ = {
        'columns': columns,
        'sample': sample
      }

  elif file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    # Detect the encoding from the first 10000 units read, then rewind so the
    # indexer sees the stream from the start.
    encoding = check_encoding(stream.read(10000))
    LOG.debug('File %s encoding is %s' % (path, encoding))
    stream.seek(0)
    # Converts the string values of the format dict in place (inverse direction).
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": path
        },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  # The branches below are elided in this excerpt.
  elif file_format['inputFormat'] == 'table':
    ...
  elif file_format['inputFormat'] == 'query':
    ...
  elif file_format['inputFormat'] == 'rdbms':
    ...
  elif file_format['inputFormat'] == 'stream':
   ...
  elif file_format['inputFormat'] == 'connector':
   ...
  # NOTE(review): format_ is only assigned inside the branches above; an
  # unrecognized inputFormat would reach this line with it unbound.
  return JsonResponse(format_)

分析文件读取，数据类型处理：

根据数据类型，读取data
通过 file_format['format']['hasHeader'] 判断是否有表头：有表头时，第一行作为表头，其后的行作为样例数据
循环column_row，猜测每一列的数据类型
from indexer.fields import Field, guess_field_type_from_samples
- guess_field_type_from_samples函数是入口，_guess_field_type函数猜测单个值的字段类型，内部通过预先设定好的FIELD_TYPES判断

fields.py文件如下：


class FieldType(object):
  """A named field type described by a regex and an optional heuristic regex.

  `name` and `regex` are exposed read-only. `heuristic_regex` falls back to
  `regex` when no (truthy) heuristic pattern was supplied.
  """

  def __init__(self, name, regex, heuristic_regex=None):
    self._name, self._regex, self._heuristic_regex = name, regex, heuristic_regex

  @property
  def name(self):
    return self._name

  @property
  def regex(self):
    return self._regex

  @property
  def heuristic_regex(self):
    if self._heuristic_regex:
      return self._heuristic_regex
    return self.regex

  def heuristic_match(self, field):
    """Match `field` against the heuristic regex, ignoring case.

    Returns the match object, or None when the start of `field` does not match.
    """
    return re.match(self.heuristic_regex, field, flags=re.IGNORECASE)


class Field(object):
  """Description of one column, serializable to a dict via `to_dict`."""

  def __init__(self, name="new_field", field_type_name="string", operations=None, multi_valued=False, unique=False):
    # Caller-supplied attributes.
    self.name = name
    self.field_type_name = field_type_name
    self.operations = operations or []  # a falsy value yields a fresh empty list
    self.multi_valued = multi_valued
    self.unique = unique
    # Fixed initial values.
    self.keep = True
    self.required = False
    self.show_properties = False

  def to_dict(self):
    """Return the field as a plain dict: instance attributes followed by fixed defaults."""
    as_dict = {
      'name': self.name,
      'type': self.field_type_name,
      'unique': self.unique,
      'keep': self.keep,
      'operations': self.operations,
      'required': self.required,
      'multiValued': self.multi_valued,
      'showProperties': self.show_properties,
    }
    # Entries that do not depend on the instance; 'nested' is a new list per call.
    as_dict.update({
      'nested': [],
      'level': 0,
      'length': 100,
      'keyType': 'string',
      'isPartition': False,
      'partitionValue': '',
      'comment': '',
      'scale': 0,
      'precision': 10,
    })
    return as_dict

# Known field types. Order matters to the code that walks this list: iterating
# forwards gives the earlier entries priority, iterating backwards the later ones.
# Patterns are raw strings so that regex escapes such as \. and \s are not
# treated as (invalid) Python string escapes; the pattern text is unchanged.
FIELD_TYPES = [
  FieldType('text_general', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{101,}$"),
  FieldType('string', r"^[\s\S]*$", heuristic_regex=r"^[\s\S]{1,100}$"),
  FieldType('double', r"^([+-]?[0-9]+(\.[0-9]+)?(E[+-]?[0-9]+)?)$"),
  FieldType('long', r"^(?:[+-]?(?:[0-9]+))$"),
  FieldType('date', r"^([0-9]+-[0-9]+-[0-9]+(\s+|T)[0-9]+:[0-9]+:[0-9]+(\.[0-9]*)?Z?)$"),
  FieldType('boolean', r"^(true|false|t|f|0|1)$")
]

def get_field_type(type_name):
  """Return the first FIELD_TYPES entry whose name is contained in `type_name`.

  Raises IndexError when no entry matches.
  """
  candidates = []
  for known_type in FIELD_TYPES:
    if known_type.name in type_name:
      candidates.append(known_type)
  return candidates[0]

def guess_field_type_from_samples(samples):
  """Guess a type for every sample value, then reduce the guesses to one type name."""
  per_sample_guesses = list(map(_guess_field_type, samples))
  return _pick_best_field(per_sample_guesses)

def _guess_field_type(field_val):
  """Return the name of the last FIELD_TYPES entry that heuristically matches.

  Returns None for the empty string and when no entry matches.
  """
  if field_val == "":
    return None

  # Walk the list back to front and stop at the first hit.
  matching_names = (
    candidate.name
    for candidate in reversed(FIELD_TYPES)
    if candidate.heuristic_match(field_val)
  )
  return next(matching_names, None)

def _pick_best_field(types):
  """Return the first FIELD_TYPES name present in `types`, or "string" if none is."""
  guessed = set(types)
  in_list_order = (known.name for known in FIELD_TYPES if known.name in guessed)
  return next(in_list_order, "string")

最后返回数据

# Preview payload: at most four sample rows plus one Field dict per column,
# with each column type translated through HiveFormat.FIELD_TYPE_TRANSLATE
# (falling back to 'string' for types it does not list).
format_ = {
  "sample": sample['rows'][:4],
  "columns": [
    Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
    for col in table_metadata.cols
  ]
}

分析预览格式：分隔符、换行符、引用处理

1. 调用转换函数，传参format_

默认：

# Default CSV format. The record separator is the two-character text
# backslash + "n", not an actual newline.
format_ = {
  "quoteChar": '"',
  "recordSeparator": "\\n",
  "type": "csv",
  "hasHeader": True,
  "fieldSeparator": ",",
}

转换函数

def _convert_format(format_dict, inverse=False):
  """Rewrite every string value of `format_dict` in place.

  Each string value is replaced by the result of
  _escape_white_space_characters(value, inverse); non-string values are left
  untouched. Returns None.
  """
  # NOTE(review): `basestring` is a Python 2 builtin; presumably the module
  # provides an alias when running on Python 3 -- confirm.
  string_keys = [key for key in format_dict if isinstance(format_dict[key], basestring)]
  for key in string_keys:
    format_dict[key] = _escape_white_space_characters(format_dict[key], inverse)

2. 转义空白字符：按照MAPPINGS，通过s.replace()逐个替换

```python
# Whitespace character -> its two-character escaped text form.
MAPPINGS = {
  "\n": "\\n",
  "\t": "\\t",
  "\r": "\\r",
  " ": "\\s",
}
```

最终: 将数据返给前端，前端按照参数处理，显示在页面= =

羊羊羊羊吃青草

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录