数据预处理:PySpark 实现线性插值填充缺失值
1. Python 实现线性插值填充缺失值
实现函数为:
def linear_insert(x1, y1, x2, y2, insert_x):
    """Linearly interpolate y-values on the line through (x1, y1) and (x2, y2).

    Args:
        x1, y1: Coordinates of the first known point.
        x2, y2: Coordinates of the second known point.
        insert_x: A single x position (int or float) or an iterable of
            x positions at which to evaluate the line.

    Returns:
        A list with one interpolated y-value per requested x position
        (a one-element list when ``insert_x`` is a scalar).

    Raises:
        ZeroDivisionError: If ``x1 == x2`` (the slope is undefined).
    """
    # Accept any real scalar, not only exact ``int``: a float position
    # would otherwise fall through and fail when iterated below.
    if isinstance(insert_x, (int, float)):
        insert_x = [insert_x]
    slope = (y2 - y1) / (x2 - x1)
    return [slope * (x - x1) + y1 for x in insert_x]
def fill_na_by_linear(lst):
first_flag = False
first_na = 0
length = len(lst)
for i in range(length):
item = lst[i]
if not first_flag:
if item is None:
first_na = i
if first_na == 0:
# 第一个缺失值填充为 0
lst[0] = 0.0
continue
first_flag = True
else:
if item is not None:
first_flag = False
lst[first_na:i] = linear_insert(first_na - 1, lst[first_na - 1], i, lst[i], ran