此篇为上一篇的接续,对时序数据进行插值
上篇文章:《解析kml文件,提取经纬度信息存入csv》
上一篇将kml里面的信息提取了出来,但是有些数据间隔太大,甚至几十秒才一个采样点,因此需要对这样的数据进行插值,形成每秒一个采样点的数据。代码较简单,主要应用了scipy库 interpolate 模块中的 interp1d 线性插值,希望对有同样需求的小伙伴有所帮助。
import pandas as pd
import os
import time
from scipy import interpolate
FinalPath = './final/'
CsvPath = './CSV/'

# Decimal places kept for each interpolated column.
_DECIMALS = {'Lon': 6, 'Lat': 6, 'altitude': 1}


def interpolate_to_seconds(df):
    """Resample a track to one row per whole second by linear interpolation.

    Args:
        df: DataFrame with a ``time`` column whose values are parseable by
            ``pd.to_datetime`` and numeric ``Lon``, ``Lat`` and ``altitude``
            columns. It must contain at least one row. It is not modified.

    Returns:
        A new DataFrame sorted by ``time_t`` (seconds elapsed since the first
        row). It contains the original rows plus one linearly interpolated
        row for every whole second in ``[0, int(max time_t))`` that had no
        sample. Only the columns shared with the interpolated rows
        (``time_t``, ``Lon``, ``Lat``, ``altitude``) are kept, in the order
        they appear in ``df``.
    """
    df = df.copy()

    # Seconds between consecutive samples, then the running total.
    stamps = df['time'].apply(pd.to_datetime)
    # The first row has no predecessor, so its gap is NaN. Assign the filled
    # Series back: an in-place fillna on a column selection is a chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    df['time_diff'] = stamps.diff().dt.total_seconds().fillna(0)
    df['time_t'] = df['time_diff'].cumsum()

    # Whole seconds inside the covered range that have no sample yet.
    t = df['time_t'].values
    last_second = int(t.max())
    missing = sorted(set(range(last_second)).difference(t))

    if missing:
        # Rows to insert: one per missing second, each value column
        # interpolated against elapsed time and rounded.
        filled = {'time_t': missing}
        for column, digits in _DECIMALS.items():
            linear = interpolate.interp1d(t, df[column].values, kind='linear')
            filled[column] = linear(missing).round(digits)
        # Merge with the original rows; the inner join keeps only the
        # columns both frames share.
        df = pd.concat([df, pd.DataFrame(filled)],
                       ignore_index=True, join='inner', axis=0)
    else:
        # Nothing to insert (already one sample per second, or a single
        # sample). Skip interp1d, which needs at least two points, and keep
        # the same columns the merge above would have kept.
        shared = ('time_t', *_DECIMALS)
        df = df[[column for column in df.columns if column in shared]]

    return df.sort_values(by='time_t', ascending=True, ignore_index=True)


def process_file(file):
    """Interpolate one CSV from ``CsvPath`` and write it to ``FinalPath``.

    The output is named ``<stem>_interpolated.csv``. A file that holds a
    header but no rows is reported and skipped.
    """
    name, _ = os.path.splitext(file)
    # Read the data and process the timestamps.
    print('dealing ' + file)
    df = pd.read_csv(os.path.join(CsvPath, file))
    if df.empty:
        print('skipped: no samples')
        return

    result = interpolate_to_seconds(df)

    a = int(result['time_t'].max())
    b = int(df.shape[0])
    print(f'时间范围 {a} s')
    print(f'需要添加 {a - b + 1} s数据')
    # Number of rows that were actually interpolated.
    print(len(result) - b)

    result.to_csv(os.path.join(FinalPath, name + '_interpolated.csv'),
                  index=False)


def main():
    """Interpolate every file found in ``CsvPath``, timing each one."""
    # to_csv does not create missing directories.
    os.makedirs(FinalPath, exist_ok=True)
    for file in os.listdir(CsvPath):
        # Sub-directories cannot be read as CSV; skip them.
        if not os.path.isfile(os.path.join(CsvPath, file)):
            continue
        start = time.time()
        process_file(file)
        end = time.time()
        print('time cost: ' + str(round((end - start), 3)) + ' s')
        print('--' * 20)


if __name__ == '__main__':
    main()