本文将介绍地理空间数据融合的关键方法,包括多源数据对齐、时空数据融合、异构数据集成和质量评估,并提供完整的Python实现代码。
一、多源空间数据对齐
1.1 坐标系转换与统一
import geopandas as gpd
import pyproj
from shapely.ops import transform
def unify_coordinate_system(gdf, target_crs='EPSG:4326'):
"""统一坐标系到目标CRS"""
if gdf.crs is None:
raise ValueError("输入数据没有定义CRS")
if gdf.crs != target_crs:
return gdf.to_crs(target_crs)
return gdf
def transform_geometry(geom, source_crs, target_crs):
"""单个几何对象的坐标转换"""
project = pyproj.Transformer.from_crs(
source_crs, target_crs, always_xy=True
).transform
return transform(project, geom)
# 使用示例
if __name__ == "__main__":
# 加载不同坐标系的数据
gdf_wgs84 = gpd.read_file('data_wgs84.shp') # EPSG:4326
gdf_utm = gpd.read_file('data_utm.shp') # EPSG:32650
# 统一坐标系
gdf_utm_unified = unify_coordinate_system(gdf_utm)
print(f"原始CRS: {gdf_utm.crs}")
print(f"转换后CRS: {gdf_utm_unified.crs}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
1.2 空间数据匹配与关联
from sklearn.neighbors import NearestNeighbors
def spatial_join_nearest(source_gdf, target_gdf, max_distance=100):
"""基于最近邻的空间连接"""
# 确保坐标系一致
target_gdf = unify_coordinate_system(target_gdf, source_gdf.crs)
# 提取坐标数组
source_coords = np.array([[p.x, p.y] for p in source_gdf.geometry])
target_coords = np.array([[p.x, p.y] for p in target_gdf.geometry])
# 构建最近邻模型
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(target_coords)
distances, indices = nbrs.kneighbors(source_coords)
# 筛选在阈值范围内的匹配
mask = distances[:,0] <= max_distance
matched = source_gdf[mask].copy()
matched.reset_index(drop=True, inplace=True)
# 添加匹配到的属性
matched_target = target_gdf.iloc[indices[mask,0]].reset_index(drop=True)
for col in matched_target.columns:
if col != 'geometry':
matched[col] = matched_target[col]
return matched
# 使用示例
if __name__ == "__main__":
points = gpd.read_file('points.shp')
polygons = gpd.read_file('polygons.shp')
# 执行空间连接
joined = spatial_join_nearest(points, polygons)
print(f"匹配到的点数: {len(joined)}/{len(points)}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
二、时空数据融合
2.1 时空轨迹融合
import pandas as pd
from shapely.geometry import LineString
def fuse_trajectories(traj_gdf, time_threshold='5min', distance_threshold=100):
"""融合多条时空轨迹"""
# 按时间排序
traj_gdf = traj_gdf.sort_values('timestamp')
# 创建轨迹段
segments = []
current_segment = []
for idx, row in traj_gdf.iterrows():
if not current_segment:
current_segment.append(row)
continue
# 计算时间差和距离
last_point = current_segment[-1]
time_diff = pd.to_datetime(row['timestamp']) - pd.to_datetime(last_point['timestamp'])
distance = row.geometry.distance(last_point.geometry)
if time_diff <= pd.Timedelta(time_threshold) and distance <= distance_threshold:
current_segment.append(row)
else:
if len(current_segment) > 1:
segments.append(current_segment)
current_segment = [row]
# 创建融合后的轨迹
fused_trajectories = []
for seg in segments:
coords = [(p.geometry.x, p.geometry.y) for p in seg]
timestamps = [p['timestamp'] for p in seg]
props = {k: [p[k] for p in seg] for k in seg[0].index if k not in ['geometry', 'timestamp']}
fused_trajectories.append({
'geometry': LineString(coords),
'timestamps': timestamps,
'properties': props
})
return gpd.GeoDataFrame(fused_trajectories)
# 使用示例
if __name__ == "__main__":
# 生成模拟轨迹数据
np.random.seed(42)
n_points = 100
base_time = pd.Timestamp.now()
traj_data = []
for i in range(n_points):
traj_data.append({
'geometry': Point(116.4 + np.random.uniform(-0.01, 0.01),
39.9 + np.random.uniform(-0.01, 0.01)),
'timestamp': (base_time + pd.Timedelta(minutes=i*2)).strftime('%Y-%m-%d %H:%M:%S'),
'speed': np.random.uniform(10, 30)
})
traj_gdf = gpd.GeoDataFrame(traj_data)
# 融合轨迹
fused_traj = fuse_trajectories(traj_gdf)
print(f"原始轨迹点: {len(traj_gdf)}")
print(f"融合后轨迹段: {len(fused_traj)}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
- 37.
- 38.
- 39.
- 40.
- 41.
- 42.
- 43.
- 44.
- 45.
- 46.
- 47.
- 48.
- 49.
- 50.
- 51.
- 52.
- 53.
- 54.
- 55.
- 56.
- 57.
- 58.
- 59.
- 60.
- 61.
- 62.
- 63.
- 64.
- 65.
- 66.
2.2 遥感影像与矢量数据融合
import rasterio
from rasterio.mask import mask
from rasterstats import zonal_stats
def raster_vector_fusion(raster_path, vector_gdf, stats=['mean', 'max', 'min']):
"""将栅格数据统计值融合到矢量数据"""
# 执行区域统计
stats_results = zonal_stats(
vector_gdf,
raster_path,
stats=stats,
geojson_out=True
)
# 创建融合后的GeoDataFrame
fused_gdf = gpd.GeoDataFrame.from_features(stats_results)
return fused_gdf
# 使用示例
if __name__ == "__main__":
# 加载NDVI栅格和地块矢量
parcels = gpd.read_file('parcels.shp')
# 融合NDVI统计值
fused_data = raster_vector_fusion('ndvi.tif', parcels)
print(f"融合后的属性字段: {fused_data.columns}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
三、异构数据集成
3.1 空间图数据集成
import networkx as nx
from osmnx import graph_from_place
def integrate_spatial_graph(vector_gdf, graph_type='road'):
"""将矢量数据集成到空间图结构中"""
# 创建图对象
G = nx.Graph()
# 添加节点
for idx, row in vector_gdf.iterrows():
geom = row.geometry
if geom.geom_type == 'Point':
G.add_node(idx, pos=(geom.x, geom.y), **row.drop('geometry').to_dict())
elif geom.geom_type == 'LineString':
# 将线拆分为节点和边
coords = list(geom.coords)
for i, (x, y) in enumerate(coords):
node_id = f"{idx}_{i}"
G.add_node(node_id, pos=(x, y))
for i in range(len(coords)-1):
G.add_edge(f"{idx}_{i}", f"{idx}_{i+1}", **row.drop('geometry').to_dict())
return G
# 使用示例
if __name__ == "__main__":
# 加载道路数据
roads = gpd.read_file('roads.shp')
# 创建空间图
road_graph = integrate_spatial_graph(roads)
print(f"图节点数: {len(road_graph.nodes)}")
print(f"图边数: {len(road_graph.edges)}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
3.2 空间属性关联分析
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
def spatial_feature_analysis(fused_gdf, target_var, spatial_lag=True):
"""空间属性关联分析"""
# 准备特征数据
X = fused_gdf.drop(columns=[target_var, 'geometry'])
y = fused_gdf[target_var]
# 添加空间滞后特征
if spatial_lag:
w = lps.weights.Queen.from_dataframe(fused_gdf)
w.transform = 'r'
fused_gdf['spatial_lag'] = lps.weights.lag_spatial(w, fused_gdf[target_var])
X['spatial_lag'] = fused_gdf['spatial_lag']
# 移除缺失值
valid_mask = X.notnull().all(axis=1) & y.notnull()
X = X[valid_mask]
y = y[valid_mask]
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 训练随机森林模型
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
# 评估模型
score = model.score(X_test, y_test)
print(f"模型R2分数: {score:.3f}")
# 特征重要性
importance = pd.DataFrame({
'feature': X.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
return model, importance
# 使用示例
if __name__ == "__main__":
# 加载融合后的数据
fused_data = gpd.read_file('fused_data.shp')
# 分析NDVI与其它属性的关系
model, importance = spatial_feature_analysis(fused_data, 'ndvi_mean')
print("\n特征重要性:")
print(importance)
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
- 37.
- 38.
- 39.
- 40.
- 41.
- 42.
- 43.
- 44.
- 45.
- 46.
- 47.
- 48.
- 49.
- 50.
四、融合质量评估
4.1 空间一致性评估
from sklearn.metrics import mean_squared_error
def assess_spatial_consistency(source_gdf, target_gdf, common_var):
"""评估空间数据一致性"""
# 空间连接获取匹配对
matched = spatial_join_nearest(source_gdf, target_gdf)
if len(matched) == 0:
raise ValueError("没有找到匹配的空间要素")
# 计算一致性指标
mse = mean_squared_error(matched[common_var+'_x'], matched[common_var+'_y'])
corr = matched[[common_var+'_x', common_var+'_y']].corr().iloc[0,1]
return {
'matching_pairs': len(matched),
'mse': mse,
'correlation': corr,
'mean_absolute_error': (matched[common_var+'_x'] - matched[common_var+'_y']).abs().mean()
}
# 使用示例
if __name__ == "__main__":
# 加载两个版本的数据
data_v1 = gpd.read_file('data_v1.shp')
data_v2 = gpd.read_file('data_v2.shp')
# 评估温度数据一致性
metrics = assess_spatial_consistency(data_v1, data_v2, 'temperature')
print("\n空间一致性评估:")
for k, v in metrics.items():
print(f"{k}: {v:.4f}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
4.2 时空数据完整性评估
def assess_temporal_completeness(traj_gdf, time_field='timestamp', expected_freq='1H'):
"""评估时空数据完整性"""
# 转换为时间序列
ts = pd.to_datetime(traj_gdf[time_field])
ts_counts = ts.value_counts().sort_index()
# 创建完整时间索引
full_range = pd.date_range(ts.min(), ts.max(), freq=expected_freq)
completeness = ts_counts.reindex(full_range, fill_value=0)
# 计算指标
metrics = {
'total_periods': len(full_range),
'observed_periods': (completeness > 0).sum(),
'completeness_ratio': (completeness > 0).mean(),
'average_count': completeness.mean(),
'gap_intervals': (completeness == 0).sum()
}
return metrics, completeness
# 使用示例
if __name__ == "__main__":
# 加载轨迹数据
trajectory = gpd.read_file('trajectory.shp')
# 评估数据完整性
metrics, _ = assess_temporal_completeness(trajectory)
print("\n时空完整性评估:")
for k, v in metrics.items():
print(f"{k}: {v}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
五、总结
本文介绍了地理空间数据融合的关键技术:
- 数据对齐 - 坐标系转换与空间匹配
- 时空融合 - 轨迹融合与栅格矢量集成
- 异构集成 - 图数据转换与空间分析
- 质量评估 - 一致性与完整性检查
实际应用中应考虑:
- 数据源的时空分辨率和精度差异
- 融合目标的明确性(统计分析/可视化/建模)
- 计算资源的合理分配
- 融合结果的不确定性评估
多源空间数据融合能够打破数据孤岛,挖掘更深层次的空间模式和关系,为智慧城市、环境监测等应用提供更全面的数据基础。