地理空间数据融合与多源数据集成技术

大G哥

于 2025-05-07 12:21:39 发布

阅读量721

点赞数 8

本文链接：https://blog.csdn.net/Java_fenxiang/article/details/147777833

版权

本文将介绍地理空间数据融合的关键方法，包括多源数据对齐、时空数据融合、异构数据集成和质量评估，并提供完整的Python实现代码。

一、多源空间数据对齐

1.1 坐标系转换与统一

import geopandas as gpd
import pyproj
from shapely.ops import transform

def unify_coordinate_system(gdf, target_crs='EPSG:4326'):
    """统一坐标系到目标CRS"""
    if gdf.crs is None:
        raise ValueError("输入数据没有定义CRS")
    
    if gdf.crs != target_crs:
        return gdf.to_crs(target_crs)
    return gdf

def transform_geometry(geom, source_crs, target_crs):
    """单个几何对象的坐标转换"""
    project = pyproj.Transformer.from_crs(
        source_crs, target_crs, always_xy=True
    ).transform
    return transform(project, geom)

# 使用示例
if __name__ == "__main__":
    # 加载不同坐标系的数据
    gdf_wgs84 = gpd.read_file('data_wgs84.shp')  # EPSG:4326
    gdf_utm = gpd.read_file('data_utm.shp')     # EPSG:32650
    
    # 统一坐标系
    gdf_utm_unified = unify_coordinate_system(gdf_utm)
    print(f"原始CRS: {gdf_utm.crs}")
    print(f"转换后CRS: {gdf_utm_unified.crs}")

1.2 空间数据匹配与关联

from sklearn.neighbors import NearestNeighbors

def spatial_join_nearest(source_gdf, target_gdf, max_distance=100):
    """基于最近邻的空间连接"""
    # 确保坐标系一致
    target_gdf = unify_coordinate_system(target_gdf, source_gdf.crs)
    
    # 提取坐标数组
    source_coords = np.array([[p.x, p.y] for p in source_gdf.geometry])
    target_coords = np.array([[p.x, p.y] for p in target_gdf.geometry])
    
    # 构建最近邻模型
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(target_coords)
    distances, indices = nbrs.kneighbors(source_coords)
    
    # 筛选在阈值范围内的匹配
    mask = distances[:,0] <= max_distance
    matched = source_gdf[mask].copy()
    matched.reset_index(drop=True, inplace=True)
    
    # 添加匹配到的属性
    matched_target = target_gdf.iloc[indices[mask,0]].reset_index(drop=True)
    for col in matched_target.columns:
        if col != 'geometry':
            matched[col] = matched_target[col]
    
    return matched

# 使用示例
if __name__ == "__main__":
    points = gpd.read_file('points.shp')
    polygons = gpd.read_file('polygons.shp')
    
    # 执行空间连接
    joined = spatial_join_nearest(points, polygons)
    print(f"匹配到的点数: {len(joined)}/{len(points)}")

二、时空数据融合

2.1 时空轨迹融合

import pandas as pd
from shapely.geometry import LineString

def fuse_trajectories(traj_gdf, time_threshold='5min', distance_threshold=100):
    """融合多条时空轨迹"""
    # 按时间排序
    traj_gdf = traj_gdf.sort_values('timestamp')
    
    # 创建轨迹段
    segments = []
    current_segment = []
    
    for idx, row in traj_gdf.iterrows():
        if not current_segment:
            current_segment.append(row)
            continue
        
        # 计算时间差和距离
        last_point = current_segment[-1]
        time_diff = pd.to_datetime(row['timestamp']) - pd.to_datetime(last_point['timestamp'])
        distance = row.geometry.distance(last_point.geometry)
        
        if time_diff <= pd.Timedelta(time_threshold) and distance <= distance_threshold:
            current_segment.append(row)
        else:
            if len(current_segment) > 1:
                segments.append(current_segment)
            current_segment = [row]
    
    # 创建融合后的轨迹
    fused_trajectories = []
    for seg in segments:
        coords = [(p.geometry.x, p.geometry.y) for p in seg]
        timestamps = [p['timestamp'] for p in seg]
        props = {k: [p[k] for p in seg] for k in seg[0].index if k not in ['geometry', 'timestamp']}
        
        fused_trajectories.append({
            'geometry': LineString(coords),
            'timestamps': timestamps,
            'properties': props
        })
    
    return gpd.GeoDataFrame(fused_trajectories)

# 使用示例
if __name__ == "__main__":
    # 生成模拟轨迹数据
    np.random.seed(42)
    n_points = 100
    base_time = pd.Timestamp.now()
    
    traj_data = []
    for i in range(n_points):
        traj_data.append({
            'geometry': Point(116.4 + np.random.uniform(-0.01, 0.01), 
                           39.9 + np.random.uniform(-0.01, 0.01)),
            'timestamp': (base_time + pd.Timedelta(minutes=i*2)).strftime('%Y-%m-%d %H:%M:%S'),
            'speed': np.random.uniform(10, 30)
        })
    
    traj_gdf = gpd.GeoDataFrame(traj_data)
    
    # 融合轨迹
    fused_traj = fuse_trajectories(traj_gdf)
    print(f"原始轨迹点: {len(traj_gdf)}")
    print(f"融合后轨迹段: {len(fused_traj)}")

2.2 遥感影像与矢量数据融合

import rasterio
from rasterio.mask import mask
from rasterstats import zonal_stats

def raster_vector_fusion(raster_path, vector_gdf, stats=['mean', 'max', 'min']):
    """将栅格数据统计值融合到矢量数据"""
    # 执行区域统计
    stats_results = zonal_stats(
        vector_gdf,
        raster_path,
        stats=stats,
        geojson_out=True
    )
    
    # 创建融合后的GeoDataFrame
    fused_gdf = gpd.GeoDataFrame.from_features(stats_results)
    
    return fused_gdf

# 使用示例
if __name__ == "__main__":
    # 加载NDVI栅格和地块矢量
    parcels = gpd.read_file('parcels.shp')
    
    # 融合NDVI统计值
    fused_data = raster_vector_fusion('ndvi.tif', parcels)
    print(f"融合后的属性字段: {fused_data.columns}")

三、异构数据集成

3.1 空间图数据集成

import networkx as nx
from osmnx import graph_from_place

def integrate_spatial_graph(vector_gdf, graph_type='road'):
    """将矢量数据集成到空间图结构中"""
    # 创建图对象
    G = nx.Graph()
    
    # 添加节点
    for idx, row in vector_gdf.iterrows():
        geom = row.geometry
        if geom.geom_type == 'Point':
            G.add_node(idx, pos=(geom.x, geom.y), **row.drop('geometry').to_dict())
        elif geom.geom_type == 'LineString':
            # 将线拆分为节点和边
            coords = list(geom.coords)
            for i, (x, y) in enumerate(coords):
                node_id = f"{idx}_{i}"
                G.add_node(node_id, pos=(x, y))
            
            for i in range(len(coords)-1):
                G.add_edge(f"{idx}_{i}", f"{idx}_{i+1}", **row.drop('geometry').to_dict())
    
    return G

# 使用示例
if __name__ == "__main__":
    # 加载道路数据
    roads = gpd.read_file('roads.shp')
    
    # 创建空间图
    road_graph = integrate_spatial_graph(roads)
    print(f"图节点数: {len(road_graph.nodes)}")
    print(f"图边数: {len(road_graph.edges)}")

3.2 空间属性关联分析

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def spatial_feature_analysis(fused_gdf, target_var, spatial_lag=True):
    """空间属性关联分析"""
    # 准备特征数据
    X = fused_gdf.drop(columns=[target_var, 'geometry'])
    y = fused_gdf[target_var]
    
    # 添加空间滞后特征
    if spatial_lag:
        w = lps.weights.Queen.from_dataframe(fused_gdf)
        w.transform = 'r'
        fused_gdf['spatial_lag'] = lps.weights.lag_spatial(w, fused_gdf[target_var])
        X['spatial_lag'] = fused_gdf['spatial_lag']
    
    # 移除缺失值
    valid_mask = X.notnull().all(axis=1) & y.notnull()
    X = X[valid_mask]
    y = y[valid_mask]
    
    # 分割数据集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # 训练随机森林模型
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    
    # 评估模型
    score = model.score(X_test, y_test)
    print(f"模型R2分数: {score:.3f}")
    
    # 特征重要性
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return model, importance

# 使用示例
if __name__ == "__main__":
    # 加载融合后的数据
    fused_data = gpd.read_file('fused_data.shp')
    
    # 分析NDVI与其它属性的关系
    model, importance = spatial_feature_analysis(fused_data, 'ndvi_mean')
    print("\n特征重要性:")
    print(importance)

四、融合质量评估

4.1 空间一致性评估

from sklearn.metrics import mean_squared_error

def assess_spatial_consistency(source_gdf, target_gdf, common_var):
    """评估空间数据一致性"""
    # 空间连接获取匹配对
    matched = spatial_join_nearest(source_gdf, target_gdf)
    
    if len(matched) == 0:
        raise ValueError("没有找到匹配的空间要素")
    
    # 计算一致性指标
    mse = mean_squared_error(matched[common_var+'_x'], matched[common_var+'_y'])
    corr = matched[[common_var+'_x', common_var+'_y']].corr().iloc[0,1]
    
    return {
        'matching_pairs': len(matched),
        'mse': mse,
        'correlation': corr,
        'mean_absolute_error': (matched[common_var+'_x'] - matched[common_var+'_y']).abs().mean()
    }

# 使用示例
if __name__ == "__main__":
    # 加载两个版本的数据
    data_v1 = gpd.read_file('data_v1.shp')
    data_v2 = gpd.read_file('data_v2.shp')
    
    # 评估温度数据一致性
    metrics = assess_spatial_consistency(data_v1, data_v2, 'temperature')
    print("\n空间一致性评估:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

4.2 时空数据完整性评估

def assess_temporal_completeness(traj_gdf, time_field='timestamp', expected_freq='1H'):
    """评估时空数据完整性"""
    # 转换为时间序列
    ts = pd.to_datetime(traj_gdf[time_field])
    ts_counts = ts.value_counts().sort_index()
    
    # 创建完整时间索引
    full_range = pd.date_range(ts.min(), ts.max(), freq=expected_freq)
    completeness = ts_counts.reindex(full_range, fill_value=0)
    
    # 计算指标
    metrics = {
        'total_periods': len(full_range),
        'observed_periods': (completeness > 0).sum(),
        'completeness_ratio': (completeness > 0).mean(),
        'average_count': completeness.mean(),
        'gap_intervals': (completeness == 0).sum()
    }
    
    return metrics, completeness

# 使用示例
if __name__ == "__main__":
    # 加载轨迹数据
    trajectory = gpd.read_file('trajectory.shp')
    
    # 评估数据完整性
    metrics, _ = assess_temporal_completeness(trajectory)
    print("\n时空完整性评估:")
    for k, v in metrics.items():
        print(f"{k}: {v}")