【AMR自动调制识别数据预处理】DeepSig RadioML 2018.01A 处理为单信噪比单调制数据

本文链接：https://blog.csdn.net/yuuuuuuuk/article/details/129688069

DeepSig RadioML 2018.01A 数据集：下载地址 19G 下载速度很快10+m/s

下载后的目录：除了图中标注，其他无用
在这里插入图片描述
数据集包含24种调制样式：‘OOK’, ‘4ASK’, ‘8ASK’, ‘BPSK’, ‘QPSK’, ‘8PSK’, ‘16PSK’, ‘32PSK’, ‘16APSK’, ‘32APSK’, ‘64APSK’, ‘128APSK’, ‘16QAM’, ‘32QAM’, ‘64QAM’, ‘128QAM’, ‘256QAM’, ‘AM-SSB-WC’, ‘AM-SSB-SC’, ‘AM-DSB-WC’, ‘AM-DSB-SC’, ‘FM’, ‘GMSK’, ‘OQPSK’

信噪比范围：SNR = 【-20:32:2】 -20到30间隔2，共26种信噪比，具体【-20，-18…28,30】

每个调制、每个信噪比有4096条数据，共计2555904（24 × 26 × 4096）条数据；均为IQ数据，单条数据格式为（1024, 2）

数据集存放在hdf5文件中，包含3个数据项：X（IQ信号）、Y（调制方式）、Z（信噪比）
X——>[24×26×4096，1024，2]
Y——>[24×26×4096，24] 采用onehot，所以不是[24×26×4096，1] ，顺序为上述顺序（class-fixed.txt修正后的）
Z——>[24×26×4096，1]

要提取全部调制方式在信噪比为 10 dB 时的全部数据，代码如下：

# -*- coding: utf-8 -*-
# Extract every frame recorded at 10 dB SNR (all 24 modulations) from the
# RadioML 2018.01A master file and store them in 'mini_dataset.hdf5'.
import h5py
import numpy as np

# The slicing below assumes the master file is ordered modulation-major:
# 24 modulations x 26 SNR levels x 4096 frames.
FRAMES_PER_GROUP = 4096
NUM_SNR_LEVELS = 26
NUM_MODULATIONS = 24
SNR_INDEX = 15  # 0-based position of 10 dB among the 26 SNR levels (-20, -18, ..., 30)


def _frames_at_snr(dataset):
    """Return the SNR_INDEX group of every modulation, stacked along axis 0."""
    parts = []
    for mod_idx in range(NUM_MODULATIONS):
        start = FRAMES_PER_GROUP * (NUM_SNR_LEVELS * mod_idx + SNR_INDEX)
        parts.append(dataset[start:start + FRAMES_PER_GROUP])
    # One concatenation instead of a growing np.vstack inside the loop, which
    # re-copied everything accumulated so far on each iteration.
    return np.concatenate(parts, axis=0)


# The source is only read, so open it read-only ('r+' needs write access and
# risks modifying the master file).  The context managers guarantee both
# files are closed, and the output flushed, even if an error occurs.
with h5py.File('D:/RF_data/archive/GOLD_XYZ_OSC.0001_1024.hdf5', 'r') as h5file, \
        h5py.File('mini_dataset.hdf5', 'w') as savefile:
    X = _frames_at_snr(h5file['X'])
    Y = _frames_at_snr(h5file['Y'])
    Z = _frames_at_snr(h5file['Z'])

    savefile['X'] = X
    savefile['Y'] = Y
    savefile['Z'] = Z

    print(savefile['X'].shape)

切分成功，切成的数据维度为98304（24（全部调制）*4096（一类数据））

切分方法二：参考教程
原博文代码有点问题，修改了一下：

# -*- coding: utf-8 -*-
# Split the RadioML 2018.01A master file into one .mat file per
# (modulation, SNR) pair, e.g. "BPSK_SNR=10dB.mat".
import h5py
import numpy as np
import scipy.io as io

mod = ['OOK', '4ASK', '8ASK', 'BPSK', 'QPSK', '8PSK', '16PSK', '32PSK', '16APSK', '32APSK', '64APSK', '128APSK',
       '16QAM', '32QAM', '64QAM', '128QAM', '256QAM', 'AM-SSB-WC', 'AM-SSB-SC', 'AM-DSB-WC', 'AM-DSB-SC', 'FM', 'GMSK',
       'OQPSK']
snr = range(-20, 31, 2)  # 26 SNR levels: -20, -18, ..., 30 dB

# The labels (Y) and SNR values (Z) are not needed here: the file names are
# derived from the block position alone, which assumes the file is ordered
# modulation-major with 4096 frames per (modulation, SNR) block.  The
# original loaded both arrays completely into memory without using them.
# The file is only read, so open it read-only and close it when done.
with h5py.File('D:/RF_data/archive/GOLD_XYZ_OSC.0001_1024.hdf5', 'r') as h5file:
    signals = h5file['X']
    n = 0  # running index of the current 4096-frame block
    for mod_name in mod:
        for snr_db in snr:
            IQ = signals[n * 4096:(n + 1) * 4096, :, :]
            path = mod_name + "_SNR=" + str(snr_db) + "dB.mat"
            io.savemat(path, {'name': IQ})
            n = n + 1

print('ending')

最终得到.mat文件；如果想保存为原来的hdf5格式，可参考第一个代码
在这里插入图片描述

之前检索关于Deepsig 2018数据集资料，都在找采样频率，我自己也找了半天没找到相关资料，突发奇想试了一下newbing，给出了答案
在这里插入图片描述
Deepsig 2018数据集的采样频率为2.018 MHz？

再分享一个生成数据集代码（自动调制识别基本都是tensorflow写的，找个pytorch模型挺费劲）：

from torch.utils.data import Dataset
import os
import h5py
import numpy as np
import torch
import torch.nn.functional as F

# Maps each dataset identifier to the relative path of its HDF5 split file.
catalog = dict(
    deep_sig_2018_train=dict(path='data/full_train_set.hdf5'),
    deep_sig_2018_test=dict(path='data/full_test_set.hdf5'),
    deep_sig_micro_train=dict(path='data/micro_train_set.hdf5'),
    deep_sig_micro_test=dict(path='data/micro_test_set.hdf5'),
)


class DeepSig2018(Dataset):
    '''
    PyTorch Dataset over a pre-split DeepSig RadioML 2018.01A HDF5 file.

    The split files listed in ``catalog`` are produced by ``data_split`` from
    the master file ``GOLD_XYZ_OSC.0001_1024.hdf5``.  Each file holds:

    - f['X']: the I/Q frames (each item is transposed before it is returned)
    - f['Y']: the one-hot-encoded modulation type of each frame
    - f['Z']: the SNR level of each frame in dB

    The modulation type is used as the label of each sample.

    NOTE: ``filter_SNR``, ``data_split`` and ``download`` are static methods
    that take the instance explicitly as their first argument (call them as
    ``obj.filter_SNR(obj, lo, hi)``).  This unusual calling convention is kept
    for backward compatibility.
    '''

    # Class names in the order of the one-hot label columns.
    classes = ["OOK", "ASK4", "ASK8", "BPSK", "QPSK", "PSK8", "PSK16",
               "PSK32", "APSK16", "APSK32", "APSK64", "APSK128", "QAM16",
               "QAM32", "QAM64", "QAM128", "QAM256", "AMSSBWC", "AMSSBSC",
               "AMDSBWC", "AMDSBSC", "FM", "GMSK", "OQPSK"]

    def __init__(self,
                 transform=None,
                 download=True,
                 split="train",
                 progress=True,
                 min_snr=-float('inf'),
                 max_snr=float('inf'),
                 one_hot=True
                 ):
        '''
        Load one split into memory and restrict it to an SNR range.

        Args:
        - transform(callable): Optional function applied to each frame
        - download(bool): If the split file is missing, fetch the master
          file (if needed) and create the split files
        - split(str): 'train', 'test', 'micro_train' or 'micro_test'
        - progress(bool): Forwarded to ``download`` (currently unused there)
        - min_snr(float): Minimum level of SNR to retain
        - max_snr(float): Maximum level of SNR to retain
        - one_hot(bool): Return one-hot labels (True) or the class index

        Raises:
        - ValueError: ``split`` is not one of the supported names
        - FileNotFoundError: the split file is missing and ``download`` is
          False
        '''
        super().__init__()
        self.transform = transform

        if split == "train":
            f_path = catalog['deep_sig_2018_train']['path']
        elif split == "test":
            f_path = catalog['deep_sig_2018_test']['path']
        elif split == "micro_train":
            f_path = catalog['deep_sig_micro_train']['path']
        elif split == "micro_test":
            f_path = catalog['deep_sig_micro_test']['path']
        else:
            # Raised explicitly: an ``assert`` would be stripped under -O.
            raise ValueError("Split argument is invalid. "
                             "Please choose from 'train', "
                             "'test', 'micro_train', or 'micro_test")

        if not os.path.isfile(f_path):
            if not download:
                raise FileNotFoundError(
                    "No dataset has been downloaded. Please flag set 'download=True' to download.")
            print("Selected dataset not found.")
            self.download(self, progress=progress)

        #  LOAD DISK DATA TO VARIABLES
        # np.array copies the data into memory, so the file can be closed
        # as soon as loading is finished.
        with h5py.File(f_path, "r") as raw_data:
            self.X = np.array(raw_data['X'])
            self.mod_type = np.array(raw_data['Y'])
            self.SNR = np.array(raw_data['Z'])
        self.one_hot = one_hot

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.index = np.arange(len(self.X))
        self.filter_SNR(self, self.min_snr, self.max_snr)

    @staticmethod
    def filter_SNR(self, min_snr, max_snr):
        '''
        Filters the object to only contain data
        with SNR levels between max_snr and min_snr (both inclusive).

        Args:
        - self: The dataset instance to filter (passed explicitly)
        - max_snr(float): Maximum level of SNR to retain
        - min_snr(float): Minimum level of SNR to retain
        '''
        self.min_snr = min_snr
        self.max_snr = max_snr
        # Flatten first so that both a 1-D SNR array and an (N, 1) column
        # yield a single array of sample indices.
        snr = np.asarray(self.SNR).reshape(-1)
        self.index = np.flatnonzero((snr >= min_snr) & (snr <= max_snr))

    @staticmethod
    def export_data(set_data, file_name):
        '''
        Write one data set to ``./data/<file_name>.hdf5``.

        Args:
        - set_data(sequence): ``[X, Y, Z]`` arrays to store
        - file_name(str): Output file name without directory or extension

        Returns:
        - str: Message naming the file that was written
        '''
        out_path = f'./data/{file_name}.hdf5'
        # The context manager closes the file even if a write fails.
        with h5py.File(out_path, 'w') as out_data:
            out_data['X'] = set_data[0]
            out_data['Y'] = set_data[1]
            out_data['Z'] = set_data[2]
        return f'{file_name} exported to {out_path}'

    @staticmethod
    def data_split(self, f_name='data/GOLD_XYZ_OSC.0001_1024.hdf5'):
        '''
        Split the master file into full/micro train/test sets and export them.

        The master file is read completely into memory and processed in
        consecutive groups of 4096 frames.  From each group:

        - micro train: frames [0, 820)      micro test: frames [820, 1230)
        - full train:  frames [0, 3276)     full test:  frames [3276, 4096)

        Args:
        - self: Object providing ``export_data`` (passed explicitly)
        - f_name(str): Path of the master HDF5 file

        Returns:
        - list: ``[full_train_set, full_test_set, micro_train_set,
          micro_test_set]``, each an ``[X, Y, Z]`` list of arrays
        '''
        print("Unpacking master file.")
        with h5py.File(f_name, "r") as raw_data:
            sigs = np.array(raw_data['X'])
            print("Signals allocated.")
            mod_type = np.array(raw_data['Y'])
            print("Labels allocated.")
            SNR = np.array(raw_data['Z'])
            SNR = SNR.squeeze(1)
            print("SNR levels allocated.")
        index = np.arange(len(sigs))
        print("Splitting data...")

        group_starts = range(0, len(sigs), 4096)

        def gather(lo, hi):
            # Indices [start + lo, start + hi) of every group, kept as one
            # NumPy array rather than a multi-million element Python list.
            parts = [index[start + lo:start + hi] for start in group_starts]
            return np.concatenate(parts) if parts else index[:0]

        micro_train_idx = gather(0, 820)  # 820 is ~20% of each group
        micro_test_idx = gather(820, 1230)
        full_train_idx = gather(0, 3276)
        full_test_idx = gather(3276, 3276 + 820)

        print()
        print("Split complete. Assigning data to sets.")
        micro_train_set = [sigs[micro_train_idx], mod_type[micro_train_idx], SNR[micro_train_idx]]
        micro_test_set = [sigs[micro_test_idx], mod_type[micro_test_idx], SNR[micro_test_idx]]

        full_train_set = [sigs[full_train_idx], mod_type[full_train_idx], SNR[full_train_idx]]
        full_test_set = [sigs[full_test_idx], mod_type[full_test_idx], SNR[full_test_idx]]

        print("Data has been split into 'micro_train_set', 'micro_test_set', 'full_train_set', 'full_test_set'")
        print("Saving data sets into hd5f files.")
        print(self.export_data(micro_train_set, "micro_train_set"))
        print(self.export_data(micro_test_set, "micro_test_set"))
        print(self.export_data(full_train_set, "full_train_set"))
        print(self.export_data(full_test_set, "full_test_set"))

        return [full_train_set, full_test_set, micro_train_set, micro_test_set]

    @staticmethod
    def download(self, progress=True):
        '''
        Make sure the master file exists, then create the split files.

        If ``data/GOLD_XYZ_OSC.0001_1024.hdf5`` is missing it is downloaded
        through the Kaggle API (an API key is required); in both cases
        ``data_split`` is run afterwards.

        Args:
        - self: Object providing ``data_split`` (passed explicitly)
        - progress(bool): Currently unused
        '''
        f_path = 'data/GOLD_XYZ_OSC.0001_1024.hdf5'
        if os.path.isfile(f_path):
            print("Master data file located. Preparing to split data...")
        else:
            # Imported lazily so the package is only needed for downloading.
            import kaggle
            print("Downloading zip file from Kaggle")
            print("WARNING: A Kaggle API key is needed for the download!!! \n"
                  "If you do not have one, download the file manually and place "
                  "it's extracted contents in the 'data/' folder.")
            kaggle.api.dataset_download_files('pinxau1000/radioml2018', path='./data/', unzip=True, quiet=False)
            print("Data has been downloaded. \n Preparing to split data...")
        self.data_split(self, f_path)

    def __len__(self):
        '''Return the number of samples that pass the SNR filter.'''
        # Counting the index avoids copying the selected part of X, which
        # the previous ``len(self.X[self.index])`` did on every call.
        return len(self.index)

    def __getitem__(self, idx):
        '''
        Return the ``(data, label)`` tensors of the idx-th retained sample.

        ``data`` is the transposed frame (after ``transform``, if given);
        ``label`` is a float tensor holding the one-hot vector, or the class
        index when ``one_hot`` is False.
        '''
        tru_idx = self.index[idx]  # position in the unfiltered arrays
        data = self.X[tru_idx].T
        label = self.mod_type[tru_idx]

        if not self.one_hot:
            label = label.argmax()

        if self.transform:
            data = self.transform(data)

        return torch.tensor(data), torch.tensor(label, dtype=torch.float)


def test():
    '''Smoke test: the first "train" sample has a 24-element float label.'''
    dataset = DeepSig2018(split="train")
    _, label = dataset[0]
    expected_shape = torch.Size([24])
    assert label.shape == expected_shape, "Labels to default to one-hot-encoding with 24 classes."
    assert label.dtype == torch.float, "Labels should be torch.float datatype."


# Run the smoke test only when this file is executed directly as a script.
if __name__ == "__main__":
    test()