【无线感知】【P7】WIFI 感知实战2- 数据集处理

前言:

       这里面重点介绍一下如何提取训练的数据集(input, label)

   这个项目是斯坦福大学和多伦多大学的合作项目,完整的项目地址

https://github.com/ermongroup/Wifi_Activity_Recognition

   论文方案:1小时以上

          每次训练的时候,通过csv_import()方法加载数据集,

大概需要1个小时才能加载完毕。

   优化方案:只需要 2分钟

     1: 先通过 csv_import 提取训练input,label 到txt(只运行一次,大概18分钟)

     2: 每次训练的时候,只通过txt_import ,加载训练的数据集以及标签(2分钟)

            这样可以专注模型优化


目录:

  1.  数据集分帧,标签
  2.  训练数据集加载

一   数据集分帧,标签

  1.1: 作用

                 1   输入CSI 信号 进行分帧,每帧1s(1000行)  

                 2   标签进行one-hot 编码

             运行   cross_vali_data_convert_merge.py .

           这个脚本提取 input features  & label 到

          input_files 目录下面. 耗时: 15分钟.

             

1.2: 输入 input 

     采用了分帧的思想:
      window_size = 1000 (#窗口大小,大概1s)
      slide_size = 200      (滑动窗口,帧与帧之间存在overlap,less than window_size!!)

xx  (90, 1000, 90)        x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (180, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (270, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (360, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (450, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (540, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (630, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (720, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (810, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (900, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (990, 1000, 90)      x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (1080, 1000, 90)     x2  (90, 1000, 90)      x (1, 1000, 90)

 xx  (1170, 1000, 90)      x2  (90, 1000, 90)     x (1, 1000, 90)

.......

1.3  label 

      采用了one-hot 编码

      threshold = 60

      以run 为例:  当前帧内标注为 run 的行数 / window_size > threshold/100,才认为是一个run action

1.4   cross_vali_data_convert_merge.py

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 22 10:10:48 2024

@author: chengxf2
"""

import numpy as np
import csv
import glob
import os
from datetime import datetime

# Frame length in CSI rows (roughly one second of samples).
window_size = 1000
# Percentage of a frame that one activity must exceed for the frame to get its label.
threshold = 60
# Hop between consecutive frames; keep it below window_size so frames overlap.
slide_size = 200

# (activity tag, one-hot column) pairs, in the priority order used when labelling.
# Column 0 is reserved for frames in which no activity dominates.
_ACTIVITY_COLUMNS = (
    ("bed", 1),
    ("fall", 2),
    ("walk", 3),
    ("pickup", 4),
    ("run", 5),
    ("sitdown", 6),
    ("standup", 7),
)


def _frame_starts(n_rows):
    """Return the first-row offsets of every frame cut from a file of n_rows rows."""
    # Equivalent to looping while `k <= n_rows + 1 - 2 * window_size`; the range is
    # empty when the recording is too short to hold a single frame.
    return range(0, n_rows + 2 - 2 * window_size, slide_size)


def dataimport(path1, path2):
    """Cut CSI recordings into overlapping frames and build a label per frame.

    Args:
        path1: glob pattern matching the input CSV files. Columns 1..90 of
            every row are used as features (column 0 is skipped).
        path2: glob pattern matching the annotation CSV files, one activity
            tag per row (only the first column is read).

    Returns:
        (xx, yy) where xx has shape (n_frames, window_size * 90) and yy has
        shape (n_frames, 8). A frame's label row is one-hot in columns 1..7
        when a single activity covers more than `threshold` percent of the
        frame; otherwise column 0 is set to 2.

    Files are processed in sorted name order. Recordings too short to hold a
    frame contribute no rows instead of raising.
    """
    ###Input data###
    input_csv_files = sorted(glob.glob(path1))
    print("\n input files", len(input_csv_files))
    starttime = datetime.now()

    # Collect per-file frame stacks and join them once at the end; growing the
    # result with np.concatenate inside the loop copies everything each time.
    frame_stacks = [np.empty([0, window_size, 90], float)]
    total_frames = 0
    for f in input_csv_files:
        with open(f, "r") as handle:
            data = np.array([[float(elm) for elm in v] for v in csv.reader(handle)])

        # data import by slide window
        frames = [data[k:k + window_size, 1:91] for k in _frame_starts(len(data))]
        if frames:
            x2 = np.stack(frames)
        else:
            x2 = np.empty([0, window_size, 90], float)
        frame_stacks.append(x2)
        total_frames += len(x2)
        print("\n xx ", (total_frames, window_size, 90), "\t x2 ", x2.shape, "\t x", (1, window_size, 90))

    xx = np.concatenate(frame_stacks, axis=0)
    # Explicit width: reshape(..., -1) cannot infer a dimension for zero frames.
    xx = xx.reshape(len(xx), window_size * 90)

    time_interval = datetime.now() - starttime
    print("\n 读取input 时间 ", time_interval.seconds)

    ###Annotation data###
    starttime = datetime.now()
    min_count = window_size * threshold / 100
    label_stacks = [np.empty([0, 8], float)]
    for ff in sorted(glob.glob(path2)):
        with open(ff, "r") as handle:
            tags = [str(v[0]) if v else "" for v in csv.reader(handle)]

        # data import by slide window
        starts = _frame_starts(len(tags))
        y = np.zeros((len(starts), 8))
        for idx, k in enumerate(starts):
            frame_tags = tags[k:k + window_size]
            for name, column in _ACTIVITY_COLUMNS:
                if frame_tags.count(name) > min_count:
                    y[idx, column] = 1
                    break
            else:
                # No activity is dominant in this frame.
                y[idx, 0] = 2
        label_stacks.append(y)

    yy = np.concatenate(label_stacks, axis=0)
    print(xx.shape, yy.shape)
    time_interval = datetime.now() - starttime
    print("\n 读取 label 时间 ", time_interval.seconds)
    # Example shapes noted by the author: xx (7111, 90000) yy (7111, 8)
    return (xx, yy)
      
        
        
        
        
        

if __name__ == "__main__":
    # Extract frames and labels for every activity and store them as CSV files.
    train_dir = "input_files/"
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(train_dir, exist_ok=True)

    # Activities in the dataset: bed, fall, pick up, run, sit down, stand up, walk.
    labels = ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]

    for label in labels:
        print("\n 读取数据集 ", label)
        filepath_input = "./Dataset/Data/input_*" + str(label) + "*.csv"
        filepath_label = "./Dataset/Data/annotation_*" + str(label) + "*.csv"

        # Build both output names from train_dir so the created folder and the
        # written files cannot drift apart.
        suffix = str(window_size) + "_" + str(threshold) + "_" + label + ".csv"
        outputfilename_input = os.path.join(train_dir, "xx_" + suffix)
        outputfilename_label = os.path.join(train_dir, "yy_" + suffix)

        x, y = dataimport(filepath_input, filepath_label)

        print("\n  保存训练数据集", label)
        # newline="" lets the csv writer control line endings on every platform.
        with open(outputfilename_input, "w", newline="") as f:
            csv.writer(f, lineterminator="\n").writerows(x)
        with open(outputfilename_label, "w", newline="") as f:
            csv.writer(f, lineterminator="\n").writerows(y)
        print(label + "\t finish!")
        
         
     


二    训练数据集加载

   

     总共大概18分钟(多了保存时间)

 2.1  论文里面原始方案,耗时较长,至少需要1个小时

         通过 csv_import 函数提取

   

from __future__ import print_function
import gzip
import os
import numpy as np,numpy
import csv
import glob
import pandas as pd
from datetime import datetime

class DataSet(object):
    """Holds flattened samples with their labels and serves them in mini-batches.

    `images` must be a 3-D array; it is stored reshaped to
    (num_examples, dim1 * dim2). `labels` must have the same first dimension.
    The `fake_data` arguments are accepted but not used.
    """

    def __init__(self, images, labels, fake_data=False):
        assert images.shape[0] == labels.shape[0], (
                "images.shape: %s labels.shape: %s" % (images.shape,
                                                        labels.shape))
        sample_count = images.shape[0]
        flat_width = images.shape[1] * images.shape[2]
        self._num_examples = sample_count
        self._images = images.reshape(sample_count, flat_width)
        self._labels = labels
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        """Flattened samples in their current (possibly shuffled) order."""
        return self._images

    @property
    def labels(self):
        """Labels aligned row by row with `images`."""
        return self._labels

    @property
    def num_examples(self):
        """Number of samples held."""
        return self._num_examples

    @property
    def epochs_completed(self):
        """How many times the data has been exhausted and reshuffled."""
        return self._epochs_completed

    def next_batch(self, batch_size, fake_data=False):
        """Return the next `batch_size` samples and labels.

        When the request would run past the end of the data, the epoch counter
        is incremented, samples and labels are shuffled together, and the batch
        is taken from the start of the new order (the unread tail of the old
        epoch is not returned).
        """
        begin = self._index_in_epoch
        stop = begin + batch_size
        self._index_in_epoch = stop
        if stop > self._num_examples:
            # The current epoch is exhausted.
            self._epochs_completed += 1
            # Apply one shared permutation so samples and labels stay aligned.
            order = numpy.arange(self._num_examples)
            numpy.random.shuffle(order)
            self._images = self._images[order]
            self._labels = self._labels[order]
            # Restart from the top of the reshuffled data.
            begin = 0
            stop = batch_size
            self._index_in_epoch = stop
            assert batch_size <= self._num_examples
        return self._images[begin:stop], self._labels[begin:stop]

def csv_import(input_dir="./input_files", window_size=1000, threshold=60, num_channels=90):
    """Load the per-activity frame and label CSV files with pandas.

    For each activity the files `xx_<window_size>_<threshold>_<label>.csv` and
    `yy_<window_size>_<threshold>_<label>.csv` are read from `input_dir`.
    Every second row is kept, rows whose first label column is positive
    (the "no activity" marker) are dropped, and each remaining frame is
    reshaped to (window_size, num_channels) and then thinned to every second
    time step.

    Args:
        input_dir: folder holding the CSV files.
        window_size: number of time steps stored per frame row.
        threshold: threshold value embedded in the file names.
        num_channels: number of feature columns per time step.

    Returns:
        A 14-tuple: the frame arrays for bed, fall, pickup, run, sitdown,
        standup, walk, followed by the label arrays in the same order.
    """
    labels = ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]
    x_dic = {}
    y_dic = {}
    print("csv file importing...")

    SKIPROW = 2  # Skip every 2 rows -> overlap 800ms to 600ms  (To avoid memory error)
    for label in labels:
        start_time = datetime.now()
        xx_path = "%s/xx_%s_%s_%s.csv" % (input_dir, window_size, threshold, label)
        yy_path = "%s/yy_%s_%s_%s.csv" % (input_dir, window_size, threshold, label)

        # Count rows with a context manager so the handle is closed promptly.
        with open(xx_path) as handle:
            num_lines = sum(1 for _ in handle)
        skip_idx = [row for row in range(1, num_lines) if row % SKIPROW != 0]

        xx = np.array(pd.read_csv(xx_path, header=None, skiprows=skip_idx))
        yy = np.array(pd.read_csv(yy_path, header=None, skiprows=skip_idx))

        # eliminate the NoActivity Data (rows with a positive value in label column 0)
        no_activity_rows = np.flatnonzero(yy[:, 0] > 0)
        xx = np.delete(xx, no_activity_rows, 0)
        yy = np.delete(yy, no_activity_rows, 0)

        xx = xx.reshape(len(xx), window_size, num_channels)

        # Keep every second time step (1000 Hz to 500 Hz, to avoid memory error)
        xx = xx[:, ::2, :num_channels]

        x_dic[label] = xx
        y_dic[label] = yy
        time_interval = datetime.now() - start_time

        print(label, "finished...", "xx=", xx.shape, "yy=", yy.shape, label, "耗时s ", time_interval.seconds)

    return tuple([x_dic[name] for name in labels] + [y_dic[name] for name in labels])


# Run the import: reads the per-activity xx_/yy_ CSV files under ./input_files/.
csv_import()

2.2 优化方案,2分钟左右(增加了保存txt 方案)

   预处理:

        先通过csv_import 提取Input, label 到txt(18分钟)

 1: 每次训练的时候,只通过 txt_import加载数据集

        这个只需要2分钟

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 24 13:47:36 2024

@author: chengxf2
"""
import csv 
from datetime import datetime
import numpy as np
from ast import literal_eval

def csv_import(input_dir="./input_files", window_size=1000, threshold=60, num_channels=90):
    """Convert the per-activity frame CSVs into smaller text files (run once).

    For each activity, `xx_<window_size>_<threshold>_<label>.csv` and
    `yy_<window_size>_<threshold>_<label>.csv` are read from `input_dir` in
    lock-step. Only even-numbered rows whose first label column is 0 are kept.
    Each kept frame is reshaped to (window_size, num_channels), thinned to
    every second time step, flattened again and written to
    `xx_<window_size>_<threshold>_txt<label>.csv`; the matching labels are
    written as integers to `yy_<window_size>_<threshold>_txt<label>.csv`.

    Args:
        input_dir: folder holding the source CSVs and receiving the outputs.
        window_size: number of time steps stored per frame row.
        threshold: threshold value embedded in the file names.
        num_channels: number of feature columns per time step.
    """
    # Data preprocessing; only needs to run once.
    print("csv file importing...")
    SKIPROW = 2  # Skip every 2 rows -> overlap 800ms to 600ms  (To avoid memory error)
    lineNum = 0
    for label in ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]:
        start_time = datetime.now()

        xx_fileName = "%s/xx_%s_%s_%s.csv" % (input_dir, window_size, threshold, label)
        yy_fileName = "%s/yy_%s_%s_%s.csv" % (input_dir, window_size, threshold, label)

        xx_txt = "%s/xx_%s_%s_txt%s.csv" % (input_dir, window_size, threshold, label)
        yy_txt = "%s/yy_%s_%s_txt%s.csv" % (input_dir, window_size, threshold, label)

        rowsXX = []
        rowsYY = []
        lineNum = 0
        # Stream both files together instead of holding every line in memory;
        # the context manager closes them even if parsing fails.
        with open(xx_fileName, 'r') as xx_file, open(yy_fileName, 'r') as yy_file:
            for xx_line, yy_line in zip(xx_file, yy_file):
                yy_values = [float(v) for v in yy_line.split(",")]
                # Keep even-numbered rows only, and only when column 0
                # (the NoActivity marker) is zero.
                if lineNum % SKIPROW == 0 and yy_values[0] == 0:
                    rowsXX.append([float(v) for v in xx_line.split(",")])
                    rowsYY.append(yy_values)
                lineNum += 1
        totalrows = len(rowsXX)
        rowsXX = np.array(rowsXX, dtype=float)
        rowsYY = np.array(rowsYY, dtype=float)

        # Keep every second time step (1000 Hz to 500 Hz, to avoid memory error)
        rowsXX = rowsXX.reshape(len(rowsXX), window_size, num_channels)
        rowsXX = rowsXX[:, ::2, :]
        # -1 lets NumPy infer the number of frames.
        a = rowsXX.reshape(-1, rowsXX.shape[1] * rowsXX.shape[2])
        print("save txt")
        np.savetxt(xx_txt, a, delimiter=',')
        # Labels are written as plain integers so they can be parsed back
        # with an integer dtype without going through a float string.
        np.savetxt(yy_txt, rowsYY, fmt='%d', delimiter=',')

        time_interval = datetime.now() - start_time
        print("\n label:", label, "\t totalrows ", totalrows, "\t time_interval", time_interval.seconds, np.shape(rowsXX), np.shape(rowsYY))

    print(lineNum)

def txt_import(input_dir="./input_files", window_size=1000, threshold=60, num_channels=90):
    """Load the preprocessed per-activity text files (call this for each training run).

    For each activity, `xx_<window_size>_<threshold>_txt<label>.csv` and
    `yy_<window_size>_<threshold>_txt<label>.csv` are read from `input_dir`.
    Frames are returned as float32 arrays of shape
    (n, ceil(window_size / 2), num_channels); labels as 2-D int32 arrays.

    Args:
        input_dir: folder holding the text files.
        window_size: frame length before the every-second-step thinning that
            was applied when the files were written.
        threshold: threshold value embedded in the file names.
        num_channels: number of feature columns per time step.

    Returns:
        A 14-tuple: the frame arrays for bed, fall, pickup, run, sitdown,
        standup, walk, followed by the label arrays in the same order.
    """
    labels = ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]
    x_dic = {}
    y_dic = {}
    print("txt file importing...")

    # Number of time steps left after keeping every second one.
    half_window = (window_size + 1) // 2

    beg_time = datetime.now()
    for label in labels:
        start_time = datetime.now()

        xx_txt = "%s/xx_%s_%s_txt%s.csv" % (input_dir, window_size, threshold, label)
        yy_txt = "%s/yy_%s_%s_txt%s.csv" % (input_dir, window_size, threshold, label)

        # ndmin=2 keeps a one-row file two-dimensional.
        arrXX = np.loadtxt(xx_txt, delimiter=',', dtype=np.float32, ndmin=2)
        # Parse as float first, then cast: the labels may be stored in float
        # notation, and parsing such text directly as an integer dtype is
        # deprecated in NumPy.
        arrYY = np.loadtxt(yy_txt, delimiter=',', ndmin=2).astype(np.int32)
        arrXX = arrXX.reshape(-1, half_window, num_channels)
        time_interval = datetime.now() - start_time
        print(label, "\t 耗时(秒):", time_interval.seconds, "\t xx.shape:", np.shape(arrXX), "\t yy.shape", np.shape(arrYY))
        x_dic[label] = arrXX
        y_dic[label] = arrYY

    total_time = datetime.now() - beg_time
    print("\n 总共耗时(分钟): ", total_time.seconds / 60)
    return tuple([x_dic[name] for name in labels] + [y_dic[name] for name in labels])
 
        
                    

    
# Load the preprocessed per-activity text files from ./input_files/.
txt_import()

  • 28
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值