Data Cleaning for the Safecast Dataset

A working log of cleaning the data with Python.

About the Safecast Dataset

Safecast is a dataset created by a nonprofit organization whose goal is to get people paying attention to the environment around them. The data is crowdsourced: volunteers measure the radiation levels around them, and a server aggregates the readings and displays them on a map. This post documents how the dataset downloaded from the Safecast website was processed to remove malformed records, so that it is ready for machine learning.

Initial Data Format

The dataset downloaded from the official site is 16 GB and holds roughly 1.3 billion records. The header row looks like this:

Captured Time,Latitude,Longitude,Value,Unit,Location Name,Device ID,MD5Sum,Height,Surface,Radiation,Uploaded Time,Loader ID
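
A record count of that size is easiest to verify by streaming the file rather than loading it; a minimal sketch, assuming the raw download is named measurements.csv:

# Stream the raw file line by line so the 16 GB never has to fit in memory.
count = 0
with open('measurements.csv', encoding='ISO-8859-1') as f:
    for _ in f:
        count += 1
print(count - 1)  # subtract the header row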

Because the file is far too large to process in one piece, it was split into smaller files with the split utility under Windows. After splitting, every file except the first is missing the header row, so the following Python code prepends the header to each of the remaining files:

# Open in 'r+' mode so the file can be read and then rewritten
# in place with the header row prepended.
with open(filename, 'r+', encoding='ISO-8859-1') as f:
    content = f.read()
    f.seek(0, 0)
    f.write("Captured Time,Latitude,Longitude,Value,Unit,Location Name,Device ID,MD5Sum,Height,Surface,Radiation,Uploaded Time,Loader ID\n" + content)

The dataset marks each record's contributor with either a Loader ID or a Device ID. Running a check over the data shows that the two never appear together in the same record, so whichever one is present can serve as the user's ID. However, the same numeric value may occur both as a Loader ID and as a Device ID, so the two ranges have to be disambiguated first. The following code finds the maximum value under each of the two headers:

#!/usr/bin/python
# -*- coding:utf-8 -*-

import pandas as pd

numL = 0
numD = 0
for i in range(135):
    filename = 'measurements-out_' + str(i) + '.csv'
    print(filename)
    data = pd.read_csv(filename,
                       usecols=['Loader ID', 'Captured Time', 'Latitude', 'Longitude',
                                'Value', 'Unit', 'Device ID'],
                       encoding='ISO-8859-1')
    # Device ID records: coerce non-numeric IDs to NaN, then drop incomplete rows
    data1 = data[['Device ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit']].copy()
    data1['Device ID'] = pd.to_numeric(data1['Device ID'], errors='coerce')
    data1.dropna(axis=0, how='any', inplace=True)
    # Loader ID records: same treatment
    data2 = data[['Loader ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit']].copy()
    data2['Loader ID'] = pd.to_numeric(data2['Loader ID'], errors='coerce')
    data2.dropna(axis=0, how='any', inplace=True)
    if len(data2['Loader ID']) != 0:
        data2['Loader ID'] = data2['Loader ID'].astype(int)

    # Track the largest ID seen so far under each header
    D = data1['Device ID'].max() if len(data1) != 0 else 0
    if D > numD:
        numD = D
    L = data2['Loader ID'].max() if len(data2) != 0 else 0
    if L > numL:
        numL = L

print("numL = " + str(numL))
print("numD = " + str(numD))

Detailed Cleaning

With the maximum value under each of the two labels in hand, the next step converts the data in every split file into a uniform format. This is also where the ID collision problem is resolved: the script below shifts every Device ID by a fixed offset (985729) before merging, pushing Device IDs past the Loader ID range so the two cannot collide. The full script is as follows:

# -*- coding:utf-8 -*-
import datetime

import pandas as pd
from pytz import timezone
from timezonefinder import TimezoneFinder

def datetime_as_timezone(date_time, time_zone):
    # Convert a naive local time to UTC; unknown or already-UTC zones pass through.
    if time_zone is None or time_zone == 'UTC':
        return date_time
    tz = timezone(time_zone)
    utc = timezone('UTC')
    # pytz requires localize() rather than replace(tzinfo=...) to attach
    # the correct offset for the zone.
    return tz.localize(date_time).astimezone(utc)

def datetime_to_str(date_time, local):
    date_time_tzone = datetime_as_timezone(date_time, local)
    return '{0:%Y-%m-%d %H:%M:%S}'.format(date_time_tzone)

def clean_data(filenum):
    open_file = '/home/tony/data/measurements-out_' + str(filenum) + '.csv'
    print(open_file)
    # Read only the columns we need
    data_sum = pd.read_csv(open_file,
                           usecols=['Loader ID', 'Captured Time', 'Latitude', 'Longitude',
                                    'Value', 'Unit', 'Device ID'],
                           encoding='ISO-8859-1')
    # Put the columns in the required order
    order = ['Loader ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Device ID']
    data = data_sum[order].copy()
    # Rename Loader ID to User ID
    data.columns = ['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Device ID']
    # Coerce User ID to numeric and drop rows with missing values
    data['User ID'] = pd.to_numeric(data['User ID'], errors='coerce')
    data.dropna(axis=0, how='any',
                subset=['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit'],
                inplace=True)
    data['User ID'] = data['User ID'].astype(int)
    # The records identified by a Loader ID
    order2 = ['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit']
    data_user = data[order2]
    # The records identified by a Device ID
    order3 = ['Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Device ID']
    data_device = data_sum[order3].copy()
    data_device = data_device.dropna(subset=['Device ID'])
    # Rename Device ID to User ID
    data_device = data_device.rename(columns={'Device ID': 'User ID'})
    data_device['User ID'] = pd.to_numeric(data_device['User ID'], errors='coerce')
    data_device.dropna(axis=0, how='any',
                       subset=['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit'],
                       inplace=True)
    data_device = data_device[order2]
    data_device['User ID'] = data_device['User ID'].astype(int)
    # Shift Device IDs past the Loader ID range so the two cannot collide
    data_device['User ID'] = data_device['User ID'].map(lambda x: x + 985729)
    # Merge the two tables
    data_user = pd.concat([data_user, data_device], ignore_index=True)
    # Make sure every User ID is an integer
    data_user['User ID'] = data_user['User ID'].astype(int)
    # Replace the spaces in column names with underscores
    data_user.columns = data_user.columns.str.replace(' ', '_')
    # Drop rows with a malformed Captured Time, then keep only the
    # readings taken between 2011-01-01 and 2018-01-01
    data_user[u'Captured_Time'] = pd.to_datetime(data_user[u'Captured_Time'],
                                                 format='%Y-%m-%d %H:%M:%S', errors='coerce')
    data_user.dropna(inplace=True)
    begin_date = pd.to_datetime('2011-01-01 00:00:00')
    end_date = pd.to_datetime('2018-01-01 00:00:00')
    data_user = data_user[(data_user[u'Captured_Time'] > begin_date) & (data_user[u'Captured_Time'] < end_date)]
    # Drop rows whose unit is not cpm
    data_user = data_user[data_user['Unit'].isin(['cpm'])]
    # Add an (empty) offset column in the right position
    data_user['offset'] = None

    data_user.columns = ['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Offset']
    # Write the intermediate result to disk
    data_user.to_csv('result.csv', index=False)
    '''
    Filter out the Tokyo and Fukushima regions
    '''

    # Records from the Tokyo area
    data1 = pd.read_csv('result.csv')
    data1 = data1.drop(data1[(data1.Latitude > 36.2) | (data1.Latitude < 34.8)].index)
    data1 = data1.drop(data1[(data1.Longitude > 139.8) | (data1.Longitude < 138.4)].index)
    data1['ID'] = range(len(data1))
    order_1 = ['ID', 'User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Offset']
    data1 = data1[order_1]
    data1.to_csv('Tokyo_' + str(filenum) + '_.csv', index=False)
    # Records from the Fukushima area
    data2 = pd.read_csv('result.csv')
    data2 = data2.drop(data2[(data2.Latitude > 37.9766) | (data2.Latitude < 36.7981)].index)
    data2 = data2.drop(data2[(data2.Longitude > 141.053) | (data2.Longitude < 139.157983)].index)
    data2['ID'] = range(len(data2))
    order_2 = ['ID', 'User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit', 'Offset']
    data2 = data2[order_2]
    data2.to_csv('Fukushima_' + str(filenum) + '_.csv', index=False)

    # Convert every timestamp to UTC, based on the timezone
    # found from each record's longitude and latitude
    tf = TimezoneFinder(in_memory=True)
    data3 = pd.read_csv('result.csv')
    # timezone_at returns the timezone name for a coordinate pair
    my_func = tf.timezone_at  # note: no parentheses, we want the function itself
    # Extra column to hold the converted time
    data3['convert_time'] = None
    num = 0
    # An empty file is still written out so the file numbering stays consistent
    if len(data3) == 0:
        order3 = ['User ID', 'convert_time', 'Latitude', 'Longitude', 'Value', 'Unit']
        data3 = data3[order3]
        data3.columns = ['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit']
        data3['Offset'] = None
        data3.to_csv('world_' + str(filenum) + '_.csv', index=False)
        return
    # Timezone of the first record's coordinates
    local = my_func(lng=data3['Longitude'][0], lat=data3['Latitude'][0])
    print(data3['Longitude'][0], data3['Latitude'][0], local)
    # Convert the timestamps; the timezone lookup is expensive, so it is
    # only redone when the User ID changes between consecutive rows
    start = datetime.datetime.now()
    for i in range(len(data3)):
        if num != data3['User ID'][i]:
            num = data3['User ID'][i]
            local = my_func(lng=data3['Longitude'][i], lat=data3['Latitude'][i])
        captured = data3['Captured Time'][i].split(".")[0].strip(' \t\r\n')
        # .at avoids the chained-assignment pitfall of data3['convert_time'][i] = ...
        data3.at[i, 'convert_time'] = datetime_to_str(
            datetime.datetime.strptime(captured, '%Y-%m-%d %H:%M:%S'), local)

    end = datetime.datetime.now()
    print((end - start).seconds)
    order3 = ['User ID', 'convert_time', 'Latitude', 'Longitude', 'Value', 'Unit']
    data3 = data3[order3]
    data3.columns = ['User ID', 'Captured Time', 'Latitude', 'Longitude', 'Value', 'Unit']
    data3['Offset'] = None
    data3.to_csv('world_' + str(filenum) + '_.csv', index=False)

if __name__ == "__main__":
    for t in range(0, 135):
        clean_data(t)
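
A quick sanity check of the two helper functions: Asia/Tokyo is UTC+9 with no daylight saving time, so noon local time should come back as 03:00 UTC, while an unknown timezone passes through unchanged.

import datetime

naive = datetime.datetime(2013, 5, 1, 12, 0, 0)
print(datetime_to_str(naive, 'Asia/Tokyo'))  # expected: 2013-05-01 03:00:00
print(datetime_to_str(naive, None))          # expected: 2013-05-01 12:00:00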


Working through this stage was a good workout with pandas, and it left me with a much clearer picture of what several of its functions actually do. One thing worth noting: timezonefinder only runs under Python 3.
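
For reference, the coordinate-to-timezone lookup that the conversion loop caches per user boils down to a single call:

from timezonefinder import TimezoneFinder

tf = TimezoneFinder(in_memory=True)
# Central Tokyo resolves to the expected zone name.
print(tf.timezone_at(lng=139.69, lat=35.68))  # -> 'Asia/Tokyo'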
