需要将澳门居民从珠海的手机信令数据中筛选出来,主要依据手机信令中特殊时间点的特征进行判别,整体思路如下:
- 对手机信令数据按ID排序;
- 以一个ID的所有记录作为一个data_batch进行判别;
- 识别一个data_batch中各记录的时间点是否为特殊时间点,生成特征点序列(普通点、入珠点、出珠点);
- 根据特征点对,统计在珠停留天数的频数;如果出珠点的数量大于等于2,则判定为澳门居民。
示例数据下载:https://download.csdn.net/download/baidu_26646129/12060934
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, time, math
class MacaneseDistinguisher:
    """Flag Macau residents in a mobile-signalling data set.

    Each input line is tab-separated: ``id``, date (e.g. ``10月2日``),
    time (``HH:MM``), ``x``, ``y``.  The year is fixed to 2019 by
    ``__convert_to_time_stamp``.

    Workflow: ``sort_data()`` groups the records by ID into an intermediate
    file, ``generate_data_batches()`` yields one list of lines per ID, and
    ``distinguish_data()`` classifies one such batch.

    NOTE(review): files are opened with the platform default encoding, as in
    the original code; the date column contains non-ASCII characters, so the
    default must match the encoding of the data file.
    """

    # Minimum silence (seconds) before/after a border record for it to count
    # as an entry/exit point.
    _GAP_SECONDS = 3 * 3600
    # A record counts as a border record when x*x + y*y <= this value.
    # NOTE(review): presumably the border crossing sits at the coordinate
    # origin of the data set -- confirm against the data description.
    _BORDER_RADIUS_SQ = 0.3
    _SECONDS_PER_DAY = 86400

    def __init__(self, data_path, data_range):
        """Store the configuration.

        Args:
            data_path: path of the raw, unsorted signalling file.
            data_range: largest stay length (in days) with its own bucket;
                ``stayFrequency`` has ``data_range + 1`` buckets.
        """
        self.data_path = data_path
        self.data_range = data_range
        self.__data_sort_path = "./data_sort.txt"
        # Observation window: 2019-10-01 00:00 to 2019-10-11 00:00 local time.
        # Derived with the same local-time conversion as the records so that
        # both stay comparable on any host; on a UTC+8 host these evaluate to
        # the previously hard-coded 1569859200 and 1570723200.
        self.__data_start_time = self.__convert_to_time_stamp("10月1日", "00:00")
        self.__data_end_time = self.__convert_to_time_stamp("10月11日", "00:00")

    @staticmethod
    def __record_id(line):
        """Return the ID field (text before the first tab) of a raw line."""
        return line.split("\t", 1)[0]

    def sort_data(self):
        """Write the records of ``data_path`` grouped by ID to the sort file.

        Records are ordered by the whole ID field (as a string), not by the
        first character of the line, so IDs such as ``1`` and ``12`` no longer
        interleave.  The sort is stable, so records of one ID keep their
        original relative order.  Blank lines are dropped and every record is
        written with exactly one trailing newline, which prevents a final
        unterminated line from being glued to its successor.
        """
        with open(self.data_path, "r") as rf:
            records = [line.rstrip("\n") for line in rf if line.strip()]
        records.sort(key=self.__record_id)
        with open(self.__data_sort_path, "w") as wf:
            wf.writelines(record + "\n" for record in records)

    def generate_data_batches(self):
        """Yield, per ID, the list of consecutive lines of the sorted file.

        Lines are yielded as read (including their newline).  Batches are
        split on the whole ID field.  An empty sorted file yields nothing.
        """
        data_batch = []
        current_id = None
        with open(self.__data_sort_path, "r") as rf:
            for line in rf:
                if not line.strip():
                    continue
                record_id = self.__record_id(line)
                if data_batch and record_id != current_id:
                    yield data_batch
                    data_batch = []
                current_id = record_id
                data_batch.append(line)
        if data_batch:
            yield data_batch

    def __convert_to_time_stamp(self, date_str, time_str):
        """Convert e.g. ``("10月2日", "07:30")`` to a local-time epoch (int).

        The year is always 2019.  ``time.mktime`` interprets the parsed time
        in the host's local time zone.
        """
        whole_time_str = "2019年" + date_str + " " + time_str
        time_array = time.strptime(whole_time_str, "%Y年%m月%d日 %H:%M")
        return int(time.mktime(time_array))

    def __day_index(self, time_stamp):
        """Return the local calendar-day index relative to the window start.

        Anchoring on the (local-midnight) window start keeps day boundaries on
        local midnights; flooring the raw epoch would use UTC midnights.
        """
        return (time_stamp - self.__data_start_time) // self._SECONDS_PER_DAY

    def __distinguish_timing(self, data_batch_sort):
        """Label every record of a time-sorted batch.

        Args:
            data_batch_sort: records ``[id, time_stamp, x, y]`` sorted by
                time stamp.

        Returns:
            A list with one label per record:
            0 = ordinary point, 1 = entry point, -1 = exit point.
        """
        last_index = len(data_batch_sort) - 1
        in_or_out_timing = []
        for i, record in enumerate(data_batch_sort):
            time_current, x, y = record[1], record[2], record[3]
            if x * x + y * y > self._BORDER_RADIUS_SQ:
                # Not a border record: always an ordinary point.
                in_or_out_timing.append(0)
                continue
            # The first/last record is compared against the window bounds.
            if i > 0:
                time_previous = data_batch_sort[i - 1][1]
            else:
                time_previous = self.__data_start_time
            if i < last_index:
                time_next = data_batch_sort[i + 1][1]
            else:
                time_next = self.__data_end_time
            # Entry takes precedence when both gaps are long.
            if time_current - time_previous >= self._GAP_SECONDS:
                in_or_out_timing.append(1)
            elif time_next - time_current >= self._GAP_SECONDS:
                in_or_out_timing.append(-1)
            else:
                in_or_out_timing.append(0)
        return in_or_out_timing

    def __count_stay(self, stay_frequency, window_start, window_end):
        """Add one stay to the bucket matching its length in calendar days.

        Stays longer than ``data_range`` days go into the last bucket instead
        of raising ``IndexError``.
        """
        days = self.__day_index(window_end) - self.__day_index(window_start)
        stay_frequency[min(days, self.data_range)] += 1

    def distinguish_data(self, data_batch):
        """Classify one ID from its raw records.

        Args:
            data_batch: non-empty list of raw tab-separated lines that all
                belong to one ID (any order).

        Returns:
            ``{"id": <str>, "isMacanese": <bool>, "stayFrequency": <list>}``
            where ``stayFrequency[d]`` counts stays spanning ``d`` calendar
            days (the last bucket also holds longer stays).  The ID is judged
            a Macau resident when it has at least two exit points.  The result
            is also printed.

        Raises:
            IndexError: if ``data_batch`` is empty or a line has fewer than
                five fields.
            ValueError: if a date/time or coordinate field cannot be parsed.
        """
        data_batch_split = [d.split("\t") for d in data_batch]
        data_batch_alter = [
            [d[0], self.__convert_to_time_stamp(d[1], d[2]), float(d[3]), float(d[4])]
            for d in data_batch_split
        ]
        data_batch_sort = sorted(data_batch_alter, key=lambda x: x[1])

        # Shape: {"id": "1", "isMacanese": True, "stayFrequency": [0, 1, ...]}
        distinguish_result = {}
        distinguish_result["id"] = data_batch_sort[0][0]
        distinguish_result["isMacanese"] = False
        distinguish_result["stayFrequency"] = [0] * (self.data_range + 1)

        in_or_out_timing = self.__distinguish_timing(data_batch_sort)

        # If the first record is an ordinary point, the first stay window
        # opens at its time stamp.
        if in_or_out_timing[0] == 0:
            stay_time_window = [data_batch_sort[0][1]]
        else:
            stay_time_window = []

        # Every special point closes an open window or opens a new one.
        for record, ioot in zip(data_batch_sort, in_or_out_timing):
            if ioot == 0:
                continue
            stay_time_window.append(record[1])
            if len(stay_time_window) == 2:
                self.__count_stay(distinguish_result["stayFrequency"], *stay_time_window)
                stay_time_window = []

        # A window still open at the end is closed at the last record.
        if len(stay_time_window) == 1:
            self.__count_stay(
                distinguish_result["stayFrequency"],
                stay_time_window[0],
                data_batch_sort[-1][1],
            )

        # Two or more exit points mark the ID as a Macau resident.
        if in_or_out_timing.count(-1) >= 2:
            distinguish_result["isMacanese"] = True
        print(distinguish_result)
        return distinguish_result
if __name__ == "__main__":
    # Script entry point: sort the raw records by ID, then classify the
    # records of every ID in turn (each result is printed by the classifier).
    data_path = "./data.txt"
    data_range = 10
    distinguisher = MacaneseDistinguisher(data_path, data_range)
    distinguisher.sort_data()
    batch_iter = distinguisher.generate_data_batches()
    while True:
        try:
            current_batch = next(batch_iter)
        except StopIteration as finished:
            # The generator is exhausted: report its return value and stop.
            print('Generator return value:', finished.value)
            break
        distinguisher.distinguish_data(current_batch)