Task3-特征工程

最新推荐文章于 2024-06-07 10:49:04 发布
a_adf
最新推荐文章于 2024-06-07 10:49:04 发布
阅读量84
点赞数
文章标签： python
本文链接：https://blog.csdn.net/a_adf/article/details/115839581
版权
这块的代码有些问题。首先就是agg()函数新版的需要修改
还有Word2Vec()新版里面没有iters换成了epochs，size替换成了vector_size，目前还没有跑成功，会遇到这个问题，请各位大佬帮帮忙
在这里插入图片描述
代码如下
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 17 19:23:52 2021

@author: 李
"""
import gc
import multiprocessing as mp
import os
import pickle
import time
import warnings
from collections import Counter
from copy import deepcopy
from datetime import datetime
from functools import partial
from glob import glob
import geopandas as gpd
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import FastText, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pyproj import Proj
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
os.environ['PYTHONHASHSEED'] = '0'
warnings.filterwarnings('ignore')
#不直接对DataFrame做append操作，提升运行速度
def get_data(file_path, max_lines = 2000):
    paths = os.listdir(file_path)
    tmp = []
    for t in tqdm(range(len(paths))):
        if len(tmp) > max_lines:
            break
        p = paths[t]
        with open('{}/{}'.format(file_path, p), encoding = 'utf-8') as f:
            next(f)
            for line in f.readlines():
                tmp.append(line.strip().split(','))
                if len(tmp) > max_lines:
                    break
    tmp_df = pd.DataFrame(tmp)
    tmp_df.columns = ['渔船ID', 'x', 'y', '速度', '方向', 'time', 'type']
    return tmp_df
TRAIN_PATH = r"C:\Users\李\Desktop\datawheal\data\hy_round1_train_20200102"
#采样数据行数
max_lines = 2000
df = get_data(TRAIN_PATH, max_lines = max_lines)
#基本预处理
label_dict1 = {'拖网': 0, '围网': 1, '刺网': 2}
label_dict2 = {0: '拖网', 1: '围网', 2: '刺网'}
name_dict = {'渔船ID': 'id', '速度': 'v', '方向': 'dir', 'type': 'label'}
df.rename(columns = name_dict, inplace = True)
df['label'] = df['label'].map(label_dict1)
cols = ['x', 'y', 'v']
for col in cols:
    df[col] = df[col].astype('float')
df['dir'] = df['dir'].astype('int')
df['time'] = pd.to_datetime(df['time'], format = '%m%d %H:%M:%S')
df['date'] = df['time'].dt.date
df['hour'] = df['time'].dt.hour
df['month'] = df['time'].dt.month
df['weekday'] = df['time'].dt.weekday
df.head()

df['x_dis_diff'] = (df['x'] - 6165599).abs()
df['y_dis_diff'] = (df['y'] - 5202660).abs()
df['base_dis_diff'] = ((df['x_dis_diff'] ** 2) + df['y_dis_diff'] ** 2)  ** 0.5
del df['x_dis_diff'],df['y_dis_diff'] 
df['base_dis_diff'].head()
#对时间进行白黑的划分
df['day_nig'] = 0
df.loc[(df['hour'] > 5) & (df['hour'] < 20), 'day_nig'] = 1 #将df['hour'] > 5) & (df['hour'] < 20的数据划分成1
df['day_nig'].head()
#根据月份划分季度
df['quarter'] = 0
df.loc[(df['month'].isin([1, 2, 3])), 'quarter'] = 1
df.loc[(df['month'].isin([4, 5, 6])), 'quarter'] = 2
df.loc[(df['month'].isin([7, 8, 9])), 'quarter'] = 3
df.loc[(df['month'].isin([10, 11, 12])), 'quarter'] = 4
#动态速度
temp = df.copy()
temp.rename(columns = {'id': 'ship', 'dir': 'd'}, inplace = True)
#给速度一个等级
def v_cut(v):
    if v < 0.1:
        return 0
    elif v < 0.5:
        return 1
    elif v < 1:
        return 2
    elif v < 2.5:
        return 3
    elif v < 5:
        return 4
    elif v < 10:
        return 5
    elif v < 20:
        return 5
    else:
        return 6
# 统计每个ship的对应速度等级个数

def get_v_fea(df):
    df['v_cut'] = df['v'].apply(lambda x: v_cut(x))
    tmp = df.groupby(['ship', 'v_cut'], as_index = False)['v_cut'].agg({'v_cut_count': 'count'}) #agg是指将'v_cut'计数
    tmp = tmp.pivot(index = 'ship', columns = 'v_cut', values = 'v_cut_count')
    new_col_nm = ['v_cut_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index() # 重置索引
    return tmp
c1 = get_v_fea(temp)
#方位进行16均分
def add_direction(df):
    df['d16'] = df['d'].apply(lambda x: int((x / 22.5) + 0.5) % 16 if not np.isnan(x) else np.nan) 
    return df
def get_d_cut_count_fea(df):
    df = add_direction(df)
    tmp = df.groupby(['ship', 'd16'], as_index = False)['d16'].agg({'d16_count': 
    'count'})
    tmp = tmp.pivot(index = 'ship', columns = 'd16', values = 'd16_count')
    new_col_nm = ['d16_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index()
    return tmp
c2 = get_d_cut_count_fea(temp)

#统计速度为0的个数，以及速度不为0的统计量
def get_v0_fea(df):
    df_zero_count = df.query('v == 0')[['ship', 'v']].groupby('ship', as_index = 
    False)['v'].agg({'num_zero_v': 'count'}) 
    #query()函数找到特定的DataFrame行
    df_not_zero_agg = df.query('v != 0')[['ship', 'v']].groupby('ship', as_index = 
    False)['v'].agg({'v_max_drop_0': 'max',
 'v_min_drop_0': 'min',
 'v_mean_drop_0': 'mean',
 'v_std_drop_0': 'std',
 'v_median_drop_0': 'median',
 'v_skew_drop_0': 'skew'})   #skew指偏度，
    tmp = df_zero_count.merge(df_not_zero_agg, on = 'ship', how = 'left')
    return tmp
c3 = get_v0_fea(temp)

def get_precentiles_fea(df_raw):
    key = ['x', 'y', 'v', 'd']
    temp = df_raw[['ship']].drop_duplicates('ship')
    for i in range(len(key)):
        #加入x,v,d,y的中位数和各种位数
        tmp_dscb = df_raw.groupby('ship')[key[i]].describe(percentiles = [0.05] + 
        [ii / 1000 for ii in range(125, 1000, 125)] + [0.95])
        raw_col_nm = tmp_dscb.columns.tolist()
        new_col_nm = [key[i] + '_' + col for col in raw_col_nm]
        tmp_dscb.columns = new_col_nm
        tmp_dscb = tmp_dscb.reset_index()
        #删掉多余的特征
        tmp_dscb = tmp_dscb.drop([f'{key[i]}_count', f'{key[i]}_mean', f'{key[i]}_std', f'{key[i]}_min', f'{key[i]}_max'], axis = 1)
        temp = temp.merge(tmp_dscb, on = 'ship', how = 'left')
    return temp

c4 = get_precentiles_fea(temp)
#计算了一下转角速度和首向的速度,shift(-1)
def get_d_change_rate_fea(df):
    import math
    import time
    temp = df.copy()
    temp.sort_values(['ship', 'time'], ascending = True, inplace = True)
    temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
    temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
    temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
    temp['ynext'] = temp['ynext'].fillna(method = 'ffill')
    temp['xnext'] = temp['xnext'].fillna(method='ffill')
    temp['angle_next'] = (temp['ynext'] - temp['y']) / (temp['xnext'] - temp['x'])
    temp['angle_next'] = np.arctan(temp['angle_next']) / math.pi * 180
    temp['angle_next_next'] = temp['angle_next'].shift(-1)
    temp['timediff'] = np.abs(temp['timenext'] - temp['time'])
    temp['timediff'] = temp['timediff'].fillna(method='ffill')
    temp['hc_xy'] = abs(temp['angle_next_next'] - temp['angle_next'])
    temp.loc[temp['hc_xy'] > 180, 'hc_xy'] = (360 - temp.loc[temp['hc_xy'] > 180,
     'hc_xy'])
    temp['hc_xy_s'] = temp.apply(lambda x: x['hc_xy'] / 
    x['timediff'].total_seconds(), axis=1)
    temp['d_next'] = temp.groupby('ship')['d'].shift(-1)
    temp['hc_d'] = abs(temp['d_next'] - temp['d'])
    temp.loc[temp['hc_d'] > 180, 'hc_d'] = 360 - temp.loc[temp['hc_d'] > 180, 
    'hc_d']
    temp['hc_d_s'] = temp.apply(lambda x: x['hc_d'] / 
    x['timediff'].total_seconds(), axis=1)
    temp1 = temp[['ship', 'hc_xy_s', 'hc_d_s']]
    xy_d_rate = temp1.groupby('ship')['hc_xy_s'].agg([('hc_xy_s_max', 'max')])
    xy_d_rate = xy_d_rate.reset_index()
    d_d_rate = temp1.groupby('ship')['hc_d_s'].agg([('hc_d_s_max', 'max')])
    d_d_rate = d_d_rate.reset_index()
    tmp = xy_d_rate.merge(d_d_rate, on='ship', how='left')
    return tmp
c5 = get_d_change_rate_fea(temp)
f1 = temp.merge(c1, on = 'ship', how = 'left')
f1 = f1.merge(c2, on = 'ship', how = 'left')
f1 = f1.merge(c3, on = 'ship', how = 'left')
f1 = f1.merge(c4, on = 'ship', how = 'left')
f1 = f1.merge(c5, on = 'ship', how = 'left')

pre_cols = df.columns

df['v_bin'] = pd.qcut(df['v'], 200, duplicates='drop') # v进行 200分位数分箱,值的频率
#来选择箱子的均匀间隔，即每个箱子中含有的数的数量是相同的
df['v_bin'] = df['v_bin'].map(dict(zip(df['v_bin'].unique(), range(df['v_bin'].nunique())))) # 分箱后映射编码
for f in ['x', 'y']:
    df[f + '_bin1'] = pd.qcut(df[f], 1000, duplicates='drop') # x,y位置分箱1000
    df[f + '_bin1'] = df[f + '_bin1'].map(dict(zip(df[f + '_bin1'].unique(), range(df[f + '_bin1'].nunique()))))#编码
    df[f + '_bin2'] = df[f] // 10000 # 取整操作
    df[f + '_bin1_count'] = df[f + '_bin1'].map(df[f + '_bin1'].value_counts()) #x,y不同分箱的数量映射
    df[f + '_bin2_count'] = df[f + '_bin2'].map(df[f + '_bin2'].value_counts()) 
    #数量映射
    df[f + '_bin1_id_nunique'] = df.groupby(f + '_bin1')
    ['id'].transform('nunique')#基于分箱1 id数量映射
    df[f + '_bin2_id_nunique'] = df.groupby(f + '_bin2')
    ['id'].transform('nunique')#基于分箱2 id数量映射
for i in [1, 2]:
    # 特征交叉x_bin1（2）,y_bin1（2） 形成类别 统计每类数量映射到列  
    df['x_y_bin{}'.format(i)] = df['x_bin{}'.format(i)].astype('str') + '_' + 
    df['y_bin{}'.format(i)].astype('str')
    df['x_y_bin{}'.format(i)] = df['x_y_bin{}'.format(i)].map(
        dict(zip(df['x_y_bin{}'.format(i)].unique(), 
        range(df['x_y_bin{}'.format(i)].nunique()))))
    df['x_bin{}_y_bin{}_count'.format(i, i)] = 
    df['x_y_bin{}'.format(i)].map(df['x_y_bin{}'.format(i)].value_counts())
for stat in ['max', 'min']:
    # 统计x_bin1 y_bin1的最大最小值
    df['x_y_{}'.format(stat)] = df['y'] - df.groupby('x_bin1')['y'].transform(stat)
    df['y_x_{}'.format(stat)] = df['x'] - df.groupby('y_bin1')['x'].transform(stat)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
 
def traj_to_bin(traj=None, x_min=12031967.16239096, x_max=14226964.881853,
                y_min=1623579.449434373, y_max=4689471.1780792,
                row_bins=4380, col_bins=3136):

    # Establish bins on x direction and y direction
    x_bins = np.linspace(x_min, x_max, endpoint=True, num=col_bins + 1)
    y_bins = np.linspace(y_min, y_max, endpoint=True, num=row_bins + 1)

    # Determine each x coordinate belong to which bin
    traj.sort_values(by='x', inplace=True)
    x_res = np.zeros((len(traj), ))
    j = 0
    for i in range(1, col_bins + 1):
        low, high = x_bins[i-1], x_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["x"].iloc[j] <= high) & (traj["x"].iloc[j] > low - 0.001):
                x_res[j] = i
                j += 1
            else:
                break
    traj["x_grid"] = x_res
    traj["x_grid"] = traj["x_grid"].astype(int)
    traj["x_grid"] = traj["x_grid"].apply(str)

    # Determine each y coordinate belong to which bin
    traj.sort_values(by='y', inplace=True)
    y_res = np.zeros((len(traj), ))
    j = 0
    for i in range(1, row_bins + 1):
        low, high = y_bins[i-1], y_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["y"].iloc[j] <= high) & (traj["y"].iloc[j] > low - 0.001):
                y_res[j] = i
                j += 1
            else:
                break
    traj["y_grid"] = y_res
    traj["y_grid"] = traj["y_grid"].astype(int)
    traj["y_grid"] = traj["y_grid"].apply(str)

    # Determine which bin each coordinate belongs to.
    traj["no_bin"] = [i + "_" + j for i, j in zip(
        traj["x_grid"].values.tolist(), traj["y_grid"].values.tolist())]
    traj.sort_values(by='time', inplace=True)
    return traj

bin_size = 800
col_bins = int((14226964.881853 - 12031967.16239096) / bin_size)
row_bins = int((4689471.1780792 - 1623579.449434373) / bin_size)

# 特征x_grid,y_grid,no_bin
df = traj_to_bin(df)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols]
def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the visit frequency of each bin."""
    visit_count_df = traj_data_df.groupby(["no_bin"]).count().reset_index()
    visit_count_df = visit_count_df[["no_bin", "x"]]
    visit_count_df.rename({"x":"visit_count"}, axis=1, inplace=True)
    return visit_count_df

def find_save_unique_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the unique boat visit count of each bin."""
    unique_boat_count_df = traj_data_df.groupby(["no_bin"])
    ["id"].nunique().reset_index()
    unique_boat_count_df.rename({"id":"visit_boat_count"}, axis=1, inplace=True)

    unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
                                         on="no_bin", how="left")
    return unique_boat_count_df

traj_df = df[["id","x", "y",'time',"no_bin"]]
bin_to_coord_df = traj_df.groupby(["no_bin"]).median().reset_index()
pre_cols = df.columns

# DataFrame tmp for finding POIs
visit_count_df = find_save_visit_count_table(
    traj_df, bin_to_coord_df)
unique_boat_count_df = find_save_unique_visit_count_table(
    traj_df, bin_to_coord_df)

# # 特征'visit_count','visit_boat_count'
df = df.merge(visit_count_df,on='no_bin',how='left')
df = df.merge(unique_boat_count_df,on='no_bin',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
pre_cols = df.columns

g = df.groupby('id')
for f in ['x', 'y']:
    #对x,y坐标进行时间平移 1 -1 2
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)
    ## 三角形求解上时刻1距离  下时刻-1距离 2距离 
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + 
np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + 
np.square(df['y_next_diff']))
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + 
np.square(df['y_prev_next_diff']))
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, duplicates='drop')
# 2时刻距离等频分箱50
df['dist_move_prev_bin'] = df['dist_move_prev_bin'].map(
    dict(zip(df['dist_move_prev_bin'].unique(),
    range(df['dist_move_prev_bin'].nunique())))
) #上一时刻映射编码

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
pre_cols = df.columns

def start(x):
    try:
        return x[0]
    except:
        return None

def end(x):
    try:
        return x[-1]
    except:
        return None


def mode(x):
    try:
        return pd.Series(x).value_counts().index[0]
    except:
        return None

for f in ['dist_move_prev_bin', 'v_bin']:
    # 上一时刻类别 速度类别映射处理
    df[f + '_sen'] = df['id'].map(df.groupby('id')[f].agg(lambda x: 
    ','.join(x.astype(str))))
    
    # 一系列基本统计量特征 每列执行相应的操作
g = df.groupby('id').agg({
    'id': ['count'], 'x_bin1': [mode], 'y_bin1': [mode], 'x_bin2': [mode], 
    'y_bin2': [mode], 'x_y_bin1': [mode],
    'x': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'y': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'v': ['mean', 'max', 'min', 'std', np.ptp], 'dir': ['mean'],
    'x_bin1_count': ['mean'], 'y_bin1_count': ['mean', 'max', 'min'],
    'x_bin2_count': ['mean', 'max', 'min'], 'y_bin2_count': ['mean', 'max', 'min'],
    'x_bin1_y_bin1_count': ['mean', 'max', 'min'],
    'dist_move_prev': ['mean', 'max', 'std', 'min', 'sum'],
    'x_y_min': ['mean', 'min'], 'y_x_min': ['mean', 'min'],
    'x_y_max': ['mean', 'min'], 'y_x_max': ['mean', 'min'],
    
}).reset_index()
g.columns = ['_'.join(col).strip() for col in g.columns] #提取列名
g.rename(columns={'id_': 'id'}, inplace=True) #重命名id_
cols = [f for f in g.keys() if f != 'id'] #特征列名提取
df = df.merge(g,on='id',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
def group_feature(df, key, target, aggs,flag):   
    """python3.8通过字典的形式来构建方法和重命名"""
    agg_dict = []#TIPS
    for ag in aggs:
        agg_dict.append(('{}_{}_{}'.format(target,ag,flag), ag))
#     print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df, train, flag):
    '''
    统计feature
    注意理解group_feature的使用和效果
    '''
    if (flag == 'on_night') or (flag == 'on_day'): 
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')
        # return train
    
    
    if flag == "0":
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')  
    elif flag == "1":
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left') 
        # .nunique().to_dic
        hour_nunique = df.groupby('ship')['speed'].nunique().to_dict()
        train['speed_nunique_{}'.format(flag)] = train['ship'].map(hour_nunique)   
        hour_nunique = df.groupby('ship')['direction'].nunique().to_dict()
        train['direction_nunique_{}'.format(flag)] = 
        train['ship'].map(hour_nunique)  

    t = group_feature(df, 'ship','x',
    ['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','base_dis_diff',
    ['max','min','mean','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')

       
    train['x_max_x_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - 
    train['x_min_{}'.format(flag)]
    train['y_max_y_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - 
    train['y_min_{}'.format(flag)]
    train['y_max_x_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - 
    train['x_min_{}'.format(flag)]
    train['x_max_y_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - 
    train['y_min_{}'.format(flag)]
    train['slope_{}'.format(flag)] = train['y_max_y_min_{}'.format(flag)] / 
    np.where(train['x_max_x_min_{}'.format(flag)]==0, 0.001, 
    train['x_max_x_min_{}'.format(flag)])
    train['area_{}'.format(flag)] = train['x_max_x_min_{}'.format(flag)] * 
    train['y_max_y_min_{}'.format(flag)] 
    
    mode_hour = df.groupby('ship')['hour'].agg(lambda 
    x:x.value_counts().index[0]).to_dict()
    train['mode_hour_{}'.format(flag)] = train['ship'].map(mode_hour)
    train['slope_median_{}'.format(flag)] = train['y_median_{}'.format(flag)] / 
    np.where(train['x_median_{}'.format(flag)]==0, 0.001, 
    train['x_median_{}'.format(flag)])

    return train
data  = df.copy()
data.rename(columns={'id':'ship','v':'speed','dir':'direction'}, inplace=True)
# 去重
data_label = data.drop_duplicates(['ship'], keep = 'first')

data_1 = data[data['speed']==0]
data_2 = data[data['speed']!=0]
# df = data_1 
# train = data_label
# flag = "0"
# target = 'direction'
# aggs =['max','median','mean','std','skew']
data_label = extract_feature(data_1, data_label, "0")
data_label = extract_feature(data_2, data_label, "1")

data_1 = data[data['day_nig'] == 0]
data_2 = data[data['day_nig'] == 1]
data_label = extract_feature(data_1, data_label,"on_night")
data_label = extract_feature(data_2, data_label,"on_day")
data_label.rename(columns={'ship':'id','speed':'v','direction':'dir'},inplace=True)
new_cols = [i for i in data_label.columns if i not in df.columns]
df = df.merge(data_label[new_cols + ['id']], on='id',how='left')

df[new_cols].head()




temp = df.copy()
temp.rename(columns={'id':'ship','dir':'d'},inplace=True)

def coefficient_of_variation(x):
    x = x.values
    if np.mean(x) == 0:
        return 0
    return np.std(x) / np.mean(x)

def max_2(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[1]

def max_3(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[2]

def diff_abs_mean(x):  # 统计特征 deta绝对值均值
    return np.mean(np.abs(np.diff(x)))

f1 = pd.DataFrame()
for col in ['x', 'y', 'v', 'd']:
    features = temp.groupby('ship', as_index=False)[col].agg({
        '{}_min'.format(col): 'min',
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std',
        '{}_skew'.format(col): 'skew',
        '{}_sum'.format(col): 'sum',
        '{}_diff_abs_mean'.format(col): diff_abs_mean,
        '{}_mode'.format(col): lambda x: x.value_counts().index[0],
        '{}_coefficient_of_variation'.format(col): coefficient_of_variation,
        '{}_max2'.format(col): max_2,
        '{}_max3'.format(col): max_3
    })
    if f1.shape[0] == 0:
        f1 = features
    else:
        f1 = f1.merge(features, on='ship', how='left')

f1['x_max_x_min'] = f1['x_max'] - f1['x_min']
f1['y_max_y_min'] = f1['y_max'] - f1['y_min']
f1['y_max_x_min'] = f1['y_max'] - f1['x_min']
f1['x_max_y_min'] = f1['x_max'] - f1['y_min']
f1['slope'] = f1['y_max_y_min'] / np.where(f1['x_max_x_min'] == 0, 0.001, 
f1['x_max_x_min'])
f1['area'] = f1['x_max_x_min'] * f1['y_max_y_min']
f1['dis_max_min'] = (f1['x_max_x_min'] ** 2 + f1['y_max_y_min'] ** 2) ** 0.5
f1['dis_mean'] = (f1['x_mean'] ** 2 + f1['y_mean'] ** 2) ** 0.5
f1['area_d_dis_max_min'] = f1['area'] / f1['dis_max_min']

# 加速度
temp.sort_values(['ship', 'time'], ascending=True, inplace=True)
temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
temp['ynext'] = temp['ynext'].fillna(method='ffill')
temp['xnext'] = temp['xnext'].fillna(method='ffill')
temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
temp['timediff'] = np.abs(temp['timenext'] - temp['time'])
temp['a_y'] = temp.apply(lambda x: (x['ynext'] - x['y']) / 
x['timediff'].total_seconds(), axis=1)
temp['a_x'] = temp.apply(lambda x: (x['xnext'] - x['x']) / 
x['timediff'].total_seconds(), axis=1)
for col in ['a_y', 'a_x']:
    f2 = temp.groupby('ship', as_index=False)[col].agg({
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_min'.format(col): 'min',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std'})
    f1 = f1.merge(f2, on='ship', how='left')

# 曲率
temp['y_pre'] = temp.groupby('ship')['y'].shift(1)
temp['x_pre'] = temp.groupby('ship')['x'].shift(1)
temp['y_pre'] = temp['y_pre'].fillna(method='bfill')
temp['x_pre'] = temp['x_pre'].fillna(method='bfill')
temp['d_pre'] = ((temp['x'] - temp['x_pre']) ** 2 + (temp['y'] - temp['y_pre']) ** 
2) ** 0.5
temp['d_next'] = ((temp['xnext'] - temp['x']) ** 2 + (temp['ynext'] - temp['y']) ** 
2) ** 0.5
temp['d_pre_next'] = ((temp['xnext'] - temp['x_pre']) ** 2 + (temp['ynext'] - temp['y_pre']) ** 2) ** 0.5
temp['curvature'] = (temp['d_pre'] + temp['d_next']) / temp['d_pre_next']

f2 = temp.groupby('ship', as_index=False)['curvature'].agg({
    'curvature_max': 'max',
    'curvature_mean': 'mean',
    'curvature_min': 'min',
    'curvature_median': 'median',
    'curvature_std': 'std'})
f1 = f1.merge(f2, on='ship', how='left')
def traj_cbow_embedding(traj_data_corpus=None, embedding_size=70,
                        iters=40, min_count=3, window_size=25,
                        seed=9012, num_runs=5, word_feat="no_bin"):
    """CBOW embedding for trajectory data."""
    boat_id = traj_data_corpus['id'].unique()
    sentences, embedding_df_list, embedding_model_list = [], [], []
    for i in boat_id:
        traj = traj_data_corpus[traj_data_corpus['id']==i]
        sentences.append(traj[word_feat].values.tolist())

    print("\n@Start CBOW word embedding at {}".format(datetime.now()))
    print("-------------------------------------------")
    for i in tqdm(range(num_runs)):
        model = Word2Vec(sentences, vector_size = embedding_size, min_count = 
        min_count, workers= mp.cpu_count(), window=window_size, seed=seed,
         epochs = iters, sg=0)
        # Sentance vector
        embedding_vec = []
        for ind, seq in enumerate(sentences):
            seq_vec, word_count = 0, 0
            for word in seq:
                if word not in model:
                    continue
                else:
                    seq_vec += model[word]
                    word_count += 1
            if word_count == 0:
                embedding_vec.append(embedding_size * [0])
            else:
                embedding_vec.append(seq_vec / word_count)
        embedding_vec = np.array(embedding_vec)
        embedding_cbow_df = pd.DataFrame(embedding_vec, 
            columns=["embedding_cbow_{}_{}".format(word_feat, i) for i in 
            range(embedding_size)])
        embedding_cbow_df["id"] = boat_id
        embedding_df_list.append(embedding_cbow_df)
        embedding_model_list.append(model)
    print("-------------------------------------------")
    print("@End CBOW word embedding at {}".format(datetime.now()))
    return embedding_df_list, embedding_model_list
embedding_size=70

iters=70
min_count=3
window_size=25
num_runs=1

df_list, model_list = traj_cbow_embedding(df, embedding_size=embedding_size, iters=iters, min_count=min_count, window_size=window_size, seed=9012, num_runs=num_runs, word_feat="no_bin")

train_embedding_df_list = [d.reset_index(drop=True) for d in df_list]
fea = train_embedding_df_list[0]
fea = pd.DataFrame(fea)


pre_cols = df.columns
df = df.merge(fea,on='id',how='left')
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
boat_id = df['id'].unique()
total_embedding = pd.DataFrame(boat_id, columns=["id"])
traj_data = df[['v','dir','id']].rename(columns = {'v':'speed','dir':'direction'})
# Step 1: Construct the words
traj_data_corpus = []
traj_data["speed_str"] = traj_data["speed"].apply(lambda x: str(int(x*100)))
traj_data["direction_str"] = traj_data["direction"].apply(str)
traj_data["speed_dir_str"] = traj_data["speed_str"] + "_" + traj_data["direction_str"]
traj_data_corpus = traj_data[["id", "speed_str","direction_str", "speed_dir_str"]]
print("\n@Round 2 speed embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus, embedding_size=10, iters=40, min_count=3, window_size=25, seed=9102, num_runs=1, word_feat="speed_str")
speed_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_embedding, on="id", how="left")
print("\n@Round 2 direction embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus, embedding_size=12, iters=70, min_count=3, window_size=25, seed=9102, num_runs=1, word_feat="speed_dir_str")
speed_dir_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_dir_embedding, on="id", how="left")
boat_id = df['id'].unique()
total_embedding = pd.DataFrame(boat_id, columns=["id"])
traj_data = df[['v','dir','id']].rename(columns = {'v':'speed','dir':'direction'})
# Step 1: Construct the words
traj_data_corpus = []
traj_data["speed_str"] = traj_data["speed"].apply(lambda x: str(int(x*100)))
traj_data["direction_str"] = traj_data["direction"].apply(str)
traj_data["speed_dir_str"] = traj_data["speed_str"] + "_" + traj_data["direction_str"]
traj_data_corpus = traj_data[["id", "speed_str", "direction_str", "speed_dir_str"]]
print("\n@Round 2 speed embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus, embedding_size=10, 
iters=40, min_count=3, window_size=25, seed=9102, num_runs=1, 
word_feat="speed_str")
speed_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_embedding, on="id", how="left")
print("\n@Round 2 direction embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus, embedding_size=12, 
iters=70, min_count=3, window_size=25, seed=9102, num_runs=1, word_feat="speed_dir_str")
speed_dir_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_dir_embedding, on="id", 
how="left")
pre_cols = df.columns
df = df.merge(total_embedding,on='id',how='left')
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
class nmf_list(object):
    def __init__(self,data,by_name,to_list,nmf_n,top_n):
        self.data = data
        self.by_name = by_name
        self.to_list = to_list
        self.nmf_n = nmf_n
        self.top_n = top_n

    def run(self,tf_n):
        df_all = self.data.groupby(self.by_name)[self.to_list].apply(lambda x 
        :'|'.join(x)).reset_index()
        self.data =df_all.copy()

        print('bulid word_fre')
        # 词频的构建
        def word_fre(x):
            word_dict = []
            x = x.split('|')
            docs = []
            for doc in x:
                doc = doc.split()
                docs.append(doc)
                word_dict.extend(doc)
            word_dict = Counter(word_dict)
            new_word_dict = {}
            for key,value in word_dict.items():
                new_word_dict[key] = [value,0]
            del word_dict  
            del x
            for doc in docs:
                doc = Counter(doc)
                for word in doc.keys():
                    new_word_dict[word][1] += 1
            return new_word_dict 
        self.data['word_fre'] = self.data[self.to_list].apply(word_fre)

        print('bulid top_' + str(self.top_n))
        # 设定100个高频词
        def top_100(word_dict):
            return sorted(word_dict.items(),key = lambda x:(x[1][1],x[1]
            [0]),reverse = True)[:self.top_n]
        self.data['top_'+str(self.top_n)] = self.data['word_fre'].apply(top_100)
        def top_100_word(word_list):
            words = []
            for i in word_list:
                i = list(i)
                words.append(i[0])
            return words 
        self.data['top_'+str(self.top_n)+'_word'] = self.data['top_' + 
        str(self.top_n)].apply(top_100_word)
        # print('top_'+str(self.top_n)+'_word的shape')
        print(self.data.shape)

        word_list = []
        for i in self.data['top_'+str(self.top_n)+'_word'].values:
            word_list.extend(i)
        word_list = Counter(word_list)
        word_list = sorted(word_list.items(),key = lambda x:x[1],reverse = True)
        user_fre = []
        for i in word_list:
            i = list(i)
            user_fre.append(i[1]/self.data[self.by_name].nunique())
        stop_words = []
        for i,j in zip(word_list,user_fre):
            if j>0.5:
                i = list(i)
                stop_words.append(i[0])

        print('start title_feature')
        # 讲融合后的taglist当作一句话进行文本处理
        self.data['title_feature'] = self.data[self.to_list].apply(lambda x: 
        x.split('|'))
        self.data['title_feature'] = self.data['title_feature'].apply(lambda line: 
        [w for w in line if w not in stop_words])
        self.data['title_feature'] = self.data['title_feature'].apply(lambda x: ' 
        '.join(x))

        print('start NMF')
        # 使用tfidf对元素进行处理
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(tf_n,tf_n))
        tfidf = tfidf_vectorizer.fit_transform(self.data['title_feature'].values)
        #使用nmf算法，提取文本的主题分布
        text_nmf = NMF(n_components=self.nmf_n).fit_transform(tfidf)


        # 整理并输出文件
        name = [str(tf_n) + self.to_list + '_' +str(x) for x in 
        range(1,self.nmf_n+1)]
        tag_list = pd.DataFrame(text_nmf)
        print(tag_list.shape)
        tag_list.columns = name
        tag_list[self.by_name] = self.data[self.by_name]
        column_name = [self.by_name] + name
        tag_list = tag_list[column_name]
        return tag_list
    
data = df.copy()
data.rename(columns={'v':'speed','id':'ship'},inplace=True)
for j in range(1,4):
    print('********* {} *******'.format(j))
    for i in ['speed','x','y']:
        data[i + '_str'] = data[i].astype(str)
        nmf = nmf_list(data,'ship',i + '_str',8,2)
        nmf_a = nmf.run(j)
        nmf_a.rename(columns={'ship':'id'},inplace=True)
        data_label = data_label.merge(nmf_a,on = 'id',how = 'left')
a_adf
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Task3-特征工程

这块的代码有些问题。首先就是agg()函数新版的需要修改还有Word2Vec()新版里面没有iters换成了epochs，size替换成了vector_size，目前还没有跑成功，会遇到这个问题，请各位大佬帮帮忙代码如下# -*- coding: utf-8 -*-"""Created on Sat Apr 17 19:23:52 2021@author: 李"""import gcimport multiprocessing as mpimport osimport pickle
复制链接

扫一扫