数据处理与分析：从数据预处理到决策制定-CSDN博客

本文链接：https://blog.csdn.net/weixin_43220533/article/details/121222936

1. min_max_scale

def min_max_scale(value,city_min, city_max):
    if value<= city_min:
        return 0
    elif value >= city_max:
        return 1
    else:
        return (value-city_min)/(city_max-city_min)

2. list 交集差集并集

# 获取两个list 的交集
list(set(a).intersection(set(b)))

# 获取两个list 的并集
list(set(a).union(set(b)))

# 获取两个 list 的差集
list(set(b).difference(set(a))) # b中有而a中没有的

3. dataframe isin

 df_filter = df[df['geohash7'].isin(city_hashls)]

4. pivot table

def coffee_competitor(city_name,timelines):
    coffee_dict = {'星巴克': 'coffee_1', 'luckin': 'coffee_2', 'costa': 'coffee_3', 'Manner': 'coffee_4', \
                     '贝瑞咖啡': 'coffee_5', 'Peet': 'coffee_6', 'Seesaw': 'coffee_7', '迪欧咖啡': 'coffee_8', \
                     'ZOOCOFFEE': 'coffee_9', 'Arabica': 'coffee_10', '上岛咖啡': 'coffee_11', 'Mstand': 'coffee_12'}

    coffee_list = list(coffee_dict.keys())
    coffee_number_list = list(coffee_dict.values())
    coffee_ls = ('星巴克','luckin','costa','Manner','贝瑞咖啡','Peet','Seesaw','迪欧咖啡','ZOOCOFFEE','Arabica','上岛咖啡','Mstand')

    # detail
    sql = f"""SELECT distinct gaode_polygon[0]::float as gd_lng,gaode_polygon[1]::float as gd_lat ,brand,name,
    st_geohash(st_setsrid(st_geomfromtext('POINT('|| gaode_polygon[0]::decimal ||' '|| gaode_polygon[1]::decimal || ')'), 4326),7) as geohash7
    from ads_familymart.familymart_competitors 
    where city_name= '{city_name}' and category_name like '%咖啡%' 
    and timelines='{timelines}' and brand in {coffee_ls} """
    df_gaode_data = connect_sql_poi(sql)
    #竞品所在网格
    df_gaode_data['hashcode_round'] = df_gaode_data['geohash7'].apply(lambda x: find_neighbors(x))
    
    # df around
    df_around = get_around_df(df_gaode_data) 
    # 单网格count
    brand_count = pd.pivot_table(df_gaode_data[['brand','geohash7','name']],index=["geohash7"],values=["name"],
               columns=["brand"],aggfunc='count').fillna(0)
    brand_count.columns = brand_count.columns.droplevel(0)
    brand_count = brand_count.reset_index()
    print(len(brand_count),'brand_count',city_name,timelines)
    #网格join
    for brand in coffee_list:
        try:
            df_around[brand] = df_around['hashcode_round'].apply(lambda x: get_fast_food_count(x,brand_count,brand))
        except:
            df_around[brand] = 0.0
   
    #dict
    del df_around['hashcode_round']
    #cols
    df_around.columns = ['geohash7']+coffee_number_list
    df_around["coffee_total" ] = df_around.drop(columns=['geohash7']).apply(lambda x: x.sum(), axis=1)
    print(len(df_around),'df_around',city_name,timelines)
 
    return df_around

5. sort_values

df_input['revenue_predict'] = lgb_model.predict(df_input.drop(columns=['chnl_code']))
    
df_test_predict = df_input[['chnl_code', 'month', 'revenue_predict']].sort_values(
        by=['chnl_code', 'month']).\
        set_index(['chnl_code', 'month']).revenue_predict.unstack().reset_index()

df_test_predict.columns = ['chnl_code', 'month_1', 'month_2']

df_test_predict['model_predict'] = df_test_predict[['month_1', 'month_2']].apply(lambda x: x.mean(), axis=1)

常用python数据处理