1. min_max_scale
def min_max_scale(value,city_min, city_max):
if value<= city_min:
return 0
elif value >= city_max:
return 1
else:
return (value-city_min)/(city_max-city_min)
2. list 交集 差集 并集
# 获取两个list 的交集
list(set(a).intersection(set(b)))
# 获取两个list 的并集
list(set(a).union(set(b)))
# 获取两个 list 的差集
list(set(b).difference(set(a))) # b中有而a中没有的
3. dataframe isin
df_filter = df[df['geohash7'].isin(city_hashls)]
4. pivot table
def coffee_competitor(city_name,timelines):
coffee_dict = {'星巴克': 'coffee_1', 'luckin': 'coffee_2', 'costa': 'coffee_3', 'Manner': 'coffee_4', \
'贝瑞咖啡': 'coffee_5', 'Peet': 'coffee_6', 'Seesaw': 'coffee_7', '迪欧咖啡': 'coffee_8', \
'ZOOCOFFEE': 'coffee_9', 'Arabica': 'coffee_10', '上岛咖啡': 'coffee_11', 'Mstand': 'coffee_12'}
coffee_list = list(coffee_dict.keys())
coffee_number_list = list(coffee_dict.values())
coffee_ls = ('星巴克','luckin','costa','Manner','贝瑞咖啡','Peet','Seesaw','迪欧咖啡','ZOOCOFFEE','Arabica','上岛咖啡','Mstand')
# detail
sql = f"""SELECT distinct gaode_polygon[0]::float as gd_lng,gaode_polygon[1]::float as gd_lat ,brand,name,
st_geohash(st_setsrid(st_geomfromtext('POINT('|| gaode_polygon[0]::decimal ||' '|| gaode_polygon[1]::decimal || ')'), 4326),7) as geohash7
from ads_familymart.familymart_competitors
where city_name= '{city_name}' and category_name like '%咖啡%'
and timelines='{timelines}' and brand in {coffee_ls} """
df_gaode_data = connect_sql_poi(sql)
#竞品所在网格
df_gaode_data['hashcode_round'] = df_gaode_data['geohash7'].apply(lambda x: find_neighbors(x))
# df around
df_around = get_around_df(df_gaode_data)
# 单网格count
brand_count = pd.pivot_table(df_gaode_data[['brand','geohash7','name']],index=["geohash7"],values=["name"],
columns=["brand"],aggfunc='count').fillna(0)
brand_count.columns = brand_count.columns.droplevel(0)
brand_count = brand_count.reset_index()
print(len(brand_count),'brand_count',city_name,timelines)
#网格join
for brand in coffee_list:
try:
df_around[brand] = df_around['hashcode_round'].apply(lambda x: get_fast_food_count(x,brand_count,brand))
except:
df_around[brand] = 0.0
#dict
del df_around['hashcode_round']
#cols
df_around.columns = ['geohash7']+coffee_number_list
df_around["coffee_total" ] = df_around.drop(columns=['geohash7']).apply(lambda x: x.sum(), axis=1)
print(len(df_around),'df_around',city_name,timelines)
return df_around
5. sort_values
df_input['revenue_predict'] = lgb_model.predict(df_input.drop(columns=['chnl_code']))
df_test_predict = df_input[['chnl_code', 'month', 'revenue_predict']].sort_values(
by=['chnl_code', 'month']).\
set_index(['chnl_code', 'month']).revenue_predict.unstack().reset_index()
df_test_predict.columns = ['chnl_code', 'month_1', 'month_2']
df_test_predict['model_predict'] = df_test_predict[['month_1', 'month_2']].apply(lambda x: x.mean(), axis=1)