"""
1:将抓取的红色标记字符串中,sold数字提取
2:end model 分组
3 按照年份分类汇总 ,汇总方式有:ebay_id 均值,sold和,个数最多
"""
import pandas as pd
from collections import Counter
import re
df = pd.DataFrame(pd.read_excel("make_model_市场年代1108.xlsx"))
#处理sold字段--------
dig_sold = pd.DataFrame((re.findall("(\d{1,}) Sold", str(x)) for x in df["sold"]),columns=["dig_sold"], index=df.index)
df = pd.merge(df,dig_sold,left_index=True , right_index= True) #按索引合并
df.loc[df["dig_sold"].isnull(),"dig_sold"] = 0
df.loc[:,"dig_sold"] = df["dig_sold"].astype("int") #强制转化为int
#---------------
end_models = df[["vio_end","model"]].drop_duplicates() #去重
result = []
for models in end_models.values:
vio_end = models[0]
model = models[1]
print(model)
subdf = df.loc[(df.model == model) & (df.vio_end == vio_end),["vio_end","model","ebay_id","dig_sold","ebay_end"]].dropna(how="any")
subdf.loc[:, "ebay_end"] = subdf["ebay_end"].astype("int") # 强制转化为int
# print(subdf.info())
#对年份分类汇总,并按照sold和计算,返回最大和的年份
gsold = subdf.groupby("ebay_end",as_index=False)["dig_sold"].sum() #Dataframe
sold_year = gsold.sort_values(by="dig_sold",ascending=False).values[0][0]
#-----
#返回ebay_id均值最小的年份
gid = subdf.groupby("ebay_end", as_index= False)["ebay_id"].mean()
id_year = int(gid.sort_values(by="ebay_id").values[0][0])
# print(id_year)
glen = subdf.groupby("ebay_end", as_index = False)["ebay_end"].agg([len])
# print(glen)
glen= glen.sort_values(by="len",ascending=False)
first_year = glen.index[0]
first_count = round(glen.values[0][0] / len(subdf),2)
years =[vio_end , model ,len(subdf), sold_year , id_year,first_year , first_count ]
if len(glen) > 1 :
second_year = glen.index[1]
second_count = round(glen.values[1][0] / len(subdf),2)
years.extend([second_year,second_count])
# print(first_year , first_count , second_year, second_count)
result.append(years)
df = pd.DataFrame(result,columns=["vio_end","model","sum","sold预测","id_预测","统计预测1","统计1比率","统计预测2","统计2比率"])
df.to_excel("make_model_ebayno_市场统计1108.xlsx",index=False)
import pandas as pd
df = pd.DataFrame(pd.read_excel("系统sku的fitment信息1108.xlsx"))
geration_df = pd.DataFrame(pd.read_excel("车型最后代系整理1030.xlsx"))
vio_model = df["model"]
ge_models = geration_df["model"].astype(str) #更改数据类型
print(geration_df.info())
print(df.info())
df.loc[df["product_sku"].str.find("RN")!= -1 , "note"] = "通用车型" # 在字符串中查找子串
df.loc[df["vio_end"] < 2013, "note"] = "2013前"
nulldf = df[df["note"].isnull()] #提取为空的, 未处理的
vio_modles = nulldf[["model","vio_end"]].drop_duplicates().sort_values(by="model").values #删除重复项,并按列值排序
for end_model in vio_modles:
model = end_model[0]
vio_end = end_model[1]
subdf = nulldf[(df.model == model) & (df.vio_end == vio_end)]
index = subdf.index
strmodel = str(model)
if strmodel not in ge_models.values:
print(model)
print("nomodel")
df.loc[index,"note"] = "no车型"
continue
ge_start = geration_df.loc[geration_df.model.astype(str) == strmodel , "start"].values[0]
ge_end = geration_df.loc[geration_df.model.astype(str) == strmodel, "end"].values[0]
# print(ge_start , ge_end)
df.loc[index,"最后代系"]= "-".join([str(ge_start),str(ge_end)])
if (vio_end == ge_end):
df.loc[index,"note"] ="同代"
elif vio_end >= ge_start and vio_end < ge_end:
df.loc[index,"note"]="同代扩展"
elif vio_end < ge_start:
if vio_end + 1 == ge_start:
df.loc[index,"note"]="重启一代"
else:
df.loc[index,"note"]="前代扩展"
df.to_excel("晴雨挡最后代系分析1108.xlsx",columns=["product_id","product_sku","product_name","make","model","vio_start","vio_end","note","最后代系"], index = False)