dataframe实践



"""
1:将抓取的红色标记字符串中,sold数字提取
2:end model 分组
3 按照年份分类汇总  ,汇总方式有:ebay_id 均值,sold和,个数最多

"""
import pandas as pd
from collections import Counter
import re
df = pd.DataFrame(pd.read_excel("make_model_市场年代1108.xlsx"))
#处理sold字段--------
dig_sold = pd.DataFrame((re.findall("(\d{1,}) Sold", str(x)) for x in df["sold"]),columns=["dig_sold"], index=df.index)
df = pd.merge(df,dig_sold,left_index=True , right_index= True) #按索引合并
df.loc[df["dig_sold"].isnull(),"dig_sold"] = 0
df.loc[:,"dig_sold"] = df["dig_sold"].astype("int") #强制转化为int
#---------------
end_models = df[["vio_end","model"]].drop_duplicates() #去重
result = []
for models in end_models.values:
    vio_end = models[0]
    model = models[1]
    print(model)
    subdf = df.loc[(df.model == model) & (df.vio_end == vio_end),["vio_end","model","ebay_id","dig_sold","ebay_end"]].dropna(how="any")
    subdf.loc[:, "ebay_end"] = subdf["ebay_end"].astype("int")  # 强制转化为int
    # print(subdf.info())
    #对年份分类汇总,并按照sold和计算,返回最大和的年份
    gsold = subdf.groupby("ebay_end",as_index=False)["dig_sold"].sum() #Dataframe
    sold_year = gsold.sort_values(by="dig_sold",ascending=False).values[0][0]
    #-----
    #返回ebay_id均值最小的年份
    gid = subdf.groupby("ebay_end", as_index= False)["ebay_id"].mean()
    id_year = int(gid.sort_values(by="ebay_id").values[0][0])
    # print(id_year)
    glen = subdf.groupby("ebay_end", as_index = False)["ebay_end"].agg([len])
    # print(glen)
    glen= glen.sort_values(by="len",ascending=False)
    first_year = glen.index[0]
    first_count = round(glen.values[0][0] / len(subdf),2)
    years =[vio_end , model ,len(subdf), sold_year , id_year,first_year , first_count ]
    if len(glen) > 1 :
        second_year = glen.index[1]
        second_count = round(glen.values[1][0] / len(subdf),2)
        years.extend([second_year,second_count])
    # print(first_year , first_count , second_year, second_count)
    result.append(years)
df = pd.DataFrame(result,columns=["vio_end","model","sum","sold预测","id_预测","统计预测1","统计1比率","统计预测2","统计2比率"])
df.to_excel("make_model_ebayno_市场统计1108.xlsx",index=False)


import pandas  as pd
df = pd.DataFrame(pd.read_excel("系统sku的fitment信息1108.xlsx"))
geration_df = pd.DataFrame(pd.read_excel("车型最后代系整理1030.xlsx"))
vio_model = df["model"]
ge_models = geration_df["model"].astype(str) #更改数据类型
print(geration_df.info())
print(df.info())
df.loc[df["product_sku"].str.find("RN")!= -1 , "note"] = "通用车型"   # 在字符串中查找子串
df.loc[df["vio_end"] < 2013, "note"] = "2013前"
nulldf = df[df["note"].isnull()] #提取为空的, 未处理的
vio_modles = nulldf[["model","vio_end"]].drop_duplicates().sort_values(by="model").values #删除重复项,并按列值排序
for end_model in vio_modles:
    model = end_model[0]
    vio_end = end_model[1]
    subdf = nulldf[(df.model == model) & (df.vio_end == vio_end)]
    index = subdf.index
    strmodel = str(model)
    if strmodel not in ge_models.values:
        print(model)
        print("nomodel")
        df.loc[index,"note"] = "no车型"
        continue
    ge_start = geration_df.loc[geration_df.model.astype(str) == strmodel , "start"].values[0]
    ge_end = geration_df.loc[geration_df.model.astype(str) == strmodel, "end"].values[0]
    # print(ge_start , ge_end)
    df.loc[index,"最后代系"]= "-".join([str(ge_start),str(ge_end)])
    if  (vio_end == ge_end):
        df.loc[index,"note"] ="同代"
    elif vio_end >= ge_start and vio_end < ge_end:
        df.loc[index,"note"]="同代扩展"
    elif vio_end < ge_start:
        if vio_end + 1 == ge_start:
            df.loc[index,"note"]="重启一代"
        else:
            df.loc[index,"note"]="前代扩展"
df.to_excel("晴雨挡最后代系分析1108.xlsx",columns=["product_id","product_sku","product_name","make","model","vio_start","vio_end","note","最后代系"], index = False)





  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值