dataframe实践

最新推荐文章于 2023-04-16 01:29:36 发布

zn505119020

最新推荐文章于 2023-04-16 01:29:36 发布

阅读量276

点赞数

分类专栏：工作相关

本文链接：https://blog.csdn.net/zn505119020/article/details/78490429

版权

工作相关专栏收录该内容

136 篇文章 0 订阅

订阅专栏

"""
1：将抓取的红色标记字符串中，sold数字提取
2：end model 分组
3 按照年份分类汇总  ，汇总方式有：ebay_id 均值，sold和，个数最多

"""
import pandas as pd
from collections import Counter
import re
df = pd.DataFrame(pd.read_excel("make_model_市场年代1108.xlsx"))
#处理sold字段--------
dig_sold = pd.DataFrame((re.findall("(\d{1,}) Sold", str(x)) for x in df["sold"]),columns=["dig_sold"], index=df.index)
df = pd.merge(df,dig_sold,left_index=True , right_index= True) #按索引合并
df.loc[df["dig_sold"].isnull(),"dig_sold"] = 0
df.loc[:,"dig_sold"] = df["dig_sold"].astype("int") #强制转化为int
#---------------
end_models = df[["vio_end","model"]].drop_duplicates() #去重
result = []
for models in end_models.values:
    vio_end = models[0]
    model = models[1]
    print(model)
    subdf = df.loc[(df.model == model) & (df.vio_end == vio_end),["vio_end","model","ebay_id","dig_sold","ebay_end"]].dropna(how="any")
    subdf.loc[:, "ebay_end"] = subdf["ebay_end"].astype("int")  # 强制转化为int
    # print(subdf.info())
    #对年份分类汇总，并按照sold和计算，返回最大和的年份
    gsold = subdf.groupby("ebay_end",as_index=False)["dig_sold"].sum() #Dataframe
    sold_year = gsold.sort_values(by="dig_sold",ascending=False).values[0][0]
    #-----
    #返回ebay_id均值最小的年份
    gid = subdf.groupby("ebay_end", as_index= False)["ebay_id"].mean()
    id_year = int(gid.sort_values(by="ebay_id").values[0][0])
    # print(id_year)
    glen = subdf.groupby("ebay_end", as_index = False)["ebay_end"].agg([len])
    # print(glen)
    glen= glen.sort_values(by="len",ascending=False)
    first_year = glen.index[0]
    first_count = round(glen.values[0][0] / len(subdf),2)
    years =[vio_end , model ,len(subdf), sold_year , id_year,first_year , first_count ]
    if len(glen) > 1 :
        second_year = glen.index[1]
        second_count = round(glen.values[1][0] / len(subdf),2)
        years.extend([second_year,second_count])
    # print(first_year , first_count , second_year, second_count)
    result.append(years)
df = pd.DataFrame(result,columns=["vio_end","model","sum","sold预测","id_预测","统计预测1","统计1比率","统计预测2","统计2比率"])
df.to_excel("make_model_ebayno_市场统计1108.xlsx",index=False)

import pandas  as pd
df = pd.DataFrame(pd.read_excel("系统sku的fitment信息1108.xlsx"))
geration_df = pd.DataFrame(pd.read_excel("车型最后代系整理1030.xlsx"))
vio_model = df["model"]
ge_models = geration_df["model"].astype(str) #更改数据类型
print(geration_df.info())
print(df.info())
df.loc[df["product_sku"].str.find("RN")!= -1 , "note"] = "通用车型"   # 在字符串中查找子串
df.loc[df["vio_end"] < 2013, "note"] = "2013前"
nulldf = df[df["note"].isnull()] #提取为空的， 未处理的
vio_modles = nulldf[["model","vio_end"]].drop_duplicates().sort_values(by="model").values #删除重复项，并按列值排序
for end_model in vio_modles:
    model = end_model[0]
    vio_end = end_model[1]
    subdf = nulldf[(df.model == model) & (df.vio_end == vio_end)]
    index = subdf.index
    strmodel = str(model)
    if strmodel not in ge_models.values:
        print(model)
        print("nomodel")
        df.loc[index,"note"] = "no车型"
        continue
    ge_start = geration_df.loc[geration_df.model.astype(str) == strmodel , "start"].values[0]
    ge_end = geration_df.loc[geration_df.model.astype(str) == strmodel, "end"].values[0]
    # print(ge_start , ge_end)
    df.loc[index,"最后代系"]= "-".join([str(ge_start),str(ge_end)])
    if  (vio_end == ge_end):
        df.loc[index,"note"] ="同代"
    elif vio_end >= ge_start and vio_end < ge_end:
        df.loc[index,"note"]="同代扩展"
    elif vio_end < ge_start:
        if vio_end + 1 == ge_start:
            df.loc[index,"note"]="重启一代"
        else:
            df.loc[index,"note"]="前代扩展"
df.to_excel("晴雨挡最后代系分析1108.xlsx",columns=["product_id","product_sku","product_name","make","model","vio_start","vio_end","note","最后代系"], index = False)

zn505119020

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
dataframe实践

"""1：将抓取的红色标记字符串中，sold数字提取2：end model 分组3 按照年份分类汇总，汇总方式有：ebay_id 均值，sold和，个数最多"""import pandas as pdfrom collections import Counterimport redf = pd.DataFrame(pd.read_excel("make_model_市场年代1
复制链接

扫一扫