C#ToLookup代替GroupBy实现分组统计功能

1 篇文章 0 订阅
1 篇文章 0 订阅

假设有20万的数据,需要根据来源链接实现第一级分组统计,再根据来源链接中的广告单元实现二级统计

广告数据实体:


public class AdStatisticsOutput
{
    public int Id { get; set; }
    public string SourceUrl { get; set; }
    public string Host { get; set; }
    public string Origin { get; set; }
    public string Referer { get; set; }
    public DateTime ReportTime { get; set; }
    public int AdUnitId { get; set; }
    public int AdMaterialId { get; set; }
    public string AdUnitName { get; set; }
    /// <summary>
    /// 上报类型,0:浏览,1:点击
    /// </summary>
    public int EscalationType { get; set; }
    public int AdTypeId { get; set; }
    public string AdTypeName { get; set; }
    public string SourceUrlArticleTitle { get; set; }
    public string SourceUrlArticleId { get; set; }
}

一级统计主表实体:

/// <summary>
/// 报表-链接
/// </summary>
[SugarTable(null, "报表-链接")]
public partial class AdStatisticsByArticle : EntityBase
{
    /// <summary>
    /// 统计日期
    /// </summary>
    [SugarColumn(ColumnDescription = "统计日期", IsNullable = true)]
    public DateTime StatisticsTime { get; set; }
    /// <summary>
    /// 广告单元Id
    /// </summary>
    [SugarColumn(ColumnDescription = "广告单元Id", IsNullable = true)]
    public int AdUnitId { get; set; }
    /// <summary>
    /// 广告单元Id
    /// </summary>
    [SugarColumn(ColumnDescription = "文章链接", Length = 1000, IsNullable = true)]
    public string SourceUrl { get; set; }
    /// <summary>
    /// 广告单元Id
    /// </summary>
    [SugarColumn(ColumnDescription = "文章链接标题", Length = 2000, IsNullable = true)]
    public string SourceUrlTitle { get; set; }
    /// <summary>
    /// 广告单元名称
    /// </summary>
    [SugarColumn(ColumnDescription = "文章内容Id", IsNullable = true)]
    public string SourceUrlArticleId { get; set; }
    /// <summary>
    /// 曝光量
    /// </summary>
    [SugarColumn(ColumnDescription = "曝光量", DefaultValue = "0")]
    public int TotalExposureCount { get; set; }
    /// <summary>
    /// 点击量
    /// </summary>
    [SugarColumn(ColumnDescription = "点击量", DefaultValue = "0")]
    public int TotalClickCount { get; set; }
    /// <summary>
    /// 点击率
    /// </summary>
    [SugarColumn(ColumnDescription = "点击率,%", DefaultValue = "0")]
    public decimal Rate { get; set; }

}

二级统计子表实体:

/// <summary>
/// 报表-链接详情
/// </summary>
[SugarTable(null, "报表-链接详情")]
public partial class AdStatisticsByArticleDetail : EntityBase
{
    /// <summary>
    /// 父级id
    /// </summary>
    [SugarColumn(ColumnDescription = "父级id", DefaultValue = "0")]
    public long ParentId { get; set; }
    /// <summary>
    /// 统计日期
    /// </summary>
    [SugarColumn(ColumnDescription = "统计日期", IsNullable = true)]
    public DateTime StatisticsTime { get; set; }
    /// <summary>
    /// 广告单元名称
    /// </summary>
    [SugarColumn(ColumnDescription = "广告单元id", IsNullable = true)]
    public int AdUnitId { get; set; }
    /// <summary>
    /// 广告单元名称
    /// </summary>
    [SugarColumn(ColumnDescription = "广告单元名称", IsNullable = true)]
    public string AdUnitName { get; set; }
    /// <summary>
    /// 广告单元名称
    /// </summary>
    [SugarColumn(ColumnDescription = "广告单元类型id", IsNullable = true)]
    public int AdTypeId { get; set; }
    /// <summary>
    /// 广告单元名称
    /// </summary>
    [SugarColumn(ColumnDescription = "广告单元类型名称", IsNullable = true)]
    public string AdTypeName { get; set; }
    /// <summary>
    /// 曝光量
    /// </summary>
    [SugarColumn(ColumnDescription = "曝光量", DefaultValue = "0")]
    public int ExposureCount { get; set; }
    /// <summary>
    /// 点击量
    /// </summary>
    [SugarColumn(ColumnDescription = "点击量", DefaultValue = "0")]
    public int ClickCount { get; set; }
    /// <summary>
    /// 点击率
    /// </summary>
    [SugarColumn(ColumnDescription = "点击率,%", DefaultValue = "0")]
    public decimal Rate { get; set; }

}

GroupBy方式实现:

public class GroupByToData(SqlSugarScope db, List<AdStatisticsOutput> source)
{
    Stopwatch stopwatch = Stopwatch.StartNew();
    try
    {
        //根据域名完成groupBy
        var groupByList = source.GroupBy(x => x.SourceUrl);
        var inslist = new List<AdStatisticsByArticle>();
        foreach (var item in groupByList)
        {
            var exposureCount = source.Where(x => x.SourceUrl == item.Key).Where(x => x.EscalationType == 0).Count();
            var clickCount = source.Where(x => x.SourceUrl == item.Key).Where(x => x.EscalationType == 1).Count();
            var ins = new AdStatisticsByArticle
            {
                StatisticsTime = DateTime.Now.Date,
                SourceUrl = item.Key,
                TotalExposureCount = exposureCount,
                TotalClickCount = clickCount,
                Rate = exposureCount > 0 ? Math.Round((decimal)clickCount / (decimal)exposureCount, 4) : 0,
                SourceUrlTitle = source.Where(x => x.SourceUrl == item.Key).Select(x => x.SourceUrlArticleTitle).FirstOrDefault(),
                SourceUrlArticleId = source.Where(x => x.SourceUrl == item.Key).Select(x => x.SourceUrlArticleId).FirstOrDefault()
            };
            //当前循环处理会有较大的性能损耗
            inslist.Add(ins);
        }
        //所有域名下的数据批量入主表数据
        var dbmainIds = await db.Insertable(inslist).ExecuteReturnPkListAsync<int>();
        //使用SqlSugar的批量插入返回批量插入的主键(当前实现方式用时很久能执行完毕)
        //批量插入后主表的ID和统计字段
        var childList = new List<AdStatisticsByArticleDetail>();
        for (int i = 0; i < dbmainIds.Count; i++)
        {
            //根据域名和主表ID完成子表数据统计
            var childListByDomainName = source.Where(x => x.SourceUrl == inslist[i].SourceUrl).ToList();
            var childListByDomainNameGroupBy = childListByDomainName.GroupBy(x => x.AdUnitId);

            foreach (var item in childListByDomainNameGroupBy)
            {
                string? adUnitName = childListByDomainName.Where(x => x.AdUnitId == item.Key).Select(x => x.AdUnitName).FirstOrDefault();
                var adTypeId = childListByDomainName.Where(x => x.AdUnitId == item.Key).Select(x => x.AdTypeId).FirstOrDefault();
                var adTypeName = childListByDomainName.Where(x => x.AdUnitId == item.Key).Select(x => x.AdTypeName).FirstOrDefault();
                var totalExposureCount = childListByDomainName.Where(x => x.AdUnitId == item.Key && x.EscalationType == 0).Count();
                var totalClickCount = childListByDomainName.Where(x => x.AdUnitId == item.Key && x.EscalationType == 1).Count();
                var child = new AdStatisticsByArticleDetail
                {
                    ParentId = dbmainIds[i],
                    StatisticsTime = DateTime.Now.Date,
                    AdUnitId = item.Key,
                    AdUnitName = adUnitName,
                    AdTypeId = adTypeId,
                    AdTypeName = adTypeName,
                    ExposureCount = totalExposureCount,
                    ClickCount = totalClickCount,
                    Rate = totalExposureCount > 0 ? Math.Round((decimal)totalClickCount / (decimal)totalExposureCount, 4) : 0
                };
                childList.Add(child);
            }
        }
        //处理子表数据
        await db.Insertable(childList).ExecuteCommandAsync();
        pr.PR(ServiceName, $"报表-来源链接 统计成功", 1);
        stopwatch.Stop();
        pr.PR(ServiceName, $"报表-来源链接 统计完成,耗时:{stopwatch.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"报表-来源链接 统计完成,耗时:{stopwatch.Elapsed.TotalSeconds}秒");
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message);
        pr.PR(ServiceName, $"报表-来源链接 统计失败,请查看日志", 1);
    }
}

 ToLookup方式实现:

private async Task BySourceUrl(SqlSugarScope db, List<AdStatisticsOutput> source)
{
    pr.PR(ServiceName, "统计 报表-来源链接", 1);
    Stopwatch stopwatch = Stopwatch.StartNew();
    try
    {
        //根据域名完成groupBy
        var groupByList = source.ToLookup(x => x.SourceUrl);

        var inslist = new List<AdStatisticsByArticle>();

        var exposuresource = source.Where(x => x.EscalationType == 0).ToLookup(x => x.SourceUrl);
        var clicksource = source.Where(x => x.EscalationType == 1).ToLookup(x => x.SourceUrl);

        #region 计算主表数据
        Stopwatch stopwatch1 = Stopwatch.StartNew();
        foreach (var item in groupByList)
        {
            var id = SnowFlakeSingle.Instance.NextId();
            var exposureCount = exposuresource[item.Key].Count();
            var clickCount = clicksource[item.Key].Count();
            var ins = new AdStatisticsByArticle
            {
                StatisticsTime = DateTime.Now.Date,
                SourceUrl = item.Key,
                TotalExposureCount = exposureCount,
                TotalClickCount = clickCount,
                Rate = exposureCount > 0 ? Math.Round((decimal)clickCount / (decimal)exposureCount, 4) : 0,
                SourceUrlTitle = item.Select(x => x.SourceUrlArticleTitle).FirstOrDefault(),
                SourceUrlArticleId = item.Select(x => x.SourceUrlArticleId).FirstOrDefault()
            };
            inslist.Add(ins);
        }
        stopwatch1.Stop();
        pr.PR(ServiceName, $"主表内存数据计算完成,耗时:{stopwatch1.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"主表内存数据计算完成,耗时:{stopwatch1.Elapsed.TotalSeconds}秒");
        #endregion

        #region 主表数据入库
        Stopwatch stopwatch2 = Stopwatch.StartNew();
        //所有域名下的数据批量入主表数据
        var dbmainIds = await db.Insertable(inslist).ExecuteReturnPkListAsync<int>();
        stopwatch2.Stop();
        pr.PR(ServiceName, $"主表插入数据完成,耗时:{stopwatch2.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"主表插入数据完成,耗时:{stopwatch2.Elapsed.TotalSeconds}秒");
        #endregion

        #region 计算子表数据  
        Stopwatch stopwatch3 = Stopwatch.StartNew();
        var childList = new List<AdStatisticsByArticleDetail>();
        for (int i = 0; i < dbmainIds.Count; i++)
        {
            //根据域名和主表ID完成子表数据统计
            //var childListByDomainName = source.Where(x => x.SourceUrl == inslist[i].SourceUrl).ToList();
            var childListByDomainName = groupByList[inslist[i].SourceUrl].ToList();

            var childListByDomainNameGroupBy = childListByDomainName.ToLookup(x => x.AdUnitId);

            foreach (var item in childListByDomainNameGroupBy)
            {
                string? adUnitName = item.Select(x => x.AdUnitName).FirstOrDefault();
                var adTypeId = item.Select(x => x.AdTypeId).FirstOrDefault();
                var adTypeName = item.Select(x => x.AdTypeName).FirstOrDefault();
                var totalExposureCount = item.Where(x => x.EscalationType == 0).Count();
                var totalClickCount = item.Where(x => x.EscalationType == 1).Count();

                var child = new AdStatisticsByArticleDetail
                {
                    ParentId = dbmainIds[i],
                    StatisticsTime = DateTime.Now.Date,
                    AdUnitId = item.Key,
                    AdUnitName = adUnitName,
                    AdTypeId = adTypeId,
                    AdTypeName = adTypeName,
                    ExposureCount = totalExposureCount,
                    ClickCount = totalClickCount,
                    Rate = totalExposureCount > 0 ? Math.Round((decimal)totalClickCount / (decimal)totalExposureCount, 4) : 0
                };
                childList.Add(child);
            }
        }
        stopwatch3.Stop();
        pr.PR(ServiceName, $"子表内存数据计算完成,耗时:{stopwatch3.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"子表内存数据计算完成,耗时:{stopwatch3.Elapsed.TotalSeconds}秒");
        #endregion

        #region 子表数据入库
        Stopwatch stopwatch4 = Stopwatch.StartNew();
        //处理子表数据
        await db.Insertable(childList).ExecuteCommandAsync();
        stopwatch4.Stop();
        pr.PR(ServiceName, $"子表数据入库完成,耗时:{stopwatch4.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"子表数据入库完成,耗时:{stopwatch4.Elapsed.TotalSeconds}秒");
        #endregion

        pr.PR(ServiceName, $"报表-来源链接 统计成功", 1);
        stopwatch.Stop();
        pr.PR(ServiceName, $"报表-来源链接 统计完成,耗时:{stopwatch.Elapsed.TotalSeconds}秒", 1);
        Log.Info($"报表-来源链接 统计完成,耗时:{stopwatch.Elapsed.TotalSeconds}秒");
    }
    catch (Exception ex)
    {
        Log.Error(ex.Message);
        pr.PR(ServiceName, $"报表-来源链接 统计失败,请查看日志", 1);
    }
}

两种实现方式在内存中处理效率相差很多(不是一个量级)

GroupBy方式实现用时

ToLookup方式实现用时

如果内存操作大批量数据,可替换为ToLookup处理

  • 4
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值