最近有个项目的抽样也是够恶心,原始数据表包含一张全的公司list(无重复公司)。交给Ops Team去根据它抽数。
接着对方返回一个excel 包含母list 和 多个data group sheet,每个data group里面就是抽到的,在指定时间段内的数据。
在被抽到的data group 中,总共抽取30个公司的数据。要求每个data group 都要random到两个记录。剩下的部分随机挑公司。
另外还要在过万的记录里面随机抽样30个公司,是没有任何数据的。去audit missing case。
各data group之间复制粘贴,还要考虑有些公司可能在各个data group都有数据。想来想去,觉得还是来个tool吧。
下图是拿到的raw data list样式
母list一万多条。在家测试共用去了26秒。每个页面
下图那些flag就是被抽中的了
下面上代码。
Option Explicit
Sub Summarize()
Dim WrkSht As Worksheet
Dim ShtNew As Worksheet
Dim ShtCom As Worksheet
Dim Rng As Range
Dim k As Integer
Dim k1 As Integer
Dim k2 As Integer
Dim kk As Integer
Dim yy As Integer
Dim Row1 As Integer
Dim Zebra As Integer
Dim Flag As Integer
Flag = 1
Dim CID As String
Dim arrData
Dim dataCount As Integer
dataCount = 0
Dim i As Integer
ReDim arrData(1 To 1)
Dim d As Object
Set d = CreateObject("scripting.dictionary")
Application.ScreenUpdating = False
Set ShtNew = Worksheets.Add(After:=Worksheets(Worksheets.Count))
ShtNew.Name = "AccuracyList"
'Accuracy List
For Each WrkSht In Worksheets
If Not WrkSht Is ShtNew And WrkSht.Name <> "CompanyList" Then
kk = WrkSht.Range("A1").CurrentRegion.Columns.Count
'MsgBox kk
yy = WrkSht.Range("A1").CurrentRegion.Rows.Count
'MsgBox yy
For k = 1 To kk
If WrkSht.Cells(1, k).Value = "Company Id" Then
WrkSht.Range(WrkSht.Cells(2, k), WrkSht.Cells(yy, k)).Copy ShtNew.Range("A" & ShtNew.Range("A56565").End(3).Row + 1)
'note = note + yy
Exit For
End If
Next k
End If
Next
ShtNew.Cells(1, 1).Value = "Company Id"
ShtNew.Cells(1, 1).CurrentRegion.RemoveDuplicates Columns:=1, Header:=xlYes
ShtNew.Cells(1, 2).Value = "RAND"
yy = ShtNew.Range("A1").CurrentRegion.Rows.Count
For k = 2 To yy
ShtNew.Cells(k, 2).Value = Rnd
Next k
ShtNew.Range("A:B").Sort Columns(2), xlAscending, Header:=xlYes
'End of Accuracy List
'Completeness List
Set ShtCom = Worksheets.Add(After:=Worksheets(Worksheets.Count))
ShtCom.Name = "CompleteList"
For Each WrkSht In Worksheets
If WrkSht.Name = "CompanyList" Then
WrkSht.Range("A1").CurrentRegion.Copy ShtCom.Range("A1")
Exit For
End If
Next
kk = ShtCom.Range("A1").CurrentRegion.Columns.Count ' The total number of columns
For k = 1 To kk
If ShtCom.Cells(1, k).Value = "CompanyId" Then
k1 = k ' the column named Company Id
Exit For
End If
Next k
For k = 1 To ShtNew.Range("A1").CurrentRegion.Columns.Count
If ShtNew.Cells(1, k).Value = "Company Id" Then
k2 = k ' the column named Company Id
Exit For
End If
Next
kk = ShtCom.Range("A1").CurrentRegion.Rows.Count ' The total number of rows
yy = ShtCom.Range("A1").CurrentRegion.Columns.Count 'The total number of columns
Cells(1, yy + 1).Value = "Sequence"
For Row1 = 2 To ShtNew.Range("A1").CurrentRegion.Rows.Count
CID = ShtNew.Cells(Row1, k2).Value
Set Rng = ShtCom.Range(ShtCom.Cells(1, k1), ShtCom.Cells(kk, k1)).Find(CID, lookat:=xlWhole)
If Not Rng Is Nothing Then
Cells(Rng.Row, yy + 1).Value = 1
End If
Next Row1
ShtCom.Range(Cells(1, 1), Cells(kk, yy + 1)).Sort Columns(yy + 1), xlDescending, Header:=xlYes
Range(Cells(2, yy + 1), Cells(Columns(yy + 1).End(xlDown).Row, yy + 1)).EntireRow.Delete
ShtCom.Columns(yy + 1).Delete
ShtCom.Cells(1, yy + 1).Value = "RAND"
kk = ShtCom.Range("A1").CurrentRegion.Rows.Count ' The total number of rows
For k = 2 To kk
Cells(k, yy + 1).Value = Rnd
Next k
ShtCom.Range(Cells(1, 1), Cells(kk, yy + 1)).Sort Columns(yy + 1), xlAscending, Header:=xlYes
'CompleteList End
'Accuracy Sampling
For Each WrkSht In Worksheets
If Not WrkSht Is ShtNew And WrkSht.Name <> "CompanyList" And Not WrkSht Is ShtCom Then
kk = WrkSht.Range("A1").CurrentRegion.Columns.Count
'MsgBox kk
yy = WrkSht.Range("A1").CurrentRegion.Rows.Count
'MsgBox yy
For k = 1 To kk
If WrkSht.Cells(1, k).Value = "Company Id" Then
k1 = k
Exit For
End If
Next k
If k = kk Then
MsgBox "No Company Id Column!"
Exit Sub
End If
If yy > 3 Then
WrkSht.Cells(1, kk + 1).Value = "RAND"
For k = 2 To yy
WrkSht.Cells(k, kk + 1) = Rnd
Next k
WrkSht.Range(WrkSht.Cells(1, 1), WrkSht.Cells(yy, kk + 1)).Sort WrkSht.Columns(kk + 1), xlAscending, Header:=xlYes
With WrkSht
dataCount = dataCount + 1
ReDim Preserve arrData(1 To dataCount)
arrData(dataCount) = .Cells(2, k1).Value
'MsgBox arrData(dataCount)
dataCount = dataCount + 1
ReDim Preserve arrData(1 To dataCount)
arrData(dataCount) = .Cells(3, k1).Value
'MsgBox arrData(dataCount)
End With
ElseIf yy = 3 Then
With WrkSht
dataCount = dataCount + 1
ReDim Preserve arrData(1 To dataCount)
arrData(dataCount) = .Cells(2, k1).Value
dataCount = dataCount + 1
ReDim Preserve arrData(1 To dataCount)
arrData(dataCount) = .Cells(3, k1).Value
End With
ElseIf yy = 2 Then
With WrkSht
dataCount = dataCount + 1
ReDim Preserve arrData(1 To dataCount)
arrData(dataCount) = .Cells(2, k1).Value
End With
Else
MsgBox "Error"
Exit Sub
End If
End If
Next
For i = 1 To UBound(arrData)
d(arrData(i)) = d(arrData(i)) + 1
Next i
Zebra = 30 - d.Count
ReDim Preserve arrData(1 To 31)
If Zebra <> 0 Then
k = 1
While Zebra >= 0
arrData(d.Count + 1) = ShtNew.Cells(k + 1, 1)
For i = 1 To UBound(arrData)
d(arrData(i)) = d(arrData(i)) + 1
Next i
k = k + 1
Zebra = 30 - d.Count
Wend
End If
ShtNew.Range("D2").Resize(UBound(arrData), 1).Value = WorksheetFunction.Transpose(arrData)
'Accuracy Sampling Complete
'Pick them up
For Each WrkSht In Worksheets
If Not WrkSht Is ShtNew And WrkSht.Name <> "CompanyList" And Not WrkSht Is ShtCom Then
kk = WrkSht.Range("A1").CurrentRegion.Columns.Count ' The total number of columns
yy = WrkSht.Range("A1").CurrentRegion.Rows.Count ' The total number of rows
WrkSht.Cells(1, kk + 1).Value = "Flag"
For k = 1 To kk
If ShtCom.Cells(1, k).Value = "Company Id" Then
k1 = k ' the column named Company Id
Exit For
End If
Next k
For i = 1 To UBound(arrData)
For k = 2 To yy
If WrkSht.Cells(k, k1).Value = arrData(i) Then
WrkSht.Cells(k, kk + 1).Value = Flag
End If
Next k
Flag = Flag + 1
Next i
End If
WrkSht.Range(WrkSht.Cells(1, 1), WrkSht.Cells(yy, kk + 1)).Sort WrkSht.Columns(kk + 1), xlDescending, Header:=xlYes
Next
'End of pick-up
Set d = Nothing
Application.ScreenUpdating = True
End Sub
结束语:再一次的感谢 组小牛同学,当我告诉他我用union range法去实现删除区域时,他提醒我应该去尝试排序再删除。
实验表明,同样家里的土冒机器的环境下,union_range法39秒,排序删除法19秒。
另外还有就是使用union 的时候,里面不可以有设置为nothing的区块或者没有定义的区块。