昨天在研究C#的定时器的过程中,发现 Delegate的效率比直接调用一个函数的效率要低不少,今天研究了一下,的确如此。在网上查了一下
http://www.cnblogs.com/sumtec/archive/2004/05/23/11025.aspx
也写了一个程序进行测试,但给出的程序不是太合理,因为在调用接口注册的函数时使用了for循环,操作明显多于delegate测试函数,所以执行结果反而是delegate比接口调用要快很多,于是我又写了如下程序进行测试:
// Micro-benchmark comparing the cost of invoking a method through a delegate
// (event) with the cost of calling it directly / through an interface.
// Requires `using System;` (Console, DateTime, TimeSpan).

/// <summary>
/// Exposes the empty work method that every call path in the benchmark invokes.
/// </summary>
interface IVoidMethod
{
    void WorkMethod();
}

/// <summary>
/// A benchmark case that can be executed.
/// </summary>
interface IDelegateTester
{
    void Run();
}

/// <summary>
/// Base class of the benchmark cases. It registers <see cref="WorkMethod"/> on the
/// <c>task</c> event a given number of times and times <see cref="CallDelegate"/>
/// against the subclass-specific <see cref="CallFunction"/>.
/// </summary>
abstract class DelegateTester : IDelegateTester, IVoidMethod
{
    private int _multiCast;
    private long _iteration;

    protected delegate void TaskHandler();
    protected event TaskHandler task;

    /// <summary>
    /// Number of handlers reported by <see cref="Run"/>. The handlers themselves are
    /// registered only in the constructor; assigning this property does not change them.
    /// </summary>
    public int MultiCast
    {
        get { return _multiCast; }
        set { _multiCast = value; }
    }

    /// <summary>
    /// Number of times each call path is executed by <see cref="Run"/>.
    /// </summary>
    public long Iteration
    {
        get { return _iteration; }
        set { _iteration = value; }
    }

    /// <summary>
    /// Stores the settings and subscribes <see cref="WorkMethod"/> to the event
    /// <paramref name="multicast"/> times.
    /// </summary>
    /// <param name="multicast">How many handlers to register on the event.</param>
    /// <param name="iteration">How many times each timed loop runs.</param>
    protected DelegateTester(int multicast, long iteration)
    {
        _multiCast = multicast;
        _iteration = iteration;

        while (multicast-- > 0)
        {
            task += new TaskHandler(WorkMethod);
        }
    }

    /// <summary>
    /// Invokes all registered handlers through the event's delegate.
    /// </summary>
    protected void CallDelegate()
    {
        // The null check is intentionally disabled so that the extra branch does not
        // influence the measurement. As a consequence this throws if no handler was
        // registered (multicast <= 0 in the constructor).
        //if (task != null)
        {
            task();
        }
    }

    /// <summary>
    /// The non-delegate call path that <see cref="Run"/> compares against.
    /// </summary>
    protected abstract void CallFunction();

    /// <summary>
    /// Work method for the test; intentionally empty so only call overhead is measured.
    /// </summary>
    public virtual void WorkMethod()
    {
    }

    #region IDelegateTester Members

    /// <summary>
    /// Runs the delegate loop and then the function loop, printing a timestamp before
    /// and after each and finally the elapsed ticks of both plus their ratio.
    /// </summary>
    public void Run()
    {
        Console.WriteLine("\n Delgate Test(Multicast={0}, iteration={1}", MultiCast, Iteration);

        DateTime t0 = DateTime.Now;
        Console.WriteLine(t0.ToString("mm:ss.ffff\t:") + "Runing delegate test ...");
        // NOTE: the counter is an int while _iteration is a long; the iteration count
        // must not exceed int.MaxValue or the counter wraps and the loop never ends.
        for (int i = 0; i < _iteration; i++)
        {
            CallDelegate();
        }

        DateTime t1 = DateTime.Now;
        Console.WriteLine(t1.ToString("mm:ss.ffff\t:") + "Runing function call test ...");
        for (int i = 0; i < _iteration; i++)
        {
            CallFunction();
        }

        DateTime t2 = DateTime.Now;
        Console.WriteLine(t2.ToString("mm:ss.ffff\t:") + "Test finished!");

        TimeSpan deltaDelegate = t1 - t0;
        TimeSpan deltaFunction = t2 - t1;
        Console.WriteLine("Result:");
        Console.WriteLine("\tTime expense (delegate : function) = {0} : {1} = {2:0.000}",
            deltaDelegate.Ticks.ToString(),
            deltaFunction.Ticks.ToString(),
            (double)(deltaDelegate.Ticks) / (double)(deltaFunction.Ticks));
    }

    #endregion
}

/// <summary>
/// Compares the efficiency when a single function is called: the function path is
/// one virtual call to <see cref="DelegateTester.WorkMethod"/>.
/// </summary>
class SingleDelegateTester : DelegateTester
{
    public SingleDelegateTester(int multicast, long iteration)
        : base(multicast, iteration)
    {
    }

    protected override void CallFunction()
    {
        this.WorkMethod();
    }
}

/// <summary>
/// Measures the efficiency of multicast-style calls: the function path walks an
/// array of <see cref="IVoidMethod"/> references and calls each one.
/// </summary>
class MulticastDelegateTester : DelegateTester
{
    IVoidMethod[] calls;

    public MulticastDelegateTester(int multicast, long iteration)
        : base(multicast, iteration)
    {
        // One array slot per registered handler; every slot refers to this instance.
        calls = new IVoidMethod[multicast];
        for (int i = 0; i < calls.Length; i++)
        {
            calls[i] = this;
        }
    }

    protected override void CallFunction()
    {
        foreach (IVoidMethod i in calls)
        {
            i.WorkMethod();
        }
    }
}

public class Test
{
    /// <summary>
    /// Main entry point of the application.
    /// </summary>
    static void Main()
    {
        long iteration = 1000000000;
        IDelegateTester[] tests = new IDelegateTester[]
        {
            new SingleDelegateTester(1, iteration),
            new MulticastDelegateTester(10, iteration)
        };

        foreach (IDelegateTester t in tests)
        {
            t.Run();
            Console.WriteLine("Press any key to run next test...");
            Console.ReadKey();
        }

        Console.WriteLine("Test finished, press any key to quit ...");
        Console.ReadKey();
    }
}
程序运行结果如下:
Delgate Test(Multicast=1, iteration=1000000000 07:49.2850 :Runing delegate test ... 08:01.3380 :Runing function call test ... 08:11.6500 :Test finished! Result: Time expense (delegate : function) = 120530000 : 103120000 = 1.169 Press any key to run next test... Delgate Test(Multicast=10, iteration=1000000000 08:58.1870 :Runing delegate test ... 10:54.3060 :Runing function call test ... 12:36.5080 :Test finished! Result: Time expense (delegate : function) = 1161190000 : 1022020000 = 1.136 Press any key to run next test... Test finished, press any key to quit ...
由以上运行结果可以看出,delegate方式的函数调用的确比直接调用函数效率要低不少,特别是只注册一个事件处理函数时差别最为明显,相差约17%(比值1.169),这在实时控制程序中是应该注意的问题。而在多播时由于两种方式都需要对注册的处理函数列表进行遍历,性能差距有所缩小,但仍有13.6%(比值1.136)。在对性能要求比较高的情况下,确实需要权衡是使用delegate的方便性,还是自行实现多播方式的函数调用。
注意上面为了不让条件判断语句影响测试的准确性,在调用delegate时把判空操作注释掉了,即使在这种情况下仍然效率不及直接调用,下面研究一下原因。
首先看一下Run()函数,两种调用方式的测试代码段分别如下:
(1)Delegate调用段
DateTime t0 = DateTime.Now; 00000095 lea ecx,[ebp-5Ch] 00000098 call 78639580 0000009d lea edi,[ebp-14h] 000000a0 lea esi,[ebp-5Ch] 000000a3 movq xmm0,mmword ptr [esi] 000000a7 movq mmword ptr [edi],xmm0 Console.WriteLine(t0.ToString("mm:ss.ffff/t:") + "Runing delegate test ..."); 000000ab lea ecx,[ebp-14h] 000000ae mov edx,dword ptr ds:[023930A4h] 000000b4 call 78667010 000000b9 mov esi,eax 000000bb mov edx,dword ptr ds:[023930A8h] 000000c1 mov ecx,esi 000000c3 call 785FBDE0 000000c8 mov esi,eax 000000ca mov ecx,esi 000000cc call 78678544 for (int i = 0; i < _iteration; i++) { 000000d1 xor edx,edx 000000d3 mov dword ptr [ebp-18h],edx 000000d6 nop 000000d7 jmp 000000E8 CallDelegate(); 000000d9 mov ecx,dword ptr [ebp+FFFFFF68h] 000000df call dword ptr ds:[0389007Ch] for (int i = 0; i < _iteration; i++) { 000000e5 inc dword ptr [ebp-18h] 000000e8 mov eax,dword ptr [ebp-18h] 000000eb cdq 000000ec mov ecx,dword ptr [ebp+FFFFFF68h] 000000f2 cmp edx,dword ptr [ecx+8] 000000f5 jg 000000FE 000000f7 jl 000000D9 000000f9 cmp eax,dword ptr [ecx+4] 000000fc jb 000000D9 }
(2)函数直接调用段
DateTime t1 = DateTime.Now; 000000fe lea ecx,[ebp-64h] 00000101 call 78639580 00000106 lea edi,[ebp-20h] 00000109 lea esi,[ebp-64h] 0000010c movq xmm0,mmword ptr [esi] 00000110 movq mmword ptr [edi],xmm0 Console.WriteLine(t1.ToString("mm:ss.ffff/t:") + "Runing function call test ..."); 00000114 lea ecx,[ebp-20h] 00000117 mov edx,dword ptr ds:[023930A4h] 0000011d call 78667010 00000122 mov esi,eax 00000124 mov edx,dword ptr ds:[023930ACh] 0000012a mov ecx,esi 0000012c call 785FBDE0 00000131 mov esi,eax 00000133 mov ecx,esi 00000135 call 78678544 for (int i = 0; i < _iteration; i++) { 0000013a xor edx,edx 0000013c mov dword ptr [ebp-24h],edx 0000013f nop 00000140 jmp 00000150 CallFunction(); 00000142 mov ecx,dword ptr [ebp+FFFFFF68h] 00000148 mov eax,dword ptr [ecx] 0000014a call dword ptr [eax+38h] for (int i = 0; i < _iteration; i++) { 0000014d inc dword ptr [ebp-24h] 00000150 mov eax,dword ptr [ebp-24h] 00000153 cdq 00000154 mov ecx,dword ptr [ebp+FFFFFF68h] 0000015a cmp edx,dword ptr [ecx+8] 0000015d jg 00000166 0000015f jl 00000142 00000161 cmp eax,dword ptr [ecx+4] 00000164 jb 00000142 }
但是从汇编代码看,这里调用CallFunction()时反而多了一条MOV指令,从这里看应该直接调用函数会比delegate调用更消耗资源,但这条指令是单周期指令消耗资源极少,先不去管它,其它代码都一样,看来性能差在CallFunction()和CallDelegate()两个函数上,且看这两个函数的汇编代码:
protected void CallDelegate() { //if (task != null) { task(); 00000000 push esi 00000001 mov esi,ecx 00000003 cmp dword ptr ds:[00269204h],0 0000000a je 00000011 0000000c call 793B672F 00000011 mov ecx,dword ptr [esi+0Ch] 00000014 mov eax,dword ptr [ecx+0Ch] 00000017 mov ecx,dword ptr [ecx+4] 0000001a call eax } } 0000001c nop 0000001d pop esi 0000001e ret
protected override void CallFunction() { this.WorkMethod(); 00000000 push esi 00000001 mov esi,ecx 00000003 cmp dword ptr ds:[00269204h],0 0000000a je 00000011 0000000c call 793B66D7 00000011 mov ecx,esi 00000013 mov eax,dword ptr [ecx] 00000015 call dword ptr [eax+3Ch] } 00000018 nop 00000019 pop esi 0000001a ret
两段代码不同的地方如上面的黄色区域所示(即 call 之前的几条 MOV 指令),在这里CallDelegate却又比CallFunction多执行一次MOV操作,这与调用这些函数前面那段结合起来指令数应该一样。如果有差别的话,只能差在这几个MOV和CALL操作上了:在CallDelegate函数中三次MOV操作都是双字(dword,4字节)赋值,并且都是从内存间接寻址到寄存器;而CallFunction的两次MOV操作只有一次是间接寻址,另一次是从寄存器到寄存器,这要比从内存到寄存器快。看来差别也只能是这个原因了。
下面看一下多播的情况。函数CallFunction代码如下:
protected override void CallFunction() { foreach (IVoidMethod i in calls) { 00000000 push edi 00000001 push esi 00000002 push ebx 00000003 push ebp 00000004 mov edi,ecx 00000006 cmp dword ptr ds:[00309204h],0 0000000d je 00000014 0000000f call 796C66A7 00000014 xor ebx,ebx 00000016 xor ebp,ebp 00000018 xor esi,esi 0000001a mov eax,dword ptr [edi+14h] 0000001d mov ebp,eax 0000001f xor esi,esi 00000021 nop 00000022 jmp 0000003D 00000024 cmp esi,dword ptr [ebp+4] 00000027 jb 0000002E 00000029 call 796C7CE3 0000002e mov eax,dword ptr [ebp+esi*4+0Ch] 00000032 mov ebx,eax i.WorkMethod(); 00000034 mov ecx,ebx 00000036 call dword ptr ds:[00310024h] 0000003c inc esi foreach (IVoidMethod i in calls) { 0000003d cmp esi,dword ptr [ebp+4] 00000040 jl 00000024 } } 00000042 nop 00000043 pop ebp 00000044 pop ebx 00000045 pop esi 00000046 pop edi 00000047 ret
而CallDelegate的代码并没有改变,代码差别很大,比较汇编比较困难。换条思路,研究一下delegate的实现机制