512x512像素,每像素1000采样,C#版本渲染时间为40分47秒
最近有多篇讨论程序语言趋势的博文,其中谈及到C#的性能问题。本人之前未做过相关测试,自己的回覆流于理论猜测,所以花了点时间做个简单实验,比较C#和C++的性能。
实验内容
赵姐夫在此回覆认为,C#比C/C++慢,主要在于.Net平台的垃圾回收(garbage collection, GC)机制。若是计算密集型应用,C#和C++产生的原生代码,速度应该相差不大。我对此半信半疑。想到之前看过一个用99行C++代码实现的全局照明(global illumination, GI)渲染程序smallpt ,是纯计算密集的。而且在运算期间,若用C#实现,基本上连GC都可以不用。因此,就把该99行代码移植至C#。
此渲染器的一些特点如下:
- 使用蒙地卡罗路径追踪(Monte Carlo path-tracing)来产生全局照明效果
- 支持三种双向反射分布函数(bidirectional reflectance distribution function, BRDF): 镜射(specular)、漫射(diffuse)和玻璃(即纯折射的介质)
- 从漫射光源产生柔和阴影(soft shadow)
- 使用2x2超采样(super-sampling)去实现反锯齿
- 使用OpenMP作并行运算,充份利用多核性能
当中的术语及技术,之后可能会于图形学博文系列里探讨。本文主要以性能为题。
C++版本
以下是C++版本代码,作了些许修改。修改地方加上了MILO注译。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2008
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
#include <time.h> // MILO
#include "erand48.inc" // MILO
#define M_PI 3.141592653589793238462643 // MILO
struct
Vec {
// Usage: time ./smallpt 5000 && xv image.ppm
double
x, y, z;
// position, also color (r,g,b)
Vec(
double
x_=0,
double
y_=0,
double
z_=0){ x=x_; y=y_; z=z_; }
Vec operator+(
const
Vec &b)
const
{
return
Vec(x+b.x,y+b.y,z+b.z); }
Vec operator-(
const
Vec &b)
const
{
return
Vec(x-b.x,y-b.y,z-b.z); }
Vec operator*(
double
b)
const
{
return
Vec(x*b,y*b,z*b); }
Vec mult(
const
Vec &b)
const
{
return
Vec(x*b.x,y*b.y,z*b.z); }
Vec& norm(){
return
*
this
= *
this
* (1/
sqrt
(x*x+y*y+z*z)); }
double
dot(
const
Vec &b)
const
{
return
x*b.x+y*b.y+z*b.z; }
// cross:
Vec operator%(
const
Vec &b){
return
Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
};
struct
Ray { Vec o, d; Ray(
const
Vec &o_,
const
Vec &d_) : o(o_), d(d_) {} };
enum
Refl_t { DIFF, SPEC, REFR };
// material types, used in radiance()
struct
Sphere {
double
rad;
// radius
Vec p, e, c;
// position, emission, color
Refl_t refl;
// reflection type (DIFFuse, SPECular, REFRactive)
Sphere(
double
rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double
intersect(
const
Ray &r)
const
{
// returns distance, 0 if nohit
Vec op = p-r.o;
// Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
double
t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
if
(det<0)
return
0;
else
det=
sqrt
(det);
return
(t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
}
};
Sphere spheres[] = {
//Scene: radius, position, emission, color, material
Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),
//Left
Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),
//Rght
Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),
//Back
Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),
//Frnt
Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),
//Botm
Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),
//Top
Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.999, SPEC),
//Mirr
Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(1,1,1)*.999, REFR),
//Glas
Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12), Vec(), DIFF)
//Lite
};
inline
double
clamp(
double
x){
return
x<0 ? 0 : x>1 ? 1 : x; }
inline
int
toInt(
double
x){
return
int
(
pow
(clamp(x),1/2.2)*255+.5); }
inline
bool
intersect(
const
Ray &r,
double
&t,
int
&id){
double
n=
sizeof
(spheres)/
sizeof
(Sphere), d, inf=t=1e20;
for
(
int
i=
int
(n);i--;)
if
((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
return
t<inf;
}
Vec radiance(
const
Ray &r,
int
depth, unsigned
short
*Xi){
double
t;
// distance to intersection
int
id=0;
// id of intersected object
if
(!intersect(r, t, id))
return
Vec();
// if miss, return black
const
Sphere &obj = spheres[id];
// the hit object
Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
double
p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z;
// max refl
if
(++depth>5)
if
(erand48(Xi)<p) f=f*(1/p);
else
return
obj.e;
//R.R.
if
(depth > 100)
return
obj.e;
// MILO
if
(obj.refl == DIFF){
// Ideal DIFFUSE reflection
double
r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=
sqrt
(r2);
Vec w=nl, u=((
fabs
(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
Vec d = (u*
cos
(r1)*r2s + v*
sin
(r1)*r2s + w*
sqrt
(1-r2)).norm();
return
obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
}
else
if
(obj.refl == SPEC)
// Ideal SPECULAR reflection
return
obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
Ray reflRay(x, r.d-n*2*n.dot(r.d));
// Ideal dielectric REFRACTION
bool
into = n.dot(nl)>0;
// Ray from outside going in?
double
nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
if
((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)
// Total internal reflection
return
obj.e + f.mult(radiance(reflRay,depth,Xi));
Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+
sqrt
(cos2t)))).norm();
double
a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
double
Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
return
obj.e + f.mult(depth>2 ? (erand48(Xi)<P ?
// Russian roulette
radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
}
int
main(
int
argc,
char
*argv[]){
clock_t
start =
clock
();
// MILO
int
w=512, h=512, samps = argc==2 ?
atoi
(argv[1])/4 : 250;
// # samples
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm());
// cam pos, dir
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=
new
Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for
(
int
y=0; y<h; y++){
// Loop over image rows
fprintf
(stderr,
"\rRendering (%d spp) %5.2f%%"
,samps*4,100.*y/(h-1));
unsigned
short
Xi[3]={0,0,y*y*y};
// MILO
for
(unsigned
short
x=0; x<w; x++)
// Loop cols
for
(
int
sy=0, i=(h-y-1)*w+x; sy<2; sy++)
// 2x2 subpixel rows
for
(
int
sx=0; sx<2; sx++, r=Vec()){
// 2x2 subpixel cols
for
(
int
s=0; s<samps; s++){
double
r1=2*erand48(Xi), dx=r1<1 ?
sqrt
(r1)-1: 1-
sqrt
(2-r1);
double
r2=2*erand48(Xi), dy=r2<1 ?
sqrt
(r2)-1: 1-
sqrt
(2-r2);
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
}
// Camera rays are pushed ^^^^^ forward to start in interior
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
}
}
printf
(
"\n%f sec\n"
, (
float
)(
clock
() - start)/CLOCKS_PER_SEC);
// MILO
FILE
*f =
fopen
(
"image.ppm"
,
"w"
);
// Write image to PPM file.
fprintf
(f,
"P3\n%d %d\n%d\n"
, w, h, 255);
for
(
int
i=0; i<w*h; i++)
fprintf
(f,
"%d %d %d "
, toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}
|
由于Visual C++没有erand48()函数,便在网上找到一个PostreSQL的实现 。此外,为了比较公平,分别测试使用和禁用OpenMP情况下的性能。
本人亦为了显示C++特有的能力,另外作一个版本,采用微软DirectX SDK中的C++ XNA数学库进行SIMD矢量加速(XNA Game Studio也有.Net用的XNA数学库,但本文并没有使用)。由于XNA数学库采用单精度浮点数,对这个特别场景(6面墙壁其实是由6个巨大的球体组成)有超出精度范围的问题。因此,我对这版本里的场境稍作修改。又因为erand48()函数是传回双精度的随机数,多次转换比较慢,此版本就换了之前博文使用的LCG实现。
C#版本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
using
System;
using
System.IO;
namespace
smallpt_cs {
struct
Vec {
// Usage: time ./smallpt 5000 && xv image.ppm
public
double
x,y,z;
// position,also color (r,g,b)
public
Vec(
double
x_,
double
y_,
double
z_) {x=x_;y=y_;z=z_;}
public
static
Vec
operator
+(Vec a,Vec b) {
return
new
Vec(a.x+b.x,a.y+b.y,a.z+b.z);}
public
static
Vec
operator
-(Vec a,Vec b) {
return
new
Vec(a.x-b.x,a.y-b.y,a.z-b.z);}
public
static
Vec
operator
*(Vec a,
double
b) {
return
new
Vec(a.x*b,a.y*b,a.z*b);}
public
Vec mult(Vec b) {
return
new
Vec(x*b.x,y*b.y,z*b.z);}
public
Vec norm() {
return
this
=
this
*(1/Math.Sqrt(x*x+y*y+z*z));}
public
double
dot(Vec b) {
return
x*b.x+y*b.y+z*b.z;}
//cross:
public
static
Vec
operator
%(Vec a,Vec b) {
return
new
Vec(a.y*b.z-a.z*b.y,a.z*b.x-a.x*b.z,a.x*b.y-a.y*b.x);}
}
enum
Refl_t { DIFF,SPEC,REFR };
// material types,used in radiance()
struct
Ray {
public
Vec o,d;
public
Ray(Vec o_,Vec d_) { o=o_;d=d_;} }
class
Sphere {
public
double
rad;
// radius
public
Vec p,e,c;
// position,emission,color
public
Refl_t refl;
// reflection type (DIFFuse,SPECular,REFRactive)
public
Sphere(
double
rad_,Vec p_,Vec e_,Vec c_,Refl_t refl_) {
rad=rad_;p=p_;e=e_;c=c_;refl=refl_;
}
public
double
intersect(Ray r)
{
// returns distance,0 if nohit
Vec op=p-r.o;
// Solve t^2*d.d+2*t*(o-p).d+(o-p).(o-p)-R^2=0
double
t,eps=1e-4,b=op.dot(r.d),det=b*b-op.dot(op)+rad*rad;
if
(det<0)
return
0;
else
det=Math.Sqrt(det);
return
(t=b-det) > eps?t : ((t=b+det) > eps?t : 0);
}
};
class
smallpt {
static
Random random=
new
Random();
static
double
erand48() {
return
random.NextDouble();}
static
Sphere[] spheres={
//Scene: radius,position,emission,color,material
new
Sphere(1e5,
new
Vec( 1e5+1,40.8,81.6),
new
Vec(),
new
Vec(.75,.25,.25),Refl_t.DIFF),
//Left
new
Sphere(1e5,
new
Vec(-1e5+99,40.8,81.6),
new
Vec(),
new
Vec(.25,.25,.75),Refl_t.DIFF),
//Rght
new
Sphere(1e5,
new
Vec(50,40.8,1e5),
new
Vec(),
new
Vec(.75,.75,.75),Refl_t.DIFF),
//Back
new
Sphere(1e5,
new
Vec(50,40.8,-1e5+170),
new
Vec(),
new
Vec(), Refl_t.DIFF),
//Frnt
new
Sphere(1e5,
new
Vec(50,1e5,81.6),
new
Vec(),
new
Vec(.75,.75,.75),Refl_t.DIFF),
//Botm
new
Sphere(1e5,
new
Vec(50,-1e5+81.6,81.6),
new
Vec(),
new
Vec(.75,.75,.75),Refl_t.DIFF),
//Top
new
Sphere(16.5,
new
Vec(27,16.5,47),
new
Vec(),
new
Vec(1,1,1)*.999, Refl_t.SPEC),
//Mirr
new
Sphere(16.5,
new
Vec(73,16.5,78),
new
Vec(),
new
Vec(1,1,1)*.999, Refl_t.REFR),
//Glas
new
Sphere(600,
new
Vec(50,681.6-.27,81.6),
new
Vec(12,12,12),
new
Vec(), Refl_t.DIFF)
//Lite
};
static
double
clamp(
double
x) {
return
x<0?0 : x > 1?1 : x;}
static
int
toInt(
double
x) {
return
(
int
)(Math.Pow(clamp(x),1 / 2.2)*255+.5);}
static
bool
intersect(Ray r,
ref
double
t,
ref
int
id) {
double
d,inf=t=1e20;
for
(
int
i=spheres.Length-1;i >= 0;i--)
if
((d=spheres[i].intersect(r)) != 0 && d<t) { t=d;id=i;}
return
t<inf;
}
static
Vec radiance(Ray r,
int
depth) {
double
t=0;
// distance to intersection
int
id=0;
// id of intersected object
if
(!intersect(r,
ref
t,
ref
id))
return
new
Vec();
// if miss,return black
Sphere obj=spheres[id];
// the hit object
Vec x=r.o+r.d*t,n=(x-obj.p).norm(),nl=n.dot(r.d)<0?n:n*-1,f=obj.c;
double
p=f.x>f.y&&f.x>f.z?f.x:f.y>f.z?f.y:f.z;
//max refl
if
(++depth > 5)
if
(erand48()<p) f=f*(1 / p);
else
return
obj.e;
//R.R.
if
(depth > 100)
return
obj.e;
if
(obj.refl == Refl_t.DIFF) {
// Ideal DIFFUSE reflection
double
r1=2*Math.PI*erand48(),r2=erand48(),r2s=Math.Sqrt(r2);
Vec w=nl,u=((Math.Abs(w.x)>.1?
new
Vec(0,1,0):
new
Vec(1,0,0))%w).norm(),v=w%u;
Vec d=(u*Math.Cos(r1)*r2s+v*Math.Sin(r1)*r2s+w*Math.Sqrt(1-r2)).norm();
return
obj.e+f.mult(radiance(
new
Ray(x,d),depth));
}
else
if
(obj.refl == Refl_t.SPEC)
// Ideal SPECULAR reflection
return
obj.e+f.mult(radiance(
new
Ray(x,r.d-n*2*n.dot(r.d)),depth));
Ray reflRay=
new
Ray(x,r.d-n*2*n.dot(r.d));
//IdealdielectricREFRACTION
bool
into
=n.dot(nl) > 0;
// Ray from outside going in?
double
nc=1,nt=1.5,nnt=
into
?nc / nt : nt / nc,ddn=r.d.dot(nl),cos2t;
if
((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)
//Total internal reflection
return
obj.e+f.mult(radiance(reflRay,depth));
Vec tdir=(r.d*nnt-n*((
into
?1:-1)*(ddn*nnt+Math.Sqrt(cos2t)))).norm();
double
a=nt-nc,b=nt+nc,R0=a*a/(b*b),c=1-(
into
?-ddn:tdir.dot(n));
double
Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
return
obj.e+f.mult(depth > 2?(erand48()<P?
// Russian roulette
radiance(reflRay,depth)*RP:radiance(
new
Ray(x,tdir),depth)*TP):
radiance(reflRay,depth)*Re+radiance(
new
Ray(x,tdir),depth)*Tr);
}
public
static
void
Main(
string
[] args) {
DateTime start=DateTime.Now;
int
w=256,h=256,samps=args.Length==2?
int
.Parse(args[1])/4:25;
// # samples
Ray cam=
new
Ray(
new
Vec(50,52,295.6),
new
Vec(0,-0.042612,-1).norm());
//cam pos,dir
Vec cx=
new
Vec(w*.5135/h,0,0),cy=(cx%cam.d).norm()*.5135,r;Vec[] c=
new
Vec[w*h];
for
(
int
y=0;y<h;y++) {
// Loop over image rows
Console.Write(
"\rRendering ({0}spp) {1:F2}%"
,samps*4,100.0*y/(h-1));
for
(
int
x=0;x<w;x++)
// Loop cols
for
(
int
sy=0,i=(h-y-1)*w+x;sy<2;sy++)
// 2x2 subpixel rows
for
(
int
sx=0;sx<2;sx++) {
// 2x2 subpixel cols
r=
new
Vec();
for
(
int
s=0;s<samps;s++) {
double
r1=2*erand48(),dx=r1<1?Math.Sqrt(r1)-1:1-Math.Sqrt(2-r1);
double
r2=2*erand48(),dy=r2<1?Math.Sqrt(r2)-1:1-Math.Sqrt(2-r2);
Vec d=cx*(((sx+.5+dx)/2+x)/w-.5)+
cy*(((sy+.5+dy)/2+y)/h-.5)+cam.d;
r=r+radiance(
new
Ray(cam.o+d*140,d.norm()),0)*(1.0/samps);
}
// Camera rays are pushed ^^^^^ forward to start in interior
c[i]=c[i]+
new
Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
}
}
Console.WriteLine(
"\n{0} sec"
,(DateTime.Now-start).TotalSeconds);
using
(StreamWriter sw=
new
StreamWriter(
"image.ppm"
)) {
sw.Write(
"P3\r\n{0} {1}\r\n{2}\r\n"
,w,h,255);
for
(
int
i=0;i<w*h;i++)
sw.Write(
"{0} {1} {2}\r\n"
,toInt(c[i].x),toInt(c[i].y),toInt(c[i].z));
sw.Close();
}
}
}
}
|
Vec和Ray需要不断在计算中产生实例,所以设它们为struct,struct在C#代表值类型(value type),ibpp在堆栈上高效分配内存的,不需使用GC。渲染时,Sphere是只读对象,因此用class作为引用类型(reference type)去避免不必要的复制。
实验结果和分析
实验环境是Visual Studio 2008/.Net Framework 3.5编译,Intel I7 920 (4核、超线程)。渲染512x512解像度,每像素100个采样。结果如下:
测试版本 | 需时(秒) | |
(a) | C++ | 45.548 |
(b) | C# | 61.044 |
(c) | C++ SIMD | 20.500 |
(d) | C++(OpenMP) | 7.397 |
(e) | C++ SIMD(OpenMP) | 3.470 |
(f)* | C++ LCG | 17.365 |
(g)* | C# LCG | 59.623 |
(h)* | C++ LCG (OpenMP) | 3.427 |
*2010/6/23 加入(f)(g)(h),見更新1
最基本,应比较(a)和(b)。两者皆使用单线程。 C++版本性能比C#版本快大约34%。这其实已远远超出我对C#/.Net的期望,没想到用JIT代码的运行速度,能这么接近传统的编译方式。
采用SIMD的C++版本(c),虽然仍未大量优化,但性能比没有SIMD的版本高122%,比C#版本高接近两倍。不过,采用SIMD后,数值运算的精确度变低,所以这比较只能作为参考。
采用OpenMP能活用i7的8个逻辑核心。使用OpenMP的非SIMD(d)和SIMD(e)版本,分别比没使用OpenMP的版本(a)和(c),性能各为6.16倍和5.9倍。这已经很接近理想值8,说明这应用能充分利用CPU并行性。而OpenMP强大的地方,在于只需加入1句编译器#pragma指令就能自动并行。
结语
虽然本文的实验只能反映个别情况。但实验可以说明,在某些应用上,C#/.Net的性能可以非常贴近C++,差别小于一个数量级。
本文实验所用的程序代码,有不少进一步优化的空间,源代码可于这里下载。有兴趣的朋友也可把代码移植至Java及其他语言。
最后,本人认为,各种平台和语言,都有其适用时机。作为程序员,最理想是认识各种技术,以及认清每个技术的特长、短处,以便为应用找到最好的配撘。
更新
- 2010/6/23: 园友嗷嗷在本文#78楼回覆中指出,C++版本的很大部分开销在于erand48()函数里调用Runtime库函数。所以,决定用简单的LCG随机数实现,取代原来的库函数(包括C++和C#),再测试三个版本(f)(g)(h)。结果: C++版本(f)比C#版本(g)快2.43倍。使用OpenMP(h)是没用OpenMP(f)版本的5.06倍。此LCG版本代码可于此下载。再次感谢这位园友。