锁定老帖子 主题:我所做的Java和C++性能测试
精华帖 (2) :: 良好帖 (2) :: 新手帖 (11) :: 隐藏帖 (7)
|
|||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
作者 | 正文 | ||||||||||||||||||||
发表时间:2011-05-23
今天闲得无聊,突发奇想,做了一个Java和C++的性能对比测试。
1 测试方法 很简单的,就是分别让Java程序和C++程序做很多次的整数和浮点数运算,然后测量测试代码段的运行时间。C++代码中使用Windows函数QueryPerformanceCounter() 获取CPU的高精度计数值,测试开始前后分别获取一次计数值,使用Windows函数QueryPerformanceFrequency()获取运行频率,前后的计数差值除以频率就得到了运行时间。为了使时间测量基准一致,在Java程序中通过JNI去调用这两个Windows函数,测量Java程序的运行时间。 下面分别给出代码。
2 C++测试代码
#include "stdafx.h"
#include "windows.h"
#include <iostream>
#include <cmath>

// Iteration counts: integer loop, floating-point loop, and number of
// Test objects created by the driver loop in _tmain.
const int INT_C = 200000;
const int DOU_C = 50000;
const int MAIN_C = 10000;

// Benchmark workload: one integer-arithmetic loop and one
// floating-point loop, each storing its latest result in a member.
class Test
{
public:
    Test();
    void testInt();
    void testDouble();
    void doTest();

private:
    int m_i;     // last value produced by testInt()
    double m_d;  // last value produced by testDouble()
};

// Starts both result members at zero.
Test::Test() : m_i(0), m_d(0.0)
{
}

// Runs INT_C iterations of mixed integer operations
// (multiply, bitwise NOT, add, subtract, divide, bitwise AND).
void Test::testInt()
{
    for (int k = 1; k <= INT_C; ++k)
    {
        m_i = (~(k * 7) + 0x963 - k) & (k / 3);
    }
}

// Runs DOU_C iterations of a floating-point expression that calls sin().
void Test::testDouble()
{
    for (int k = 1; k <= DOU_C; ++k)
    {
        m_d = ((k << 2) + 0.36954) * std::sin(static_cast<double>(k));
    }
}

// One full pass: the integer loop followed by the floating-point loop.
void Test::doTest()
{
    testInt();
    testDouble();
}

// Times MAIN_C rounds of "allocate a Test, run it, free it" with the
// Windows performance counter and prints the elapsed time in milliseconds.
int _tmain(int argc, _TCHAR* argv[])
{
    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER ticksBefore;
    LARGE_INTEGER ticksAfter;
    QueryPerformanceFrequency(&ticksPerSecond);

    std::cout << "start test..." << std::endl;

    QueryPerformanceCounter(&ticksBefore);
    for (int round = 0; round < MAIN_C; ++round)
    {
        // A fresh heap object per round, mirroring the Java version's
        // "new PerformTest()" inside its loop.
        Test* subject = new Test();
        subject->doTest();
        delete subject;
    }
    QueryPerformanceCounter(&ticksAfter);

    // Counter difference divided by counter frequency gives seconds.
    const double elapsedSeconds =
        static_cast<double>(ticksAfter.QuadPart - ticksBefore.QuadPart) /
        static_cast<double>(ticksPerSecond.QuadPart);

    std::cout << "Program run druation: " << elapsedSeconds * 1000 << " ms." << std::endl;
    return 0;
}
3 Java测试代码 3.1 测试代码
/**
 * Benchmark workload: one integer-arithmetic loop and one floating-point
 * loop, driven MAIN_C times from main() and timed with PerformanceTimer.
 */
public class PerformTest {
    /** Iterations of the integer loop. */
    public static final int INT_C = 200000;
    /** Iterations of the floating-point loop. */
    public static final int DOU_C = 50000;
    /** Number of PerformTest objects created and run by main(). */
    public static final int MAIN_C = 10000;

    // Latest results of the two loops.
    private int m_i;
    private double m_d;

    /** Runs INT_C iterations of mixed integer operations. */
    public void testInt() {
        for (int k = 1; k <= INT_C; k++) {
            m_i = (~(k * 7) + 0x963 - k) & (k / 3);
        }
    }

    /** Runs DOU_C iterations of a floating-point expression that calls Math.sin. */
    public void testDouble() {
        for (int k = 1; k <= DOU_C; k++) {
            // k widens to double implicitly when passed to Math.sin.
            m_d = ((k << 2) + 0.36954) * Math.sin(k);
        }
    }

    /** One full pass: the integer loop followed by the floating-point loop. */
    public void doTest() {
        testInt();
        testDouble();
    }

    /**
     * Times MAIN_C rounds of "create a PerformTest and run it" and prints
     * the elapsed time reported by PerformanceTimer.
     */
    public static void main(String[] args) {
        final PerformanceTimer stopwatch = new PerformanceTimer();

        System.out.println("start test...");
        stopwatch.start();
        for (int round = 0; round < MAIN_C; round++) {
            final PerformTest subject = new PerformTest();
            subject.doTest();
        }
        final double elapsedMs = stopwatch.end();

        System.out.println("Program run druation: " + elapsedMs + " ms.");
    }
}
3.2 实现时间测量的代码
/**
 * Stopwatch whose readings come from two native methods supplied by the
 * "PerformanceTimer" native library. Call start(), run the code under
 * test, then call end() to obtain the elapsed time.
 */
public class PerformanceTimer {
    static {
        // System.loadLibrary reports a missing or unloadable library with
        // UnsatisfiedLinkError, which is an Error and therefore was never
        // caught by the previous catch (Exception) handler. That handler
        // could only swallow other failures (e.g. SecurityException) and let
        // the class carry on until the first native call failed. Letting any
        // failure propagate makes the problem visible at class
        // initialisation, where the cause is clear.
        System.loadLibrary("PerformanceTimer");
    }

    /** Counter frequency, read once when the timer is constructed. */
    private double freq;
    /** Counter value captured by start(). */
    private double startTime;
    /** Counter value captured by end(). */
    private double endTime;

    /** Creates a timer and reads the counter frequency from the native library. */
    public PerformanceTimer() {
        this.freq = queryPerformanceFrequency();
    }

    // Native method names are left as-is: the exported JNI symbol names
    // are derived from them.
    private native double queryPerformanceFrequency();

    private native double QueryPerformanceCounter();

    /** Records the current counter value as the start of the measured interval. */
    public void start() {
        this.startTime = QueryPerformanceCounter();
    }

    /**
     * Records the current counter value as the end of the measured interval.
     *
     * @return (end - start) / frequency * 1000; the caller reports this
     *         value as milliseconds
     */
    public double end() {
        this.endTime = QueryPerformanceCounter();
        double duration = (endTime - startTime) / freq * 1000;
        return duration;
    }
}
3.3 实现时间测量的本地C++代码 省略javah生成的头文件,给出实现的源文件。
#include "stdafx.h"
#include "windows.h"
#include "PerformanceTimer.h"

// Native side of PerformanceTimer.queryPerformanceFrequency():
// returns the value reported by QueryPerformanceFrequency() as a double.
JNIEXPORT jdouble JNICALL Java_PerformanceTimer_queryPerformanceFrequency
  (JNIEnv *, jobject)
{
    LARGE_INTEGER ticksPerSecond;
    QueryPerformanceFrequency(&ticksPerSecond);
    return static_cast<double>(ticksPerSecond.QuadPart);
}

// Native side of PerformanceTimer.QueryPerformanceCounter():
// returns the value reported by QueryPerformanceCounter() as a double.
JNIEXPORT jdouble JNICALL Java_PerformanceTimer_QueryPerformanceCounter
  (JNIEnv *, jobject)
{
    LARGE_INTEGER ticksNow;
    QueryPerformanceCounter(&ticksNow);
    return static_cast<double>(ticksNow.QuadPart);
}
4 测试结果 我的软硬件环境: 硬件配置:AMD AthlonII X3 435 2.89GHz; 2GB DDR3内存;WD 500G硬盘; 软件环境:Windows7 旗舰版;Visual C++ 2008;SUN jdk1.6.0_21.
C++测试结果:
Java测试结果:
C++程序的性能竟是Java的3倍?这真的是他们之间真实的性能差距吗?我所用的测试方法是否科学呢?那么影响我们的Java程序的性能瓶颈在什么地方? 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-23
你用汇编再写个用例测试一下
|
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-23
JNI调用消耗一定时间。
|
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-23
这个倒还是有意思的,不过单单来写,这里的效率貌似并不能体现出来,只是一个个例来说,而不能代表整体
|
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
最后修改:2011-05-24
hmm, microbenchmarks...
请问楼主用VC2008编译源码的时候用的参数是什么?(如果是在VS2008里写的,可以在工程属性里找到实际采用的编译选项) 另外请问能否将VC2008编译生成的汇编贴出来(用/Fa选项),或者是把编译出来的exe文件上传一个?好奇它到底生成了怎样的代码。 楼主试试这样: 先把JDK升到6 update 25,然后把Java的测试代码换成以下方式,并确保在跑测试的时候java的命令行参数带上-server -XX:InlineSmallCode=2000 -XX:+AggressiveOpts 如果-server说找不到JVM那你的PATH上的java是public JRE而不是JDK的java,这个时候只要用完整路径指定用JDK里的java就行。 (虽说JDK6u21也挺新的,但我还是建议升级了再测,效果更佳;至少升到JDK6u23或以上) public class PerformTest { public static final int INT_C = 200000; public static final int DOU_C = 50000; public static final int MAIN_C = 10000; private int m_i; private double m_d; public void testInt() { for (int i=1;i<= INT_C;i++) { m_i = (~(i*7) + 0x963 - i) & (i / 3); } } public void testDouble() { for (int i=1;i<= DOU_C;i++) { m_d = ((i<<2) + 0.36954) * Math.sin((double)i); } } public void doTest() { testInt(); testDouble(); } public static double driveTest() { PerformanceTimer timer = new PerformanceTimer(); PerformTest test = null; int j; timer.start(); for (j = 0;j < MAIN_C; j++) { test = new PerformTest(); test.doTest(); test = null; } double duration = timer.end(); return duration; } public static void main(String[] args) { for (int i = 0; i < 3; i++) { driveTest(); // warm up } System.out.println("start test..."); double duration = driveTest(); System.out.println("Program run duration: " + duration + " ms."); } } 看看在楼主的机器上情况会不会有变化呢。 另外,其实Oracle JDK的System.nanoTime()就是用QueryPerformanceCounter来实现的,不必自己写 hotspot/src/os/windows/vm/os_windows.cpp jlong os::javaTimeNanos() { if (!has_performance_count) { return javaTimeMillis() * NANOS_PER_MILLISEC; // the best we can do. 
} else { LARGE_INTEGER current_count; QueryPerformanceCounter(&current_count); double current = as_long(current_count); double freq = performance_frequency; jlong time = (jlong)((current/freq) * NANOS_PER_SEC); return time; } } 要跟C++相比勉强“更公平”的话,直接用System.nanoTime()比较好。那么把代码改写为: public class PerformTest { public static final int INT_C = 200000; public static final int DOU_C = 50000; public static final int MAIN_C = 10000; private int m_i; private double m_d; public void testInt() { for (int i=1;i<= INT_C;i++) { m_i = (~(i*7) + 0x963 - i) & (i / 3); } } public void testDouble() { for (int i=1;i<= DOU_C;i++) { m_d = ((i<<2) + 0.36954) * Math.sin((double)i); } } public void doTest() { testInt(); testDouble(); } public static long driveTest() { long start = System.nanoTime(); for (int j = 0; j < MAIN_C; j++) { PerformTest test = new PerformTest(); test.doTest(); test = null; } long end = System.nanoTime(); return end - start; } public static void main(String[] args) { for (int i = 0; i < 3; i++) { driveTest(); // warm up } System.out.println("start test..."); double duration = driveTest() / 1000000.0; // ns -> ms System.out.println("Program run duration: " + duration + " ms."); } } 记得用JDK6u25,java -server -XX:InlineSmallCode=2000 -XX:+AggressiveOpts PerformTest 来跑 然后像楼下说的,有条件的话试试IBM JDK 6,重复多跑几次会有发现 >_< |
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
hotspot有-client和-server的区别,有体现么
另外,强烈建议试试ibm J9 1.6,这个做运算经验上来看比hotspot快,然后ibm 1.5的效率并不高 |
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
not sure your purpose, but there is a catch.
Java's sin() function is slow, well known. So if you just want to test arithmetic operations, java and c should match head to head roughly (this is also fairly well known). As for those sin() cos() functions, java is slow since win32 uses assembly (if I remember it correctly). |
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
jellyfish 写道 Java's sin() function is slow, well known.
Slow, compared to what? That's a well-known myth, which is not true for modern high performance JVMs like HotSpot, JRockit and J9. Unless you specify strictfp (which you would seldom see anyone do), these JVMs will take advantage of the floating point instructions of the underlying hardware for maximum performance. For example, this is what Math.sin() looks like when it's called from C2 compiled code, on x64: (C2 is the name of HotSpot's server compiler) StubRoutines::sin [0x00007f89ea1dcf11, 0x00007f89ea1dd029[ (280 bytes) [Disassembling for mach='i386:x86-64'] 0x00007f89ea1dcf11: sub $0x8,%rsp 0x00007f89ea1dcf15: movsd %xmm0,(%rsp) 0x00007f89ea1dcf1a: fldl (%rsp) 0x00007f89ea1dcf1d: fldl 0x496451d(%rip) # 0x00007f89eeb41440 0x00007f89ea1dcf23: fld %st(1) 0x00007f89ea1dcf25: fabs 0x00007f89ea1dcf27: fucomip %st(1),%st 0x00007f89ea1dcf29: ffree %st(0) 0x00007f89ea1dcf2b: fincstp 0x00007f89ea1dcf2d: ja Stub::sin+41 0x0x7f89ea1dcf3a 0x00007f89ea1dcf33: fsin 0x00007f89ea1dcf35: jmpq Stub::sin+267 0x0x7f89ea1dd01c 0x00007f89ea1dcf3a: mov %rsp,-0x28(%rsp) 0x00007f89ea1dcf3f: sub $0x80,%rsp 0x00007f89ea1dcf46: mov %rax,0x78(%rsp) 0x00007f89ea1dcf4b: mov %rcx,0x70(%rsp) 0x00007f89ea1dcf50: mov %rdx,0x68(%rsp) 0x00007f89ea1dcf55: mov %rbx,0x60(%rsp) 0x00007f89ea1dcf5a: mov %rbp,0x50(%rsp) 0x00007f89ea1dcf5f: mov %rsi,0x48(%rsp) 0x00007f89ea1dcf64: mov %rdi,0x40(%rsp) 0x00007f89ea1dcf69: mov %r8,0x38(%rsp) 0x00007f89ea1dcf6e: mov %r9,0x30(%rsp) 0x00007f89ea1dcf73: mov %r10,0x28(%rsp) 0x00007f89ea1dcf78: mov %r11,0x20(%rsp) 0x00007f89ea1dcf7d: mov %r12,0x18(%rsp) 0x00007f89ea1dcf82: mov %r13,0x10(%rsp) 0x00007f89ea1dcf87: mov %r14,0x8(%rsp) 0x00007f89ea1dcf8c: mov %r15,(%rsp) 0x00007f89ea1dcf90: sub $0x8,%rsp 0x00007f89ea1dcf94: fstpl (%rsp) 0x00007f89ea1dcf97: movsd (%rsp),%xmm0 0x00007f89ea1dcf9c: test $0xf,%esp 0x00007f89ea1dcfa2: je Stub::sin+169 0x0x7f89ea1dcfba 0x00007f89ea1dcfa8: sub $0x8,%rsp 0x00007f89ea1dcfac: callq 0x00007f89eea45d76 0x00007f89ea1dcfb1: add 
$0x8,%rsp 0x00007f89ea1dcfb5: jmpq Stub::sin+174 0x0x7f89ea1dcfbf 0x00007f89ea1dcfba: callq 0x00007f89eea45d76 0x00007f89ea1dcfbf: movsd %xmm0,(%rsp) 0x00007f89ea1dcfc4: fldl (%rsp) 0x00007f89ea1dcfc7: add $0x8,%rsp 0x00007f89ea1dcfcb: mov (%rsp),%r15 0x00007f89ea1dcfcf: mov 0x8(%rsp),%r14 0x00007f89ea1dcfd4: mov 0x10(%rsp),%r13 0x00007f89ea1dcfd9: mov 0x18(%rsp),%r12 0x00007f89ea1dcfde: mov 0x20(%rsp),%r11 0x00007f89ea1dcfe3: mov 0x28(%rsp),%r10 0x00007f89ea1dcfe8: mov 0x30(%rsp),%r9 0x00007f89ea1dcfed: mov 0x38(%rsp),%r8 0x00007f89ea1dcff2: mov 0x40(%rsp),%rdi 0x00007f89ea1dcff7: mov 0x48(%rsp),%rsi 0x00007f89ea1dcffc: mov 0x50(%rsp),%rbp 0x00007f89ea1dd001: mov 0x60(%rsp),%rbx 0x00007f89ea1dd006: mov 0x68(%rsp),%rdx 0x00007f89ea1dd00b: mov 0x70(%rsp),%rcx 0x00007f89ea1dd010: mov 0x78(%rsp),%rax 0x00007f89ea1dd015: add $0x80,%rsp 0x00007f89ea1dd01c: fstpl (%rsp) 0x00007f89ea1dd01f: movsd (%rsp),%xmm0 0x00007f89ea1dd024: add $0x8,%rsp 0x00007f89ea1dd028: retq in the fastest case, the code above boils down to: StubRoutines::sin [0x00007f89ea1dcf11, 0x00007f89ea1dd029[ (280 bytes) [Disassembling for mach='i386:x86-64'] 0x00007f89ea1dcf11: sub $0x8,%rsp 0x00007f89ea1dcf15: movsd %xmm0,(%rsp) 0x00007f89ea1dcf1a: fldl (%rsp) 0x00007f89ea1dcf1d: fldl 0x496451d(%rip) # 0x00007f89eeb41440 0x00007f89ea1dcf23: fld %st(1) 0x00007f89ea1dcf25: fabs 0x00007f89ea1dcf27: fucomip %st(1),%st 0x00007f89ea1dcf29: ffree %st(0) 0x00007f89ea1dcf2b: fincstp 0x00007f89ea1dcf2d: ja Stub::sin+41 0x0x7f89ea1dcf3a 0x00007f89ea1dcf33: fsin 0x00007f89ea1dcf35: jmpq Stub::sin+267 0x0x7f89ea1dd01c # ... 
0x00007f89ea1dd01c: fstpl (%rsp) 0x00007f89ea1dd01f: movsd (%rsp),%xmm0 0x00007f89ea1dd024: add $0x8,%rsp 0x00007f89ea1dd028: retq even better, it doesn't have to invoke a stub all the time; the code may be inlined to the call site, resulting in code like this: 0x00007f89ea1fa92b: sub $0x8,%rsp 0x00007f89ea1fa92f: movsd %xmm0,(%rsp) 0x00007f89ea1fa934: data16 0x00007f89ea1fa935: fldl (%rsp) 0x00007f89ea1fa938: fsin 0x00007f89ea1fa93a: fstpl 0x0(%rsp) 0x00007f89ea1fa93e: movsd (%rsp),%xmm0 0x00007f89ea1fa943: add $0x8,%rsp which isn't as slow as you might guess it is. |
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
LZ无聊,LX跟着更加无聊......
|
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||
发表时间:2011-05-24
starry198804265811 写道
4 测试结果 我的软硬件环境: 硬件配置:AMD AthlonII X3 435 2.89GHz; 2GB DDR3内存;WD 500G硬盘; 软件环境:Windows7 旗舰版;Visual C++ 2008;SUN jdk1.6.0_21.
你们也是AMD三核CPU? 你们的电脑也是王志配的吗? |
|||||||||||||||||||||
返回顶楼 | |||||||||||||||||||||