论坛首页 Java企业应用论坛

我所做的Java和C++性能测试

浏览 24562 次
精华帖 (2) :: 良好帖 (2) :: 新手帖 (11) :: 隐藏帖 (7)
作者 正文
   发表时间:2011-05-23  

今天闲得无聊,突发奇想,做了一个Java和C++的性能对比测试。

 

1 测试方法

  很简单的,就是分别让Java程序和C++程序做很多次的整数和浮点数运算,然后测量测试代码段的运行时间。C++代码中使用Windows函数QueryPerformanceCounter() 获取CPU的高精度计数值,测试开始前后分别获取一次计数值,使用Windows函数QueryPerformanceFrequency()获取运行频率,前后的计数差值除以频率就得到了运行时间。为了使时间测量基准一致,在Java程序中通过JNI去调用这两个Windows函数,测量Java程序的运行时间。

  下面分别给出代码。

 

2 C++测试代码

 

#include "stdafx.h"
#include "windows.h"
#include <iostream>
#include <cmath>
using namespace std;

// Workload sizes for the benchmark. `static` only spells out the internal
// linkage a namespace-scope const already has in C++.
static const int INT_C = 200000;   // iterations of the integer loop
static const int DOU_C = 50000;    // iterations of the floating-point loop
static const int MAIN_C = 10000;   // create/run/destroy cycles timed in the main loop

// Benchmark workload object: holds one int and one double result slot and
// exposes an integer loop, a floating-point loop, and a driver for both.
class Test {
	// Class members are private until an access specifier says otherwise.
	int m_i;     // result slot of the integer loop
	double m_d;  // result slot of the floating-point loop

public:
	Test();
	void testInt();     // integer workload
	void testDouble();  // floating-point workload
	void doTest();      // runs both workloads
};

// Starts both result slots at zero.
Test::Test()
	: m_i(0), m_d(0.0)
{
}

// Integer workload: for i = 1..INT_C, mixes multiply, bitwise NOT, add,
// subtract, divide and AND, overwriting m_i each time (only the value from
// the last iteration remains).
void Test::testInt()
{
	int n = 1;
	while (n <= INT_C) {
		const int mixed = ~(n * 7) + 0x963 - n;
		m_i = mixed & (n / 3);
		++n;
	}
}

// Floating-point workload: for i = 1..DOU_C, multiplies (4*i + 0.36954) by
// sin(i), overwriting m_d each time (only the last value remains).
void Test::testDouble()
{
	int n = 1;
	while (n <= DOU_C) {
		// Kept as a single expression so the floating-point evaluation is unchanged.
		m_d = ((n << 2) + 0.36954) * sin(static_cast<double>(n));
		++n;
	}
}

void Test::doTest()
{
	testInt();
	testDouble();
}

int _tmain(int argc, _TCHAR* argv[])
{
	LARGE_INTEGER freq; 
	LARGE_INTEGER start;
	LARGE_INTEGER end;
	QueryPerformanceFrequency(&freq);

	Test* test = NULL;
	int j;

	cout<<"start test..."<<endl;

	QueryPerformanceCounter(&start);
	for (j = 0;j < MAIN_C; j++) {
		test = new Test();
		test->doTest();
		delete test;
	}
	QueryPerformanceCounter(&end);

	double druation = ((double)(end.QuadPart - start.QuadPart)) / ((double)freq.QuadPart);
	cout<<"Program run druation: "<<druation*1000<<" ms."<<endl;
	
	return 0;
}

 

 3 Java测试代码

3.1 测试代码

 

/**
 * Java side of the benchmark: runs an integer loop and a floating-point loop
 * on freshly created instances and reports the elapsed time measured by
 * {@link PerformanceTimer}.
 */
public class PerformTest {
	/** Iterations of the integer loop per instance. */
	public static final int INT_C = 200000;
	/** Iterations of the floating-point loop per instance. */
	public static final int DOU_C = 50000;
	/** Number of instances created and exercised by {@link #main}. */
	public static final int MAIN_C = 10000;
	
	/** Result slot of the integer loop; only the last value is kept. */
	private int m_i;
	/** Result slot of the floating-point loop; only the last value is kept. */
	private double m_d;
	
	/** Integer workload: multiply, bitwise NOT, add, subtract, divide and AND. */
	public void testInt() {
		for (int i=1;i<= INT_C;i++) {
			m_i = (~(i*7) + 0x963 - i) & (i / 3);
		}
	}
	
	/** Floating-point workload: (4*i + 0.36954) * sin(i). */
	public void testDouble() {
		for (int i=1;i<= DOU_C;i++) {
			m_d = ((i<<2) + 0.36954) * Math.sin((double)i);
		}
	}
	
	/** Runs the integer workload followed by the floating-point workload. */
	public void doTest() {
		testInt();
		testDouble();
	}
	
	/**
	 * Times MAIN_C create-and-run cycles and prints the duration in
	 * milliseconds.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		PerformanceTimer timer = new PerformanceTimer();
		System.out.println("start test...");
		
		timer.start();
		for (int j = 0; j < MAIN_C; j++) {
			// A fresh instance per iteration; it becomes unreachable at the
			// end of the loop body.
			PerformTest test = new PerformTest();
			test.doTest();
		}
		double duration = timer.end();
		
		System.out.println("Program run duration: " + duration + " ms.");
	}
}

 

 3.2 实现时间测量的代码

 

/**
 * Stopwatch backed by two native methods implemented in the
 * "PerformanceTimer" native library.
 *
 * <p>Usage: call {@link #start()}, run the code under test, then call
 * {@link #end()} to obtain the elapsed time in milliseconds.
 */
public class PerformanceTimer {
	/** Counter frequency, read once at construction time. */
	private double freq;
	/** Counter value captured by {@link #start()}. */
	private double startTime;
	/** Counter value captured by {@link #end()}. */
	private double endTime;
	
	public PerformanceTimer() {
		this.freq = queryPerformanceFrequency();
	}
	
	// The native method names are bound to the exported JNI symbols and must
	// not be renamed independently of the native library.
	private native double queryPerformanceFrequency();
	
	private native double QueryPerformanceCounter();
	
	/** Records the current counter value as the start of the measurement. */
	public void start() {
		this.startTime = QueryPerformanceCounter();
	}
	
	/**
	 * Records the current counter value as the end of the measurement.
	 *
	 * @return (end - start) / frequency * 1000, i.e. the elapsed time in
	 *         milliseconds since the last {@link #start()}
	 */
	public double end() {
		this.endTime = QueryPerformanceCounter();
		double duration = (endTime - startTime) / freq * 1000;
		return duration;
	}
	
	static {
		try {
			System.loadLibrary("PerformanceTimer");
		} catch (UnsatisfiedLinkError e) {
			// A missing library is reported as an Error, which a
			// catch (Exception) clause never sees. Explain the failure and
			// rethrow so class initialization still fails as before.
			System.err.println("Cannot load native library 'PerformanceTimer'; java.library.path="
					+ System.getProperty("java.library.path"));
			throw e;
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

 

 3.3 实现时间测量的本地C++代码

    省略javah生成的头文件,给出实现的源文件。

 

#include "stdafx.h"
#include "windows.h"
#include "PerformanceTimer.h"

// Native side of PerformanceTimer.queryPerformanceFrequency(): returns the
// value reported by QueryPerformanceFrequency() converted to a double.
// Neither the JNI environment nor the receiver object is used.
JNIEXPORT jdouble JNICALL Java_PerformanceTimer_queryPerformanceFrequency
  (JNIEnv *, jobject)
{
	LARGE_INTEGER frequency;
	QueryPerformanceFrequency(&frequency);

	return static_cast<jdouble>(frequency.QuadPart);
}

// Native side of PerformanceTimer.QueryPerformanceCounter(): returns the
// current value reported by QueryPerformanceCounter() converted to a double.
// Neither the JNI environment nor the receiver object is used.
JNIEXPORT jdouble JNICALL Java_PerformanceTimer_QueryPerformanceCounter
  (JNIEnv *, jobject)
{
	LARGE_INTEGER ticks;
	QueryPerformanceCounter(&ticks);

	return static_cast<jdouble>(ticks.QuadPart);
}

 

 

4 测试结果

  我的软硬件环境:

  硬件配置:AMD AthlonII X3 435 2.89GHz; 2GB DDR3内存;WD 500G硬盘;

  软件环境:Windows7 旗舰版;Visual C++ 2008;SUN jdk1.6.0_21.

 

  C++测试结果

 


第一次 第二次 第三次 平均时间
时间(单位:ms) 21023.6 21003.5 21014.7 21013.9

 

  Java测试结果

第一次 第二次 第三次 平均时间
时间(单位:ms) 94369.4 94317.3 94347.2 94344.6

 

C++程序的性能竟是Java的约4.5倍?这真的是它们之间真实的性能差距吗?我所用的测试方法是否科学呢?那么影响我们的Java程序的性能瓶颈在什么地方?

   发表时间:2011-05-23  
你用汇编再写个用例测试一下
0 请登录后投票
   发表时间:2011-05-23  
JNI调用消耗一定时间。
0 请登录后投票
   发表时间:2011-05-23  
这个测试倒还是挺有意思的,不过单凭这一个例子,效率上的差别并不能真正体现出来;它只是一个个例,不能代表整体情况。
0 请登录后投票
   发表时间:2011-05-24   最后修改:2011-05-24
hmm, microbenchmarks...

请问楼主用VC2008编译源码的时候用的参数是什么?(如果是在VS2008里写的,可以在工程属性里找到实际采用的编译选项)
另外请问能否将VC2008编译生成的汇编贴出来(用/Fa选项),或者是把编译出来的exe文件上传一个?好奇它到底生成了怎样的代码。

楼主试试这样:
先把JDK升到6 update 25,然后把Java的测试代码换成以下方式,并确保在跑测试的时候java的命令行参数带上-server -XX:InlineSmallCode=2000 -XX:+AggressiveOpts
如果-server说找不到JVM那你的PATH上的java是public JRE而不是JDK的java,这个时候只要用完整路径指定用JDK里的java就行。
(虽说JDK6u21也挺新的,但我还是建议升级了再测,效果更佳;至少升到JDK6u23或以上)
/**
 * Benchmark variant with warm-up: the timed section lives in
 * {@link #driveTest()}, which {@link #main} calls three times untimed before
 * the reported run.
 */
public class PerformTest {
	public static final int INT_C = 200000;
	public static final int DOU_C = 50000;
	public static final int MAIN_C = 10000;
	
	private int m_i;
	private double m_d;
	
	/** Integer workload; m_i keeps only the value of the last iteration. */
	public void testInt() {
		int n = 1;
		while (n <= INT_C) {
			int mixed = ~(n * 7) + 0x963 - n;
			m_i = mixed & (n / 3);
			n++;
		}
	}
	
	/** Floating-point workload; m_d keeps only the value of the last iteration. */
	public void testDouble() {
		int n = 1;
		while (n <= DOU_C) {
			// Single expression on purpose: same floating-point evaluation as before.
			m_d = ((n << 2) + 0.36954) * Math.sin(n);
			n++;
		}
	}
	
	/** Runs the integer workload, then the floating-point workload. */
	public void doTest() {
		this.testInt();
		this.testDouble();
	}
	
	/**
	 * Runs MAIN_C create-and-run cycles under a PerformanceTimer.
	 *
	 * @return the value returned by the timer's end() call
	 */
	public static double driveTest() {
		PerformanceTimer stopwatch = new PerformanceTimer();
		
		stopwatch.start();
		int round = 0;
		while (round < MAIN_C) {
			PerformTest subject = new PerformTest();
			subject.doTest();
			subject = null;
			round++;
		}
		return stopwatch.end();
	}
	
	/** Performs three untimed warm-up runs, then one reported run. */
	public static void main(String[] args) {
		int warmUpsLeft = 3;
		while (warmUpsLeft > 0) {
			driveTest(); // warm up
			warmUpsLeft--;
		}
		
		System.out.println("start test...");
		
		double elapsed = driveTest();
		
		System.out.println("Program run duration: " + elapsed + " ms.");
	}
}


看看在楼主的机器上情况会不会有变化呢。

另外,其实Oracle JDK的System.nanoTime()就是用QueryPerformanceCounter来实现的,不必自己写
hotspot/src/os/windows/vm/os_windows.cpp
// Returns a timestamp in nanoseconds. When the performance counter is
// available, the current count is divided by the counter frequency and scaled
// by NANOS_PER_SEC; otherwise the millisecond clock is scaled up instead.
jlong os::javaTimeNanos() {
  if (has_performance_count) {
    LARGE_INTEGER ticks;
    QueryPerformanceCounter(&ticks);
    double now = as_long(ticks);
    double ticks_per_sec = performance_frequency;
    return (jlong)((now/ticks_per_sec) * NANOS_PER_SEC);
  }
  // the best we can do.
  return javaTimeMillis() * NANOS_PER_MILLISEC;
}


要跟C++相比勉强“更公平”的话,直接用System.nanoTime()比较好。那么把代码改写为:
/**
 * Benchmark variant that needs no native code: the timed section in
 * {@link #driveTest()} is measured with {@link System#nanoTime()}, and
 * {@link #main} performs three untimed warm-up runs before the reported run.
 */
public class PerformTest {
	public static final int INT_C = 200000;
	public static final int DOU_C = 50000;
	public static final int MAIN_C = 10000;
	
	private int m_i;
	private double m_d;
	
	/** Integer workload; m_i keeps only the value of the last iteration. */
	public void testInt() {
		int n = 1;
		while (n <= INT_C) {
			int mixed = ~(n * 7) + 0x963 - n;
			m_i = mixed & (n / 3);
			n++;
		}
	}
	
	/** Floating-point workload; m_d keeps only the value of the last iteration. */
	public void testDouble() {
		int n = 1;
		while (n <= DOU_C) {
			// Single expression on purpose: same floating-point evaluation as before.
			m_d = ((n << 2) + 0.36954) * Math.sin(n);
			n++;
		}
	}
	
	/** Runs the integer workload, then the floating-point workload. */
	public void doTest() {
		this.testInt();
		this.testDouble();
	}
	
	/**
	 * Runs MAIN_C create-and-run cycles.
	 *
	 * @return elapsed time of the whole loop in nanoseconds
	 */
	public static long driveTest() {
		final long begin = System.nanoTime();
		int round = 0;
		while (round < MAIN_C) {
			PerformTest subject = new PerformTest();
			subject.doTest();
			subject = null;
			round++;
		}
		final long finish = System.nanoTime();
		return finish - begin;
	}
	
	/** Performs three untimed warm-up runs, then one reported run. */
	public static void main(String[] args) {
		int warmUpsLeft = 3;
		while (warmUpsLeft > 0) {
			driveTest(); // warm up
			warmUpsLeft--;
		}
		
		System.out.println("start test...");
		long elapsedNanos = driveTest();
		double elapsedMillis = elapsedNanos / 1000000.0; // ns -> ms
		System.out.println("Program run duration: " + elapsedMillis + " ms.");
	}
}

记得用JDK6u25,java -server -XX:InlineSmallCode=2000 -XX:+AggressiveOpts PerformTest 来跑

然后像楼下说的,有条件的话试试IBM JDK 6,重复多跑几次会有发现 >_<
0 请登录后投票
   发表时间:2011-05-24  
hotspot有-client和-server的区别,有体现么

另外,强烈建议试试ibm J9 1.6,这个做运算经验上来看比hotspot快,然后ibm 1.5的效率并不高
0 请登录后投票
   发表时间:2011-05-24  
not sure your purpose, but there is a catch.

Java's sin() function is slow, well known. So if you just want to test arithmetic operations, java and c should match head to head roughly (this is also fairly well known).

As for those sin() cos() functions, java is slow since win32 uses assembly (if I remember it correctly).
0 请登录后投票
   发表时间:2011-05-24  
jellyfish 写道
Java's sin() function is slow, well known.

Slow, compared to what?
That's a well-known myth, which is not true for modern high performance JVMs like HotSpot, JRockit and J9. Unless you specify strictfp (which you would seldom see anyone do), these JVMs will take advantage of the floating point instructions of the underlying hardware for maximum performance. For example, this is what Math.sin() looks like when it's called from C2 compiled code, on x64: (C2 is the name of HotSpot's server compiler)
StubRoutines::sin [0x00007f89ea1dcf11, 0x00007f89ea1dd029[ (280 bytes)
[Disassembling for mach='i386:x86-64']
  0x00007f89ea1dcf11: sub    $0x8,%rsp
  0x00007f89ea1dcf15: movsd  %xmm0,(%rsp)
  0x00007f89ea1dcf1a: fldl   (%rsp)
  0x00007f89ea1dcf1d: fldl   0x496451d(%rip)        # 0x00007f89eeb41440
  0x00007f89ea1dcf23: fld    %st(1)
  0x00007f89ea1dcf25: fabs   
  0x00007f89ea1dcf27: fucomip %st(1),%st
  0x00007f89ea1dcf29: ffree  %st(0)
  0x00007f89ea1dcf2b: fincstp 
  0x00007f89ea1dcf2d: ja     Stub::sin+41 0x0x7f89ea1dcf3a
  0x00007f89ea1dcf33: fsin   
  0x00007f89ea1dcf35: jmpq   Stub::sin+267 0x0x7f89ea1dd01c
  0x00007f89ea1dcf3a: mov    %rsp,-0x28(%rsp)
  0x00007f89ea1dcf3f: sub    $0x80,%rsp
  0x00007f89ea1dcf46: mov    %rax,0x78(%rsp)
  0x00007f89ea1dcf4b: mov    %rcx,0x70(%rsp)
  0x00007f89ea1dcf50: mov    %rdx,0x68(%rsp)
  0x00007f89ea1dcf55: mov    %rbx,0x60(%rsp)
  0x00007f89ea1dcf5a: mov    %rbp,0x50(%rsp)
  0x00007f89ea1dcf5f: mov    %rsi,0x48(%rsp)
  0x00007f89ea1dcf64: mov    %rdi,0x40(%rsp)
  0x00007f89ea1dcf69: mov    %r8,0x38(%rsp)
  0x00007f89ea1dcf6e: mov    %r9,0x30(%rsp)
  0x00007f89ea1dcf73: mov    %r10,0x28(%rsp)
  0x00007f89ea1dcf78: mov    %r11,0x20(%rsp)
  0x00007f89ea1dcf7d: mov    %r12,0x18(%rsp)
  0x00007f89ea1dcf82: mov    %r13,0x10(%rsp)
  0x00007f89ea1dcf87: mov    %r14,0x8(%rsp)
  0x00007f89ea1dcf8c: mov    %r15,(%rsp)
  0x00007f89ea1dcf90: sub    $0x8,%rsp
  0x00007f89ea1dcf94: fstpl  (%rsp)
  0x00007f89ea1dcf97: movsd  (%rsp),%xmm0
  0x00007f89ea1dcf9c: test   $0xf,%esp
  0x00007f89ea1dcfa2: je     Stub::sin+169 0x0x7f89ea1dcfba
  0x00007f89ea1dcfa8: sub    $0x8,%rsp
  0x00007f89ea1dcfac: callq  0x00007f89eea45d76
  0x00007f89ea1dcfb1: add    $0x8,%rsp
  0x00007f89ea1dcfb5: jmpq   Stub::sin+174 0x0x7f89ea1dcfbf
  0x00007f89ea1dcfba: callq  0x00007f89eea45d76
  0x00007f89ea1dcfbf: movsd  %xmm0,(%rsp)
  0x00007f89ea1dcfc4: fldl   (%rsp)
  0x00007f89ea1dcfc7: add    $0x8,%rsp
  0x00007f89ea1dcfcb: mov    (%rsp),%r15
  0x00007f89ea1dcfcf: mov    0x8(%rsp),%r14
  0x00007f89ea1dcfd4: mov    0x10(%rsp),%r13
  0x00007f89ea1dcfd9: mov    0x18(%rsp),%r12
  0x00007f89ea1dcfde: mov    0x20(%rsp),%r11
  0x00007f89ea1dcfe3: mov    0x28(%rsp),%r10
  0x00007f89ea1dcfe8: mov    0x30(%rsp),%r9
  0x00007f89ea1dcfed: mov    0x38(%rsp),%r8
  0x00007f89ea1dcff2: mov    0x40(%rsp),%rdi
  0x00007f89ea1dcff7: mov    0x48(%rsp),%rsi
  0x00007f89ea1dcffc: mov    0x50(%rsp),%rbp
  0x00007f89ea1dd001: mov    0x60(%rsp),%rbx
  0x00007f89ea1dd006: mov    0x68(%rsp),%rdx
  0x00007f89ea1dd00b: mov    0x70(%rsp),%rcx
  0x00007f89ea1dd010: mov    0x78(%rsp),%rax
  0x00007f89ea1dd015: add    $0x80,%rsp
  0x00007f89ea1dd01c: fstpl  (%rsp)
  0x00007f89ea1dd01f: movsd  (%rsp),%xmm0
  0x00007f89ea1dd024: add    $0x8,%rsp
  0x00007f89ea1dd028: retq   

in the fastest case, the code above boils down to:
StubRoutines::sin [0x00007f89ea1dcf11, 0x00007f89ea1dd029[ (280 bytes)
[Disassembling for mach='i386:x86-64']
  0x00007f89ea1dcf11: sub    $0x8,%rsp
  0x00007f89ea1dcf15: movsd  %xmm0,(%rsp)
  0x00007f89ea1dcf1a: fldl   (%rsp)
  0x00007f89ea1dcf1d: fldl   0x496451d(%rip)        # 0x00007f89eeb41440
  0x00007f89ea1dcf23: fld    %st(1)
  0x00007f89ea1dcf25: fabs   
  0x00007f89ea1dcf27: fucomip %st(1),%st
  0x00007f89ea1dcf29: ffree  %st(0)
  0x00007f89ea1dcf2b: fincstp 
  0x00007f89ea1dcf2d: ja     Stub::sin+41 0x0x7f89ea1dcf3a
  0x00007f89ea1dcf33: fsin   
  0x00007f89ea1dcf35: jmpq   Stub::sin+267 0x0x7f89ea1dd01c
# ...
  0x00007f89ea1dd01c: fstpl  (%rsp)
  0x00007f89ea1dd01f: movsd  (%rsp),%xmm0
  0x00007f89ea1dd024: add    $0x8,%rsp
  0x00007f89ea1dd028: retq   

even better, it doesn't have to invoke a stub all the time; the code may be inlined to the call site, resulting in code like this:
  0x00007f89ea1fa92b: sub    $0x8,%rsp
  0x00007f89ea1fa92f: movsd  %xmm0,(%rsp)
  0x00007f89ea1fa934: data16
  0x00007f89ea1fa935: fldl   (%rsp)
  0x00007f89ea1fa938: fsin   
  0x00007f89ea1fa93a: fstpl  0x0(%rsp)
  0x00007f89ea1fa93e: movsd  (%rsp),%xmm0
  0x00007f89ea1fa943: add    $0x8,%rsp

which isn't as slow as you might guess it is.
0 请登录后投票
   发表时间:2011-05-24  
LZ无聊,LX跟着更加无聊......
0 请登录后投票
   发表时间:2011-05-24  
starry198804265811 写道

4 测试结果

  我的软硬件环境:

  硬件配置:AMD AthlonII X3 435 2.89GHz; 2GB DDR3内存;WD 500G硬盘;

  软件环境:Windows7 旗舰版;Visual C++ 2008;SUN jdk1.6.0_21.

 

你们也是AMD三核CPU?

你们的电脑也是王志配的吗?

0 请登录后投票
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics