论坛首页 Java企业应用论坛

我所做的Java和C++性能测试

浏览 24559 次
精华帖 (2) :: 良好帖 (2) :: 新手帖 (11) :: 隐藏帖 (7)
作者 正文
   发表时间:2011-05-26   最后修改:2011-05-26
main.o:     file format pe-i386


Disassembly of section .text:

00000000 <__ZN4TestC1Ev>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	8b 45 08             	mov    0x8(%ebp),%eax
   6:	c7 00 00 00 00 00    	movl   $0x0,(%eax)
   c:	8b 4d 08             	mov    0x8(%ebp),%ecx
   f:	b8 00 00 00 00       	mov    $0x0,%eax
  14:	ba 00 00 00 00       	mov    $0x0,%edx
  19:	89 41 08             	mov    %eax,0x8(%ecx)
  1c:	89 51 0c             	mov    %edx,0xc(%ecx)
  1f:	c9                   	leave  
  20:	c3                   	ret    
  21:	90                   	nop

00000022 <__ZN4Test7testIntEv>:
  22:	55                   	push   %ebp
  23:	89 e5                	mov    %esp,%ebp
  25:	53                   	push   %ebx
  26:	83 ec 10             	sub    $0x10,%esp
  29:	c7 45 f8 01 00 00 00 	movl   $0x1,-0x8(%ebp)
  30:	eb 39                	jmp    6b <__ZN4Test7testIntEv+0x49>
  32:	8b 55 f8             	mov    -0x8(%ebp),%edx
  35:	89 d0                	mov    %edx,%eax
  37:	c1 e0 03             	shl    $0x3,%eax
  3a:	29 d0                	sub    %edx,%eax
  3c:	f7 d0                	not    %eax
  3e:	05 63 09 00 00       	add    $0x963,%eax
  43:	89 c3                	mov    %eax,%ebx
  45:	2b 5d f8             	sub    -0x8(%ebp),%ebx
  48:	8b 4d f8             	mov    -0x8(%ebp),%ecx
  4b:	ba 56 55 55 55       	mov    $0x55555556,%edx
  50:	89 c8                	mov    %ecx,%eax
  52:	f7 ea                	imul   %edx
  54:	89 c8                	mov    %ecx,%eax
  56:	c1 f8 1f             	sar    $0x1f,%eax
  59:	89 d1                	mov    %edx,%ecx
  5b:	29 c1                	sub    %eax,%ecx
  5d:	89 c8                	mov    %ecx,%eax
  5f:	89 da                	mov    %ebx,%edx
  61:	21 c2                	and    %eax,%edx
  63:	8b 45 08             	mov    0x8(%ebp),%eax
  66:	89 10                	mov    %edx,(%eax)
  68:	ff 45 f8             	incl   -0x8(%ebp)
  6b:	81 7d f8 40 0d 03 00 	cmpl   $0x30d40,-0x8(%ebp)
  72:	0f 9e c0             	setle  %al
  75:	84 c0                	test   %al,%al
  77:	75 b9                	jne    32 <__ZN4Test7testIntEv+0x10>
  79:	83 c4 10             	add    $0x10,%esp
  7c:	5b                   	pop    %ebx
  7d:	c9                   	leave  
  7e:	c3                   	ret    
  7f:	90                   	nop

00000080 <__ZN4Test10testDoubleEv>:
  80:	55                   	push   %ebp
  81:	89 e5                	mov    %esp,%ebp
  83:	83 ec 38             	sub    $0x38,%esp
  86:	c7 45 f4 01 00 00 00 	movl   $0x1,-0xc(%ebp)
  8d:	eb 2e                	jmp    bd <__ZN4Test10testDoubleEv+0x3d>
  8f:	8b 45 f4             	mov    -0xc(%ebp),%eax
  92:	c1 e0 02             	shl    $0x2,%eax
  95:	89 45 e4             	mov    %eax,-0x1c(%ebp)
  98:	db 45 e4             	fildl  -0x1c(%ebp)
  9b:	dd 05 28 00 00 00    	fldl   0x28
  a1:	de c1                	faddp  %st,%st(1)
  a3:	dd 5d d8             	fstpl  -0x28(%ebp)
  a6:	db 45 f4             	fildl  -0xc(%ebp)
  a9:	dd 1c 24             	fstpl  (%esp)
  ac:	e8 00 00 00 00       	call   b1 <__ZN4Test10testDoubleEv+0x31>
  b1:	dc 4d d8             	fmull  -0x28(%ebp)
  b4:	8b 45 08             	mov    0x8(%ebp),%eax
  b7:	dd 58 08             	fstpl  0x8(%eax)
  ba:	ff 45 f4             	incl   -0xc(%ebp)
  bd:	81 7d f4 50 c3 00 00 	cmpl   $0xc350,-0xc(%ebp)
  c4:	0f 9e c0             	setle  %al
  c7:	84 c0                	test   %al,%al
  c9:	75 c4                	jne    8f <__ZN4Test10testDoubleEv+0xf>
  cb:	c9                   	leave  
  cc:	c3                   	ret    
  cd:	90                   	nop

000000ce <__ZN4Test6doTestEv>:
  ce:	55                   	push   %ebp
  cf:	89 e5                	mov    %esp,%ebp
  d1:	83 ec 18             	sub    $0x18,%esp
  d4:	8b 45 08             	mov    0x8(%ebp),%eax
  d7:	89 04 24             	mov    %eax,(%esp)
  da:	e8 43 ff ff ff       	call   22 <__ZN4Test7testIntEv>			<-------------------------------------
  df:	8b 45 08             	mov    0x8(%ebp),%eax
  e2:	89 04 24             	mov    %eax,(%esp)
  e5:	e8 96 ff ff ff       	call   80 <__ZN4Test10testDoubleEv>		<-------------------------------------
  ea:	c9                   	leave  
  eb:	c3                   	ret    

000000ec <_main>:
  ec:	55                   	push   %ebp
  ed:	89 e5                	mov    %esp,%ebp
  ef:	83 e4 f0             	and    $0xfffffff0,%esp
  f2:	53                   	push   %ebx
  f3:	83 ec 2c             	sub    $0x2c,%esp
  f6:	e8 00 00 00 00       	call   fb <_main+0xf>
  fb:	c7 44 24 04 00 00 00 	movl   $0x0,0x4(%esp)
 102:	00 
 103:	c7 04 24 00 00 00 00 	movl   $0x0,(%esp)
 10a:	e8 00 00 00 00       	call   10f <_main+0x23>
 10f:	c7 44 24 04 00 00 00 	movl   $0x0,0x4(%esp)
 116:	00 
 117:	89 04 24             	mov    %eax,(%esp)
 11a:	e8 00 00 00 00       	call   11f <_main+0x33>
 11f:	e8 00 00 00 00       	call   124 <_main+0x38>
 124:	89 44 24 18          	mov    %eax,0x18(%esp)
 128:	c7 04 24 10 00 00 00 	movl   $0x10,(%esp)
 12f:	e8 00 00 00 00       	call   134 <_main+0x48>
 134:	89 c3                	mov    %eax,%ebx
 136:	89 d8                	mov    %ebx,%eax
 138:	89 04 24             	mov    %eax,(%esp)
 13b:	e8 c0 fe ff ff       	call   0 <__ZN4TestC1Ev>
 140:	89 5c 24 14          	mov    %ebx,0x14(%esp)
 144:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp)
 14b:	00 
 14c:	eb 10                	jmp    15e <_main+0x72>
 14e:	8b 44 24 14          	mov    0x14(%esp),%eax
 152:	89 04 24             	mov    %eax,(%esp)
 155:	e8 74 ff ff ff       	call   ce <__ZN4Test6doTestEv>     <-------------------------------------
 15a:	ff 44 24 1c          	incl   0x1c(%esp)
 15e:	81 7c 24 1c 0f 27 00 	cmpl   $0x270f,0x1c(%esp)
 165:	00 
 166:	0f 9e c0             	setle  %al
 169:	84 c0                	test   %al,%al
 16b:	75 e1                	jne    14e <_main+0x62>


000001fd <___tcf_0>:
 1fd:	55                   	push   %ebp


00000211 <__Z41__static_initialization_and_destruction_0ii>:
 211:	55                   	push   %ebp


00000240 <__GLOBAL__I__ZN4TestC2Ev>:
 240:	55                   	push   %ebp
 

 

 

这是在无父类FOO时O0的汇编,注意到箭头处的call都是静态的绝对地址,因此当O3优化时能够将call处这些静态地址用内联替换

 

0 请登录后投票
   发表时间:2011-05-26  
还是很有意思的。学习了
0 请登录后投票
   发表时间:2011-05-28  
jellyfish 写道
That reference is really a wild guess. NO, it's not.

I apologize for the wild guess. My bad.

jellyfish 写道
http://blogs.oracle.com/jag/entry/transcendental_meditation

Yes, this is a nice post that describes exactly what's going on in the HotSpot VM. In fact, if you read the assembler code I posted on page 1 of this thread, that's the actual implementation used in the x86-64 version of the HotSpot VM: it does a range check, and then either uses fsin directly if the value is within the range [-pi/4, pi/4], or otherwise does argument reduction first and then uses fsin.

jellyfish 写道
a simple google on "java sin cos slow" can generate a lot of interesting entries, especially in the game arena, so it should be classified as "well known".

Yeah, you can search Google and get a lot of nonsense as well, that's not convincing enough. All we need is solid evidence, which is easier to get by reading source code and running "macro benchmarks" instead of searching Google.

jellyfish 写道
While I am saying java sin() is slower than C version, I am not saying in general java is slow, did I?

No, you didn't. I missed the point, my bad.

jellyfish 写道
In fact, if you can make java sin() as closely fast as C, I would take it. I did quite some coding on how to make those special functions as fast as possible, such as gamma and log gamma. It's just hard. It's so hard that sometimes people accept the inaccuracy as the cost.

"As fast as C" -- that's not hard, if you can accept giving up Java semantics for floating point arithmetic, and instead put the C implementation into a JVM, which is quite easy.
The post you quoted from James Gosling was telling you how sin() and cos() are implemented in HotSpot. But that's their way of doing it, for a tradeoff between performance and Java conformance.

Pick a "sin() in C" implementation that satisfies you, and replace the sin() implementation in HotSpot, and there you've got what you want.
sin() and cos() are intrinsic functions in HotSpot; calling them wouldn't incur any JNI invocation overhead -- JNI would be too slow for this kind of stuff.
If you'd like to give this a shot, a few places to look at would be:
hotspot/src/share/vm/classfile/vmSymbols.hpp
  do_intrinsic(_dsin,                     java_lang_Math,         sin_name,   double_double_signature,           F_S)   \

that's where java.lang.Math.sin() gets declared as an intrinsic function.

hotspot/src/share/vm/runtime/sharedRuntimeTrig.cpp
//----------------------------------------------------------------------
//
// Routines for new sin/cos implementation
//
//----------------------------------------------------------------------

/* sin(x)
 * Return sine function of x.
 *
 * kernel function:
 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
 *      __ieee754_rem_pio2      ... argument reduction routine
 *
 * Method.
 *      Let S,C and T denote the sin, cos and tan respectively on
 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 *      in [-pi/4 , +pi/4], and let n = k mod 4.
 *      We have
 *
 *          n        sin(x)      cos(x)        tan(x)
 *     ----------------------------------------------------------
 *          0          S           C             T
 *          1          C          -S            -1/T
 *          2         -S          -C             T
 *          3         -C           S            -1/T
 *     ----------------------------------------------------------
 *
 * Special cases:
 *      Let trig be any of sin, cos, or tan.
 *      trig(+-INF)  is NaN, with signals;
 *      trig(NaN)    is that NaN;
 *
 * Accuracy:
 *      TRIG(x) returns trig(x) nearly rounded
 */

JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
  double y[2],z=0.0;
  int n, ix;

  /* High word of x. */
  ix = __HI(x);

  /* |x| ~< pi/4 */
  ix &= 0x7fffffff;
  if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);

  /* sin(Inf or NaN) is NaN */
  else if (ix>=0x7ff00000) return x-x;

  /* argument reduction needed */
  else {
    n = __ieee754_rem_pio2(x,y);
    switch(n&3) {
    case 0: return  __kernel_sin(y[0],y[1],1);
    case 1: return  __kernel_cos(y[0],y[1]);
    case 2: return -__kernel_sin(y[0],y[1],1);
    default:
      return -__kernel_cos(y[0],y[1]);
    }
  }
JRT_END

the general/slow-path implementation

hotspot/src/cpu/x86/vm/assembler_x86.cpp
// Emits the code for the sin/cos/tan intrinsic selected by `trig`
// ('s', 'c' or 't'; any other value hits the "bad intrinsic" assert).
// Per the stack comments below, the operand X is on the x87 stack on entry;
// the result is on top of the x87 stack (F-TOS) when control reaches `done`.
//
// Two paths are emitted:
//   - fast path (only when reachable(pi4_adr) holds): if |X| <= pi/4, apply
//     fsin/fcos/ftan directly;
//   - slow path: call SharedRuntime::dsin/dcos/dtan, with pusha()/popa()
//     around the call and, when num_fpu_regs_in_use > 1, the in-use x87
//     registers spilled to the stack and reloaded afterwards.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  // If pi_4 is not reachable, no fast-path code is emitted at all and
  // execution falls straight into the slow case bound below.
  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);
  // Preserve registers across runtime call
  pusha();
  // Stays -1 unless the FPU stack is spilled below; only read under the same
  // num_fpu_regs_in_use > 1 condition.
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin and dcos into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin or dcos.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    // The first value stored (the incoming argument, which was on top of
    // the FPU stack) now sits in the highest of the spill slots; reload it
    // so it can be passed to the runtime call.
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    fld_d(Address(rsp, incoming_argument_and_return_value_offset));
  }
  // Store the argument from the FPU stack into a fresh stack slot.
  subptr(rsp, sizeof(jdouble));
  fstp_d(Address(rsp, 0));
#ifdef _LP64
  // NOTE(review): presumably the 64-bit runtime routine expects its double
  // argument in xmm0 -- confirm against the calling convention.
  movdbl(xmm0, Address(rsp, 0));
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level
  switch(trig) {
  case 's':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
    }
    break;
  case 'c':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
    }
    break;
  case 't':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }
#ifdef _LP64
    // Move xmm0 back onto the FPU stack through the argument slot.
    // NOTE(review): presumably xmm0 holds the call's return value here, and
    // on 32-bit the result is already on the FPU stack -- confirm.
    movsd(Address(rsp, 0), xmm0);
    fld_d(Address(rsp, 0));
#endif // _LP64
  // Release the argument slot.
  addptr(rsp, sizeof(jdouble));
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU stack
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    // Reload the spill slots in reverse order of the stores above; the slot
    // that held the incoming argument now holds the result and is loaded
    // last, so the result ends up on top of the FPU stack.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
  }
  popa();

  // Come here with result in F-TOS
  bind(done);

  // Restore rbx if it was pushed as the fcmp temporary at the top.
  if (tmp != noreg) {
    pop(tmp);
  }
}

a specialized version on x86

hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
    {
      // Stub for the sin intrinsic: the current code position is recorded in
      // StubRoutines::_intrinsic_sin as a double(*)(double) entry point.
      StubCodeMark mark(this, "StubRoutines", "sin");
      StubRoutines::_intrinsic_sin = (double (*)(double)) __ pc();

      // Copy xmm0 onto the x87 stack through an 8-byte stack slot.
      __ subq(rsp, 8);
      __ movdbl(Address(rsp, 0), xmm0);
      __ fld_d(Address(rsp, 0));
      // Emit the sin code; only the selector is passed here, so the second
      // trigfunc parameter must have a default declared elsewhere.
      __ trigfunc('s');
      // Copy the x87 top-of-stack back into xmm0 through the same slot,
      // then release the slot and return.
      __ fstp_d(Address(rsp, 0));
      __ movdbl(xmm0, Address(rsp, 0));
      __ addq(rsp, 8);
      __ ret(0);
    }

the stub code of the intrinsic sin() on x86-64

hotspot/src/share/vm/opto/library_call.cpp
//------------------------------inline_trig----------------------------------
// Inline sin/cos/tan instructions, if possible.  If rounding is required, do
// argument reduction which will turn into a fast/slow diamond.
//
// id selects the intrinsic (_dsin, _dcos or _dtan); any other id trips the
// assert and returns false.  Otherwise the result node is pushed back on the
// JVM stack and true is returned.
bool LibraryCallKit::inline_trig(vmIntrinsics::ID id) {
  _sp += arg_size();            // restore stack pointer
  Node* arg = pop_math_arg();
  Node* trig = NULL;

  // Build the SinD/CosD/TanD node for the argument; this is the fast-path
  // result (and the only result when no explicit rounding is required).
  switch (id) {
  case vmIntrinsics::_dsin:
    trig = _gvn.transform((Node*)new (C, 2) SinDNode(arg));
    break;
  case vmIntrinsics::_dcos:
    trig = _gvn.transform((Node*)new (C, 2) CosDNode(arg));
    break;
  case vmIntrinsics::_dtan:
    trig = _gvn.transform((Node*)new (C, 2) TanDNode(arg));
    break;
  default:
    assert(false, "bad intrinsic was passed in");
    return false;
  }

  // Rounding required?  Check for argument reduction!
  if( Matcher::strict_fp_requires_explicit_rounding ) {

    // Only pi_4 is referenced below; neg_pi_4 is not used in this function.
    static const double     pi_4 =  0.7853981633974483;
    static const double neg_pi_4 = -0.7853981633974483;
    // pi/2 in 80-bit extended precision
    // static const unsigned char pi_2_bits_x[] = {0x35,0xc2,0x68,0x21,0xa2,0xda,0x0f,0xc9,0xff,0x3f,0x00,0x00,0x00,0x00,0x00,0x00};
    // -pi/2 in 80-bit extended precision
    // static const unsigned char neg_pi_2_bits_x[] = {0x35,0xc2,0x68,0x21,0xa2,0xda,0x0f,0xc9,0xff,0xbf,0x00,0x00,0x00,0x00,0x00,0x00};
    // Cutoff value for using this argument reduction technique
    //static const double    pi_2_minus_epsilon =  1.564660403643354;
    //static const double neg_pi_2_plus_epsilon = -1.564660403643354;

    // Pseudocode for sin:
    // if (x <= Math.PI / 4.0) {
    //   if (x >= -Math.PI / 4.0) return  fsin(x);
    //   if (x >= -Math.PI / 2.0) return -fcos(x + Math.PI / 2.0);
    // } else {
    //   if (x <=  Math.PI / 2.0) return  fcos(x - Math.PI / 2.0);
    // }
    // return StrictMath.sin(x);

    // Pseudocode for cos:
    // if (x <= Math.PI / 4.0) {
    //   if (x >= -Math.PI / 4.0) return  fcos(x);
    //   if (x >= -Math.PI / 2.0) return  fsin(x + Math.PI / 2.0);
    // } else {
    //   if (x <=  Math.PI / 2.0) return -fsin(x - Math.PI / 2.0);
    // }
    // return StrictMath.cos(x);

    // Actually, sticking in an 80-bit Intel value into C2 will be tough; it
    // requires a special machine instruction to load it.  Instead we'll try
    // the 'easy' case.  If we really need the extra range +/- PI/2 we'll
    // probably do the math inside the SIN encoding.

    // Make the merge point
    // (region input 1 / phi input 1 carry the slow path, phi input 2 the
    // fast path -- see the init_req calls below)
    RegionNode *r = new (C, 3) RegionNode(3);
    Node *phi = new (C, 3) PhiNode(r,Type::DOUBLE);

    // Flatten arg so we need only 1 test
    Node *abs = _gvn.transform(new (C, 2) AbsDNode(arg));
    // Node for PI/4 constant
    Node *pi4 = makecon(TypeD::make(pi_4));
    // Check PI/4 : abs(arg)
    Node *cmp = _gvn.transform(new (C, 3) CmpDNode(pi4,abs));
    // Check: If PI/4 < abs(arg) then go slow
    Node *bol = _gvn.transform( new (C, 2) BoolNode( cmp, BoolTest::lt ) );
    // Branch either way
    IfNode *iff = create_and_xform_if(control(),bol, PROB_STATIC_FREQUENT, COUNT_UNKNOWN);
    set_control(opt_iff(r,iff));

    // Set fast path result
    phi->init_req(2,trig);

    // Slow path - non-blocking leaf call
    // No default case is needed: any id other than these three already
    // returned false in the first switch.
    Node* call = NULL;
    switch (id) {
    case vmIntrinsics::_dsin:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dsin),
                               "Sin", NULL, arg, top());
      break;
    case vmIntrinsics::_dcos:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dcos),
                               "Cos", NULL, arg, top());
      break;
    case vmIntrinsics::_dtan:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dtan),
                               "Tan", NULL, arg, top());
      break;
    }
    assert(control()->in(0) == call, "");
    // Projection of the call that is used as the slow-path value.
    Node* slow_result = _gvn.transform(new (C, 1) ProjNode(call,TypeFunc::Parms));
    r->init_req(1,control());
    phi->init_req(1,slow_result);

    // Post-merge
    set_control(_gvn.transform(r));
    record_for_igvn(r);
    // From here on the result is the merged fast/slow value.
    trig = _gvn.transform(phi);

    C->set_has_split_ifs(true); // Has chance for split-if optimization
  }
  // Push result back on JVM stack
  push_pair(trig);
  return true;
}

the inlined version in HotSpot server compiler

It's important so I'm gonna say it twice: if you're willing to make a different choice on the tradeoff between performance and Java conformance, just tweak the code above, and you'll get what you want. The performance won't be that much different from a C implementation if you choose the same tradeoffs.

jellyfish 写道
I've done a lot of performance tuning as well, and have seen so many cases of premature optimization. The most common case is that people don't understand the problem itself and still try to optimize/profile it.

Bad microbenchmarks contribute to the "common case" you're talking about, don't you agree?
0 请登录后投票
   发表时间:2011-05-28  
看来ls吊完几瓶点滴又原地满状态复活了。
0 请登录后投票
   发表时间:2011-05-29  
JNI调用需要内存复制,会慢很多的
0 请登录后投票
   发表时间:2011-05-31  
看了你的代码,你这样对比测试得出的结论不能说明什么问题。
如果把java代码优化一下下,速度会提升会超出你的预料。不同的JVM运算速度的差别很大。
0 请登录后投票
   发表时间:2011-09-20  
eisenwolf 写道
JDK自从升级到1.6之后,速度继续翻番。我个人觉得C++只适合一些专业领域了……但是它又没有C的速度快,所以生存空间只会越来越小……


据我有限的经验。大部分人写的c程序未必能快得过c++风格的程序。

0 请登录后投票
   发表时间:2011-09-23  
C++的性能是会比JAVA高。可是这也差太多了吧。各种因素造成的。
0 请登录后投票
   发表时间:2011-09-26  
c/c++直接编译成汇编到二进制,这个都是可见的,但是java不一样,它会编译成自己优化过的bytecode,类似于汇编,这个bytecode也是可见的,但是jvm如何执行这个,编译成机器执行的二进制后咋样,这个就不知道了。所以测试也只能看出差距,但不知道为什么会有差距,如何优化,我们该做的也就是优化bytecode了。还要继续学习...
0 请登录后投票
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics