   发表时间:2011-05-26   最后修改:2011-05-26
main.o:     file format pe-i386

Disassembly of section .text:

00000000 <__ZN4TestC1Ev>:
   0:	55                   	push   %ebp
   1:	89 e5                	mov    %esp,%ebp
   3:	8b 45 08             	mov    0x8(%ebp),%eax
   6:	c7 00 00 00 00 00    	movl   $0x0,(%eax)
   c:	8b 4d 08             	mov    0x8(%ebp),%ecx
   f:	b8 00 00 00 00       	mov    $0x0,%eax
  14:	ba 00 00 00 00       	mov    $0x0,%edx
  19:	89 41 08             	mov    %eax,0x8(%ecx)
  1c:	89 51 0c             	mov    %edx,0xc(%ecx)
  1f:	c9                   	leave  
  20:	c3                   	ret    
  21:	90                   	nop

00000022 <__ZN4Test7testIntEv>:
  22:	55                   	push   %ebp
  23:	89 e5                	mov    %esp,%ebp
  25:	53                   	push   %ebx
  26:	83 ec 10             	sub    $0x10,%esp
  29:	c7 45 f8 01 00 00 00 	movl   $0x1,-0x8(%ebp)
  30:	eb 39                	jmp    6b <__ZN4Test7testIntEv+0x49>
  32:	8b 55 f8             	mov    -0x8(%ebp),%edx
  35:	89 d0                	mov    %edx,%eax
  37:	c1 e0 03             	shl    $0x3,%eax
  3a:	29 d0                	sub    %edx,%eax
  3c:	f7 d0                	not    %eax
  3e:	05 63 09 00 00       	add    $0x963,%eax
  43:	89 c3                	mov    %eax,%ebx
  45:	2b 5d f8             	sub    -0x8(%ebp),%ebx
  48:	8b 4d f8             	mov    -0x8(%ebp),%ecx
  4b:	ba 56 55 55 55       	mov    $0x55555556,%edx
  50:	89 c8                	mov    %ecx,%eax
  52:	f7 ea                	imul   %edx
  54:	89 c8                	mov    %ecx,%eax
  56:	c1 f8 1f             	sar    $0x1f,%eax
  59:	89 d1                	mov    %edx,%ecx
  5b:	29 c1                	sub    %eax,%ecx
  5d:	89 c8                	mov    %ecx,%eax
  5f:	89 da                	mov    %ebx,%edx
  61:	21 c2                	and    %eax,%edx
  63:	8b 45 08             	mov    0x8(%ebp),%eax
  66:	89 10                	mov    %edx,(%eax)
  68:	ff 45 f8             	incl   -0x8(%ebp)
  6b:	81 7d f8 40 0d 03 00 	cmpl   $0x30d40,-0x8(%ebp)
  72:	0f 9e c0             	setle  %al
  75:	84 c0                	test   %al,%al
  77:	75 b9                	jne    32 <__ZN4Test7testIntEv+0x10>
  79:	83 c4 10             	add    $0x10,%esp
  7c:	5b                   	pop    %ebx
  7d:	c9                   	leave  
  7e:	c3                   	ret    
  7f:	90                   	nop

00000080 <__ZN4Test10testDoubleEv>:
  80:	55                   	push   %ebp
  81:	89 e5                	mov    %esp,%ebp
  83:	83 ec 38             	sub    $0x38,%esp
  86:	c7 45 f4 01 00 00 00 	movl   $0x1,-0xc(%ebp)
  8d:	eb 2e                	jmp    bd <__ZN4Test10testDoubleEv+0x3d>
  8f:	8b 45 f4             	mov    -0xc(%ebp),%eax
  92:	c1 e0 02             	shl    $0x2,%eax
  95:	89 45 e4             	mov    %eax,-0x1c(%ebp)
  98:	db 45 e4             	fildl  -0x1c(%ebp)
  9b:	dd 05 28 00 00 00    	fldl   0x28
  a1:	de c1                	faddp  %st,%st(1)
  a3:	dd 5d d8             	fstpl  -0x28(%ebp)
  a6:	db 45 f4             	fildl  -0xc(%ebp)
  a9:	dd 1c 24             	fstpl  (%esp)
  ac:	e8 00 00 00 00       	call   b1 <__ZN4Test10testDoubleEv+0x31>
  b1:	dc 4d d8             	fmull  -0x28(%ebp)
  b4:	8b 45 08             	mov    0x8(%ebp),%eax
  b7:	dd 58 08             	fstpl  0x8(%eax)
  ba:	ff 45 f4             	incl   -0xc(%ebp)
  bd:	81 7d f4 50 c3 00 00 	cmpl   $0xc350,-0xc(%ebp)
  c4:	0f 9e c0             	setle  %al
  c7:	84 c0                	test   %al,%al
  c9:	75 c4                	jne    8f <__ZN4Test10testDoubleEv+0xf>
  cb:	c9                   	leave  
  cc:	c3                   	ret    
  cd:	90                   	nop

000000ce <__ZN4Test6doTestEv>:
  ce:	55                   	push   %ebp
  cf:	89 e5                	mov    %esp,%ebp
  d1:	83 ec 18             	sub    $0x18,%esp
  d4:	8b 45 08             	mov    0x8(%ebp),%eax
  d7:	89 04 24             	mov    %eax,(%esp)
  da:	e8 43 ff ff ff       	call   22 <__ZN4Test7testIntEv>			<-------------------------------------
  df:	8b 45 08             	mov    0x8(%ebp),%eax
  e2:	89 04 24             	mov    %eax,(%esp)
  e5:	e8 96 ff ff ff       	call   80 <__ZN4Test10testDoubleEv>		<-------------------------------------
  ea:	c9                   	leave  
  eb:	c3                   	ret    

000000ec <_main>:
  ec:	55                   	push   %ebp
  ed:	89 e5                	mov    %esp,%ebp
  ef:	83 e4 f0             	and    $0xfffffff0,%esp
  f2:	53                   	push   %ebx
  f3:	83 ec 2c             	sub    $0x2c,%esp
  f6:	e8 00 00 00 00       	call   fb <_main+0xf>
  fb:	c7 44 24 04 00 00 00 	movl   $0x0,0x4(%esp)
 102:	00 
 103:	c7 04 24 00 00 00 00 	movl   $0x0,(%esp)
 10a:	e8 00 00 00 00       	call   10f <_main+0x23>
 10f:	c7 44 24 04 00 00 00 	movl   $0x0,0x4(%esp)
 116:	00 
 117:	89 04 24             	mov    %eax,(%esp)
 11a:	e8 00 00 00 00       	call   11f <_main+0x33>
 11f:	e8 00 00 00 00       	call   124 <_main+0x38>
 124:	89 44 24 18          	mov    %eax,0x18(%esp)
 128:	c7 04 24 10 00 00 00 	movl   $0x10,(%esp)
 12f:	e8 00 00 00 00       	call   134 <_main+0x48>
 134:	89 c3                	mov    %eax,%ebx
 136:	89 d8                	mov    %ebx,%eax
 138:	89 04 24             	mov    %eax,(%esp)
 13b:	e8 c0 fe ff ff       	call   0 <__ZN4TestC1Ev>
 140:	89 5c 24 14          	mov    %ebx,0x14(%esp)
 144:	c7 44 24 1c 00 00 00 	movl   $0x0,0x1c(%esp)
 14b:	00 
 14c:	eb 10                	jmp    15e <_main+0x72>
 14e:	8b 44 24 14          	mov    0x14(%esp),%eax
 152:	89 04 24             	mov    %eax,(%esp)
 155:	e8 74 ff ff ff       	call   ce <__ZN4Test6doTestEv>     <-------------------------------------
 15a:	ff 44 24 1c          	incl   0x1c(%esp)
 15e:	81 7c 24 1c 0f 27 00 	cmpl   $0x270f,0x1c(%esp)
 165:	00 
 166:	0f 9e c0             	setle  %al
 169:	84 c0                	test   %al,%al
 16b:	75 e1                	jne    14e <_main+0x62>

000001fd <___tcf_0>:
 1fd:	55                   	push   %ebp

00000211 <__Z41__static_initialization_and_destruction_0ii>:
 211:	55                   	push   %ebp

00000240 <__GLOBAL__I__ZN4TestC2Ev>:
 240:	55                   	push   %ebp





jellyfish 写道
That reference is really a wild guess. NO, it's not.

I apologize for the wild guess. My bad.

jellyfish 写道

Yes, this is a nice post that describes exactly what's going on in the HotSpot VM. In fact, if you read the assembler code I posted on page 1 of this thread, that's the actual implementation used in the x86-64 version of the HotSpot VM: it does a range check, and then either uses fsin directly if the value is within the range [-pi/4, pi/4], or otherwise does argument reduction first and then uses fsin.

jellyfish 写道
a simple google on "java sin cos slow" can generate a lot of interesting entries, especially in the game arena, so it should be classified as "well known".

Yeah, you can search Google and get a lot of nonsense as well, so that alone is not convincing enough. What we need is solid evidence, which is easier to get by reading the source code and running "macro benchmarks" than by searching Google.

jellyfish 写道
While I am saying java sin() is slower than C version, I am not saying in general java is slow, did I?

No, you didn't. I missed the point, my bad.

jellyfish 写道
In fact, if you can make java sin() as closely fast as C, I would take it. I did quite some coding on how to make those special functions as fast as possible, such as gamma and log gamma. It's just hard. It's so hard that sometimes people accept the inaccuracy as the cost.

"As fast as C" -- that's not hard, if you can accept giving up Java semantics for floating point arithmetic, and instead put the C implementation into a JVM, which is quite easy.
The post you quoted from James Gosling was telling you how sin() and cos() are implemented in HotSpot. But that's their way of doing it, for a tradeoff between performance and Java conformance.

Pick a "sin() in C" implementation that satisfies you, and replace the sin() implementation in HotSpot, and there you've got what you want.
sin() and cos() are intrinsic functions in HotSpot; calling them wouldn't incur any JNI invocation overhead -- JNI would be too slow for this kind of stuff.
If you'd like to give this a shot, a few of the places to look at would be:
  do_intrinsic(_dsin,                     java_lang_Math,         sin_name,   double_double_signature,           F_S)   \

that's where java.lang.Math.sin() gets declared as an intrinsic function.

// Routines for new sin/cos implementation

/* sin(x)
 * Return sine function of x.
 * kernel function:
 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
 *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
 *      __ieee754_rem_pio2      ... argument reduction routine
 * Method.
 *      Let S,C and T denote the sin, cos and tan respectively on
 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
 *      in [-pi/4 , +pi/4], and let n = k mod 4.
 *      We have
 *          n        sin(x)      cos(x)        tan(x)
 *     ----------------------------------------------------------
 *          0          S           C             T
 *          1          C          -S            -1/T
 *          2         -S          -C             T
 *          3         -C           S            -1/T
 *     ----------------------------------------------------------
 * Special cases:
 *      Let trig be any of sin, cos, or tan.
 *      trig(+-INF)  is NaN, with signals;
 *      trig(NaN)    is that NaN;
 * Accuracy:
 *      TRIG(x) returns trig(x) nearly rounded

JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
  double y[2],z=0.0;
  int n, ix;

  /* High word of x. */
  ix = __HI(x);

  /* |x| ~< pi/4 */
  ix &= 0x7fffffff;
  if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);

  /* sin(Inf or NaN) is NaN */
  else if (ix>=0x7ff00000) return x-x;

  /* argument reduction needed */
  else {
    n = __ieee754_rem_pio2(x,y);
    switch(n&3) {
    case 0: return  __kernel_sin(y[0],y[1],1);
    case 1: return  __kernel_cos(y[0],y[1]);
    case 2: return -__kernel_sin(y[0],y[1],1);
      return -__kernel_cos(y[0],y[1]);

the general/slow-path implementation

void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
    case 'c':
    case 't':
      assert(false, "bad intrinsic");

  // slow case: runtime call
  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin and dcos into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    // NOTE that in this case we also push the incoming argument to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin or dcos.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    fld_d(Address(rsp, incoming_argument_and_return_value_offset));
  subptr(rsp, sizeof(jdouble));
  fstp_d(Address(rsp, 0));
#ifdef _LP64
  movdbl(xmm0, Address(rsp, 0));
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level
  switch(trig) {
  case 's':
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
  case 'c':
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
  case 't':
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
    assert(false, "bad intrinsic");
#ifdef _LP64
    movsd(Address(rsp, 0), xmm0);
    fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble));
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU stack
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));

  // Come here with result in F-TOS

  if (tmp != noreg) {

a specialized version on x86

      StubCodeMark mark(this, "StubRoutines", "sin");
      StubRoutines::_intrinsic_sin = (double (*)(double)) __ pc();

      __ subq(rsp, 8);
      __ movdbl(Address(rsp, 0), xmm0);
      __ fld_d(Address(rsp, 0));
      __ trigfunc('s');
      __ fstp_d(Address(rsp, 0));
      __ movdbl(xmm0, Address(rsp, 0));
      __ addq(rsp, 8);
      __ ret(0);

the stub code of the intrinsic sin() on x86-64

// Inline sin/cos/tan instructions, if possible.  If rounding is required, do
// argument reduction which will turn into a fast/slow diamond.
bool LibraryCallKit::inline_trig(vmIntrinsics::ID id) {
  _sp += arg_size();            // restore stack pointer
  Node* arg = pop_math_arg();
  Node* trig = NULL;

  switch (id) {
  case vmIntrinsics::_dsin:
    trig = _gvn.transform((Node*)new (C, 2) SinDNode(arg));
  case vmIntrinsics::_dcos:
    trig = _gvn.transform((Node*)new (C, 2) CosDNode(arg));
  case vmIntrinsics::_dtan:
    trig = _gvn.transform((Node*)new (C, 2) TanDNode(arg));
    assert(false, "bad intrinsic was passed in");
    return false;

  // Rounding required?  Check for argument reduction!
  if( Matcher::strict_fp_requires_explicit_rounding ) {

    static const double     pi_4 =  0.7853981633974483;
    static const double neg_pi_4 = -0.7853981633974483;
    // pi/2 in 80-bit extended precision
    // static const unsigned char pi_2_bits_x[] = {0x35,0xc2,0x68,0x21,0xa2,0xda,0x0f,0xc9,0xff,0x3f,0x00,0x00,0x00,0x00,0x00,0x00};
    // -pi/2 in 80-bit extended precision
    // static const unsigned char neg_pi_2_bits_x[] = {0x35,0xc2,0x68,0x21,0xa2,0xda,0x0f,0xc9,0xff,0xbf,0x00,0x00,0x00,0x00,0x00,0x00};
    // Cutoff value for using this argument reduction technique
    //static const double    pi_2_minus_epsilon =  1.564660403643354;
    //static const double neg_pi_2_plus_epsilon = -1.564660403643354;

    // Pseudocode for sin:
    // if (x <= Math.PI / 4.0) {
    //   if (x >= -Math.PI / 4.0) return  fsin(x);
    //   if (x >= -Math.PI / 2.0) return -fcos(x + Math.PI / 2.0);
    // } else {
    //   if (x <=  Math.PI / 2.0) return  fcos(x - Math.PI / 2.0);
    // }
    // return StrictMath.sin(x);

    // Pseudocode for cos:
    // if (x <= Math.PI / 4.0) {
    //   if (x >= -Math.PI / 4.0) return  fcos(x);
    //   if (x >= -Math.PI / 2.0) return  fsin(x + Math.PI / 2.0);
    // } else {
    //   if (x <=  Math.PI / 2.0) return -fsin(x - Math.PI / 2.0);
    // }
    // return StrictMath.cos(x);

    // Actually, sticking in an 80-bit Intel value into C2 will be tough; it
    // requires a special machine instruction to load it.  Instead we'll try
    // the 'easy' case.  If we really need the extra range +/- PI/2 we'll
    // probably do the math inside the SIN encoding.

    // Make the merge point
    RegionNode *r = new (C, 3) RegionNode(3);
    Node *phi = new (C, 3) PhiNode(r,Type::DOUBLE);

    // Flatten arg so we need only 1 test
    Node *abs = _gvn.transform(new (C, 2) AbsDNode(arg));
    // Node for PI/4 constant
    Node *pi4 = makecon(TypeD::make(pi_4));
    // Check PI/4 : abs(arg)
    Node *cmp = _gvn.transform(new (C, 3) CmpDNode(pi4,abs));
    // Check: If PI/4 < abs(arg) then go slow
    Node *bol = _gvn.transform( new (C, 2) BoolNode( cmp, BoolTest::lt ) );
    // Branch either way
    IfNode *iff = create_and_xform_if(control(),bol, PROB_STATIC_FREQUENT, COUNT_UNKNOWN);

    // Set fast path result

    // Slow path - non-blocking leaf call
    Node* call = NULL;
    switch (id) {
    case vmIntrinsics::_dsin:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dsin),
                               "Sin", NULL, arg, top());
    case vmIntrinsics::_dcos:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dcos),
                               "Cos", NULL, arg, top());
    case vmIntrinsics::_dtan:
      call = make_runtime_call(RC_LEAF, OptoRuntime::Math_D_D_Type(),
                               CAST_FROM_FN_PTR(address, SharedRuntime::dtan),
                               "Tan", NULL, arg, top());
    assert(control()->in(0) == call, "");
    Node* slow_result = _gvn.transform(new (C, 1) ProjNode(call,TypeFunc::Parms));

    // Post-merge
    trig = _gvn.transform(phi);

    C->set_has_split_ifs(true); // Has chance for split-if optimization
  // Push result back on JVM stack
  return true;

the inlined version in HotSpot server compiler

It's important so I'm gonna say it twice: if you're willing to make a different choice on the tradeoff between performance and Java conformance, just tweak the code above, and you'll get what you want. The performance won't be that much different from a C implementation if you choose the same tradeoffs.

jellyfish 写道
I've done a lot performance tunings as well, and have seen so many cases for premature optimization. The most common case is that people don't understand the problem itself and still try to optimize/profile it.

Bad microbenchmarks contribute to the "common case" you're talking about, don't you agree?
eisenwolf 写道


