diff --git a/CMakeLists.txt b/CMakeLists.txt index e114587e8..8e8401b82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -441,7 +441,7 @@ FOREACH(opt_target ${opt_targets}) ELSE(EXISTS "$ENV{SVN_C_COMPILER}") SET(C_COMPILER ${CMAKE_C_COMPILER}) ENDIF(EXISTS "$ENV{SVN_C_COMPILER}") - ADD_CUSTOM_TARGET(regen-basic-ops-${opt_target} COMMAND ${C_COMPILER} -O2 -ftree-vectorize -ftree-vectorizer-verbose=2 -fomit-frame-pointer -c -S -I${CMAKE_SOURCE_DIR}/include -I${CMAKE_BINARY_DIR} -g0 -DBUILD_${OPT_TARGET} -m${opt_target} ${FPMATH_FLAGS} -o ${BASIC_OPS_X86_TARGET_S} ${BASIC_OPS_X86_C} DEPENDS ${BASIC_OPS_X86_C}) + ADD_CUSTOM_TARGET(regen-basic-ops-${opt_target} COMMAND ${C_COMPILER} -O2 -fno-stack-protector -ftree-vectorize -ftree-vectorizer-verbose=2 -fomit-frame-pointer -c -S -I${CMAKE_SOURCE_DIR}/include -I${CMAKE_BINARY_DIR} -g0 -DBUILD_${OPT_TARGET} -m${opt_target} ${FPMATH_FLAGS} -o ${BASIC_OPS_X86_TARGET_S} ${BASIC_OPS_X86_C} DEPENDS ${BASIC_OPS_X86_C}) ADD_CUSTOM_COMMAND(OUTPUT ${BASIC_OPS_X86_TARGET_O} COMMAND ${CMAKE_C_COMPILER} ARGS ${BASIC_OPS_X86_TARGET_S} -c -o ${BASIC_OPS_X86_TARGET_O} DEPENDS ${BASIC_OPS_X86_TARGET_S}) ADD_DEPENDENCIES(regen-basic-ops regen-basic-ops-${opt_target}) SET(opt_target_objects ${opt_target_objects} ${BASIC_OPS_X86_TARGET_O}) diff --git a/src/core/basic_ops_x86_mmx.s b/src/core/basic_ops_x86_mmx.s index 0ea75c74a..0f21c9a92 100644 --- a/src/core/basic_ops_x86_mmx.s +++ b/src/core/basic_ops_x86_mmx.s @@ -103,5 +103,5 @@ movq %mm0, 56(%edx) emms ret .size alignedMemClearMMX, .-alignedMemClearMMX - .ident "GCC: (GNU) 4.4.0 20090304 (experimental)" + .ident "GCC: (Ubuntu 4.4.0-0ubuntu2) 4.4.0" .section .note.GNU-stack,"",@progbits diff --git a/src/core/basic_ops_x86_sse.s b/src/core/basic_ops_x86_sse.s index 16cc8a239..3f72a9ccd 100644 --- a/src/core/basic_ops_x86_sse.s +++ b/src/core/basic_ops_x86_sse.s @@ -230,48 +230,42 @@ alignedBufWetDryMixSplittedSSE: pushl %edi pushl %esi pushl %ebx - subl $140, %esp - movl 180(%esp), %eax - flds 172(%esp) - movl 160(%esp), %edx - movl 164(%esp), %esi + subl $124, %esp + movl 164(%esp), %eax + movl 144(%esp), %edx + movl 148(%esp), %esi + movl 152(%esp), %ecx testl %eax, %eax - movl 168(%esp), %ecx - flds 176(%esp) - jle .L43 - movl 180(%esp), %eax + jle .L39 + movl 164(%esp), %eax subl $1, %eax shrl %eax addl $1, %eax movl %eax, %ebp - movl %eax, 120(%esp) + movl %eax, 104(%esp) shrl $2, %ebp - cmpl $3, 120(%esp) + cmpl $3, 104(%esp) leal 0(,%ebp,4), %eax - movl %eax, 124(%esp) + movl %eax, 108(%esp) jbe .L40 testl %eax, %eax jne .L34 .L40: - fxch %st(1) xorl %edi, %edi jmp .L36 .p2align 4,,7 .p2align 3 .L34: - fsts 12(%esp) - fxch %st(1) + movss 160(%esp), %xmm0 xorps %xmm7, %xmm7 - movss 12(%esp), %xmm0 movl %esi, %ebx - fsts 12(%esp) xorl %eax, %eax xorl %edi, %edi shufps $0, %xmm0, %xmm0 - movaps %xmm0, 32(%esp) - movss 12(%esp), %xmm0 - shufps $0, %xmm0, %xmm0 movaps %xmm0, 16(%esp) + movss 156(%esp), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, (%esp) .p2align 4,,7 .p2align 3 .L37: @@ -282,28 +276,28 @@ alignedBufWetDryMixSplittedSSE: shufps $136, %xmm6, %xmm0 movaps 32(%edx,%eax,2), %xmm4 shufps $221, %xmm6, %xmm5 - movaps %xmm0, 96(%esp) + movaps %xmm0, 80(%esp) movaps 48(%edx,%eax,2), %xmm3 movaps %xmm4, %xmm0 shufps $136, %xmm3, %xmm0 - movaps 96(%esp), %xmm2 + movaps 80(%esp), %xmm2 shufps $221, %xmm3, %xmm4 movaps %xmm7, %xmm6 movlps (%ebx), %xmm6 - movaps %xmm5, 80(%esp) + movaps %xmm5, 64(%esp) movhps 8(%ebx), %xmm6 shufps $136, %xmm0, %xmm2 - movaps %xmm0, 64(%esp) + movaps %xmm0, 48(%esp) movaps %xmm7, %xmm5 movaps %xmm6, %xmm0 movlps 16(%ebx), %xmm5 movhps 24(%ebx), %xmm5 shufps $136, %xmm5, %xmm0 - mulps 32(%esp), %xmm2 + mulps 16(%esp), %xmm2 shufps $221, %xmm5, %xmm6 - movaps %xmm4, 48(%esp) + movaps %xmm4, 32(%esp) addl $32, %ebx - mulps 16(%esp), %xmm0 + mulps (%esp), %xmm0 movaps %xmm7, %xmm4 movlps (%eax,%ecx), %xmm4 movaps %xmm7, %xmm3 @@ -313,21 +307,21 @@ alignedBufWetDryMixSplittedSSE: movhps 24(%ecx,%eax), %xmm3 shufps $136, %xmm3, %xmm1 addps %xmm0, %xmm2 - movaps 80(%esp), %xmm0 + movaps 64(%esp), %xmm0 shufps $221, %xmm3, %xmm4 - shufps $136, 48(%esp), %xmm0 - mulps 16(%esp), %xmm1 + shufps $136, 32(%esp), %xmm0 + mulps (%esp), %xmm1 movaps %xmm2, %xmm3 - movaps 80(%esp), %xmm5 - mulps 32(%esp), %xmm0 - shufps $221, 48(%esp), %xmm5 - mulps 16(%esp), %xmm6 + movaps 64(%esp), %xmm5 + mulps 16(%esp), %xmm0 + shufps $221, 32(%esp), %xmm5 + mulps (%esp), %xmm6 addps %xmm1, %xmm0 - movaps 96(%esp), %xmm1 - shufps $221, 64(%esp), %xmm1 - mulps 16(%esp), %xmm4 - mulps 32(%esp), %xmm1 - mulps 32(%esp), %xmm5 + movaps 80(%esp), %xmm1 + shufps $221, 48(%esp), %xmm1 + mulps (%esp), %xmm4 + mulps 16(%esp), %xmm1 + mulps 16(%esp), %xmm5 addps %xmm6, %xmm1 addps %xmm4, %xmm5 movaps %xmm0, %xmm4 @@ -348,63 +342,53 @@ alignedBufWetDryMixSplittedSSE: addl $32, %eax cmpl %edi, %ebp ja .L37 - movl 124(%esp), %edi - movl 120(%esp), %eax + movl 108(%esp), %edi + movl 104(%esp), %eax addl %edi, %edi - cmpl %eax, 124(%esp) - je .L44 + cmpl %eax, 108(%esp) + je .L39 .L36: - leal (%edx,%edi,8), %ebx + movss 156(%esp), %xmm0 xorl %ebp, %ebp - leal 8(%edx,%edi,8), %edx + movss 160(%esp), %xmm1 movl %edi, %eax + leal (%edx,%edi,8), %ebx + leal 8(%edx,%edi,8), %edx .p2align 4,,7 .p2align 3 .L38: - flds (%ebx) + movss (%esi,%eax,4), %xmm3 addl $2, %ebp - fmul %st(2), %st - flds (%esi,%eax,4) - fmul %st(2), %st - faddp %st, %st(1) - fstps (%ebx) - flds 4(%ebx) - fmul %st(2), %st - flds (%ecx,%eax,4) - fmul %st(2), %st - faddp %st, %st(1) - fstps 4(%ebx) + movss (%ebx), %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm2 + addss %xmm3, %xmm2 + movss %xmm2, (%ebx) + movss 4(%ebx), %xmm2 + movss (%ecx,%eax,4), %xmm3 + mulss %xmm1, %xmm2 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm2 + movss %xmm2, 4(%ebx) addl $16, %ebx - flds (%edx) - fmul %st(2), %st - flds 4(%esi,%eax,4) - fmul %st(2), %st - faddp %st, %st(1) - fstps (%edx) - flds 4(%edx) - fmul %st(2), %st - flds 4(%ecx,%eax,4) + movss 4(%esi,%eax,4), %xmm3 + movss (%edx), %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm2 + addss %xmm3, %xmm2 + movss %xmm2, (%edx) + movss 4(%edx), %xmm2 + movss 4(%ecx,%eax,4), %xmm3 + mulss %xmm1, %xmm2 leal (%edi,%ebp), %eax - fmul %st(2), %st - faddp %st, %st(1) - fstps 4(%edx) + mulss %xmm0, %xmm3 + addss %xmm3, %xmm2 + movss %xmm2, 4(%edx) addl $16, %edx - cmpl %eax, 180(%esp) + cmpl %eax, 164(%esp) jg .L38 - fstp %st(0) - fstp %st(0) - jmp .L39 -.L43: - fstp %st(0) - fstp %st(0) - jmp .L39 -.L44: - fstp %st(0) - fstp %st(0) - .p2align 4,,7 - .p2align 3 .L39: - addl $140, %esp + addl $124, %esp popl %ebx popl %esi popl %edi @@ -417,39 +401,34 @@ alignedBufWetDryMixSplittedSSE: unalignedBufMixLRCoeffSSE: pushl %esi pushl %ebx - subl $4, %esp - movl 32(%esp), %esi - flds 24(%esp) - movl 16(%esp), %eax - movl 20(%esp), %edx - movl %esi, %ebx - flds 28(%esp) - shrl $31, %ebx - leal (%esi,%ebx), %ecx + movl 28(%esp), %ebx + movl 12(%esp), %eax + movl 16(%esp), %edx + movss 20(%esp), %xmm1 + movl %ebx, %esi + shrl $31, %esi + leal (%ebx,%esi), %ecx andl $1, %ecx - cmpl %ebx, %ecx - jne .L54 -.L46: - testl %esi, %esi - jle .L55 - leal -1(%esi), %ebx - shrl %ebx + cmpl %esi, %ecx + movss 24(%esp), %xmm3 + jne .L52 +.L44: + testl %ebx, %ebx + jle .L49 testb $15, %al - jne .L48 - fxch %st(1) - fstps (%esp) + jne .L46 + movaps %xmm1, %xmm0 + subl $1, %ebx + unpcklps %xmm3, %xmm0 + shrl %ebx xorps %xmm2, %xmm2 - movss (%esp), %xmm0 - addl $1, %ebx - fstps (%esp) - xorl %ecx, %ecx - movss (%esp), %xmm1 - unpcklps %xmm1, %xmm0 movaps %xmm0, %xmm3 + addl $1, %ebx movlhps %xmm0, %xmm3 + xorl %ecx, %ecx .p2align 4,,7 .p2align 3 -.L49: +.L47: movaps %xmm2, %xmm1 addl $1, %ecx movlps (%edx), %xmm1 @@ -463,65 +442,53 @@ unalignedBufMixLRCoeffSSE: movaps %xmm0, (%eax) addl $16, %eax cmpl %ebx, %ecx - jb .L49 - jmp .L51 - .p2align 4,,7 - .p2align 3 -.L55: - fstp %st(0) - fstp %st(0) - .p2align 4,,7 - .p2align 3 -.L51: - addl $4, %esp + jb .L47 +.L49: popl %ebx popl %esi ret .p2align 4,,7 .p2align 3 -.L48: +.L46: xorl %ecx, %ecx .p2align 4,,7 .p2align 3 -.L50: - flds (%edx,%ecx,8) - fmul %st(2), %st - fadds (%eax,%ecx,8) - fstps (%eax,%ecx,8) - flds 4(%edx,%ecx,8) - fmul %st(1), %st - fadds 4(%eax,%ecx,8) - fstps 4(%eax,%ecx,8) - flds 8(%edx,%ecx,8) - fmul %st(2), %st - fadds 8(%eax,%ecx,8) - fstps 8(%eax,%ecx,8) - flds 12(%edx,%ecx,8) - fmul %st(1), %st - fadds 12(%eax,%ecx,8) - fstps 12(%eax,%ecx,8) +.L48: + movss (%edx,%ecx,8), %xmm0 + mulss %xmm1, %xmm0 + addss (%eax,%ecx,8), %xmm0 + movss %xmm0, (%eax,%ecx,8) + movss 4(%edx,%ecx,8), %xmm0 + mulss %xmm3, %xmm0 + addss 4(%eax,%ecx,8), %xmm0 + movss %xmm0, 4(%eax,%ecx,8) + movss 8(%edx,%ecx,8), %xmm0 + mulss %xmm1, %xmm0 + addss 8(%eax,%ecx,8), %xmm0 + movss %xmm0, 8(%eax,%ecx,8) + movss 12(%edx,%ecx,8), %xmm0 + mulss %xmm3, %xmm0 + addss 12(%eax,%ecx,8), %xmm0 + movss %xmm0, 12(%eax,%ecx,8) addl $2, %ecx - cmpl %ecx, %esi - jg .L50 - fstp %st(0) - fstp %st(0) - addl $4, %esp + cmpl %ecx, %ebx + jg .L48 popl %ebx popl %esi ret -.L54: - flds (%edx) - subl $1, %esi - fmul %st(2), %st - fadds (%eax) - fstps (%eax) - flds 4(%edx) +.L52: + movss (%edx), %xmm0 + subl $1, %ebx + mulss %xmm1, %xmm0 + addss (%eax), %xmm0 + movss %xmm0, (%eax) + movss 4(%edx), %xmm0 addl $8, %edx - fmul %st(1), %st - fadds 4(%eax) - fstps 4(%eax) + mulss %xmm3, %xmm0 + addss 4(%eax), %xmm0 + movss %xmm0, 4(%eax) addl $8, %eax - jmp .L46 + jmp .L44 .size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE - .ident "GCC: (GNU) 4.4.0 20090304 (experimental)" + .ident "GCC: (Ubuntu 4.4.0-0ubuntu2) 4.4.0" .section .note.GNU-stack,"",@progbits diff --git a/src/core/basic_ops_x86_sse2.s b/src/core/basic_ops_x86_sse2.s index fb33cc85d..aee60c234 100644 --- a/src/core/basic_ops_x86_sse2.s +++ b/src/core/basic_ops_x86_sse2.s @@ -70,19 +70,19 @@ alignedConvertToS16SSE2: pushl %ebx subl $8, %esp movl 36(%esp), %eax + movss .LC0, %xmm6 cmpb $0, 44(%esp) - flds .LC0 movl 28(%esp), %edx movl 32(%esp), %ebx movl %eax, %esi - fmuls 40(%esp) + mulss 40(%esp), %xmm6 jne .L13 testw %ax, %ax - jle .L35 + jle .L15 movl %eax, %edi shrw $2, %di cmpw $3, %ax - movw %ax, 4(%esp) + movw %ax, 2(%esp) leal 0(,%edi,4), %ebp ja .L33 .L28: @@ -98,19 +98,17 @@ alignedConvertToS16SSE2: .p2align 4,,7 .p2align 3 .L25: - flds (%edx) - fmul %st(1), %st - fstps 4(%esp) - cvttss2si 4(%esp), %ecx - flds 4(%edx) - fmul %st(1), %st + movss (%edx), %xmm0 + mulss %xmm6, %xmm0 + cvttss2si %xmm0, %ecx + movss 4(%edx), %xmm0 cmpl $-32768, %ecx + mulss %xmm6, %xmm0 cmovl %edi, %ecx cmpl $32767, %ecx cmovg %ebx, %ecx - fstps 4(%esp) movw %cx, (%eax) - cvttss2si 4(%esp), %ecx + cvttss2si %xmm0, %ecx cmpl $-32768, %ecx cmovl %edi, %ecx cmpl $32767, %ecx @@ -121,15 +119,6 @@ alignedConvertToS16SSE2: addl $4, %eax cmpw %bp, %si jg .L25 - fstp %st(0) - jmp .L15 -.L35: - fstp %st(0) - jmp .L15 -.L36: - fstp %st(0) - .p2align 4,,7 - .p2align 3 .L15: movswl %si,%esi addl $8, %esp @@ -143,11 +132,11 @@ alignedConvertToS16SSE2: .p2align 3 .L13: testw %ax, %ax - jle .L36 + jle .L15 movl %eax, %ebp shrw $2, %bp cmpw $3, %si - movw %ax, 4(%esp) + movw %ax, 2(%esp) leal 0(,%ebp,4), %eax ja .L34 .L27: @@ -162,24 +151,22 @@ alignedConvertToS16SSE2: .p2align 4,,7 .p2align 3 .L20: - flds (%ecx) + movss (%ecx), %xmm0 movl $32767, %ebp - fmul %st(1), %st - fstps 4(%esp) - cvttss2si 4(%esp), %ebx - flds 4(%ecx) - fmul %st(1), %st + mulss %xmm6, %xmm0 + cvttss2si %xmm0, %ebx + movss 4(%ecx), %xmm0 cmpl $-32768, %ebx cmovl %edi, %ebx cmpl $32767, %ebx + mulss %xmm6, %xmm0 cmovg %ebp, %ebx movzbl %bh, %ebp sall $8, %ebx orl %ebp, %ebx movl $32767, %ebp - fstps 4(%esp) movw %bx, (%edx) - cvttss2si 4(%esp), %ebx + cvttss2si %xmm0, %ebx cmpl $-32768, %ebx cmovl %edi, %ebx cmpl $32767, %ebx @@ -193,107 +180,105 @@ alignedConvertToS16SSE2: addl $4, %edx cmpw %ax, %si jg .L20 - fstp %st(0) jmp .L15 .p2align 4,,7 .p2align 3 .L34: testw %ax, %ax je .L27 - fsts (%esp) + movaps %xmm6, %xmm0 xorl %ecx, %ecx movdqa .LC1, %xmm3 - movss (%esp), %xmm0 - xorl %edi, %edi - movdqa .LC2, %xmm2 shufps $0, %xmm0, %xmm0 + movdqa .LC2, %xmm2 + movss %xmm6, 4(%esp) + xorl %edi, %edi movaps %xmm0, %xmm7 .p2align 4,,7 .p2align 3 .L19: movaps (%edx,%ecx,2), %xmm0 - movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm5 movdqa %xmm2, %xmm6 addl $1, %edi - movaps 16(%edx,%ecx,2), %xmm5 + movaps 16(%edx,%ecx,2), %xmm4 mulps %xmm7, %xmm0 - mulps %xmm7, %xmm5 + mulps %xmm7, %xmm4 cvttps2dq %xmm0, %xmm0 movdqa %xmm0, %xmm1 pcmpgtd %xmm3, %xmm1 pand %xmm1, %xmm0 pandn %xmm3, %xmm1 por %xmm0, %xmm1 - cvttps2dq %xmm5, %xmm5 + cvttps2dq %xmm4, %xmm4 movdqa %xmm1, %xmm0 pcmpgtd %xmm2, %xmm0 - pand %xmm0, %xmm4 + pand %xmm0, %xmm5 pandn %xmm1, %xmm0 movdqa %xmm0, %xmm1 - movdqa %xmm5, %xmm0 - por %xmm4, %xmm1 + movdqa %xmm4, %xmm0 + por %xmm5, %xmm1 pcmpgtd %xmm3, %xmm0 - movdqa .LC3, %xmm4 - pand %xmm0, %xmm5 - pand %xmm1, %xmm4 - pandn %xmm3, %xmm0 - psrad $8, %xmm4 - por %xmm5, %xmm0 - pslld $8, %xmm1 - movdqa %xmm0, %xmm5 - pcmpgtd %xmm2, %xmm5 - pand %xmm5, %xmm6 - pandn %xmm0, %xmm5 - movdqa %xmm5, %xmm0 movdqa .LC3, %xmm5 - por %xmm6, %xmm0 - pand %xmm0, %xmm5 - pslld $8, %xmm0 + pand %xmm0, %xmm4 + pand %xmm1, %xmm5 + pandn %xmm3, %xmm0 psrad $8, %xmm5 - movdqa %xmm4, %xmm6 - punpcklwd %xmm5, %xmm4 - punpckhwd %xmm5, %xmm6 - movdqa %xmm4, %xmm5 - punpcklwd %xmm6, %xmm4 - punpckhwd %xmm6, %xmm5 - punpcklwd %xmm5, %xmm4 - movdqa %xmm1, %xmm5 + por %xmm4, %xmm0 + pslld $8, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm2, %xmm4 + pand %xmm4, %xmm6 + pandn %xmm0, %xmm4 + movdqa %xmm4, %xmm0 + movdqa .LC3, %xmm4 + por %xmm6, %xmm0 + pand %xmm0, %xmm4 + pslld $8, %xmm0 + psrad $8, %xmm4 + movdqa %xmm5, %xmm6 + punpcklwd %xmm4, %xmm5 + punpckhwd %xmm4, %xmm6 + movdqa %xmm5, %xmm4 + punpcklwd %xmm6, %xmm5 + punpckhwd %xmm6, %xmm4 + punpcklwd %xmm4, %xmm5 + movdqa %xmm1, %xmm4 punpcklwd %xmm0, %xmm1 - punpckhwd %xmm0, %xmm5 - movdqa %xmm1, %xmm0 - punpcklwd %xmm5, %xmm1 - punpckhwd %xmm5, %xmm0 - punpcklwd %xmm0, %xmm1 - por %xmm1, %xmm4 - movdqa %xmm4, (%ebx,%ecx) + punpckhwd %xmm0, %xmm4 + movdqa %xmm1, %xmm6 + punpcklwd %xmm4, %xmm1 + punpckhwd %xmm4, %xmm6 + punpcklwd %xmm6, %xmm1 + por %xmm1, %xmm5 + movdqa %xmm5, (%ebx,%ecx) addl $16, %ecx cmpw %di, %bp ja .L19 - cmpw 4(%esp), %ax + cmpw 2(%esp), %ax + movss 4(%esp), %xmm6 jne .L18 - fstp %st(0) jmp .L15 .p2align 4,,7 .p2align 3 .L33: testw %bp, %bp - .p2align 4,,4 + .p2align 4,,3 .p2align 3 je .L28 - fsts (%esp) + movaps %xmm6, %xmm0 xorl %eax, %eax movdqa .LC1, %xmm3 - movss (%esp), %xmm0 - xorl %ecx, %ecx - movdqa .LC2, %xmm2 shufps $0, %xmm0, %xmm0 + movdqa .LC2, %xmm2 + xorl %ecx, %ecx movaps %xmm0, %xmm5 .p2align 4,,7 .p2align 3 .L24: movaps (%edx,%eax,2), %xmm0 addl $1, %ecx - movdqa %xmm2, %xmm6 + movdqa %xmm2, %xmm7 movaps 16(%edx,%eax,2), %xmm4 mulps %xmm5, %xmm0 mulps %xmm5, %xmm4 @@ -306,23 +291,23 @@ alignedConvertToS16SSE2: cvttps2dq %xmm4, %xmm4 movdqa %xmm1, %xmm0 pcmpgtd %xmm2, %xmm0 - pand %xmm0, %xmm6 + pand %xmm0, %xmm7 pandn %xmm1, %xmm0 movdqa %xmm0, %xmm1 movdqa %xmm4, %xmm0 - por %xmm6, %xmm1 + por %xmm7, %xmm1 pcmpgtd %xmm3, %xmm0 - movdqa %xmm2, %xmm6 + movdqa %xmm2, %xmm7 pand %xmm0, %xmm4 pandn %xmm3, %xmm0 por %xmm4, %xmm0 movdqa %xmm0, %xmm4 pcmpgtd %xmm2, %xmm4 - pand %xmm4, %xmm6 + pand %xmm4, %xmm7 pandn %xmm0, %xmm4 movdqa %xmm4, %xmm0 movdqa %xmm1, %xmm4 - por %xmm6, %xmm0 + por %xmm7, %xmm0 punpckhwd %xmm0, %xmm4 punpcklwd %xmm0, %xmm1 movdqa %xmm1, %xmm0 @@ -333,9 +318,8 @@ alignedConvertToS16SSE2: addl $16, %eax cmpw %cx, %di ja .L24 - cmpw %bp, 4(%esp) + cmpw %bp, 2(%esp) jne .L23 - fstp %st(0) jmp .L15 .size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2 .section .rodata.cst4,"aM",@progbits,4 @@ -361,5 +345,5 @@ alignedConvertToS16SSE2: .long 65280 .long 65280 .long 65280 - .ident "GCC: (GNU) 4.4.0 20090304 (experimental)" + .ident "GCC: (Ubuntu 4.4.0-0ubuntu2) 4.4.0" .section .note.GNU-stack,"",@progbits