Files
lmms/src/core/Cpu.cpp
Tobias Doerffel 89fa5c99e9 CPU: new framework for optimized CPU-specific routines
The new CPU framework replaces the old BasicOps framework. It is more
flexible and the build process isn't such a mess anymore (pre-compiled
assembler files etc.). It will hopefully see some improvements and
extensions soon.

Signed-off-by: Tobias Doerffel <tobias.doerffel@gmail.com>
2009-08-03 17:14:16 +02:00

471 lines
10 KiB
C++

/*
* Cpu.cpp - CPU specific accelerated operations
*
* Copyright (c) 2008-2009 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
* This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program (see COPYING); if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA.
*
*/
#include "Cpu.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory.h>
namespace CPU
{
// Allocates _bytes of storage and returns a pointer that lies on an
// ALIGN_SIZE boundary (assuming ALIGN_SIZE is a power of two - TODO confirm
// against Cpu.h). Returns NULL if the underlying malloc() fails.
//
// The raw block is over-allocated by ALIGN_SIZE + sizeof(int) bytes. The
// distance from the raw block to the returned pointer is stored in the int
// directly in front of the returned pointer, so memFree() can recover the
// address that has to be handed to free(). Buffers obtained here must be
// released with memFree(), never with free().
void * memAlloc( int _bytes )
{
	char * const raw = (char *) malloc( _bytes + ALIGN_SIZE + sizeof(int) );
	if( raw == NULL )
	{
		return NULL;
	}

	const int mask = ALIGN_SIZE- 1;

	// leave room for the offset header, then advance to the next boundary
	// (always advances by 1..ALIGN_SIZE bytes, even if already aligned)
	char * const pastHeader = raw + sizeof(int);
	char * const aligned = pastHeader +
				( ALIGN_SIZE- ( (size_t) pastHeader & mask ) );

	// remember how far we moved so memFree() can undo it
	int * const header = (int *)( aligned - sizeof(int) );
	*header = (int)( aligned - raw );

	return aligned;
}
// Releases a buffer whose address is preceded by an int holding the byte
// distance back to the start of the underlying malloc() block (the layout
// written by memAlloc()). Passing NULL is a no-op.
void memFree( void * _buf )
{
	if( _buf == NULL )
	{
		return;
	}

	// the int right in front of the buffer tells how far the raw block
	// lies behind it
	const int offset = *( (int *) _buf - 1 );
	char * const raw = (char *) _buf - offset;

	// free() itself ignores NULL, so no extra check is required
	free( raw );
}
// Allocates storage for _n frames of type sampleFrameA via memAlloc().
// Returns whatever memAlloc() returns (NULL on allocation failure); the
// buffer has to be released through freeFrames() / memFree().
sampleFrameA * allocFrames( int _n )
{
	void * const raw = memAlloc( _n * sizeof( sampleFrameA ) );
	return (sampleFrameA *) raw;
}
// Releases a frame buffer by forwarding it to memFree(); a NULL pointer is
// accepted and ignored there.
void freeFrames( sampleFrame * _buf )
{
	void * const raw = _buf;
	memFree( raw );
}
// slow fallback
void memCpyNoOpt( void * RP _dst, const void * RP _src, int _size )
{
const int s = _size / sizeof( int );
const int * RP src = (const int *) _src;
int * RP dst = (int *) _dst;
for( int i = 0; i < s; )
{
dst[i+0] = src[i+0];
dst[i+1] = src[i+1];
dst[i+2] = src[i+2];
dst[i+3] = src[i+3];
dst[i+4] = src[i+4];
dst[i+5] = src[i+5];
dst[i+6] = src[i+6];
dst[i+7] = src[i+7];
dst[i+8] = src[i+8];
dst[i+9] = src[i+9];
dst[i+10] = src[i+10];
dst[i+11] = src[i+11];
dst[i+12] = src[i+12];
dst[i+13] = src[i+13];
dst[i+14] = src[i+14];
dst[i+15] = src[i+15];
i += 16;
}
}
// Portable fallback for CPU::memClear: sets exactly _size bytes starting at
// _dst to zero. A zero or negative _size clears nothing.
//
// The previous implementation cleared only whole groups of four ints and
// silently left up to 4 * sizeof(int) - 1 trailing bytes untouched when
// _size was not a multiple of that group size.
void memClearNoOpt( void * _dst, int _size )
{
	if( _size <= 0 )
	{
		return;
	}
	memset( _dst, 0, (size_t) _size );
}
// Portable fallback for CPU::bufApplyGain: multiplies both channels of the
// first _frames frames of _dst by _gain, in place. A zero or negative
// _frames leaves the buffer untouched.
//
// The former 8x hand-unrolled loop touched frames beyond _frames whenever
// the count was not a multiple of 8; processing one frame per iteration
// gives identical results for multiples of 8 and stays in bounds otherwise.
void bufApplyGainNoOpt( sampleFrameA * RP _dst, float _gain,
								int _frames )
{
	for( int i = 0; i < _frames; ++i )
	{
		_dst[i][0] *= _gain;
		_dst[i][1] *= _gain;
	}
}
// Portable fallback for CPU::bufMix: adds the first _frames frames of _src
// onto _dst, channel by channel. A zero or negative _frames does nothing.
//
// The former 4x hand-unrolled loop accessed frames beyond _frames in both
// buffers whenever the count was not a multiple of 4; one frame per
// iteration yields the same result for multiples of 4 and never overruns.
void bufMixNoOpt( sampleFrameA * RP _dst, const sampleFrameA * RP _src,
								int _frames )
{
	for( int i = 0; i < _frames; ++i )
	{
		_dst[i][0] += _src[i][0];
		_dst[i][1] += _src[i][1];
	}
}
// Portable fallback for CPU::bufMixLRCoeff: adds _src onto _dst for the
// first _frames frames, scaling channel 0 by _left and channel 1 by _right.
// A zero or negative _frames does nothing.
//
// The former 4x hand-unrolled loop accessed frames beyond _frames in both
// buffers whenever the count was not a multiple of 4; one frame per
// iteration yields the same result for multiples of 4 and never overruns.
void bufMixLRCoeffNoOpt( sampleFrameA * RP _dst,
					const sampleFrameA * RP _src,
					float _left, float _right, int _frames )
{
	for( int i = 0; i < _frames; ++i )
	{
		_dst[i][0] += _src[i][0]*_left;
		_dst[i][1] += _src[i][1]*_right;
	}
}
// Portable fallback for CPU::unalignedBufMixLRCoeff: adds _src onto _dst,
// scaling channel 0 by _left and channel 1 by _right. Works on plain
// sampleFrame buffers. An odd frame count is handled by mixing a single
// frame first; the remaining (even) number of frames is mixed in pairs.
void unalignedBufMixLRCoeffNoOpt( sampleFrame * RP _dst,
					const sampleFrame * RP _src,
					const float _left,
					const float _right,
					int _frames )
{
	// peel off one frame so that the pair loop below sees an even count
	if( ( _frames % 2 ) != 0 )
	{
		_dst[0][0] += _src[0][0] * _left;
		_dst[0][1] += _src[0][1] * _right;
		++_dst;
		++_src;
		--_frames;
	}

	// two frames per pass, advancing both cursors together
	while( _frames > 0 )
	{
		_dst[0][0] += _src[0][0]*_left;
		_dst[0][1] += _src[0][1]*_right;
		_dst[1][0] += _src[1][0]*_left;
		_dst[1][1] += _src[1][1]*_right;
		_dst += 2;
		_src += 2;
		_frames -= 2;
	}
}
// Portable fallback for CPU::bufWetDryMix: for each of the first _frames
// frames and both channels, replaces the _dst sample with
// dst * _dry + src * _wet.
void bufWetDryMixNoOpt( sampleFrameA * RP _dst,
					const sampleFrameA * RP _src,
					float _wet, float _dry, int _frames )
{
	for( int frame = 0; frame < _frames; ++frame )
	{
		// channel 0 first, then channel 1
		for( int ch = 0; ch < 2; ++ch )
		{
			_dst[frame][ch] = _dst[frame][ch]*_dry +
						_src[frame][ch]*_wet;
		}
	}
}
// Portable fallback for CPU::bufWetDryMixSplitted: like the interleaved
// wet/dry mix, but the wet signal arrives as two separate per-channel
// arrays. For every frame i in [0, _frames):
//   _dst[i][0] = _dst[i][0] * _dry + _left[i]  * _wet
//   _dst[i][1] = _dst[i][1] * _dry + _right[i] * _wet
//
// The previous version incremented the index both in the loop header and
// at the end of the body, so only every second frame was mixed and all
// odd-numbered frames kept their old content.
void bufWetDryMixSplittedNoOpt( sampleFrameA * RP _dst,
					const float * RP _left,
					const float * RP _right,
					float _wet, float _dry, int _frames )
{
	for( int i = 0; i < _frames; ++i )
	{
		_dst[i][0] = _dst[i][0]*_dry + _left[i]*_wet;
		_dst[i][1] = _dst[i][1]*_dry + _right[i]*_wet;
	}
}
// Scales one sample by _factor and converts it to an integer limited to
// the signed 16 bit range [-32768, 32767].
// The range check is done on the float value *before* the conversion:
// converting a float that does not fit into an int (or a NaN) is undefined
// behaviour. For all values that do fit, the result equals truncating
// first and clamping afterwards. NaN is mapped to 0.
static inline int scaleToS16( const float _sample, const float _factor )
{
	const float scaled = _sample * _factor;
	if( unlikely( scaled > 32767.0f ) )
	{
		return 32767;
	}
	if( unlikely( scaled < -32768.0f ) )
	{
		return -32768;
	}
	if( unlikely( scaled != scaled ) )
	{
		return 0;
	}
	return (int) scaled;
}

// Exchanges the two low-order bytes of _v; bits above bit 15 are dropped.
static inline int swapS16Bytes( const int _v )
{
	return ( _v & 0x00ff ) << 8 | ( _v & 0xff00 ) >> 8;
}

// Portable fallback for CPU::convertToS16: converts _frames frames from
// _src to clamped signed 16 bit integer samples in _dst.
//
// Every sample is multiplied by _master_gain * OUTPUT_SAMPLE_MULTIPLIER,
// truncated towards zero and clamped to [-32768, 32767]. If
// _convert_endian is set, the two bytes of each output sample are swapped.
//
// Returns _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE (presumably the
// number of bytes written to _dst - confirm against the type definitions).
int convertToS16NoOpt( const sampleFrameA * RP _src,
					intSampleFrameA * RP _dst,
					const fpp_t _frames,
					const float _master_gain,
					const bool _convert_endian )
{
	const float f = _master_gain * OUTPUT_SAMPLE_MULTIPLIER;

	// the endianness decision is hoisted out of the per-frame loops
	if( _convert_endian )
	{
		for( fpp_t frame = 0; frame < _frames; ++frame )
		{
			_dst[frame][0] =
				swapS16Bytes( scaleToS16( _src[frame][0], f ) );
			_dst[frame][1] =
				swapS16Bytes( scaleToS16( _src[frame][1], f ) );
		}
	}
	else
	{
		for( fpp_t frame = 0; frame < _frames; ++frame )
		{
			_dst[frame][0] = scaleToS16( _src[frame][0], f );
			_dst[frame][1] = scaleToS16( _src[frame][1], f );
		}
	}

	return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE;
}
// Dispatch pointers for the accelerated operations. Each one initially
// targets the portable *NoOpt fallback defined above; init() may rebind
// them to CPU-specific implementations when X86_OPTIMIZATIONS is defined.
MemCpyFunc memCpy = memCpyNoOpt;
MemClearFunc memClear = memClearNoOpt;
BufApplyGainFunc bufApplyGain = bufApplyGainNoOpt;
BufMixFunc bufMix = bufMixNoOpt;
BufMixLRCoeffFunc bufMixLRCoeff = bufMixLRCoeffNoOpt;
UnalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff = unalignedBufMixLRCoeffNoOpt;
BufWetDryMixFunc bufWetDryMix = bufWetDryMixNoOpt;
BufWetDryMixSplittedFunc bufWetDryMixSplitted = bufWetDryMixSplittedNoOpt;
ConvertToS16Func convertToS16 = convertToS16NoOpt;
#ifdef X86_OPTIMIZATIONS
// Bit flags for instruction set extensions. The values are distinct single
// bits so that several of them can be OR-ed together into one int, as done
// for the "features" variable in init().
enum CPUFeatures
{
	None        = 0,
	MMX         = 1 << 0,
	MMXEXT      = 1 << 1,
	MMX3DNOW    = 1 << 2,
	MMX3DNOWEXT = 1 << 3,
	SSE         = 1 << 4,
	SSE2        = 1 << 5,
	CMOV        = 1 << 6,
	IWMMXT      = 1 << 7
};
extern "C"
{
#ifdef LMMS_HOST_X86
void memCpyMMX( void * RP _dst, const void * RP _src, int _size );
void memClearMMX( void * RP _dst, int _size );
#endif
void memCpySSE( void * RP _dst, const void * RP _src, int _size );
void memClearSSE( void * RP _dst, int _size );
void bufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames );
void bufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, int _frames );
void bufMixLRCoeffSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _left, float _right, int _frames );
void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src, const float _left, const float _right, int _frames );
void bufWetDryMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _wet, float _dry, int _frames );
void bufWetDryMixSplittedSSE( sampleFrameA * RP _dst, const float * RP _left, const float * RP _right, float _wet, float _dry, int _frames );
#ifdef X86_OPTIMIZATIONS
void memCpySSE2( void * RP _dst, const void * RP _src, int _size );
void memClearSSE2( void * RP _dst, int _size );
int convertToS16SSE2( const sampleFrameA * RP _src, intSampleFrameA * RP _dst, const fpp_t _frames, const float _master_gain, const bool _convert_endian );
#endif
} ;
#endif
// Selects the implementations behind the CPU:: dispatch pointers.
//
// Without X86_OPTIMIZATIONS this function does nothing and the pointers
// keep their *NoOpt fallbacks. With X86_OPTIMIZATIONS the CPU features are
// determined on the first call and the pointers are rebound to the MMX,
// SSE and SSE2 routines that the CPU supports; further calls return
// immediately because of the function-local static flag.
//
// NOTE(review): the static flag is read and written without any
// synchronization - presumably init() is only called from one thread
// during startup; confirm against the callers.
void init()
{
#ifdef X86_OPTIMIZATIONS
static bool extensions_checked = false;
if( extensions_checked == false )
{
int features = 0;
// On x86_64 / Win64 and ia64 builds the feature set is assumed at compile
// time, no runtime probing takes place.
#if defined(__x86_64__) || defined(Q_OS_WIN64)
features = MMX | SSE | SSE2 | CMOV;
#elif defined(__ia64__)
features = MMX | SSE | SSE2;
#else
// All other builds probe the CPU at runtime with 32 bit x86 assembly.
unsigned int result = 0;
unsigned int extended_result = 0;
// Probe 1: EDX of CPUID leaf 1.
// ebx is saved on the stack and restored before the output is written.
// The code first tries to toggle bit 21 (0x00200000) of EFLAGS; if the bit
// reads back unchanged, the jz skips the cpuid instruction and EDX keeps
// the zero set by the preceding xor, so "result" becomes 0.
asm( "push %%ebx\n"
"pushf\n"
"pop %%eax\n"
"mov %%eax, %%ebx\n"
"xor $0x00200000, %%eax\n"
"push %%eax\n"
"popf\n"
"pushf\n"
"pop %%eax\n"
"xor %%edx, %%edx\n"
"xor %%ebx, %%eax\n"
"jz 1f\n"
"mov $0x00000001, %%eax\n"
"cpuid\n"
"1:\n"
"pop %%ebx\n"
"mov %%edx, %0\n"
: "=r" (result)
:
: "%eax", "%ecx", "%edx"
);
// Probe 2: EDX of CPUID leaf 0x80000001.
// Same EFLAGS toggle test as above. Leaf 0x80000000 is queried first and
// leaf 0x80000001 is only read if the returned EAX is greater than
// 0x80000000.
// NOTE(review): when the jbe is taken, EDX still holds whatever leaf
// 0x80000000 returned (it is only zeroed before that cpuid), so
// "extended_result" is not guaranteed to be 0 in that case - confirm
// whether this matters for the flags derived from it below.
asm( "push %%ebx\n"
"pushf\n"
"pop %%eax\n"
"mov %%eax, %%ebx\n"
"xor $0x00200000, %%eax\n"
"push %%eax\n"
"popf\n"
"pushf\n"
"pop %%eax\n"
"xor %%edx, %%edx\n"
"xor %%ebx, %%eax\n"
"jz 2f\n"
"mov $0x80000000, %%eax\n"
"cpuid\n"
"cmp $0x80000000, %%eax\n"
"jbe 2f\n"
"mov $0x80000001, %%eax\n"
"cpuid\n"
"2:\n"
"pop %%ebx\n"
"mov %%edx, %0\n"
: "=r" (extended_result)
:
: "%eax", "%ecx", "%edx"
);
// Translate the raw EDX bits into CPUFeatures flags. Only MMX, SSE and
// SSE2 are evaluated further down; the other flags are recorded but not
// used for dispatch in this function.
if( result & (1u << 15) )
features |= CMOV;
if( result & (1u << 23) )
features |= MMX;
if( extended_result & (1u << 22) )
features |= MMXEXT;
if( extended_result & (1u << 31) )
features |= MMX3DNOW;
if( extended_result & (1u << 30) )
features |= MMX3DNOWEXT;
if( result & (1u << 25) )
features |= SSE;
if( result & (1u << 26) )
features |= SSE2;
#endif
// Rebind the dispatch pointers. The blocks run in the order MMX, SSE, SSE2
// and later assignments overwrite earlier ones, so for memCpy/memClear the
// last matching block wins.
#ifdef LMMS_HOST_X86
if( features & MMX )
{
memCpy = memCpyMMX;
memClear = memClearMMX;
}
#endif
if( features & SSE )
{
fprintf( stderr, "Using SSE optimized routines\n" );
memCpy = memCpySSE;
memClear = memClearSSE;
bufApplyGain = bufApplyGainSSE;
bufMix = bufMixSSE;
bufMixLRCoeff = bufMixLRCoeffSSE;
unalignedBufMixLRCoeff = unalignedBufMixLRCoeffSSE;
bufWetDryMix = bufWetDryMixSSE;
bufWetDryMixSplitted =
bufWetDryMixSplittedSSE;
}
// SSE2 replaces memCpy/memClear once more and is the only block that
// installs an optimized convertToS16.
if( features & SSE2 )
{
fprintf( stderr, "Using SSE2 optimized routines\n" );
memCpy = memCpySSE2;
memClear = memClearSSE2;
convertToS16 = convertToS16SSE2;
}
extensions_checked = true;
}
#endif
}
}