diff --git a/CMakeLists.txt b/CMakeLists.txt index ed693ec5a..bd6979f9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ ENDIF(LMMS_BUILD_WIN32) CHECK_INCLUDE_FILES(stdint.h LMMS_HAVE_STDINT_H) +CHECK_INCLUDE_FILES(stdbool.h LMMS_HAVE_STDBOOL_H) CHECK_INCLUDE_FILES(stdlib.h LMMS_HAVE_STDLIB_H) CHECK_INCLUDE_FILES(pthread.h LMMS_HAVE_PTHREAD_H) CHECK_INCLUDE_FILES(semaphore.h LMMS_HAVE_SEMAPHORE_H) @@ -387,6 +388,43 @@ SET(LMMS_ER_H ${CMAKE_CURRENT_BINARY_DIR}/embedded_resources.h) ADD_FILE_DEPENDENCIES(${CMAKE_BINARY_DIR}/lmmsconfig.h ${lmms_MOC_out}) ADD_CUSTOM_COMMAND(OUTPUT ${LMMS_ER_H} COMMAND ${BIN2RES} ARGS ${lmms_EMBEDDED_RESOURCES} > ${LMMS_ER_H} DEPENDS ${BIN2RES}) +SET(BASIC_OPS_X86_C "${CMAKE_SOURCE_DIR}/src/core/basic_ops_x86.c") + +IF(LMMS_HOST_X86 OR LMMS_HOST_X86_64) + +ADD_CUSTOM_TARGET(regen-basic-ops) + +IF(LMMS_HOST_X86) +SET(opt_targets mmx sse sse2) +SET(host_arch x86) +ELSE(LMMS_HOST_X86) +SET(opt_targets sse sse2) +SET(host_arch x86_64) +ENDIF(LMMS_HOST_X86) + +FOREACH(opt_target ${opt_targets}) + + STRING(TOUPPER ${opt_target} OPT_TARGET) + + SET(BASIC_OPS_X86_TARGET_S "${CMAKE_SOURCE_DIR}/src/core/basic_ops_${host_arch}_${opt_target}.s") + SET(BASIC_OPS_X86_TARGET_O "${CMAKE_BINARY_DIR}/basic_ops_${host_arch}_${opt_target}.o") + IF(NOT "${OPT_TARGET}" STREQUAL "MMX") + SET(FPMATH_FLAGS "-mfpmath=sse") + ENDIF(NOT "${OPT_TARGET}" STREQUAL "MMX") + IF(EXISTS "$ENV{SVN_C_COMPILER}") + SET(C_COMPILER $ENV{SVN_C_COMPILER}) + ELSE(EXISTS "$ENV{SVN_C_COMPILER}") + SET(C_COMPILER ${CMAKE_C_COMPILER}) + ENDIF(EXISTS "$ENV{SVN_C_COMPILER}") + ADD_CUSTOM_TARGET(regen-basic-ops-${opt_target} COMMAND ${C_COMPILER} -O2 -ftree-vectorize -ftree-vectorizer-verbose=2 -fomit-frame-pointer -c -S -I${CMAKE_SOURCE_DIR}/include -I${CMAKE_BINARY_DIR} -g0 -DBUILD_${OPT_TARGET} -m${opt_target} ${FPMATH_FLAGS} -o ${BASIC_OPS_X86_TARGET_S} ${BASIC_OPS_X86_C} DEPENDS ${BASIC_OPS_X86_C}) + ADD_CUSTOM_COMMAND(OUTPUT ${BASIC_OPS_X86_TARGET_O} COMMAND 
${CMAKE_C_COMPILER} ARGS ${BASIC_OPS_X86_TARGET_S} -c -o ${BASIC_OPS_X86_TARGET_O} DEPENDS ${BASIC_OPS_X86_TARGET_S}) + ADD_DEPENDENCIES(regen-basic-ops regen-basic-ops-${opt_target}) + SET(opt_target_objects ${opt_target_objects} ${BASIC_OPS_X86_TARGET_O}) + +ENDFOREACH(opt_target ${opt_targets}) +SET(lmms_SOURCES ${lmms_SOURCES} ${opt_target_objects}) +# to be used by maintainer with special ultra-optimizing super duper GCC +ENDIF(LMMS_HOST_X86 OR LMMS_HOST_X86_64) IF(WIN32) SET(WINRC "${CMAKE_BINARY_DIR}/lmmsrc.obj") diff --git a/ChangeLog b/ChangeLog index ba66a7ca1..f8e65ff41 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +2008-11-10 Tobias Doerffel + + * include/audio_portaudio.h: + * include/lmms_basics.h: + * include/fifo_buffer.h: + * include/mixer.h: + * include/audio_port.h: + * include/audio_dummy.h: + * include/basic_ops.h: + * include/audio_sdl.h: + * include/audio_jack.h: + * include/audio_device.h: + * src/core/audio/audio_device.cpp: + * src/core/audio/audio_alsa.cpp: + * src/core/audio/audio_file_wave.cpp: + * src/core/audio/audio_sdl.cpp: + * src/core/audio/audio_oss.cpp: + * src/core/audio/audio_port.cpp: + * src/core/audio/audio_portaudio.cpp: + * src/core/audio/audio_jack.cpp: + * src/core/audio/audio_pulseaudio.cpp: + * src/core/basic_ops.cpp: + * src/core/basic_ops_x86.c: + * src/core/basic_ops_x86_mmx.s: + * src/core/basic_ops_x86_sse.s: + * src/core/basic_ops_x86_sse2.s: + * src/core/basic_ops_x86_64_sse.s: + * src/core/basic_ops_x86_64_sse2.s: + * src/core/mixer.cpp: + * src/core/main.cpp: + * src/core/project_renderer.cpp: + * src/core/fx_mixer.cpp: + * plugins/ladspa_effect/ladspa_effect.cpp: + * lmmsconfig.h.in: + * CMakeLists.txt: + experimental support for MMX/SSE/SSE2 instructions + 2008-11-04 Tobias Doerffel * plugins/sf2_player/sf2_player.cpp: diff --git a/include/audio_device.h b/include/audio_device.h index b938a555f..bf01143f7 100644 --- a/include/audio_device.h +++ b/include/audio_device.h @@ -121,31 +121,22 @@ 
public: protected: // subclasses can re-implement this for being used in conjunction with // processNextBuffer() - virtual void writeBuffer( const surroundSampleFrame * /* _buf*/, + virtual void writeBuffer( const sampleFrameA * /* _buf*/, const fpp_t /*_frames*/, const float /*_master_gain*/ ) { } // called by according driver for fetching new sound-data - fpp_t getNextBuffer( surroundSampleFrame * _ab ); - - // convert a given audio-buffer to a buffer in signed 16-bit samples - // returns num of bytes in outbuf - Uint32 convertToS16( const surroundSampleFrame * _ab, - const fpp_t _frames, - const float _master_gain, - int_sample_t * _output_buffer, - const bool _convert_endian = FALSE ); + fpp_t getNextBuffer( sampleFrameA * _ab ); // clear given signed-int-16-buffer - void clearS16Buffer( int_sample_t * _outbuf, - const fpp_t _frames ); + void clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames ); // resample given buffer from samplerate _src_sr to samplerate _dst_sr - void resample( const surroundSampleFrame * _src, + void resample( const sampleFrameA * _src, const fpp_t _frames, - surroundSampleFrame * _dst, + sampleFrameA * _dst, const sample_rate_t _src_sr, const sample_rate_t _dst_sr ); @@ -161,9 +152,11 @@ protected: bool hqAudio( void ) const; + protected: bool m_supportsCapture; + private: sample_rate_t m_sampleRate; ch_cnt_t m_channels; @@ -175,7 +168,7 @@ private: SRC_DATA m_srcData; SRC_STATE * m_srcState; - surroundSampleFrame * m_buffer; + sampleFrameA * m_buffer; } ; diff --git a/include/audio_dummy.h b/include/audio_dummy.h index 5ef9a38ae..f64fbb8f5 100644 --- a/include/audio_dummy.h +++ b/include/audio_dummy.h @@ -27,6 +27,7 @@ #define _AUDIO_DUMMY_H #include "audio_device.h" +#include "basic_ops.h" #include "micro_timer.h" @@ -94,16 +95,16 @@ private: virtual void run( void ) { microTimer timer; - while( TRUE ) + while( true ) { timer.reset(); - const surroundSampleFrame * b = + surroundSampleFrame * b = getMixer()->nextBuffer(); if( 
!b ) { break; } - delete[] b; + alignedFreeFrames( b ); const Sint32 microseconds = static_cast( getMixer()->framesPerPeriod() * diff --git a/include/audio_jack.h b/include/audio_jack.h index 3fcf54ed1..cdd367a67 100644 --- a/include/audio_jack.h +++ b/include/audio_jack.h @@ -94,7 +94,7 @@ private: QSemaphore m_stop_semaphore; QVector m_outputPorts; - surroundSampleFrame * m_outBuf; + sampleFrameA * m_outBuf; f_cnt_t m_framesDoneInCurBuf; diff --git a/include/audio_port.h b/include/audio_port.h index 6ee29326d..b428c568b 100644 --- a/include/audio_port.h +++ b/include/audio_port.h @@ -40,14 +40,14 @@ public: audioPort( const QString & _name, bool _has_effect_chain = true ); ~audioPort(); - inline sampleFrame * firstBuffer( void ) + inline sampleFrameA * firstBuffer( void ) { - return( m_firstBuffer ); + return m_firstBuffer; } - inline sampleFrame * secondBuffer( void ) + inline sampleFrameA * secondBuffer( void ) { - return( m_secondBuffer ); + return m_secondBuffer; } inline void lockFirstBuffer( void ) @@ -76,7 +76,7 @@ public: // indicate whether JACK & Co should provide output-buffer at ext. 
port inline bool extOutputEnabled( void ) const { - return( m_extOutputEnabled ); + return m_extOutputEnabled; } void setExtOutputEnabled( bool _enabled ); @@ -86,12 +86,12 @@ public: // (-1 = none 0 = master) inline fx_ch_t nextFxChannel( void ) const { - return( m_nextFxChannel ); + return m_nextFxChannel; } inline effectChain * getEffects( void ) { - return( m_effects ); + return m_effects; } void setNextFxChannel( const fx_ch_t _chnl ) @@ -102,7 +102,7 @@ public: const QString & name( void ) const { - return( m_name ); + return m_name; } void setName( const QString & _new_name ); @@ -122,8 +122,8 @@ public: private: volatile bufferUsages m_bufferUsage; - sampleFrame * m_firstBuffer; - sampleFrame * m_secondBuffer; + sampleFrameA * m_firstBuffer; + sampleFrameA * m_secondBuffer; QMutex m_firstBufferLock; QMutex m_secondBufferLock; diff --git a/include/audio_portaudio.h b/include/audio_portaudio.h index 0a3463d67..383848a28 100644 --- a/include/audio_portaudio.h +++ b/include/audio_portaudio.h @@ -140,7 +140,7 @@ private: bool m_wasPAInitError; - surroundSampleFrame * m_outBuf; + sampleFrameA * m_outBuf; int m_outBufPos; int m_outBufSize; diff --git a/include/audio_sdl.h b/include/audio_sdl.h index ff18ac8a4..9ccedb838 100644 --- a/include/audio_sdl.h +++ b/include/audio_sdl.h @@ -76,8 +76,8 @@ private: SDL_AudioSpec m_audioHandle; - surroundSampleFrame * m_outBuf; - Uint8 * m_convertedBuf; + sampleFrameA * m_outBuf; + intSampleFrameA * m_convertedBuf; int m_convertedBufPos; int m_convertedBufSize; diff --git a/include/basic_ops.h b/include/basic_ops.h new file mode 100644 index 000000000..5163b333e --- /dev/null +++ b/include/basic_ops.h @@ -0,0 +1,94 @@ +/* + * basic_ops.h - basic memory operations + * + * Copyright (c) 2008 Tobias Doerffel + * + * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as 
published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program (see COPYING); if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA. + * + */ + + +#ifndef _BASIC_OPS_H +#define _BASIC_OPS_H + +#include "lmms_basics.h" + +#ifdef LMMS_HAVE_STDBOOL_H +#include +#endif + +void initBasicOps( void ); + +void * alignedMalloc( int _bytes ); +void alignedFree( void * _buf ); + +sampleFrameA * alignedAllocFrames( int _frames ); +void alignedFreeFrames( sampleFrameA * _buf ); + + +// all aligned* functions assume data to be 16 byte aligned and size to be +// multiples of 64 +typedef void (*alignedMemCpyFunc)( void * RP _dst, const void * RP _src, + int _size ); +typedef void (*alignedMemClearFunc)( void * RP _dst, int _size ); +typedef void (*alignedBufApplyGainFunc)( sampleFrameA * RP _dst, + float _gain, int _frames ); +typedef void (*alignedBufMixFunc)( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + int _frames ); +typedef void (*alignedBufMixLRCoeffFunc)( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _left, float _right, + int _frames ); +typedef void (*unalignedBufMixLRCoeffFunc)( sampleFrame * RP _dst, + const sampleFrame * RP _src, + float _left, float _right, + int _frames ); +typedef void (*alignedBufWetDryMixFunc)( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _wet, float _dry, int _frames ); +typedef void (*alignedBufWetDryMixSplittedFunc)( sampleFrameA * RP _dst, + const float * RP _left, + const float * RP _right, + float _wet, float _dry, int 
_frames ); +typedef int (*alignedConvertToS16Func)( const sampleFrameA * RP _src, + intSampleFrameA * RP _dst, + const fpp_t _frames, + const float _master_gain, + const bool _convert_endian ); + +extern alignedMemCpyFunc alignedMemCpy; +extern alignedMemClearFunc alignedMemClear; +extern alignedBufApplyGainFunc alignedBufApplyGain; +extern alignedBufMixFunc alignedBufMix; +extern alignedBufMixLRCoeffFunc alignedBufMixLRCoeff; +extern unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff; +extern alignedBufWetDryMixFunc alignedBufWetDryMix; +extern alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted; +extern alignedConvertToS16Func alignedConvertToS16; + + +#ifdef LMMS_HOST_X86 +#define X86_OPTIMIZATIONS +#endif +#ifdef LMMS_HOST_X86_64 +#define X86_OPTIMIZATIONS +#endif + +#endif + diff --git a/include/fifo_buffer.h b/include/fifo_buffer.h index 4dc636b12..f607d9370 100644 --- a/include/fifo_buffer.h +++ b/include/fifo_buffer.h @@ -2,6 +2,7 @@ * fifo_buffer.h - FIFO fixed-size buffer * * Copyright (c) 2007 Javier Serrano Polo + * Copyright (c) 2008 Tobias Doerffel * * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net * @@ -33,50 +34,50 @@ class fifoBuffer { public: fifoBuffer( int _size ) : - m_reader_sem( _size ), - m_writer_sem( _size ), - m_reader_index( 0 ), - m_writer_index( 0 ), + m_readerSem( _size ), + m_writerSem( _size ), + m_readerIndex( 0 ), + m_writerIndex( 0 ), m_size( _size ) { m_buffer = new T[_size]; - m_reader_sem.acquire( _size ); + m_readerSem.acquire( _size ); } ~fifoBuffer() { delete[] m_buffer; - m_reader_sem.release( m_size ); + m_readerSem.release( m_size ); } void write( T _element ) { - m_writer_sem.acquire(); - m_buffer[m_writer_index++] = _element; - m_writer_index %= m_size; - m_reader_sem.release(); + m_writerSem.acquire(); + m_buffer[m_writerIndex++] = _element; + m_writerIndex %= m_size; + m_readerSem.release(); } T read( void ) { - m_reader_sem.acquire(); - T element = m_buffer[m_reader_index++]; - 
m_reader_index %= m_size; - m_writer_sem.release(); - return( element ); + m_readerSem.acquire(); + T element = m_buffer[m_readerIndex++]; + m_readerIndex %= m_size; + m_writerSem.release(); + return element; } bool available( void ) { - return( m_reader_sem.available() ); + return m_readerSem.available(); } private: - QSemaphore m_reader_sem; - QSemaphore m_writer_sem; - int m_reader_index; - int m_writer_index; + QSemaphore m_readerSem; + QSemaphore m_writerSem; + int m_readerIndex; + int m_writerIndex; int m_size; T * m_buffer; diff --git a/include/lmms_basics.h b/include/lmms_basics.h index 0deefee66..42bf7ba4b 100644 --- a/include/lmms_basics.h +++ b/include/lmms_basics.h @@ -1,5 +1,5 @@ /* - * types.h - typedefs for common types that are used in the whole app + * lmms_basics.h - common basics for the whole App * * Copyright (c) 2004-2008 Tobias Doerffel * @@ -23,10 +23,8 @@ */ -#ifndef _TYPES_H -#define _TYPES_H - -#include +#ifndef _LMMS_BASICS_H +#define _LMMS_BASICS_H #include "lmmsconfig.h" @@ -68,6 +66,9 @@ typedef Uint32 jo_id_t; // (unique) ID of a journalling object #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) +#ifdef __cplusplus + +#include template struct typeInfo @@ -115,25 +116,50 @@ inline bool typeInfo::isEqual( float _x, float _y ) return absVal( _x - _y ) < minEps(); } +#endif -const ch_cnt_t DEFAULT_CHANNELS = 2; - -const ch_cnt_t SURROUND_CHANNELS = +#define DEFAULT_CHANNELS 2 #define LMMS_DISABLE_SURROUND -#ifndef LMMS_DISABLE_SURROUND - 4; +#ifdef LMMS_DISABLE_SURROUND +#define SURROUND_CHANNELS 2 #else - 2; +#define SURROUND_CHANNELS 4 #endif typedef sample_t sampleFrame[DEFAULT_CHANNELS]; typedef sample_t surroundSampleFrame[SURROUND_CHANNELS]; + #define ALIGN_SIZE 16 + #if __GNUC__ + typedef sample_t sampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE))); +typedef int_sample_t intSampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE))); +#define RP __restrict__ + 
+#else + +#define RP + +#endif + + +#ifdef __cplusplus +const int BYTES_PER_SAMPLE = sizeof( sample_t ); +const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t ); +const int BYTES_PER_FRAME = sizeof( sampleFrame ); +const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame ); + +const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f; +#else +#define BYTES_PER_SAMPLE sizeof( sample_t ) +#define BYTES_PER_INT_SAMPLE sizeof( int_sample_t ) +#define BYTES_PER_FRAME sizeof( sampleFrame ) +#define BYTES_PER_SURROUND_FRAME sizeof( surroundSampleFrame ) +#define OUTPUT_SAMPLE_MULTIPLIER 32767.0f #endif diff --git a/include/mixer.h b/include/mixer.h index 12354a096..a35b85277 100644 --- a/include/mixer.h +++ b/include/mixer.h @@ -57,13 +57,6 @@ class audioPort; const fpp_t DEFAULT_BUFFER_SIZE = 256; -const int BYTES_PER_SAMPLE = sizeof( sample_t ); -const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t ); -const int BYTES_PER_FRAME = sizeof( sampleFrame ); -const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame ); - -const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f; - const float BaseFreq = 440.0f; const Keys BaseKey = Key_A; @@ -361,7 +354,7 @@ public: return m_inputBufferFrames[ m_inputBufferRead ]; } - inline const surroundSampleFrame * nextBuffer( void ) + inline surroundSampleFrame * nextBuffer( void ) { return hasFifoWriter() ? 
m_fifo->read() : renderNextBuffer(); } @@ -407,7 +400,7 @@ private: midiClient * tryMidiClients( void ); - const surroundSampleFrame * renderNextBuffer( void ); + surroundSampleFrame * renderNextBuffer( void ); diff --git a/lmmsconfig.h.in b/lmmsconfig.h.in index 4965d2433..ece2c2585 100644 --- a/lmmsconfig.h.in +++ b/lmmsconfig.h.in @@ -19,6 +19,7 @@ #cmakedefine LMMS_HAVE_VST #cmakedefine LMMS_HAVE_STDINT_H +#cmakedefine LMMS_HAVE_STDBOOL_H #cmakedefine LMMS_HAVE_STDLIB_H #cmakedefine LMMS_HAVE_PTHREAD_H #cmakedefine LMMS_HAVE_UNISTD_H diff --git a/plugins/ladspa_effect/ladspa_effect.cpp b/plugins/ladspa_effect/ladspa_effect.cpp index 79b3b6644..087ea2d49 100644 --- a/plugins/ladspa_effect/ladspa_effect.cpp +++ b/plugins/ladspa_effect/ladspa_effect.cpp @@ -34,6 +34,7 @@ #include "ladspa_subplugin_features.h" #include "mixer.h" #include "effect_chain.h" +#include "basic_ops.h" #include "automation_pattern.h" @@ -144,7 +145,7 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf, if( m_maxSampleRate < engine::getMixer()->processingSampleRate() ) { o_buf = _buf; - _buf = new sampleFrame[_frames]; + _buf = alignedAllocFrames( _frames ); sampleDown( o_buf, _buf, m_maxSampleRate ); frames = _frames * m_maxSampleRate / engine::getMixer()->processingSampleRate(); @@ -217,8 +218,8 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf, // Copy the LADSPA output buffers to the LMMS buffer. 
double out_sum = 0.0; channel = 0; - const float d = getDryLevel(); - const float w = getWetLevel(); + float * buffers[2]; + for( ch_cnt_t proc = 0; proc < getProcessorCount(); ++proc ) { for( int port = 0; port < m_portCount; ++port ) @@ -231,17 +232,9 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf, case CONTROL_RATE_INPUT: break; case CHANNEL_OUT: - for( fpp_t frame = 0; - frame < frames; ++frame ) + if( channel < DEFAULT_CHANNELS ) { - _buf[frame][channel] = - d * - _buf[frame][channel] + - w * - pp->buffer[frame]; - out_sum += - _buf[frame][channel] * - _buf[frame][channel]; + buffers[channel] = pp->buffer; } ++channel; break; @@ -254,10 +247,27 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf, } } + if( channel == 1 ) + { + buffers[1] = buffers[0]; + } + if( channel >= 1 && channel <= DEFAULT_CHANNELS ) + { + alignedBufWetDryMixSplitted( _buf, buffers[0], buffers[1], + getWetLevel(), getDryLevel(), frames ); + } + + for( int i = 0; i < frames; ++i ) + { + out_sum += _buf[i][0]*_buf[i][0]; + out_sum += _buf[i][1]*_buf[i][1]; + + } + if( o_buf != NULL ) { sampleBack( _buf, o_buf, m_maxSampleRate ); - delete[] _buf; + alignedFreeFrames( _buf ); } checkGate( out_sum / frames ); diff --git a/src/core/audio/audio_alsa.cpp b/src/core/audio/audio_alsa.cpp index d2786b3dd..15e588fd4 100644 --- a/src/core/audio/audio_alsa.cpp +++ b/src/core/audio/audio_alsa.cpp @@ -39,6 +39,7 @@ #include "lcd_spinbox.h" #include "gui_templates.h" #include "templates.h" +#include "basic_ops.h" @@ -229,13 +230,15 @@ void audioALSA::applyQualitySettings( void ) void audioALSA::run( void ) { - surroundSampleFrame * temp = - new surroundSampleFrame[getMixer()->framesPerPeriod()]; - int_sample_t * outbuf = - new int_sample_t[getMixer()->framesPerPeriod() * - channels()]; + sampleFrameA * temp = alignedAllocFrames( + getMixer()->framesPerPeriod() ); + intSampleFrameA * outbuf = (intSampleFrameA *) + alignedMalloc( sizeof( intSampleFrameA ) * channels() / + 
DEFAULT_CHANNELS * getMixer()->framesPerPeriod() ); + int_sample_t * pcmbuf = new int_sample_t[m_periodSize * channels()]; + int outbuf_size = getMixer()->framesPerPeriod() * channels(); int outbuf_pos = 0; int pcmbuf_size = m_periodSize * channels(); @@ -254,16 +257,15 @@ void audioALSA::run( void ) if( !frames ) { quit = TRUE; - memset( ptr, 0, len + alignedMemClear( ptr, len * sizeof( int_sample_t ) ); break; } outbuf_size = frames * channels(); - convertToS16( temp, frames, + alignedConvertToS16( temp, outbuf, frames, getMixer()->masterGain(), - outbuf, - m_convertEndian ); + m_convertEndian ); } int min_len = qMin( len, outbuf_size - outbuf_pos ); memcpy( ptr, outbuf + outbuf_pos, @@ -300,8 +302,8 @@ void audioALSA::run( void ) } } - delete[] temp; - delete[] outbuf; + alignedFreeFrames( temp ); + alignedFree( outbuf ); delete[] pcmbuf; } diff --git a/src/core/audio/audio_device.cpp b/src/core/audio/audio_device.cpp index 84ed18c71..0e1e0af91 100644 --- a/src/core/audio/audio_device.cpp +++ b/src/core/audio/audio_device.cpp @@ -31,6 +31,7 @@ #include "audio_device.h" #include "config_mgr.h" #include "debug.h" +#include "basic_ops.h" @@ -39,7 +40,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) : m_sampleRate( _mixer->processingSampleRate() ), m_channels( _channels ), m_mixer( _mixer ), - m_buffer( new surroundSampleFrame[getMixer()->framesPerPeriod()] ) + m_buffer( alignedAllocFrames( getMixer()->framesPerPeriod() ) ) { int error; if( ( m_srcState = src_new( @@ -56,7 +57,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) : audioDevice::~audioDevice() { src_delete( m_srcState ); - delete[] m_buffer; + alignedFreeFrames( m_buffer ); m_devMutex.tryLock(); unlock(); @@ -81,10 +82,10 @@ void audioDevice::processNextBuffer( void ) -fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab ) +fpp_t audioDevice::getNextBuffer( sampleFrameA * _ab ) { fpp_t frames = getMixer()->framesPerPeriod(); - const 
surroundSampleFrame * b = getMixer()->nextBuffer(); + sampleFrameA * b = getMixer()->nextBuffer(); if( !b ) { return( 0 ); @@ -103,7 +104,7 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab ) } else { - memcpy( _ab, b, frames * sizeof( surroundSampleFrame ) ); + alignedMemCpy( _ab, b, frames * sizeof( surroundSampleFrame ) ); } // release lock @@ -111,10 +112,10 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab ) if( getMixer()->hasFifoWriter() ) { - delete[] b; + alignedFreeFrames( b ); } - return( frames ); + return frames; } @@ -171,11 +172,10 @@ void audioDevice::renamePort( audioPort * ) -void audioDevice::resample( const surroundSampleFrame * _src, - const fpp_t _frames, - surroundSampleFrame * _dst, - const sample_rate_t _src_sr, - const sample_rate_t _dst_sr ) +void audioDevice::resample( const sampleFrame * _src, const fpp_t _frames, + sampleFrame * _dst, + const sample_rate_t _src_sr, + const sample_rate_t _dst_sr ) { if( m_srcState == NULL ) { @@ -197,57 +197,11 @@ void audioDevice::resample( const surroundSampleFrame * _src, -Uint32 audioDevice::convertToS16( const surroundSampleFrame * _ab, - const fpp_t _frames, - const float _master_gain, - int_sample_t * _output_buffer, - const bool _convert_endian ) + +void audioDevice::clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames ) { - if( _convert_endian ) - { - Uint16 temp; - for( fpp_t frame = 0; frame < _frames; ++frame ) - { - for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl ) - { - temp = static_cast( - mixer::clip( _ab[frame][chnl] * - _master_gain ) * - OUTPUT_SAMPLE_MULTIPLIER ); - - ( _output_buffer + frame * channels() )[chnl] = - ( temp & 0x00ff ) << 8 | - ( temp & 0xff00 ) >> 8; - } - } - } - else - { - for( fpp_t frame = 0; frame < _frames; ++frame ) - { - for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl ) - { - ( _output_buffer + frame * channels() )[chnl] = - static_cast( - mixer::clip( _ab[frame][chnl] * - _master_gain ) * - OUTPUT_SAMPLE_MULTIPLIER ); 
- } - } - } - - return( _frames * channels() * BYTES_PER_INT_SAMPLE ); -} - - - - -void audioDevice::clearS16Buffer( int_sample_t * _outbuf, const fpp_t _frames ) -{ -#ifdef LMMS_DEBUG - assert( _outbuf != NULL ); -#endif - memset( _outbuf, 0, _frames * channels() * BYTES_PER_INT_SAMPLE ); + alignedMemClear( _outbuf, _frames * sizeof( *_outbuf ) ); +// memset( _outbuf, 0, _frames * channels() * BYTES_PER_INT_SAMPLE ); } diff --git a/src/core/audio/audio_file_wave.cpp b/src/core/audio/audio_file_wave.cpp index 09b5118d2..364bbe563 100644 --- a/src/core/audio/audio_file_wave.cpp +++ b/src/core/audio/audio_file_wave.cpp @@ -29,6 +29,7 @@ #include "audio_file_wave.h" #include "endian_handling.h" +#include "basic_ops.h" #include @@ -101,12 +102,14 @@ void audioFileWave::writeBuffer( const surroundSampleFrame * _ab, } else { - int_sample_t * buf = new int_sample_t[_frames * channels()]; - convertToS16( _ab, _frames, _master_gain, buf, + intSampleFrameA * buf = (intSampleFrameA *) + alignedMalloc( + sizeof( intSampleFrameA ) * _frames ); + alignedConvertToS16( _ab, buf, _frames, _master_gain, !isLittleEndian() ); - sf_writef_short( m_sf, buf, _frames ); - delete[] buf; + sf_writef_short( m_sf, (int_sample_t *) buf, _frames ); + alignedFree( buf ); } } diff --git a/src/core/audio/audio_jack.cpp b/src/core/audio/audio_jack.cpp index 8f7b64307..c82ecedc9 100644 --- a/src/core/audio/audio_jack.cpp +++ b/src/core/audio/audio_jack.cpp @@ -45,6 +45,7 @@ #include "config_mgr.h" #include "lcd_spinbox.h" #include "audio_port.h" +#include "basic_ops.h" @@ -57,7 +58,7 @@ audioJACK::audioJACK( bool & _success_ful, mixer * _mixer ) : m_client( NULL ), m_active( FALSE ), m_stop_semaphore( 1 ), - m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ), + m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ), m_framesDoneInCurBuf( 0 ), m_framesToDoInCurBuf( 0 ) { @@ -159,7 +160,7 @@ audioJACK::~audioJACK() jack_client_close( m_client ); } - delete[] m_outBuf; + 
alignedFreeFrames( m_outBuf ); } @@ -367,14 +368,14 @@ int audioJACK::processCallback( jack_nframes_t _nframes, void * _udata ) _this->m_framesDoneInCurBuf ); if( ts == JackTransportRolling ) { + const float gain = _this->getMixer()->masterGain(); for( Uint8 chnl = 0; chnl < _this->channels(); ++chnl ) { for( jack_nframes_t frame = 0; frame < todo; ++frame ) { outbufs[chnl][done+frame] = - _this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] * - _this->getMixer()->masterGain(); + _this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] * gain; } } } diff --git a/src/core/audio/audio_oss.cpp b/src/core/audio/audio_oss.cpp index a3be7fb43..ab7fe02d0 100644 --- a/src/core/audio/audio_oss.cpp +++ b/src/core/audio/audio_oss.cpp @@ -39,6 +39,7 @@ #include "engine.h" #include "gui_templates.h" #include "templates.h" +#include "basic_ops.h" #ifdef LMMS_HAVE_UNISTD_H #include @@ -298,13 +299,13 @@ void audioOSS::applyQualitySettings( void ) void audioOSS::run( void ) { - surroundSampleFrame * temp = - new surroundSampleFrame[getMixer()->framesPerPeriod()]; - int_sample_t * outbuf = - new int_sample_t[getMixer()->framesPerPeriod() * - channels()]; + sampleFrameA * temp = alignedAllocFrames( + getMixer()->framesPerPeriod() ); + intSampleFrameA * outbuf = (intSampleFrameA *) + alignedMalloc( sizeof( intSampleFrameA ) * + getMixer()->framesPerPeriod() ); - while( TRUE ) + while( 1 ) { const fpp_t frames = getNextBuffer( temp ); if( !frames ) @@ -312,8 +313,8 @@ void audioOSS::run( void ) break; } - int bytes = convertToS16( temp, frames, - getMixer()->masterGain(), outbuf, + int bytes = alignedConvertToS16( temp, outbuf, frames, + getMixer()->masterGain(), m_convertEndian ); if( write( m_audioFD, outbuf, bytes ) != bytes ) { @@ -321,8 +322,8 @@ void audioOSS::run( void ) } } - delete[] temp; - delete[] outbuf; + alignedFreeFrames( temp ); + alignedFree( outbuf ); } diff --git a/src/core/audio/audio_port.cpp b/src/core/audio/audio_port.cpp index 42f4d8d85..119013339 100644 
--- a/src/core/audio/audio_port.cpp +++ b/src/core/audio/audio_port.cpp @@ -26,13 +26,15 @@ #include "audio_device.h" #include "effect_chain.h" #include "engine.h" +#include "basic_ops.h" audioPort::audioPort( const QString & _name, bool _has_effect_chain ) : m_bufferUsage( NoUsage ), - m_firstBuffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ), - m_secondBuffer( new sampleFrame[ - engine::getMixer()->framesPerPeriod()] ), + m_firstBuffer( alignedAllocFrames( + engine::getMixer()->framesPerPeriod() ) ), + m_secondBuffer( alignedAllocFrames( + engine::getMixer()->framesPerPeriod() ) ), m_extOutputEnabled( false ), m_nextFxChannel( 0 ), m_name( "unnamed port" ), @@ -53,8 +55,8 @@ audioPort::~audioPort() { setExtOutputEnabled( false ); engine::getMixer()->removeAudioPort( this ); - delete[] m_firstBuffer; - delete[] m_secondBuffer; + alignedFreeFrames( m_firstBuffer ); + alignedFreeFrames( m_secondBuffer ); delete m_effects; } diff --git a/src/core/audio/audio_portaudio.cpp b/src/core/audio/audio_portaudio.cpp index 6276e0cea..c99d92ddc 100644 --- a/src/core/audio/audio_portaudio.cpp +++ b/src/core/audio/audio_portaudio.cpp @@ -55,11 +55,12 @@ void audioPortAudioSetupUtil::updateChannels( void ) audioPortAudio::audioPortAudio( bool & _success_ful, mixer * _mixer ) : audioDevice( tLimit( - configManager::inst()->value( "audioportaudio", "channels" ).toInt(), + configManager::inst()->value( "audioportaudio", + "channels" ).toInt(), DEFAULT_CHANNELS, SURROUND_CHANNELS ), _mixer ), m_wasPAInitError( false ), - m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ), + m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ), m_outBufPos( 0 ), m_stopSemaphore( 1 ) { @@ -205,7 +206,7 @@ audioPortAudio::~audioPortAudio() { Pa_Terminate(); } - delete[] m_outBuf; + alignedFreeFrames( m_outBuf ); } diff --git a/src/core/audio/audio_pulseaudio.cpp b/src/core/audio/audio_pulseaudio.cpp index 0a4b8978d..09c9d6217 100644 --- 
a/src/core/audio/audio_pulseaudio.cpp +++ b/src/core/audio/audio_pulseaudio.cpp @@ -40,6 +40,7 @@ #include "lcd_spinbox.h" #include "gui_templates.h" #include "templates.h" +#include "basic_ops.h" static void stream_write_callback(pa_stream *s, size_t length, void *userdata) @@ -230,8 +231,9 @@ void audioPulseAudio::run( void ) void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length) { const fpp_t fpp = getMixer()->framesPerPeriod(); - surroundSampleFrame * temp = new surroundSampleFrame[fpp]; - Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() * sizeof(Sint16) ); + sampleFrameA * temp = alignedAllocFrames( fpp ); + Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() * + sizeof(Sint16) ); size_t fd = 0; while( fd < length/4 ) @@ -241,9 +243,10 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length) { return; } - int bytes = convertToS16( temp, frames, + int bytes = alignedConvertToS16( temp, + (intSampleFrameA *) pcmbuf, + frames, getMixer()->masterGain(), - pcmbuf, m_convertEndian ); if( bytes > 0 ) { @@ -254,7 +257,7 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length) } pa_xfree( pcmbuf ); - delete[] temp; + alignedFreeFrames( temp ); } diff --git a/src/core/audio/audio_sdl.cpp b/src/core/audio/audio_sdl.cpp index af7935d49..e4066a9b8 100644 --- a/src/core/audio/audio_sdl.cpp +++ b/src/core/audio/audio_sdl.cpp @@ -38,22 +38,22 @@ #include "config_mgr.h" #include "gui_templates.h" #include "templates.h" - +#include "basic_ops.h" audioSDL::audioSDL( bool & _success_ful, mixer * _mixer ) : audioDevice( DEFAULT_CHANNELS, _mixer ), - m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ), + m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ), m_convertedBufPos( 0 ), m_convertEndian( false ), m_stopSemaphore( 1 ) { _success_ful = FALSE; - m_convertedBufSize = getMixer()->framesPerPeriod() * channels() - * sizeof( int_sample_t ); - m_convertedBuf = new Uint8[m_convertedBufSize]; + 
m_convertedBufSize = getMixer()->framesPerPeriod() * + sizeof( intSampleFrameA ); + m_convertedBuf = (intSampleFrameA *) alignedMalloc( m_convertedBufSize ); if( SDL_Init( SDL_INIT_AUDIO | SDL_INIT_NOPARACHUTE ) < 0 ) @@ -97,8 +97,8 @@ audioSDL::~audioSDL() SDL_CloseAudio(); SDL_Quit(); - delete[] m_convertedBuf; - delete[] m_outBuf; + alignedFree( m_convertedBuf ); + alignedFreeFrames( m_outBuf ); } @@ -190,12 +190,12 @@ void audioSDL::sdlAudioCallback( Uint8 * _buf, int _len ) memset( _buf, 0, _len ); return; } - m_convertedBufSize = frames * channels() - * sizeof( int_sample_t ); + m_convertedBufSize = frames * sizeof( intSampleFrameA ); - convertToS16( m_outBuf, frames, + alignedConvertToS16( m_outBuf, + m_convertedBuf, + frames, getMixer()->masterGain(), - (int_sample_t *)m_convertedBuf, m_convertEndian ); } const int min_len = qMin( _len, m_convertedBufSize diff --git a/src/core/basic_ops.cpp b/src/core/basic_ops.cpp new file mode 100644 index 000000000..8ae8d800c --- /dev/null +++ b/src/core/basic_ops.cpp @@ -0,0 +1,455 @@ +/* + * basic_ops.cpp - basic memory operations + * + * Copyright (c) 2008 Tobias Doerffel + * + * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program (see COPYING); if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA. 
+ * + */ + + +#include "basic_ops.h" + +#include +#include +#include + + +/* alignedMalloc: over-allocates with malloc() and returns a pointer advanced to the next ALIGN_SIZE boundary (the mask arithmetic presumes ALIGN_SIZE is a power of two). The int stored directly below the returned pointer records its distance from the raw malloc() block so that alignedFree() can recover that block. Returns NULL when malloc() fails. */ +void * alignedMalloc( int _bytes ) +{ + char *ptr,*ptr2,*aligned_ptr; + int align_mask = ALIGN_SIZE- 1; + ptr =(char *) malloc( _bytes + ALIGN_SIZE + sizeof(int) ); + if( ptr == NULL ) + { + return NULL; + } + + ptr2 = ptr + sizeof(int); + aligned_ptr = ptr2 + ( ALIGN_SIZE- ( (size_t) ptr2 & align_mask ) ); + + + ptr2 = aligned_ptr - sizeof(int); + *((int *) ptr2) = (int)( aligned_ptr - ptr ); + + return aligned_ptr; +} + +/* alignedFree: releases a buffer obtained from alignedMalloc() by reading the offset stored just below _buf and passing the recovered raw pointer to free(). A NULL _buf is ignored. */ +void alignedFree( void * _buf ) +{ + if( _buf ) + { + int * ptr2 = (int *) _buf - 1; + void * buf2 = (char *) _buf - *ptr2; + if( buf2 ) + { + free( buf2 ); + } + } +} + +/* alignedAllocFrames: alignedMalloc() sized for _n sampleFrameA elements. */ +sampleFrameA * alignedAllocFrames( int _n ) +{ + return (sampleFrameA *) alignedMalloc( _n * sizeof( sampleFrameA ) ); +} + +/* alignedFreeFrames: forwards _buf to alignedFree(). */ +void alignedFreeFrames( sampleFrame * _buf ) +{ + alignedFree( _buf ); +} + + + +/* alignedMemCpyNoOpt: portable copy of _size bytes from _src to _dst, moved as ints in unrolled groups of 16. Bytes past the last whole group of 16 ints are not copied. */ +// slow fallback +void alignedMemCpyNoOpt( void * RP _dst, const void * RP _src, int _size ) +{ + const int s = ( _size / ( sizeof( int ) * 16 ) ) * 16;/* s counts ints (whole 16-int groups only): i below indexes ints and advances by 16, so a bound expressed in groups would stop after about 1/16 of the buffer */ + const int * RP src = (const int *) _src; + int * RP dst = (int *) _dst; + for( int i = 0; i < s; ) + { + dst[i+0] = src[i+0]; + dst[i+1] = src[i+1]; + dst[i+2] = src[i+2]; + dst[i+3] = src[i+3]; + dst[i+4] = src[i+4]; + dst[i+5] = src[i+5]; + dst[i+6] = src[i+6]; + dst[i+7] = src[i+7]; + dst[i+8] = src[i+8]; + dst[i+9] = src[i+9]; + dst[i+10] = src[i+10]; + dst[i+11] = src[i+11]; + dst[i+12] = src[i+12]; + dst[i+13] = src[i+13]; + dst[i+14] = src[i+14]; + dst[i+15] = src[i+15]; + i += 16; + } +} + +/* alignedMemClearNoOpt: portable zero-fill of _dst, four ints per iteration for _size / ( sizeof( int ) * 4 ) iterations; a trailing partial group is left untouched. */ +// slow fallback +void alignedMemClearNoOpt( void * _dst, int _size ) +{ + const int s = _size / ( sizeof( int ) * 4 ); + int * dst = (int *) _dst; + for( int i = 0; i < s; ++i ) + { + dst[0] = 0; + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + dst += 4; + } +} + + +/* alignedBufApplyGainNoOpt: multiplies both channels of every frame in _dst by _gain, eight frames per loop iteration. The bound i < _frames is tested only once per eight frames, so whole groups of eight are always processed. */ +void alignedBufApplyGainNoOpt( sampleFrameA * RP _dst, float _gain, + int _frames ) +{ + for( int i = 0; i < _frames; ) + { + _dst[i+0][0] *= _gain; + _dst[i+0][1] *= _gain; + _dst[i+1][0] *=
_gain; + _dst[i+1][1] *= _gain; + _dst[i+2][0] *= _gain; + _dst[i+2][1] *= _gain; + _dst[i+3][0] *= _gain; + _dst[i+3][1] *= _gain; + _dst[i+4][0] *= _gain; + _dst[i+4][1] *= _gain; + _dst[i+5][0] *= _gain; + _dst[i+5][1] *= _gain; + _dst[i+6][0] *= _gain; + _dst[i+6][1] *= _gain; + _dst[i+7][0] *= _gain; + _dst[i+7][1] *= _gain; + i += 8; + } +} + +/* alignedBufMixNoOpt: adds both channels of each _src frame onto the matching _dst frame, four frames per loop iteration. The bound i < _frames is tested only once per four frames, so whole groups of four are always processed. */ +void alignedBufMixNoOpt( sampleFrameA * RP _dst, const sampleFrameA * RP _src, + int _frames ) +{ + for( int i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]; + _dst[i+0][1] += _src[i+0][1]; + _dst[i+1][0] += _src[i+1][0]; + _dst[i+1][1] += _src[i+1][1]; + _dst[i+2][0] += _src[i+2][0]; + _dst[i+2][1] += _src[i+2][1]; + _dst[i+3][0] += _src[i+3][0]; + _dst[i+3][1] += _src[i+3][1]; + i += 4; + } +} + + +/* alignedBufMixLRCoeffNoOpt: like alignedBufMixNoOpt, but channel 0 of _src is scaled by _left and channel 1 by _right before being added to _dst; four frames per loop iteration. */ +void alignedBufMixLRCoeffNoOpt( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _left, float _right, int _frames ) +{ + for( int i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]*_left; + _dst[i+0][1] += _src[i+0][1]*_right; + _dst[i+1][0] += _src[i+1][0]*_left; + _dst[i+1][1] += _src[i+1][1]*_right; + _dst[i+2][0] += _src[i+2][0]*_left; + _dst[i+2][1] += _src[i+2][1]*_right; + _dst[i+3][0] += _src[i+3][0]*_left; + _dst[i+3][1] += _src[i+3][1]*_right; + i += 4; + } +} + + +/* unalignedBufMixLRCoeffNoOpt: same left/right weighted mix for plain sampleFrame buffers. When _frames is odd, one frame is handled up front and both pointers are advanced, so the main loop can then work on pairs of frames. */ +void unalignedBufMixLRCoeffNoOpt( sampleFrame * RP _dst, + const sampleFrame * RP _src, + const float _left, + const float _right, + int _frames ) +{ + if( _frames % 2 ) + { + _dst[0][0] += _src[0][0] * _left; + _dst[0][1] += _src[0][1] * _right; + ++_src; + ++_dst; + --_frames; + } + for( int i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]*_left; + _dst[i+0][1] += _src[i+0][1]*_right; + _dst[i+1][0] += _src[i+1][0]*_left; + _dst[i+1][1] += _src[i+1][1]*_right; + i += 2; + } +} + + +/* alignedBufWetDryMixNoOpt: replaces every _dst sample with _dst * _dry + _src * _wet, one frame per iteration. */ +void alignedBufWetDryMixNoOpt( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _wet, float _dry, int _frames ) +{ + for( int i = 0; i < _frames; ++i ) + { + _dst[i+0][0] = _dst[i+0][0]*_dry + _src[i+0][0]*_wet; +
_dst[i+0][1] = _dst[i+0][1]*_dry + _src[i+0][1]*_wet; + } +} + + + +/* alignedBufWetDryMixSplittedNoOpt: wet/dry mix where the wet signal arrives as two separate mono arrays. For every frame i, channel 0 of _dst becomes _dst * _dry + _left[i] * _wet and channel 1 becomes _dst * _dry + _right[i] * _wet. */ +void alignedBufWetDryMixSplittedNoOpt( sampleFrameA * RP _dst, + const float * RP _left, + const float * RP _right, + float _wet, float _dry, int _frames ) +{ + int i; + for( i = 0; i < _frames; )/* i advances once per frame in the body only; stepping it in the header as well skipped every odd frame */ + { + _dst[i+0][0] = _dst[i+0][0]*_dry + _left[i+0]*_wet; + _dst[i+0][1] = _dst[i+0][1]*_dry + _right[i+0]*_wet; + ++i; + } +} + + + +/* alignedConvertToS16NoOpt: scales every sample of _src by _master_gain * OUTPUT_SAMPLE_MULTIPLIER, truncates it to int, clamps it to [-32768, 32767] and stores it in _dst. With _convert_endian set, the two low bytes are swapped before the store. Returns _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE. */ +int alignedConvertToS16NoOpt( const sampleFrameA * RP _src, + intSampleFrameA * RP _dst, + const fpp_t _frames, + const float _master_gain, + const bool _convert_endian ) +{ + int t1; + int t2; + const float f = _master_gain * OUTPUT_SAMPLE_MULTIPLIER; + if( _convert_endian ) + { + for( fpp_t frame = 0; frame < _frames; ++frame ) + { + t1 = _src[frame][0] * f; + t1 = unlikely( t1 > 32767 ) ? 32767 : t1; + t1 = unlikely( t1 < -32768 ) ? -32768 : t1; + _dst[frame][0] = ( t1 & 0x00ff) << 8 | + ( t1 & 0xff00 ) >> 8; + + t2 = _src[frame][1] * f; + t2 = unlikely( t2 > 32767 ) ? 32767 : t2; + t2 = unlikely( t2 < -32768 ) ? -32768 : t2; + _dst[frame][1] = ( t2 & 0x00ff) << 8 | + ( t2 & 0xff00 ) >> 8; + } + } + else + { + for( fpp_t frame = 0; frame < _frames; ++frame ) + { + t1 = _src[frame][0] * f; + t1 = unlikely( t1 > 32767 ) ? 32767 : t1; + t1 = unlikely( t1 < -32768 ) ? -32768 : t1; + _dst[frame][0] = t1; + + t2 = _src[frame][1] * f; + t2 = unlikely( t2 > 32767 ) ? 32767 : t2; + t2 = unlikely( t2 < -32768 ) ?
-32768 : t2; + _dst[frame][1] = t2; + } + } + + return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE; +} + +/* Dispatch pointers used by callers. Each starts out at the portable NoOpt implementation; initBasicOps() may redirect them to the MMX/SSE/SSE2 routines. */ +alignedMemCpyFunc alignedMemCpy = alignedMemCpyNoOpt; +alignedMemClearFunc alignedMemClear = alignedMemClearNoOpt; +alignedBufApplyGainFunc alignedBufApplyGain = alignedBufApplyGainNoOpt; +alignedBufMixFunc alignedBufMix = alignedBufMixNoOpt; +alignedBufMixLRCoeffFunc alignedBufMixLRCoeff = alignedBufMixLRCoeffNoOpt; +unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff = unalignedBufMixLRCoeffNoOpt; +alignedBufWetDryMixFunc alignedBufWetDryMix = alignedBufWetDryMixNoOpt; +alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted = alignedBufWetDryMixSplittedNoOpt; +alignedConvertToS16Func alignedConvertToS16 = alignedConvertToS16NoOpt; + +/* CPUFeatures: bit flags that initBasicOps() accumulates from the CPUID results. */ +#ifdef X86_OPTIMIZATIONS +enum CPUFeatures +{ + None = 0, + MMX = 0x1, + MMXEXT = 0x2, + MMX3DNOW = 0x4, + MMX3DNOWEXT = 0x8, + SSE = 0x10, + SSE2 = 0x20, + CMOV = 0x40, + IWMMXT = 0x80 +}; +/* Prototypes (C linkage) of the optimized routines that initBasicOps() installs; the MMX pair is only declared for LMMS_HOST_X86 builds. */ +extern "C" +{ +#ifdef LMMS_HOST_X86 +void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size ); +void alignedMemClearMMX( void * RP _dst, int _size ); +#endif +void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size ); +void alignedMemClearSSE( void * RP _dst, int _size ); +void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames ); +void alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, int _frames ); +void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _left, float _right, int _frames ); +void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src, const float _left, const float _right, int _frames ); +void alignedBufWetDryMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _wet, float _dry, int _frames ); +void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst, const float * RP _left, const float * RP _right, float _wet, float _dry, int _frames ); +#ifdef LMMS_HOST_X86
+void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size ); +void alignedMemClearSSE2( void * RP _dst, int _size ); +int alignedConvertToS16SSE2( const sampleFrameA * RP _src, intSampleFrameA * RP _dst, const fpp_t _frames, const float _master_gain, const bool _convert_endian ); +#endif/* NOTE(review): the three SSE2 prototypes above are declared only under LMMS_HOST_X86, but initBasicOps() assigns them without that guard - confirm X86_OPTIMIZATIONS is never defined for x86_64 builds. */ +} ; +#endif + + +/* initBasicOps: on X86_OPTIMIZATIONS builds, queries CPUID once (guarded by a function-local static flag) and repoints the dispatch pointers at the MMX/SSE/SSE2 routines the CPU reports. Without X86_OPTIMIZATIONS the body is empty. */ +void initBasicOps( void ) +{ +#ifdef X86_OPTIMIZATIONS + static bool extensions_checked = false; + if( extensions_checked == false ) + { + int features = 0; + unsigned int result = 0; + unsigned int extended_result = 0;/* First asm block: try to toggle bit 21 of EFLAGS; if the bit cannot be toggled, CPUID is skipped and EDX stays 0. Otherwise CPUID leaf 1 is run. EDX is copied into result; EBX is saved and restored by hand. NOTE(review): the 32-bit push/pop of ebx/eax looks x86-32 specific - confirm before building this for x86_64. */ + asm( "push %%ebx\n" + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ebx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "xor %%edx, %%edx\n" + "xor %%ebx, %%eax\n" + "jz 1f\n" + + "mov $0x00000001, %%eax\n" + "cpuid\n" + "1:\n" + "pop %%ebx\n" + "mov %%edx, %0\n" + + : "=r" (result) + : + : "%eax", "%ecx", "%edx" + ); +/* Second asm block: same EFLAGS probe, then CPUID 0x80000000; only if the value returned in EAX is above 0x80000000 is CPUID 0x80000001 run. EDX is copied into extended_result. */ + asm( "push %%ebx\n" + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ebx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "xor %%edx, %%edx\n" + "xor %%ebx, %%eax\n" + "jz 2f\n" + + "mov $0x80000000, %%eax\n" + "cpuid\n" + "cmp $0x80000000, %%eax\n" + "jbe 2f\n" + "mov $0x80000001, %%eax\n" + "cpuid\n" + "2:\n" + "pop %%ebx\n" + "mov %%edx, %0\n" + + : "=r" (extended_result) + : + : "%eax", "%ecx", "%edx" + ); +/* Translate the individual EDX bits into CPUFeatures flags. */ + if( result & (1u << 15) ) + features |= CMOV; + if( result & (1u << 23) ) + features |= MMX; + if( extended_result & (1u << 22) ) + features |= MMXEXT; + if( extended_result & (1u << 31) ) + features |= MMX3DNOW; + if( extended_result & (1u << 30) ) + features |= MMX3DNOWEXT; + if( result & (1u << 25) ) + features |= SSE; + if( result & (1u << 26) ) + features |= SSE2; +/* Install routines from the least to the most capable instruction set; a later block overwrites pointers set by an earlier one. */ +#ifdef LMMS_HOST_X86 + if( features & MMX ) + { + alignedMemCpy = alignedMemCpyMMX; + alignedMemClear = alignedMemClearMMX; + } +#endif + if( features & SSE ) + { + fprintf( stderr, "Using SSE optimized routines\n" ); + alignedMemCpy = alignedMemCpySSE; +
alignedMemClear = alignedMemClearSSE; + alignedBufApplyGain = alignedBufApplyGainSSE; + alignedBufMix = alignedBufMixSSE; + alignedBufMixLRCoeff = alignedBufMixLRCoeffSSE; + unalignedBufMixLRCoeff = unalignedBufMixLRCoeffSSE; + alignedBufWetDryMix = alignedBufWetDryMixSSE; + alignedBufWetDryMixSplitted = + alignedBufWetDryMixSplittedSSE; + }/* SSE2 is tested last, so its memcpy/memclear routines replace the SSE ones chosen above; it is the only set that provides a ConvertToS16 routine. */ + if( features & SSE2 ) + { + fprintf( stderr, "Using SSE2 optimized routines\n" ); + alignedMemCpy = alignedMemCpySSE2; + alignedMemClear = alignedMemClearSSE2; + alignedConvertToS16 = alignedConvertToS16SSE2; + } + extensions_checked = true; + } +#endif +} + + + diff --git a/src/core/basic_ops_x86.c b/src/core/basic_ops_x86.c new file mode 100644 index 000000000..8b8535095 --- /dev/null +++ b/src/core/basic_ops_x86.c @@ -0,0 +1,395 @@ +/* + * basic_ops_x86.c - x86 specific optimized operations + * + * Copyright (c) 2008 Tobias Doerffel + * + * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program (see COPYING); if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA.
+ * + */ + + + +#include "basic_ops.h" + +#ifdef X86_OPTIMIZATIONS + +#ifdef BUILD_MMX + +#include +/* alignedMemCpyMMX: MMX copy routine. The visible part saves the x87 state with fsave into fpu_save and issues prefetchnta hints on the source before entering its copy loop. NOTE(review): the text after "for(i=0; i" up to the next function is missing here (everything between angle brackets appears to have been stripped, which also emptied the include lines), so the rest of this function cannot be reviewed. */ +void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size ) +{ + const int s = _size / ( sizeof( __m64 ) * 8 ); + int i; + char fpu_save[108]; + char * RP src = (char *) _src; + char * RP dst = (char *) _dst; + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + __asm__ __volatile__ ( + "1: prefetchnta (%0)\n" + " prefetchnta 64(%0)\n" + " prefetchnta 128(%0)\n" + " prefetchnta 192(%0)\n" + " prefetchnta 256(%0)\n" + : : "r" (src) ); + for(i=0; i +/* alignedMemCpySSE: copies _size bytes in groups of four __m128 values using plain __m128 assignments (the explicit load/store intrinsics are kept commented out). A trailing partial group is not copied. */ +void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size ) +{ + __m128 * dst = (__m128 *) _dst; + __m128 * src = (__m128 *) _src; + const int s = _size / ( sizeof( *dst ) * 4 ); + int i; + for( i = 0; i < s; ++i ) + { +/* _mm_store_ps( dst+0, _mm_load_ps( src+0 ) ); + _mm_store_ps( dst+1, _mm_load_ps( src+1 ) ); + _mm_store_ps( dst+2, _mm_load_ps( src+2 ) ); + _mm_store_ps( dst+3, _mm_load_ps( src+3 ) );*/ + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + src += 4; + dst += 4; + } +} + + + +/* alignedMemClearSSE: zero-fills _dst in groups of four __m128 values taken from _mm_setzero_ps(); a trailing partial group is left untouched. */ +void alignedMemClearSSE( void * RP _dst, int _size ) +{ + __m128 * dst = (__m128 *) _dst; + const int s = _size / ( sizeof( *dst ) * 4 ); + __m128 val = _mm_setzero_ps(); + int i; + for( i = 0; i < s; ++i ) + { + dst[0] = val; + dst[1] = val; + dst[2] = val; + dst[3] = val; + dst += 4; + } +} + + + +/* alignedBufApplyGainSSE: multiplies both channels of every frame in _dst by _gain, eight frames per loop iteration. The source is plain scalar C; any SSE code comes from how the compiler translates this file. */ +void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames ) +{ + int i; + for( i = 0; i < _frames; ) + { + _dst[i+0][0] *= _gain; + _dst[i+0][1] *= _gain; + _dst[i+1][0] *= _gain; + _dst[i+1][1] *= _gain; + _dst[i+2][0] *= _gain; + _dst[i+2][1] *= _gain; + _dst[i+3][0] *= _gain; + _dst[i+3][1] *= _gain; + _dst[i+4][0] *= _gain; + _dst[i+4][1] *= _gain; + _dst[i+5][0] *= _gain; + _dst[i+5][1] *= _gain; + _dst[i+6][0] *= _gain; + _dst[i+6][1] *= _gain; + _dst[i+7][0] *= _gain; + _dst[i+7][1] *= _gain; + i += 8; + } +} + + + +/* alignedBufMixSSE: adds _src onto _dst frame by frame. Each loop iteration runs two unrolled groups of four frames with a single i < _frames test, so eight frames are always processed per iteration. */ +void
alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, + int _frames ) +{ + int i; + for( i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]; + _dst[i+0][1] += _src[i+0][1]; + _dst[i+1][0] += _src[i+1][0]; + _dst[i+1][1] += _src[i+1][1]; + _dst[i+2][0] += _src[i+2][0]; + _dst[i+2][1] += _src[i+2][1]; + _dst[i+3][0] += _src[i+3][0]; + _dst[i+3][1] += _src[i+3][1]; + i += 4;/* second group of four follows without re-testing i against _frames */ + _dst[i+0][0] += _src[i+0][0]; + _dst[i+0][1] += _src[i+0][1]; + _dst[i+1][0] += _src[i+1][0]; + _dst[i+1][1] += _src[i+1][1]; + _dst[i+2][0] += _src[i+2][0]; + _dst[i+2][1] += _src[i+2][1]; + _dst[i+3][0] += _src[i+3][0]; + _dst[i+3][1] += _src[i+3][1]; + i += 4; + } +} + + +/* alignedBufMixLRCoeffSSE: adds _src onto _dst with channel 0 scaled by _left and channel 1 scaled by _right, four frames per loop iteration. */ +void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _left, float _right, int _frames ) +{ + int i; + for( i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]*_left; + _dst[i+0][1] += _src[i+0][1]*_right; + _dst[i+1][0] += _src[i+1][0]*_left; + _dst[i+1][1] += _src[i+1][1]*_right; + _dst[i+2][0] += _src[i+2][0]*_left; + _dst[i+2][1] += _src[i+2][1]*_right; + _dst[i+3][0] += _src[i+3][0]*_left; + _dst[i+3][1] += _src[i+3][1]*_right; + i += 4; + } +} + + +/* unalignedBufMixLRCoeffSSE: left/right weighted mix for plain sampleFrame buffers. An odd _frames is handled by mixing one frame up front and advancing both pointers, after which the loop works on pairs of frames. */ +void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src, + const float _left, + const float _right, + int _frames ) +{ + int i; + if( unlikely( _frames % 2 ) ) + { + _dst[0][0] += _src[0][0] * _left; + _dst[0][1] += _src[0][1] * _right; + ++_src; + ++_dst; + --_frames; + } + + for( i = 0; i < _frames; ) + { + _dst[i+0][0] += _src[i+0][0]*_left; + _dst[i+0][1] += _src[i+0][1]*_right; + _dst[i+1][0] += _src[i+1][0]*_left; + _dst[i+1][1] += _src[i+1][1]*_right; + i += 2; + } +} + + +/* alignedBufWetDryMixSSE: replaces every _dst sample with _dst * _dry + _src * _wet, four frames per loop iteration. */ +void alignedBufWetDryMixSSE( sampleFrameA * RP _dst, + const sampleFrameA * RP _src, + float _wet, float _dry, int _frames ) +{ + int i; + for( i = 0; i < _frames; ) + { + _dst[i+0][0] = _dst[i+0][0]*_dry + _src[i+0][0]*_wet; + _dst[i+0][1] = _dst[i+0][1]*_dry + _src[i+0][1]*_wet; + _dst[i+1][0] =
_dst[i+1][0]*_dry + _src[i+1][0]*_wet; + _dst[i+1][1] = _dst[i+1][1]*_dry + _src[i+1][1]*_wet; + _dst[i+2][0] = _dst[i+2][0]*_dry + _src[i+2][0]*_wet; + _dst[i+2][1] = _dst[i+2][1]*_dry + _src[i+2][1]*_wet; + _dst[i+3][0] = _dst[i+3][0]*_dry + _src[i+3][0]*_wet; + _dst[i+3][1] = _dst[i+3][1]*_dry + _src[i+3][1]*_wet; + i += 4; + } +} + + + +/* alignedBufWetDryMixSplittedSSE: wet/dry mix where the wet signal comes from two mono arrays; channel 0 uses _left[i], channel 1 uses _right[i]. Two frames are processed per loop iteration. */ +void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst, + const float * RP _left, + const float * RP _right, + float _wet, float _dry, int _frames ) +{ + int i; + for( i = 0; i < _frames; ) + { + _dst[i+0][0] = _dst[i+0][0]*_dry + _left[i+0]*_wet; + _dst[i+0][1] = _dst[i+0][1]*_dry + _right[i+0]*_wet; + _dst[i+1][0] = _dst[i+1][0]*_dry + _left[i+1]*_wet; + _dst[i+1][1] = _dst[i+1][1]*_dry + _right[i+1]*_wet; + i += 2; + } +} + + + +#endif + + +#ifdef BUILD_SSE2 + +#include +/* alignedMemCpySSE2: copies _size bytes in groups of four __m128i values using _mm_load_si128 / _mm_store_si128; a trailing partial group is not copied. */ +void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size ) +{ + __m128i * dst = (__m128i *) _dst; + __m128i * src = (__m128i *) _src; + const int s = _size / ( sizeof( *dst ) * 4 ); + int i; + for( i = 0; i < s; ++i ) + { + _mm_store_si128( dst+0, _mm_load_si128( src+0 ) ); + _mm_store_si128( dst+1, _mm_load_si128( src+1 ) ); + _mm_store_si128( dst+2, _mm_load_si128( src+2 ) ); + _mm_store_si128( dst+3, _mm_load_si128( src+3 ) ); + src += 4; + dst += 4; + } +} + + + +/* alignedMemClearSSE2: zero-fills _dst in groups of four __m128i values stored with _mm_store_si128; a trailing partial group is left untouched. */ +void alignedMemClearSSE2( void * RP _dst, int _size ) +{ + __m128i * dst = (__m128i *) _dst; + const int s = _size / ( sizeof( *dst ) * 4 ); + __m128i val = _mm_setzero_si128(); + int i; + for( i = 0; i < s; ++i ) + { + _mm_store_si128( dst+0, val ); + _mm_store_si128( dst+1, val ); + _mm_store_si128( dst+2, val ); + _mm_store_si128( dst+3, val ); + dst += 4; + } +} + + +/* alignedConvertToS16SSE2: scales every sample of _src by _master_gain * OUTPUT_SAMPLE_MULTIPLIER, truncates it to int, clamps it to [-32768, 32767] and stores it in _dst, swapping the two low bytes when _convert_endian is set. The source is plain scalar C. */ +int alignedConvertToS16SSE2( const sampleFrameA * RP _src, + intSampleFrameA * RP _dst, + const fpp_t _frames, + const float _master_gain, + const bool _convert_endian ) +{ + int t1; + int t2; + fpp_t frame; + const float f = _master_gain * OUTPUT_SAMPLE_MULTIPLIER; + if( _convert_endian ) + { + for(
frame = 0; frame < _frames; ++frame ) + { + t1 = _src[frame][0] * f; + t1 = unlikely( t1 > 32767 ) ? 32767 : t1; + t1 = unlikely( t1 < -32768 ) ? -32768 : t1; + _dst[frame][0] = ( t1 & 0x00ff) << 8 | + ( t1 & 0xff00 ) >> 8; + + t2 = _src[frame][1] * f; + t2 = unlikely( t2 > 32767 ) ? 32767 : t2; + t2 = unlikely( t2 < -32768 ) ? -32768 : t2; + _dst[frame][1] = ( t2 & 0x00ff) << 8 | + ( t2 & 0xff00 ) >> 8; + } + } + else + { + for( frame = 0; frame < _frames; ++frame ) + { + t1 = _src[frame][0] * f; + t1 = unlikely( t1 > 32767 ) ? 32767 : t1; + t1 = unlikely( t1 < -32768 ) ? -32768 : t1; + _dst[frame][0] = t1; + + t2 = _src[frame][1] * f; + t2 = unlikely( t2 > 32767 ) ? 32767 : t2; + t2 = unlikely( t2 < -32768 ) ? -32768 : t2; + _dst[frame][1] = t2; + } + } + + return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE; +} + + + +#endif + +#endif diff --git a/src/core/basic_ops_x86_64_sse.s b/src/core/basic_ops_x86_64_sse.s new file mode 100644 index 000000000..6c42a8f45 --- /dev/null +++ b/src/core/basic_ops_x86_64_sse.s @@ -0,0 +1,563 @@ + .file "basic_ops_x86.c" + .text + .align 16 +.globl alignedMemCpySSE + .type alignedMemCpySSE, @function +alignedMemCpySSE: +.LFB509: + movslq %edx,%rdx + shrq $6, %rdx + testl %edx, %edx + jle .L4 + leal -1(%rdx), %r9d + xorl %eax, %eax + mov %r9d, %r8d + leaq 1(%r8), %rcx + movq %rcx, %rdx + salq $6, %rdx + .align 16 +.L3: + movaps (%rsi,%rax), %xmm0 + movaps %xmm0, (%rdi,%rax) + movaps 16(%rsi,%rax), %xmm0 + movaps %xmm0, 16(%rdi,%rax) + movaps 32(%rsi,%rax), %xmm0 + movaps %xmm0, 32(%rdi,%rax) + movaps 48(%rsi,%rax), %xmm0 + movaps %xmm0, 48(%rdi,%rax) + addq $64, %rax + cmpq %rdx, %rax + jne .L3 +.L4: + rep + ret +.LFE509: + .size alignedMemCpySSE, .-alignedMemCpySSE + .align 16 +.globl alignedMemClearSSE + .type alignedMemClearSSE, @function +alignedMemClearSSE: +.LFB510: + movslq %esi,%rax + shrq $6, %rax + testl %eax, %eax + jle .L10 + subl $1, %eax + xorps %xmm0, %xmm0 + salq $6, %rax + leaq 64(%rax,%rdi), %rax + .align 16 
+.L9: + movaps %xmm0, (%rdi) + movaps %xmm0, 16(%rdi) + movaps %xmm0, 32(%rdi) + movaps %xmm0, 48(%rdi) + addq $64, %rdi + cmpq %rax, %rdi + jne .L9 +.L10: + rep + ret +.LFE510: + .size alignedMemClearSSE, .-alignedMemClearSSE + .align 16 +.globl alignedBufApplyGainSSE + .type alignedBufApplyGainSSE, @function +alignedBufApplyGainSSE: +.LFB511: + testl %esi, %esi + jle .L15 + subl $1, %esi + shufps $0, %xmm0, %xmm0 + shrl $3, %esi + xorl %eax, %eax + leal 1(%rsi), %edx + .align 16 +.L14: + movaps %xmm0, %xmm3 + addl $1, %eax + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm4 + mulps 16(%rdi), %xmm3 + mulps 32(%rdi), %xmm2 + mulps 48(%rdi), %xmm1 + mulps (%rdi), %xmm4 + movaps %xmm3, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm1, 48(%rdi) + movaps %xmm4, (%rdi) + addq $64, %rdi + cmpl %eax, %edx + ja .L14 +.L15: + rep + ret +.LFE511: + .size alignedBufApplyGainSSE, .-alignedBufApplyGainSSE + .align 16 +.globl alignedBufMixSSE + .type alignedBufMixSSE, @function +alignedBufMixSSE: +.LFB512: + testl %edx, %edx + jle .L20 + subl $1, %edx + xorl %eax, %eax + shrl $3, %edx + leal 1(%rdx), %ecx + xorl %edx, %edx + .align 16 +.L19: + movaps 16(%rdi,%rax), %xmm2 + addl $1, %edx + movaps 32(%rdi,%rax), %xmm1 + addps 16(%rsi,%rax), %xmm2 + movaps 48(%rdi,%rax), %xmm0 + addps 32(%rsi,%rax), %xmm1 + movaps (%rdi,%rax), %xmm3 + addps 48(%rsi,%rax), %xmm0 + addps (%rsi,%rax), %xmm3 + movaps %xmm2, 16(%rdi,%rax) + movaps %xmm1, 32(%rdi,%rax) + movaps %xmm0, 48(%rdi,%rax) + movaps %xmm3, (%rdi,%rax) + addq $64, %rax + cmpl %edx, %ecx + ja .L19 +.L20: + rep + ret +.LFE512: + .size alignedBufMixSSE, .-alignedBufMixSSE + .align 16 +.globl alignedBufMixLRCoeffSSE + .type alignedBufMixLRCoeffSSE, @function +alignedBufMixLRCoeffSSE: +.LFB513: + testl %edx, %edx + jle .L25 + unpcklps %xmm1, %xmm0 + subl $1, %edx + shrl $2, %edx + xorl %eax, %eax + leal 1(%rdx), %ecx + xorl %edx, %edx + movlhps %xmm0, %xmm0 + .align 16 +.L24: + movaps %xmm0, %xmm1 + addl $1, %edx + movaps 
%xmm0, %xmm2 + mulps 16(%rsi,%rax), %xmm1 + mulps (%rsi,%rax), %xmm2 + addps 16(%rdi,%rax), %xmm1 + addps (%rdi,%rax), %xmm2 + movaps %xmm1, 16(%rdi,%rax) + movaps %xmm2, (%rdi,%rax) + addq $32, %rax + cmpl %edx, %ecx + ja .L24 +.L25: + rep + ret +.LFE513: + .size alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE + .align 16 +.globl alignedBufWetDryMixSSE + .type alignedBufWetDryMixSSE, @function +alignedBufWetDryMixSSE: +.LFB515: + testl %edx, %edx + jle .L30 + subl $1, %edx + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm0, %xmm0 + shrl $2, %edx + leal 1(%rdx), %ecx + xorl %eax, %eax + xorl %edx, %edx + .align 16 +.L29: + movaps %xmm1, %xmm3 + addl $1, %edx + movaps %xmm0, %xmm2 + mulps 16(%rdi,%rax), %xmm3 + movaps %xmm1, %xmm4 + mulps 16(%rsi,%rax), %xmm2 + mulps (%rdi,%rax), %xmm4 + addps %xmm3, %xmm2 + movaps %xmm0, %xmm3 + mulps (%rsi,%rax), %xmm3 + movaps %xmm2, 16(%rdi,%rax) + addps %xmm4, %xmm3 + movaps %xmm3, (%rdi,%rax) + addq $32, %rax + cmpl %edx, %ecx + ja .L29 +.L30: + rep + ret +.LFE515: + .size alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE + .align 16 +.globl alignedBufWetDryMixSplittedSSE + .type alignedBufWetDryMixSplittedSSE, @function +alignedBufWetDryMixSplittedSSE: +.LFB516: + pushq %rbp +.LCFI0: + testl %ecx, %ecx + pushq %rbx +.LCFI1: + jle .L39 + leal -1(%rcx), %ebx + shrl %ebx + addl $1, %ebx + movl %ebx, %r11d + shrl $2, %r11d + cmpl $3, %ebx + leal 0(,%r11,4), %ebp + jbe .L40 + testl %ebp, %ebp + jne .L34 +.L40: + xorl %r9d, %r9d + jmp .L36 + .align 16 +.L34: + movaps %xmm1, %xmm2 + movq %rdi, %rax + xorps %xmm6, %xmm6 + movq %rsi, %r9 + shufps $0, %xmm2, %xmm2 + movq %rdx, %r8 + xorl %r10d, %r10d + movaps %xmm2, %xmm8 + movaps %xmm0, %xmm2 + shufps $0, %xmm2, %xmm2 + movaps %xmm2, %xmm7 + .align 16 +.L37: + movaps (%rax), %xmm12 + addl $1, %r10d + movaps %xmm6, %xmm3 + movaps 16(%rax), %xmm5 + movaps %xmm12, %xmm14 + movlps (%r8), %xmm3 + movaps 32(%rax), %xmm9 + shufps $136, %xmm5, %xmm14 + shufps $221, %xmm5, %xmm12 + movhps 8(%r8), 
%xmm3 + movaps 48(%rax), %xmm4 + movaps %xmm9, %xmm13 + movaps %xmm6, %xmm5 + shufps $221, %xmm4, %xmm9 + movlps (%r9), %xmm5 + shufps $136, %xmm4, %xmm13 + movaps %xmm6, %xmm4 + movhps 8(%r9), %xmm5 + movaps %xmm14, %xmm11 + movlps 16(%r9), %xmm4 + movaps %xmm12, %xmm15 + movaps %xmm5, %xmm2 + movhps 24(%r9), %xmm4 + shufps $136, %xmm13, %xmm11 + movaps %xmm3, %xmm10 + addq $32, %r9 + shufps $136, %xmm4, %xmm2 + mulps %xmm8, %xmm11 + mulps %xmm7, %xmm2 + shufps $221, %xmm13, %xmm14 + shufps $136, %xmm9, %xmm15 + shufps $221, %xmm4, %xmm5 + addps %xmm2, %xmm11 + movaps %xmm6, %xmm2 + shufps $221, %xmm9, %xmm12 + movlps 16(%r8), %xmm2 + mulps %xmm8, %xmm14 + movhps 24(%r8), %xmm2 + mulps %xmm7, %xmm5 + movaps %xmm11, %xmm9 + addq $32, %r8 + shufps $136, %xmm2, %xmm10 + shufps $221, %xmm2, %xmm3 + movaps %xmm14, %xmm4 + mulps %xmm8, %xmm15 + addps %xmm5, %xmm4 + mulps %xmm7, %xmm10 + movaps %xmm11, %xmm5 + mulps %xmm8, %xmm12 + mulps %xmm7, %xmm3 + addps %xmm15, %xmm10 + unpcklps %xmm4, %xmm9 + movaps %xmm12, %xmm2 + unpckhps %xmm4, %xmm5 + addps %xmm3, %xmm2 + movaps %xmm10, %xmm4 + movaps %xmm10, %xmm3 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm3 + movaps %xmm9, %xmm2 + unpcklps %xmm4, %xmm2 + unpckhps %xmm4, %xmm9 + movaps %xmm2, (%rax) + movaps %xmm5, %xmm2 + unpckhps %xmm3, %xmm5 + unpcklps %xmm3, %xmm2 + movaps %xmm9, 16(%rax) + movaps %xmm2, 32(%rax) + movaps %xmm5, 48(%rax) + addq $64, %rax + cmpl %r10d, %r11d + ja .L37 + cmpl %ebx, %ebp + leal (%rbp,%rbp), %r9d + je .L39 +.L36: + movslq %r9d,%rax + leaq 1(%rax), %rbx + leaq 0(,%rax,4), %r10 + leaq (%rdi,%rax,8), %r8 + leaq (%rdi,%rbx,8), %rax + salq $2, %rbx + leaq (%rsi,%r10), %r11 + leaq (%rdx,%r10), %r10 + addq %rbx, %rsi + addq %rbx, %rdx + .align 16 +.L38: + movaps %xmm1, %xmm3 + addl $2, %r9d + movaps %xmm0, %xmm2 + mulss (%r8), %xmm3 + mulss (%r11), %xmm2 + addq $8, %r11 + addss %xmm3, %xmm2 + movaps %xmm1, %xmm3 + mulss 4(%r8), %xmm3 + movss %xmm2, (%r8) + movaps %xmm0, %xmm2 + mulss (%r10), %xmm2 
+ addq $8, %r10 + addss %xmm3, %xmm2 + movaps %xmm1, %xmm3 + movss %xmm2, 4(%r8) + movaps %xmm0, %xmm2 + addq $16, %r8 + mulss (%rax), %xmm3 + mulss (%rsi), %xmm2 + addq $8, %rsi + addss %xmm3, %xmm2 + movaps %xmm1, %xmm3 + mulss 4(%rax), %xmm3 + movss %xmm2, (%rax) + movaps %xmm0, %xmm2 + mulss (%rdx), %xmm2 + addq $8, %rdx + addss %xmm3, %xmm2 + movss %xmm2, 4(%rax) + addq $16, %rax + cmpl %r9d, %ecx + jg .L38 +.L39: + popq %rbx + popq %rbp + ret +.LFE516: + .size alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE + .align 16 +.globl unalignedBufMixLRCoeffSSE + .type unalignedBufMixLRCoeffSSE, @function +unalignedBufMixLRCoeffSSE: +.LFB514: + movl %edx, %eax + shrl $31, %eax + leal (%rdx,%rax), %ecx + andl $1, %ecx + cmpl %eax, %ecx + jne .L52 +.L44: + testl %edx, %edx + jle .L49 + subl $1, %edx + shrl %edx + testb $15, %dil + jne .L46 + unpcklps %xmm1, %xmm0 + addl $1, %edx + xorps %xmm3, %xmm3 + xorl %eax, %eax + movlhps %xmm0, %xmm0 + .align 16 +.L47: + movaps %xmm3, %xmm2 + addl $1, %eax + movaps %xmm3, %xmm1 + movlps (%rsi), %xmm2 + movlps (%rdi), %xmm1 + movhps 8(%rsi), %xmm2 + addq $16, %rsi + movhps 8(%rdi), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm1 + movaps %xmm1, (%rdi) + addq $16, %rdi + cmpl %edx, %eax + jb .L47 + rep + ret + .align 16 +.L46: + mov %edx, %edx + xorl %eax, %eax + addq $1, %rdx + salq $4, %rdx + .align 16 +.L48: + movaps %xmm0, %xmm2 + mulss (%rsi,%rax), %xmm2 + addss (%rdi,%rax), %xmm2 + movss %xmm2, (%rdi,%rax) + movaps %xmm1, %xmm2 + mulss 4(%rsi,%rax), %xmm2 + addss 4(%rdi,%rax), %xmm2 + movss %xmm2, 4(%rdi,%rax) + movaps %xmm0, %xmm2 + mulss 8(%rsi,%rax), %xmm2 + addss 8(%rdi,%rax), %xmm2 + movss %xmm2, 8(%rdi,%rax) + movaps %xmm1, %xmm2 + mulss 12(%rsi,%rax), %xmm2 + addss 12(%rdi,%rax), %xmm2 + movss %xmm2, 12(%rdi,%rax) + addq $16, %rax + cmpq %rdx, %rax + jne .L48 +.L49: + rep + ret +.L52: + movaps %xmm0, %xmm2 + subl $1, %edx + movss (%rdi), %xmm3 + mulss (%rsi), %xmm2 + addss %xmm3, %xmm2 + movss 4(%rdi), 
%xmm3 + movss %xmm2, (%rdi) + movaps %xmm1, %xmm2 + mulss 4(%rsi), %xmm2 + addq $8, %rsi + addss %xmm3, %xmm2 + movss %xmm2, 4(%rdi) + addq $8, %rdi + jmp .L44 +.LFE514: + .size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE + .section .eh_frame,"aw",@progbits +.Lframe1: + .long .LECIE1-.LSCIE1 +.LSCIE1: + .long 0x0 + .byte 0x1 + .string "zR" + .byte 0x1 + .byte 0x78 + .byte 0x10 + .byte 0x1 + .byte 0x3 + .byte 0xc + .byte 0x7 + .byte 0x8 + .byte 0x11 + .byte 0x10 + .byte 0x1 + .align 8 +.LECIE1: +.LSFDE1: + .long .LEFDE1-.LASFDE1 +.LASFDE1: + .long .LASFDE1-.Lframe1 + .long .LFB509 + .long .LFE509-.LFB509 + .byte 0x0 + .align 8 +.LEFDE1: +.LSFDE3: + .long .LEFDE3-.LASFDE3 +.LASFDE3: + .long .LASFDE3-.Lframe1 + .long .LFB510 + .long .LFE510-.LFB510 + .byte 0x0 + .align 8 +.LEFDE3: +.LSFDE5: + .long .LEFDE5-.LASFDE5 +.LASFDE5: + .long .LASFDE5-.Lframe1 + .long .LFB511 + .long .LFE511-.LFB511 + .byte 0x0 + .align 8 +.LEFDE5: +.LSFDE7: + .long .LEFDE7-.LASFDE7 +.LASFDE7: + .long .LASFDE7-.Lframe1 + .long .LFB512 + .long .LFE512-.LFB512 + .byte 0x0 + .align 8 +.LEFDE7: +.LSFDE9: + .long .LEFDE9-.LASFDE9 +.LASFDE9: + .long .LASFDE9-.Lframe1 + .long .LFB513 + .long .LFE513-.LFB513 + .byte 0x0 + .align 8 +.LEFDE9: +.LSFDE11: + .long .LEFDE11-.LASFDE11 +.LASFDE11: + .long .LASFDE11-.Lframe1 + .long .LFB515 + .long .LFE515-.LFB515 + .byte 0x0 + .align 8 +.LEFDE11: +.LSFDE13: + .long .LEFDE13-.LASFDE13 +.LASFDE13: + .long .LASFDE13-.Lframe1 + .long .LFB516 + .long .LFE516-.LFB516 + .byte 0x0 + .byte 0x4 + .long .LCFI0-.LFB516 + .byte 0xe + .byte 0x10 + .byte 0x4 + .long .LCFI1-.LCFI0 + .byte 0xe + .byte 0x18 + .byte 0x11 + .byte 0x3 + .byte 0x3 + .byte 0x11 + .byte 0x6 + .byte 0x2 + .align 8 +.LEFDE13: +.LSFDE15: + .long .LEFDE15-.LASFDE15 +.LASFDE15: + .long .LASFDE15-.Lframe1 + .long .LFB514 + .long .LFE514-.LFB514 + .byte 0x0 + .align 8 +.LEFDE15: + .ident "GCC: (GNU) 4.4.0 20081110 (experimental)" diff --git a/src/core/basic_ops_x86_64_sse2.s 
b/src/core/basic_ops_x86_64_sse2.s new file mode 100644 index 000000000..94fc2de6d --- /dev/null +++ b/src/core/basic_ops_x86_64_sse2.s @@ -0,0 +1,395 @@ + .file "basic_ops_x86.c" + .text + .align 16 +.globl alignedMemCpySSE2 + .type alignedMemCpySSE2, @function +alignedMemCpySSE2: +.LFB509: + movslq %edx,%rdx + shrq $6, %rdx + testl %edx, %edx + jle .L4 + leal -1(%rdx), %r9d + xorl %eax, %eax + mov %r9d, %r8d + leaq 1(%r8), %rcx + movq %rcx, %rdx + salq $6, %rdx + .align 16 +.L3: + movdqa (%rsi,%rax), %xmm0 + movdqa %xmm0, (%rdi,%rax) + movdqa 16(%rsi,%rax), %xmm0 + movdqa %xmm0, 16(%rdi,%rax) + movdqa 32(%rsi,%rax), %xmm0 + movdqa %xmm0, 32(%rdi,%rax) + movdqa 48(%rsi,%rax), %xmm0 + movdqa %xmm0, 48(%rdi,%rax) + addq $64, %rax + cmpq %rdx, %rax + jne .L3 +.L4: + rep + ret +.LFE509: + .size alignedMemCpySSE2, .-alignedMemCpySSE2 + .align 16 +.globl alignedMemClearSSE2 + .type alignedMemClearSSE2, @function +alignedMemClearSSE2: +.LFB510: + movslq %esi,%rax + shrq $6, %rax + testl %eax, %eax + jle .L10 + subl $1, %eax + pxor %xmm0, %xmm0 + salq $6, %rax + leaq 64(%rax,%rdi), %rax + .align 16 +.L9: + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + addq $64, %rdi + cmpq %rax, %rdi + jne .L9 +.L10: + rep + ret +.LFE510: + .size alignedMemClearSSE2, .-alignedMemClearSSE2 + .align 16 +.globl alignedConvertToS16SSE2 + .type alignedConvertToS16SSE2, @function +alignedConvertToS16SSE2: +.LFB511: + pushq %rbp +.LCFI0: + testb %cl, %cl + movl %edx, %eax + mulss .LC0(%rip), %xmm0 + pushq %rbx +.LCFI1: + jne .L13 + testw %dx, %dx + jle .L15 + movl %edx, %ebx + shrw $2, %bx + cmpw $3, %dx + leal 0(,%rbx,4), %r8d + ja .L33 +.L28: + xorl %r8d, %r8d + .align 16 +.L23: + movswq %r8w,%rdx + movl $32767, %ebx + leaq (%rdi,%rdx,8), %rcx + leaq (%rsi,%rdx,4), %rdx + movl $-32768, %edi + .align 16 +.L25: + movaps %xmm0, %xmm1 + mulss (%rcx), %xmm1 + cvttss2si %xmm1, %esi + movaps %xmm0, %xmm1 + mulss 4(%rcx), %xmm1 + cmpl $-32768, %esi + 
cmovl %edi, %esi + cmpl $32767, %esi + cmovg %ebx, %esi + movw %si, (%rdx) + cvttss2si %xmm1, %esi + cmpl $-32768, %esi + cmovl %edi, %esi + cmpl $32767, %esi + cmovg %ebx, %esi + addl $1, %r8d + addq $8, %rcx + movw %si, 2(%rdx) + addq $4, %rdx + cmpw %r8w, %ax + jg .L25 +.L15: + cwtl + popq %rbx + sall $2, %eax + popq %rbp + ret + .align 16 +.L13: + testw %dx, %dx + jle .L15 + movl %edx, %ebx + shrw $2, %bx + cmpw $3, %dx + leal 0(,%rbx,4), %r8d + ja .L34 +.L27: + xorl %r8d, %r8d + .align 16 +.L18: + movswq %r8w,%rdx + leaq (%rdi,%rdx,8), %rcx + leaq (%rsi,%rdx,4), %rdx + movl $-32768, %edi + movl $32767, %esi + .align 16 +.L20: + movaps %xmm0, %xmm1 + mulss (%rcx), %xmm1 + cvttss2si %xmm1, %ebx + movaps %xmm0, %xmm1 + mulss 4(%rcx), %xmm1 + cmpl $-32768, %ebx + cmovl %edi, %ebx + cmpl $32767, %ebx + cmovg %esi, %ebx + movzbl %bh, %ebp + sall $8, %ebx + orl %ebp, %ebx + movw %bx, (%rdx) + cvttss2si %xmm1, %ebx + cmpl $-32768, %ebx + cmovl %edi, %ebx + cmpl $32767, %ebx + cmovg %esi, %ebx + addl $1, %r8d + addq $8, %rcx + movzbl %bh, %ebp + sall $8, %ebx + orl %ebp, %ebx + movw %bx, 2(%rdx) + addq $4, %rdx + cmpw %r8w, %ax + jg .L20 + cwtl + popq %rbx + sall $2, %eax + popq %rbp + ret + .align 16 +.L34: + testw %r8w, %r8w + je .L27 + movaps %xmm0, %xmm1 + movq %rdi, %rcx + movdqa .LC1(%rip), %xmm2 + movq %rsi, %r10 + shufps $0, %xmm1, %xmm1 + xorl %r9d, %r9d + movdqa .LC3(%rip), %xmm8 + movaps %xmm1, %xmm9 + movdqa .LC2(%rip), %xmm1 + .align 16 +.L19: + movaps %xmm9, %xmm4 + addl $1, %r9d + movaps %xmm9, %xmm3 + mulps (%rcx), %xmm4 + movdqa %xmm1, %xmm6 + mulps 16(%rcx), %xmm3 + addq $32, %rcx + cvttps2dq %xmm4, %xmm4 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm2, %xmm5 + cvttps2dq %xmm3, %xmm3 + pand %xmm5, %xmm4 + pandn %xmm2, %xmm5 + por %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm5, %xmm6 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm4 + movdqa %xmm3, %xmm5 + por %xmm6, %xmm4 + movdqa %xmm1, %xmm6 + pcmpgtd %xmm2, %xmm5 + pand %xmm5, %xmm3 + pandn 
%xmm2, %xmm5 + movdqa %xmm4, %xmm7 + pslld $8, %xmm4 + pand %xmm8, %xmm7 + por %xmm5, %xmm3 + psrad $8, %xmm7 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm5, %xmm6 + pandn %xmm3, %xmm5 + movdqa %xmm5, %xmm3 + por %xmm6, %xmm3 + movdqa %xmm7, %xmm6 + movdqa %xmm3, %xmm5 + pslld $8, %xmm3 + pand %xmm8, %xmm5 + psrad $8, %xmm5 + punpcklwd %xmm5, %xmm7 + punpckhwd %xmm5, %xmm6 + movdqa %xmm4, %xmm5 + punpcklwd %xmm3, %xmm4 + movdqa %xmm7, %xmm10 + punpckhwd %xmm3, %xmm5 + punpcklwd %xmm6, %xmm7 + punpckhwd %xmm6, %xmm10 + punpcklwd %xmm10, %xmm7 + movdqa %xmm4, %xmm10 + punpcklwd %xmm5, %xmm4 + punpckhwd %xmm5, %xmm10 + punpcklwd %xmm10, %xmm4 + por %xmm7, %xmm4 + movdqa %xmm4, (%r10) + addq $16, %r10 + cmpw %r9w, %bx + ja .L19 + cmpw %dx, %r8w + jne .L18 + jmp .L15 + .align 16 +.L33: + testw %r8w, %r8w + je .L28 + movaps %xmm0, %xmm1 + movq %rdi, %rcx + movdqa .LC1(%rip), %xmm2 + movq %rsi, %r10 + shufps $0, %xmm1, %xmm1 + xorl %r9d, %r9d + movaps %xmm1, %xmm6 + movdqa .LC2(%rip), %xmm1 + .align 16 +.L24: + movaps %xmm6, %xmm4 + addl $1, %r9d + movaps %xmm6, %xmm3 + mulps (%rcx), %xmm4 + movdqa %xmm1, %xmm7 + mulps 16(%rcx), %xmm3 + addq $32, %rcx + cvttps2dq %xmm4, %xmm4 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm2, %xmm5 + cvttps2dq %xmm3, %xmm3 + pand %xmm5, %xmm4 + pandn %xmm2, %xmm5 + por %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm5, %xmm7 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm4 + movdqa %xmm3, %xmm5 + por %xmm7, %xmm4 + movdqa %xmm1, %xmm7 + pcmpgtd %xmm2, %xmm5 + pand %xmm5, %xmm3 + pandn %xmm2, %xmm5 + por %xmm5, %xmm3 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm5, %xmm7 + pandn %xmm3, %xmm5 + movdqa %xmm5, %xmm3 + movdqa %xmm4, %xmm5 + por %xmm7, %xmm3 + punpcklwd %xmm3, %xmm4 + punpckhwd %xmm3, %xmm5 + movdqa %xmm4, %xmm7 + punpcklwd %xmm5, %xmm4 + punpckhwd %xmm5, %xmm7 + punpcklwd %xmm7, %xmm4 + movdqa %xmm4, (%r10) + addq $16, %r10 + cmpw %r9w, %bx + ja .L24 + cmpw %r8w, %dx + jne .L23 + jmp .L15 +.LFE511: + 
.size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2 + .section .rodata + .align 4 +.LC0: + .long 1191181824 + .align 16 +.LC1: + .long -32768 + .long -32768 + .long -32768 + .long -32768 + .align 16 +.LC2: + .long 32767 + .long 32767 + .long 32767 + .long 32767 + .align 16 +.LC3: + .long 65280 + .long 65280 + .long 65280 + .long 65280 + .section .eh_frame,"aw",@progbits +.Lframe1: + .long .LECIE1-.LSCIE1 +.LSCIE1: + .long 0x0 + .byte 0x1 + .string "zR" + .byte 0x1 + .byte 0x78 + .byte 0x10 + .byte 0x1 + .byte 0x3 + .byte 0xc + .byte 0x7 + .byte 0x8 + .byte 0x11 + .byte 0x10 + .byte 0x1 + .align 8 +.LECIE1: +.LSFDE1: + .long .LEFDE1-.LASFDE1 +.LASFDE1: + .long .LASFDE1-.Lframe1 + .long .LFB509 + .long .LFE509-.LFB509 + .byte 0x0 + .align 8 +.LEFDE1: +.LSFDE3: + .long .LEFDE3-.LASFDE3 +.LASFDE3: + .long .LASFDE3-.Lframe1 + .long .LFB510 + .long .LFE510-.LFB510 + .byte 0x0 + .align 8 +.LEFDE3: +.LSFDE5: + .long .LEFDE5-.LASFDE5 +.LASFDE5: + .long .LASFDE5-.Lframe1 + .long .LFB511 + .long .LFE511-.LFB511 + .byte 0x0 + .byte 0x4 + .long .LCFI0-.LFB511 + .byte 0xe + .byte 0x10 + .byte 0x4 + .long .LCFI1-.LCFI0 + .byte 0xe + .byte 0x18 + .byte 0x11 + .byte 0x3 + .byte 0x3 + .byte 0x11 + .byte 0x6 + .byte 0x2 + .align 8 +.LEFDE5: + .ident "GCC: (GNU) 4.4.0 20081110 (experimental)" diff --git a/src/core/basic_ops_x86_mmx.s b/src/core/basic_ops_x86_mmx.s new file mode 100644 index 000000000..a1bdc3240 --- /dev/null +++ b/src/core/basic_ops_x86_mmx.s @@ -0,0 +1,107 @@ + .file "basic_ops_x86.c" + .text + .p2align 4,,15 +.globl alignedMemCpyMMX + .type alignedMemCpyMMX, @function +alignedMemCpyMMX: + pushl %ebx + subl $112, %esp + movl 128(%esp), %ebx + movl 124(%esp), %eax + shrl $6, %ebx +#APP +# 42 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1 + fsave 4(%esp); fwait + +# 0 "" 2 +# 44 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1 + 1: prefetchnta (%eax) + prefetchnta 64(%eax) + prefetchnta 128(%eax) + prefetchnta 192(%eax) + 
prefetchnta 256(%eax) + +# 0 "" 2 +#NO_APP + testl %ebx, %ebx + je .L2 + movl 120(%esp), %ecx + xorl %edx, %edx + .p2align 4,,7 + .p2align 3 +.L3: +#APP +# 53 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1 + 1: prefetchnta 320(%eax) +2: movq (%eax), %mm0 + movq 8(%eax), %mm1 + movq 16(%eax), %mm2 + movq 24(%eax), %mm3 + movq %mm0, (%ecx) + movq %mm1, 8(%ecx) + movq %mm2, 16(%ecx) + movq %mm3, 24(%ecx) + movq 32(%eax), %mm0 + movq 40(%eax), %mm1 + movq 48(%eax), %mm2 + movq 56(%eax), %mm3 + movq %mm0, 32(%ecx) + movq %mm1, 40(%ecx) + movq %mm2, 48(%ecx) + movq %mm3, 56(%ecx) + +# 0 "" 2 +#NO_APP + addl $1, %edx + addl $64, %eax + addl $64, %ecx + cmpl %edx, %ebx + jne .L3 +.L2: +#APP +# 75 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1 + fsave 4(%esp); fwait + +# 0 "" 2 +#NO_APP + addl $112, %esp + popl %ebx + ret + .size alignedMemCpyMMX, .-alignedMemCpyMMX + .p2align 4,,15 +.globl alignedMemClearMMX + .type alignedMemClearMMX, @function +alignedMemClearMMX: + movl 8(%esp), %ecx + shrl $6, %ecx + testl %ecx, %ecx + je .L8 + movl 4(%esp), %edx + xorl %eax, %eax + pxor %mm0, %mm0 + .p2align 4,,7 + .p2align 3 +.L9: +#APP +# 90 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1 + movq %mm0, (%edx) +movq %mm0, 8(%edx) +movq %mm0, 16(%edx) +movq %mm0, 24(%edx) +movq %mm0, 32(%edx) +movq %mm0, 40(%edx) +movq %mm0, 48(%edx) +movq %mm0, 56(%edx) + +# 0 "" 2 +#NO_APP + addl $1, %eax + addl $64, %edx + cmpl %eax, %ecx + jne .L9 +.L8: + emms + ret + .size alignedMemClearMMX, .-alignedMemClearMMX + .ident "GCC: (GNU) 4.4.0 20081110 (experimental)" + .section .note.GNU-stack,"",@progbits diff --git a/src/core/basic_ops_x86_sse.s b/src/core/basic_ops_x86_sse.s new file mode 100644 index 000000000..ab5d006e7 --- /dev/null +++ b/src/core/basic_ops_x86_sse.s @@ -0,0 +1,505 @@ + .file "basic_ops_x86.c" + .text + .p2align 4,,15 +.globl alignedMemCpySSE + .type alignedMemCpySSE, @function +alignedMemCpySSE: + pushl %esi + pushl 
%ebx + movl 20(%esp), %esi + movl 12(%esp), %edx + movl 16(%esp), %ecx + shrl $6, %esi + testl %esi, %esi + je .L4 + xorl %eax, %eax + xorl %ebx, %ebx + .p2align 4,,7 + .p2align 3 +.L3: + movaps (%ecx,%eax), %xmm0 + addl $1, %ebx + movaps %xmm0, (%edx,%eax) + movaps 16(%ecx,%eax), %xmm0 + movaps %xmm0, 16(%edx,%eax) + movaps 32(%ecx,%eax), %xmm0 + movaps %xmm0, 32(%edx,%eax) + movaps 48(%ecx,%eax), %xmm0 + movaps %xmm0, 48(%edx,%eax) + addl $64, %eax + cmpl %ebx, %esi + jne .L3 +.L4: + popl %ebx + popl %esi + ret + .size alignedMemCpySSE, .-alignedMemCpySSE + .p2align 4,,15 +.globl alignedMemClearSSE + .type alignedMemClearSSE, @function +alignedMemClearSSE: + movl 8(%esp), %ecx + shrl $6, %ecx + testl %ecx, %ecx + je .L10 + movl 4(%esp), %eax + xorps %xmm0, %xmm0 + xorl %edx, %edx + .p2align 4,,7 + .p2align 3 +.L9: + addl $1, %edx + movaps %xmm0, (%eax) + movaps %xmm0, 16(%eax) + movaps %xmm0, 32(%eax) + movaps %xmm0, 48(%eax) + addl $64, %eax + cmpl %edx, %ecx + jne .L9 +.L10: + rep + ret + .size alignedMemClearSSE, .-alignedMemClearSSE + .p2align 4,,15 +.globl alignedBufApplyGainSSE + .type alignedBufApplyGainSSE, @function +alignedBufApplyGainSSE: + movl 12(%esp), %ecx + testl %ecx, %ecx + jle .L15 + movss 8(%esp), %xmm0 + subl $1, %ecx + movl 4(%esp), %eax + shrl $3, %ecx + xorl %edx, %edx + addl $1, %ecx + shufps $0, %xmm0, %xmm0 + .p2align 4,,7 + .p2align 3 +.L14: + movaps %xmm0, %xmm3 + addl $1, %edx + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm4 + mulps 16(%eax), %xmm3 + mulps 32(%eax), %xmm2 + mulps 48(%eax), %xmm1 + movaps %xmm3, 16(%eax) + mulps (%eax), %xmm4 + movaps %xmm2, 32(%eax) + movaps %xmm1, 48(%eax) + movaps %xmm4, (%eax) + addl $64, %eax + cmpl %edx, %ecx + ja .L14 +.L15: + rep + ret + .size alignedBufApplyGainSSE, .-alignedBufApplyGainSSE + .p2align 4,,15 +.globl alignedBufMixSSE + .type alignedBufMixSSE, @function +alignedBufMixSSE: + pushl %esi + pushl %ebx + movl 20(%esp), %esi + movl 12(%esp), %edx + movl 16(%esp), %ecx 
+ testl %esi, %esi + jle .L20 + subl $1, %esi + xorl %eax, %eax + shrl $3, %esi + xorl %ebx, %ebx + addl $1, %esi + .p2align 4,,7 + .p2align 3 +.L19: + movaps 16(%edx,%eax), %xmm2 + addl $1, %ebx + movaps 32(%edx,%eax), %xmm1 + movaps 48(%edx,%eax), %xmm0 + movaps (%edx,%eax), %xmm3 + addps 16(%ecx,%eax), %xmm2 + addps 32(%ecx,%eax), %xmm1 + addps 48(%ecx,%eax), %xmm0 + addps (%ecx,%eax), %xmm3 + movaps %xmm2, 16(%edx,%eax) + movaps %xmm3, (%edx,%eax) + movaps %xmm1, 32(%edx,%eax) + movaps %xmm0, 48(%edx,%eax) + addl $64, %eax + cmpl %ebx, %esi + ja .L19 +.L20: + popl %ebx + popl %esi + ret + .size alignedBufMixSSE, .-alignedBufMixSSE + .p2align 4,,15 +.globl alignedBufMixLRCoeffSSE + .type alignedBufMixLRCoeffSSE, @function +alignedBufMixLRCoeffSSE: + pushl %esi + pushl %ebx + movl 28(%esp), %esi + movl 12(%esp), %edx + movl 16(%esp), %ebx + testl %esi, %esi + jle .L25 + movss 24(%esp), %xmm0 + subl $1, %esi + movss 20(%esp), %xmm1 + xorl %eax, %eax + shrl $2, %esi + xorl %ecx, %ecx + addl $1, %esi + unpcklps %xmm0, %xmm1 + movaps %xmm1, %xmm0 + movlhps %xmm1, %xmm0 + .p2align 4,,7 + .p2align 3 +.L24: + movaps %xmm0, %xmm1 + addl $1, %ecx + movaps %xmm0, %xmm2 + mulps 16(%ebx,%eax), %xmm1 + mulps (%ebx,%eax), %xmm2 + addps 16(%edx,%eax), %xmm1 + addps (%edx,%eax), %xmm2 + movaps %xmm1, 16(%edx,%eax) + movaps %xmm2, (%edx,%eax) + addl $32, %eax + cmpl %ecx, %esi + ja .L24 +.L25: + popl %ebx + popl %esi + ret + .size alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE + .p2align 4,,15 +.globl alignedBufWetDryMixSSE + .type alignedBufWetDryMixSSE, @function +alignedBufWetDryMixSSE: + pushl %esi + pushl %ebx + movl 28(%esp), %esi + movl 12(%esp), %edx + movl 16(%esp), %ebx + testl %esi, %esi + jle .L30 + movss 24(%esp), %xmm1 + subl $1, %esi + movss 20(%esp), %xmm0 + xorl %eax, %eax + shrl $2, %esi + xorl %ecx, %ecx + shufps $0, %xmm1, %xmm1 + addl $1, %esi + shufps $0, %xmm0, %xmm0 + .p2align 4,,7 + .p2align 3 +.L29: + movaps %xmm1, %xmm3 + addl $1, %ecx + movaps 
%xmm0, %xmm2 + movaps %xmm1, %xmm4 + mulps 16(%edx,%eax), %xmm3 + mulps 16(%ebx,%eax), %xmm2 + mulps (%edx,%eax), %xmm4 + addps %xmm3, %xmm2 + movaps %xmm0, %xmm3 + mulps (%ebx,%eax), %xmm3 + movaps %xmm2, 16(%edx,%eax) + addps %xmm4, %xmm3 + movaps %xmm3, (%edx,%eax) + addl $32, %eax + cmpl %ecx, %esi + ja .L29 +.L30: + popl %ebx + popl %esi + ret + .size alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE + .p2align 4,,15 +.globl alignedBufWetDryMixSplittedSSE + .type alignedBufWetDryMixSplittedSSE, @function +alignedBufWetDryMixSplittedSSE: + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + subl $140, %esp + movl 180(%esp), %eax + movl 160(%esp), %edx + movl 164(%esp), %esi + movl 168(%esp), %ecx + testl %eax, %eax + movss 172(%esp), %xmm4 + movss 176(%esp), %xmm5 + jle .L39 + movl 180(%esp), %eax + subl $1, %eax + shrl %eax + addl $1, %eax + movl %eax, %ebp + movl %eax, 112(%esp) + shrl $2, %ebp + cmpl $3, 112(%esp) + leal 0(,%ebp,4), %eax + movl %eax, 116(%esp) + jbe .L40 + testl %eax, %eax + jne .L34 +.L40: + xorl %edi, %edi + jmp .L36 + .p2align 4,,7 + .p2align 3 +.L34: + movaps %xmm4, %xmm2 + xorps %xmm6, %xmm6 + shufps $0, %xmm2, %xmm2 + movaps %xmm5, %xmm1 + movl %esi, %ebx + shufps $0, %xmm1, %xmm1 + movaps %xmm2, 32(%esp) + xorl %eax, %eax + xorl %edi, %edi + movss %xmm5, 124(%esp) + movss %xmm4, 120(%esp) + movaps %xmm1, %xmm4 + .p2align 4,,7 + .p2align 3 +.L37: + movaps 16(%edx,%eax,2), %xmm3 + addl $1, %edi + movaps (%edx,%eax,2), %xmm2 + movaps 48(%edx,%eax,2), %xmm0 + movaps %xmm2, %xmm5 + shufps $221, %xmm3, %xmm2 + movaps 32(%edx,%eax,2), %xmm1 + shufps $136, %xmm3, %xmm5 + movaps %xmm2, 96(%esp) + movaps %xmm1, %xmm7 + shufps $221, %xmm0, %xmm1 + shufps $136, %xmm0, %xmm7 + movaps %xmm1, 64(%esp) + movaps %xmm6, %xmm3 + movaps %xmm5, (%esp) + shufps $136, %xmm7, %xmm5 + movlps (%ebx), %xmm3 + movaps %xmm6, %xmm2 + movhps 8(%ebx), %xmm3 + movaps %xmm7, 80(%esp) + movlps 16(%ebx), %xmm2 + movhps 24(%ebx), %xmm2 + movaps 96(%esp), %xmm7 + addl $32, 
%ebx + movaps %xmm3, %xmm0 + shufps $221, %xmm2, %xmm3 + shufps $136, %xmm2, %xmm0 + shufps $136, 64(%esp), %xmm7 + mulps 32(%esp), %xmm0 + movaps %xmm6, %xmm1 + movlps (%ecx,%eax), %xmm1 + movhps 8(%ecx,%eax), %xmm1 + movaps 96(%esp), %xmm2 + mulps %xmm4, %xmm7 + shufps $221, 64(%esp), %xmm2 + mulps %xmm4, %xmm5 + mulps 32(%esp), %xmm3 + movaps %xmm7, 16(%esp) + movaps %xmm1, %xmm7 + addps %xmm0, %xmm5 + movaps %xmm6, %xmm0 + movlps 16(%ecx,%eax), %xmm0 + movhps 24(%ecx,%eax), %xmm0 + shufps $136, %xmm0, %xmm7 + shufps $221, %xmm0, %xmm1 + mulps 32(%esp), %xmm7 + mulps 32(%esp), %xmm1 + mulps %xmm4, %xmm2 + movaps %xmm7, 48(%esp) + movaps 16(%esp), %xmm7 + addps 48(%esp), %xmm7 + addps %xmm1, %xmm2 + movaps %xmm7, 16(%esp) + movaps (%esp), %xmm7 + shufps $221, 80(%esp), %xmm7 + movaps 16(%esp), %xmm1 + mulps %xmm4, %xmm7 + movaps 16(%esp), %xmm0 + unpckhps %xmm2, %xmm1 + unpcklps %xmm2, %xmm0 + movaps %xmm1, %xmm2 + addps %xmm3, %xmm7 + movaps %xmm5, %xmm3 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm5 + movaps %xmm3, %xmm1 + unpckhps %xmm0, %xmm3 + unpcklps %xmm0, %xmm1 + movaps %xmm5, %xmm0 + unpckhps %xmm2, %xmm5 + unpcklps %xmm2, %xmm0 + movaps %xmm1, (%edx,%eax,2) + movaps %xmm3, 16(%edx,%eax,2) + movaps %xmm0, 32(%edx,%eax,2) + movaps %xmm5, 48(%edx,%eax,2) + addl $32, %eax + cmpl %edi, %ebp + ja .L37 + movl 116(%esp), %edi + movl 112(%esp), %eax + movss 120(%esp), %xmm4 + movss 124(%esp), %xmm5 + addl %edi, %edi + cmpl %eax, 116(%esp) + je .L39 +.L36: + leal (%edx,%edi,8), %ebx + xorl %ebp, %ebp + leal 8(%edx,%edi,8), %edx + movl %edi, %eax + .p2align 4,,7 + .p2align 3 +.L38: + movaps %xmm5, %xmm1 + addl $2, %ebp + movaps %xmm4, %xmm0 + mulss (%ebx), %xmm1 + mulss (%esi,%eax,4), %xmm0 + addss %xmm1, %xmm0 + movaps %xmm5, %xmm1 + movss %xmm0, (%ebx) + movaps %xmm4, %xmm0 + mulss 4(%ebx), %xmm1 + mulss (%ecx,%eax,4), %xmm0 + addss %xmm1, %xmm0 + movaps %xmm5, %xmm1 + movss %xmm0, 4(%ebx) + addl $16, %ebx + movaps %xmm4, %xmm0 + mulss (%edx), %xmm1 + mulss 
4(%esi,%eax,4), %xmm0 + addss %xmm1, %xmm0 + movaps %xmm5, %xmm1 + movss %xmm0, (%edx) + movaps %xmm4, %xmm0 + mulss 4(%edx), %xmm1 + mulss 4(%ecx,%eax,4), %xmm0 + leal (%edi,%ebp), %eax + addss %xmm1, %xmm0 + movss %xmm0, 4(%edx) + addl $16, %edx + cmpl %eax, 180(%esp) + jg .L38 +.L39: + addl $140, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + .size alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE + .p2align 4,,15 +.globl unalignedBufMixLRCoeffSSE + .type unalignedBufMixLRCoeffSSE, @function +unalignedBufMixLRCoeffSSE: + pushl %esi + pushl %ebx + movl 28(%esp), %esi + movl 12(%esp), %eax + movl 16(%esp), %edx + movss 20(%esp), %xmm0 + movl %esi, %ecx + shrl $31, %ecx + leal (%esi,%ecx), %ebx + andl $1, %ebx + cmpl %ecx, %ebx + movss 24(%esp), %xmm3 + jne .L52 +.L44: + testl %esi, %esi + jle .L49 + leal -1(%esi), %ebx + shrl %ebx + testb $15, %al + jne .L46 + movaps %xmm0, %xmm1 + xorps %xmm2, %xmm2 + unpcklps %xmm3, %xmm1 + addl $1, %ebx + xorl %ecx, %ecx + movaps %xmm1, %xmm3 + movlhps %xmm1, %xmm3 + .p2align 4,,7 + .p2align 3 +.L47: + movaps %xmm2, %xmm1 + addl $1, %ecx + movlps (%edx), %xmm1 + movhps 8(%edx), %xmm1 + movaps %xmm2, %xmm0 + movlps (%eax), %xmm0 + movhps 8(%eax), %xmm0 + addl $16, %edx + mulps %xmm3, %xmm1 + addps %xmm1, %xmm0 + movaps %xmm0, (%eax) + addl $16, %eax + cmpl %ebx, %ecx + jb .L47 +.L49: + popl %ebx + popl %esi + ret + .p2align 4,,7 + .p2align 3 +.L46: + xorl %ecx, %ecx + .p2align 4,,7 + .p2align 3 +.L48: + movaps %xmm0, %xmm1 + mulss (%edx,%ecx,8), %xmm1 + addss (%eax,%ecx,8), %xmm1 + movss %xmm1, (%eax,%ecx,8) + movaps %xmm3, %xmm1 + mulss 4(%edx,%ecx,8), %xmm1 + addss 4(%eax,%ecx,8), %xmm1 + movss %xmm1, 4(%eax,%ecx,8) + movaps %xmm0, %xmm1 + mulss 8(%edx,%ecx,8), %xmm1 + addss 8(%eax,%ecx,8), %xmm1 + movss %xmm1, 8(%eax,%ecx,8) + movaps %xmm3, %xmm1 + mulss 12(%edx,%ecx,8), %xmm1 + addss 12(%eax,%ecx,8), %xmm1 + movss %xmm1, 12(%eax,%ecx,8) + addl $2, %ecx + cmpl %ecx, %esi + jg .L48 + popl %ebx + 
popl %esi + ret +.L52: + movaps %xmm0, %xmm1 + subl $1, %esi + movss (%eax), %xmm2 + mulss (%edx), %xmm1 + addss %xmm2, %xmm1 + movss 4(%eax), %xmm2 + movss %xmm1, (%eax) + movaps %xmm3, %xmm1 + mulss 4(%edx), %xmm1 + addl $8, %edx + addss %xmm2, %xmm1 + movss %xmm1, 4(%eax) + addl $8, %eax + jmp .L44 + .size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE + .ident "GCC: (GNU) 4.4.0 20081110 (experimental)" + .section .note.GNU-stack,"",@progbits diff --git a/src/core/basic_ops_x86_sse2.s b/src/core/basic_ops_x86_sse2.s new file mode 100644 index 000000000..c575a150f --- /dev/null +++ b/src/core/basic_ops_x86_sse2.s @@ -0,0 +1,349 @@ + .file "basic_ops_x86.c" + .text + .p2align 4,,15 +.globl alignedMemCpySSE2 + .type alignedMemCpySSE2, @function +alignedMemCpySSE2: + pushl %esi + pushl %ebx + movl 20(%esp), %esi + movl 12(%esp), %edx + movl 16(%esp), %ecx + shrl $6, %esi + testl %esi, %esi + je .L4 + xorl %eax, %eax + xorl %ebx, %ebx + .p2align 4,,7 + .p2align 3 +.L3: + addl $1, %ebx + movdqa (%ecx,%eax), %xmm0 + movdqa %xmm0, (%edx,%eax) + movdqa 16(%ecx,%eax), %xmm0 + movdqa %xmm0, 16(%edx,%eax) + movdqa 32(%ecx,%eax), %xmm0 + movdqa %xmm0, 32(%edx,%eax) + movdqa 48(%ecx,%eax), %xmm0 + movdqa %xmm0, 48(%edx,%eax) + addl $64, %eax + cmpl %ebx, %esi + jne .L3 +.L4: + popl %ebx + popl %esi + ret + .size alignedMemCpySSE2, .-alignedMemCpySSE2 + .p2align 4,,15 +.globl alignedMemClearSSE2 + .type alignedMemClearSSE2, @function +alignedMemClearSSE2: + movl 8(%esp), %ecx + shrl $6, %ecx + testl %ecx, %ecx + je .L10 + movl 4(%esp), %eax + xorl %edx, %edx + pxor %xmm0, %xmm0 + .p2align 4,,7 + .p2align 3 +.L9: + addl $1, %edx + movdqa %xmm0, (%eax) + movdqa %xmm0, 16(%eax) + movdqa %xmm0, 32(%eax) + movdqa %xmm0, 48(%eax) + addl $64, %eax + cmpl %edx, %ecx + jne .L9 +.L10: + rep + ret + .size alignedMemClearSSE2, .-alignedMemClearSSE2 + .p2align 4,,15 +.globl alignedConvertToS16SSE2 + .type alignedConvertToS16SSE2, @function +alignedConvertToS16SSE2: + pushl %ebp + 
pushl %edi + pushl %esi + pushl %ebx + subl $8, %esp + movl 36(%esp), %eax + movss .LC0, %xmm4 + cmpb $0, 44(%esp) + movl 28(%esp), %edx + movl 32(%esp), %ebx + movl %eax, %esi + mulss 40(%esp), %xmm4 + jne .L13 + testw %ax, %ax + jle .L15 + movl %eax, %edi + shrw $2, %di + cmpw $3, %ax + movw %ax, 2(%esp) + leal 0(,%edi,4), %ebp + ja .L33 +.L28: + xorl %ebp, %ebp + .p2align 4,,7 + .p2align 3 +.L23: + movswl %bp,%eax + movl $-32768, %edi + leal (%edx,%eax,8), %edx + leal (%ebx,%eax,4), %eax + movl $32767, %ebx + .p2align 4,,7 + .p2align 3 +.L25: + movaps %xmm4, %xmm0 + mulss (%edx), %xmm0 + cvttss2si %xmm0, %ecx + movaps %xmm4, %xmm0 + mulss 4(%edx), %xmm0 + cmpl $-32768, %ecx + cmovl %edi, %ecx + cmpl $32767, %ecx + cmovg %ebx, %ecx + movw %cx, (%eax) + cvttss2si %xmm0, %ecx + cmpl $-32768, %ecx + cmovl %edi, %ecx + cmpl $32767, %ecx + cmovg %ebx, %ecx + addl $1, %ebp + movw %cx, 2(%eax) + addl $8, %edx + addl $4, %eax + cmpw %bp, %si + jg .L25 +.L15: + movswl %si,%esi + addl $8, %esp + leal 0(,%esi,4), %eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + .p2align 4,,7 + .p2align 3 +.L13: + testw %ax, %ax + jle .L15 + movl %eax, %ebp + shrw $2, %bp + cmpw $3, %si + movw %ax, 2(%esp) + leal 0(,%ebp,4), %eax + ja .L34 +.L27: + xorl %eax, %eax + .p2align 4,,7 + .p2align 3 +.L18: + movswl %ax,%edi + leal (%edx,%edi,8), %ecx + leal (%ebx,%edi,4), %edx + movl $-32768, %edi + .p2align 4,,7 + .p2align 3 +.L20: + movaps %xmm4, %xmm0 + movl $32767, %ebp + mulss (%ecx), %xmm0 + cvttss2si %xmm0, %ebx + movaps %xmm4, %xmm0 + mulss 4(%ecx), %xmm0 + cmpl $-32768, %ebx + cmovl %edi, %ebx + cmpl $32767, %ebx + cmovg %ebp, %ebx + movzbl %bh, %ebp + sall $8, %ebx + orl %ebp, %ebx + movl $32767, %ebp + movw %bx, (%edx) + cvttss2si %xmm0, %ebx + cmpl $-32768, %ebx + cmovl %edi, %ebx + cmpl $32767, %ebx + cmovg %ebp, %ebx + addl $1, %eax + movzbl %bh, %ebp + addl $8, %ecx + sall $8, %ebx + orl %ebp, %ebx + movw %bx, 2(%edx) + addl $4, %edx + cmpw %ax, %si + jg .L20 + jmp .L15 + 
.p2align 4,,7 + .p2align 3 +.L34: + testw %ax, %ax + je .L27 + movaps %xmm4, %xmm0 + xorl %ecx, %ecx + movdqa .LC1, %xmm1 + movss %xmm4, 4(%esp) + shufps $0, %xmm0, %xmm0 + xorl %edi, %edi + movaps %xmm0, %xmm7 + movdqa .LC2, %xmm0 + .p2align 4,,7 + .p2align 3 +.L19: + movaps %xmm7, %xmm3 + movdqa %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + movaps %xmm7, %xmm2 + addl $1, %edi + mulps (%edx,%ecx,2), %xmm3 + mulps 16(%edx,%ecx,2), %xmm2 + cvttps2dq %xmm3, %xmm3 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm1, %xmm4 + pand %xmm4, %xmm3 + pandn %xmm1, %xmm4 + por %xmm4, %xmm3 + cvttps2dq %xmm2, %xmm2 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm0, %xmm4 + pand %xmm4, %xmm5 + pandn %xmm3, %xmm4 + movdqa %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + por %xmm5, %xmm3 + pcmpgtd %xmm1, %xmm4 + movdqa .LC3, %xmm5 + pand %xmm4, %xmm2 + pand %xmm3, %xmm5 + pandn %xmm1, %xmm4 + psrad $8, %xmm5 + por %xmm4, %xmm2 + pslld $8, %xmm3 + movdqa %xmm2, %xmm4 + pcmpgtd %xmm0, %xmm4 + pand %xmm4, %xmm6 + pandn %xmm2, %xmm4 + movdqa %xmm4, %xmm2 + por %xmm6, %xmm2 + movdqa .LC3, %xmm6 + pand %xmm2, %xmm6 + pslld $8, %xmm2 + psrad $8, %xmm6 + movdqa %xmm5, %xmm4 + punpcklwd %xmm6, %xmm5 + punpckhwd %xmm6, %xmm4 + movdqa %xmm5, %xmm6 + punpcklwd %xmm4, %xmm5 + punpckhwd %xmm4, %xmm6 + movdqa %xmm3, %xmm4 + punpcklwd %xmm6, %xmm5 + punpckhwd %xmm2, %xmm4 + punpcklwd %xmm2, %xmm3 + movdqa %xmm3, %xmm6 + punpcklwd %xmm4, %xmm3 + punpckhwd %xmm4, %xmm6 + punpcklwd %xmm6, %xmm3 + por %xmm3, %xmm5 + movdqa %xmm5, (%ebx,%ecx) + addl $16, %ecx + cmpw %di, %bp + ja .L19 + cmpw 2(%esp), %ax + movss 4(%esp), %xmm4 + jne .L18 + jmp .L15 + .p2align 4,,7 + .p2align 3 +.L33: + testw %bp, %bp + .p2align 4,,3 + .p2align 3 + je .L28 + movaps %xmm4, %xmm0 + xorl %eax, %eax + movdqa .LC1, %xmm1 + shufps $0, %xmm0, %xmm0 + xorl %ecx, %ecx + movaps %xmm0, %xmm6 + movdqa .LC2, %xmm0 + .p2align 4,,7 + .p2align 3 +.L24: + movaps %xmm6, %xmm3 + addl $1, %ecx + movdqa %xmm0, %xmm7 + movaps %xmm6, %xmm2 + mulps (%edx,%eax,2), %xmm3 + mulps 
16(%edx,%eax,2), %xmm2 + cvttps2dq %xmm3, %xmm3 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm5, %xmm3 + pandn %xmm1, %xmm5 + por %xmm5, %xmm3 + cvttps2dq %xmm2, %xmm2 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm0, %xmm5 + pand %xmm5, %xmm7 + pandn %xmm3, %xmm5 + movdqa %xmm5, %xmm3 + movdqa %xmm2, %xmm5 + por %xmm7, %xmm3 + pcmpgtd %xmm1, %xmm5 + movdqa %xmm0, %xmm7 + pand %xmm5, %xmm2 + pandn %xmm1, %xmm5 + por %xmm5, %xmm2 + movdqa %xmm2, %xmm5 + pcmpgtd %xmm0, %xmm5 + pand %xmm5, %xmm7 + pandn %xmm2, %xmm5 + movdqa %xmm5, %xmm2 + movdqa %xmm3, %xmm5 + por %xmm7, %xmm2 + punpckhwd %xmm2, %xmm5 + punpcklwd %xmm2, %xmm3 + movdqa %xmm3, %xmm7 + punpcklwd %xmm5, %xmm3 + punpckhwd %xmm5, %xmm7 + punpcklwd %xmm7, %xmm3 + movdqa %xmm3, (%ebx,%eax) + addl $16, %eax + cmpw %cx, %di + ja .L24 + cmpw %bp, 2(%esp) + jne .L23 + jmp .L15 + .size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2 + .section .rodata.cst4,"aM",@progbits,4 + .align 4 +.LC0: + .long 1191181824 + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.LC1: + .long -32768 + .long -32768 + .long -32768 + .long -32768 + .align 16 +.LC2: + .long 32767 + .long 32767 + .long 32767 + .long 32767 + .align 16 +.LC3: + .long 65280 + .long 65280 + .long 65280 + .long 65280 + .ident "GCC: (GNU) 4.4.0 20081110 (experimental)" + .section .note.GNU-stack,"",@progbits diff --git a/src/core/fx_mixer.cpp b/src/core/fx_mixer.cpp index edf168bcb..cb69b6ac8 100644 --- a/src/core/fx_mixer.cpp +++ b/src/core/fx_mixer.cpp @@ -28,6 +28,7 @@ #include #include "fx_mixer.h" +#include "basic_ops.h" #include "effect.h" #include "song.h" @@ -38,7 +39,7 @@ fxChannel::fxChannel( model * _parent ) : m_stillRunning( false ), m_peakLeft( 0.0f ), m_peakRight( 0.0f ), - m_buffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ), + m_buffer( alignedAllocFrames( engine::getMixer()->framesPerPeriod() ) ), m_muteModel( false, _parent ), m_volumeModel( 1.0, 0.0, 2.0, 0.01, _parent ), m_name(), @@ -53,7 +54,7 @@ 
fxChannel::fxChannel( model * _parent ) : fxChannel::~fxChannel() { - delete[] m_buffer; + alignedFreeFrames( m_buffer ); } @@ -92,13 +93,7 @@ void fxMixer::mixToChannel( const sampleFrame * _buf, fx_ch_t _ch ) if( m_fxChannels[_ch]->m_muteModel.value() == false ) { m_fxChannels[_ch]->m_lock.lock(); - sampleFrame * buf = m_fxChannels[_ch]->m_buffer; - for( f_cnt_t f = 0; f < engine::getMixer()->framesPerPeriod(); - ++f ) - { - buf[f][0] += _buf[f][0]; - buf[f][1] += _buf[f][1]; - } + alignedBufMix( m_fxChannels[_ch]->m_buffer, _buf, engine::getMixer()->framesPerPeriod() ); m_fxChannels[_ch]->m_used = true; m_fxChannels[_ch]->m_lock.unlock(); } diff --git a/src/core/main.cpp b/src/core/main.cpp index 91566d379..91a90cd4e 100644 --- a/src/core/main.cpp +++ b/src/core/main.cpp @@ -57,6 +57,7 @@ #include "main_window.h" #include "project_renderer.h" #include "song.h" +#include "basic_ops.h" #warning TODO: move somewhere else static inline QString baseName( const QString & _file ) @@ -78,12 +79,29 @@ inline void loadTranslation( const QString & _tname, } +Uint32 convertToS16( const sampleFrameA * RP _ab, + const fpp_t _frames, + const float _master_gain, + intSampleFrameA * RP _output_buffer, + const bool _convert_endian ); int main( int argc, char * * argv ) { // intialize RNG srand( getpid() + time( 0 ) ); + // init CPU specific optimized basic ops + initBasicOps(); + +#if 0 + sampleFrameA * buf = (sampleFrameA *) alignedMalloc( sizeof( sampleFrameA ) * 256 ); + intSampleFrameA * obuf = (intSampleFrameA*)alignedMalloc( sizeof( intSampleFrameA ) * 256 ); + for( int i = 0; i< 1000000; ++i ) + { + convertToS16( buf, 256, 0.7, obuf, false ); + } +return 0; +#endif bool core_only = FALSE; for( int i = 1; i < argc; ++i ) diff --git a/src/core/mixer.cpp b/src/core/mixer.cpp index ffca00679..2fb71ca33 100644 --- a/src/core/mixer.cpp +++ b/src/core/mixer.cpp @@ -41,6 +41,7 @@ #include "sample_play_handle.h" #include "piano_roll.h" #include "micro_timer.h" +#include 
"basic_ops.h" #include "audio_device.h" #include "midi_client.h" @@ -61,40 +62,15 @@ #include "midi_winmm.h" #include "midi_dummy.h" +#ifdef LMMS_HAVE_PTHREAD_H +#include +#endif + static QVector __fx_channel_jobs( NumFxChannels ); -static void aligned_free( void * _buf ) -{ - if( _buf != NULL ) - { - int *ptr2=(int *)_buf - 1; - _buf = (char *)_buf- *ptr2; - free(_buf); - } -} - -static void * aligned_malloc( int _bytes ) -{ - char *ptr,*ptr2,*aligned_ptr; - int align_mask = ALIGN_SIZE- 1; - ptr=(char *)malloc(_bytes +ALIGN_SIZE+ sizeof(int)); - if(ptr==NULL) return(NULL); - - ptr2 = ptr + sizeof(int); - aligned_ptr = ptr2 + (ALIGN_SIZE- ((size_t)ptr2 & align_mask)); - - - ptr2 = aligned_ptr - sizeof(int); - *((int *)ptr2)=(int)(aligned_ptr - ptr); - - return(aligned_ptr); -} - - - class mixerWorkerThread : public QThread { public: @@ -152,9 +128,7 @@ public: mixerWorkerThread( int _worker_num, mixer * _mixer ) : QThread( _mixer ), - m_workingBuf( (sampleFrame *) aligned_malloc( - _mixer->framesPerPeriod() * - sizeof( sampleFrame ) ) ), + m_workingBuf( alignedAllocFrames( _mixer->framesPerPeriod() ) ), m_workerNum( _worker_num ), m_quit( false ), m_mixer( _mixer ), @@ -165,7 +139,7 @@ public: virtual ~mixerWorkerThread() { - aligned_free( m_workingBuf ); + alignedFreeFrames( m_workingBuf ); } virtual void quit( void ) @@ -234,11 +208,11 @@ private: { #if 0 #ifdef LMMS_BUILD_LINUX -#ifdef LMMS_HAVE_SCHED_H +#ifdef LMMS_HAVE_PTHREAD_H cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET( m_workerNum, &mask ); - sched_setaffinity( 0, sizeof( mask ), &mask ); + pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ); #endif #endif #endif @@ -310,7 +284,8 @@ mixer::mixer( void ) : { m_inputBufferFrames[i] = 0; m_inputBufferSize[i] = DEFAULT_BUFFER_SIZE * 100; - m_inputBuffer[i] = new sampleFrame[ DEFAULT_BUFFER_SIZE * 100 ]; + m_inputBuffer[i] = alignedAllocFrames( + DEFAULT_BUFFER_SIZE * 100 ); clearAudioBuffer( m_inputBuffer[i], m_inputBufferSize[i] ); } @@ -351,14 
+326,10 @@ mixer::mixer( void ) : m_fifo = new fifo( 1 ); } - m_workingBuf = (sampleFrame*) aligned_malloc( m_framesPerPeriod * - sizeof( sampleFrame ) ); + m_workingBuf = alignedAllocFrames( m_framesPerPeriod ); for( Uint8 i = 0; i < 3; i++ ) { - m_readBuf = (surroundSampleFrame*) - aligned_malloc( m_framesPerPeriod * - sizeof( surroundSampleFrame ) ); - + m_readBuf = alignedAllocFrames( m_framesPerPeriod ); clearAudioBuffer( m_readBuf, m_framesPerPeriod ); m_bufferPool.push_back( m_readBuf ); } @@ -409,10 +380,10 @@ mixer::~mixer() for( Uint8 i = 0; i < 3; i++ ) { - aligned_free( m_bufferPool[i] ); + alignedFreeFrames( m_bufferPool[i] ); } - aligned_free( m_workingBuf ); + alignedFreeFrames( m_workingBuf ); } @@ -524,9 +495,9 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames ) if( frames + _frames > size ) { size = qMax( size * 2, frames + _frames ); - sampleFrame * ab = new sampleFrame[ size ]; - memcpy( ab, buf, frames * sizeof( sampleFrame ) ); - delete [] buf; + sampleFrame * ab = alignedAllocFrames( size ); + alignedMemCpy( ab, buf, frames * sizeof( sampleFrame ) ); + alignedFreeFrames( buf ); m_inputBufferSize[ m_inputBufferWrite ] = size; m_inputBuffer[ m_inputBufferWrite ] = ab; @@ -534,7 +505,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames ) buf = ab; } - memcpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) ); + alignedMemCpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) ); m_inputBufferFrames[ m_inputBufferWrite ] += _frames; unlockInputFrames(); @@ -543,7 +514,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames ) -const surroundSampleFrame * mixer::renderNextBuffer( void ) +sampleFrameA * mixer::renderNextBuffer( void ) { microTimer timer; static song::playPos last_metro_pos = -1; @@ -709,12 +680,9 @@ void mixer::bufferToPort( const sampleFrame * _buf, const int loop1_frame = qMin( end_frame, m_framesPerPeriod ); _port->lockFirstBuffer(); - sampleFrame * obuf = 
_port->firstBuffer()+start_frame; - for( int frame = 0; frame < loop1_frame-start_frame; ++frame ) - { - obuf[frame][0] += _buf[frame][0] * _vv.vol[0]; - obuf[frame][1] += _buf[frame][1] * _vv.vol[1]; - } + unalignedBufMixLRCoeff( _port->firstBuffer() + start_frame, + _buf, _vv.vol[0], _vv.vol[1], + loop1_frame - start_frame ); _port->unlockFirstBuffer(); _port->lockSecondBuffer(); @@ -723,14 +691,10 @@ void mixer::bufferToPort( const sampleFrame * _buf, const int frames_done = m_framesPerPeriod - start_frame; end_frame -= m_framesPerPeriod; end_frame = qMin( end_frame, m_framesPerPeriod ); - sampleFrame * obuf = _port->secondBuffer(); - for( fpp_t frame = 0; frame < end_frame; ++frame ) - { - obuf[frame][0] += _buf[frames_done + frame][0] * - _vv.vol[0]; - obuf[frame][1] += _buf[frames_done + frame][1] * - _vv.vol[1]; - } + unalignedBufMixLRCoeff( _port->secondBuffer(), + _buf+frames_done, + _vv.vol[0], _vv.vol[1], + end_frame ); // we used both buffers so set flags _port->m_bufferUsage = audioPort::BothBuffers; } @@ -748,7 +712,14 @@ void mixer::bufferToPort( const sampleFrame * _buf, void mixer::clearAudioBuffer( sampleFrame * _ab, const f_cnt_t _frames, const f_cnt_t _offset ) { - memset( _ab+_offset, 0, sizeof( *_ab ) * _frames ); + if( likely( (int)( _ab+_offset ) % 16 == 0 && _frames % 8 == 0 ) ) + { + alignedMemClear( _ab+_offset, sizeof( *_ab ) * _frames ); + } + else + { + memset( _ab+_offset, 0, sizeof( *_ab ) * _frames ); + } } @@ -1166,11 +1137,11 @@ void mixer::fifoWriter::run( void ) { #if 0 #ifdef LMMS_BUILD_LINUX -#ifdef LMMS_HAVE_SCHED_H +#ifdef LMMS_HAVE_PTHREAD_H cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET( 0, &mask ); - sched_setaffinity( 0, sizeof( mask ), &mask ); + pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ); #endif #endif #endif @@ -1178,9 +1149,9 @@ void mixer::fifoWriter::run( void ) const fpp_t frames = m_mixer->framesPerPeriod(); while( m_writing ) { - surroundSampleFrame * buffer = new surroundSampleFrame[frames]; - 
const surroundSampleFrame * b = m_mixer->renderNextBuffer(); - memcpy( buffer, b, frames * sizeof( surroundSampleFrame ) ); + sampleFrameA * buffer = alignedAllocFrames( frames ); + const sampleFrameA * b = m_mixer->renderNextBuffer(); + alignedMemCpy( buffer, b, frames * sizeof( sampleFrameA ) ); m_fifo->write( buffer ); } diff --git a/src/core/project_renderer.cpp b/src/core/project_renderer.cpp index ffc6174e1..bae9cccda 100644 --- a/src/core/project_renderer.cpp +++ b/src/core/project_renderer.cpp @@ -32,11 +32,12 @@ #include "audio_file_wave.h" #include "audio_file_ogg.h" -#ifdef LMMS_HAVE_SCHED_H -#include +#ifdef LMMS_HAVE_PTHREAD_H +#include #endif + fileEncodeDevice __fileEncodeDevices[] = { @@ -148,11 +149,11 @@ void projectRenderer::run( void ) { #if 0 #ifdef LMMS_BUILD_LINUX -#ifdef LMMS_HAVE_SCHED_H +#ifdef LMMS_HAVE_PTHREAD_H cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET( 0, &mask ); - sched_setaffinity( 0, sizeof( mask ), &mask ); + pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ); #endif #endif #endif