experimental support for MMX/SSE/SSE2 instructions

git-svn-id: https://lmms.svn.sf.net/svnroot/lmms/trunk/lmms@1832 0778d3d1-df1d-0410-868b-ea421aaaa00d
This commit is contained in:
Tobias Doerffel
2008-11-10 10:31:11 +00:00
parent 986fce1126
commit 22dc97f13d
34 changed files with 3198 additions and 283 deletions

View File

@@ -70,6 +70,7 @@ ENDIF(LMMS_BUILD_WIN32)
CHECK_INCLUDE_FILES(stdint.h LMMS_HAVE_STDINT_H)
CHECK_INCLUDE_FILES(stdbool.h LMMS_HAVE_STDBOOL_H)
CHECK_INCLUDE_FILES(stdlib.h LMMS_HAVE_STDLIB_H)
CHECK_INCLUDE_FILES(pthread.h LMMS_HAVE_PTHREAD_H)
CHECK_INCLUDE_FILES(semaphore.h LMMS_HAVE_SEMAPHORE_H)
@@ -387,6 +388,43 @@ SET(LMMS_ER_H ${CMAKE_CURRENT_BINARY_DIR}/embedded_resources.h)
ADD_FILE_DEPENDENCIES(${CMAKE_BINARY_DIR}/lmmsconfig.h ${lmms_MOC_out})
ADD_CUSTOM_COMMAND(OUTPUT ${LMMS_ER_H} COMMAND ${BIN2RES} ARGS ${lmms_EMBEDDED_RESOURCES} > ${LMMS_ER_H} DEPENDS ${BIN2RES})
# x86-specific implementations of the basic buffer operations.
#
# src/core/basic_ops_x86.c is turned into one assembler listing per
# instruction set (src/core/basic_ops_<arch>_<set>.s, located in the source
# tree). A regular build only assembles these listings and adds the resulting
# object files to lmms_SOURCES.
SET(BASIC_OPS_X86_C "${CMAKE_SOURCE_DIR}/src/core/basic_ops_x86.c")
IF(LMMS_HOST_X86 OR LMMS_HOST_X86_64)
	# Maintainer targets: "regen-basic-ops" (and the per-instruction-set
	# "regen-basic-ops-<set>" targets it depends on) rewrite the assembler
	# listings in the source tree. They are meant to be used by the
	# maintainer with a specially optimizing GCC.
	ADD_CUSTOM_TARGET(regen-basic-ops)

	# MMX is only generated for 32 bit x86 hosts.
	IF(LMMS_HOST_X86)
		SET(opt_targets mmx sse sse2)
		SET(host_arch x86)
	ELSE(LMMS_HOST_X86)
		SET(opt_targets sse sse2)
		SET(host_arch x86_64)
	ENDIF(LMMS_HOST_X86)

	# Compiler used for regenerating the listings: the one named by the
	# SVN_C_COMPILER environment variable if that file exists at configure
	# time, otherwise the regular C compiler. This does not depend on the
	# instruction set, so it is decided once outside of the loop.
	IF(EXISTS "$ENV{SVN_C_COMPILER}")
		SET(C_COMPILER "$ENV{SVN_C_COMPILER}")
	ELSE(EXISTS "$ENV{SVN_C_COMPILER}")
		SET(C_COMPILER "${CMAKE_C_COMPILER}")
	ENDIF(EXISTS "$ENV{SVN_C_COMPILER}")

	# Start from an empty list so nothing set earlier leaks into lmms_SOURCES.
	SET(opt_target_objects)

	FOREACH(opt_target ${opt_targets})
		STRING(TOUPPER "${opt_target}" OPT_TARGET)
		SET(BASIC_OPS_X86_TARGET_S "${CMAKE_SOURCE_DIR}/src/core/basic_ops_${host_arch}_${opt_target}.s")
		SET(BASIC_OPS_X86_TARGET_O "${CMAKE_BINARY_DIR}/basic_ops_${host_arch}_${opt_target}.o")

		# -mfpmath=sse must not be passed for MMX. The flag is set explicitly
		# in both branches so that the value of a previous iteration (or of
		# an enclosing scope) can never be inherited.
		IF("${OPT_TARGET}" STREQUAL "MMX")
			SET(FPMATH_FLAGS)
		ELSE("${OPT_TARGET}" STREQUAL "MMX")
			SET(FPMATH_FLAGS "-mfpmath=sse")
		ENDIF("${OPT_TARGET}" STREQUAL "MMX")

		# C source -> assembler listing (maintainer only, writes to source tree)
		ADD_CUSTOM_TARGET(regen-basic-ops-${opt_target}
			COMMAND "${C_COMPILER}"
				-O2 -ftree-vectorize -ftree-vectorizer-verbose=2
				-fomit-frame-pointer -c -S
				"-I${CMAKE_SOURCE_DIR}/include" "-I${CMAKE_BINARY_DIR}"
				-g0 -DBUILD_${OPT_TARGET} -m${opt_target} ${FPMATH_FLAGS}
				-o "${BASIC_OPS_X86_TARGET_S}" "${BASIC_OPS_X86_C}"
			DEPENDS "${BASIC_OPS_X86_C}"
			VERBATIM)
		ADD_DEPENDENCIES(regen-basic-ops regen-basic-ops-${opt_target})

		# assembler listing -> object file (part of every build)
		ADD_CUSTOM_COMMAND(
			OUTPUT "${BASIC_OPS_X86_TARGET_O}"
			COMMAND "${CMAKE_C_COMPILER}"
				"${BASIC_OPS_X86_TARGET_S}" -c -o "${BASIC_OPS_X86_TARGET_O}"
			DEPENDS "${BASIC_OPS_X86_TARGET_S}"
			VERBATIM)

		LIST(APPEND opt_target_objects "${BASIC_OPS_X86_TARGET_O}")
	ENDFOREACH(opt_target ${opt_targets})

	SET(lmms_SOURCES ${lmms_SOURCES} ${opt_target_objects})
ENDIF(LMMS_HOST_X86 OR LMMS_HOST_X86_64)
IF(WIN32)
SET(WINRC "${CMAKE_BINARY_DIR}/lmmsrc.obj")

View File

@@ -1,3 +1,40 @@
2008-11-10 Tobias Doerffel <tobydox/at/users/dot/sourceforge/dot/net>
* include/audio_portaudio.h:
* include/lmms_basics.h:
* include/fifo_buffer.h:
* include/mixer.h:
* include/audio_port.h:
* include/audio_dummy.h:
* include/basic_ops.h:
* include/audio_sdl.h:
* include/audio_jack.h:
* include/audio_device.h:
* src/core/audio/audio_device.cpp:
* src/core/audio/audio_alsa.cpp:
* src/core/audio/audio_file_wave.cpp:
* src/core/audio/audio_sdl.cpp:
* src/core/audio/audio_oss.cpp:
* src/core/audio/audio_port.cpp:
* src/core/audio/audio_portaudio.cpp:
* src/core/audio/audio_jack.cpp:
* src/core/audio/audio_pulseaudio.cpp:
* src/core/basic_ops.cpp:
* src/core/basic_ops_x86.c:
* src/core/basic_ops_x86_mmx.s:
* src/core/basic_ops_x86_sse.s:
* src/core/basic_ops_x86_sse2.s:
* src/core/basic_ops_x86_64_sse.s:
* src/core/basic_ops_x86_64_sse2.s:
* src/core/mixer.cpp:
* src/core/main.cpp:
* src/core/project_renderer.cpp:
* src/core/fx_mixer.cpp:
* plugins/ladspa_effect/ladspa_effect.cpp:
* lmmsconfig.h.in:
* CMakeLists.txt:
experimental support for MMX/SSE/SSE2 instructions
2008-11-04 Tobias Doerffel <tobydox/at/users/dot/sourceforge/dot/net>
* plugins/sf2_player/sf2_player.cpp:

View File

@@ -121,31 +121,22 @@ public:
protected:
// subclasses can re-implement this for being used in conjunction with
// processNextBuffer()
virtual void writeBuffer( const surroundSampleFrame * /* _buf*/,
virtual void writeBuffer( const sampleFrameA * /* _buf*/,
const fpp_t /*_frames*/,
const float /*_master_gain*/ )
{
}
// called by according driver for fetching new sound-data
fpp_t getNextBuffer( surroundSampleFrame * _ab );
// convert a given audio-buffer to a buffer in signed 16-bit samples
// returns num of bytes in outbuf
Uint32 convertToS16( const surroundSampleFrame * _ab,
const fpp_t _frames,
const float _master_gain,
int_sample_t * _output_buffer,
const bool _convert_endian = FALSE );
fpp_t getNextBuffer( sampleFrameA * _ab );
// clear given signed-int-16-buffer
void clearS16Buffer( int_sample_t * _outbuf,
const fpp_t _frames );
void clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames );
// resample given buffer from samplerate _src_sr to samplerate _dst_sr
void resample( const surroundSampleFrame * _src,
void resample( const sampleFrameA * _src,
const fpp_t _frames,
surroundSampleFrame * _dst,
sampleFrameA * _dst,
const sample_rate_t _src_sr,
const sample_rate_t _dst_sr );
@@ -161,9 +152,11 @@ protected:
bool hqAudio( void ) const;
protected:
bool m_supportsCapture;
private:
sample_rate_t m_sampleRate;
ch_cnt_t m_channels;
@@ -175,7 +168,7 @@ private:
SRC_DATA m_srcData;
SRC_STATE * m_srcState;
surroundSampleFrame * m_buffer;
sampleFrameA * m_buffer;
} ;

View File

@@ -27,6 +27,7 @@
#define _AUDIO_DUMMY_H
#include "audio_device.h"
#include "basic_ops.h"
#include "micro_timer.h"
@@ -94,16 +95,16 @@ private:
virtual void run( void )
{
microTimer timer;
while( TRUE )
while( true )
{
timer.reset();
const surroundSampleFrame * b =
surroundSampleFrame * b =
getMixer()->nextBuffer();
if( !b )
{
break;
}
delete[] b;
alignedFreeFrames( b );
const Sint32 microseconds = static_cast<Sint32>(
getMixer()->framesPerPeriod() *

View File

@@ -94,7 +94,7 @@ private:
QSemaphore m_stop_semaphore;
QVector<jack_port_t *> m_outputPorts;
surroundSampleFrame * m_outBuf;
sampleFrameA * m_outBuf;
f_cnt_t m_framesDoneInCurBuf;

View File

@@ -40,14 +40,14 @@ public:
audioPort( const QString & _name, bool _has_effect_chain = true );
~audioPort();
inline sampleFrame * firstBuffer( void )
inline sampleFrameA * firstBuffer( void )
{
return( m_firstBuffer );
return m_firstBuffer;
}
inline sampleFrame * secondBuffer( void )
inline sampleFrameA * secondBuffer( void )
{
return( m_secondBuffer );
return m_secondBuffer;
}
inline void lockFirstBuffer( void )
@@ -76,7 +76,7 @@ public:
// indicate whether JACK & Co should provide output-buffer at ext. port
inline bool extOutputEnabled( void ) const
{
return( m_extOutputEnabled );
return m_extOutputEnabled;
}
void setExtOutputEnabled( bool _enabled );
@@ -86,12 +86,12 @@ public:
// (-1 = none 0 = master)
inline fx_ch_t nextFxChannel( void ) const
{
return( m_nextFxChannel );
return m_nextFxChannel;
}
inline effectChain * getEffects( void )
{
return( m_effects );
return m_effects;
}
void setNextFxChannel( const fx_ch_t _chnl )
@@ -102,7 +102,7 @@ public:
const QString & name( void ) const
{
return( m_name );
return m_name;
}
void setName( const QString & _new_name );
@@ -122,8 +122,8 @@ public:
private:
volatile bufferUsages m_bufferUsage;
sampleFrame * m_firstBuffer;
sampleFrame * m_secondBuffer;
sampleFrameA * m_firstBuffer;
sampleFrameA * m_secondBuffer;
QMutex m_firstBufferLock;
QMutex m_secondBufferLock;

View File

@@ -140,7 +140,7 @@ private:
bool m_wasPAInitError;
surroundSampleFrame * m_outBuf;
sampleFrameA * m_outBuf;
int m_outBufPos;
int m_outBufSize;

View File

@@ -76,8 +76,8 @@ private:
SDL_AudioSpec m_audioHandle;
surroundSampleFrame * m_outBuf;
Uint8 * m_convertedBuf;
sampleFrameA * m_outBuf;
intSampleFrameA * m_convertedBuf;
int m_convertedBufPos;
int m_convertedBufSize;

94
include/basic_ops.h Normal file
View File

@@ -0,0 +1,94 @@
/*
* basic_ops.h - basic memory operations
*
* Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
* This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program (see COPYING); if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA.
*
*/
#ifndef _BASIC_OPS_H
#define _BASIC_OPS_H
#include "lmms_basics.h"
#ifdef LMMS_HAVE_STDBOOL_H
#include <stdbool.h>
#endif
// Initialization hook of the basic-ops module.
// NOTE(review): the definition is not part of this view; presumably it
// assigns the function pointers declared at the end of this header - confirm.
void initBasicOps( void );
// Allocates _bytes bytes whose start address is a multiple of ALIGN_SIZE.
// Returns NULL if the underlying malloc() fails. The result must be released
// with alignedFree(), never with free() or delete[].
void * alignedMalloc( int _bytes );
// Releases a buffer obtained from alignedMalloc(); a NULL pointer is ignored.
void alignedFree( void * _buf );
// Convenience wrappers around alignedMalloc()/alignedFree() for buffers
// holding _frames sample frames.
sampleFrameA * alignedAllocFrames( int _frames );
void alignedFreeFrames( sampleFrameA * _buf );
// all aligned* functions assume data to be 16 byte aligned and size to be
// multiples of 64
// copies _size bytes from _src to _dst (buffers must not overlap, see RP)
typedef void (*alignedMemCpyFunc)( void * RP _dst, const void * RP _src,
int _size );
// sets _size bytes at _dst to zero
typedef void (*alignedMemClearFunc)( void * RP _dst, int _size );
// multiplies both channels of every frame in _dst by _gain
typedef void (*alignedBufApplyGainFunc)( sampleFrameA * RP _dst,
float _gain, int _frames );
// adds _src to _dst, frame by frame
typedef void (*alignedBufMixFunc)( sampleFrameA * RP _dst,
const sampleFrameA * RP _src,
int _frames );
// adds _src to _dst, scaling channel 0 by _left and channel 1 by _right
typedef void (*alignedBufMixLRCoeffFunc)( sampleFrameA * RP _dst,
const sampleFrameA * RP _src,
float _left, float _right,
int _frames );
// same operation as alignedBufMixLRCoeffFunc but on plain sampleFrame
// buffers, i.e. without the alignment requirement stated above
typedef void (*unalignedBufMixLRCoeffFunc)( sampleFrame * RP _dst,
const sampleFrame * RP _src,
float _left, float _right,
int _frames );
// _dst = _dst * _dry + _src * _wet for both channels of every frame
typedef void (*alignedBufWetDryMixFunc)( sampleFrameA * RP _dst,
const sampleFrameA * RP _src,
float _wet, float _dry, int _frames );
// like alignedBufWetDryMixFunc, but the wet signal is given as two separate
// per-channel arrays (_left for channel 0, _right for channel 1)
typedef void (*alignedBufWetDryMixSplittedFunc)( sampleFrameA * RP _dst,
const float * RP _left,
const float * RP _right,
float _wet, float _dry, int _frames );
// converts _frames float frames from _src into integer frames in _dst.
// NOTE(review): callers such as audioOSS::run() treat the return value as the
// number of bytes to write - confirm against the implementation.
typedef int (*alignedConvertToS16Func)( const sampleFrameA * RP _src,
intSampleFrameA * RP _dst,
const fpp_t _frames,
const float _master_gain,
const bool _convert_endian );
// Function pointers through which callers invoke the operations above,
// e.g. alignedMemCpy( dst, src, size ).
extern alignedMemCpyFunc alignedMemCpy;
extern alignedMemClearFunc alignedMemClear;
extern alignedBufApplyGainFunc alignedBufApplyGain;
extern alignedBufMixFunc alignedBufMix;
extern alignedBufMixLRCoeffFunc alignedBufMixLRCoeff;
extern unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff;
extern alignedBufWetDryMixFunc alignedBufWetDryMix;
extern alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted;
extern alignedConvertToS16Func alignedConvertToS16;
// X86_OPTIMIZATIONS is defined whenever the host is a 32 bit or 64 bit x86
// machine (LMMS_HOST_X86 or LMMS_HOST_X86_64 respectively).
#if defined( LMMS_HOST_X86 ) || defined( LMMS_HOST_X86_64 )
#define X86_OPTIMIZATIONS
#endif
#endif

View File

@@ -2,6 +2,7 @@
* fifo_buffer.h - FIFO fixed-size buffer
*
* Copyright (c) 2007 Javier Serrano Polo <jasp00/at/users.sourceforge.net>
* Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
* This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
*
@@ -33,50 +34,50 @@ class fifoBuffer
{
public:
fifoBuffer( int _size ) :
m_reader_sem( _size ),
m_writer_sem( _size ),
m_reader_index( 0 ),
m_writer_index( 0 ),
m_readerSem( _size ),
m_writerSem( _size ),
m_readerIndex( 0 ),
m_writerIndex( 0 ),
m_size( _size )
{
m_buffer = new T[_size];
m_reader_sem.acquire( _size );
m_readerSem.acquire( _size );
}
~fifoBuffer()
{
delete[] m_buffer;
m_reader_sem.release( m_size );
m_readerSem.release( m_size );
}
void write( T _element )
{
m_writer_sem.acquire();
m_buffer[m_writer_index++] = _element;
m_writer_index %= m_size;
m_reader_sem.release();
m_writerSem.acquire();
m_buffer[m_writerIndex++] = _element;
m_writerIndex %= m_size;
m_readerSem.release();
}
T read( void )
{
m_reader_sem.acquire();
T element = m_buffer[m_reader_index++];
m_reader_index %= m_size;
m_writer_sem.release();
return( element );
m_readerSem.acquire();
T element = m_buffer[m_readerIndex++];
m_readerIndex %= m_size;
m_writerSem.release();
return element;
}
bool available( void )
{
return( m_reader_sem.available() );
return m_readerSem.available();
}
private:
QSemaphore m_reader_sem;
QSemaphore m_writer_sem;
int m_reader_index;
int m_writer_index;
QSemaphore m_readerSem;
QSemaphore m_writerSem;
int m_readerIndex;
int m_writerIndex;
int m_size;
T * m_buffer;

View File

@@ -1,5 +1,5 @@
/*
* types.h - typedefs for common types that are used in the whole app
* lmms_basics.h - common basics for the whole App
*
* Copyright (c) 2004-2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
@@ -23,10 +23,8 @@
*/
#ifndef _TYPES_H
#define _TYPES_H
#include <limits>
#ifndef _LMMS_BASICS_H
#define _LMMS_BASICS_H
#include "lmmsconfig.h"
@@ -68,6 +66,9 @@ typedef Uint32 jo_id_t; // (unique) ID of a journalling object
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#ifdef __cplusplus
#include <limits>
template<typename T>
struct typeInfo
@@ -115,25 +116,50 @@ inline bool typeInfo<float>::isEqual( float _x, float _y )
return absVal( _x - _y ) < minEps();
}
#endif
const ch_cnt_t DEFAULT_CHANNELS = 2;
const ch_cnt_t SURROUND_CHANNELS =
#define DEFAULT_CHANNELS 2
#define LMMS_DISABLE_SURROUND
#ifndef LMMS_DISABLE_SURROUND
4;
#ifdef LMMS_DISABLE_SURROUND
#define SURROUND_CHANNELS 2
#else
2;
#define SURROUND_CHANNELS 4
#endif
typedef sample_t sampleFrame[DEFAULT_CHANNELS];
typedef sample_t surroundSampleFrame[SURROUND_CHANNELS];
#define ALIGN_SIZE 16
#if __GNUC__
typedef sample_t sampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE)));
typedef int_sample_t intSampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE)));
#define RP __restrict__
#else
#define RP
#endif
#ifdef __cplusplus
const int BYTES_PER_SAMPLE = sizeof( sample_t );
const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t );
const int BYTES_PER_FRAME = sizeof( sampleFrame );
const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame );
const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f;
#else
#define BYTES_PER_SAMPLE sizeof( sample_t )
#define BYTES_PER_INT_SAMPLE sizeof( int_sample_t )
#define BYTES_PER_FRAME sizeof( sampleFrame )
#define BYTES_PER_SURROUND_FRAME sizeof( surroundSampleFrame )
#define OUTPUT_SAMPLE_MULTIPLIER 32767.0f
#endif

View File

@@ -57,13 +57,6 @@ class audioPort;
const fpp_t DEFAULT_BUFFER_SIZE = 256;
const int BYTES_PER_SAMPLE = sizeof( sample_t );
const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t );
const int BYTES_PER_FRAME = sizeof( sampleFrame );
const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame );
const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f;
const float BaseFreq = 440.0f;
const Keys BaseKey = Key_A;
@@ -361,7 +354,7 @@ public:
return m_inputBufferFrames[ m_inputBufferRead ];
}
inline const surroundSampleFrame * nextBuffer( void )
inline surroundSampleFrame * nextBuffer( void )
{
return hasFifoWriter() ? m_fifo->read() : renderNextBuffer();
}
@@ -407,7 +400,7 @@ private:
midiClient * tryMidiClients( void );
const surroundSampleFrame * renderNextBuffer( void );
surroundSampleFrame * renderNextBuffer( void );

View File

@@ -19,6 +19,7 @@
#cmakedefine LMMS_HAVE_VST
#cmakedefine LMMS_HAVE_STDINT_H
#cmakedefine LMMS_HAVE_STDBOOL_H
#cmakedefine LMMS_HAVE_STDLIB_H
#cmakedefine LMMS_HAVE_PTHREAD_H
#cmakedefine LMMS_HAVE_UNISTD_H

View File

@@ -34,6 +34,7 @@
#include "ladspa_subplugin_features.h"
#include "mixer.h"
#include "effect_chain.h"
#include "basic_ops.h"
#include "automation_pattern.h"
@@ -144,7 +145,7 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
if( m_maxSampleRate < engine::getMixer()->processingSampleRate() )
{
o_buf = _buf;
_buf = new sampleFrame[_frames];
_buf = alignedAllocFrames( _frames );
sampleDown( o_buf, _buf, m_maxSampleRate );
frames = _frames * m_maxSampleRate /
engine::getMixer()->processingSampleRate();
@@ -217,8 +218,8 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
// Copy the LADSPA output buffers to the LMMS buffer.
double out_sum = 0.0;
channel = 0;
const float d = getDryLevel();
const float w = getWetLevel();
float * buffers[2];
for( ch_cnt_t proc = 0; proc < getProcessorCount(); ++proc )
{
for( int port = 0; port < m_portCount; ++port )
@@ -231,17 +232,9 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
case CONTROL_RATE_INPUT:
break;
case CHANNEL_OUT:
for( fpp_t frame = 0;
frame < frames; ++frame )
if( channel < DEFAULT_CHANNELS )
{
_buf[frame][channel] =
d *
_buf[frame][channel] +
w *
pp->buffer[frame];
out_sum +=
_buf[frame][channel] *
_buf[frame][channel];
buffers[channel] = pp->buffer;
}
++channel;
break;
@@ -254,10 +247,27 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
}
}
if( channel == 1 )
{
buffers[1] = buffers[0];
}
if( channel >= 1 && channel <= DEFAULT_CHANNELS )
{
alignedBufWetDryMixSplitted( _buf, buffers[0], buffers[1],
getWetLevel(), getDryLevel(), frames );
}
for( int i = 0; i < frames; ++i )
{
out_sum += _buf[i][0]*_buf[i][0];
out_sum += _buf[i][1]*_buf[i][1];
}
if( o_buf != NULL )
{
sampleBack( _buf, o_buf, m_maxSampleRate );
delete[] _buf;
alignedFreeFrames( _buf );
}
checkGate( out_sum / frames );

View File

@@ -39,6 +39,7 @@
#include "lcd_spinbox.h"
#include "gui_templates.h"
#include "templates.h"
#include "basic_ops.h"
@@ -229,13 +230,15 @@ void audioALSA::applyQualitySettings( void )
void audioALSA::run( void )
{
surroundSampleFrame * temp =
new surroundSampleFrame[getMixer()->framesPerPeriod()];
int_sample_t * outbuf =
new int_sample_t[getMixer()->framesPerPeriod() *
channels()];
sampleFrameA * temp = alignedAllocFrames(
getMixer()->framesPerPeriod() );
intSampleFrameA * outbuf = (intSampleFrameA *)
alignedMalloc( sizeof( intSampleFrameA ) * channels() /
DEFAULT_CHANNELS * getMixer()->framesPerPeriod() );
int_sample_t * pcmbuf = new int_sample_t[m_periodSize * channels()];
int outbuf_size = getMixer()->framesPerPeriod() * channels();
int outbuf_pos = 0;
int pcmbuf_size = m_periodSize * channels();
@@ -254,16 +257,15 @@ void audioALSA::run( void )
if( !frames )
{
quit = TRUE;
memset( ptr, 0, len
alignedMemClear( ptr, len
* sizeof( int_sample_t ) );
break;
}
outbuf_size = frames * channels();
convertToS16( temp, frames,
alignedConvertToS16( temp, outbuf, frames,
getMixer()->masterGain(),
outbuf,
m_convertEndian );
m_convertEndian );
}
int min_len = qMin( len, outbuf_size - outbuf_pos );
memcpy( ptr, outbuf + outbuf_pos,
@@ -300,8 +302,8 @@ void audioALSA::run( void )
}
}
delete[] temp;
delete[] outbuf;
alignedFreeFrames( temp );
alignedFree( outbuf );
delete[] pcmbuf;
}

View File

@@ -31,6 +31,7 @@
#include "audio_device.h"
#include "config_mgr.h"
#include "debug.h"
#include "basic_ops.h"
@@ -39,7 +40,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) :
m_sampleRate( _mixer->processingSampleRate() ),
m_channels( _channels ),
m_mixer( _mixer ),
m_buffer( new surroundSampleFrame[getMixer()->framesPerPeriod()] )
m_buffer( alignedAllocFrames( getMixer()->framesPerPeriod() ) )
{
int error;
if( ( m_srcState = src_new(
@@ -56,7 +57,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) :
audioDevice::~audioDevice()
{
src_delete( m_srcState );
delete[] m_buffer;
alignedFreeFrames( m_buffer );
m_devMutex.tryLock();
unlock();
@@ -81,10 +82,10 @@ void audioDevice::processNextBuffer( void )
fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )
fpp_t audioDevice::getNextBuffer( sampleFrameA * _ab )
{
fpp_t frames = getMixer()->framesPerPeriod();
const surroundSampleFrame * b = getMixer()->nextBuffer();
sampleFrameA * b = getMixer()->nextBuffer();
if( !b )
{
return( 0 );
@@ -103,7 +104,7 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )
}
else
{
memcpy( _ab, b, frames * sizeof( surroundSampleFrame ) );
alignedMemCpy( _ab, b, frames * sizeof( surroundSampleFrame ) );
}
// release lock
@@ -111,10 +112,10 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )
if( getMixer()->hasFifoWriter() )
{
delete[] b;
alignedFreeFrames( b );
}
return( frames );
return frames;
}
@@ -171,11 +172,10 @@ void audioDevice::renamePort( audioPort * )
void audioDevice::resample( const surroundSampleFrame * _src,
const fpp_t _frames,
surroundSampleFrame * _dst,
const sample_rate_t _src_sr,
const sample_rate_t _dst_sr )
void audioDevice::resample( const sampleFrame * _src, const fpp_t _frames,
sampleFrame * _dst,
const sample_rate_t _src_sr,
const sample_rate_t _dst_sr )
{
if( m_srcState == NULL )
{
@@ -197,57 +197,11 @@ void audioDevice::resample( const surroundSampleFrame * _src,
Uint32 audioDevice::convertToS16( const surroundSampleFrame * _ab,
const fpp_t _frames,
const float _master_gain,
int_sample_t * _output_buffer,
const bool _convert_endian )
void audioDevice::clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames )
{
if( _convert_endian )
{
Uint16 temp;
for( fpp_t frame = 0; frame < _frames; ++frame )
{
for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl )
{
temp = static_cast<int_sample_t>(
mixer::clip( _ab[frame][chnl] *
_master_gain ) *
OUTPUT_SAMPLE_MULTIPLIER );
( _output_buffer + frame * channels() )[chnl] =
( temp & 0x00ff ) << 8 |
( temp & 0xff00 ) >> 8;
}
}
}
else
{
for( fpp_t frame = 0; frame < _frames; ++frame )
{
for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl )
{
( _output_buffer + frame * channels() )[chnl] =
static_cast<int_sample_t>(
mixer::clip( _ab[frame][chnl] *
_master_gain ) *
OUTPUT_SAMPLE_MULTIPLIER );
}
}
}
return( _frames * channels() * BYTES_PER_INT_SAMPLE );
}
void audioDevice::clearS16Buffer( int_sample_t * _outbuf, const fpp_t _frames )
{
#ifdef LMMS_DEBUG
assert( _outbuf != NULL );
#endif
memset( _outbuf, 0, _frames * channels() * BYTES_PER_INT_SAMPLE );
alignedMemClear( _outbuf, _frames * sizeof( *_outbuf ) );
// memset( _outbuf, 0, _frames * channels() * BYTES_PER_INT_SAMPLE );
}

View File

@@ -29,6 +29,7 @@
#include "audio_file_wave.h"
#include "endian_handling.h"
#include "basic_ops.h"
#include <cstring>
@@ -101,12 +102,14 @@ void audioFileWave::writeBuffer( const surroundSampleFrame * _ab,
}
else
{
int_sample_t * buf = new int_sample_t[_frames * channels()];
convertToS16( _ab, _frames, _master_gain, buf,
intSampleFrameA * buf = (intSampleFrameA *)
alignedMalloc(
sizeof( intSampleFrameA ) * _frames );
alignedConvertToS16( _ab, buf, _frames, _master_gain,
!isLittleEndian() );
sf_writef_short( m_sf, buf, _frames );
delete[] buf;
sf_writef_short( m_sf, (int_sample_t *) buf, _frames );
alignedFree( buf );
}
}

View File

@@ -45,6 +45,7 @@
#include "config_mgr.h"
#include "lcd_spinbox.h"
#include "audio_port.h"
#include "basic_ops.h"
@@ -57,7 +58,7 @@ audioJACK::audioJACK( bool & _success_ful, mixer * _mixer ) :
m_client( NULL ),
m_active( FALSE ),
m_stop_semaphore( 1 ),
m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
m_framesDoneInCurBuf( 0 ),
m_framesToDoInCurBuf( 0 )
{
@@ -159,7 +160,7 @@ audioJACK::~audioJACK()
jack_client_close( m_client );
}
delete[] m_outBuf;
alignedFreeFrames( m_outBuf );
}
@@ -367,14 +368,14 @@ int audioJACK::processCallback( jack_nframes_t _nframes, void * _udata )
_this->m_framesDoneInCurBuf );
if( ts == JackTransportRolling )
{
const float gain = _this->getMixer()->masterGain();
for( Uint8 chnl = 0; chnl < _this->channels(); ++chnl )
{
for( jack_nframes_t frame = 0; frame < todo;
++frame )
{
outbufs[chnl][done+frame] =
_this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] *
_this->getMixer()->masterGain();
_this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] * gain;
}
}
}

View File

@@ -39,6 +39,7 @@
#include "engine.h"
#include "gui_templates.h"
#include "templates.h"
#include "basic_ops.h"
#ifdef LMMS_HAVE_UNISTD_H
#include <unistd.h>
@@ -298,13 +299,13 @@ void audioOSS::applyQualitySettings( void )
void audioOSS::run( void )
{
surroundSampleFrame * temp =
new surroundSampleFrame[getMixer()->framesPerPeriod()];
int_sample_t * outbuf =
new int_sample_t[getMixer()->framesPerPeriod() *
channels()];
sampleFrameA * temp = alignedAllocFrames(
getMixer()->framesPerPeriod() );
intSampleFrameA * outbuf = (intSampleFrameA *)
alignedMalloc( sizeof( intSampleFrameA ) *
getMixer()->framesPerPeriod() );
while( TRUE )
while( 1 )
{
const fpp_t frames = getNextBuffer( temp );
if( !frames )
@@ -312,8 +313,8 @@ void audioOSS::run( void )
break;
}
int bytes = convertToS16( temp, frames,
getMixer()->masterGain(), outbuf,
int bytes = alignedConvertToS16( temp, outbuf, frames,
getMixer()->masterGain(),
m_convertEndian );
if( write( m_audioFD, outbuf, bytes ) != bytes )
{
@@ -321,8 +322,8 @@ void audioOSS::run( void )
}
}
delete[] temp;
delete[] outbuf;
alignedFreeFrames( temp );
alignedFree( outbuf );
}

View File

@@ -26,13 +26,15 @@
#include "audio_device.h"
#include "effect_chain.h"
#include "engine.h"
#include "basic_ops.h"
audioPort::audioPort( const QString & _name, bool _has_effect_chain ) :
m_bufferUsage( NoUsage ),
m_firstBuffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ),
m_secondBuffer( new sampleFrame[
engine::getMixer()->framesPerPeriod()] ),
m_firstBuffer( alignedAllocFrames(
engine::getMixer()->framesPerPeriod() ) ),
m_secondBuffer( alignedAllocFrames(
engine::getMixer()->framesPerPeriod() ) ),
m_extOutputEnabled( false ),
m_nextFxChannel( 0 ),
m_name( "unnamed port" ),
@@ -53,8 +55,8 @@ audioPort::~audioPort()
{
setExtOutputEnabled( false );
engine::getMixer()->removeAudioPort( this );
delete[] m_firstBuffer;
delete[] m_secondBuffer;
alignedFreeFrames( m_firstBuffer );
alignedFreeFrames( m_secondBuffer );
delete m_effects;
}

View File

@@ -55,11 +55,12 @@ void audioPortAudioSetupUtil::updateChannels( void )
audioPortAudio::audioPortAudio( bool & _success_ful, mixer * _mixer ) :
audioDevice( tLimit<ch_cnt_t>(
configManager::inst()->value( "audioportaudio", "channels" ).toInt(),
configManager::inst()->value( "audioportaudio",
"channels" ).toInt(),
DEFAULT_CHANNELS, SURROUND_CHANNELS ),
_mixer ),
m_wasPAInitError( false ),
m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
m_outBufPos( 0 ),
m_stopSemaphore( 1 )
{
@@ -205,7 +206,7 @@ audioPortAudio::~audioPortAudio()
{
Pa_Terminate();
}
delete[] m_outBuf;
alignedFreeFrames( m_outBuf );
}

View File

@@ -40,6 +40,7 @@
#include "lcd_spinbox.h"
#include "gui_templates.h"
#include "templates.h"
#include "basic_ops.h"
static void stream_write_callback(pa_stream *s, size_t length, void *userdata)
@@ -230,8 +231,9 @@ void audioPulseAudio::run( void )
void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
{
const fpp_t fpp = getMixer()->framesPerPeriod();
surroundSampleFrame * temp = new surroundSampleFrame[fpp];
Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() * sizeof(Sint16) );
sampleFrameA * temp = alignedAllocFrames( fpp );
Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() *
sizeof(Sint16) );
size_t fd = 0;
while( fd < length/4 )
@@ -241,9 +243,10 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
{
return;
}
int bytes = convertToS16( temp, frames,
int bytes = alignedConvertToS16( temp,
(intSampleFrameA *) pcmbuf,
frames,
getMixer()->masterGain(),
pcmbuf,
m_convertEndian );
if( bytes > 0 )
{
@@ -254,7 +257,7 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
}
pa_xfree( pcmbuf );
delete[] temp;
alignedFreeFrames( temp );
}

View File

@@ -38,22 +38,22 @@
#include "config_mgr.h"
#include "gui_templates.h"
#include "templates.h"
#include "basic_ops.h"
audioSDL::audioSDL( bool & _success_ful, mixer * _mixer ) :
audioDevice( DEFAULT_CHANNELS, _mixer ),
m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
m_convertedBufPos( 0 ),
m_convertEndian( false ),
m_stopSemaphore( 1 )
{
_success_ful = FALSE;
m_convertedBufSize = getMixer()->framesPerPeriod() * channels()
* sizeof( int_sample_t );
m_convertedBuf = new Uint8[m_convertedBufSize];
m_convertedBufSize = getMixer()->framesPerPeriod() *
sizeof( intSampleFrameA );
m_convertedBuf = (intSampleFrameA *) alignedMalloc( m_convertedBufSize );
if( SDL_Init( SDL_INIT_AUDIO | SDL_INIT_NOPARACHUTE ) < 0 )
@@ -97,8 +97,8 @@ audioSDL::~audioSDL()
SDL_CloseAudio();
SDL_Quit();
delete[] m_convertedBuf;
delete[] m_outBuf;
alignedFree( m_convertedBuf );
alignedFreeFrames( m_outBuf );
}
@@ -190,12 +190,12 @@ void audioSDL::sdlAudioCallback( Uint8 * _buf, int _len )
memset( _buf, 0, _len );
return;
}
m_convertedBufSize = frames * channels()
* sizeof( int_sample_t );
m_convertedBufSize = frames * sizeof( intSampleFrameA );
convertToS16( m_outBuf, frames,
alignedConvertToS16( m_outBuf,
m_convertedBuf,
frames,
getMixer()->masterGain(),
(int_sample_t *)m_convertedBuf,
m_convertEndian );
}
const int min_len = qMin( _len, m_convertedBufSize

455
src/core/basic_ops.cpp Normal file
View File

@@ -0,0 +1,455 @@
/*
* basic_ops.cpp - basic memory operations
*
* Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
* This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program (see COPYING); if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA.
*
*/
#include "basic_ops.h"
#include <cstdlib>
#include <cstdio>
#include <memory.h>
// Allocates _bytes bytes starting at an address that is a multiple of
// ALIGN_SIZE. Returns NULL if malloc() fails.
//
// Layout of the underlying malloc() block:
//   [ padding ][ int: distance back to block start ][ _bytes of user data ]
// The int directly in front of the returned pointer is what alignedFree()
// reads to recover the pointer that has to be passed to free().
void * alignedMalloc( int _bytes )
{
	// payload + worst-case padding + room for the offset header
	char * const raw = (char *) malloc( _bytes + ALIGN_SIZE + sizeof(int) );
	if( raw == NULL )
	{
		return NULL;
	}

	const int mask = ALIGN_SIZE - 1;
	// the user data must start behind at least one int
	char * const behindHeader = raw + sizeof(int);
	// advance to the next aligned address; an address that already is
	// aligned is moved forward by a full ALIGN_SIZE
	char * const aligned = behindHeader +
			( ALIGN_SIZE - ( (size_t) behindHeader & mask ) );

	// remember how far the aligned address is away from the raw block
	int * const header = (int *)( aligned - sizeof(int) );
	*header = (int)( aligned - raw );

	return aligned;
}
// Releases a buffer returned by alignedMalloc(). NULL is ignored.
//
// The int stored directly in front of _buf holds the distance between _buf
// and the start of the underlying malloc() block; that start address is what
// gets passed to free().
void alignedFree( void * _buf )
{
	if( _buf == NULL )
	{
		return;
	}

	const int offset = *( (int *) _buf - 1 );
	void * const raw = (char *) _buf - offset;

	// free() itself is a no-op for NULL, so no further check is required
	free( raw );
}
// Allocates an aligned buffer with room for _n sample frames via
// alignedMalloc(). Returns NULL if the allocation fails. Release the buffer
// with alignedFreeFrames().
sampleFrameA * alignedAllocFrames( int _n )
{
	void * const mem = alignedMalloc( _n * sizeof( sampleFrameA ) );
	return (sampleFrameA *) mem;
}
// Releases a frame buffer obtained from alignedAllocFrames() by forwarding
// it to alignedFree().
// NOTE(review): basic_ops.h declares the parameter as sampleFrameA * while
// this definition uses sampleFrame * - confirm both spellings denote the
// same type for the compilers in use and consider unifying them.
void alignedFreeFrames( sampleFrame * _buf )
{
	alignedFree( static_cast<void *>( _buf ) );
}
// slow fallback
//
// Copies _size bytes from _src to _dst in chunks of 16 ints. Source and
// destination must not overlap (RP). Only whole chunks are copied, so _size
// is expected to be a multiple of sizeof( int ) * 16 as required for all
// aligned* functions.
//
// The previous version used the number of chunks as upper bound for an index
// that advanced by 16 ints per iteration and therefore copied only about
// 1/16 of the requested data. The loop now counts chunks and advances the
// pointers by one chunk per iteration.
void alignedMemCpyNoOpt( void * RP _dst, const void * RP _src, int _size )
{
	const int INTS_PER_CHUNK = 16;
	const int chunks = _size / ( sizeof( int ) * INTS_PER_CHUNK );
	const int * RP src = (const int *) _src;
	int * RP dst = (int *) _dst;
	for( int c = 0; c < chunks; ++c )
	{
		for( int k = 0; k < INTS_PER_CHUNK; ++k )
		{
			dst[k] = src[k];
		}
		src += INTS_PER_CHUNK;
		dst += INTS_PER_CHUNK;
	}
}
// slow fallback
//
// Zeroes _size bytes at _dst in blocks of four ints. Only whole blocks are
// cleared, i.e. a remainder of _size that does not fill a complete block is
// left untouched.
void alignedMemClearNoOpt( void * _dst, int _size )
{
	const int blocks = _size / ( sizeof( int ) * 4 );
	int * const words = (int *) _dst;
	for( int b = 0; b < blocks; ++b )
	{
		int * const quad = words + b * 4;
		quad[0] = 0;
		quad[1] = 0;
		quad[2] = 0;
		quad[3] = 0;
	}
}
// Fallback implementation: multiplies both channels of every frame in _dst
// by _gain.
// Frames are processed in groups of eight; a group is always processed
// completely, so _frames has to be a multiple of 8.
void alignedBufApplyGainNoOpt( sampleFrameA * RP _dst, float _gain,
								int _frames )
{
	for( int group = 0; group < _frames; group += 8 )
	{
		for( int k = 0; k < 8; ++k )
		{
			_dst[group+k][0] *= _gain;
			_dst[group+k][1] *= _gain;
		}
	}
}
// Portable mix routine: adds every sample of _src onto the matching sample
// of _dst. Works on batches of four frames; a batch that starts below
// _frames is always processed completely.
// NOTE(review): callers presumably pass a multiple of four frames - confirm.
void alignedBufMixNoOpt( sampleFrameA * RP _dst, const sampleFrameA * RP _src,
								int _frames )
{
	for( int base = 0; base < _frames; base += 4 )
	{
		for( int k = 0; k < 4; ++k )
		{
			_dst[base+k][0] += _src[base+k][0];
			_dst[base+k][1] += _src[base+k][1];
		}
	}
}
// Portable weighted mix: channel 0 of _src is scaled by _left, channel 1 by
// _right, and the result is added onto _dst. Works on batches of four
// frames; a batch that starts below _frames is always processed completely.
// NOTE(review): callers presumably pass a multiple of four frames - confirm.
void alignedBufMixLRCoeffNoOpt( sampleFrameA * RP _dst,
					const sampleFrameA * RP _src,
					float _left, float _right, int _frames )
{
	for( int base = 0; base < _frames; base += 4 )
	{
		for( int k = 0; k < 4; ++k )
		{
			_dst[base+k][0] += _src[base+k][0]*_left;
			_dst[base+k][1] += _src[base+k][1]*_right;
		}
	}
}
// Portable weighted mix for buffers without alignment guarantees: channel 0
// of _src is scaled by _left, channel 1 by _right, and the result is added
// onto _dst. An odd frame count is handled by mixing the first frame on its
// own; the remaining frames are then mixed in pairs.
void unalignedBufMixLRCoeffNoOpt( sampleFrame * RP _dst,
					const sampleFrame * RP _src,
					const float _left,
					const float _right,
					int _frames )
{
	// index of the first frame handled by the pair loop
	int first = 0;
	if( _frames % 2 )
	{
		_dst[0][0] += _src[0][0] * _left;
		_dst[0][1] += _src[0][1] * _right;
		first = 1;
	}
	for( int f = first; f < _frames; f += 2 )
	{
		_dst[f+0][0] += _src[f+0][0]*_left;
		_dst[f+0][1] += _src[f+0][1]*_right;
		_dst[f+1][0] += _src[f+1][0]*_left;
		_dst[f+1][1] += _src[f+1][1]*_right;
	}
}
// Portable wet/dry blend: every sample of _dst becomes
// dst * _dry + src * _wet, frame by frame, for exactly _frames frames.
void alignedBufWetDryMixNoOpt( sampleFrameA * RP _dst,
					const sampleFrameA * RP _src,
					float _wet, float _dry, int _frames )
{
	for( int f = 0; f < _frames; ++f )
	{
		for( int ch = 0; ch < 2; ++ch )
		{
			_dst[f][ch] = _dst[f][ch]*_dry + _src[f][ch]*_wet;
		}
	}
}
// Portable wet/dry blend with a planar wet signal: _left[i] and _right[i]
// carry the wet samples for channel 0 and channel 1 of frame i. Every
// frame of _dst becomes dst * _dry + wet * _wet.
void alignedBufWetDryMixSplittedNoOpt( sampleFrameA * RP _dst,
						const float * RP _left,
						const float * RP _right,
						float _wet, float _dry, int _frames )
{
	// the loop header is the only place that advances i - an additional
	// increment inside the body would skip every second frame
	for( int i = 0; i < _frames; ++i )
	{
		_dst[i][0] = _dst[i][0]*_dry + _left[i]*_wet;
		_dst[i][1] = _dst[i][1]*_dry + _right[i]*_wet;
	}
}
// Portable float -> signed 16 bit conversion. Each sample of _src is scaled
// by _master_gain * OUTPUT_SAMPLE_MULTIPLIER, truncated to int, clamped to
// [-32768, 32767] and stored in _dst; with _convert_endian set the two
// bytes of the 16 bit value are swapped before storing.
// Returns _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE.
int alignedConvertToS16NoOpt( const sampleFrameA * RP _src,
					intSampleFrameA * RP _dst,
					const fpp_t _frames,
					const float _master_gain,
					const bool _convert_endian )
{
	const float scale = _master_gain * OUTPUT_SAMPLE_MULTIPLIER;
	for( fpp_t frame = 0; frame < _frames; ++frame )
	{
		for( int ch = 0; ch < 2; ++ch )
		{
			int v = _src[frame][ch] * scale;
			// upper bound first, then lower bound
			if( unlikely( v > 32767 ) )
			{
				v = 32767;
			}
			if( unlikely( v < -32768 ) )
			{
				v = -32768;
			}
			if( _convert_endian )
			{
				// exchange low and high byte
				v = ( v & 0x00ff ) << 8 | ( v & 0xff00 ) >> 8;
			}
			_dst[frame][ch] = v;
		}
	}
	return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE;
}
// Dispatch table of the basic operations. Every pointer is initialized with
// its portable *NoOpt implementation; initBasicOps() re-assigns them to the
// MMX/SSE/SSE2 variants when X86_OPTIMIZATIONS is defined and the CPU
// reports the matching feature.
alignedMemCpyFunc alignedMemCpy = alignedMemCpyNoOpt;
alignedMemClearFunc alignedMemClear = alignedMemClearNoOpt;
alignedBufApplyGainFunc alignedBufApplyGain = alignedBufApplyGainNoOpt;
alignedBufMixFunc alignedBufMix = alignedBufMixNoOpt;
alignedBufMixLRCoeffFunc alignedBufMixLRCoeff = alignedBufMixLRCoeffNoOpt;
unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff = unalignedBufMixLRCoeffNoOpt;
alignedBufWetDryMixFunc alignedBufWetDryMix = alignedBufWetDryMixNoOpt;
alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted = alignedBufWetDryMixSplittedNoOpt;
alignedConvertToS16Func alignedConvertToS16 = alignedConvertToS16NoOpt;
#ifdef X86_OPTIMIZATIONS
// Bit flags describing the instruction set extensions found at runtime;
// several flags can be OR-ed together into one int.
enum CPUFeatures
{
	None		= 0,
	MMX		= 1 << 0,
	MMXEXT		= 1 << 1,
	MMX3DNOW	= 1 << 2,
	MMX3DNOWEXT	= 1 << 3,
	SSE		= 1 << 4,
	SSE2		= 1 << 5,
	CMOV		= 1 << 6,
	IWMMXT		= 1 << 7
};
// Prototypes of the optimized routines. They are implemented in C
// (basic_ops_x86.c) and therefore need C linkage.
extern "C"
{
#ifdef LMMS_HOST_X86
// the MMX variants are only built for 32 bit x86 hosts
void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size );
void alignedMemClearMMX( void * RP _dst, int _size );
#endif
void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size );
void alignedMemClearSSE( void * RP _dst, int _size );
void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames );
void alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, int _frames );
void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _left, float _right, int _frames );
void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src, const float _left, const float _right, int _frames );
void alignedBufWetDryMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _wet, float _dry, int _frames );
void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst, const float * RP _left, const float * RP _right, float _wet, float _dry, int _frames );
// the SSE2 variants are built for x86 as well as x86_64 hosts and are
// referenced unconditionally by initBasicOps(), so they must not be
// restricted to LMMS_HOST_X86
void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size );
void alignedMemClearSSE2( void * RP _dst, int _size );
int alignedConvertToS16SSE2( const sampleFrameA * RP _src, intSampleFrameA * RP _dst, const fpp_t _frames, const float _master_gain, const bool _convert_endian );
}
#endif
#ifdef X86_OPTIMIZATIONS
// Runs CPUID for _leaf (sub-leaf 0) and stores the resulting EAX and EDX in
// *_eax and *_edx. Returns false - leaving both outputs untouched - when the
// CPU has no CPUID instruction, which can only happen in 32 bit mode.
static bool basicOpsCpuid( unsigned int _leaf, unsigned int * _eax,
						unsigned int * _edx )
{
	unsigned int a = 0;
	unsigned int c = 0;
	unsigned int d = 0;
#if defined( __x86_64__ )
	// 64 bit mode: CPUID always exists and EBX may be named as an
	// ordinary output operand; 32 bit push/pop do not exist here
	unsigned int b = 0;
	asm volatile( "cpuid\n"
			: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
			: "0" (_leaf), "2" (0) );
	(void) b;
#else
	// 32 bit mode: CPUID is available if bit 21 (ID) of EFLAGS can be
	// toggled; the original EFLAGS are restored afterwards
	unsigned int flags_new = 0;
	unsigned int flags_old = 0;
	asm volatile( "pushfl\n"
			"pushfl\n"
			"popl %0\n"
			"movl %0, %1\n"
			"xorl $0x00200000, %0\n"
			"pushl %0\n"
			"popfl\n"
			"pushfl\n"
			"popl %0\n"
			"popfl\n"
			: "=&r" (flags_new), "=&r" (flags_old)
			:
			: "cc" );
	if( ( ( flags_new ^ flags_old ) & 0x00200000 ) == 0 )
	{
		return false;
	}
	// EBX is saved by hand because it may be reserved (PIC register)
	asm volatile( "pushl %%ebx\n"
			"cpuid\n"
			"popl %%ebx\n"
			: "=a" (a), "=c" (c), "=d" (d)
			: "0" (_leaf), "1" (0) );
#endif
	(void) c;
	*_eax = a;
	*_edx = d;
	return true;
}
#endif

// Selects the fastest available implementation for every basic operation.
// Without X86_OPTIMIZATIONS this is a no-op and the *NoOpt defaults stay in
// place. Otherwise the CPU features are queried once (guarded by a static
// flag) and the dispatch pointers are redirected: MMX first (32 bit hosts
// only), then SSE, then SSE2, so the most capable set wins.
void initBasicOps( void )
{
#ifdef X86_OPTIMIZATIONS
	static bool extensions_checked = false;
	if( extensions_checked == false )
	{
		int features = 0;
		// EDX of CPUID leaf 1 (standard feature bits)
		unsigned int result = 0;
		// EDX of CPUID leaf 0x80000001 (extended feature bits); stays
		// 0 when the CPU does not implement that leaf
		unsigned int extended_result = 0;
		unsigned int eax = 0;
		unsigned int edx = 0;
		if( basicOpsCpuid( 0x00000001u, &eax, &edx ) )
		{
			result = edx;
			// leaf 0x80000000 reports the highest extended leaf
			// in EAX
			if( basicOpsCpuid( 0x80000000u, &eax, &edx ) &&
							eax > 0x80000000u )
			{
				if( basicOpsCpuid( 0x80000001u, &eax, &edx ) )
				{
					extended_result = edx;
				}
			}
		}
		if( result & (1u << 15) )
			features |= CMOV;
		if( result & (1u << 23) )
			features |= MMX;
		if( extended_result & (1u << 22) )
			features |= MMXEXT;
		if( extended_result & (1u << 31) )
			features |= MMX3DNOW;
		if( extended_result & (1u << 30) )
			features |= MMX3DNOWEXT;
		if( result & (1u << 25) )
			features |= SSE;
		if( result & (1u << 26) )
			features |= SSE2;
#ifdef LMMS_HOST_X86
		if( features & MMX )
		{
			alignedMemCpy = alignedMemCpyMMX;
			alignedMemClear = alignedMemClearMMX;
		}
#endif
		if( features & SSE )
		{
			fprintf( stderr, "Using SSE optimized routines\n" );
			alignedMemCpy = alignedMemCpySSE;
			alignedMemClear = alignedMemClearSSE;
			alignedBufApplyGain = alignedBufApplyGainSSE;
			alignedBufMix = alignedBufMixSSE;
			alignedBufMixLRCoeff = alignedBufMixLRCoeffSSE;
			unalignedBufMixLRCoeff = unalignedBufMixLRCoeffSSE;
			alignedBufWetDryMix = alignedBufWetDryMixSSE;
			alignedBufWetDryMixSplitted =
					alignedBufWetDryMixSplittedSSE;
		}
		if( features & SSE2 )
		{
			fprintf( stderr, "Using SSE2 optimized routines\n" );
			alignedMemCpy = alignedMemCpySSE2;
			alignedMemClear = alignedMemClearSSE2;
			alignedConvertToS16 = alignedConvertToS16SSE2;
		}
		extensions_checked = true;
	}
#endif
}

395
src/core/basic_ops_x86.c Normal file
View File

@@ -0,0 +1,395 @@
/*
* basic_ops_x86.c - x86 specific optimized operations
*
* Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
*
* This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program (see COPYING); if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA.
*
*/
#include "basic_ops.h"
#ifdef X86_OPTIMIZATIONS
#ifdef BUILD_MMX
#include <mmintrin.h>
/*
 * Copies _size / 64 blocks of 64 bytes from _src to _dst through the MMX
 * registers mm0-mm3; bytes beyond the last complete block are not copied.
 * The x87/MMX state is saved with FSAVE before the copy and restored with
 * FRSTOR afterwards, so the caller's FPU state survives the call.
 * NOTE(review): PREFETCHNTA is not part of plain MMX - confirm that it is
 * safe on every CPU this routine gets selected for.
 */
void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size )
{
	const int s = _size / ( sizeof( __m64 ) * 8 );
	int i;
	/* FSAVE/FRSTOR image, 108 bytes in 32 bit protected mode */
	struct { char bytes[108]; } fpu_state;
	const char * RP src = (const char *) _src;
	char * RP dst = (char *) _dst;
	/* FSAVE writes the image, so it is an output operand */
	__asm__ __volatile__ ( " fsave %0; fwait\n" : "=m" (fpu_state) );
	/* warm up the first 320 bytes of the source */
	__asm__ __volatile__ (
		"1: prefetchnta (%0)\n"
		" prefetchnta 64(%0)\n"
		" prefetchnta 128(%0)\n"
		" prefetchnta 192(%0)\n"
		" prefetchnta 256(%0)\n"
		: : "r" (src) );
	for( i = 0; i < s; i++ )
	{
		/* one 64 byte block: prefetch ahead, then two rounds of four
		   8 byte loads followed by four 8 byte stores */
		__asm__ __volatile__ (
			"1: prefetchnta 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			" movq 8(%0), %%mm1\n"
			" movq 16(%0), %%mm2\n"
			" movq 24(%0), %%mm3\n"
			" movq %%mm0, (%1)\n"
			" movq %%mm1, 8(%1)\n"
			" movq %%mm2, 16(%1)\n"
			" movq %%mm3, 24(%1)\n"
			" movq 32(%0), %%mm0\n"
			" movq 40(%0), %%mm1\n"
			" movq 48(%0), %%mm2\n"
			" movq 56(%0), %%mm3\n"
			" movq %%mm0, 32(%1)\n"
			" movq %%mm1, 40(%1)\n"
			" movq %%mm2, 48(%1)\n"
			" movq %%mm3, 56(%1)\n"
			: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}
	/* must be FRSTOR: a second FSAVE would overwrite the saved image and
	   leave the FPU reinitialized instead of restored */
	__asm__ __volatile__ ( " frstor %0\n" : : "m" (fpu_state) );
}
/*
 * Zeroes _size / 64 blocks of 64 bytes at _dst with eight MMX stores per
 * block; bytes beyond the last complete block stay untouched. The MMX
 * state is cleared with _mm_empty() before returning.
 */
void alignedMemClearMMX( void * RP _dst, int _size )
{
	__m64 * out = (__m64 *) _dst;
	int blocks_left = _size / ( sizeof( *out ) * 8 );
	__m64 zero = _mm_setzero_si64();
	while( blocks_left > 0 )
	{
		__asm__ __volatile__ (
			"movq %0, (%1)\n"
			"movq %0, 8(%1)\n"
			"movq %0, 16(%1)\n"
			"movq %0, 24(%1)\n"
			"movq %0, 32(%1)\n"
			"movq %0, 40(%1)\n"
			"movq %0, 48(%1)\n"
			"movq %0, 56(%1)\n"
			: : "y" (zero), "r" (out) : "memory" );
		out += 8;
		--blocks_left;
	}
	_mm_empty();
}
#endif
#ifdef BUILD_SSE
#include <xmmintrin.h>
/*
 * Copies _size / 64 blocks of 64 bytes from _src to _dst as four __m128
 * assignments per block; bytes beyond the last complete block are not
 * copied.
 */
void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size )
{
	__m128 * out = (__m128 *) _dst;
	__m128 * in = (__m128 *) _src;
	int blocks = _size / ( sizeof( *out ) * 4 );
	for( ; blocks > 0; --blocks )
	{
		out[0] = in[0];
		out[1] = in[1];
		out[2] = in[2];
		out[3] = in[3];
		in += 4;
		out += 4;
	}
}
/*
 * Zeroes _size / 64 blocks of 64 bytes at _dst as four __m128 stores per
 * block; bytes beyond the last complete block stay untouched.
 */
void alignedMemClearSSE( void * RP _dst, int _size )
{
	__m128 * out = (__m128 *) _dst;
	const __m128 zero = _mm_setzero_ps();
	int blocks = _size / ( sizeof( *out ) * 4 );
	for( ; blocks > 0; --blocks )
	{
		out[0] = zero;
		out[1] = zero;
		out[2] = zero;
		out[3] = zero;
		out += 4;
	}
}
/*
 * Multiplies both channels of every frame in _dst by _gain, in place.
 * The body is plain C; the build rule for this file passes
 * -ftree-vectorize, and the statements are kept unrolled by hand.
 * Frames are handled in groups of eight: a group that starts below _frames
 * is always processed completely.
 * NOTE(review): callers presumably pass a multiple of eight frames -
 * confirm.
 */
void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames )
{
int i;
/* i is advanced by 8 at the end of the body, not in the loop header */
for( i = 0; i < _frames; )
{
_dst[i+0][0] *= _gain;
_dst[i+0][1] *= _gain;
_dst[i+1][0] *= _gain;
_dst[i+1][1] *= _gain;
_dst[i+2][0] *= _gain;
_dst[i+2][1] *= _gain;
_dst[i+3][0] *= _gain;
_dst[i+3][1] *= _gain;
_dst[i+4][0] *= _gain;
_dst[i+4][1] *= _gain;
_dst[i+5][0] *= _gain;
_dst[i+5][1] *= _gain;
_dst[i+6][0] *= _gain;
_dst[i+6][1] *= _gain;
_dst[i+7][0] *= _gain;
_dst[i+7][1] *= _gain;
i += 8;
}
}
/*
 * Adds every sample of _src onto the matching sample of _dst.
 * One loop pass handles eight frames, written as two groups of four with
 * i advanced by 4 after each group; a pass that starts below _frames is
 * always completed.
 * NOTE(review): callers presumably pass a multiple of eight frames -
 * confirm.
 */
void alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src,
int _frames )
{
int i;
for( i = 0; i < _frames; )
{
/* first group of four frames */
_dst[i+0][0] += _src[i+0][0];
_dst[i+0][1] += _src[i+0][1];
_dst[i+1][0] += _src[i+1][0];
_dst[i+1][1] += _src[i+1][1];
_dst[i+2][0] += _src[i+2][0];
_dst[i+2][1] += _src[i+2][1];
_dst[i+3][0] += _src[i+3][0];
_dst[i+3][1] += _src[i+3][1];
i += 4;
/* second group of four frames */
_dst[i+0][0] += _src[i+0][0];
_dst[i+0][1] += _src[i+0][1];
_dst[i+1][0] += _src[i+1][0];
_dst[i+1][1] += _src[i+1][1];
_dst[i+2][0] += _src[i+2][0];
_dst[i+2][1] += _src[i+2][1];
_dst[i+3][0] += _src[i+3][0];
_dst[i+3][1] += _src[i+3][1];
i += 4;
}
}
/*
 * Weighted mix: channel 0 of _src is scaled by _left, channel 1 by _right,
 * and the result is added onto _dst.
 * Frames are handled in groups of four; a group that starts below _frames
 * is always processed completely.
 * NOTE(review): callers presumably pass a multiple of four frames -
 * confirm.
 */
void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst,
const sampleFrameA * RP _src,
float _left, float _right, int _frames )
{
int i;
/* i is advanced by 4 at the end of the body */
for( i = 0; i < _frames; )
{
_dst[i+0][0] += _src[i+0][0]*_left;
_dst[i+0][1] += _src[i+0][1]*_right;
_dst[i+1][0] += _src[i+1][0]*_left;
_dst[i+1][1] += _src[i+1][1]*_right;
_dst[i+2][0] += _src[i+2][0]*_left;
_dst[i+2][1] += _src[i+2][1]*_right;
_dst[i+3][0] += _src[i+3][0]*_left;
_dst[i+3][1] += _src[i+3][1]*_right;
i += 4;
}
}
/*
 * Weighted mix for plain sampleFrame buffers: channel 0 of _src is scaled
 * by _left, channel 1 by _right, and the result is added onto _dst.
 * An odd frame count is reduced to an even one by mixing the first frame
 * separately; the remaining frames are mixed two per loop pass.
 */
void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src,
const float _left,
const float _right,
int _frames )
{
int i;
if( unlikely( _frames % 2 ) )
{
/* mix the leading frame and move both buffers past it */
_dst[0][0] += _src[0][0] * _left;
_dst[0][1] += _src[0][1] * _right;
++_src;
++_dst;
--_frames;
}
/* i is advanced by 2 at the end of the body */
for( i = 0; i < _frames; )
{
_dst[i+0][0] += _src[i+0][0]*_left;
_dst[i+0][1] += _src[i+0][1]*_right;
_dst[i+1][0] += _src[i+1][0]*_left;
_dst[i+1][1] += _src[i+1][1]*_right;
i += 2;
}
}
/*
 * Wet/dry blend: every sample of _dst becomes dst * _dry + src * _wet.
 * Frames are handled in groups of four; a group that starts below _frames
 * is always processed completely.
 * NOTE(review): callers presumably pass a multiple of four frames -
 * confirm.
 */
void alignedBufWetDryMixSSE( sampleFrameA * RP _dst,
const sampleFrameA * RP _src,
float _wet, float _dry, int _frames )
{
int i;
/* i is advanced by 4 at the end of the body */
for( i = 0; i < _frames; )
{
_dst[i+0][0] = _dst[i+0][0]*_dry + _src[i+0][0]*_wet;
_dst[i+0][1] = _dst[i+0][1]*_dry + _src[i+0][1]*_wet;
_dst[i+1][0] = _dst[i+1][0]*_dry + _src[i+1][0]*_wet;
_dst[i+1][1] = _dst[i+1][1]*_dry + _src[i+1][1]*_wet;
_dst[i+2][0] = _dst[i+2][0]*_dry + _src[i+2][0]*_wet;
_dst[i+2][1] = _dst[i+2][1]*_dry + _src[i+2][1]*_wet;
_dst[i+3][0] = _dst[i+3][0]*_dry + _src[i+3][0]*_wet;
_dst[i+3][1] = _dst[i+3][1]*_dry + _src[i+3][1]*_wet;
i += 4;
}
}
/*
 * Wet/dry blend with a planar wet signal: _left[i] and _right[i] hold the
 * wet samples for channel 0 and channel 1 of frame i. Every frame of _dst
 * becomes dst * _dry + wet * _wet.
 * Frames are handled two per loop pass; a pass that starts below _frames
 * is always completed, so an odd _frames also touches frame _frames.
 * NOTE(review): callers presumably pass an even frame count - confirm.
 */
void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst,
const float * RP _left,
const float * RP _right,
float _wet, float _dry, int _frames )
{
int i;
/* i is advanced by 2 at the end of the body */
for( i = 0; i < _frames; )
{
_dst[i+0][0] = _dst[i+0][0]*_dry + _left[i+0]*_wet;
_dst[i+0][1] = _dst[i+0][1]*_dry + _right[i+0]*_wet;
_dst[i+1][0] = _dst[i+1][0]*_dry + _left[i+1]*_wet;
_dst[i+1][1] = _dst[i+1][1]*_dry + _right[i+1]*_wet;
i += 2;
}
}
#endif
#ifdef BUILD_SSE2
#include <emmintrin.h>
/*
 * Copies _size / 64 blocks of 64 bytes from _src to _dst with four aligned
 * 128 bit load/store pairs per block; bytes beyond the last complete block
 * are not copied.
 */
void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size )
{
	__m128i * out = (__m128i *) _dst;
	__m128i * in = (__m128i *) _src;
	int blocks = _size / ( sizeof( *out ) * 4 );
	for( ; blocks > 0; --blocks )
	{
		_mm_store_si128( out+0, _mm_load_si128( in+0 ) );
		_mm_store_si128( out+1, _mm_load_si128( in+1 ) );
		_mm_store_si128( out+2, _mm_load_si128( in+2 ) );
		_mm_store_si128( out+3, _mm_load_si128( in+3 ) );
		in += 4;
		out += 4;
	}
}
/*
 * Zeroes _size / 64 blocks of 64 bytes at _dst with four aligned 128 bit
 * stores per block; bytes beyond the last complete block stay untouched.
 */
void alignedMemClearSSE2( void * RP _dst, int _size )
{
	__m128i * out = (__m128i *) _dst;
	const __m128i zero = _mm_setzero_si128();
	int blocks = _size / ( sizeof( *out ) * 4 );
	for( ; blocks > 0; --blocks )
	{
		_mm_store_si128( out+0, zero );
		_mm_store_si128( out+1, zero );
		_mm_store_si128( out+2, zero );
		_mm_store_si128( out+3, zero );
		out += 4;
	}
}
/*
 * Converts _frames frames of float samples to signed 16 bit samples.
 * Each sample is scaled by _master_gain * OUTPUT_SAMPLE_MULTIPLIER,
 * truncated to int and clamped to [-32768, 32767]; with _convert_endian
 * set, the two bytes of the 16 bit value are swapped before storing.
 * Returns _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE.
 * The body is plain C; the build rule for this file passes
 * -ftree-vectorize.
 * NOTE(review): the float is converted to int before it is clamped, so a
 * scaled sample outside the int range is not covered by the clamp - verify
 * that inputs are bounded.
 */
int alignedConvertToS16SSE2( const sampleFrameA * RP _src,
intSampleFrameA * RP _dst,
const fpp_t _frames,
const float _master_gain,
const bool _convert_endian )
{
int t1;
int t2;
fpp_t frame;
/* combined scale factor applied to every sample */
const float f = _master_gain * OUTPUT_SAMPLE_MULTIPLIER;
if( _convert_endian )
{
for( frame = 0; frame < _frames; ++frame )
{
t1 = _src[frame][0] * f;
t1 = unlikely( t1 > 32767 ) ? 32767 : t1;
t1 = unlikely( t1 < -32768 ) ? -32768 : t1;
/* exchange low and high byte of the 16 bit value */
_dst[frame][0] = ( t1 & 0x00ff) << 8 |
( t1 & 0xff00 ) >> 8;
t2 = _src[frame][1] * f;
t2 = unlikely( t2 > 32767 ) ? 32767 : t2;
t2 = unlikely( t2 < -32768 ) ? -32768 : t2;
_dst[frame][1] = ( t2 & 0x00ff) << 8 |
( t2 & 0xff00 ) >> 8;
}
}
else
{
for( frame = 0; frame < _frames; ++frame )
{
t1 = _src[frame][0] * f;
t1 = unlikely( t1 > 32767 ) ? 32767 : t1;
t1 = unlikely( t1 < -32768 ) ? -32768 : t1;
_dst[frame][0] = t1;
t2 = _src[frame][1] * f;
t2 = unlikely( t2 > 32767 ) ? 32767 : t2;
t2 = unlikely( t2 < -32768 ) ? -32768 : t2;
_dst[frame][1] = t2;
}
}
return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE;
}
#endif
#endif

View File

@@ -0,0 +1,563 @@
.file "basic_ops_x86.c"
.text
# alignedMemCpySSE (x86_64, compiler output)
# Uses %rdi as destination base, %rsi as source base and %edx as byte count.
# The count is divided by 64 (shrq $6); each loop pass copies one 64 byte
# block with four movaps load/store pairs, %rax being the running byte
# offset and %rdx the end offset. movaps requires 16 byte aligned addresses.
.align 16
.globl alignedMemCpySSE
.type alignedMemCpySSE, @function
alignedMemCpySSE:
.LFB509:
movslq %edx,%rdx
shrq $6, %rdx
testl %edx, %edx
# nothing to do when there is no complete 64 byte block
jle .L4
leal -1(%rdx), %r9d
xorl %eax, %eax
mov %r9d, %r8d
leaq 1(%r8), %rcx
movq %rcx, %rdx
# rdx = number of blocks * 64 = end offset
salq $6, %rdx
.align 16
.L3:
movaps (%rsi,%rax), %xmm0
movaps %xmm0, (%rdi,%rax)
movaps 16(%rsi,%rax), %xmm0
movaps %xmm0, 16(%rdi,%rax)
movaps 32(%rsi,%rax), %xmm0
movaps %xmm0, 32(%rdi,%rax)
movaps 48(%rsi,%rax), %xmm0
movaps %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rdx, %rax
jne .L3
.L4:
rep
ret
.LFE509:
.size alignedMemCpySSE, .-alignedMemCpySSE
# alignedMemClearSSE (x86_64, compiler output)
# Uses %rdi as destination and %esi as byte count. The count is divided by
# 64; each loop pass stores 64 zero bytes with four movaps stores from the
# zeroed %xmm0. %rax holds the end address of the last complete block.
.align 16
.globl alignedMemClearSSE
.type alignedMemClearSSE, @function
alignedMemClearSSE:
.LFB510:
movslq %esi,%rax
shrq $6, %rax
testl %eax, %eax
# nothing to do when there is no complete 64 byte block
jle .L10
subl $1, %eax
xorps %xmm0, %xmm0
salq $6, %rax
# rax = rdi + 64 * number of blocks
leaq 64(%rax,%rdi), %rax
.align 16
.L9:
movaps %xmm0, (%rdi)
movaps %xmm0, 16(%rdi)
movaps %xmm0, 32(%rdi)
movaps %xmm0, 48(%rdi)
addq $64, %rdi
cmpq %rax, %rdi
jne .L9
.L10:
rep
ret
.LFE510:
.size alignedMemClearSSE, .-alignedMemClearSSE
# alignedBufApplyGainSSE (x86_64, compiler output)
# Uses %rdi as buffer, %xmm0 as gain and %esi as frame count.
# The gain is broadcast to all four lanes of %xmm0 (shufps $0); the loop
# runs ((frames - 1) >> 3) + 1 times and multiplies 64 bytes (16 floats) in
# place per pass. Returns immediately for frames <= 0.
.align 16
.globl alignedBufApplyGainSSE
.type alignedBufApplyGainSSE, @function
alignedBufApplyGainSSE:
.LFB511:
testl %esi, %esi
jle .L15
subl $1, %esi
shufps $0, %xmm0, %xmm0
shrl $3, %esi
xorl %eax, %eax
# edx = number of loop passes, eax = pass counter
leal 1(%rsi), %edx
.align 16
.L14:
movaps %xmm0, %xmm3
addl $1, %eax
movaps %xmm0, %xmm2
movaps %xmm0, %xmm1
movaps %xmm0, %xmm4
mulps 16(%rdi), %xmm3
mulps 32(%rdi), %xmm2
mulps 48(%rdi), %xmm1
mulps (%rdi), %xmm4
movaps %xmm3, 16(%rdi)
movaps %xmm2, 32(%rdi)
movaps %xmm1, 48(%rdi)
movaps %xmm4, (%rdi)
addq $64, %rdi
cmpl %eax, %edx
ja .L14
.L15:
rep
ret
.LFE511:
.size alignedBufApplyGainSSE, .-alignedBufApplyGainSSE
# alignedBufMixSSE (x86_64, compiler output)
# Uses %rdi as destination, %rsi as source and %edx as frame count.
# The loop runs ((frames - 1) >> 3) + 1 times; each pass adds 64 bytes
# (16 floats) of the source onto the destination. %rax is the running byte
# offset, %edx the pass counter and %ecx the number of passes.
# Returns immediately for frames <= 0.
.align 16
.globl alignedBufMixSSE
.type alignedBufMixSSE, @function
alignedBufMixSSE:
.LFB512:
testl %edx, %edx
jle .L20
subl $1, %edx
xorl %eax, %eax
shrl $3, %edx
leal 1(%rdx), %ecx
xorl %edx, %edx
.align 16
.L19:
movaps 16(%rdi,%rax), %xmm2
addl $1, %edx
movaps 32(%rdi,%rax), %xmm1
addps 16(%rsi,%rax), %xmm2
movaps 48(%rdi,%rax), %xmm0
addps 32(%rsi,%rax), %xmm1
movaps (%rdi,%rax), %xmm3
addps 48(%rsi,%rax), %xmm0
addps (%rsi,%rax), %xmm3
movaps %xmm2, 16(%rdi,%rax)
movaps %xmm1, 32(%rdi,%rax)
movaps %xmm0, 48(%rdi,%rax)
movaps %xmm3, (%rdi,%rax)
addq $64, %rax
cmpl %edx, %ecx
ja .L19
.L20:
rep
ret
.LFE512:
.size alignedBufMixSSE, .-alignedBufMixSSE
# alignedBufMixLRCoeffSSE (x86_64, compiler output)
# Uses %rdi as destination, %rsi as source, %xmm0/%xmm1 as the two
# coefficients and %edx as frame count.
# unpcklps + movlhps build the vector { xmm0, xmm1, xmm0, xmm1 } in %xmm0.
# The loop runs ((frames - 1) >> 2) + 1 times; each pass computes
# dst += src * coefficients for 32 bytes (8 floats).
# Returns immediately for frames <= 0.
.align 16
.globl alignedBufMixLRCoeffSSE
.type alignedBufMixLRCoeffSSE, @function
alignedBufMixLRCoeffSSE:
.LFB513:
testl %edx, %edx
jle .L25
unpcklps %xmm1, %xmm0
subl $1, %edx
shrl $2, %edx
xorl %eax, %eax
leal 1(%rdx), %ecx
xorl %edx, %edx
movlhps %xmm0, %xmm0
.align 16
.L24:
movaps %xmm0, %xmm1
addl $1, %edx
movaps %xmm0, %xmm2
mulps 16(%rsi,%rax), %xmm1
mulps (%rsi,%rax), %xmm2
addps 16(%rdi,%rax), %xmm1
addps (%rdi,%rax), %xmm2
movaps %xmm1, 16(%rdi,%rax)
movaps %xmm2, (%rdi,%rax)
addq $32, %rax
cmpl %edx, %ecx
ja .L24
.L25:
rep
ret
.LFE513:
.size alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE
# alignedBufWetDryMixSSE (x86_64, compiler output)
# Uses %rdi as destination, %rsi as source, %edx as frame count; %xmm0 is
# the factor applied to the source and %xmm1 the factor applied to the
# destination, both broadcast to all lanes with shufps $0.
# The loop runs ((frames - 1) >> 2) + 1 times; each pass computes
# dst = dst * xmm1 + src * xmm0 for 32 bytes (8 floats).
# Returns immediately for frames <= 0.
.align 16
.globl alignedBufWetDryMixSSE
.type alignedBufWetDryMixSSE, @function
alignedBufWetDryMixSSE:
.LFB515:
testl %edx, %edx
jle .L30
subl $1, %edx
shufps $0, %xmm1, %xmm1
shufps $0, %xmm0, %xmm0
shrl $2, %edx
leal 1(%rdx), %ecx
xorl %eax, %eax
xorl %edx, %edx
.align 16
.L29:
movaps %xmm1, %xmm3
addl $1, %edx
movaps %xmm0, %xmm2
mulps 16(%rdi,%rax), %xmm3
movaps %xmm1, %xmm4
mulps 16(%rsi,%rax), %xmm2
mulps (%rdi,%rax), %xmm4
addps %xmm3, %xmm2
movaps %xmm0, %xmm3
mulps (%rsi,%rax), %xmm3
movaps %xmm2, 16(%rdi,%rax)
addps %xmm4, %xmm3
movaps %xmm3, (%rdi,%rax)
addq $32, %rax
cmpl %edx, %ecx
ja .L29
.L30:
rep
ret
.LFE515:
.size alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE
# alignedBufWetDryMixSplittedSSE (x86_64, compiler output)
# Uses %rdi as interleaved destination, %rsi and %rdx as the two planar
# inputs, %xmm0 as the factor for the planar inputs, %xmm1 as the factor
# for the destination and %ecx as frame count.
# Layout of the code:
#   - %ebx = ((frames - 1) >> 1) + 1 two-frame steps, %r11d = %ebx / 4
#     vector passes, %ebp = 4 * %r11d steps covered by the vector loop
#   - .L37: vector loop, 64 bytes of destination and 32 bytes of each
#     planar input per pass
#   - .L38: scalar loop, two frames per pass, starting at frame %r9d and
#     running while %r9d < %ecx
# %rbx and %rbp are saved on entry and restored at .L39.
.align 16
.globl alignedBufWetDryMixSplittedSSE
.type alignedBufWetDryMixSplittedSSE, @function
alignedBufWetDryMixSplittedSSE:
.LFB516:
pushq %rbp
.LCFI0:
testl %ecx, %ecx
pushq %rbx
.LCFI1:
# frames <= 0: only restore the saved registers and return
jle .L39
leal -1(%rcx), %ebx
shrl %ebx
addl $1, %ebx
movl %ebx, %r11d
shrl $2, %r11d
cmpl $3, %ebx
leal 0(,%r11,4), %ebp
# three or fewer two-frame steps: scalar loop only
jbe .L40
testl %ebp, %ebp
jne .L34
.L40:
# scalar loop starts at frame 0
xorl %r9d, %r9d
jmp .L36
.align 16
.L34:
# vector setup: xmm8 = broadcast of xmm1, xmm7 = broadcast of xmm0,
# xmm6 = zero, rax/r9/r8 = running pointers into rdi/rsi/rdx
movaps %xmm1, %xmm2
movq %rdi, %rax
xorps %xmm6, %xmm6
movq %rsi, %r9
shufps $0, %xmm2, %xmm2
movq %rdx, %r8
xorl %r10d, %r10d
movaps %xmm2, %xmm8
movaps %xmm0, %xmm2
shufps $0, %xmm2, %xmm2
movaps %xmm2, %xmm7
.align 16
.L37:
movaps (%rax), %xmm12
addl $1, %r10d
movaps %xmm6, %xmm3
movaps 16(%rax), %xmm5
movaps %xmm12, %xmm14
movlps (%r8), %xmm3
movaps 32(%rax), %xmm9
shufps $136, %xmm5, %xmm14
shufps $221, %xmm5, %xmm12
movhps 8(%r8), %xmm3
movaps 48(%rax), %xmm4
movaps %xmm9, %xmm13
movaps %xmm6, %xmm5
shufps $221, %xmm4, %xmm9
movlps (%r9), %xmm5
shufps $136, %xmm4, %xmm13
movaps %xmm6, %xmm4
movhps 8(%r9), %xmm5
movaps %xmm14, %xmm11
movlps 16(%r9), %xmm4
movaps %xmm12, %xmm15
movaps %xmm5, %xmm2
movhps 24(%r9), %xmm4
shufps $136, %xmm13, %xmm11
movaps %xmm3, %xmm10
addq $32, %r9
shufps $136, %xmm4, %xmm2
mulps %xmm8, %xmm11
mulps %xmm7, %xmm2
shufps $221, %xmm13, %xmm14
shufps $136, %xmm9, %xmm15
shufps $221, %xmm4, %xmm5
addps %xmm2, %xmm11
movaps %xmm6, %xmm2
shufps $221, %xmm9, %xmm12
movlps 16(%r8), %xmm2
mulps %xmm8, %xmm14
movhps 24(%r8), %xmm2
mulps %xmm7, %xmm5
movaps %xmm11, %xmm9
addq $32, %r8
shufps $136, %xmm2, %xmm10
shufps $221, %xmm2, %xmm3
movaps %xmm14, %xmm4
mulps %xmm8, %xmm15
addps %xmm5, %xmm4
mulps %xmm7, %xmm10
movaps %xmm11, %xmm5
mulps %xmm8, %xmm12
mulps %xmm7, %xmm3
addps %xmm15, %xmm10
unpcklps %xmm4, %xmm9
movaps %xmm12, %xmm2
unpckhps %xmm4, %xmm5
addps %xmm3, %xmm2
movaps %xmm10, %xmm4
movaps %xmm10, %xmm3
unpcklps %xmm2, %xmm4
unpckhps %xmm2, %xmm3
movaps %xmm9, %xmm2
unpcklps %xmm4, %xmm2
unpckhps %xmm4, %xmm9
movaps %xmm2, (%rax)
movaps %xmm5, %xmm2
unpckhps %xmm3, %xmm5
unpcklps %xmm3, %xmm2
movaps %xmm9, 16(%rax)
movaps %xmm2, 32(%rax)
movaps %xmm5, 48(%rax)
addq $64, %rax
cmpl %r10d, %r11d
ja .L37
# done if the vector loop covered every two-frame step; otherwise continue
# in the scalar loop at frame 2 * ebp
cmpl %ebx, %ebp
leal (%rbp,%rbp), %r9d
je .L39
.L36:
# scalar setup: r8/rax address frames r9d and r9d + 1 of the destination,
# r11/rsi and r10/rdx the matching samples of the two planar inputs
movslq %r9d,%rax
leaq 1(%rax), %rbx
leaq 0(,%rax,4), %r10
leaq (%rdi,%rax,8), %r8
leaq (%rdi,%rbx,8), %rax
salq $2, %rbx
leaq (%rsi,%r10), %r11
leaq (%rdx,%r10), %r10
addq %rbx, %rsi
addq %rbx, %rdx
.align 16
.L38:
movaps %xmm1, %xmm3
addl $2, %r9d
movaps %xmm0, %xmm2
mulss (%r8), %xmm3
mulss (%r11), %xmm2
addq $8, %r11
addss %xmm3, %xmm2
movaps %xmm1, %xmm3
mulss 4(%r8), %xmm3
movss %xmm2, (%r8)
movaps %xmm0, %xmm2
mulss (%r10), %xmm2
addq $8, %r10
addss %xmm3, %xmm2
movaps %xmm1, %xmm3
movss %xmm2, 4(%r8)
movaps %xmm0, %xmm2
addq $16, %r8
mulss (%rax), %xmm3
mulss (%rsi), %xmm2
addq $8, %rsi
addss %xmm3, %xmm2
movaps %xmm1, %xmm3
mulss 4(%rax), %xmm3
movss %xmm2, (%rax)
movaps %xmm0, %xmm2
mulss (%rdx), %xmm2
addq $8, %rdx
addss %xmm3, %xmm2
movss %xmm2, 4(%rax)
addq $16, %rax
cmpl %r9d, %ecx
jg .L38
.L39:
popq %rbx
popq %rbp
ret
.LFE516:
.size alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE
# unalignedBufMixLRCoeffSSE (x86_64, compiler output)
# Uses %rdi as destination, %rsi as source, %xmm0/%xmm1 as the factors for
# the first/second float of each frame and %edx as frame count.
# Layout of the code:
#   - entry: signed test of frames % 2; an odd count is handled at .L52,
#     which mixes one frame, advances both pointers by 8 bytes, decrements
#     the count and jumps back to .L44
#   - .L47: vector loop (16 bytes per pass), taken when the low four bits
#     of %rdi are zero; the store is an aligned movaps
#   - .L48: scalar loop (16 bytes per pass) for every other destination
.align 16
.globl unalignedBufMixLRCoeffSSE
.type unalignedBufMixLRCoeffSSE, @function
unalignedBufMixLRCoeffSSE:
.LFB514:
movl %edx, %eax
shrl $31, %eax
leal (%rdx,%rax), %ecx
andl $1, %ecx
cmpl %eax, %ecx
jne .L52
.L44:
testl %edx, %edx
jle .L49
subl $1, %edx
shrl %edx
# choose between the vector and the scalar loop by destination alignment
testb $15, %dil
jne .L46
unpcklps %xmm1, %xmm0
addl $1, %edx
xorps %xmm3, %xmm3
xorl %eax, %eax
# xmm0 = { xmm0, xmm1, xmm0, xmm1 }
movlhps %xmm0, %xmm0
.align 16
.L47:
movaps %xmm3, %xmm2
addl $1, %eax
movaps %xmm3, %xmm1
movlps (%rsi), %xmm2
movlps (%rdi), %xmm1
movhps 8(%rsi), %xmm2
addq $16, %rsi
movhps 8(%rdi), %xmm1
mulps %xmm0, %xmm2
addps %xmm2, %xmm1
movaps %xmm1, (%rdi)
addq $16, %rdi
cmpl %edx, %eax
jb .L47
rep
ret
.align 16
.L46:
# rdx = end offset in bytes for the scalar loop
mov %edx, %edx
xorl %eax, %eax
addq $1, %rdx
salq $4, %rdx
.align 16
.L48:
movaps %xmm0, %xmm2
mulss (%rsi,%rax), %xmm2
addss (%rdi,%rax), %xmm2
movss %xmm2, (%rdi,%rax)
movaps %xmm1, %xmm2
mulss 4(%rsi,%rax), %xmm2
addss 4(%rdi,%rax), %xmm2
movss %xmm2, 4(%rdi,%rax)
movaps %xmm0, %xmm2
mulss 8(%rsi,%rax), %xmm2
addss 8(%rdi,%rax), %xmm2
movss %xmm2, 8(%rdi,%rax)
movaps %xmm1, %xmm2
mulss 12(%rsi,%rax), %xmm2
addss 12(%rdi,%rax), %xmm2
movss %xmm2, 12(%rdi,%rax)
addq $16, %rax
cmpq %rdx, %rax
jne .L48
.L49:
rep
ret
.L52:
# odd frame count: mix the first frame on its own
movaps %xmm0, %xmm2
subl $1, %edx
movss (%rdi), %xmm3
mulss (%rsi), %xmm2
addss %xmm3, %xmm2
movss 4(%rdi), %xmm3
movss %xmm2, (%rdi)
movaps %xmm1, %xmm2
mulss 4(%rsi), %xmm2
addq $8, %rsi
addss %xmm3, %xmm2
movss %xmm2, 4(%rdi)
addq $8, %rdi
jmp .L44
.LFE514:
.size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
.section .eh_frame,"aw",@progbits
.Lframe1:
.long .LECIE1-.LSCIE1
.LSCIE1:
.long 0x0
.byte 0x1
.string "zR"
.byte 0x1
.byte 0x78
.byte 0x10
.byte 0x1
.byte 0x3
.byte 0xc
.byte 0x7
.byte 0x8
.byte 0x11
.byte 0x10
.byte 0x1
.align 8
.LECIE1:
.LSFDE1:
.long .LEFDE1-.LASFDE1
.LASFDE1:
.long .LASFDE1-.Lframe1
.long .LFB509
.long .LFE509-.LFB509
.byte 0x0
.align 8
.LEFDE1:
.LSFDE3:
.long .LEFDE3-.LASFDE3
.LASFDE3:
.long .LASFDE3-.Lframe1
.long .LFB510
.long .LFE510-.LFB510
.byte 0x0
.align 8
.LEFDE3:
.LSFDE5:
.long .LEFDE5-.LASFDE5
.LASFDE5:
.long .LASFDE5-.Lframe1
.long .LFB511
.long .LFE511-.LFB511
.byte 0x0
.align 8
.LEFDE5:
.LSFDE7:
.long .LEFDE7-.LASFDE7
.LASFDE7:
.long .LASFDE7-.Lframe1
.long .LFB512
.long .LFE512-.LFB512
.byte 0x0
.align 8
.LEFDE7:
.LSFDE9:
.long .LEFDE9-.LASFDE9
.LASFDE9:
.long .LASFDE9-.Lframe1
.long .LFB513
.long .LFE513-.LFB513
.byte 0x0
.align 8
.LEFDE9:
.LSFDE11:
.long .LEFDE11-.LASFDE11
.LASFDE11:
.long .LASFDE11-.Lframe1
.long .LFB515
.long .LFE515-.LFB515
.byte 0x0
.align 8
.LEFDE11:
.LSFDE13:
.long .LEFDE13-.LASFDE13
.LASFDE13:
.long .LASFDE13-.Lframe1
.long .LFB516
.long .LFE516-.LFB516
.byte 0x0
.byte 0x4
.long .LCFI0-.LFB516
.byte 0xe
.byte 0x10
.byte 0x4
.long .LCFI1-.LCFI0
.byte 0xe
.byte 0x18
.byte 0x11
.byte 0x3
.byte 0x3
.byte 0x11
.byte 0x6
.byte 0x2
.align 8
.LEFDE13:
.LSFDE15:
.long .LEFDE15-.LASFDE15
.LASFDE15:
.long .LASFDE15-.Lframe1
.long .LFB514
.long .LFE514-.LFB514
.byte 0x0
.align 8
.LEFDE15:
.ident "GCC: (GNU) 4.4.0 20081110 (experimental)"
# Empty .note.GNU-stack section: tells the linker that this object does not
# need an executable stack. Without it the whole binary may be linked with
# an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -0,0 +1,395 @@
.file "basic_ops_x86.c"
.text
# alignedMemCpySSE2 (x86_64, compiler output)
# Uses %rdi as destination base, %rsi as source base and %edx as byte count.
# The count is divided by 64 (shrq $6); each loop pass copies one 64 byte
# block with four movdqa load/store pairs, %rax being the running byte
# offset and %rdx the end offset. movdqa requires 16 byte aligned addresses.
.align 16
.globl alignedMemCpySSE2
.type alignedMemCpySSE2, @function
alignedMemCpySSE2:
.LFB509:
movslq %edx,%rdx
shrq $6, %rdx
testl %edx, %edx
# nothing to do when there is no complete 64 byte block
jle .L4
leal -1(%rdx), %r9d
xorl %eax, %eax
mov %r9d, %r8d
leaq 1(%r8), %rcx
movq %rcx, %rdx
# rdx = number of blocks * 64 = end offset
salq $6, %rdx
.align 16
.L3:
movdqa (%rsi,%rax), %xmm0
movdqa %xmm0, (%rdi,%rax)
movdqa 16(%rsi,%rax), %xmm0
movdqa %xmm0, 16(%rdi,%rax)
movdqa 32(%rsi,%rax), %xmm0
movdqa %xmm0, 32(%rdi,%rax)
movdqa 48(%rsi,%rax), %xmm0
movdqa %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rdx, %rax
jne .L3
.L4:
rep
ret
.LFE509:
.size alignedMemCpySSE2, .-alignedMemCpySSE2
# alignedMemClearSSE2 (x86_64, compiler output)
# Uses %rdi as destination and %esi as byte count. The count is divided by
# 64; each loop pass stores 64 zero bytes with four movdqa stores from the
# zeroed %xmm0. %rax holds the end address of the last complete block.
.align 16
.globl alignedMemClearSSE2
.type alignedMemClearSSE2, @function
alignedMemClearSSE2:
.LFB510:
movslq %esi,%rax
shrq $6, %rax
testl %eax, %eax
# nothing to do when there is no complete 64 byte block
jle .L10
subl $1, %eax
pxor %xmm0, %xmm0
salq $6, %rax
# rax = rdi + 64 * number of blocks
leaq 64(%rax,%rdi), %rax
.align 16
.L9:
movdqa %xmm0, (%rdi)
movdqa %xmm0, 16(%rdi)
movdqa %xmm0, 32(%rdi)
movdqa %xmm0, 48(%rdi)
addq $64, %rdi
cmpq %rax, %rdi
jne .L9
.L10:
rep
ret
.LFE510:
.size alignedMemClearSSE2, .-alignedMemClearSSE2
# alignedConvertToS16SSE2 (x86_64, compiler output)
# Uses %rdi as float source, %rsi as 16 bit destination, %dx as (16 bit)
# frame count, %xmm0 as gain and %cl as the byte swap flag.
# The gain is first multiplied by the float constant at .LC0; the result
# is the scale factor for every sample. Samples are converted with
# truncation (cvttss2si / cvttps2dq) and limited to the bounds stored at
# .LC1 (-32768) and .LC2 (32767).
# Layout of the code:
#   %cl == 0 (no byte swap): .L24 vector loop, .L25 scalar loop
#   %cl != 0 (byte swap):    .L19 vector loop, .L20 scalar loop
# The vector loops handle four frames (32 source bytes, 16 destination
# bytes) per pass and are only entered for more than three frames; the
# scalar loops handle the remaining frames one at a time.
# Return value: frame count * 4 (cwtl; sall $2).
.align 16
.globl alignedConvertToS16SSE2
.type alignedConvertToS16SSE2, @function
alignedConvertToS16SSE2:
.LFB511:
pushq %rbp
.LCFI0:
testb %cl, %cl
movl %edx, %eax
mulss .LC0(%rip), %xmm0
pushq %rbx
.LCFI1:
# byte swap requested: continue at .L13
jne .L13
testw %dx, %dx
jle .L15
movl %edx, %ebx
shrw $2, %bx
cmpw $3, %dx
leal 0(,%rbx,4), %r8d
ja .L33
.L28:
xorl %r8d, %r8d
.align 16
.L23:
# scalar path without byte swap, starting at frame r8w
movswq %r8w,%rdx
movl $32767, %ebx
leaq (%rdi,%rdx,8), %rcx
leaq (%rsi,%rdx,4), %rdx
movl $-32768, %edi
.align 16
.L25:
movaps %xmm0, %xmm1
mulss (%rcx), %xmm1
cvttss2si %xmm1, %esi
movaps %xmm0, %xmm1
mulss 4(%rcx), %xmm1
cmpl $-32768, %esi
cmovl %edi, %esi
cmpl $32767, %esi
cmovg %ebx, %esi
movw %si, (%rdx)
cvttss2si %xmm1, %esi
cmpl $-32768, %esi
cmovl %edi, %esi
cmpl $32767, %esi
cmovg %ebx, %esi
addl $1, %r8d
addq $8, %rcx
movw %si, 2(%rdx)
addq $4, %rdx
cmpw %r8w, %ax
jg .L25
.L15:
# common exit: return frame count * 4
cwtl
popq %rbx
sall $2, %eax
popq %rbp
ret
.align 16
.L13:
testw %dx, %dx
jle .L15
movl %edx, %ebx
shrw $2, %bx
cmpw $3, %dx
leal 0(,%rbx,4), %r8d
ja .L34
.L27:
xorl %r8d, %r8d
.align 16
.L18:
# scalar path with byte swap, starting at frame r8w
movswq %r8w,%rdx
leaq (%rdi,%rdx,8), %rcx
leaq (%rsi,%rdx,4), %rdx
movl $-32768, %edi
movl $32767, %esi
.align 16
.L20:
movaps %xmm0, %xmm1
mulss (%rcx), %xmm1
cvttss2si %xmm1, %ebx
movaps %xmm0, %xmm1
mulss 4(%rcx), %xmm1
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
cmovg %esi, %ebx
# swap the two bytes of the 16 bit result
movzbl %bh, %ebp
sall $8, %ebx
orl %ebp, %ebx
movw %bx, (%rdx)
cvttss2si %xmm1, %ebx
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
cmovg %esi, %ebx
addl $1, %r8d
addq $8, %rcx
movzbl %bh, %ebp
sall $8, %ebx
orl %ebp, %ebx
movw %bx, 2(%rdx)
addq $4, %rdx
cmpw %r8w, %ax
jg .L20
cwtl
popq %rbx
sall $2, %eax
popq %rbp
ret
.align 16
.L34:
# vector path with byte swap: xmm9 = broadcast scale factor,
# xmm2 = .LC1, xmm1 = .LC2, xmm8 = .LC3 (mask 0xff00)
testw %r8w, %r8w
je .L27
movaps %xmm0, %xmm1
movq %rdi, %rcx
movdqa .LC1(%rip), %xmm2
movq %rsi, %r10
shufps $0, %xmm1, %xmm1
xorl %r9d, %r9d
movdqa .LC3(%rip), %xmm8
movaps %xmm1, %xmm9
movdqa .LC2(%rip), %xmm1
.align 16
.L19:
movaps %xmm9, %xmm4
addl $1, %r9d
movaps %xmm9, %xmm3
mulps (%rcx), %xmm4
movdqa %xmm1, %xmm6
mulps 16(%rcx), %xmm3
addq $32, %rcx
cvttps2dq %xmm4, %xmm4
movdqa %xmm4, %xmm5
pcmpgtd %xmm2, %xmm5
cvttps2dq %xmm3, %xmm3
pand %xmm5, %xmm4
pandn %xmm2, %xmm5
por %xmm5, %xmm4
movdqa %xmm4, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm6
pandn %xmm4, %xmm5
movdqa %xmm5, %xmm4
movdqa %xmm3, %xmm5
por %xmm6, %xmm4
movdqa %xmm1, %xmm6
pcmpgtd %xmm2, %xmm5
pand %xmm5, %xmm3
pandn %xmm2, %xmm5
movdqa %xmm4, %xmm7
pslld $8, %xmm4
pand %xmm8, %xmm7
por %xmm5, %xmm3
psrad $8, %xmm7
movdqa %xmm3, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm6
pandn %xmm3, %xmm5
movdqa %xmm5, %xmm3
por %xmm6, %xmm3
movdqa %xmm7, %xmm6
movdqa %xmm3, %xmm5
pslld $8, %xmm3
pand %xmm8, %xmm5
psrad $8, %xmm5
punpcklwd %xmm5, %xmm7
punpckhwd %xmm5, %xmm6
movdqa %xmm4, %xmm5
punpcklwd %xmm3, %xmm4
movdqa %xmm7, %xmm10
punpckhwd %xmm3, %xmm5
punpcklwd %xmm6, %xmm7
punpckhwd %xmm6, %xmm10
punpcklwd %xmm10, %xmm7
movdqa %xmm4, %xmm10
punpcklwd %xmm5, %xmm4
punpckhwd %xmm5, %xmm10
punpcklwd %xmm10, %xmm4
por %xmm7, %xmm4
movdqa %xmm4, (%r10)
addq $16, %r10
cmpw %r9w, %bx
ja .L19
# frames left over after the vector loop go through the scalar path
cmpw %dx, %r8w
jne .L18
jmp .L15
.align 16
.L33:
# vector path without byte swap: xmm6 = broadcast scale factor,
# xmm2 = .LC1, xmm1 = .LC2
testw %r8w, %r8w
je .L28
movaps %xmm0, %xmm1
movq %rdi, %rcx
movdqa .LC1(%rip), %xmm2
movq %rsi, %r10
shufps $0, %xmm1, %xmm1
xorl %r9d, %r9d
movaps %xmm1, %xmm6
movdqa .LC2(%rip), %xmm1
.align 16
.L24:
movaps %xmm6, %xmm4
addl $1, %r9d
movaps %xmm6, %xmm3
mulps (%rcx), %xmm4
movdqa %xmm1, %xmm7
mulps 16(%rcx), %xmm3
addq $32, %rcx
cvttps2dq %xmm4, %xmm4
movdqa %xmm4, %xmm5
pcmpgtd %xmm2, %xmm5
cvttps2dq %xmm3, %xmm3
pand %xmm5, %xmm4
pandn %xmm2, %xmm5
por %xmm5, %xmm4
movdqa %xmm4, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm7
pandn %xmm4, %xmm5
movdqa %xmm5, %xmm4
movdqa %xmm3, %xmm5
por %xmm7, %xmm4
movdqa %xmm1, %xmm7
pcmpgtd %xmm2, %xmm5
pand %xmm5, %xmm3
pandn %xmm2, %xmm5
por %xmm5, %xmm3
movdqa %xmm3, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm7
pandn %xmm3, %xmm5
movdqa %xmm5, %xmm3
movdqa %xmm4, %xmm5
por %xmm7, %xmm3
punpcklwd %xmm3, %xmm4
punpckhwd %xmm3, %xmm5
movdqa %xmm4, %xmm7
punpcklwd %xmm5, %xmm4
punpckhwd %xmm5, %xmm7
punpcklwd %xmm7, %xmm4
movdqa %xmm4, (%r10)
addq $16, %r10
cmpw %r9w, %bx
ja .L24
# frames left over after the vector loop go through the scalar path
cmpw %r8w, %dx
jne .L23
jmp .L15
.LFE511:
.size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
.section .rodata
.align 4
.LC0:
.long 1191181824
.align 16
.LC1:
.long -32768
.long -32768
.long -32768
.long -32768
.align 16
.LC2:
.long 32767
.long 32767
.long 32767
.long 32767
.align 16
.LC3:
.long 65280
.long 65280
.long 65280
.long 65280
.section .eh_frame,"aw",@progbits
.Lframe1:
.long .LECIE1-.LSCIE1
.LSCIE1:
.long 0x0
.byte 0x1
.string "zR"
.byte 0x1
.byte 0x78
.byte 0x10
.byte 0x1
.byte 0x3
.byte 0xc
.byte 0x7
.byte 0x8
.byte 0x11
.byte 0x10
.byte 0x1
.align 8
.LECIE1:
.LSFDE1:
.long .LEFDE1-.LASFDE1
.LASFDE1:
.long .LASFDE1-.Lframe1
.long .LFB509
.long .LFE509-.LFB509
.byte 0x0
.align 8
.LEFDE1:
.LSFDE3:
.long .LEFDE3-.LASFDE3
.LASFDE3:
.long .LASFDE3-.Lframe1
.long .LFB510
.long .LFE510-.LFB510
.byte 0x0
.align 8
.LEFDE3:
.LSFDE5:
.long .LEFDE5-.LASFDE5
.LASFDE5:
.long .LASFDE5-.Lframe1
.long .LFB511
.long .LFE511-.LFB511
.byte 0x0
.byte 0x4
.long .LCFI0-.LFB511
.byte 0xe
.byte 0x10
.byte 0x4
.long .LCFI1-.LCFI0
.byte 0xe
.byte 0x18
.byte 0x11
.byte 0x3
.byte 0x3
.byte 0x11
.byte 0x6
.byte 0x2
.align 8
.LEFDE5:
.ident "GCC: (GNU) 4.4.0 20081110 (experimental)"
# Empty .note.GNU-stack section: tells the linker that this object does not
# need an executable stack. Without it the whole binary may be linked with
# an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -0,0 +1,107 @@
.file "basic_ops_x86.c"
.text
# alignedMemCpyMMX (32 bit x86, compiler output with inline assembly)
# Stack arguments after the prologue: 120(%esp) destination, 124(%esp)
# source, 128(%esp) byte count. The count is divided by 64 (shrl $6); each
# loop pass copies one 64 byte block through %mm0-%mm3.
# The x87/MMX state is saved to the 108 byte area at 4(%esp) before the
# copy and restored from it afterwards.
.p2align 4,,15
.globl alignedMemCpyMMX
.type alignedMemCpyMMX, @function
alignedMemCpyMMX:
pushl %ebx
subl $112, %esp
movl 128(%esp), %ebx
movl 124(%esp), %eax
shrl $6, %ebx
#APP
# 42 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
fsave 4(%esp); fwait
# 0 "" 2
# 44 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
1: prefetchnta (%eax)
prefetchnta 64(%eax)
prefetchnta 128(%eax)
prefetchnta 192(%eax)
prefetchnta 256(%eax)
# 0 "" 2
#NO_APP
testl %ebx, %ebx
je .L2
movl 120(%esp), %ecx
xorl %edx, %edx
.p2align 4,,7
.p2align 3
.L3:
#APP
# 53 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
1: prefetchnta 320(%eax)
2: movq (%eax), %mm0
movq 8(%eax), %mm1
movq 16(%eax), %mm2
movq 24(%eax), %mm3
movq %mm0, (%ecx)
movq %mm1, 8(%ecx)
movq %mm2, 16(%ecx)
movq %mm3, 24(%ecx)
movq 32(%eax), %mm0
movq 40(%eax), %mm1
movq 48(%eax), %mm2
movq 56(%eax), %mm3
movq %mm0, 32(%ecx)
movq %mm1, 40(%ecx)
movq %mm2, 48(%ecx)
movq %mm3, 56(%ecx)
# 0 "" 2
#NO_APP
addl $1, %edx
addl $64, %eax
addl $64, %ecx
cmpl %edx, %ebx
jne .L3
.L2:
#APP
# 75 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
# restore the state saved on entry; this has to be FRSTOR, a second FSAVE
# would overwrite the saved image and leave the FPU reinitialized
frstor 4(%esp)
# 0 "" 2
#NO_APP
addl $112, %esp
popl %ebx
ret
.size alignedMemCpyMMX, .-alignedMemCpyMMX
.p2align 4,,15
# alignedMemClearMMX: zero memory in 64-byte chunks using MMX stores.
#
# Stack arguments, as read by the code below:
#   first  - pointer to the memory to clear
#   second - size in bytes; only size/64 whole chunks are cleared
#
# emms is executed on every path before returning.
.globl alignedMemClearMMX
.type alignedMemClearMMX, @function
alignedMemClearMMX:
movl 8(%esp), %ecx
# %ecx = number of 64-byte chunks; nothing to do when it is zero.
shrl $6, %ecx
testl %ecx, %ecx
je .L8
movl 4(%esp), %edx
xorl %eax, %eax
# %mm0 = 0, the value written by every store in the loop.
pxor %mm0, %mm0
.p2align 4,,7
.p2align 3
# One 64-byte chunk (eight quadword stores) per iteration.
.L9:
#APP
# 90 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
movq %mm0, (%edx)
movq %mm0, 8(%edx)
movq %mm0, 16(%edx)
movq %mm0, 24(%edx)
movq %mm0, 32(%edx)
movq %mm0, 40(%edx)
movq %mm0, 48(%edx)
movq %mm0, 56(%edx)
# 0 "" 2
#NO_APP
addl $1, %eax
addl $64, %edx
cmpl %eax, %ecx
jne .L9
.L8:
emms
ret
.size alignedMemClearMMX, .-alignedMemClearMMX
.ident "GCC: (GNU) 4.4.0 20081110 (experimental)"
.section .note.GNU-stack,"",@progbits

View File

@@ -0,0 +1,505 @@
.file "basic_ops_x86.c"
.text
.p2align 4,,15
# alignedMemCpySSE: copy memory in 64-byte chunks with aligned SSE moves.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer
#   second - source pointer
#   third  - size in bytes; only size/64 whole chunks are copied
#
# Every access uses movaps, so both pointers must be 16-byte aligned.
.globl alignedMemCpySSE
.type alignedMemCpySSE, @function
alignedMemCpySSE:
pushl %esi
pushl %ebx
# With two registers pushed the arguments are at 12/16/20(%esp).
movl 20(%esp), %esi
movl 12(%esp), %edx
movl 16(%esp), %ecx
# %esi = number of 64-byte chunks; skip the loop when it is zero.
shrl $6, %esi
testl %esi, %esi
je .L4
xorl %eax, %eax
xorl %ebx, %ebx
.p2align 4,,7
.p2align 3
# %eax is the running byte offset, %ebx the chunk counter.
.L3:
movaps (%ecx,%eax), %xmm0
addl $1, %ebx
movaps %xmm0, (%edx,%eax)
movaps 16(%ecx,%eax), %xmm0
movaps %xmm0, 16(%edx,%eax)
movaps 32(%ecx,%eax), %xmm0
movaps %xmm0, 32(%edx,%eax)
movaps 48(%ecx,%eax), %xmm0
movaps %xmm0, 48(%edx,%eax)
addl $64, %eax
cmpl %ebx, %esi
jne .L3
.L4:
popl %ebx
popl %esi
ret
.size alignedMemCpySSE, .-alignedMemCpySSE
.p2align 4,,15
# alignedMemClearSSE: zero memory in 64-byte chunks with aligned SSE stores.
#
# Stack arguments, as read by the code below:
#   first  - pointer to the memory to clear (movaps: 16-byte aligned)
#   second - size in bytes; only size/64 whole chunks are cleared
.globl alignedMemClearSSE
.type alignedMemClearSSE, @function
alignedMemClearSSE:
movl 8(%esp), %ecx
# %ecx = number of 64-byte chunks; return immediately when it is zero.
shrl $6, %ecx
testl %ecx, %ecx
je .L10
movl 4(%esp), %eax
# %xmm0 = 0, the value written by every store in the loop.
xorps %xmm0, %xmm0
xorl %edx, %edx
.p2align 4,,7
.p2align 3
# One 64-byte chunk (four 16-byte stores) per iteration.
.L9:
addl $1, %edx
movaps %xmm0, (%eax)
movaps %xmm0, 16(%eax)
movaps %xmm0, 32(%eax)
movaps %xmm0, 48(%eax)
addl $64, %eax
cmpl %edx, %ecx
jne .L9
.L10:
rep
ret
.size alignedMemClearSSE, .-alignedMemClearSSE
.p2align 4,,15
# alignedBufApplyGainSSE: multiply a float buffer in place by one gain value.
#
# Stack arguments, as read by the code below:
#   first  - buffer pointer (movaps/mulps: must be 16-byte aligned)
#   second - gain (single-precision float)
#   third  - signed element count; nothing is done when it is <= 0
#
# The loop runs ((count - 1) >> 3) + 1 times and handles 64 bytes
# (16 floats) per pass, so the count is effectively rounded up to a
# multiple of 8 and the buffer must be large enough for that.
# NOTE(review): presumably the count is in stereo frames of two floats
# (8 frames = 64 bytes) - confirm against the C prototype.
.globl alignedBufApplyGainSSE
.type alignedBufApplyGainSSE, @function
alignedBufApplyGainSSE:
movl 12(%esp), %ecx
testl %ecx, %ecx
jle .L15
movss 8(%esp), %xmm0
subl $1, %ecx
movl 4(%esp), %eax
shrl $3, %ecx
xorl %edx, %edx
addl $1, %ecx
# Broadcast the gain into all four lanes of %xmm0.
shufps $0, %xmm0, %xmm0
.p2align 4,,7
.p2align 3
# Each pass scales four 16-byte vectors at (%eax) .. 48(%eax).
.L14:
movaps %xmm0, %xmm3
addl $1, %edx
movaps %xmm0, %xmm2
movaps %xmm0, %xmm1
movaps %xmm0, %xmm4
mulps 16(%eax), %xmm3
mulps 32(%eax), %xmm2
mulps 48(%eax), %xmm1
movaps %xmm3, 16(%eax)
mulps (%eax), %xmm4
movaps %xmm2, 32(%eax)
movaps %xmm1, 48(%eax)
movaps %xmm4, (%eax)
addl $64, %eax
cmpl %edx, %ecx
ja .L14
.L15:
rep
ret
.size alignedBufApplyGainSSE, .-alignedBufApplyGainSSE
.p2align 4,,15
# alignedBufMixSSE: add a source float buffer onto a destination buffer.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer (read, summed, written back)
#   second - source pointer
#   third  - signed element count; nothing is done when it is <= 0
#
# Both pointers are accessed with movaps/addps memory operands and must
# be 16-byte aligned. The loop runs ((count - 1) >> 3) + 1 times over
# 64 bytes each, so the count is effectively rounded up to a multiple of 8.
.globl alignedBufMixSSE
.type alignedBufMixSSE, @function
alignedBufMixSSE:
pushl %esi
pushl %ebx
# With two registers pushed the arguments are at 12/16/20(%esp).
movl 20(%esp), %esi
movl 12(%esp), %edx
movl 16(%esp), %ecx
testl %esi, %esi
jle .L20
subl $1, %esi
xorl %eax, %eax
shrl $3, %esi
xorl %ebx, %ebx
addl $1, %esi
.p2align 4,,7
.p2align 3
# %eax is the running byte offset, %ebx the pass counter.
.L19:
movaps 16(%edx,%eax), %xmm2
addl $1, %ebx
movaps 32(%edx,%eax), %xmm1
movaps 48(%edx,%eax), %xmm0
movaps (%edx,%eax), %xmm3
addps 16(%ecx,%eax), %xmm2
addps 32(%ecx,%eax), %xmm1
addps 48(%ecx,%eax), %xmm0
addps (%ecx,%eax), %xmm3
movaps %xmm2, 16(%edx,%eax)
movaps %xmm3, (%edx,%eax)
movaps %xmm1, 32(%edx,%eax)
movaps %xmm0, 48(%edx,%eax)
addl $64, %eax
cmpl %ebx, %esi
ja .L19
.L20:
popl %ebx
popl %esi
ret
.size alignedBufMixSSE, .-alignedBufMixSSE
.p2align 4,,15
# alignedBufMixLRCoeffSSE: dst += src * coefficient, with two coefficients
# applied alternately to even and odd floats.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer
#   second - source pointer
#   third  - coefficient for even-indexed floats
#   fourth - coefficient for odd-indexed floats
#   fifth  - signed element count; nothing is done when it is <= 0
#
# NOTE(review): with interleaved stereo data the two coefficients would be
# the left and right gains, as the name suggests - confirm with the caller.
# Both pointers are used as aligned memory operands (16-byte alignment).
# The loop runs ((count - 1) >> 2) + 1 times over 32 bytes each.
.globl alignedBufMixLRCoeffSSE
.type alignedBufMixLRCoeffSSE, @function
alignedBufMixLRCoeffSSE:
pushl %esi
pushl %ebx
movl 28(%esp), %esi
movl 12(%esp), %edx
movl 16(%esp), %ebx
testl %esi, %esi
jle .L25
movss 24(%esp), %xmm0
subl $1, %esi
movss 20(%esp), %xmm1
xorl %eax, %eax
shrl $2, %esi
xorl %ecx, %ecx
addl $1, %esi
# Build %xmm0 = { third, fourth, third, fourth } from the two scalars.
unpcklps %xmm0, %xmm1
movaps %xmm1, %xmm0
movlhps %xmm1, %xmm0
.p2align 4,,7
.p2align 3
# Each pass handles two 16-byte vectors: dst = src * coeffs + dst.
.L24:
movaps %xmm0, %xmm1
addl $1, %ecx
movaps %xmm0, %xmm2
mulps 16(%ebx,%eax), %xmm1
mulps (%ebx,%eax), %xmm2
addps 16(%edx,%eax), %xmm1
addps (%edx,%eax), %xmm2
movaps %xmm1, 16(%edx,%eax)
movaps %xmm2, (%edx,%eax)
addl $32, %eax
cmpl %ecx, %esi
ja .L24
.L25:
popl %ebx
popl %esi
ret
.size alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE
.p2align 4,,15
# alignedBufWetDryMixSSE: dst = dst * b + src * a for every float.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer (read and overwritten)
#   second - source pointer
#   third  - factor a, applied to the source
#   fourth - factor b, applied to the destination
#   fifth  - signed element count; nothing is done when it is <= 0
#
# NOTE(review): by the function name a is presumably the wet level and b
# the dry level - confirm against the C prototype.
# Both pointers are used as aligned memory operands (16-byte alignment).
# The loop runs ((count - 1) >> 2) + 1 times over 32 bytes each.
.globl alignedBufWetDryMixSSE
.type alignedBufWetDryMixSSE, @function
alignedBufWetDryMixSSE:
pushl %esi
pushl %ebx
movl 28(%esp), %esi
movl 12(%esp), %edx
movl 16(%esp), %ebx
testl %esi, %esi
jle .L30
movss 24(%esp), %xmm1
subl $1, %esi
movss 20(%esp), %xmm0
xorl %eax, %eax
shrl $2, %esi
xorl %ecx, %ecx
# Broadcast both factors: %xmm1 = fourth argument, %xmm0 = third argument.
shufps $0, %xmm1, %xmm1
addl $1, %esi
shufps $0, %xmm0, %xmm0
.p2align 4,,7
.p2align 3
# Each pass blends two 16-byte vectors and stores them back to dst.
.L29:
movaps %xmm1, %xmm3
addl $1, %ecx
movaps %xmm0, %xmm2
movaps %xmm1, %xmm4
mulps 16(%edx,%eax), %xmm3
mulps 16(%ebx,%eax), %xmm2
mulps (%edx,%eax), %xmm4
addps %xmm3, %xmm2
movaps %xmm0, %xmm3
mulps (%ebx,%eax), %xmm3
movaps %xmm2, 16(%edx,%eax)
addps %xmm4, %xmm3
movaps %xmm3, (%edx,%eax)
addl $32, %eax
cmpl %ecx, %esi
ja .L29
.L30:
popl %ebx
popl %esi
ret
.size alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE
.p2align 4,,15
# alignedBufWetDryMixSplittedSSE: blend two separate (non-interleaved) float
# sources into an interleaved destination.
#
# Stack arguments, as read by the code below:
#   first  - interleaved destination pointer (read and overwritten)
#   second - source for the even-indexed destination floats
#   third  - source for the odd-indexed destination floats
#   fourth - factor a, applied to the source samples
#   fifth  - factor b, applied to the existing destination samples
#   sixth  - signed count of destination pairs; nothing is done when <= 0
#
# Per destination pair, as spelled out by the scalar loop at .L38:
#   dst[2i]   = a * second[i] + b * dst[2i]
#   dst[2i+1] = a * third[i]  + b * dst[2i+1]
#
# NOTE(review): presumably second/third are the left/right wet channels and
# a/b the wet/dry levels - confirm against the C prototype.
# The vector loop accesses the destination with movaps (16-byte alignment);
# the two sources are loaded with movlps/movhps.
.globl alignedBufWetDryMixSplittedSSE
.type alignedBufWetDryMixSplittedSSE, @function
alignedBufWetDryMixSplittedSSE:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $140, %esp
# Four pushes plus the 140-byte frame put the arguments at 160..180(%esp).
movl 180(%esp), %eax
movl 160(%esp), %edx
movl 164(%esp), %esi
movl 168(%esp), %ecx
testl %eax, %eax
movss 172(%esp), %xmm4
movss 176(%esp), %xmm5
jle .L39
# 112(%esp) = ((count - 1) >> 1) + 1, the number of scalar-loop passes
# (two pairs each); 116(%esp) = that value rounded down to a multiple
# of 4, which is the part the vector loop covers.
movl 180(%esp), %eax
subl $1, %eax
shrl %eax
addl $1, %eax
movl %eax, %ebp
movl %eax, 112(%esp)
shrl $2, %ebp
cmpl $3, 112(%esp)
leal 0(,%ebp,4), %eax
movl %eax, 116(%esp)
jbe .L40
testl %eax, %eax
jne .L34
# Too little work for the vector loop: run everything through .L38.
.L40:
xorl %edi, %edi
jmp .L36
.p2align 4,,7
.p2align 3
# Vector set-up: broadcast a into 32(%esp) and b into %xmm4; the scalar
# factors are parked at 120/124(%esp) for the remainder loop.
.L34:
movaps %xmm4, %xmm2
xorps %xmm6, %xmm6
shufps $0, %xmm2, %xmm2
movaps %xmm5, %xmm1
movl %esi, %ebx
shufps $0, %xmm1, %xmm1
movaps %xmm2, 32(%esp)
xorl %eax, %eax
xorl %edi, %edi
movss %xmm5, 124(%esp)
movss %xmm4, 120(%esp)
movaps %xmm1, %xmm4
.p2align 4,,7
.p2align 3
# Vector loop: each pass consumes 64 bytes of the destination and 32 bytes
# of each source, de-interleaving the destination with shufps, blending,
# and re-interleaving with unpcklps/unpckhps before the stores.
.L37:
movaps 16(%edx,%eax,2), %xmm3
addl $1, %edi
movaps (%edx,%eax,2), %xmm2
movaps 48(%edx,%eax,2), %xmm0
movaps %xmm2, %xmm5
shufps $221, %xmm3, %xmm2
movaps 32(%edx,%eax,2), %xmm1
shufps $136, %xmm3, %xmm5
movaps %xmm2, 96(%esp)
movaps %xmm1, %xmm7
shufps $221, %xmm0, %xmm1
shufps $136, %xmm0, %xmm7
movaps %xmm1, 64(%esp)
movaps %xmm6, %xmm3
movaps %xmm5, (%esp)
shufps $136, %xmm7, %xmm5
movlps (%ebx), %xmm3
movaps %xmm6, %xmm2
movhps 8(%ebx), %xmm3
movaps %xmm7, 80(%esp)
movlps 16(%ebx), %xmm2
movhps 24(%ebx), %xmm2
movaps 96(%esp), %xmm7
addl $32, %ebx
movaps %xmm3, %xmm0
shufps $221, %xmm2, %xmm3
shufps $136, %xmm2, %xmm0
shufps $136, 64(%esp), %xmm7
mulps 32(%esp), %xmm0
movaps %xmm6, %xmm1
movlps (%ecx,%eax), %xmm1
movhps 8(%ecx,%eax), %xmm1
movaps 96(%esp), %xmm2
mulps %xmm4, %xmm7
shufps $221, 64(%esp), %xmm2
mulps %xmm4, %xmm5
mulps 32(%esp), %xmm3
movaps %xmm7, 16(%esp)
movaps %xmm1, %xmm7
addps %xmm0, %xmm5
movaps %xmm6, %xmm0
movlps 16(%ecx,%eax), %xmm0
movhps 24(%ecx,%eax), %xmm0
shufps $136, %xmm0, %xmm7
shufps $221, %xmm0, %xmm1
mulps 32(%esp), %xmm7
mulps 32(%esp), %xmm1
mulps %xmm4, %xmm2
movaps %xmm7, 48(%esp)
movaps 16(%esp), %xmm7
addps 48(%esp), %xmm7
addps %xmm1, %xmm2
movaps %xmm7, 16(%esp)
movaps (%esp), %xmm7
shufps $221, 80(%esp), %xmm7
movaps 16(%esp), %xmm1
mulps %xmm4, %xmm7
movaps 16(%esp), %xmm0
unpckhps %xmm2, %xmm1
unpcklps %xmm2, %xmm0
movaps %xmm1, %xmm2
addps %xmm3, %xmm7
movaps %xmm5, %xmm3
unpcklps %xmm7, %xmm3
unpckhps %xmm7, %xmm5
movaps %xmm3, %xmm1
unpckhps %xmm0, %xmm3
unpcklps %xmm0, %xmm1
movaps %xmm5, %xmm0
unpckhps %xmm2, %xmm5
unpcklps %xmm2, %xmm0
movaps %xmm1, (%edx,%eax,2)
movaps %xmm3, 16(%edx,%eax,2)
movaps %xmm0, 32(%edx,%eax,2)
movaps %xmm5, 48(%edx,%eax,2)
addl $32, %eax
cmpl %edi, %ebp
ja .L37
# Vector part done: reload the scalar factors and compute the index of the
# first pair the vector loop did not cover (%edi = 116(%esp) * 2). Finished
# when the vector loop covered every pass counted in 112(%esp).
movl 116(%esp), %edi
movl 112(%esp), %eax
movss 120(%esp), %xmm4
movss 124(%esp), %xmm5
addl %edi, %edi
cmpl %eax, 116(%esp)
je .L39
# Scalar set-up: %ebx / %edx point at two consecutive destination pairs,
# %eax indexes the two sources.
.L36:
leal (%edx,%edi,8), %ebx
xorl %ebp, %ebp
leal 8(%edx,%edi,8), %edx
movl %edi, %eax
.p2align 4,,7
.p2align 3
# Scalar loop: two destination pairs per pass, until the index reaches the
# count at 180(%esp).
.L38:
movaps %xmm5, %xmm1
addl $2, %ebp
movaps %xmm4, %xmm0
mulss (%ebx), %xmm1
mulss (%esi,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, (%ebx)
movaps %xmm4, %xmm0
mulss 4(%ebx), %xmm1
mulss (%ecx,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, 4(%ebx)
addl $16, %ebx
movaps %xmm4, %xmm0
mulss (%edx), %xmm1
mulss 4(%esi,%eax,4), %xmm0
addss %xmm1, %xmm0
movaps %xmm5, %xmm1
movss %xmm0, (%edx)
movaps %xmm4, %xmm0
mulss 4(%edx), %xmm1
mulss 4(%ecx,%eax,4), %xmm0
leal (%edi,%ebp), %eax
addss %xmm1, %xmm0
movss %xmm0, 4(%edx)
addl $16, %edx
cmpl %eax, 180(%esp)
jg .L38
.L39:
addl $140, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.size alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE
.p2align 4,,15
# unalignedBufMixLRCoeffSSE: dst += src * coefficient for buffers that are
# not guaranteed to be 16-byte aligned; even-indexed floats use one
# coefficient, odd-indexed floats the other.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer
#   second - source pointer
#   third  - coefficient for even-indexed floats
#   fourth - coefficient for odd-indexed floats
#   fifth  - signed count of float pairs
#
# Strategy: an odd count is made even by handling one pair at .L52; the
# rest goes through the vector loop when the destination is 16-byte
# aligned, otherwise through the scalar loop at .L48.
.globl unalignedBufMixLRCoeffSSE
.type unalignedBufMixLRCoeffSSE, @function
unalignedBufMixLRCoeffSSE:
pushl %esi
pushl %ebx
movl 28(%esp), %esi
movl 12(%esp), %eax
movl 16(%esp), %edx
movss 20(%esp), %xmm0
# Signed "count is odd" test: branch to .L52 when count % 2 != 0.
movl %esi, %ecx
shrl $31, %ecx
leal (%esi,%ecx), %ebx
andl $1, %ebx
cmpl %ecx, %ebx
movss 24(%esp), %xmm3
jne .L52
.L44:
testl %esi, %esi
jle .L49
leal -1(%esi), %ebx
shrl %ebx
# Destination not 16-byte aligned: take the scalar loop.
testb $15, %al
jne .L46
# Build %xmm3 = { third, fourth, third, fourth }; %xmm2 stays zero and
# seeds the registers that movlps/movhps fill below.
movaps %xmm0, %xmm1
xorps %xmm2, %xmm2
unpcklps %xmm3, %xmm1
addl $1, %ebx
xorl %ecx, %ecx
movaps %xmm1, %xmm3
movlhps %xmm1, %xmm3
.p2align 4,,7
.p2align 3
# Vector loop: 16 bytes (two pairs) per pass; loads are done in 8-byte
# halves, the store to the destination is an aligned movaps.
.L47:
movaps %xmm2, %xmm1
addl $1, %ecx
movlps (%edx), %xmm1
movhps 8(%edx), %xmm1
movaps %xmm2, %xmm0
movlps (%eax), %xmm0
movhps 8(%eax), %xmm0
addl $16, %edx
mulps %xmm3, %xmm1
addps %xmm1, %xmm0
movaps %xmm0, (%eax)
addl $16, %eax
cmpl %ebx, %ecx
jb .L47
.L49:
popl %ebx
popl %esi
ret
.p2align 4,,7
.p2align 3
.L46:
xorl %ecx, %ecx
.p2align 4,,7
.p2align 3
# Scalar loop: two pairs per pass, indexed by %ecx in units of pairs.
.L48:
movaps %xmm0, %xmm1
mulss (%edx,%ecx,8), %xmm1
addss (%eax,%ecx,8), %xmm1
movss %xmm1, (%eax,%ecx,8)
movaps %xmm3, %xmm1
mulss 4(%edx,%ecx,8), %xmm1
addss 4(%eax,%ecx,8), %xmm1
movss %xmm1, 4(%eax,%ecx,8)
movaps %xmm0, %xmm1
mulss 8(%edx,%ecx,8), %xmm1
addss 8(%eax,%ecx,8), %xmm1
movss %xmm1, 8(%eax,%ecx,8)
movaps %xmm3, %xmm1
mulss 12(%edx,%ecx,8), %xmm1
addss 12(%eax,%ecx,8), %xmm1
movss %xmm1, 12(%eax,%ecx,8)
addl $2, %ecx
cmpl %ecx, %esi
jg .L48
popl %ebx
popl %esi
ret
# Odd count: mix the first pair here, advance both pointers by 8 bytes,
# decrement the count and rejoin the main path.
.L52:
movaps %xmm0, %xmm1
subl $1, %esi
movss (%eax), %xmm2
mulss (%edx), %xmm1
addss %xmm2, %xmm1
movss 4(%eax), %xmm2
movss %xmm1, (%eax)
movaps %xmm3, %xmm1
mulss 4(%edx), %xmm1
addl $8, %edx
addss %xmm2, %xmm1
movss %xmm1, 4(%eax)
addl $8, %eax
jmp .L44
.size unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
.ident "GCC: (GNU) 4.4.0 20081110 (experimental)"
.section .note.GNU-stack,"",@progbits

View File

@@ -0,0 +1,349 @@
.file "basic_ops_x86.c"
.text
.p2align 4,,15
# alignedMemCpySSE2: copy memory in 64-byte chunks with aligned SSE2 moves.
#
# Stack arguments, as read by the code below:
#   first  - destination pointer
#   second - source pointer
#   third  - size in bytes; only size/64 whole chunks are copied
#
# Every access uses movdqa, so both pointers must be 16-byte aligned.
.globl alignedMemCpySSE2
.type alignedMemCpySSE2, @function
alignedMemCpySSE2:
pushl %esi
pushl %ebx
# With two registers pushed the arguments are at 12/16/20(%esp).
movl 20(%esp), %esi
movl 12(%esp), %edx
movl 16(%esp), %ecx
# %esi = number of 64-byte chunks; skip the loop when it is zero.
shrl $6, %esi
testl %esi, %esi
je .L4
xorl %eax, %eax
xorl %ebx, %ebx
.p2align 4,,7
.p2align 3
# %eax is the running byte offset, %ebx the chunk counter.
.L3:
addl $1, %ebx
movdqa (%ecx,%eax), %xmm0
movdqa %xmm0, (%edx,%eax)
movdqa 16(%ecx,%eax), %xmm0
movdqa %xmm0, 16(%edx,%eax)
movdqa 32(%ecx,%eax), %xmm0
movdqa %xmm0, 32(%edx,%eax)
movdqa 48(%ecx,%eax), %xmm0
movdqa %xmm0, 48(%edx,%eax)
addl $64, %eax
cmpl %ebx, %esi
jne .L3
.L4:
popl %ebx
popl %esi
ret
.size alignedMemCpySSE2, .-alignedMemCpySSE2
.p2align 4,,15
# alignedMemClearSSE2: zero memory in 64-byte chunks with aligned SSE2 stores.
#
# Stack arguments, as read by the code below:
#   first  - pointer to the memory to clear (movdqa: 16-byte aligned)
#   second - size in bytes; only size/64 whole chunks are cleared
.globl alignedMemClearSSE2
.type alignedMemClearSSE2, @function
alignedMemClearSSE2:
movl 8(%esp), %ecx
# %ecx = number of 64-byte chunks; return immediately when it is zero.
shrl $6, %ecx
testl %ecx, %ecx
je .L10
movl 4(%esp), %eax
xorl %edx, %edx
# %xmm0 = 0, the value written by every store in the loop.
pxor %xmm0, %xmm0
.p2align 4,,7
.p2align 3
# One 64-byte chunk (four 16-byte stores) per iteration.
.L9:
addl $1, %edx
movdqa %xmm0, (%eax)
movdqa %xmm0, 16(%eax)
movdqa %xmm0, 32(%eax)
movdqa %xmm0, 48(%eax)
addl $64, %eax
cmpl %edx, %ecx
jne .L9
.L10:
rep
ret
.size alignedMemClearSSE2, .-alignedMemClearSSE2
.p2align 4,,15
# alignedConvertToS16SSE2: convert float sample pairs to clamped signed
# 16-bit integer pairs.
#
# Stack arguments, as read by the code below:
#   first  - source pointer (8 bytes = two floats per pair)
#   second - output pointer (4 bytes = two 16-bit values per pair)
#   third  - pair count; only its low 16 bits are used, treated as signed
#   fourth - gain (float); multiplied by the constant at .LC0 to form the
#            scale factor
#   fifth  - byte flag; non-zero selects the path that swaps the two bytes
#            of every 16-bit result
#
# Each value is scaled, truncated towards zero (cvttss2si / cvttps2dq) and
# clamped to [-32768, 32767]. The return value in %eax is the sign-extended
# 16-bit count times 4.
#
# Code layout: .L23/.L25 scalar loop without swap, .L18/.L20 scalar loop
# with swap, .L33/.L24 vector loop without swap, .L34/.L19 vector loop with
# swap. The vector loops handle 4 pairs per pass and use aligned accesses
# (mulps memory operands, movdqa stores); leftovers fall back to the
# matching scalar loop.
.globl alignedConvertToS16SSE2
.type alignedConvertToS16SSE2, @function
alignedConvertToS16SSE2:
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $8, %esp
# Four pushes plus 8 bytes of locals put the arguments at 28..44(%esp).
movl 36(%esp), %eax
movss .LC0, %xmm4
# The flags set by this compare are consumed by the jne below; the moves
# and the mulss in between leave them untouched.
cmpb $0, 44(%esp)
movl 28(%esp), %edx
movl 32(%esp), %ebx
movl %eax, %esi
# %xmm4 = scale factor = .LC0 * gain.
mulss 40(%esp), %xmm4
jne .L13
# ---- no byte swap ----
testw %ax, %ax
jle .L15
# %di = count / 4 (vector passes), %ebp = pairs the vector loop would cover.
movl %eax, %edi
shrw $2, %di
cmpw $3, %ax
movw %ax, 2(%esp)
leal 0(,%edi,4), %ebp
ja .L33
.L28:
xorl %ebp, %ebp
.p2align 4,,7
.p2align 3
# Scalar set-up: start at pair index %bp; %edi / %ebx hold the clamp limits.
.L23:
movswl %bp,%eax
movl $-32768, %edi
leal (%edx,%eax,8), %edx
leal (%ebx,%eax,4), %eax
movl $32767, %ebx
.p2align 4,,7
.p2align 3
# Scalar loop: one pair per pass.
.L25:
movaps %xmm4, %xmm0
mulss (%edx), %xmm0
cvttss2si %xmm0, %ecx
movaps %xmm4, %xmm0
mulss 4(%edx), %xmm0
cmpl $-32768, %ecx
cmovl %edi, %ecx
cmpl $32767, %ecx
cmovg %ebx, %ecx
movw %cx, (%eax)
cvttss2si %xmm0, %ecx
cmpl $-32768, %ecx
cmovl %edi, %ecx
cmpl $32767, %ecx
cmovg %ebx, %ecx
addl $1, %ebp
movw %cx, 2(%eax)
addl $8, %edx
addl $4, %eax
cmpw %bp, %si
jg .L25
# Common exit: return count * 4.
.L15:
movswl %si,%esi
addl $8, %esp
leal 0(,%esi,4), %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
.p2align 3
# ---- byte swap requested ----
.L13:
testw %ax, %ax
jle .L15
movl %eax, %ebp
shrw $2, %bp
cmpw $3, %si
movw %ax, 2(%esp)
leal 0(,%ebp,4), %eax
ja .L34
.L27:
xorl %eax, %eax
.p2align 4,,7
.p2align 3
# Scalar set-up for the swapping loop, starting at pair index %ax.
.L18:
movswl %ax,%edi
leal (%edx,%edi,8), %ecx
leal (%ebx,%edi,4), %edx
movl $-32768, %edi
.p2align 4,,7
.p2align 3
# Scalar loop with swap: after clamping, movzbl %bh / sall $8 / orl
# exchange the low and high byte of the 16-bit value that is stored.
.L20:
movaps %xmm4, %xmm0
movl $32767, %ebp
mulss (%ecx), %xmm0
cvttss2si %xmm0, %ebx
movaps %xmm4, %xmm0
mulss 4(%ecx), %xmm0
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
cmovg %ebp, %ebx
movzbl %bh, %ebp
sall $8, %ebx
orl %ebp, %ebx
movl $32767, %ebp
movw %bx, (%edx)
cvttss2si %xmm0, %ebx
cmpl $-32768, %ebx
cmovl %edi, %ebx
cmpl $32767, %ebx
cmovg %ebp, %ebx
addl $1, %eax
movzbl %bh, %ebp
addl $8, %ecx
sall $8, %ebx
orl %ebp, %ebx
movw %bx, 2(%edx)
addl $4, %edx
cmpw %ax, %si
jg .L20
jmp .L15
.p2align 4,,7
.p2align 3
# Vector set-up with swap: %xmm7 = broadcast scale factor, %xmm1 / %xmm0 =
# lower / upper clamp vectors; the scalar factor is parked at 4(%esp).
.L34:
testw %ax, %ax
je .L27
movaps %xmm4, %xmm0
xorl %ecx, %ecx
movdqa .LC1, %xmm1
movss %xmm4, 4(%esp)
shufps $0, %xmm0, %xmm0
xorl %edi, %edi
movaps %xmm0, %xmm7
movdqa .LC2, %xmm0
.p2align 4,,7
.p2align 3
# Vector loop with swap: scale, truncate, clamp via compare/and/andn/or
# selects, then build the swapped 16-bit values from a masked right shift
# (.LC3, psrad $8) and a left shift (pslld $8) before packing.
.L19:
movaps %xmm7, %xmm3
movdqa %xmm0, %xmm5
movdqa %xmm0, %xmm6
movaps %xmm7, %xmm2
addl $1, %edi
mulps (%edx,%ecx,2), %xmm3
mulps 16(%edx,%ecx,2), %xmm2
cvttps2dq %xmm3, %xmm3
movdqa %xmm3, %xmm4
pcmpgtd %xmm1, %xmm4
pand %xmm4, %xmm3
pandn %xmm1, %xmm4
por %xmm4, %xmm3
cvttps2dq %xmm2, %xmm2
movdqa %xmm3, %xmm4
pcmpgtd %xmm0, %xmm4
pand %xmm4, %xmm5
pandn %xmm3, %xmm4
movdqa %xmm4, %xmm3
movdqa %xmm2, %xmm4
por %xmm5, %xmm3
pcmpgtd %xmm1, %xmm4
movdqa .LC3, %xmm5
pand %xmm4, %xmm2
pand %xmm3, %xmm5
pandn %xmm1, %xmm4
psrad $8, %xmm5
por %xmm4, %xmm2
pslld $8, %xmm3
movdqa %xmm2, %xmm4
pcmpgtd %xmm0, %xmm4
pand %xmm4, %xmm6
pandn %xmm2, %xmm4
movdqa %xmm4, %xmm2
por %xmm6, %xmm2
movdqa .LC3, %xmm6
pand %xmm2, %xmm6
pslld $8, %xmm2
psrad $8, %xmm6
movdqa %xmm5, %xmm4
punpcklwd %xmm6, %xmm5
punpckhwd %xmm6, %xmm4
movdqa %xmm5, %xmm6
punpcklwd %xmm4, %xmm5
punpckhwd %xmm4, %xmm6
movdqa %xmm3, %xmm4
punpcklwd %xmm6, %xmm5
punpckhwd %xmm2, %xmm4
punpcklwd %xmm2, %xmm3
movdqa %xmm3, %xmm6
punpcklwd %xmm4, %xmm3
punpckhwd %xmm4, %xmm6
punpcklwd %xmm6, %xmm3
por %xmm3, %xmm5
movdqa %xmm5, (%ebx,%ecx)
addl $16, %ecx
cmpw %di, %bp
ja .L19
# Restore the scalar factor; continue in the scalar loop when the vector
# loop did not cover every pair.
cmpw 2(%esp), %ax
movss 4(%esp), %xmm4
jne .L18
jmp .L15
.p2align 4,,7
.p2align 3
# Vector set-up without swap: %xmm6 = broadcast scale factor, %xmm1 /
# %xmm0 = lower / upper clamp vectors.
.L33:
testw %bp, %bp
.p2align 4,,3
.p2align 3
je .L28
movaps %xmm4, %xmm0
xorl %eax, %eax
movdqa .LC1, %xmm1
shufps $0, %xmm0, %xmm0
xorl %ecx, %ecx
movaps %xmm0, %xmm6
movdqa .LC2, %xmm0
.p2align 4,,7
.p2align 3
# Vector loop without swap: scale, truncate, clamp, then keep the low
# 16 bits of each 32-bit lane through the punpck*wd sequence.
.L24:
movaps %xmm6, %xmm3
addl $1, %ecx
movdqa %xmm0, %xmm7
movaps %xmm6, %xmm2
mulps (%edx,%eax,2), %xmm3
mulps 16(%edx,%eax,2), %xmm2
cvttps2dq %xmm3, %xmm3
movdqa %xmm3, %xmm5
pcmpgtd %xmm1, %xmm5
pand %xmm5, %xmm3
pandn %xmm1, %xmm5
por %xmm5, %xmm3
cvttps2dq %xmm2, %xmm2
movdqa %xmm3, %xmm5
pcmpgtd %xmm0, %xmm5
pand %xmm5, %xmm7
pandn %xmm3, %xmm5
movdqa %xmm5, %xmm3
movdqa %xmm2, %xmm5
por %xmm7, %xmm3
pcmpgtd %xmm1, %xmm5
movdqa %xmm0, %xmm7
pand %xmm5, %xmm2
pandn %xmm1, %xmm5
por %xmm5, %xmm2
movdqa %xmm2, %xmm5
pcmpgtd %xmm0, %xmm5
pand %xmm5, %xmm7
pandn %xmm2, %xmm5
movdqa %xmm5, %xmm2
movdqa %xmm3, %xmm5
por %xmm7, %xmm2
punpckhwd %xmm2, %xmm5
punpcklwd %xmm2, %xmm3
movdqa %xmm3, %xmm7
punpcklwd %xmm5, %xmm3
punpckhwd %xmm5, %xmm7
punpcklwd %xmm7, %xmm3
movdqa %xmm3, (%ebx,%eax)
addl $16, %eax
cmpw %cx, %di
ja .L24
# Continue in the scalar loop when the vector loop did not cover every pair.
cmpw %bp, 2(%esp)
jne .L23
jmp .L15
.size alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
# Constants used by alignedConvertToS16SSE2, placed in mergeable sections.
.section .rodata.cst4,"aM",@progbits,4
.align 4
# .LC0 holds 0x46FFFE00 = 32767.0f; it is multiplied by the gain argument
# to form the float -> 16-bit scale factor.
.LC0:
.long 1191181824
.section .rodata.cst16,"aM",@progbits,16
.align 16
# .LC1 holds four lanes of -32768, the lower clamp bound of the vector loops.
.LC1:
.long -32768
.long -32768
.long -32768
.long -32768
.align 16
# .LC2 holds four lanes of 32767, the upper clamp bound of the vector loops.
.LC2:
.long 32767
.long 32767
.long 32767
.long 32767
.align 16
# .LC3 holds four lanes of 65280 (0xFF00); the byte-swapping vector loop
# uses it to isolate bits 8..15 before shifting them down.
.LC3:
.long 65280
.long 65280
.long 65280
.long 65280
.ident "GCC: (GNU) 4.4.0 20081110 (experimental)"
.section .note.GNU-stack,"",@progbits

View File

@@ -28,6 +28,7 @@
#include <QtXml/QDomElement>
#include "fx_mixer.h"
#include "basic_ops.h"
#include "effect.h"
#include "song.h"
@@ -38,7 +39,7 @@ fxChannel::fxChannel( model * _parent ) :
m_stillRunning( false ),
m_peakLeft( 0.0f ),
m_peakRight( 0.0f ),
m_buffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ),
m_buffer( alignedAllocFrames( engine::getMixer()->framesPerPeriod() ) ),
m_muteModel( false, _parent ),
m_volumeModel( 1.0, 0.0, 2.0, 0.01, _parent ),
m_name(),
@@ -53,7 +54,7 @@ fxChannel::fxChannel( model * _parent ) :
fxChannel::~fxChannel()
{
delete[] m_buffer;
alignedFreeFrames( m_buffer );
}
@@ -92,13 +93,7 @@ void fxMixer::mixToChannel( const sampleFrame * _buf, fx_ch_t _ch )
if( m_fxChannels[_ch]->m_muteModel.value() == false )
{
m_fxChannels[_ch]->m_lock.lock();
sampleFrame * buf = m_fxChannels[_ch]->m_buffer;
for( f_cnt_t f = 0; f < engine::getMixer()->framesPerPeriod();
++f )
{
buf[f][0] += _buf[f][0];
buf[f][1] += _buf[f][1];
}
alignedBufMix( m_fxChannels[_ch]->m_buffer, _buf, engine::getMixer()->framesPerPeriod() );
m_fxChannels[_ch]->m_used = true;
m_fxChannels[_ch]->m_lock.unlock();
}

View File

@@ -57,6 +57,7 @@
#include "main_window.h"
#include "project_renderer.h"
#include "song.h"
#include "basic_ops.h"
#warning TODO: move somewhere else
static inline QString baseName( const QString & _file )
@@ -78,12 +79,29 @@ inline void loadTranslation( const QString & _tname,
}
Uint32 convertToS16( const sampleFrameA * RP _ab,
const fpp_t _frames,
const float _master_gain,
intSampleFrameA * RP _output_buffer,
const bool _convert_endian );
int main( int argc, char * * argv )
{
// initialize RNG
srand( getpid() + time( 0 ) );
// init CPU specific optimized basic ops
initBasicOps();
#if 0
sampleFrameA * buf = (sampleFrameA *) alignedMalloc( sizeof( sampleFrameA ) * 256 );
intSampleFrameA * obuf = (intSampleFrameA*)alignedMalloc( sizeof( intSampleFrameA ) * 256 );
for( int i = 0; i< 1000000; ++i )
{
convertToS16( buf, 256, 0.7, obuf, false );
}
return 0;
#endif
bool core_only = FALSE;
for( int i = 1; i < argc; ++i )

View File

@@ -41,6 +41,7 @@
#include "sample_play_handle.h"
#include "piano_roll.h"
#include "micro_timer.h"
#include "basic_ops.h"
#include "audio_device.h"
#include "midi_client.h"
@@ -61,40 +62,15 @@
#include "midi_winmm.h"
#include "midi_dummy.h"
#ifdef LMMS_HAVE_PTHREAD_H
#include <pthread.h>
#endif
static QVector<fx_ch_t> __fx_channel_jobs( NumFxChannels );
static void aligned_free( void * _buf )
{
if( _buf != NULL )
{
int *ptr2=(int *)_buf - 1;
_buf = (char *)_buf- *ptr2;
free(_buf);
}
}
static void * aligned_malloc( int _bytes )
{
char *ptr,*ptr2,*aligned_ptr;
int align_mask = ALIGN_SIZE- 1;
ptr=(char *)malloc(_bytes +ALIGN_SIZE+ sizeof(int));
if(ptr==NULL) return(NULL);
ptr2 = ptr + sizeof(int);
aligned_ptr = ptr2 + (ALIGN_SIZE- ((size_t)ptr2 & align_mask));
ptr2 = aligned_ptr - sizeof(int);
*((int *)ptr2)=(int)(aligned_ptr - ptr);
return(aligned_ptr);
}
class mixerWorkerThread : public QThread
{
public:
@@ -152,9 +128,7 @@ public:
mixerWorkerThread( int _worker_num, mixer * _mixer ) :
QThread( _mixer ),
m_workingBuf( (sampleFrame *) aligned_malloc(
_mixer->framesPerPeriod() *
sizeof( sampleFrame ) ) ),
m_workingBuf( alignedAllocFrames( _mixer->framesPerPeriod() ) ),
m_workerNum( _worker_num ),
m_quit( false ),
m_mixer( _mixer ),
@@ -165,7 +139,7 @@ public:
virtual ~mixerWorkerThread()
{
aligned_free( m_workingBuf );
alignedFreeFrames( m_workingBuf );
}
virtual void quit( void )
@@ -234,11 +208,11 @@ private:
{
#if 0
#ifdef LMMS_BUILD_LINUX
#ifdef LMMS_HAVE_SCHED_H
#ifdef LMMS_HAVE_PTHREAD_H
cpu_set_t mask;
CPU_ZERO( &mask );
CPU_SET( m_workerNum, &mask );
sched_setaffinity( 0, sizeof( mask ), &mask );
pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );
#endif
#endif
#endif
@@ -310,7 +284,8 @@ mixer::mixer( void ) :
{
m_inputBufferFrames[i] = 0;
m_inputBufferSize[i] = DEFAULT_BUFFER_SIZE * 100;
m_inputBuffer[i] = new sampleFrame[ DEFAULT_BUFFER_SIZE * 100 ];
m_inputBuffer[i] = alignedAllocFrames(
DEFAULT_BUFFER_SIZE * 100 );
clearAudioBuffer( m_inputBuffer[i], m_inputBufferSize[i] );
}
@@ -351,14 +326,10 @@ mixer::mixer( void ) :
m_fifo = new fifo( 1 );
}
m_workingBuf = (sampleFrame*) aligned_malloc( m_framesPerPeriod *
sizeof( sampleFrame ) );
m_workingBuf = alignedAllocFrames( m_framesPerPeriod );
for( Uint8 i = 0; i < 3; i++ )
{
m_readBuf = (surroundSampleFrame*)
aligned_malloc( m_framesPerPeriod *
sizeof( surroundSampleFrame ) );
m_readBuf = alignedAllocFrames( m_framesPerPeriod );
clearAudioBuffer( m_readBuf, m_framesPerPeriod );
m_bufferPool.push_back( m_readBuf );
}
@@ -409,10 +380,10 @@ mixer::~mixer()
for( Uint8 i = 0; i < 3; i++ )
{
aligned_free( m_bufferPool[i] );
alignedFreeFrames( m_bufferPool[i] );
}
aligned_free( m_workingBuf );
alignedFreeFrames( m_workingBuf );
}
@@ -524,9 +495,9 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )
if( frames + _frames > size )
{
size = qMax( size * 2, frames + _frames );
sampleFrame * ab = new sampleFrame[ size ];
memcpy( ab, buf, frames * sizeof( sampleFrame ) );
delete [] buf;
sampleFrame * ab = alignedAllocFrames( size );
alignedMemCpy( ab, buf, frames * sizeof( sampleFrame ) );
alignedFreeFrames( buf );
m_inputBufferSize[ m_inputBufferWrite ] = size;
m_inputBuffer[ m_inputBufferWrite ] = ab;
@@ -534,7 +505,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )
buf = ab;
}
memcpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) );
alignedMemCpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) );
m_inputBufferFrames[ m_inputBufferWrite ] += _frames;
unlockInputFrames();
@@ -543,7 +514,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )
const surroundSampleFrame * mixer::renderNextBuffer( void )
sampleFrameA * mixer::renderNextBuffer( void )
{
microTimer timer;
static song::playPos last_metro_pos = -1;
@@ -709,12 +680,9 @@ void mixer::bufferToPort( const sampleFrame * _buf,
const int loop1_frame = qMin<int>( end_frame, m_framesPerPeriod );
_port->lockFirstBuffer();
sampleFrame * obuf = _port->firstBuffer()+start_frame;
for( int frame = 0; frame < loop1_frame-start_frame; ++frame )
{
obuf[frame][0] += _buf[frame][0] * _vv.vol[0];
obuf[frame][1] += _buf[frame][1] * _vv.vol[1];
}
unalignedBufMixLRCoeff( _port->firstBuffer() + start_frame,
_buf, _vv.vol[0], _vv.vol[1],
loop1_frame - start_frame );
_port->unlockFirstBuffer();
_port->lockSecondBuffer();
@@ -723,14 +691,10 @@ void mixer::bufferToPort( const sampleFrame * _buf,
const int frames_done = m_framesPerPeriod - start_frame;
end_frame -= m_framesPerPeriod;
end_frame = qMin<int>( end_frame, m_framesPerPeriod );
sampleFrame * obuf = _port->secondBuffer();
for( fpp_t frame = 0; frame < end_frame; ++frame )
{
obuf[frame][0] += _buf[frames_done + frame][0] *
_vv.vol[0];
obuf[frame][1] += _buf[frames_done + frame][1] *
_vv.vol[1];
}
unalignedBufMixLRCoeff( _port->secondBuffer(),
_buf+frames_done,
_vv.vol[0], _vv.vol[1],
end_frame );
// we used both buffers so set flags
_port->m_bufferUsage = audioPort::BothBuffers;
}
@@ -748,7 +712,14 @@ void mixer::bufferToPort( const sampleFrame * _buf,
// Zeroes _frames frames of _ab, starting _offset frames into the buffer.
//
// The optimized alignedMemClear() is used only when the start address is a
// multiple of 16 and the frame count is a multiple of 8; every other request
// falls back to plain memset().
void mixer::clearAudioBuffer( sampleFrame * _ab, const f_cnt_t _frames,
						const f_cnt_t _offset )
{
	sampleFrame * const start = _ab + _offset;
	const size_t bytes = sizeof( *_ab ) * _frames;

	// Test the address as size_t: casting a pointer to int loses precision
	// on 64-bit hosts and is rejected by g++ there.
	if( likely( reinterpret_cast<size_t>( start ) % 16 == 0 &&
							_frames % 8 == 0 ) )
	{
		alignedMemClear( start, bytes );
	}
	else
	{
		// The buffer is cleared exactly once; an unconditional memset()
		// ahead of this branch would make the fast path pointless.
		memset( start, 0, bytes );
	}
}
@@ -1166,11 +1137,11 @@ void mixer::fifoWriter::run( void )
{
#if 0
#ifdef LMMS_BUILD_LINUX
#ifdef LMMS_HAVE_SCHED_H
#ifdef LMMS_HAVE_PTHREAD_H
cpu_set_t mask;
CPU_ZERO( &mask );
CPU_SET( 0, &mask );
sched_setaffinity( 0, sizeof( mask ), &mask );
pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );
#endif
#endif
#endif
@@ -1178,9 +1149,9 @@ void mixer::fifoWriter::run( void )
const fpp_t frames = m_mixer->framesPerPeriod();
while( m_writing )
{
surroundSampleFrame * buffer = new surroundSampleFrame[frames];
const surroundSampleFrame * b = m_mixer->renderNextBuffer();
memcpy( buffer, b, frames * sizeof( surroundSampleFrame ) );
sampleFrameA * buffer = alignedAllocFrames( frames );
const sampleFrameA * b = m_mixer->renderNextBuffer();
alignedMemCpy( buffer, b, frames * sizeof( sampleFrameA ) );
m_fifo->write( buffer );
}

View File

@@ -32,11 +32,12 @@
#include "audio_file_wave.h"
#include "audio_file_ogg.h"
#ifdef LMMS_HAVE_SCHED_H
#include <sched.h>
#ifdef LMMS_HAVE_PTHREAD_H
#include <pthread.h>
#endif
fileEncodeDevice __fileEncodeDevices[] =
{
@@ -148,11 +149,11 @@ void projectRenderer::run( void )
{
#if 0
#ifdef LMMS_BUILD_LINUX
#ifdef LMMS_HAVE_SCHED_H
#ifdef LMMS_HAVE_PTHREAD_H
cpu_set_t mask;
CPU_ZERO( &mask );
CPU_SET( 0, &mask );
sched_setaffinity( 0, sizeof( mask ), &mask );
pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );
#endif
#endif
#endif