experimental support for MMX/SSE/SSE2 instructions

git-svn-id: https://lmms.svn.sf.net/svnroot/lmms/trunk/lmms@1832 0778d3d1-df1d-0410-868b-ea421aaaa00d
2026-05-06 13:56:29 -04:00 · 2008-11-10 10:31:11 +00:00
parent 986fce1126
commit 22dc97f13d
34 changed files with 3198 additions and 283 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,6 +70,7 @@ ENDIF(LMMS_BUILD_WIN32)


 CHECK_INCLUDE_FILES(stdint.h LMMS_HAVE_STDINT_H)
+CHECK_INCLUDE_FILES(stdbool.h LMMS_HAVE_STDBOOL_H)
 CHECK_INCLUDE_FILES(stdlib.h LMMS_HAVE_STDLIB_H)
 CHECK_INCLUDE_FILES(pthread.h LMMS_HAVE_PTHREAD_H)
 CHECK_INCLUDE_FILES(semaphore.h LMMS_HAVE_SEMAPHORE_H)
@@ -387,6 +388,43 @@ SET(LMMS_ER_H ${CMAKE_CURRENT_BINARY_DIR}/embedded_resources.h)
 ADD_FILE_DEPENDENCIES(${CMAKE_BINARY_DIR}/lmmsconfig.h ${lmms_MOC_out})

 ADD_CUSTOM_COMMAND(OUTPUT ${LMMS_ER_H} COMMAND ${BIN2RES} ARGS ${lmms_EMBEDDED_RESOURCES} > ${LMMS_ER_H} DEPENDS ${BIN2RES})
+SET(BASIC_OPS_X86_C "${CMAKE_SOURCE_DIR}/src/core/basic_ops_x86.c")
+
+IF(LMMS_HOST_X86 OR LMMS_HOST_X86_64)
+
+# maintainer-only: regenerates the .s sources; to be used with special ultra-optimizing super duper GCC (SVN_C_COMPILER)
+ADD_CUSTOM_TARGET(regen-basic-ops)
+
+IF(LMMS_HOST_X86)
+SET(opt_targets mmx sse sse2)
+SET(host_arch x86)
+ELSE(LMMS_HOST_X86)
+SET(opt_targets sse sse2)
+SET(host_arch x86_64)
+ENDIF(LMMS_HOST_X86)
+
+FOREACH(opt_target ${opt_targets})
+	STRING(TOUPPER ${opt_target} OPT_TARGET)
+
+	SET(BASIC_OPS_X86_TARGET_S "${CMAKE_SOURCE_DIR}/src/core/basic_ops_${host_arch}_${opt_target}.s")
+	SET(BASIC_OPS_X86_TARGET_O "${CMAKE_BINARY_DIR}/basic_ops_${host_arch}_${opt_target}.o")
+	# reset on every iteration so MMX never inherits -mfpmath=sse from an earlier target or an outer scope
+	SET(FPMATH_FLAGS "")
+	IF(NOT "${OPT_TARGET}" STREQUAL "MMX")
+		SET(FPMATH_FLAGS "-mfpmath=sse")
+	ENDIF(NOT "${OPT_TARGET}" STREQUAL "MMX")
+	IF(EXISTS "$ENV{SVN_C_COMPILER}")
+		SET(C_COMPILER $ENV{SVN_C_COMPILER})
+	ELSE(EXISTS "$ENV{SVN_C_COMPILER}")
+		SET(C_COMPILER ${CMAKE_C_COMPILER})
+	ENDIF(EXISTS "$ENV{SVN_C_COMPILER}")
+	ADD_CUSTOM_TARGET(regen-basic-ops-${opt_target} COMMAND ${C_COMPILER} -O2 -ftree-vectorize -ftree-vectorizer-verbose=2 -fomit-frame-pointer -c -S -I${CMAKE_SOURCE_DIR}/include -I${CMAKE_BINARY_DIR} -g0 -DBUILD_${OPT_TARGET} -m${opt_target} ${FPMATH_FLAGS} -o ${BASIC_OPS_X86_TARGET_S} ${BASIC_OPS_X86_C} DEPENDS ${BASIC_OPS_X86_C})
+	ADD_CUSTOM_COMMAND(OUTPUT ${BASIC_OPS_X86_TARGET_O} COMMAND ${CMAKE_C_COMPILER} ARGS ${BASIC_OPS_X86_TARGET_S} -c -o ${BASIC_OPS_X86_TARGET_O} DEPENDS ${BASIC_OPS_X86_TARGET_S})
+	ADD_DEPENDENCIES(regen-basic-ops regen-basic-ops-${opt_target})
+	SET(opt_target_objects ${opt_target_objects} ${BASIC_OPS_X86_TARGET_O})
+ENDFOREACH(opt_target ${opt_targets})
+SET(lmms_SOURCES ${lmms_SOURCES} ${opt_target_objects})
+ENDIF(LMMS_HOST_X86 OR LMMS_HOST_X86_64)

 IF(WIN32)
 	SET(WINRC "${CMAKE_BINARY_DIR}/lmmsrc.obj")
--- a/37
+++ b/37
@@ -1,3 +1,40 @@
+2008-11-10	Tobias Doerffel	<tobydox/at/users/dot/sourceforge/dot/net>
+
+	* include/audio_portaudio.h:
+	* include/lmms_basics.h:
+	* include/fifo_buffer.h:
+	* include/mixer.h:
+	* include/audio_port.h:
+	* include/audio_dummy.h:
+	* include/basic_ops.h:
+	* include/audio_sdl.h:
+	* include/audio_jack.h:
+	* include/audio_device.h:
+	* src/core/audio/audio_device.cpp:
+	* src/core/audio/audio_alsa.cpp:
+	* src/core/audio/audio_file_wave.cpp:
+	* src/core/audio/audio_sdl.cpp:
+	* src/core/audio/audio_oss.cpp:
+	* src/core/audio/audio_port.cpp:
+	* src/core/audio/audio_portaudio.cpp:
+	* src/core/audio/audio_jack.cpp:
+	* src/core/audio/audio_pulseaudio.cpp:
+	* src/core/basic_ops.cpp:
+	* src/core/basic_ops_x86.c:
+	* src/core/basic_ops_x86_mmx.s:
+	* src/core/basic_ops_x86_sse.s:
+	* src/core/basic_ops_x86_sse2.s:
+	* src/core/basic_ops_x86_64_sse.s:
+	* src/core/basic_ops_x86_64_sse2.s:
+	* src/core/mixer.cpp:
+	* src/core/main.cpp:
+	* src/core/project_renderer.cpp:
+	* src/core/fx_mixer.cpp:
+	* plugins/ladspa_effect/ladspa_effect.cpp:
+	* lmmsconfig.h.in:
+	* CMakeLists.txt:
+	experimental support for MMX/SSE/SSE2 instructions
+
 2008-11-04	Tobias Doerffel	<tobydox/at/users/dot/sourceforge/dot/net>

 	* plugins/sf2_player/sf2_player.cpp:
--- a/include/audio_device.h
+++ b/include/audio_device.h
@@ -121,31 +121,22 @@ public:
 protected:
 	// subclasses can re-implement this for being used in conjunction with
 	// processNextBuffer()
-	virtual void writeBuffer( const surroundSampleFrame * /* _buf*/,
+	virtual void writeBuffer( const sampleFrameA * /* _buf*/,
 						const fpp_t /*_frames*/,
 						const float /*_master_gain*/ )
 	{
 	}

 	// called by according driver for fetching new sound-data
-	fpp_t getNextBuffer( surroundSampleFrame * _ab );
-
-	// convert a given audio-buffer to a buffer in signed 16-bit samples
-	// returns num of bytes in outbuf
-	Uint32 convertToS16( const surroundSampleFrame * _ab,
-					const fpp_t _frames,
-					const float _master_gain,
-					int_sample_t * _output_buffer,
-					const bool _convert_endian = FALSE );
+	fpp_t getNextBuffer( sampleFrameA * _ab );

 	// clear given signed-int-16-buffer
-	void clearS16Buffer( int_sample_t * _outbuf,
-							const fpp_t _frames );
+	void clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames );

 	// resample given buffer from samplerate _src_sr to samplerate _dst_sr
-	void resample( const surroundSampleFrame * _src,
+	void resample( const sampleFrameA * _src,
 					const fpp_t _frames,
-					surroundSampleFrame * _dst,
+					sampleFrameA * _dst,
 					const sample_rate_t _src_sr,
 					const sample_rate_t _dst_sr );

@@ -161,9 +152,11 @@ protected:

 	bool hqAudio( void ) const;

+
 protected:
 	bool m_supportsCapture;

+
 private:
 	sample_rate_t m_sampleRate;
 	ch_cnt_t m_channels;
@@ -175,7 +168,7 @@ private:
 	SRC_DATA m_srcData;
 	SRC_STATE * m_srcState;

-	surroundSampleFrame * m_buffer;
+	sampleFrameA * m_buffer;

 } ;

--- a/include/audio_dummy.h
+++ b/include/audio_dummy.h
@@ -27,6 +27,7 @@
 #define _AUDIO_DUMMY_H

 #include "audio_device.h"
+#include "basic_ops.h"
 #include "micro_timer.h"


@@ -94,16 +95,16 @@ private:
 	virtual void run( void )
 	{
 		microTimer timer;
-		while( TRUE )
+		while( true )
 		{
 			timer.reset();
-			const surroundSampleFrame * b =
+			surroundSampleFrame * b =
 						getMixer()->nextBuffer();
 			if( !b )
 			{
 				break;
 			}
-			delete[] b;
+			alignedFreeFrames( b );

 			const Sint32 microseconds = static_cast<Sint32>(
 					getMixer()->framesPerPeriod() *
--- a/include/audio_jack.h
+++ b/include/audio_jack.h
@@ -94,7 +94,7 @@ private:
 	QSemaphore m_stop_semaphore;

 	QVector<jack_port_t *> m_outputPorts;
-	surroundSampleFrame * m_outBuf;
+	sampleFrameA * m_outBuf;


 	f_cnt_t m_framesDoneInCurBuf;
--- a/include/audio_port.h
+++ b/include/audio_port.h
@@ -40,14 +40,14 @@ public:
 	audioPort( const QString & _name, bool _has_effect_chain = true );
 	~audioPort();

-	inline sampleFrame * firstBuffer( void )
+	inline sampleFrameA * firstBuffer( void )
 	{
-		return( m_firstBuffer );
+		return m_firstBuffer;
 	}

-	inline sampleFrame * secondBuffer( void )
+	inline sampleFrameA * secondBuffer( void )
 	{
-		return( m_secondBuffer );
+		return m_secondBuffer;
 	}

 	inline void lockFirstBuffer( void )
@@ -76,7 +76,7 @@ public:
 	// indicate whether JACK & Co should provide output-buffer at ext. port
 	inline bool extOutputEnabled( void ) const
 	{
-		return( m_extOutputEnabled );
+		return m_extOutputEnabled;
 	}

 	void setExtOutputEnabled( bool _enabled );
@@ -86,12 +86,12 @@ public:
 	// (-1 = none  0 = master)
 	inline fx_ch_t nextFxChannel( void ) const
 	{
-		return( m_nextFxChannel );
+		return m_nextFxChannel;
 	}

 	inline effectChain * getEffects( void )
 	{
-		return( m_effects );
+		return m_effects;
 	}

 	void setNextFxChannel( const fx_ch_t _chnl )
@@ -102,7 +102,7 @@ public:

 	const QString & name( void ) const
 	{
-		return( m_name );
+		return m_name;
 	}

 	void setName( const QString & _new_name );
@@ -122,8 +122,8 @@ public:
 private:
 	volatile bufferUsages m_bufferUsage;

-	sampleFrame * m_firstBuffer;
-	sampleFrame * m_secondBuffer;
+	sampleFrameA * m_firstBuffer;
+	sampleFrameA * m_secondBuffer;
 	QMutex m_firstBufferLock;
 	QMutex m_secondBufferLock;

--- a/include/audio_portaudio.h
+++ b/include/audio_portaudio.h
@@ -140,7 +140,7 @@ private:

 	bool m_wasPAInitError;
 
-	surroundSampleFrame * m_outBuf;
+	sampleFrameA * m_outBuf;
 	int m_outBufPos;
 	int m_outBufSize;

--- a/include/audio_sdl.h
+++ b/include/audio_sdl.h
@@ -76,8 +76,8 @@ private:

 	SDL_AudioSpec m_audioHandle;

-	surroundSampleFrame * m_outBuf;
-	Uint8 * m_convertedBuf;
+	sampleFrameA * m_outBuf;
+	intSampleFrameA * m_convertedBuf;
 	int m_convertedBufPos;
 	int m_convertedBufSize;

--- a/include/basic_ops.h
+++ b/include/basic_ops.h
@@ -0,0 +1,94 @@
+/*
+ * basic_ops.h - basic memory operations
+ *
+ * Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
+ * 
+ * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program (see COPYING); if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA.
+ *
+ */
+
+
+#ifndef _BASIC_OPS_H
+#define _BASIC_OPS_H
+
+#include "lmms_basics.h"
+
+#ifdef LMMS_HAVE_STDBOOL_H
+#include <stdbool.h>
+#endif
+
+void initBasicOps( void );
+
+void * alignedMalloc( int _bytes );
+void alignedFree( void * _buf );
+
+sampleFrameA * alignedAllocFrames( int _frames );
+void alignedFreeFrames( sampleFrameA * _buf );
+
+
+// all aligned* functions assume data to be 16-byte aligned and size to be
+// a multiple of 64
+typedef void (*alignedMemCpyFunc)( void * RP _dst, const void * RP _src,
+								int _size );
+typedef void (*alignedMemClearFunc)( void * RP _dst, int _size );
+typedef void (*alignedBufApplyGainFunc)( sampleFrameA * RP _dst,
+						float _gain, int _frames );
+typedef void (*alignedBufMixFunc)( sampleFrameA * RP _dst,
+						const sampleFrameA * RP _src,
+								int _frames );
+typedef void (*alignedBufMixLRCoeffFunc)( sampleFrameA * RP _dst,
+						const sampleFrameA * RP _src,
+						float _left, float _right,
+								int _frames );
+typedef void (*unalignedBufMixLRCoeffFunc)( sampleFrame * RP _dst,
+						const sampleFrame * RP _src,
+						float _left, float _right,
+								int _frames );
+typedef void (*alignedBufWetDryMixFunc)( sampleFrameA * RP _dst,
+					const sampleFrameA * RP _src,
+					float _wet, float _dry, int _frames );
+typedef void (*alignedBufWetDryMixSplittedFunc)( sampleFrameA * RP _dst,
+					const float * RP _left,
+					const float * RP _right,
+					float _wet, float _dry, int _frames );
+typedef int (*alignedConvertToS16Func)( const sampleFrameA * RP _src,
+					intSampleFrameA * RP _dst,
+					const fpp_t _frames,
+					const float _master_gain,
+					const bool _convert_endian );
+
+extern alignedMemCpyFunc alignedMemCpy;
+extern alignedMemClearFunc alignedMemClear;
+extern alignedBufApplyGainFunc alignedBufApplyGain;
+extern alignedBufMixFunc alignedBufMix;
+extern alignedBufMixLRCoeffFunc alignedBufMixLRCoeff;
+extern unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff;
+extern alignedBufWetDryMixFunc alignedBufWetDryMix;
+extern alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted;
+extern alignedConvertToS16Func alignedConvertToS16;
+
+
+#ifdef LMMS_HOST_X86
+#define X86_OPTIMIZATIONS
+#endif
+#ifdef LMMS_HOST_X86_64
+#define X86_OPTIMIZATIONS
+#endif
+
+#endif
+
--- a/include/fifo_buffer.h
+++ b/include/fifo_buffer.h
@@ -2,6 +2,7 @@
 * fifo_buffer.h - FIFO fixed-size buffer
 *
 * Copyright (c) 2007 Javier Serrano Polo <jasp00/at/users.sourceforge.net>
+ * Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
 * 
 * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
 *
@@ -33,50 +34,50 @@ class fifoBuffer
 {
 public:
 	fifoBuffer( int _size ) :
-		m_reader_sem( _size ),
-		m_writer_sem( _size ),
-		m_reader_index( 0 ),
-		m_writer_index( 0 ),
+		m_readerSem( _size ),
+		m_writerSem( _size ),
+		m_readerIndex( 0 ),
+		m_writerIndex( 0 ),
 		m_size( _size )
 	{
 		m_buffer = new T[_size];
-		m_reader_sem.acquire( _size );
+		m_readerSem.acquire( _size );
 	}

 	~fifoBuffer()
 	{
 		delete[] m_buffer;
-		m_reader_sem.release( m_size );
+		m_readerSem.release( m_size );
 	}

 	void write( T _element )
 	{
-		m_writer_sem.acquire();
-		m_buffer[m_writer_index++] = _element;
-		m_writer_index %= m_size;
-		m_reader_sem.release();
+		m_writerSem.acquire();
+		m_buffer[m_writerIndex++] = _element;
+		m_writerIndex %= m_size;
+		m_readerSem.release();
 	}

 	T read( void )
 	{
-		m_reader_sem.acquire();
-		T element = m_buffer[m_reader_index++];
-		m_reader_index %= m_size;
-		m_writer_sem.release();
-		return( element );
+		m_readerSem.acquire();
+		T element = m_buffer[m_readerIndex++];
+		m_readerIndex %= m_size;
+		m_writerSem.release();
+		return element;
 	}

 	bool available( void )
 	{
-		return( m_reader_sem.available() );
+		return m_readerSem.available();
 	}


 private:
-	QSemaphore m_reader_sem;
-	QSemaphore m_writer_sem;
-	int m_reader_index;
-	int m_writer_index;
+	QSemaphore m_readerSem;
+	QSemaphore m_writerSem;
+	int m_readerIndex;
+	int m_writerIndex;
 	int m_size;
 	T * m_buffer;

--- a/include/lmms_basics.h
+++ b/include/lmms_basics.h
@@ -1,5 +1,5 @@
 /*
- * types.h - typedefs for common types that are used in the whole app
+ * lmms_basics.h - common basics for the whole App
 *
 * Copyright (c) 2004-2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
 * 
@@ -23,10 +23,8 @@
 */


-#ifndef _TYPES_H
-#define _TYPES_H
-
-#include <limits>
+#ifndef _LMMS_BASICS_H
+#define _LMMS_BASICS_H

 #include "lmmsconfig.h"

@@ -68,6 +66,9 @@ typedef Uint32 jo_id_t;			// (unique) ID of a journalling object
 #define likely(x)	__builtin_expect((x),1)
 #define unlikely(x)	__builtin_expect((x),0)

+#ifdef __cplusplus
+
+#include <limits>

 template<typename T>
 struct typeInfo
@@ -115,25 +116,50 @@ inline bool typeInfo<float>::isEqual( float _x, float _y )
 	return absVal( _x - _y ) < minEps();
 }

+#endif


-const ch_cnt_t DEFAULT_CHANNELS = 2;
-
-const ch_cnt_t SURROUND_CHANNELS =
+#define DEFAULT_CHANNELS 2
 #define LMMS_DISABLE_SURROUND
-#ifndef LMMS_DISABLE_SURROUND
-				4;
+#ifdef LMMS_DISABLE_SURROUND
+#define SURROUND_CHANNELS 2
 #else
-				2;
+#define SURROUND_CHANNELS 4
 #endif



 typedef sample_t sampleFrame[DEFAULT_CHANNELS];
 typedef sample_t surroundSampleFrame[SURROUND_CHANNELS];
+
 #define ALIGN_SIZE 16
+
 #if __GNUC__
+
 typedef sample_t sampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE)));
+typedef int_sample_t intSampleFrameA[DEFAULT_CHANNELS] __attribute__((__aligned__(ALIGN_SIZE)));
+#define RP __restrict__
+
+#else
+
+#define RP
+
+#endif
+
+
+#ifdef __cplusplus
+const int BYTES_PER_SAMPLE = sizeof( sample_t );
+const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t );
+const int BYTES_PER_FRAME = sizeof( sampleFrame );
+const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame );
+
+const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f;
+#else
+#define BYTES_PER_SAMPLE sizeof( sample_t )
+#define BYTES_PER_INT_SAMPLE sizeof( int_sample_t )
+#define BYTES_PER_FRAME sizeof( sampleFrame )
+#define BYTES_PER_SURROUND_FRAME sizeof( surroundSampleFrame )
+#define OUTPUT_SAMPLE_MULTIPLIER 32767.0f
 #endif


--- a/include/mixer.h
+++ b/include/mixer.h
@@ -57,13 +57,6 @@ class audioPort;

 const fpp_t DEFAULT_BUFFER_SIZE = 256;

-const int BYTES_PER_SAMPLE = sizeof( sample_t );
-const int BYTES_PER_INT_SAMPLE = sizeof( int_sample_t );
-const int BYTES_PER_FRAME = sizeof( sampleFrame );
-const int BYTES_PER_SURROUND_FRAME = sizeof( surroundSampleFrame );
-
-const float OUTPUT_SAMPLE_MULTIPLIER = 32767.0f;
-

 const float BaseFreq = 440.0f;
 const Keys BaseKey = Key_A;
@@ -361,7 +354,7 @@ public:
 		return m_inputBufferFrames[ m_inputBufferRead ];
 	}

-	inline const surroundSampleFrame * nextBuffer( void )
+	inline surroundSampleFrame * nextBuffer( void )
 	{
 		return hasFifoWriter() ? m_fifo->read() : renderNextBuffer();
 	}
@@ -407,7 +400,7 @@ private:
 	midiClient * tryMidiClients( void );


-	const surroundSampleFrame * renderNextBuffer( void );
+	surroundSampleFrame * renderNextBuffer( void );



--- a/lmmsconfig.h.in
+++ b/lmmsconfig.h.in
@@ -19,6 +19,7 @@
 #cmakedefine LMMS_HAVE_VST

 #cmakedefine LMMS_HAVE_STDINT_H
+#cmakedefine LMMS_HAVE_STDBOOL_H
 #cmakedefine LMMS_HAVE_STDLIB_H
 #cmakedefine LMMS_HAVE_PTHREAD_H
 #cmakedefine LMMS_HAVE_UNISTD_H
--- a/plugins/ladspa_effect/ladspa_effect.cpp
+++ b/plugins/ladspa_effect/ladspa_effect.cpp
@@ -34,6 +34,7 @@
 #include "ladspa_subplugin_features.h"
 #include "mixer.h"
 #include "effect_chain.h"
+#include "basic_ops.h"
 #include "automation_pattern.h"


@@ -144,7 +145,7 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
 	if( m_maxSampleRate < engine::getMixer()->processingSampleRate() )
 	{
 		o_buf = _buf;
-		_buf = new sampleFrame[_frames];
+		_buf = alignedAllocFrames( _frames );
 		sampleDown( o_buf, _buf, m_maxSampleRate );
 		frames = _frames * m_maxSampleRate /
 				engine::getMixer()->processingSampleRate();
@@ -217,8 +218,8 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
 	// Copy the LADSPA output buffers to the LMMS buffer.
 	double out_sum = 0.0;
 	channel = 0;
-	const float d = getDryLevel();
-	const float w = getWetLevel();
+	float * buffers[2];
+
 	for( ch_cnt_t proc = 0; proc < getProcessorCount(); ++proc )
 	{
 		for( int port = 0; port < m_portCount; ++port )
@@ -231,17 +232,9 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
 				case CONTROL_RATE_INPUT:
 					break;
 				case CHANNEL_OUT:
-					for( fpp_t frame = 0; 
-						frame < frames; ++frame )
+					if( channel < DEFAULT_CHANNELS )
 					{
-						_buf[frame][channel] = 
-							d * 
-							_buf[frame][channel] +
-							w *
-							pp->buffer[frame];
-						out_sum += 
-							_buf[frame][channel] *
-							_buf[frame][channel];
+						buffers[channel] = pp->buffer;
 					}
 					++channel;
 					break;
@@ -254,10 +247,27 @@ bool ladspaEffect::processAudioBuffer( sampleFrame * _buf,
 		}
 	}

+	if( channel == 1 )
+	{
+		buffers[1] = buffers[0];
+	}
+	if( channel >= 1 && channel <= DEFAULT_CHANNELS )
+	{
+		alignedBufWetDryMixSplitted( _buf, buffers[0], buffers[1],
+					getWetLevel(), getDryLevel(), frames );
+	}
+
+	for( int i = 0; i < frames; ++i )
+	{
+		out_sum += _buf[i][0]*_buf[i][0];
+		out_sum += _buf[i][1]*_buf[i][1];
+
+	}
+
 	if( o_buf != NULL )
 	{
 		sampleBack( _buf, o_buf, m_maxSampleRate );
-		delete[] _buf;
+		alignedFreeFrames( _buf );
 	}

 	checkGate( out_sum / frames );
--- a/src/core/audio/audio_alsa.cpp
+++ b/src/core/audio/audio_alsa.cpp
@@ -39,6 +39,7 @@
 #include "lcd_spinbox.h"
 #include "gui_templates.h"
 #include "templates.h"
+#include "basic_ops.h"



@@ -229,13 +230,15 @@ void audioALSA::applyQualitySettings( void )

 void audioALSA::run( void )
 {
-	surroundSampleFrame * temp =
-		new surroundSampleFrame[getMixer()->framesPerPeriod()];
-	int_sample_t * outbuf =
-			new int_sample_t[getMixer()->framesPerPeriod() *
-								channels()];
+	sampleFrameA * temp = alignedAllocFrames(
+					getMixer()->framesPerPeriod() );
+	intSampleFrameA * outbuf = (intSampleFrameA *)
+		alignedMalloc( sizeof( intSampleFrameA ) * channels() /
+			DEFAULT_CHANNELS * getMixer()->framesPerPeriod() );
+
 	int_sample_t * pcmbuf = new int_sample_t[m_periodSize * channels()];

+
 	int outbuf_size = getMixer()->framesPerPeriod() * channels();
 	int outbuf_pos = 0;
 	int pcmbuf_size = m_periodSize * channels();
@@ -254,16 +257,15 @@ void audioALSA::run( void )
 				if( !frames )
 				{
 					quit = TRUE;
-					memset( ptr, 0, len
+					alignedMemClear( ptr, len
 						* sizeof( int_sample_t ) );
 					break;
 				}
 				outbuf_size = frames * channels();

-				convertToS16( temp, frames,
+				alignedConvertToS16( temp, outbuf, frames,
 						getMixer()->masterGain(),
-						outbuf,
-						m_convertEndian );
+							m_convertEndian );
 			}
 			int min_len = qMin( len, outbuf_size - outbuf_pos );
 			memcpy( ptr, outbuf + outbuf_pos,
@@ -300,8 +302,8 @@ void audioALSA::run( void )
 		}
 	}

-	delete[] temp;
-	delete[] outbuf;
+	alignedFreeFrames( temp );
+	alignedFree( outbuf );
 	delete[] pcmbuf;
 }

--- a/src/core/audio/audio_device.cpp
+++ b/src/core/audio/audio_device.cpp
@@ -31,6 +31,7 @@
 #include "audio_device.h"
 #include "config_mgr.h"
 #include "debug.h"
+#include "basic_ops.h"



@@ -39,7 +40,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) :
 	m_sampleRate( _mixer->processingSampleRate() ),
 	m_channels( _channels ),
 	m_mixer( _mixer ),
-	m_buffer( new surroundSampleFrame[getMixer()->framesPerPeriod()] )
+	m_buffer( alignedAllocFrames( getMixer()->framesPerPeriod() ) )
 {
 	int error;
 	if( ( m_srcState = src_new(
@@ -56,7 +57,7 @@ audioDevice::audioDevice( const ch_cnt_t _channels, mixer * _mixer ) :
 audioDevice::~audioDevice()
 {
 	src_delete( m_srcState );
-	delete[] m_buffer;
+	alignedFreeFrames( m_buffer );

 	m_devMutex.tryLock();
 	unlock();
@@ -81,10 +82,10 @@ void audioDevice::processNextBuffer( void )



-fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )
+fpp_t audioDevice::getNextBuffer( sampleFrameA * _ab )
 {
 	fpp_t frames = getMixer()->framesPerPeriod();
-	const surroundSampleFrame * b = getMixer()->nextBuffer();
+	sampleFrameA * b = getMixer()->nextBuffer();
 	if( !b )
 	{
 		return( 0 );
@@ -103,7 +104,7 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )
 	}
 	else
 	{
-		memcpy( _ab, b, frames * sizeof( surroundSampleFrame ) );
+		alignedMemCpy( _ab, b, frames * sizeof( surroundSampleFrame ) );
 	}

 	// release lock
@@ -111,10 +112,10 @@ fpp_t audioDevice::getNextBuffer( surroundSampleFrame * _ab )

 	if( getMixer()->hasFifoWriter() )
 	{
-		delete[] b;
+		alignedFreeFrames( b );
 	}

-	return( frames );
+	return frames;
 }


@@ -171,11 +172,10 @@ void audioDevice::renamePort( audioPort * )



-void audioDevice::resample( const surroundSampleFrame * _src,
-						const fpp_t _frames,
-						surroundSampleFrame * _dst,
-						const sample_rate_t _src_sr,
-						const sample_rate_t _dst_sr )
+void audioDevice::resample( const sampleFrame * _src, const fpp_t _frames,
+					sampleFrame * _dst,
+					const sample_rate_t _src_sr,
+					const sample_rate_t _dst_sr )
 {
 	if( m_srcState == NULL )
 	{
@@ -197,57 +197,11 @@ void audioDevice::resample( const surroundSampleFrame * _src,



-Uint32 audioDevice::convertToS16( const surroundSampleFrame * _ab,
-						const fpp_t _frames,
-						const float _master_gain,
-						int_sample_t * _output_buffer,
-						const bool _convert_endian )
+
+void audioDevice::clearS16Buffer( intSampleFrameA * _outbuf, const fpp_t _frames )
 {
-	if( _convert_endian )
-	{
-		Uint16 temp;
-		for( fpp_t frame = 0; frame < _frames; ++frame )
-		{
-			for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl )
-			{
-				temp = static_cast<int_sample_t>(
-						mixer::clip( _ab[frame][chnl] *
-						_master_gain ) *
-						OUTPUT_SAMPLE_MULTIPLIER );
-				
-				( _output_buffer + frame * channels() )[chnl] =
-						( temp & 0x00ff ) << 8 |
-						( temp & 0xff00 ) >> 8;
-			}
-		}
-	}
-	else
-	{
-		for( fpp_t frame = 0; frame < _frames; ++frame )
-		{
-			for( ch_cnt_t chnl = 0; chnl < channels(); ++chnl )
-			{
-				( _output_buffer + frame * channels() )[chnl] =
-						static_cast<int_sample_t>(
-						mixer::clip( _ab[frame][chnl] *
-						_master_gain ) *
-						OUTPUT_SAMPLE_MULTIPLIER );
-			}
-		}
-	}
-
-	return( _frames * channels() * BYTES_PER_INT_SAMPLE );
-}
-
-
-
-
-void audioDevice::clearS16Buffer( int_sample_t * _outbuf, const fpp_t _frames )
-{
-#ifdef LMMS_DEBUG
-	assert( _outbuf != NULL );
-#endif
-	memset( _outbuf, 0,  _frames * channels() * BYTES_PER_INT_SAMPLE );
+	alignedMemClear( _outbuf, _frames * sizeof( *_outbuf ) );
+//	memset( _outbuf, 0,  _frames * channels() * BYTES_PER_INT_SAMPLE );
 }


--- a/src/core/audio/audio_file_wave.cpp
+++ b/src/core/audio/audio_file_wave.cpp
@@ -29,6 +29,7 @@

 #include "audio_file_wave.h"
 #include "endian_handling.h"
+#include "basic_ops.h"

 #include <cstring>

@@ -101,12 +102,14 @@ void audioFileWave::writeBuffer( const surroundSampleFrame * _ab,
 	}
 	else
 	{
-		int_sample_t * buf = new int_sample_t[_frames * channels()];
-		convertToS16( _ab, _frames, _master_gain, buf,
+		intSampleFrameA * buf = (intSampleFrameA *)
+				alignedMalloc(
+					sizeof( intSampleFrameA ) * _frames );
+		alignedConvertToS16( _ab, buf, _frames, _master_gain,
 							!isLittleEndian() );

-		sf_writef_short( m_sf, buf, _frames );
-		delete[] buf;
+		sf_writef_short( m_sf, (int_sample_t *) buf, _frames );
+		alignedFree( buf );
 	}
 }

--- a/src/core/audio/audio_jack.cpp
+++ b/src/core/audio/audio_jack.cpp
@@ -45,6 +45,7 @@
 #include "config_mgr.h"
 #include "lcd_spinbox.h"
 #include "audio_port.h"
+#include "basic_ops.h"



@@ -57,7 +58,7 @@ audioJACK::audioJACK( bool & _success_ful, mixer * _mixer ) :
 	m_client( NULL ),
 	m_active( FALSE ),
 	m_stop_semaphore( 1 ),
-	m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
+	m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
 	m_framesDoneInCurBuf( 0 ),
 	m_framesToDoInCurBuf( 0 )
 {
@@ -159,7 +160,7 @@ audioJACK::~audioJACK()
 		jack_client_close( m_client );
 	}

-	delete[] m_outBuf;
+	alignedFreeFrames( m_outBuf );

 }

@@ -367,14 +368,14 @@ int audioJACK::processCallback( jack_nframes_t _nframes, void * _udata )
 						_this->m_framesDoneInCurBuf );
 		if( ts == JackTransportRolling )
 		{
+			const float gain = _this->getMixer()->masterGain();
 			for( Uint8 chnl = 0; chnl < _this->channels(); ++chnl )
 			{
 				for( jack_nframes_t frame = 0; frame < todo;
 								++frame )
 				{
 					outbufs[chnl][done+frame] = 
-		_this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] *
-						_this->getMixer()->masterGain();
+		_this->m_outBuf[_this->m_framesDoneInCurBuf+frame][chnl] * gain;
 				}
 			}
 		}
--- a/src/core/audio/audio_oss.cpp
+++ b/src/core/audio/audio_oss.cpp
@@ -39,6 +39,7 @@
 #include "engine.h"
 #include "gui_templates.h"
 #include "templates.h"
+#include "basic_ops.h"

 #ifdef LMMS_HAVE_UNISTD_H
 #include <unistd.h>
@@ -298,13 +299,13 @@ void audioOSS::applyQualitySettings( void )

 void audioOSS::run( void )
 {
-	surroundSampleFrame * temp =
-		new surroundSampleFrame[getMixer()->framesPerPeriod()];
-	int_sample_t * outbuf =
-			new int_sample_t[getMixer()->framesPerPeriod() *
-								channels()];
+	sampleFrameA * temp = alignedAllocFrames(
+						getMixer()->framesPerPeriod() );
+	intSampleFrameA * outbuf = (intSampleFrameA *)
+			alignedMalloc( sizeof( intSampleFrameA ) *
+						getMixer()->framesPerPeriod() );

-	while( TRUE )
+	while( 1 )
 	{
 		const fpp_t frames = getNextBuffer( temp );
 		if( !frames )
@@ -312,8 +313,8 @@ void audioOSS::run( void )
 			break;
 		}

-		int bytes = convertToS16( temp, frames,
-				getMixer()->masterGain(), outbuf,
+		int bytes = alignedConvertToS16( temp, outbuf, frames,
+						getMixer()->masterGain(),
 							m_convertEndian );
 		if( write( m_audioFD, outbuf, bytes ) != bytes )
 		{
@@ -321,8 +322,8 @@ void audioOSS::run( void )
 		}
 	}

-	delete[] temp;
-	delete[] outbuf;
+	alignedFreeFrames( temp );
+	alignedFree( outbuf );
 }


--- a/src/core/audio/audio_port.cpp
+++ b/src/core/audio/audio_port.cpp
@@ -26,13 +26,15 @@
 #include "audio_device.h"
 #include "effect_chain.h"
 #include "engine.h"
+#include "basic_ops.h"


 audioPort::audioPort( const QString & _name, bool _has_effect_chain ) :
 	m_bufferUsage( NoUsage ),
-	m_firstBuffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ),
-	m_secondBuffer( new sampleFrame[
-				engine::getMixer()->framesPerPeriod()] ),
+	m_firstBuffer( alignedAllocFrames( 
+				engine::getMixer()->framesPerPeriod() ) ),
+	m_secondBuffer( alignedAllocFrames(
+				engine::getMixer()->framesPerPeriod() ) ),
 	m_extOutputEnabled( false ),
 	m_nextFxChannel( 0 ),
 	m_name( "unnamed port" ),
@@ -53,8 +55,8 @@ audioPort::~audioPort()
 {
 	setExtOutputEnabled( false );
 	engine::getMixer()->removeAudioPort( this );
-	delete[] m_firstBuffer;
-	delete[] m_secondBuffer;
+	alignedFreeFrames( m_firstBuffer );
+	alignedFreeFrames( m_secondBuffer );
 	delete m_effects;
 }

--- a/src/core/audio/audio_portaudio.cpp
+++ b/src/core/audio/audio_portaudio.cpp
@@ -55,11 +55,12 @@ void audioPortAudioSetupUtil::updateChannels( void )

 audioPortAudio::audioPortAudio( bool & _success_ful, mixer * _mixer ) :
 	audioDevice( tLimit<ch_cnt_t>(
-		configManager::inst()->value( "audioportaudio", "channels" ).toInt(),
+		configManager::inst()->value( "audioportaudio",
+							"channels" ).toInt(),
 					DEFAULT_CHANNELS, SURROUND_CHANNELS ),
 								_mixer ),
 	m_wasPAInitError( false ),
-	m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
+	m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
 	m_outBufPos( 0 ),
 	m_stopSemaphore( 1 )
 {
@@ -205,7 +206,7 @@ audioPortAudio::~audioPortAudio()
 	{
 		Pa_Terminate();
 	}
-	delete[] m_outBuf;
+	alignedFreeFrames( m_outBuf );
 }


--- a/src/core/audio/audio_pulseaudio.cpp
+++ b/src/core/audio/audio_pulseaudio.cpp
@@ -40,6 +40,7 @@
 #include "lcd_spinbox.h"
 #include "gui_templates.h"
 #include "templates.h"
+#include "basic_ops.h"


 static void stream_write_callback(pa_stream *s, size_t length, void *userdata)
@@ -230,8 +231,9 @@ void audioPulseAudio::run( void )
 void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
 {
 	const fpp_t fpp = getMixer()->framesPerPeriod();
-	surroundSampleFrame * temp = new surroundSampleFrame[fpp];
-	Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() * sizeof(Sint16) );
+	sampleFrameA * temp = alignedAllocFrames( fpp );
+	Sint16 * pcmbuf = (Sint16*)pa_xmalloc( fpp * channels() *
+							sizeof(Sint16) );

 	size_t fd = 0;
 	while( fd < length/4 )
@@ -241,9 +243,10 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
 		{
 			return;
 		}
-		int bytes = convertToS16( temp, frames,
+		int bytes = alignedConvertToS16( temp,
+						(intSampleFrameA *) pcmbuf,
+						frames,
 						getMixer()->masterGain(),
-						pcmbuf,
 						m_convertEndian );
 		if( bytes > 0 )
 		{
@@ -254,7 +257,7 @@ void audioPulseAudio::streamWriteCallback(pa_stream *s, size_t length)
 	}

 	pa_xfree( pcmbuf );
-	delete[] temp;
+	alignedFreeFrames( temp );
 }


--- a/src/core/audio/audio_sdl.cpp
+++ b/src/core/audio/audio_sdl.cpp
@@ -38,22 +38,22 @@
 #include "config_mgr.h"
 #include "gui_templates.h"
 #include "templates.h"
-
+#include "basic_ops.h"



 audioSDL::audioSDL( bool & _success_ful, mixer * _mixer ) :
 	audioDevice( DEFAULT_CHANNELS, _mixer ),
-	m_outBuf( new surroundSampleFrame[getMixer()->framesPerPeriod()] ),
+	m_outBuf( alignedAllocFrames( getMixer()->framesPerPeriod() ) ),
 	m_convertedBufPos( 0 ),
 	m_convertEndian( false ),
 	m_stopSemaphore( 1 )
 {
 	_success_ful = FALSE;

-	m_convertedBufSize = getMixer()->framesPerPeriod() * channels()
-						* sizeof( int_sample_t );
-	m_convertedBuf = new Uint8[m_convertedBufSize];
+	m_convertedBufSize = getMixer()->framesPerPeriod() *
+						sizeof( intSampleFrameA );
+	m_convertedBuf = (intSampleFrameA *) alignedMalloc( m_convertedBufSize );


 	if( SDL_Init( SDL_INIT_AUDIO | SDL_INIT_NOPARACHUTE ) < 0 )
@@ -97,8 +97,8 @@ audioSDL::~audioSDL()

 	SDL_CloseAudio();
 	SDL_Quit();
-	delete[] m_convertedBuf;
-	delete[] m_outBuf;
+	alignedFree( m_convertedBuf );
+	alignedFreeFrames( m_outBuf );
 }


@@ -190,12 +190,12 @@ void audioSDL::sdlAudioCallback( Uint8 * _buf, int _len )
 				memset( _buf, 0, _len );
 				return;
 			}
-			m_convertedBufSize = frames * channels()
-						* sizeof( int_sample_t );
+			m_convertedBufSize = frames * sizeof( intSampleFrameA );

-			convertToS16( m_outBuf, frames,
+			alignedConvertToS16( m_outBuf,
+						m_convertedBuf,
+						frames,
 						getMixer()->masterGain(),
-						(int_sample_t *)m_convertedBuf,
 						m_convertEndian );
 		}
 		const int min_len = qMin( _len, m_convertedBufSize
--- a/src/core/basic_ops.cpp
+++ b/src/core/basic_ops.cpp
@@ -0,0 +1,455 @@
+/*
+ * basic_ops.cpp - basic memory operations
+ *
+ * Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
+ * 
+ * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program (see COPYING); if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA.
+ *
+ */
+
+
+#include "basic_ops.h"
+
+#include <cstdlib>
+#include <cstdio>
+#include <memory.h>
+
+
+
+// Allocates _bytes of storage whose start address is a multiple of
+// ALIGN_SIZE (assumes ALIGN_SIZE is a power of two - TODO confirm in
+// basic_ops.h). Returns NULL if the underlying malloc() fails. The result
+// must be released with alignedFree(), never with free().
+void * alignedMalloc( int _bytes )
+{
+	const int align_mask = ALIGN_SIZE - 1;
+
+	// over-allocate: room for the back-offset word plus worst-case padding
+	char * const raw = (char *) malloc( _bytes + ALIGN_SIZE + sizeof(int) );
+	if( raw == NULL )
+	{
+		return NULL;
+	}
+
+	// lowest address that still leaves room for the offset word below it
+	char * const first_usable = raw + sizeof(int);
+
+	// always step forward (a full ALIGN_SIZE if first_usable is already
+	// aligned), which keeps the offset word inside the raw block
+	char * const aligned = first_usable +
+			( ALIGN_SIZE - ( (size_t) first_usable & align_mask ) );
+
+	// remember how far we moved so alignedFree() can find the raw block
+	int * const offset_slot = (int *)( aligned - sizeof(int) );
+	*offset_slot = (int)( aligned - raw );
+
+	return aligned;
+}
+
+
+// Releases a buffer obtained from alignedMalloc(). NULL is accepted and
+// ignored. The int stored directly below _buf holds the distance back to
+// the pointer originally returned by malloc().
+void alignedFree( void * _buf )
+{
+	if( _buf == NULL )
+	{
+		return;
+	}
+
+	const int back_offset = *( (int *) _buf - 1 );
+	// free( NULL ) is a no-op, so no further check is needed here
+	free( (char *) _buf - back_offset );
+}
+
+
+// Allocates an aligned array of _n sample frames via alignedMalloc().
+// Returns NULL when the allocation fails.
+sampleFrameA * alignedAllocFrames( int _n )
+{
+	// total size of the frame array in bytes
+	const int bytes = _n * sizeof( sampleFrameA );
+	void * mem = alignedMalloc( bytes );
+	return (sampleFrameA *) mem;
+}
+
+
+// Releases a frame buffer by handing it to alignedFree().
+void alignedFreeFrames( sampleFrame * _buf )
+{
+	void * mem = _buf;
+	alignedFree( mem );
+}
+
+
+
+
+// slow fallback
+//
+// Copies _size bytes from _src to _dst in blocks of 16 ints. Only whole
+// blocks are copied; a trailing remainder smaller than one block is left
+// untouched, matching the block-wise SSE/SSE2 variants.
+//
+// Fix: the previous loop compared an int index that advanced by 16 against
+// the number of *blocks*, so only about 1/16 of the requested data was
+// copied. The loop now iterates once per block and advances the pointers.
+void alignedMemCpyNoOpt( void * RP _dst, const void * RP _src, int _size )
+{
+	const int blocks = _size / ( sizeof( int ) * 16 );
+	const int * RP src = (const int *) _src;
+	int * RP dst = (int *) _dst;
+	for( int b = 0; b < blocks; ++b )
+	{
+		dst[0] = src[0];
+		dst[1] = src[1];
+		dst[2] = src[2];
+		dst[3] = src[3];
+		dst[4] = src[4];
+		dst[5] = src[5];
+		dst[6] = src[6];
+		dst[7] = src[7];
+		dst[8] = src[8];
+		dst[9] = src[9];
+		dst[10] = src[10];
+		dst[11] = src[11];
+		dst[12] = src[12];
+		dst[13] = src[13];
+		dst[14] = src[14];
+		dst[15] = src[15];
+		src += 16;
+		dst += 16;
+	}
+}
+
+
+// slow fallback
+//
+// Zeroes _size bytes at _dst in blocks of 4 ints; a trailing remainder
+// smaller than one block is not touched.
+void alignedMemClearNoOpt( void * _dst, int _size )
+{
+	int * const words = (int *) _dst;
+	const int blocks = _size / ( sizeof( int ) * 4 );
+	for( int b = 0; b < blocks; ++b )
+	{
+		int * const w = words + b * 4;
+		w[0] = 0;
+		w[1] = 0;
+		w[2] = 0;
+		w[3] = 0;
+	}
+}
+
+
+
+// Portable fallback: scales both channels of the first _frames frames of
+// _dst by _gain, in place.
+//
+// Fix: the previous hand-unrolled loop always processed 8 frames per
+// iteration, so a _frames value that was not a multiple of 8 made it read
+// and write past the end of the requested range. Iterating frame by frame
+// gives the same result for multiples of 8 and stays in bounds otherwise.
+void alignedBufApplyGainNoOpt( sampleFrameA * RP _dst, float _gain,
+								int _frames )
+{
+	for( int i = 0; i < _frames; ++i )
+	{
+		_dst[i][0] *= _gain;
+		_dst[i][1] *= _gain;
+	}
+}
+
+
+// Portable fallback: adds the first _frames frames of _src onto _dst
+// (both channels).
+//
+// Fix: the previous loop was unrolled to 4 frames per iteration without a
+// remainder check, so it accessed frames beyond _frames whenever _frames
+// was not a multiple of 4. A per-frame loop is equivalent for multiples of
+// 4 and never overruns.
+void alignedBufMixNoOpt( sampleFrameA * RP _dst, const sampleFrameA * RP _src,
+								int _frames )
+{
+	for( int i = 0; i < _frames; ++i )
+	{
+		_dst[i][0] += _src[i][0];
+		_dst[i][1] += _src[i][1];
+	}
+}
+
+
+
+// Portable fallback: mixes _src into _dst, scaling channel 0 by _left and
+// channel 1 by _right, for the first _frames frames.
+//
+// Fix: the previous 4-frames-per-iteration unrolling had no remainder
+// handling and therefore touched memory past _frames when _frames was not
+// a multiple of 4. The per-frame loop is equivalent for multiples of 4 and
+// stays within bounds for every other count.
+void alignedBufMixLRCoeffNoOpt( sampleFrameA * RP _dst,
+					const sampleFrameA * RP _src,
+					float _left, float _right, int _frames )
+{
+	for( int i = 0; i < _frames; ++i )
+	{
+		_dst[i][0] += _src[i][0]*_left;
+		_dst[i][1] += _src[i][1]*_right;
+	}
+}
+
+
+
+// Portable fallback for buffers without alignment guarantees: mixes _src
+// into _dst with separate gains for channel 0 (_left) and channel 1
+// (_right). An odd frame count is handled by peeling off the first frame,
+// after which the remaining frames are processed in pairs.
+void unalignedBufMixLRCoeffNoOpt( sampleFrame * RP _dst,
+						const sampleFrame * RP _src,
+							const float _left,
+							const float _right,
+								int _frames )
+{
+	// peel one frame so that the remaining count is even
+	if( ( _frames % 2 ) != 0 )
+	{
+		(*_dst)[0] += (*_src)[0] * _left;
+		(*_dst)[1] += (*_src)[1] * _right;
+		++_src;
+		++_dst;
+		--_frames;
+	}
+
+	for( int pair = 0; pair < _frames; pair += 2 )
+	{
+		// both frames of the current pair, in ascending order
+		for( int k = pair; k < pair + 2; ++k )
+		{
+			_dst[k][0] += _src[k][0]*_left;
+			_dst[k][1] += _src[k][1]*_right;
+		}
+	}
+}
+
+
+
+// Portable fallback: blends an effect signal into _dst in place, computing
+// dst = dst * _dry + src * _wet for both channels of each of the first
+// _frames frames.
+void alignedBufWetDryMixNoOpt( sampleFrameA * RP _dst,
+					const sampleFrameA * RP _src,
+					float _wet, float _dry, int _frames )
+{
+	for( int remaining = _frames; remaining > 0;
+					--remaining, ++_dst, ++_src )
+	{
+		(*_dst)[0] = (*_dst)[0]*_dry + (*_src)[0]*_wet;
+		(*_dst)[1] = (*_dst)[1]*_dry + (*_src)[1]*_wet;
+	}
+}
+
+
+
+
+// Portable fallback: wet/dry blend where the wet signal is supplied as two
+// separate mono arrays. For each of the first _frames frames:
+//   dst[i][0] = dst[i][0] * _dry + _left[i]  * _wet
+//   dst[i][1] = dst[i][1] * _dry + _right[i] * _wet
+//
+// Fix: the loop counter used to be incremented both in the for-header and
+// at the end of the body, so every odd-numbered frame was skipped and kept
+// its unprocessed (dry) value. The extra increment has been removed.
+void alignedBufWetDryMixSplittedNoOpt( sampleFrameA * RP _dst,
+					const float * RP _left,
+					const float * RP _right,
+					float _wet, float _dry, int _frames )
+{
+	for( int i = 0; i < _frames; ++i )
+	{
+		_dst[i][0] = _dst[i][0]*_dry + _left[i]*_wet;
+		_dst[i][1] = _dst[i][1]*_dry + _right[i]*_wet;
+	}
+}
+
+
+
+
+// Portable fallback: converts _frames frames from _src to integer samples
+// in _dst. Each sample is multiplied by
+// _master_gain * OUTPUT_SAMPLE_MULTIPLIER, converted to int, clamped to
+// [-32768, 32767] and - if _convert_endian is set - stored with its two
+// low bytes swapped. Returns the number of bytes the converted frames
+// occupy (_frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE).
+int alignedConvertToS16NoOpt( const sampleFrameA * RP _src,
+					intSampleFrameA * RP _dst,
+					const fpp_t _frames,
+					const float _master_gain,
+					const bool _convert_endian )
+{
+	const float scale = _master_gain * OUTPUT_SAMPLE_MULTIPLIER;
+
+	if( _convert_endian )
+	{
+		for( fpp_t frame = 0; frame < _frames; ++frame )
+		{
+			for( int ch = 0; ch < 2; ++ch )
+			{
+				int v = _src[frame][ch] * scale;
+				v = unlikely( v > 32767 ) ? 32767 : v;
+				v = unlikely( v < -32768 ) ? -32768 : v;
+				// swap the two low-order bytes
+				_dst[frame][ch] = ( v & 0x00ff) << 8 |
+							( v & 0xff00 ) >> 8;
+			}
+		}
+	}
+	else
+	{
+		for( fpp_t frame = 0; frame < _frames; ++frame )
+		{
+			for( int ch = 0; ch < 2; ++ch )
+			{
+				int v = _src[frame][ch] * scale;
+				v = unlikely( v > 32767 ) ? 32767 : v;
+				v = unlikely( v < -32768 ) ? -32768 : v;
+				_dst[frame][ch] = v;
+			}
+		}
+	}
+
+	return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE;
+}
+
+
+// Public entry points. Each one is a function pointer that starts out bound
+// to the portable *NoOpt implementation defined above; initBasicOps() may
+// rebind them to MMX/SSE/SSE2 variants when X86_OPTIMIZATIONS is defined and
+// the CPU reports the corresponding feature.
+alignedMemCpyFunc alignedMemCpy = alignedMemCpyNoOpt;
+alignedMemClearFunc alignedMemClear = alignedMemClearNoOpt;
+alignedBufApplyGainFunc alignedBufApplyGain = alignedBufApplyGainNoOpt;
+alignedBufMixFunc alignedBufMix = alignedBufMixNoOpt;
+alignedBufMixLRCoeffFunc alignedBufMixLRCoeff = alignedBufMixLRCoeffNoOpt;
+unalignedBufMixLRCoeffFunc unalignedBufMixLRCoeff = unalignedBufMixLRCoeffNoOpt;
+alignedBufWetDryMixFunc alignedBufWetDryMix = alignedBufWetDryMixNoOpt;
+alignedBufWetDryMixSplittedFunc alignedBufWetDryMixSplitted = alignedBufWetDryMixSplittedNoOpt;
+alignedConvertToS16Func alignedConvertToS16 = alignedConvertToS16NoOpt;
+
+
+#ifdef X86_OPTIMIZATIONS
+// Bit flags for CPU capabilities; initBasicOps() combines them into a
+// feature mask and uses MMX, SSE and SSE2 to pick implementations.
+enum CPUFeatures
+{
+    None        = 0,
+    MMX         = 0x1,
+    MMXEXT      = 0x2,
+    MMX3DNOW    = 0x4,
+    MMX3DNOWEXT = 0x8,
+    SSE         = 0x10,
+    SSE2        = 0x20,
+    CMOV        = 0x40,
+    IWMMXT      = 0x80
+};
+
+// Prototypes of the optimized routines (C linkage). Their C source is
+// basic_ops_x86.c.
+extern "C"
+{
+#ifdef LMMS_HOST_X86
+// MMX variants are only referenced by initBasicOps() on 32-bit x86 hosts
+void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size );
+void alignedMemClearMMX( void * RP _dst, int _size );
+#endif
+void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size );
+void alignedMemClearSSE( void * RP _dst, int _size );
+void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames );
+void alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, int _frames );
+void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _left, float _right, int _frames );
+void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src, const float _left, const float _right, int _frames );
+void alignedBufWetDryMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src, float _wet, float _dry, int _frames );
+void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst, const float * RP _left, const float * RP _right, float _wet, float _dry, int _frames );
+// The SSE2 prototypes must not be limited to LMMS_HOST_X86: initBasicOps()
+// refers to these functions whenever X86_OPTIMIZATIONS is defined, so
+// hiding the declarations on other hosts would break compilation there.
+void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size );
+void alignedMemClearSSE2( void * RP _dst, int _size );
+int alignedConvertToS16SSE2( const sampleFrameA * RP _src, intSampleFrameA * RP _dst, const fpp_t _frames, const float _master_gain, const bool _convert_endian );
+}
+#endif
+
+
+
+// Selects the implementations behind the public function pointers
+// (alignedMemCpy, alignedBufMix, ...). Without X86_OPTIMIZATIONS this is a
+// no-op and the portable NoOpt defaults stay in place. With it, the CPU is
+// queried once (guarded by a static flag) and the pointers are rebound in
+// the order MMX -> SSE -> SSE2, so a later, more capable set overrides an
+// earlier one for the routines it provides.
+void initBasicOps( void )
+{
+#ifdef X86_OPTIMIZATIONS
+	static bool extensions_checked = false;
+	if( extensions_checked == false )
+	{
+		int features = 0;
+		unsigned int result = 0;
+		unsigned int extended_result = 0;
+		// Standard feature query: first try to toggle bit 21 (0x00200000)
+		// of EFLAGS; if the bit cannot be changed, CPUID is unavailable and
+		// the zeroed EDX is reported. Otherwise CPUID leaf 1 is executed and
+		// its EDX ends up in `result`. EBX is saved/restored by hand.
+		// NOTE(review): uses 32-bit push/pop of %eax/%ebx - confirm this
+		// code is only assembled for 32-bit targets.
+		asm(	"push %%ebx\n"
+			"pushf\n"
+			"pop %%eax\n"
+			"mov %%eax, %%ebx\n"
+			"xor $0x00200000, %%eax\n"
+			"push %%eax\n"
+			"popf\n"
+			"pushf\n"
+			"pop %%eax\n"
+			"xor %%edx, %%edx\n"
+			"xor %%ebx, %%eax\n"
+			"jz 1f\n"
+
+			"mov $0x00000001, %%eax\n"
+			"cpuid\n"
+			"1:\n"
+			"pop %%ebx\n"
+			"mov %%edx, %0\n"
+
+			: "=r" (result)
+			:
+			: "%eax", "%ecx", "%edx"
+		);
+
+		// Extended feature query: same CPUID availability test, then leaf
+		// 0x80000000 is used to check whether a higher extended leaf exists;
+		// only in that case leaf 0x80000001 is executed. `extended_result`
+		// receives whatever EDX holds when label 2 is reached.
+		asm(	"push %%ebx\n"
+			"pushf\n"
+			"pop %%eax\n"
+			"mov %%eax, %%ebx\n"
+			"xor $0x00200000, %%eax\n"
+			"push %%eax\n"
+			"popf\n"
+			"pushf\n"
+			"pop %%eax\n"
+			"xor %%edx, %%edx\n"
+			"xor %%ebx, %%eax\n"
+			"jz 2f\n"
+
+			"mov $0x80000000, %%eax\n"
+			"cpuid\n"
+			"cmp $0x80000000, %%eax\n"
+			"jbe 2f\n"
+			"mov $0x80000001, %%eax\n"
+			"cpuid\n"
+			"2:\n"
+			"pop %%ebx\n"
+			"mov %%edx, %0\n"
+
+			: "=r" (extended_result)
+			:
+			: "%eax", "%ecx", "%edx"
+		);
+
+		// translate the raw EDX bits into CPUFeatures flags
+		if( result & (1u << 15) )
+			features |= CMOV;
+		if( result & (1u << 23) )
+			features |= MMX;
+		if( extended_result & (1u << 22) )
+			features |= MMXEXT;
+		if( extended_result & (1u << 31) )
+			features |= MMX3DNOW;
+		if( extended_result & (1u << 30) )
+			features |= MMX3DNOWEXT;
+		if( result & (1u << 25) )
+			features |= SSE;
+		if( result & (1u << 26) )
+			features |= SSE2;
+
+#ifdef LMMS_HOST_X86
+		// MMX only provides copy/clear and is only wired up on 32-bit x86
+		if( features & MMX )
+		{
+			alignedMemCpy = alignedMemCpyMMX;
+			alignedMemClear = alignedMemClearMMX;
+		}
+#endif
+		// SSE replaces copy/clear and all float buffer operations
+		if( features & SSE )
+		{
+			fprintf( stderr, "Using SSE optimized routines\n" );
+			alignedMemCpy = alignedMemCpySSE;
+			alignedMemClear = alignedMemClearSSE;
+			alignedBufApplyGain = alignedBufApplyGainSSE;
+			alignedBufMix = alignedBufMixSSE;
+			alignedBufMixLRCoeff = alignedBufMixLRCoeffSSE;
+			unalignedBufMixLRCoeff = unalignedBufMixLRCoeffSSE;
+			alignedBufWetDryMix = alignedBufWetDryMixSSE;
+			alignedBufWetDryMixSplitted =
+						alignedBufWetDryMixSplittedSSE;
+		}
+		// SSE2 overrides copy/clear once more and adds the S16 conversion
+		if( features & SSE2 )
+		{
+			fprintf( stderr, "Using SSE2 optimized routines\n" );
+			alignedMemCpy = alignedMemCpySSE2;
+			alignedMemClear = alignedMemClearSSE2;
+			alignedConvertToS16 = alignedConvertToS16SSE2;
+		}
+		extensions_checked = true;
+	}
+#endif
+}
+
+
+
--- a/src/core/basic_ops_x86.c
+++ b/src/core/basic_ops_x86.c
@@ -0,0 +1,395 @@
+/*
+ * basic_ops_x86.c - x86 specific optimized operations
+ *
+ * Copyright (c) 2008 Tobias Doerffel <tobydox/at/users.sourceforge.net>
+ * 
+ * This file is part of Linux MultiMedia Studio - http://lmms.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program (see COPYING); if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA.
+ *
+ */
+
+
+
+#include "basic_ops.h"
+
+#ifdef X86_OPTIMIZATIONS
+
+#ifdef BUILD_MMX
+
+#include <mmintrin.h>
+
+/*
+ * Copies _size bytes from _src to _dst using MMX registers, 64 bytes per
+ * iteration. Only whole 64-byte blocks are copied; a smaller remainder is
+ * left untouched.
+ *
+ * The MMX registers alias the x87 register stack, so the x87 state is
+ * saved to fpu_save before the copy and restored afterwards.
+ *
+ * Fix: the epilogue used to execute a second "fsave" instead of "frstor".
+ * That stored the MMX-modified state over the saved one and re-initialised
+ * the FPU, so the caller's x87 state (e.g. its control word) was never
+ * restored. The epilogue now reloads the state saved on entry. The save
+ * area is declared as an output and both statements clobber memory, so the
+ * compiler cannot reorder accesses around them.
+ */
+void alignedMemCpyMMX( void * RP _dst, const void * RP _src, int _size )
+{
+	const int s = _size / ( sizeof( __m64 ) * 8 );
+	int i;
+	char fpu_save[108];	/* size of the 32-bit FSAVE/FRSTOR image */
+	char * RP src = (char *) _src;
+	char * RP dst = (char *) _dst;
+
+	/* save x87 state; fsave also re-initialises the FPU */
+	__asm__ __volatile__ ( " fsave %0; fwait\n"
+				: "=m"(fpu_save[0]) : : "memory" );
+
+	/* warm up the cache for the first few source lines */
+	__asm__ __volatile__ (
+		"1: prefetchnta (%0)\n"
+		"   prefetchnta 64(%0)\n"
+		"   prefetchnta 128(%0)\n"
+		"   prefetchnta 192(%0)\n"
+		"   prefetchnta 256(%0)\n"
+		: : "r" (src) );
+	for(i=0; i<s; i++)
+	{
+		/* copy one 64-byte block, prefetching 320 bytes ahead */
+		__asm__ __volatile__ (
+		"1: prefetchnta 320(%0)\n"
+		"2: movq (%0), %%mm0\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movq %%mm0, (%1)\n"
+		"   movq %%mm1, 8(%1)\n"
+		"   movq %%mm2, 16(%1)\n"
+		"   movq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm0\n"
+		"   movq 40(%0), %%mm1\n"
+		"   movq 48(%0), %%mm2\n"
+		"   movq 56(%0), %%mm3\n"
+		"   movq %%mm0, 32(%1)\n"
+		"   movq %%mm1, 40(%1)\n"
+		"   movq %%mm2, 48(%1)\n"
+		"   movq %%mm3, 56(%1)\n"
+		: : "r" (src), "r" (dst) : "memory");
+		src+=64;
+		dst+=64;
+	}
+
+	/* restore the caller's x87 state (this also leaves MMX mode) */
+	__asm__ __volatile__ ( " frstor %0\n"
+				: : "m"(fpu_save[0]) : "memory" );
+}
+
+
+
+/*
+ * Zeroes _size bytes at _dst with MMX stores, 64 bytes (eight __m64
+ * values) per iteration. Only whole 64-byte blocks are cleared; a smaller
+ * remainder is left untouched.
+ */
+void alignedMemClearMMX( void * RP _dst, int _size )
+{
+	__m64 * dst = (__m64 *) _dst;
+	const int s = _size / ( sizeof( *dst ) * 8 );
+	__m64 val = _mm_setzero_si64();
+	int i;
+	for( i = 0; i < s; ++i )
+	{
+		/* eight 8-byte stores of the zero register ("y" = MMX reg) */
+		__asm__ __volatile__ (
+			"movq    %0, (%1)\n"
+			"movq    %0, 8(%1)\n"
+			"movq    %0, 16(%1)\n"
+			"movq    %0, 24(%1)\n"
+			"movq    %0, 32(%1)\n"
+			"movq    %0, 40(%1)\n"
+			"movq    %0, 48(%1)\n"
+			"movq    %0, 56(%1)\n"
+				: : "y" (val), "r" (dst) : "memory" );
+		dst += 8;
+	}
+	/* leave MMX mode so that x87 code can run again */
+	_mm_empty();
+}
+
+#endif
+
+
+#ifdef BUILD_SSE
+
+#include <xmmintrin.h>
+
+/*
+ * Copies _size bytes from _src to _dst as __m128 values, four per
+ * iteration (64 bytes). Only whole 64-byte blocks are copied.
+ */
+void alignedMemCpySSE( void * RP _dst, const void * RP _src, int _size )
+{
+	__m128 * out = (__m128 *) _dst;
+	const __m128 * in = (const __m128 *) _src;
+	int blocks = _size / ( sizeof( *out ) * 4 );
+
+	while( blocks-- > 0 )
+	{
+		out[0] = in[0];
+		out[1] = in[1];
+		out[2] = in[2];
+		out[3] = in[3];
+		in += 4;
+		out += 4;
+	}
+}
+
+
+
+
+/*
+ * Zeroes _size bytes at _dst as __m128 values, four per iteration
+ * (64 bytes). Only whole 64-byte blocks are cleared.
+ */
+void alignedMemClearSSE( void * RP _dst, int _size )
+{
+	__m128 * out = (__m128 *) _dst;
+	const __m128 zero = _mm_setzero_ps();
+	int blocks = _size / ( sizeof( *out ) * 4 );
+
+	while( blocks-- > 0 )
+	{
+		out[0] = zero;
+		out[1] = zero;
+		out[2] = zero;
+		out[3] = zero;
+		out += 4;
+	}
+}
+
+
+
+
+/*
+ * Multiplies both channels of every frame in _dst by _gain, in place.
+ *
+ * The loop body handles 8 frames per iteration and has no remainder
+ * handling: if _frames is not a multiple of 8, the last iteration also
+ * modifies frames beyond _frames.
+ * NOTE(review): presumably written without intrinsics so that the
+ * compiler's auto-vectorizer produces the SSE code - confirm.
+ */
+void alignedBufApplyGainSSE( sampleFrameA * RP _dst, float _gain, int _frames )
+{
+	int i;
+	for( i = 0; i < _frames; )
+	{
+		_dst[i+0][0] *= _gain;
+		_dst[i+0][1] *= _gain;
+		_dst[i+1][0] *= _gain;
+		_dst[i+1][1] *= _gain;
+		_dst[i+2][0] *= _gain;
+		_dst[i+2][1] *= _gain;
+		_dst[i+3][0] *= _gain;
+		_dst[i+3][1] *= _gain;
+		_dst[i+4][0] *= _gain;
+		_dst[i+4][1] *= _gain;
+		_dst[i+5][0] *= _gain;
+		_dst[i+5][1] *= _gain;
+		_dst[i+6][0] *= _gain;
+		_dst[i+6][1] *= _gain;
+		_dst[i+7][0] *= _gain;
+		_dst[i+7][1] *= _gain;
+		i += 8;
+	}
+}
+
+
+
+
+/*
+ * Adds _src onto _dst (both channels of every frame).
+ *
+ * Each loop iteration processes two groups of 4 frames (8 frames in
+ * total) without remainder handling: if _frames is not a multiple of 8,
+ * the last iteration also touches frames beyond _frames.
+ */
+void alignedBufMixSSE( sampleFrameA * RP _dst, const sampleFrameA * RP _src,
+								int _frames )
+{
+	int i;
+	for( i = 0; i < _frames; )
+	{
+		/* first group of 4 frames */
+		_dst[i+0][0] += _src[i+0][0];
+		_dst[i+0][1] += _src[i+0][1];
+		_dst[i+1][0] += _src[i+1][0];
+		_dst[i+1][1] += _src[i+1][1];
+		_dst[i+2][0] += _src[i+2][0];
+		_dst[i+2][1] += _src[i+2][1];
+		_dst[i+3][0] += _src[i+3][0];
+		_dst[i+3][1] += _src[i+3][1];
+		i += 4;
+		/* second group of 4 frames */
+		_dst[i+0][0] += _src[i+0][0];
+		_dst[i+0][1] += _src[i+0][1];
+		_dst[i+1][0] += _src[i+1][0];
+		_dst[i+1][1] += _src[i+1][1];
+		_dst[i+2][0] += _src[i+2][0];
+		_dst[i+2][1] += _src[i+2][1];
+		_dst[i+3][0] += _src[i+3][0];
+		_dst[i+3][1] += _src[i+3][1];
+		i += 4;
+	}
+}
+
+
+
+/*
+ * Mixes _src into _dst with per-channel gains: channel 0 is scaled by
+ * _left, channel 1 by _right.
+ *
+ * Processes 4 frames per iteration without remainder handling: if _frames
+ * is not a multiple of 4, the last iteration also touches frames beyond
+ * _frames.
+ */
+void alignedBufMixLRCoeffSSE( sampleFrameA * RP _dst,
+					const sampleFrameA * RP _src,
+					float _left, float _right, int _frames )
+{
+	int i;
+	for( i = 0; i < _frames; )
+	{
+		_dst[i+0][0] += _src[i+0][0]*_left;
+		_dst[i+0][1] += _src[i+0][1]*_right;
+		_dst[i+1][0] += _src[i+1][0]*_left;
+		_dst[i+1][1] += _src[i+1][1]*_right;
+		_dst[i+2][0] += _src[i+2][0]*_left;
+		_dst[i+2][1] += _src[i+2][1]*_right;
+		_dst[i+3][0] += _src[i+3][0]*_left;
+		_dst[i+3][1] += _src[i+3][1]*_right;
+		i += 4;
+	}
+}
+
+
+
+/*
+ * Same operation as alignedBufMixLRCoeffSSE() but declared on plain
+ * sampleFrame pointers: mixes _src into _dst, scaling channel 0 by _left
+ * and channel 1 by _right.
+ *
+ * An odd frame count is handled by processing the first frame separately;
+ * the remaining (even) number of frames is then processed in pairs.
+ */
+void unalignedBufMixLRCoeffSSE( sampleFrame * RP _dst, const sampleFrame * RP _src,
+							const float _left,
+							const float _right,
+								int _frames )
+{
+	int i;
+	/* peel one frame so that the pair loop sees an even count */
+	if( unlikely( _frames % 2 ) )
+	{
+		_dst[0][0] += _src[0][0] * _left;
+		_dst[0][1] += _src[0][1] * _right;
+		++_src;
+		++_dst;
+		--_frames;
+	}
+
+	for( i = 0; i < _frames; )
+	{
+		_dst[i+0][0] += _src[i+0][0]*_left;
+		_dst[i+0][1] += _src[i+0][1]*_right;
+		_dst[i+1][0] += _src[i+1][0]*_left;
+		_dst[i+1][1] += _src[i+1][1]*_right;
+		i += 2;
+	}
+}
+
+
+
+/*
+ * Wet/dry blend in place: dst = dst * _dry + src * _wet for both channels
+ * of every frame.
+ *
+ * Processes 4 frames per iteration without remainder handling: if _frames
+ * is not a multiple of 4, the last iteration also touches frames beyond
+ * _frames.
+ */
+void alignedBufWetDryMixSSE( sampleFrameA * RP _dst,
+					const sampleFrameA * RP _src,
+					float _wet, float _dry, int _frames )
+{
+	int i;
+	for( i = 0; i < _frames; )
+	{
+		_dst[i+0][0] = _dst[i+0][0]*_dry + _src[i+0][0]*_wet;
+		_dst[i+0][1] = _dst[i+0][1]*_dry + _src[i+0][1]*_wet;
+		_dst[i+1][0] = _dst[i+1][0]*_dry + _src[i+1][0]*_wet;
+		_dst[i+1][1] = _dst[i+1][1]*_dry + _src[i+1][1]*_wet;
+		_dst[i+2][0] = _dst[i+2][0]*_dry + _src[i+2][0]*_wet;
+		_dst[i+2][1] = _dst[i+2][1]*_dry + _src[i+2][1]*_wet;
+		_dst[i+3][0] = _dst[i+3][0]*_dry + _src[i+3][0]*_wet;
+		_dst[i+3][1] = _dst[i+3][1]*_dry + _src[i+3][1]*_wet;
+		i += 4;
+	}
+}
+
+
+
+
+/*
+ * Wet/dry blend in place where the wet signal comes from two separate
+ * mono arrays: channel 0 uses _left[i], channel 1 uses _right[i], i.e.
+ * dst[i][c] = dst[i][c] * _dry + wet[i] * _wet.
+ *
+ * Processes 2 frames per iteration without remainder handling: if _frames
+ * is odd, the last iteration also touches the frame at index _frames.
+ */
+void alignedBufWetDryMixSplittedSSE( sampleFrameA * RP _dst,
+					const float * RP _left,
+					const float * RP _right,
+					float _wet, float _dry, int _frames )
+{
+	int i;
+	for( i = 0; i < _frames; )
+	{
+		_dst[i+0][0] = _dst[i+0][0]*_dry + _left[i+0]*_wet;
+		_dst[i+0][1] = _dst[i+0][1]*_dry + _right[i+0]*_wet;
+		_dst[i+1][0] = _dst[i+1][0]*_dry + _left[i+1]*_wet;
+		_dst[i+1][1] = _dst[i+1][1]*_dry + _right[i+1]*_wet;
+		i += 2;
+	}
+}
+
+
+
+#endif
+
+
+#ifdef BUILD_SSE2
+
+#include <emmintrin.h>
+
+/*
+ * Copies _size bytes from _src to _dst with aligned 128-bit integer
+ * loads/stores, four per iteration (64 bytes). Only whole 64-byte blocks
+ * are copied. Both pointers must satisfy the alignment requirement of
+ * _mm_load_si128()/_mm_store_si128().
+ */
+void alignedMemCpySSE2( void * RP _dst, const void * RP _src, int _size )
+{
+	__m128i * out = (__m128i *) _dst;
+	__m128i * in = (__m128i *) _src;
+	int blocks = _size / ( sizeof( *out ) * 4 );
+
+	while( blocks-- > 0 )
+	{
+		_mm_store_si128( out, _mm_load_si128( in ) );
+		_mm_store_si128( out + 1, _mm_load_si128( in + 1 ) );
+		_mm_store_si128( out + 2, _mm_load_si128( in + 2 ) );
+		_mm_store_si128( out + 3, _mm_load_si128( in + 3 ) );
+		in += 4;
+		out += 4;
+	}
+}
+
+
+
+
+/*
+ * Zeroes _size bytes at _dst with aligned 128-bit integer stores, four
+ * per iteration (64 bytes). Only whole 64-byte blocks are cleared.
+ */
+void alignedMemClearSSE2( void * RP _dst, int _size )
+{
+	__m128i * out = (__m128i *) _dst;
+	const __m128i zero = _mm_setzero_si128();
+	int blocks = _size / ( sizeof( *out ) * 4 );
+
+	while( blocks-- > 0 )
+	{
+		_mm_store_si128( out, zero );
+		_mm_store_si128( out + 1, zero );
+		_mm_store_si128( out + 2, zero );
+		_mm_store_si128( out + 3, zero );
+		out += 4;
+	}
+}
+
+
+
+/*
+ * Converts _frames frames from _src to integer samples in _dst.
+ *
+ * Each sample is multiplied by _master_gain * OUTPUT_SAMPLE_MULTIPLIER,
+ * converted to int, clamped to [-32768, 32767] and, if _convert_endian is
+ * set, stored with its two low-order bytes swapped.
+ *
+ * Returns _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE, i.e. the
+ * size in bytes of the converted data.
+ *
+ * NOTE(review): contains no SSE2 intrinsics; presumably the SSE2 code is
+ * meant to come from the compiler flags used for this file - confirm.
+ */
+int alignedConvertToS16SSE2( const sampleFrameA * RP _src,
+					intSampleFrameA * RP _dst,
+					const fpp_t _frames,
+					const float _master_gain,
+					const bool _convert_endian )
+{
+	int t1;
+	int t2;
+	fpp_t frame;
+	const float f = _master_gain * OUTPUT_SAMPLE_MULTIPLIER;
+	if( _convert_endian )
+	{
+		for( frame = 0; frame < _frames; ++frame )
+		{
+			/* channel 0: scale, clamp, byte-swap */
+			t1 = _src[frame][0] * f;
+			t1 = unlikely( t1 > 32767 ) ? 32767 : t1;
+			t1 = unlikely( t1 < -32768 ) ? -32768 : t1;
+			_dst[frame][0] = ( t1 & 0x00ff) << 8 |
+							( t1 & 0xff00 ) >> 8;
+
+			/* channel 1: scale, clamp, byte-swap */
+			t2 = _src[frame][1] * f;
+			t2 = unlikely( t2 > 32767 ) ? 32767 : t2;
+			t2 = unlikely( t2 < -32768 ) ? -32768 : t2;
+			_dst[frame][1] = ( t2 & 0x00ff) << 8 |
+						( t2 & 0xff00 ) >> 8;
+		}
+	}
+	else
+	{
+		for( frame = 0; frame < _frames; ++frame )
+		{
+			/* channel 0: scale and clamp */
+			t1 = _src[frame][0] * f;
+			t1 = unlikely( t1 > 32767 ) ? 32767 : t1;
+			t1 = unlikely( t1 < -32768 ) ? -32768 : t1;
+			_dst[frame][0] = t1;
+
+			/* channel 1: scale and clamp */
+			t2 = _src[frame][1] * f;
+			t2 = unlikely( t2 > 32767 ) ? 32767 : t2;
+			t2 = unlikely( t2 < -32768 ) ? -32768 : t2;
+			_dst[frame][1] = t2;
+		}
+	}
+
+	return _frames * DEFAULT_CHANNELS * BYTES_PER_INT_SAMPLE;
+}
+
+
+
+#endif
+
+#endif
--- a/src/core/basic_ops_x86_64_sse.s
+++ b/src/core/basic_ops_x86_64_sse.s
@@ -0,0 +1,563 @@
+	.file	"basic_ops_x86.c"
+	.text
+	.align 16
+.globl alignedMemCpySSE
+	.type	alignedMemCpySSE, @function
+alignedMemCpySSE:
+.LFB509:
+	movslq	%edx,%rdx
+	shrq	$6, %rdx
+	testl	%edx, %edx
+	jle	.L4
+	leal	-1(%rdx), %r9d
+	xorl	%eax, %eax
+	mov	%r9d, %r8d
+	leaq	1(%r8), %rcx
+	movq	%rcx, %rdx
+	salq	$6, %rdx
+	.align 16
+.L3:
+	movaps	(%rsi,%rax), %xmm0
+	movaps	%xmm0, (%rdi,%rax)
+	movaps	16(%rsi,%rax), %xmm0
+	movaps	%xmm0, 16(%rdi,%rax)
+	movaps	32(%rsi,%rax), %xmm0
+	movaps	%xmm0, 32(%rdi,%rax)
+	movaps	48(%rsi,%rax), %xmm0
+	movaps	%xmm0, 48(%rdi,%rax)
+	addq	$64, %rax
+	cmpq	%rdx, %rax
+	jne	.L3
+.L4:
+	rep
+	ret
+.LFE509:
+	.size	alignedMemCpySSE, .-alignedMemCpySSE
+	.align 16
+.globl alignedMemClearSSE
+	.type	alignedMemClearSSE, @function
+alignedMemClearSSE:
+.LFB510:
+	movslq	%esi,%rax
+	shrq	$6, %rax
+	testl	%eax, %eax
+	jle	.L10
+	subl	$1, %eax
+	xorps	%xmm0, %xmm0
+	salq	$6, %rax
+	leaq	64(%rax,%rdi), %rax
+	.align 16
+.L9:
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm0, 16(%rdi)
+	movaps	%xmm0, 32(%rdi)
+	movaps	%xmm0, 48(%rdi)
+	addq	$64, %rdi
+	cmpq	%rax, %rdi
+	jne	.L9
+.L10:
+	rep
+	ret
+.LFE510:
+	.size	alignedMemClearSSE, .-alignedMemClearSSE
+	.align 16
+.globl alignedBufApplyGainSSE
+	.type	alignedBufApplyGainSSE, @function
+alignedBufApplyGainSSE:
+.LFB511:
+	testl	%esi, %esi
+	jle	.L15
+	subl	$1, %esi
+	shufps	$0, %xmm0, %xmm0
+	shrl	$3, %esi
+	xorl	%eax, %eax
+	leal	1(%rsi), %edx
+	.align 16
+.L14:
+	movaps	%xmm0, %xmm3
+	addl	$1, %eax
+	movaps	%xmm0, %xmm2
+	movaps	%xmm0, %xmm1
+	movaps	%xmm0, %xmm4
+	mulps	16(%rdi), %xmm3
+	mulps	32(%rdi), %xmm2
+	mulps	48(%rdi), %xmm1
+	mulps	(%rdi), %xmm4
+	movaps	%xmm3, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm1, 48(%rdi)
+	movaps	%xmm4, (%rdi)
+	addq	$64, %rdi
+	cmpl	%eax, %edx
+	ja	.L14
+.L15:
+	rep
+	ret
+.LFE511:
+	.size	alignedBufApplyGainSSE, .-alignedBufApplyGainSSE
+	.align 16
+.globl alignedBufMixSSE
+	.type	alignedBufMixSSE, @function
+alignedBufMixSSE:
+.LFB512:
+	testl	%edx, %edx
+	jle	.L20
+	subl	$1, %edx
+	xorl	%eax, %eax
+	shrl	$3, %edx
+	leal	1(%rdx), %ecx
+	xorl	%edx, %edx
+	.align 16
+.L19:
+	movaps	16(%rdi,%rax), %xmm2
+	addl	$1, %edx
+	movaps	32(%rdi,%rax), %xmm1
+	addps	16(%rsi,%rax), %xmm2
+	movaps	48(%rdi,%rax), %xmm0
+	addps	32(%rsi,%rax), %xmm1
+	movaps	(%rdi,%rax), %xmm3
+	addps	48(%rsi,%rax), %xmm0
+	addps	(%rsi,%rax), %xmm3
+	movaps	%xmm2, 16(%rdi,%rax)
+	movaps	%xmm1, 32(%rdi,%rax)
+	movaps	%xmm0, 48(%rdi,%rax)
+	movaps	%xmm3, (%rdi,%rax)
+	addq	$64, %rax
+	cmpl	%edx, %ecx
+	ja	.L19
+.L20:
+	rep
+	ret
+.LFE512:
+	.size	alignedBufMixSSE, .-alignedBufMixSSE
+	.align 16
+.globl alignedBufMixLRCoeffSSE
+	.type	alignedBufMixLRCoeffSSE, @function
+alignedBufMixLRCoeffSSE:
+.LFB513:
+	testl	%edx, %edx
+	jle	.L25
+	unpcklps	%xmm1, %xmm0
+	subl	$1, %edx
+	shrl	$2, %edx
+	xorl	%eax, %eax
+	leal	1(%rdx), %ecx
+	xorl	%edx, %edx
+	movlhps	%xmm0, %xmm0
+	.align 16
+.L24:
+	movaps	%xmm0, %xmm1
+	addl	$1, %edx
+	movaps	%xmm0, %xmm2
+	mulps	16(%rsi,%rax), %xmm1
+	mulps	(%rsi,%rax), %xmm2
+	addps	16(%rdi,%rax), %xmm1
+	addps	(%rdi,%rax), %xmm2
+	movaps	%xmm1, 16(%rdi,%rax)
+	movaps	%xmm2, (%rdi,%rax)
+	addq	$32, %rax
+	cmpl	%edx, %ecx
+	ja	.L24
+.L25:
+	rep
+	ret
+.LFE513:
+	.size	alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE
+	.align 16
+.globl alignedBufWetDryMixSSE
+	.type	alignedBufWetDryMixSSE, @function
+alignedBufWetDryMixSSE:
+.LFB515:
+	testl	%edx, %edx
+	jle	.L30
+	subl	$1, %edx
+	shufps	$0, %xmm1, %xmm1
+	shufps	$0, %xmm0, %xmm0
+	shrl	$2, %edx
+	leal	1(%rdx), %ecx
+	xorl	%eax, %eax
+	xorl	%edx, %edx
+	.align 16
+.L29:
+	movaps	%xmm1, %xmm3
+	addl	$1, %edx
+	movaps	%xmm0, %xmm2
+	mulps	16(%rdi,%rax), %xmm3
+	movaps	%xmm1, %xmm4
+	mulps	16(%rsi,%rax), %xmm2
+	mulps	(%rdi,%rax), %xmm4
+	addps	%xmm3, %xmm2
+	movaps	%xmm0, %xmm3
+	mulps	(%rsi,%rax), %xmm3
+	movaps	%xmm2, 16(%rdi,%rax)
+	addps	%xmm4, %xmm3
+	movaps	%xmm3, (%rdi,%rax)
+	addq	$32, %rax
+	cmpl	%edx, %ecx
+	ja	.L29
+.L30:
+	rep
+	ret
+.LFE515:
+	.size	alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE
+	.align 16
+.globl alignedBufWetDryMixSplittedSSE
+	.type	alignedBufWetDryMixSplittedSSE, @function
+alignedBufWetDryMixSplittedSSE:
+.LFB516:
+	pushq	%rbp
+.LCFI0:
+	testl	%ecx, %ecx
+	pushq	%rbx
+.LCFI1:
+	jle	.L39
+	leal	-1(%rcx), %ebx
+	shrl	%ebx
+	addl	$1, %ebx
+	movl	%ebx, %r11d
+	shrl	$2, %r11d
+	cmpl	$3, %ebx
+	leal	0(,%r11,4), %ebp
+	jbe	.L40
+	testl	%ebp, %ebp
+	jne	.L34
+.L40:
+	xorl	%r9d, %r9d
+	jmp	.L36
+	.align 16
+.L34:
+	movaps	%xmm1, %xmm2
+	movq	%rdi, %rax
+	xorps	%xmm6, %xmm6
+	movq	%rsi, %r9
+	shufps	$0, %xmm2, %xmm2
+	movq	%rdx, %r8
+	xorl	%r10d, %r10d
+	movaps	%xmm2, %xmm8
+	movaps	%xmm0, %xmm2
+	shufps	$0, %xmm2, %xmm2
+	movaps	%xmm2, %xmm7
+	.align 16
+.L37:
+	movaps	(%rax), %xmm12
+	addl	$1, %r10d
+	movaps	%xmm6, %xmm3
+	movaps	16(%rax), %xmm5
+	movaps	%xmm12, %xmm14
+	movlps	(%r8), %xmm3
+	movaps	32(%rax), %xmm9
+	shufps	$136, %xmm5, %xmm14
+	shufps	$221, %xmm5, %xmm12
+	movhps	8(%r8), %xmm3
+	movaps	48(%rax), %xmm4
+	movaps	%xmm9, %xmm13
+	movaps	%xmm6, %xmm5
+	shufps	$221, %xmm4, %xmm9
+	movlps	(%r9), %xmm5
+	shufps	$136, %xmm4, %xmm13
+	movaps	%xmm6, %xmm4
+	movhps	8(%r9), %xmm5
+	movaps	%xmm14, %xmm11
+	movlps	16(%r9), %xmm4
+	movaps	%xmm12, %xmm15
+	movaps	%xmm5, %xmm2
+	movhps	24(%r9), %xmm4
+	shufps	$136, %xmm13, %xmm11
+	movaps	%xmm3, %xmm10
+	addq	$32, %r9
+	shufps	$136, %xmm4, %xmm2
+	mulps	%xmm8, %xmm11
+	mulps	%xmm7, %xmm2
+	shufps	$221, %xmm13, %xmm14
+	shufps	$136, %xmm9, %xmm15
+	shufps	$221, %xmm4, %xmm5
+	addps	%xmm2, %xmm11
+	movaps	%xmm6, %xmm2
+	shufps	$221, %xmm9, %xmm12
+	movlps	16(%r8), %xmm2
+	mulps	%xmm8, %xmm14
+	movhps	24(%r8), %xmm2
+	mulps	%xmm7, %xmm5
+	movaps	%xmm11, %xmm9
+	addq	$32, %r8
+	shufps	$136, %xmm2, %xmm10
+	shufps	$221, %xmm2, %xmm3
+	movaps	%xmm14, %xmm4
+	mulps	%xmm8, %xmm15
+	addps	%xmm5, %xmm4
+	mulps	%xmm7, %xmm10
+	movaps	%xmm11, %xmm5
+	mulps	%xmm8, %xmm12
+	mulps	%xmm7, %xmm3
+	addps	%xmm15, %xmm10
+	unpcklps	%xmm4, %xmm9
+	movaps	%xmm12, %xmm2
+	unpckhps	%xmm4, %xmm5
+	addps	%xmm3, %xmm2
+	movaps	%xmm10, %xmm4
+	movaps	%xmm10, %xmm3
+	unpcklps	%xmm2, %xmm4
+	unpckhps	%xmm2, %xmm3
+	movaps	%xmm9, %xmm2
+	unpcklps	%xmm4, %xmm2
+	unpckhps	%xmm4, %xmm9
+	movaps	%xmm2, (%rax)
+	movaps	%xmm5, %xmm2
+	unpckhps	%xmm3, %xmm5
+	unpcklps	%xmm3, %xmm2
+	movaps	%xmm9, 16(%rax)
+	movaps	%xmm2, 32(%rax)
+	movaps	%xmm5, 48(%rax)
+	addq	$64, %rax
+	cmpl	%r10d, %r11d
+	ja	.L37
+	cmpl	%ebx, %ebp
+	leal	(%rbp,%rbp), %r9d
+	je	.L39
+.L36:
+	movslq	%r9d,%rax
+	leaq	1(%rax), %rbx
+	leaq	0(,%rax,4), %r10
+	leaq	(%rdi,%rax,8), %r8
+	leaq	(%rdi,%rbx,8), %rax
+	salq	$2, %rbx
+	leaq	(%rsi,%r10), %r11
+	leaq	(%rdx,%r10), %r10
+	addq	%rbx, %rsi
+	addq	%rbx, %rdx
+	.align 16
+.L38:
+	movaps	%xmm1, %xmm3
+	addl	$2, %r9d
+	movaps	%xmm0, %xmm2
+	mulss	(%r8), %xmm3
+	mulss	(%r11), %xmm2
+	addq	$8, %r11
+	addss	%xmm3, %xmm2
+	movaps	%xmm1, %xmm3
+	mulss	4(%r8), %xmm3
+	movss	%xmm2, (%r8)
+	movaps	%xmm0, %xmm2
+	mulss	(%r10), %xmm2
+	addq	$8, %r10
+	addss	%xmm3, %xmm2
+	movaps	%xmm1, %xmm3
+	movss	%xmm2, 4(%r8)
+	movaps	%xmm0, %xmm2
+	addq	$16, %r8
+	mulss	(%rax), %xmm3
+	mulss	(%rsi), %xmm2
+	addq	$8, %rsi
+	addss	%xmm3, %xmm2
+	movaps	%xmm1, %xmm3
+	mulss	4(%rax), %xmm3
+	movss	%xmm2, (%rax)
+	movaps	%xmm0, %xmm2
+	mulss	(%rdx), %xmm2
+	addq	$8, %rdx
+	addss	%xmm3, %xmm2
+	movss	%xmm2, 4(%rax)
+	addq	$16, %rax
+	cmpl	%r9d, %ecx
+	jg	.L38
+.L39:
+	popq	%rbx
+	popq	%rbp
+	ret
+.LFE516:
+	.size	alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE
+	.align 16
+.globl unalignedBufMixLRCoeffSSE
+	.type	unalignedBufMixLRCoeffSSE, @function
+unalignedBufMixLRCoeffSSE:
+.LFB514:
+	movl	%edx, %eax
+	shrl	$31, %eax
+	leal	(%rdx,%rax), %ecx
+	andl	$1, %ecx
+	cmpl	%eax, %ecx
+	jne	.L52
+.L44:
+	testl	%edx, %edx
+	jle	.L49
+	subl	$1, %edx
+	shrl	%edx
+	testb	$15, %dil
+	jne	.L46
+	unpcklps	%xmm1, %xmm0
+	addl	$1, %edx
+	xorps	%xmm3, %xmm3
+	xorl	%eax, %eax
+	movlhps	%xmm0, %xmm0
+	.align 16
+.L47:
+	movaps	%xmm3, %xmm2
+	addl	$1, %eax
+	movaps	%xmm3, %xmm1
+	movlps	(%rsi), %xmm2
+	movlps	(%rdi), %xmm1
+	movhps	8(%rsi), %xmm2
+	addq	$16, %rsi
+	movhps	8(%rdi), %xmm1
+	mulps	%xmm0, %xmm2
+	addps	%xmm2, %xmm1
+	movaps	%xmm1, (%rdi)
+	addq	$16, %rdi
+	cmpl	%edx, %eax
+	jb	.L47
+	rep
+	ret
+	.align 16
+.L46:
+	mov	%edx, %edx
+	xorl	%eax, %eax
+	addq	$1, %rdx
+	salq	$4, %rdx
+	.align 16
+.L48:
+	movaps	%xmm0, %xmm2
+	mulss	(%rsi,%rax), %xmm2
+	addss	(%rdi,%rax), %xmm2
+	movss	%xmm2, (%rdi,%rax)
+	movaps	%xmm1, %xmm2
+	mulss	4(%rsi,%rax), %xmm2
+	addss	4(%rdi,%rax), %xmm2
+	movss	%xmm2, 4(%rdi,%rax)
+	movaps	%xmm0, %xmm2
+	mulss	8(%rsi,%rax), %xmm2
+	addss	8(%rdi,%rax), %xmm2
+	movss	%xmm2, 8(%rdi,%rax)
+	movaps	%xmm1, %xmm2
+	mulss	12(%rsi,%rax), %xmm2
+	addss	12(%rdi,%rax), %xmm2
+	movss	%xmm2, 12(%rdi,%rax)
+	addq	$16, %rax
+	cmpq	%rdx, %rax
+	jne	.L48
+.L49:
+	rep
+	ret
+.L52:
+	movaps	%xmm0, %xmm2
+	subl	$1, %edx
+	movss	(%rdi), %xmm3
+	mulss	(%rsi), %xmm2
+	addss	%xmm3, %xmm2
+	movss	4(%rdi), %xmm3
+	movss	%xmm2, (%rdi)
+	movaps	%xmm1, %xmm2
+	mulss	4(%rsi), %xmm2
+	addq	$8, %rsi
+	addss	%xmm3, %xmm2
+	movss	%xmm2, 4(%rdi)
+	addq	$8, %rdi
+	jmp	.L44
+.LFE514:
+	.size	unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
+	.section	.eh_frame,"aw",@progbits
+.Lframe1:
+	.long	.LECIE1-.LSCIE1
+.LSCIE1:
+	.long	0x0
+	.byte	0x1
+	.string	"zR"
+	.byte	0x1
+	.byte	0x78
+	.byte	0x10
+	.byte	0x1
+	.byte	0x3
+	.byte	0xc
+	.byte	0x7
+	.byte	0x8
+	.byte	0x11
+	.byte	0x10
+	.byte	0x1
+	.align 8
+.LECIE1:
+.LSFDE1:
+	.long	.LEFDE1-.LASFDE1
+.LASFDE1:
+	.long	.LASFDE1-.Lframe1
+	.long	.LFB509
+	.long	.LFE509-.LFB509
+	.byte	0x0
+	.align 8
+.LEFDE1:
+.LSFDE3:
+	.long	.LEFDE3-.LASFDE3
+.LASFDE3:
+	.long	.LASFDE3-.Lframe1
+	.long	.LFB510
+	.long	.LFE510-.LFB510
+	.byte	0x0
+	.align 8
+.LEFDE3:
+.LSFDE5:
+	.long	.LEFDE5-.LASFDE5
+.LASFDE5:
+	.long	.LASFDE5-.Lframe1
+	.long	.LFB511
+	.long	.LFE511-.LFB511
+	.byte	0x0
+	.align 8
+.LEFDE5:
+.LSFDE7:
+	.long	.LEFDE7-.LASFDE7
+.LASFDE7:
+	.long	.LASFDE7-.Lframe1
+	.long	.LFB512
+	.long	.LFE512-.LFB512
+	.byte	0x0
+	.align 8
+.LEFDE7:
+.LSFDE9:
+	.long	.LEFDE9-.LASFDE9
+.LASFDE9:
+	.long	.LASFDE9-.Lframe1
+	.long	.LFB513
+	.long	.LFE513-.LFB513
+	.byte	0x0
+	.align 8
+.LEFDE9:
+.LSFDE11:
+	.long	.LEFDE11-.LASFDE11
+.LASFDE11:
+	.long	.LASFDE11-.Lframe1
+	.long	.LFB515
+	.long	.LFE515-.LFB515
+	.byte	0x0
+	.align 8
+.LEFDE11:
+.LSFDE13:
+	.long	.LEFDE13-.LASFDE13
+.LASFDE13:
+	.long	.LASFDE13-.Lframe1
+	.long	.LFB516
+	.long	.LFE516-.LFB516
+	.byte	0x0
+	.byte	0x4
+	.long	.LCFI0-.LFB516
+	.byte	0xe
+	.byte	0x10
+	.byte	0x4
+	.long	.LCFI1-.LCFI0
+	.byte	0xe
+	.byte	0x18
+	.byte	0x11
+	.byte	0x3
+	.byte	0x3
+	.byte	0x11
+	.byte	0x6
+	.byte	0x2
+	.align 8
+.LEFDE13:
+.LSFDE15:
+	.long	.LEFDE15-.LASFDE15
+.LASFDE15:
+	.long	.LASFDE15-.Lframe1
+	.long	.LFB514
+	.long	.LFE514-.LFB514
+	.byte	0x0
+	.align 8
+.LEFDE15:
+	.ident	"GCC: (GNU) 4.4.0 20081110 (experimental)"
--- a/src/core/basic_ops_x86_64_sse2.s
+++ b/src/core/basic_ops_x86_64_sse2.s
@@ -0,0 +1,395 @@
+	.file	"basic_ops_x86.c"
+	.text
+	.align 16
+.globl alignedMemCpySSE2
+	.type	alignedMemCpySSE2, @function
+alignedMemCpySSE2:	# copies (count >> 6) blocks of 64 bytes from (%rsi) to (%rdi); %edx = byte count
+.LFB509:
+	movslq	%edx,%rdx
+	shrq	$6, %rdx	# whole 64-byte blocks; a remainder of count % 64 bytes is not copied
+	testl	%edx, %edx
+	jle	.L4	# no complete block: return without copying
+	leal	-1(%rdx), %r9d
+	xorl	%eax, %eax	# %rax = byte offset into both buffers
+	mov	%r9d, %r8d
+	leaq	1(%r8), %rcx
+	movq	%rcx, %rdx
+	salq	$6, %rdx	# %rdx = end offset (blocks * 64)
+	.align 16
+.L3:
+	movdqa	(%rsi,%rax), %xmm0	# movdqa needs 16-byte aligned memory operands, so both buffers must be aligned
+	movdqa	%xmm0, (%rdi,%rax)
+	movdqa	16(%rsi,%rax), %xmm0
+	movdqa	%xmm0, 16(%rdi,%rax)
+	movdqa	32(%rsi,%rax), %xmm0
+	movdqa	%xmm0, 32(%rdi,%rax)
+	movdqa	48(%rsi,%rax), %xmm0
+	movdqa	%xmm0, 48(%rdi,%rax)
+	addq	$64, %rax
+	cmpq	%rdx, %rax
+	jne	.L3
+.L4:
+	rep
+	ret
+.LFE509:
+	.size	alignedMemCpySSE2, .-alignedMemCpySSE2
+	.align 16
+.globl alignedMemClearSSE2
+	.type	alignedMemClearSSE2, @function
+alignedMemClearSSE2:	# zeroes (count >> 6) blocks of 64 bytes starting at (%rdi); %esi = byte count
+.LFB510:
+	movslq	%esi,%rax
+	shrq	$6, %rax	# whole 64-byte blocks; a remainder of count % 64 bytes is left untouched
+	testl	%eax, %eax
+	jle	.L10	# no complete block: nothing to clear
+	subl	$1, %eax
+	pxor	%xmm0, %xmm0	# %xmm0 = 16 zero bytes
+	salq	$6, %rax
+	leaq	64(%rax,%rdi), %rax	# %rax = end address (start + blocks * 64)
+	.align 16
+.L9:
+	movdqa	%xmm0, (%rdi)	# aligned stores: the buffer must be 16-byte aligned
+	movdqa	%xmm0, 16(%rdi)
+	movdqa	%xmm0, 32(%rdi)
+	movdqa	%xmm0, 48(%rdi)
+	addq	$64, %rdi
+	cmpq	%rax, %rdi
+	jne	.L9
+.L10:
+	rep
+	ret
+.LFE510:
+	.size	alignedMemClearSSE2, .-alignedMemClearSSE2
+	.align 16
+.globl alignedConvertToS16SSE2
+	.type	alignedConvertToS16SSE2, @function
+alignedConvertToS16SSE2:	# %rdi = float frames in (8 bytes each), %rsi = 16-bit frames out (4 bytes each), %dx = frame count, %xmm0 = gain, %cl = byte-swap flag; returns count * 4 in %eax
+.LFB511:
+	pushq	%rbp
+.LCFI0:
+	testb	%cl, %cl	# flags consumed by the jne below (mulss/pushq/movl leave them intact)
+	movl	%edx, %eax	# %ax keeps the frame count for loop bounds and the return value
+	mulss	.LC0(%rip), %xmm0	# scale factor = gain * .LC0
+	pushq	%rbx
+.LCFI1:
+	jne	.L13	# byte-swap requested: use the swapping variants
+	testw	%dx, %dx
+	jle	.L15	# no frames: just return
+	movl	%edx, %ebx
+	shrw	$2, %bx	# %bx = vector iterations (4 frames each)
+	cmpw	$3, %dx
+	leal	0(,%rbx,4), %r8d	# %r8d = number of frames the vector loop covers
+	ja	.L33	# more than 3 frames: take the vector path
+.L28:
+	xorl	%r8d, %r8d	# scalar path starts at frame 0
+	.align 16
+.L23:
+	movswq	%r8w,%rdx	# scalar loop setup: continue at frame %r8w
+	movl	$32767, %ebx	# upper clamp value
+	leaq	(%rdi,%rdx,8), %rcx	# %rcx = input cursor
+	leaq	(%rsi,%rdx,4), %rdx	# %rdx = output cursor
+	movl	$-32768, %edi	# lower clamp value
+	.align 16
+.L25:
+	movaps	%xmm0, %xmm1	# scalar loop: one frame (two samples) per iteration
+	mulss	(%rcx), %xmm1
+	cvttss2si	%xmm1, %esi	# truncate the scaled sample to an integer
+	movaps	%xmm0, %xmm1
+	mulss	4(%rcx), %xmm1
+	cmpl	$-32768, %esi
+	cmovl	%edi, %esi	# clamp to >= -32768
+	cmpl	$32767, %esi
+	cmovg	%ebx, %esi	# clamp to <= 32767
+	movw	%si, (%rdx)
+	cvttss2si	%xmm1, %esi
+	cmpl	$-32768, %esi
+	cmovl	%edi, %esi
+	cmpl	$32767, %esi
+	cmovg	%ebx, %esi
+	addl	$1, %r8d
+	addq	$8, %rcx
+	movw	%si, 2(%rdx)
+	addq	$4, %rdx
+	cmpw	%r8w, %ax
+	jg	.L25
+.L15:
+	cwtl	# common exit: sign-extend the 16-bit frame count
+	popq	%rbx
+	sall	$2, %eax	# return value = frame count * 4
+	popq	%rbp
+	ret
+	.align 16
+.L13:
+	testw	%dx, %dx	# byte-swapping variant, same structure as above
+	jle	.L15
+	movl	%edx, %ebx
+	shrw	$2, %bx
+	cmpw	$3, %dx
+	leal	0(,%rbx,4), %r8d
+	ja	.L34
+.L27:
+	xorl	%r8d, %r8d
+	.align 16
+.L18:
+	movswq	%r8w,%rdx
+	leaq	(%rdi,%rdx,8), %rcx
+	leaq	(%rsi,%rdx,4), %rdx
+	movl	$-32768, %edi
+	movl	$32767, %esi
+	.align 16
+.L20:
+	movaps	%xmm0, %xmm1	# scalar loop with byte swap: one frame per iteration
+	mulss	(%rcx), %xmm1
+	cvttss2si	%xmm1, %ebx
+	movaps	%xmm0, %xmm1
+	mulss	4(%rcx), %xmm1
+	cmpl	$-32768, %ebx
+	cmovl	%edi, %ebx
+	cmpl	$32767, %ebx
+	cmovg	%esi, %ebx
+	movzbl	%bh, %ebp	# %ebp = high byte of the 16-bit sample
+	sall	$8, %ebx	# low byte moves into the high position
+	orl	%ebp, %ebx	# low 16 bits of %ebx now hold the byte-swapped sample
+	movw	%bx, (%rdx)
+	cvttss2si	%xmm1, %ebx
+	cmpl	$-32768, %ebx
+	cmovl	%edi, %ebx
+	cmpl	$32767, %ebx
+	cmovg	%esi, %ebx
+	addl	$1, %r8d
+	addq	$8, %rcx
+	movzbl	%bh, %ebp
+	sall	$8, %ebx
+	orl	%ebp, %ebx
+	movw	%bx, 2(%rdx)
+	addq	$4, %rdx
+	cmpw	%r8w, %ax
+	jg	.L20
+	cwtl
+	popq	%rbx
+	sall	$2, %eax
+	popq	%rbp
+	ret
+	.align 16
+.L34:
+	testw	%r8w, %r8w	# vector path with byte swap
+	je	.L27
+	movaps	%xmm0, %xmm1
+	movq	%rdi, %rcx
+	movdqa	.LC1(%rip), %xmm2	# %xmm2 = four copies of the lower clamp (.LC1)
+	movq	%rsi, %r10
+	shufps	$0, %xmm1, %xmm1	# broadcast the scale factor to all four lanes
+	xorl	%r9d, %r9d
+	movdqa	.LC3(%rip), %xmm8	# %xmm8 = 0xff00 mask used for the byte swap
+	movaps	%xmm1, %xmm9
+	movdqa	.LC2(%rip), %xmm1	# %xmm1 = four copies of the upper clamp (.LC2)
+	.align 16
+.L19:
+	movaps	%xmm9, %xmm4	# vector loop: 4 frames (32 bytes in, 16 bytes out) per iteration
+	addl	$1, %r9d
+	movaps	%xmm9, %xmm3
+	mulps	(%rcx), %xmm4	# memory operand of mulps must be 16-byte aligned
+	movdqa	%xmm1, %xmm6
+	mulps	16(%rcx), %xmm3
+	addq	$32, %rcx
+	cvttps2dq	%xmm4, %xmm4
+	movdqa	%xmm4, %xmm5
+	pcmpgtd	%xmm2, %xmm5	# mask of lanes above the lower clamp
+	cvttps2dq	%xmm3, %xmm3
+	pand	%xmm5, %xmm4
+	pandn	%xmm2, %xmm5
+	por	%xmm5, %xmm4	# select: max(value, lower clamp)
+	movdqa	%xmm4, %xmm5
+	pcmpgtd	%xmm1, %xmm5	# mask of lanes above the upper clamp
+	pand	%xmm5, %xmm6
+	pandn	%xmm4, %xmm5
+	movdqa	%xmm5, %xmm4
+	movdqa	%xmm3, %xmm5
+	por	%xmm6, %xmm4	# select: min(value, upper clamp)
+	movdqa	%xmm1, %xmm6
+	pcmpgtd	%xmm2, %xmm5
+	pand	%xmm5, %xmm3
+	pandn	%xmm2, %xmm5
+	movdqa	%xmm4, %xmm7
+	pslld	$8, %xmm4	# low byte shifted up ...
+	pand	%xmm8, %xmm7
+	por	%xmm5, %xmm3
+	psrad	$8, %xmm7	# ... and high byte shifted down (byte swap halves)
+	movdqa	%xmm3, %xmm5
+	pcmpgtd	%xmm1, %xmm5
+	pand	%xmm5, %xmm6
+	pandn	%xmm3, %xmm5
+	movdqa	%xmm5, %xmm3
+	por	%xmm6, %xmm3
+	movdqa	%xmm7, %xmm6
+	movdqa	%xmm3, %xmm5
+	pslld	$8, %xmm3
+	pand	%xmm8, %xmm5
+	psrad	$8, %xmm5
+	punpcklwd	%xmm5, %xmm7	# interleave sequences pack the low 16 bits of each 32-bit lane
+	punpckhwd	%xmm5, %xmm6
+	movdqa	%xmm4, %xmm5
+	punpcklwd	%xmm3, %xmm4
+	movdqa	%xmm7, %xmm10
+	punpckhwd	%xmm3, %xmm5
+	punpcklwd	%xmm6, %xmm7
+	punpckhwd	%xmm6, %xmm10
+	punpcklwd	%xmm10, %xmm7
+	movdqa	%xmm4, %xmm10
+	punpcklwd	%xmm5, %xmm4
+	punpckhwd	%xmm5, %xmm10
+	punpcklwd	%xmm10, %xmm4
+	por	%xmm7, %xmm4	# combine the two shifted halves into swapped 16-bit samples
+	movdqa	%xmm4, (%r10)	# aligned store: output must be 16-byte aligned
+	addq	$16, %r10
+	cmpw	%r9w, %bx
+	ja	.L19
+	cmpw	%dx, %r8w
+	jne	.L18	# leftover frames (count % 4) go through the scalar loop
+	jmp	.L15
+	.align 16
+.L33:
+	testw	%r8w, %r8w	# vector path without byte swap
+	je	.L28
+	movaps	%xmm0, %xmm1
+	movq	%rdi, %rcx
+	movdqa	.LC1(%rip), %xmm2	# lower clamp in all lanes
+	movq	%rsi, %r10
+	shufps	$0, %xmm1, %xmm1	# broadcast the scale factor
+	xorl	%r9d, %r9d
+	movaps	%xmm1, %xmm6
+	movdqa	.LC2(%rip), %xmm1	# upper clamp in all lanes
+	.align 16
+.L24:
+	movaps	%xmm6, %xmm4	# vector loop: 4 frames per iteration
+	addl	$1, %r9d
+	movaps	%xmm6, %xmm3
+	mulps	(%rcx), %xmm4
+	movdqa	%xmm1, %xmm7
+	mulps	16(%rcx), %xmm3
+	addq	$32, %rcx
+	cvttps2dq	%xmm4, %xmm4
+	movdqa	%xmm4, %xmm5
+	pcmpgtd	%xmm2, %xmm5
+	cvttps2dq	%xmm3, %xmm3
+	pand	%xmm5, %xmm4
+	pandn	%xmm2, %xmm5
+	por	%xmm5, %xmm4
+	movdqa	%xmm4, %xmm5
+	pcmpgtd	%xmm1, %xmm5
+	pand	%xmm5, %xmm7
+	pandn	%xmm4, %xmm5
+	movdqa	%xmm5, %xmm4
+	movdqa	%xmm3, %xmm5
+	por	%xmm7, %xmm4
+	movdqa	%xmm1, %xmm7
+	pcmpgtd	%xmm2, %xmm5
+	pand	%xmm5, %xmm3
+	pandn	%xmm2, %xmm5
+	por	%xmm5, %xmm3
+	movdqa	%xmm3, %xmm5
+	pcmpgtd	%xmm1, %xmm5
+	pand	%xmm5, %xmm7
+	pandn	%xmm3, %xmm5
+	movdqa	%xmm5, %xmm3
+	movdqa	%xmm4, %xmm5
+	por	%xmm7, %xmm3
+	punpcklwd	%xmm3, %xmm4	# pack the low 16 bits of the eight 32-bit results
+	punpckhwd	%xmm3, %xmm5
+	movdqa	%xmm4, %xmm7
+	punpcklwd	%xmm5, %xmm4
+	punpckhwd	%xmm5, %xmm7
+	punpcklwd	%xmm7, %xmm4
+	movdqa	%xmm4, (%r10)
+	addq	$16, %r10
+	cmpw	%r9w, %bx
+	ja	.L24
+	cmpw	%r8w, %dx
+	jne	.L23	# leftover frames (count % 4) go through the scalar loop
+	jmp	.L15
+.LFE511:
+	.size	alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
+	.section	.rodata
+	.align 4
+.LC0:
+	.long	1191181824	# 0x46fffe00 = 32767.0f, multiplied into the gain by alignedConvertToS16SSE2
+	.align 16
+.LC1:
+	.long	-32768	# lower clamp for 16-bit output, replicated in all four lanes
+	.long	-32768
+	.long	-32768
+	.long	-32768
+	.align 16
+.LC2:
+	.long	32767	# upper clamp for 16-bit output, replicated in all four lanes
+	.long	32767
+	.long	32767
+	.long	32767
+	.align 16
+.LC3:
+	.long	65280	# 0xff00: selects the high byte of a 16-bit sample in the byte-swap path
+	.long	65280
+	.long	65280
+	.long	65280
+	.section	.eh_frame,"aw",@progbits
+.Lframe1:
+	.long	.LECIE1-.LSCIE1
+.LSCIE1:
+	.long	0x0
+	.byte	0x1
+	.string	"zR"
+	.byte	0x1
+	.byte	0x78
+	.byte	0x10
+	.byte	0x1
+	.byte	0x3
+	.byte	0xc
+	.byte	0x7
+	.byte	0x8
+	.byte	0x11
+	.byte	0x10
+	.byte	0x1
+	.align 8
+.LECIE1:
+.LSFDE1:
+	.long	.LEFDE1-.LASFDE1
+.LASFDE1:
+	.long	.LASFDE1-.Lframe1
+	.long	.LFB509
+	.long	.LFE509-.LFB509
+	.byte	0x0
+	.align 8
+.LEFDE1:
+.LSFDE3:
+	.long	.LEFDE3-.LASFDE3
+.LASFDE3:
+	.long	.LASFDE3-.Lframe1
+	.long	.LFB510
+	.long	.LFE510-.LFB510
+	.byte	0x0
+	.align 8
+.LEFDE3:
+.LSFDE5:
+	.long	.LEFDE5-.LASFDE5
+.LASFDE5:
+	.long	.LASFDE5-.Lframe1
+	.long	.LFB511
+	.long	.LFE511-.LFB511
+	.byte	0x0
+	.byte	0x4
+	.long	.LCFI0-.LFB511
+	.byte	0xe
+	.byte	0x10
+	.byte	0x4
+	.long	.LCFI1-.LCFI0
+	.byte	0xe
+	.byte	0x18
+	.byte	0x11
+	.byte	0x3
+	.byte	0x3
+	.byte	0x11
+	.byte	0x6
+	.byte	0x2
+	.align 8
+.LEFDE5:
+	.ident	"GCC: (GNU) 4.4.0 20081110 (experimental)"; .section	.note.GNU-stack,"",@progbits	# declare that this object needs no executable stack (same marker the 32-bit .s files carry)
--- a/src/core/basic_ops_x86_mmx.s
+++ b/src/core/basic_ops_x86_mmx.s
@@ -0,0 +1,107 @@
+	.file	"basic_ops_x86.c"
+	.text
+	.p2align 4,,15
+.globl alignedMemCpyMMX
+	.type	alignedMemCpyMMX, @function
+alignedMemCpyMMX:	# copies (size >> 6) blocks of 64 bytes using MMX registers; stack args: dst, src, size
+	pushl	%ebx
+	subl	$112, %esp	# room for the 108-byte x87/MMX state image at 4(%esp)
+	movl	128(%esp), %ebx	# %ebx = size in bytes
+	movl	124(%esp), %eax	# %eax = source pointer
+	shrl	$6, %ebx	# whole 64-byte blocks; a remainder of size % 64 bytes is not copied
+#APP
+# 42 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+	 fsave 4(%esp); fwait
+
+# 0 "" 2
+# 44 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+	1: prefetchnta (%eax)
+   prefetchnta 64(%eax)
+   prefetchnta 128(%eax)
+   prefetchnta 192(%eax)
+   prefetchnta 256(%eax)
+
+# 0 "" 2
+#NO_APP
+	testl	%ebx, %ebx
+	je	.L2	# no complete block: skip the copy loop
+	movl	120(%esp), %ecx	# %ecx = destination pointer
+	xorl	%edx, %edx	# %edx = blocks copied so far
+	.p2align 4,,7
+	.p2align 3
+.L3:
+#APP
+# 53 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+	1: prefetchnta 320(%eax)
+2: movq (%eax), %mm0
+   movq 8(%eax), %mm1
+   movq 16(%eax), %mm2
+   movq 24(%eax), %mm3
+   movq %mm0, (%ecx)
+   movq %mm1, 8(%ecx)
+   movq %mm2, 16(%ecx)
+   movq %mm3, 24(%ecx)
+   movq 32(%eax), %mm0
+   movq 40(%eax), %mm1
+   movq 48(%eax), %mm2
+   movq 56(%eax), %mm3
+   movq %mm0, 32(%ecx)
+   movq %mm1, 40(%ecx)
+   movq %mm2, 48(%ecx)
+   movq %mm3, 56(%ecx)
+
+# 0 "" 2
+#NO_APP
+	addl	$1, %edx
+	addl	$64, %eax
+	addl	$64, %ecx
+	cmpl	%edx, %ebx
+	jne	.L3
+.L2:
+#APP
+# 75 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+	 frstor 4(%esp)	# restore the state saved on entry (was a second fsave, which overwrote the image and reset the FPU instead); the inline asm at basic_ops_x86.c line 75 needs the same fix
+
+# 0 "" 2
+#NO_APP
+	addl	$112, %esp
+	popl	%ebx
+	ret
+	.size	alignedMemCpyMMX, .-alignedMemCpyMMX
+	.p2align 4,,15
+.globl alignedMemClearMMX
+	.type	alignedMemClearMMX, @function
+alignedMemClearMMX:	# zeroes (size >> 6) blocks of 64 bytes using MMX stores; stack args: dst, size
+	movl	8(%esp), %ecx	# %ecx = size in bytes
+	shrl	$6, %ecx	# whole 64-byte blocks; a remainder of size % 64 bytes is left untouched
+	testl	%ecx, %ecx
+	je	.L8
+	movl	4(%esp), %edx	# %edx = destination pointer
+	xorl	%eax, %eax	# %eax = blocks cleared so far
+	pxor	%mm0, %mm0	# %mm0 = 8 zero bytes
+	.p2align 4,,7
+	.p2align 3
+.L9:
+#APP
+# 90 "/home/toby/development/svn/lmms-trunk/src/core/basic_ops_x86.c" 1
+	movq    %mm0, (%edx)
+movq    %mm0, 8(%edx)
+movq    %mm0, 16(%edx)
+movq    %mm0, 24(%edx)
+movq    %mm0, 32(%edx)
+movq    %mm0, 40(%edx)
+movq    %mm0, 48(%edx)
+movq    %mm0, 56(%edx)
+
+# 0 "" 2
+#NO_APP
+	addl	$1, %eax
+	addl	$64, %edx
+	cmpl	%eax, %ecx
+	jne	.L9
+.L8:
+	emms	# leave MMX state so later x87 code works
+	ret
+	.size	alignedMemClearMMX, .-alignedMemClearMMX
+	.ident	"GCC: (GNU) 4.4.0 20081110 (experimental)"
+	.section	.note.GNU-stack,"",@progbits
--- a/src/core/basic_ops_x86_sse.s
+++ b/src/core/basic_ops_x86_sse.s
@@ -0,0 +1,505 @@
+	.file	"basic_ops_x86.c"
+	.text
+	.p2align 4,,15
+.globl alignedMemCpySSE
+	.type	alignedMemCpySSE, @function
+alignedMemCpySSE:	# copies (size >> 6) blocks of 64 bytes with movaps; stack args: dst, src, size
+	pushl	%esi
+	pushl	%ebx
+	movl	20(%esp), %esi	# %esi = size in bytes
+	movl	12(%esp), %edx	# %edx = destination pointer
+	movl	16(%esp), %ecx	# %ecx = source pointer
+	shrl	$6, %esi	# whole 64-byte blocks; a remainder of size % 64 bytes is not copied
+	testl	%esi, %esi
+	je	.L4
+	xorl	%eax, %eax	# %eax = byte offset
+	xorl	%ebx, %ebx	# %ebx = blocks copied so far
+	.p2align 4,,7
+	.p2align 3
+.L3:
+	movaps	(%ecx,%eax), %xmm0	# movaps needs 16-byte aligned addresses, so both buffers must be aligned
+	addl	$1, %ebx
+	movaps	%xmm0, (%edx,%eax)
+	movaps	16(%ecx,%eax), %xmm0
+	movaps	%xmm0, 16(%edx,%eax)
+	movaps	32(%ecx,%eax), %xmm0
+	movaps	%xmm0, 32(%edx,%eax)
+	movaps	48(%ecx,%eax), %xmm0
+	movaps	%xmm0, 48(%edx,%eax)
+	addl	$64, %eax
+	cmpl	%ebx, %esi
+	jne	.L3
+.L4:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	alignedMemCpySSE, .-alignedMemCpySSE
+	.p2align 4,,15
+.globl alignedMemClearSSE
+	.type	alignedMemClearSSE, @function
+alignedMemClearSSE:	# zeroes (size >> 6) blocks of 64 bytes with movaps; stack args: dst, size
+	movl	8(%esp), %ecx	# %ecx = size in bytes
+	shrl	$6, %ecx	# whole 64-byte blocks; a remainder of size % 64 bytes is left untouched
+	testl	%ecx, %ecx
+	je	.L10
+	movl	4(%esp), %eax	# %eax = destination pointer
+	xorps	%xmm0, %xmm0	# %xmm0 = 16 zero bytes
+	xorl	%edx, %edx	# %edx = blocks cleared so far
+	.p2align 4,,7
+	.p2align 3
+.L9:
+	addl	$1, %edx
+	movaps	%xmm0, (%eax)	# aligned stores: the buffer must be 16-byte aligned
+	movaps	%xmm0, 16(%eax)
+	movaps	%xmm0, 32(%eax)
+	movaps	%xmm0, 48(%eax)
+	addl	$64, %eax
+	cmpl	%edx, %ecx
+	jne	.L9
+.L10:
+	rep
+	ret
+	.size	alignedMemClearSSE, .-alignedMemClearSSE
+	.p2align 4,,15
+.globl alignedBufApplyGainSSE
+	.type	alignedBufApplyGainSSE, @function
+alignedBufApplyGainSSE:	# multiplies every float in a buffer by a gain, in place; stack args: buf, gain, count
+	movl	12(%esp), %ecx	# %ecx = count (8-byte elements, presumably stereo float frames)
+	testl	%ecx, %ecx
+	jle	.L15	# count <= 0: nothing to do
+	movss	8(%esp), %xmm0	# %xmm0 = gain
+	subl	$1, %ecx
+	movl	4(%esp), %eax	# %eax = buffer pointer
+	shrl	$3, %ecx
+	xorl	%edx, %edx
+	addl	$1, %ecx	# iterations = ((count - 1) >> 3) + 1, i.e. count rounded up to a multiple of 8
+	shufps	$0, %xmm0, %xmm0	# broadcast the gain to all four lanes
+	.p2align 4,,7
+	.p2align 3
+.L14:
+	movaps	%xmm0, %xmm3	# each iteration scales 64 bytes (16 floats)
+	addl	$1, %edx
+	movaps	%xmm0, %xmm2
+	movaps	%xmm0, %xmm1
+	movaps	%xmm0, %xmm4
+	mulps	16(%eax), %xmm3	# mulps memory operands must be 16-byte aligned
+	mulps	32(%eax), %xmm2
+	mulps	48(%eax), %xmm1
+	movaps	%xmm3, 16(%eax)
+	mulps	(%eax), %xmm4
+	movaps	%xmm2, 32(%eax)
+	movaps	%xmm1, 48(%eax)
+	movaps	%xmm4, (%eax)
+	addl	$64, %eax
+	cmpl	%edx, %ecx
+	ja	.L14
+.L15:
+	rep
+	ret
+	.size	alignedBufApplyGainSSE, .-alignedBufApplyGainSSE
+	.p2align 4,,15
+.globl alignedBufMixSSE
+	.type	alignedBufMixSSE, @function
+alignedBufMixSSE:	# adds a source buffer onto a destination buffer (dst += src); stack args: dst, src, count
+	pushl	%esi
+	pushl	%ebx
+	movl	20(%esp), %esi	# %esi = count (8-byte elements, presumably stereo float frames)
+	movl	12(%esp), %edx	# %edx = destination pointer
+	movl	16(%esp), %ecx	# %ecx = source pointer
+	testl	%esi, %esi
+	jle	.L20	# count <= 0: nothing to do
+	subl	$1, %esi
+	xorl	%eax, %eax	# %eax = byte offset
+	shrl	$3, %esi
+	xorl	%ebx, %ebx
+	addl	$1, %esi	# iterations = ((count - 1) >> 3) + 1, i.e. count rounded up to a multiple of 8
+	.p2align 4,,7
+	.p2align 3
+.L19:
+	movaps	16(%edx,%eax), %xmm2	# each iteration sums 64 bytes; movaps/addps need both buffers 16-byte aligned
+	addl	$1, %ebx
+	movaps	32(%edx,%eax), %xmm1
+	movaps	48(%edx,%eax), %xmm0
+	movaps	(%edx,%eax), %xmm3
+	addps	16(%ecx,%eax), %xmm2
+	addps	32(%ecx,%eax), %xmm1
+	addps	48(%ecx,%eax), %xmm0
+	addps	(%ecx,%eax), %xmm3
+	movaps	%xmm2, 16(%edx,%eax)
+	movaps	%xmm3, (%edx,%eax)
+	movaps	%xmm1, 32(%edx,%eax)
+	movaps	%xmm0, 48(%edx,%eax)
+	addl	$64, %eax
+	cmpl	%ebx, %esi
+	ja	.L19
+.L20:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	alignedBufMixSSE, .-alignedBufMixSSE
+	.p2align 4,,15
+.globl alignedBufMixLRCoeffSSE
+	.type	alignedBufMixLRCoeffSSE, @function
+alignedBufMixLRCoeffSSE:	# dst += src * {c0, c1} per interleaved pair; stack args: dst, src, c0, c1, count
+	pushl	%esi
+	pushl	%ebx
+	movl	28(%esp), %esi	# %esi = count (8-byte elements, presumably stereo float frames)
+	movl	12(%esp), %edx	# %edx = destination pointer
+	movl	16(%esp), %ebx	# %ebx = source pointer
+	testl	%esi, %esi
+	jle	.L25	# count <= 0: nothing to do
+	movss	24(%esp), %xmm0	# second coefficient (c1)
+	subl	$1, %esi
+	movss	20(%esp), %xmm1	# first coefficient (c0)
+	xorl	%eax, %eax	# %eax = byte offset
+	shrl	$2, %esi
+	xorl	%ecx, %ecx
+	addl	$1, %esi	# iterations = ((count - 1) >> 2) + 1, i.e. count rounded up to a multiple of 4
+	unpcklps	%xmm0, %xmm1	# low half of %xmm1 = {c0, c1}
+	movaps	%xmm1, %xmm0
+	movlhps	%xmm1, %xmm0	# %xmm0 = {c0, c1, c0, c1}
+	.p2align 4,,7
+	.p2align 3
+.L24:
+	movaps	%xmm0, %xmm1	# each iteration handles 32 bytes; both buffers must be 16-byte aligned
+	addl	$1, %ecx
+	movaps	%xmm0, %xmm2
+	mulps	16(%ebx,%eax), %xmm1
+	mulps	(%ebx,%eax), %xmm2
+	addps	16(%edx,%eax), %xmm1
+	addps	(%edx,%eax), %xmm2
+	movaps	%xmm1, 16(%edx,%eax)
+	movaps	%xmm2, (%edx,%eax)
+	addl	$32, %eax
+	cmpl	%ecx, %esi
+	ja	.L24
+.L25:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	alignedBufMixLRCoeffSSE, .-alignedBufMixLRCoeffSSE
+	.p2align 4,,15
+.globl alignedBufWetDryMixSSE
+	.type	alignedBufWetDryMixSSE, @function
+alignedBufWetDryMixSSE:	# dst = dst * b + src * a; stack args: dst, src, a, b, count
+	pushl	%esi
+	pushl	%ebx
+	movl	28(%esp), %esi	# %esi = count (8-byte elements, presumably stereo float frames)
+	movl	12(%esp), %edx	# %edx = destination pointer
+	movl	16(%esp), %ebx	# %ebx = source pointer
+	testl	%esi, %esi
+	jle	.L30	# count <= 0: nothing to do
+	movss	24(%esp), %xmm1	# fourth argument: factor applied to dst
+	subl	$1, %esi
+	movss	20(%esp), %xmm0	# third argument: factor applied to src
+	xorl	%eax, %eax	# %eax = byte offset
+	shrl	$2, %esi
+	xorl	%ecx, %ecx
+	shufps	$0, %xmm1, %xmm1	# broadcast the dst factor
+	addl	$1, %esi	# iterations = ((count - 1) >> 2) + 1, i.e. count rounded up to a multiple of 4
+	shufps	$0, %xmm0, %xmm0	# broadcast the src factor
+	.p2align 4,,7
+	.p2align 3
+.L29:
+	movaps	%xmm1, %xmm3	# each iteration handles 32 bytes; both buffers must be 16-byte aligned
+	addl	$1, %ecx
+	movaps	%xmm0, %xmm2
+	movaps	%xmm1, %xmm4
+	mulps	16(%edx,%eax), %xmm3
+	mulps	16(%ebx,%eax), %xmm2
+	mulps	(%edx,%eax), %xmm4
+	addps	%xmm3, %xmm2
+	movaps	%xmm0, %xmm3
+	mulps	(%ebx,%eax), %xmm3
+	movaps	%xmm2, 16(%edx,%eax)
+	addps	%xmm4, %xmm3
+	movaps	%xmm3, (%edx,%eax)
+	addl	$32, %eax
+	cmpl	%ecx, %esi
+	ja	.L29
+.L30:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	alignedBufWetDryMixSSE, .-alignedBufWetDryMixSSE
+	.p2align 4,,15
+.globl alignedBufWetDryMixSplittedSSE
+	.type	alignedBufWetDryMixSplittedSSE, @function
+alignedBufWetDryMixSplittedSSE:	# interleaved dst mixed with two separate per-channel sources; stack args: dst, src0, src1, a, b, count
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	subl	$140, %esp	# spill area for the vectorised loop
+	movl	180(%esp), %eax	# %eax = count
+	movl	160(%esp), %edx	# %edx = interleaved destination
+	movl	164(%esp), %esi	# %esi = source for the first channel
+	movl	168(%esp), %ecx	# %ecx = source for the second channel
+	testl	%eax, %eax
+	movss	172(%esp), %xmm4	# %xmm4 = factor applied to the sources
+	movss	176(%esp), %xmm5	# %xmm5 = factor applied to dst
+	jle	.L39	# count <= 0: nothing to do
+	movl	180(%esp), %eax
+	subl	$1, %eax
+	shrl	%eax
+	addl	$1, %eax	# %eax = ((count - 1) >> 1) + 1 = scalar-loop iterations (2 elements each)
+	movl	%eax, %ebp
+	movl	%eax, 112(%esp)
+	shrl	$2, %ebp	# %ebp = vector-loop iterations (4 scalar iterations each)
+	cmpl	$3, 112(%esp)
+	leal	0(,%ebp,4), %eax
+	movl	%eax, 116(%esp)	# scalar iterations covered by the vector loop
+	jbe	.L40	# too few iterations: scalar loop only
+	testl	%eax, %eax
+	jne	.L34
+.L40:
+	xorl	%edi, %edi	# scalar loop starts at element 0
+	jmp	.L36
+	.p2align 4,,7
+	.p2align 3
+.L34:
+	movaps	%xmm4, %xmm2
+	xorps	%xmm6, %xmm6
+	shufps	$0, %xmm2, %xmm2	# broadcast the source factor
+	movaps	%xmm5, %xmm1
+	movl	%esi, %ebx
+	shufps	$0, %xmm1, %xmm1	# broadcast the dst factor
+	movaps	%xmm2, 32(%esp)	# broadcast source factor lives at 32(%esp) during the loop
+	xorl	%eax, %eax
+	xorl	%edi, %edi
+	movss	%xmm5, 124(%esp)	# save the scalar factors for the tail loop
+	movss	%xmm4, 120(%esp)
+	movaps	%xmm1, %xmm4	# %xmm4 = broadcast dst factor inside the vector loop
+	.p2align 4,,7
+	.p2align 3
+.L37:
+	movaps	16(%edx,%eax,2), %xmm3	# vector loop: 64 bytes of dst per iteration (aligned loads)
+	addl	$1, %edi
+	movaps	(%edx,%eax,2), %xmm2
+	movaps	48(%edx,%eax,2), %xmm0
+	movaps	%xmm2, %xmm5
+	shufps	$221, %xmm3, %xmm2	# shuffles de-interleave dst into per-channel vectors
+	movaps	32(%edx,%eax,2), %xmm1
+	shufps	$136, %xmm3, %xmm5
+	movaps	%xmm2, 96(%esp)
+	movaps	%xmm1, %xmm7
+	shufps	$221, %xmm0, %xmm1
+	shufps	$136, %xmm0, %xmm7
+	movaps	%xmm1, 64(%esp)
+	movaps	%xmm6, %xmm3
+	movaps	%xmm5, (%esp)
+	shufps	$136, %xmm7, %xmm5
+	movlps	(%ebx), %xmm3	# movlps/movhps pairs load the first source without an alignment requirement
+	movaps	%xmm6, %xmm2
+	movhps	8(%ebx), %xmm3
+	movaps	%xmm7, 80(%esp)
+	movlps	16(%ebx), %xmm2
+	movhps	24(%ebx), %xmm2
+	movaps	96(%esp), %xmm7
+	addl	$32, %ebx
+	movaps	%xmm3, %xmm0
+	shufps	$221, %xmm2, %xmm3
+	shufps	$136, %xmm2, %xmm0
+	shufps	$136, 64(%esp), %xmm7
+	mulps	32(%esp), %xmm0
+	movaps	%xmm6, %xmm1
+	movlps	(%ecx,%eax), %xmm1	# second source, likewise loaded in 8-byte halves
+	movhps	8(%ecx,%eax), %xmm1
+	movaps	96(%esp), %xmm2
+	mulps	%xmm4, %xmm7
+	shufps	$221, 64(%esp), %xmm2
+	mulps	%xmm4, %xmm5
+	mulps	32(%esp), %xmm3
+	movaps	%xmm7, 16(%esp)
+	movaps	%xmm1, %xmm7
+	addps	%xmm0, %xmm5
+	movaps	%xmm6, %xmm0
+	movlps	16(%ecx,%eax), %xmm0
+	movhps	24(%ecx,%eax), %xmm0
+	shufps	$136, %xmm0, %xmm7
+	shufps	$221, %xmm0, %xmm1
+	mulps	32(%esp), %xmm7
+	mulps	32(%esp), %xmm1
+	mulps	%xmm4, %xmm2
+	movaps	%xmm7, 48(%esp)
+	movaps	16(%esp), %xmm7
+	addps	48(%esp), %xmm7
+	addps	%xmm1, %xmm2
+	movaps	%xmm7, 16(%esp)
+	movaps	(%esp), %xmm7
+	shufps	$221, 80(%esp), %xmm7
+	movaps	16(%esp), %xmm1
+	mulps	%xmm4, %xmm7
+	movaps	16(%esp), %xmm0
+	unpckhps	%xmm2, %xmm1	# unpck sequences re-interleave the results for storing
+	unpcklps	%xmm2, %xmm0
+	movaps	%xmm1, %xmm2
+	addps	%xmm3, %xmm7
+	movaps	%xmm5, %xmm3
+	unpcklps	%xmm7, %xmm3
+	unpckhps	%xmm7, %xmm5
+	movaps	%xmm3, %xmm1
+	unpckhps	%xmm0, %xmm3
+	unpcklps	%xmm0, %xmm1
+	movaps	%xmm5, %xmm0
+	unpckhps	%xmm2, %xmm5
+	unpcklps	%xmm2, %xmm0
+	movaps	%xmm1, (%edx,%eax,2)	# aligned stores back into dst
+	movaps	%xmm3, 16(%edx,%eax,2)
+	movaps	%xmm0, 32(%edx,%eax,2)
+	movaps	%xmm5, 48(%edx,%eax,2)
+	addl	$32, %eax
+	cmpl	%edi, %ebp
+	ja	.L37
+	movl	116(%esp), %edi
+	movl	112(%esp), %eax
+	movss	120(%esp), %xmm4	# restore the scalar factors
+	movss	124(%esp), %xmm5
+	addl	%edi, %edi	# %edi = first element index not handled by the vector loop
+	cmpl	%eax, 116(%esp)
+	je	.L39	# vector loop covered everything
+.L36:
+	leal	(%edx,%edi,8), %ebx	# %ebx = &dst[start]
+	xorl	%ebp, %ebp
+	leal	8(%edx,%edi,8), %edx	# %edx = &dst[start + 1]
+	movl	%edi, %eax
+	.p2align 4,,7
+	.p2align 3
+.L38:
+	movaps	%xmm5, %xmm1	# scalar loop: two dst elements per iteration
+	addl	$2, %ebp
+	movaps	%xmm4, %xmm0
+	mulss	(%ebx), %xmm1
+	mulss	(%esi,%eax,4), %xmm0
+	addss	%xmm1, %xmm0	# dst[i][0] = dst[i][0] * b + src0[i] * a
+	movaps	%xmm5, %xmm1
+	movss	%xmm0, (%ebx)
+	movaps	%xmm4, %xmm0
+	mulss	4(%ebx), %xmm1
+	mulss	(%ecx,%eax,4), %xmm0
+	addss	%xmm1, %xmm0	# dst[i][1] = dst[i][1] * b + src1[i] * a
+	movaps	%xmm5, %xmm1
+	movss	%xmm0, 4(%ebx)
+	addl	$16, %ebx
+	movaps	%xmm4, %xmm0
+	mulss	(%edx), %xmm1
+	mulss	4(%esi,%eax,4), %xmm0
+	addss	%xmm1, %xmm0
+	movaps	%xmm5, %xmm1
+	movss	%xmm0, (%edx)
+	movaps	%xmm4, %xmm0
+	mulss	4(%edx), %xmm1
+	mulss	4(%ecx,%eax,4), %xmm0
+	leal	(%edi,%ebp), %eax
+	addss	%xmm1, %xmm0
+	movss	%xmm0, 4(%edx)
+	addl	$16, %edx
+	cmpl	%eax, 180(%esp)
+	jg	.L38
+.L39:
+	addl	$140, %esp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	ret
+	.size	alignedBufWetDryMixSplittedSSE, .-alignedBufWetDryMixSplittedSSE
+	.p2align 4,,15
+.globl unalignedBufMixLRCoeffSSE
+	.type	unalignedBufMixLRCoeffSSE, @function
+unalignedBufMixLRCoeffSSE:	# dst += src * {c0, c1} per interleaved pair, without assuming alignment; stack args: dst, src, c0, c1, count
+	pushl	%esi
+	pushl	%ebx
+	movl	28(%esp), %esi	# %esi = count (8-byte elements)
+	movl	12(%esp), %eax	# %eax = destination pointer
+	movl	16(%esp), %edx	# %edx = source pointer
+	movss	20(%esp), %xmm0	# %xmm0 = c0
+	movl	%esi, %ecx
+	shrl	$31, %ecx
+	leal	(%esi,%ecx), %ebx
+	andl	$1, %ebx
+	cmpl	%ecx, %ebx	# signed test for count % 2 != 0
+	movss	24(%esp), %xmm3	# %xmm3 = c1
+	jne	.L52	# odd count: handle one element first
+.L44:
+	testl	%esi, %esi
+	jle	.L49	# nothing (left) to do
+	leal	-1(%esi), %ebx
+	shrl	%ebx
+	testb	$15, %al
+	jne	.L46	# dst not 16-byte aligned: use the scalar loop
+	movaps	%xmm0, %xmm1
+	xorps	%xmm2, %xmm2
+	unpcklps	%xmm3, %xmm1	# low half = {c0, c1}
+	addl	$1, %ebx	# iterations = ((count - 1) >> 1) + 1
+	xorl	%ecx, %ecx
+	movaps	%xmm1, %xmm3
+	movlhps	%xmm1, %xmm3	# %xmm3 = {c0, c1, c0, c1}
+	.p2align 4,,7
+	.p2align 3
+.L47:
+	movaps	%xmm2, %xmm1	# vector loop: two elements (16 bytes) per iteration
+	addl	$1, %ecx
+	movlps	(%edx), %xmm1	# source is loaded in 8-byte halves, so it may be unaligned
+	movhps	8(%edx), %xmm1
+	movaps	%xmm2, %xmm0
+	movlps	(%eax), %xmm0
+	movhps	8(%eax), %xmm0
+	addl	$16, %edx
+	mulps	%xmm3, %xmm1
+	addps	%xmm1, %xmm0
+	movaps	%xmm0, (%eax)	# aligned store, guarded by the testb $15 check above
+	addl	$16, %eax
+	cmpl	%ebx, %ecx
+	jb	.L47
+.L49:
+	popl	%ebx
+	popl	%esi
+	ret
+	.p2align 4,,7
+	.p2align 3
+.L46:
+	xorl	%ecx, %ecx	# %ecx = element index
+	.p2align 4,,7
+	.p2align 3
+.L48:
+	movaps	%xmm0, %xmm1	# scalar loop: two elements per iteration
+	mulss	(%edx,%ecx,8), %xmm1
+	addss	(%eax,%ecx,8), %xmm1
+	movss	%xmm1, (%eax,%ecx,8)
+	movaps	%xmm3, %xmm1
+	mulss	4(%edx,%ecx,8), %xmm1
+	addss	4(%eax,%ecx,8), %xmm1
+	movss	%xmm1, 4(%eax,%ecx,8)
+	movaps	%xmm0, %xmm1
+	mulss	8(%edx,%ecx,8), %xmm1
+	addss	8(%eax,%ecx,8), %xmm1
+	movss	%xmm1, 8(%eax,%ecx,8)
+	movaps	%xmm3, %xmm1
+	mulss	12(%edx,%ecx,8), %xmm1
+	addss	12(%eax,%ecx,8), %xmm1
+	movss	%xmm1, 12(%eax,%ecx,8)
+	addl	$2, %ecx
+	cmpl	%ecx, %esi
+	jg	.L48
+	popl	%ebx
+	popl	%esi
+	ret
+.L52:
+	movaps	%xmm0, %xmm1	# odd count: mix the first element, then continue with an even count
+	subl	$1, %esi
+	movss	(%eax), %xmm2
+	mulss	(%edx), %xmm1
+	addss	%xmm2, %xmm1
+	movss	4(%eax), %xmm2
+	movss	%xmm1, (%eax)
+	movaps	%xmm3, %xmm1
+	mulss	4(%edx), %xmm1
+	addl	$8, %edx
+	addss	%xmm2, %xmm1
+	movss	%xmm1, 4(%eax)
+	addl	$8, %eax
+	jmp	.L44
+	.size	unalignedBufMixLRCoeffSSE, .-unalignedBufMixLRCoeffSSE
+	.ident	"GCC: (GNU) 4.4.0 20081110 (experimental)"
+	.section	.note.GNU-stack,"",@progbits
--- a/src/core/basic_ops_x86_sse2.s
+++ b/src/core/basic_ops_x86_sse2.s
@@ -0,0 +1,349 @@
+	.file	"basic_ops_x86.c"
+	.text
+	.p2align 4,,15
+.globl alignedMemCpySSE2
+	.type	alignedMemCpySSE2, @function
+alignedMemCpySSE2:	# copies (size >> 6) blocks of 64 bytes with movdqa; stack args: dst, src, size
+	pushl	%esi
+	pushl	%ebx
+	movl	20(%esp), %esi	# %esi = size in bytes
+	movl	12(%esp), %edx	# %edx = destination pointer
+	movl	16(%esp), %ecx	# %ecx = source pointer
+	shrl	$6, %esi	# whole 64-byte blocks; a remainder of size % 64 bytes is not copied
+	testl	%esi, %esi
+	je	.L4
+	xorl	%eax, %eax	# %eax = byte offset
+	xorl	%ebx, %ebx	# %ebx = blocks copied so far
+	.p2align 4,,7
+	.p2align 3
+.L3:
+	addl	$1, %ebx
+	movdqa	(%ecx,%eax), %xmm0	# movdqa needs 16-byte aligned addresses, so both buffers must be aligned
+	movdqa	%xmm0, (%edx,%eax)
+	movdqa	16(%ecx,%eax), %xmm0
+	movdqa	%xmm0, 16(%edx,%eax)
+	movdqa	32(%ecx,%eax), %xmm0
+	movdqa	%xmm0, 32(%edx,%eax)
+	movdqa	48(%ecx,%eax), %xmm0
+	movdqa	%xmm0, 48(%edx,%eax)
+	addl	$64, %eax
+	cmpl	%ebx, %esi
+	jne	.L3
+.L4:
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	alignedMemCpySSE2, .-alignedMemCpySSE2
+	.p2align 4,,15
+.globl alignedMemClearSSE2
+	.type	alignedMemClearSSE2, @function
+alignedMemClearSSE2:	# zeroes (size >> 6) blocks of 64 bytes with movdqa; stack args: dst, size
+	movl	8(%esp), %ecx	# %ecx = size in bytes
+	shrl	$6, %ecx	# whole 64-byte blocks; a remainder of size % 64 bytes is left untouched
+	testl	%ecx, %ecx
+	je	.L10
+	movl	4(%esp), %eax	# %eax = destination pointer
+	xorl	%edx, %edx	# %edx = blocks cleared so far
+	pxor	%xmm0, %xmm0	# %xmm0 = 16 zero bytes
+	.p2align 4,,7
+	.p2align 3
+.L9:
+	addl	$1, %edx
+	movdqa	%xmm0, (%eax)	# aligned stores: the buffer must be 16-byte aligned
+	movdqa	%xmm0, 16(%eax)
+	movdqa	%xmm0, 32(%eax)
+	movdqa	%xmm0, 48(%eax)
+	addl	$64, %eax
+	cmpl	%edx, %ecx
+	jne	.L9
+.L10:
+	rep
+	ret
+	.size	alignedMemClearSSE2, .-alignedMemClearSSE2
+	.p2align 4,,15
+.globl alignedConvertToS16SSE2
+	.type	alignedConvertToS16SSE2, @function
+alignedConvertToS16SSE2:	# float frames -> clamped 16-bit frames; stack args: src, dst, count (16-bit), gain, byte-swap flag; returns count * 4
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	subl	$8, %esp
+	movl	36(%esp), %eax	# %eax = frame count
+	movss	.LC0, %xmm4
+	cmpb	$0, 44(%esp)	# byte-swap flag; tested by the jne below
+	movl	28(%esp), %edx	# %edx = input pointer (8 bytes per frame)
+	movl	32(%esp), %ebx	# %ebx = output pointer (4 bytes per frame)
+	movl	%eax, %esi	# %si keeps the frame count for loop bounds and the return value
+	mulss	40(%esp), %xmm4	# scale factor = .LC0 * gain
+	jne	.L13	# byte-swap requested: use the swapping variants
+	testw	%ax, %ax
+	jle	.L15	# no frames: just return
+	movl	%eax, %edi
+	shrw	$2, %di	# %di = vector iterations (4 frames each)
+	cmpw	$3, %ax
+	movw	%ax, 2(%esp)	# spill the frame count
+	leal	0(,%edi,4), %ebp	# %ebp = frames the vector loop covers
+	ja	.L33	# more than 3 frames: take the vector path
+.L28:
+	xorl	%ebp, %ebp	# scalar path starts at frame 0
+	.p2align 4,,7
+	.p2align 3
+.L23:
+	movswl	%bp,%eax	# scalar loop setup: continue at frame %bp
+	movl	$-32768, %edi	# lower clamp value
+	leal	(%edx,%eax,8), %edx	# %edx = input cursor
+	leal	(%ebx,%eax,4), %eax	# %eax = output cursor
+	movl	$32767, %ebx	# upper clamp value
+	.p2align 4,,7
+	.p2align 3
+.L25:
+	movaps	%xmm4, %xmm0	# scalar loop: one frame (two samples) per iteration
+	mulss	(%edx), %xmm0
+	cvttss2si	%xmm0, %ecx	# truncate the scaled sample to an integer
+	movaps	%xmm4, %xmm0
+	mulss	4(%edx), %xmm0
+	cmpl	$-32768, %ecx
+	cmovl	%edi, %ecx	# clamp to >= -32768
+	cmpl	$32767, %ecx
+	cmovg	%ebx, %ecx	# clamp to <= 32767
+	movw	%cx, (%eax)
+	cvttss2si	%xmm0, %ecx
+	cmpl	$-32768, %ecx
+	cmovl	%edi, %ecx
+	cmpl	$32767, %ecx
+	cmovg	%ebx, %ecx
+	addl	$1, %ebp
+	movw	%cx, 2(%eax)
+	addl	$8, %edx
+	addl	$4, %eax
+	cmpw	%bp, %si
+	jg	.L25
+.L15:
+	movswl	%si,%esi	# common exit: sign-extend the 16-bit frame count
+	addl	$8, %esp
+	leal	0(,%esi,4), %eax	# return value = frame count * 4
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	ret
+	.p2align 4,,7
+	.p2align 3
+.L13:
+	testw	%ax, %ax	# byte-swapping variant, same structure as above
+	jle	.L15
+	movl	%eax, %ebp
+	shrw	$2, %bp	# %bp = vector iterations
+	cmpw	$3, %si
+	movw	%ax, 2(%esp)
+	leal	0(,%ebp,4), %eax	# %eax = frames the vector loop covers
+	ja	.L34
+.L27:
+	xorl	%eax, %eax
+	.p2align 4,,7
+	.p2align 3
+.L18:
+	movswl	%ax,%edi
+	leal	(%edx,%edi,8), %ecx	# %ecx = input cursor
+	leal	(%ebx,%edi,4), %edx	# %edx = output cursor
+	movl	$-32768, %edi
+	.p2align 4,,7
+	.p2align 3
+.L20:
+	movaps	%xmm4, %xmm0	# scalar loop with byte swap: one frame per iteration
+	movl	$32767, %ebp
+	mulss	(%ecx), %xmm0
+	cvttss2si	%xmm0, %ebx
+	movaps	%xmm4, %xmm0
+	mulss	4(%ecx), %xmm0
+	cmpl	$-32768, %ebx
+	cmovl	%edi, %ebx
+	cmpl	$32767, %ebx
+	cmovg	%ebp, %ebx
+	movzbl	%bh, %ebp	# %ebp = high byte of the 16-bit sample
+	sall	$8, %ebx	# low byte moves into the high position
+	orl	%ebp, %ebx	# low 16 bits of %ebx now hold the byte-swapped sample
+	movl	$32767, %ebp	# reload the upper clamp (%ebp was used as scratch)
+	movw	%bx, (%edx)
+	cvttss2si	%xmm0, %ebx
+	cmpl	$-32768, %ebx
+	cmovl	%edi, %ebx
+	cmpl	$32767, %ebx
+	cmovg	%ebp, %ebx
+	addl	$1, %eax
+	movzbl	%bh, %ebp
+	addl	$8, %ecx
+	sall	$8, %ebx
+	orl	%ebp, %ebx
+	movw	%bx, 2(%edx)
+	addl	$4, %edx
+	cmpw	%ax, %si
+	jg	.L20
+	jmp	.L15
+	.p2align 4,,7
+	.p2align 3
+.L34:
+	testw	%ax, %ax	# vector path with byte swap
+	je	.L27
+	movaps	%xmm4, %xmm0
+	xorl	%ecx, %ecx	# %ecx = output byte offset (input offset is twice that)
+	movdqa	.LC1, %xmm1	# lower clamp in all lanes
+	movss	%xmm4, 4(%esp)	# spill the scalar scale factor; %xmm4 is reused in the loop
+	shufps	$0, %xmm0, %xmm0	# broadcast the scale factor
+	xorl	%edi, %edi
+	movaps	%xmm0, %xmm7
+	movdqa	.LC2, %xmm0	# upper clamp in all lanes
+	.p2align 4,,7
+	.p2align 3
+.L19:
+	movaps	%xmm7, %xmm3	# vector loop: 4 frames (32 bytes in, 16 bytes out) per iteration
+	movdqa	%xmm0, %xmm5
+	movdqa	%xmm0, %xmm6
+	movaps	%xmm7, %xmm2
+	addl	$1, %edi
+	mulps	(%edx,%ecx,2), %xmm3	# mulps memory operands must be 16-byte aligned
+	mulps	16(%edx,%ecx,2), %xmm2
+	cvttps2dq	%xmm3, %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpgtd	%xmm1, %xmm4	# mask of lanes above the lower clamp
+	pand	%xmm4, %xmm3
+	pandn	%xmm1, %xmm4
+	por	%xmm4, %xmm3	# select: max(value, lower clamp)
+	cvttps2dq	%xmm2, %xmm2
+	movdqa	%xmm3, %xmm4
+	pcmpgtd	%xmm0, %xmm4	# mask of lanes above the upper clamp
+	pand	%xmm4, %xmm5
+	pandn	%xmm3, %xmm4
+	movdqa	%xmm4, %xmm3
+	movdqa	%xmm2, %xmm4
+	por	%xmm5, %xmm3	# select: min(value, upper clamp)
+	pcmpgtd	%xmm1, %xmm4
+	movdqa	.LC3, %xmm5	# 0xff00 mask for the byte swap
+	pand	%xmm4, %xmm2
+	pand	%xmm3, %xmm5
+	pandn	%xmm1, %xmm4
+	psrad	$8, %xmm5	# high byte shifted down ...
+	por	%xmm4, %xmm2
+	pslld	$8, %xmm3	# ... and low byte shifted up (byte swap halves)
+	movdqa	%xmm2, %xmm4
+	pcmpgtd	%xmm0, %xmm4
+	pand	%xmm4, %xmm6
+	pandn	%xmm2, %xmm4
+	movdqa	%xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movdqa	.LC3, %xmm6
+	pand	%xmm2, %xmm6
+	pslld	$8, %xmm2
+	psrad	$8, %xmm6
+	movdqa	%xmm5, %xmm4
+	punpcklwd	%xmm6, %xmm5	# interleave sequences pack the low 16 bits of each 32-bit lane
+	punpckhwd	%xmm6, %xmm4
+	movdqa	%xmm5, %xmm6
+	punpcklwd	%xmm4, %xmm5
+	punpckhwd	%xmm4, %xmm6
+	movdqa	%xmm3, %xmm4
+	punpcklwd	%xmm6, %xmm5
+	punpckhwd	%xmm2, %xmm4
+	punpcklwd	%xmm2, %xmm3
+	movdqa	%xmm3, %xmm6
+	punpcklwd	%xmm4, %xmm3
+	punpckhwd	%xmm4, %xmm6
+	punpcklwd	%xmm6, %xmm3
+	por	%xmm3, %xmm5	# combine the two shifted halves into swapped 16-bit samples
+	movdqa	%xmm5, (%ebx,%ecx)	# aligned store: output must be 16-byte aligned
+	addl	$16, %ecx
+	cmpw	%di, %bp
+	ja	.L19
+	cmpw	2(%esp), %ax
+	movss	4(%esp), %xmm4	# restore the scalar scale factor for the tail loop
+	jne	.L18	# leftover frames (count % 4) go through the scalar loop
+	jmp	.L15
+	.p2align 4,,7
+	.p2align 3
+.L33:
+	testw	%bp, %bp	# vector path without byte swap
+	.p2align 4,,3
+	.p2align 3
+	je	.L28
+	movaps	%xmm4, %xmm0
+	xorl	%eax, %eax	# %eax = output byte offset (input offset is twice that)
+	movdqa	.LC1, %xmm1	# lower clamp in all lanes
+	shufps	$0, %xmm0, %xmm0	# broadcast the scale factor
+	xorl	%ecx, %ecx
+	movaps	%xmm0, %xmm6
+	movdqa	.LC2, %xmm0	# upper clamp in all lanes
+	.p2align 4,,7
+	.p2align 3
+.L24:
+	movaps	%xmm6, %xmm3	# vector loop: 4 frames per iteration
+	addl	$1, %ecx
+	movdqa	%xmm0, %xmm7
+	movaps	%xmm6, %xmm2
+	mulps	(%edx,%eax,2), %xmm3
+	mulps	16(%edx,%eax,2), %xmm2
+	cvttps2dq	%xmm3, %xmm3
+	movdqa	%xmm3, %xmm5
+	pcmpgtd	%xmm1, %xmm5
+	pand	%xmm5, %xmm3
+	pandn	%xmm1, %xmm5
+	por	%xmm5, %xmm3
+	cvttps2dq	%xmm2, %xmm2
+	movdqa	%xmm3, %xmm5
+	pcmpgtd	%xmm0, %xmm5
+	pand	%xmm5, %xmm7
+	pandn	%xmm3, %xmm5
+	movdqa	%xmm5, %xmm3
+	movdqa	%xmm2, %xmm5
+	por	%xmm7, %xmm3
+	pcmpgtd	%xmm1, %xmm5
+	movdqa	%xmm0, %xmm7
+	pand	%xmm5, %xmm2
+	pandn	%xmm1, %xmm5
+	por	%xmm5, %xmm2
+	movdqa	%xmm2, %xmm5
+	pcmpgtd	%xmm0, %xmm5
+	pand	%xmm5, %xmm7
+	pandn	%xmm2, %xmm5
+	movdqa	%xmm5, %xmm2
+	movdqa	%xmm3, %xmm5
+	por	%xmm7, %xmm2
+	punpckhwd	%xmm2, %xmm5	# pack the low 16 bits of the eight 32-bit results
+	punpcklwd	%xmm2, %xmm3
+	movdqa	%xmm3, %xmm7
+	punpcklwd	%xmm5, %xmm3
+	punpckhwd	%xmm5, %xmm7
+	punpcklwd	%xmm7, %xmm3
+	movdqa	%xmm3, (%ebx,%eax)
+	addl	$16, %eax
+	cmpw	%cx, %di
+	ja	.L24
+	cmpw	%bp, 2(%esp)
+	jne	.L23	# leftover frames (count % 4) go through the scalar loop
+	jmp	.L15
+	.size	alignedConvertToS16SSE2, .-alignedConvertToS16SSE2
+	.section	.rodata.cst4,"aM",@progbits,4
+	.align 4
+.LC0:
+	.long	1191181824	# 0x46fffe00 = 32767.0f, multiplied into the gain by alignedConvertToS16SSE2
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 16
+.LC1:
+	.long	-32768	# lower clamp for 16-bit output, replicated in all four lanes
+	.long	-32768
+	.long	-32768
+	.long	-32768
+	.align 16
+.LC2:
+	.long	32767	# upper clamp for 16-bit output, replicated in all four lanes
+	.long	32767
+	.long	32767
+	.long	32767
+	.align 16
+.LC3:
+	.long	65280	# 0xff00: selects the high byte of a 16-bit sample in the byte-swap path
+	.long	65280
+	.long	65280
+	.long	65280
+	.ident	"GCC: (GNU) 4.4.0 20081110 (experimental)"
+	.section	.note.GNU-stack,"",@progbits
--- a/src/core/fx_mixer.cpp
+++ b/src/core/fx_mixer.cpp
@@ -28,6 +28,7 @@
 #include <QtXml/QDomElement>

 #include "fx_mixer.h"
+#include "basic_ops.h"
 #include "effect.h"
 #include "song.h"

@@ -38,7 +39,7 @@ fxChannel::fxChannel( model * _parent ) :
 	m_stillRunning( false ),
 	m_peakLeft( 0.0f ),
 	m_peakRight( 0.0f ),
-	m_buffer( new sampleFrame[engine::getMixer()->framesPerPeriod()] ),
+	m_buffer( alignedAllocFrames( engine::getMixer()->framesPerPeriod() ) ),	// one period of frames; released with alignedFreeFrames() in the destructor
 	m_muteModel( false, _parent ),
 	m_volumeModel( 1.0, 0.0, 2.0, 0.01, _parent ),
 	m_name(),
@@ -53,7 +54,7 @@ fxChannel::fxChannel( model * _parent ) :

 fxChannel::~fxChannel()
 {
-	delete[] m_buffer;
+	alignedFreeFrames( m_buffer );	// m_buffer comes from alignedAllocFrames() in the constructor, so delete[] no longer applies
 }


@@ -92,13 +93,7 @@ void fxMixer::mixToChannel( const sampleFrame * _buf, fx_ch_t _ch )
 	if( m_fxChannels[_ch]->m_muteModel.value() == false )
 	{
 		m_fxChannels[_ch]->m_lock.lock();
-		sampleFrame * buf = m_fxChannels[_ch]->m_buffer;
-		for( f_cnt_t f = 0; f < engine::getMixer()->framesPerPeriod();
-									++f )
-		{
-			buf[f][0] += _buf[f][0];
-			buf[f][1] += _buf[f][1];
-		}
+		alignedBufMix( m_fxChannels[_ch]->m_buffer, _buf, engine::getMixer()->framesPerPeriod() );	// NOTE(review): the SSE back-end uses aligned loads on both buffers and works in groups of 8 frames - confirm every caller passes a 16-byte aligned _buf
 		m_fxChannels[_ch]->m_used = true;
 		m_fxChannels[_ch]->m_lock.unlock();
 	}
--- a/src/core/main.cpp
+++ b/src/core/main.cpp
@@ -57,6 +57,7 @@
 #include "main_window.h"
 #include "project_renderer.h"
 #include "song.h"
+#include "basic_ops.h"

 #warning TODO: move somewhere else
 static inline QString baseName( const QString & _file )
@@ -78,12 +79,29 @@ inline void loadTranslation( const QString & _tname,
 }


+Uint32 convertToS16( const sampleFrameA * RP _ab,	// forward declaration; called below from the disabled (#if 0) benchmark in main()
+						const fpp_t _frames,
+						const float _master_gain,
+						intSampleFrameA * RP _output_buffer,
+						const bool _convert_endian );

 int main( int argc, char * * argv )
 {
 	// intialize RNG
 	srand( getpid() + time( 0 ) );

+	// init CPU specific optimized basic ops
+	initBasicOps();	// called before any of the buffer helpers declared in basic_ops.h are used
+
+#if 0	// disabled micro-benchmark: converts a 256-frame buffer one million times, then exits
+	sampleFrameA * buf = (sampleFrameA *) alignedMalloc( sizeof( sampleFrameA ) * 256 );
+	intSampleFrameA * obuf = (intSampleFrameA*)alignedMalloc( sizeof( intSampleFrameA ) * 256 );
+	for( int i = 0; i< 1000000; ++i )
+	{
+		convertToS16( buf, 256, 0.7, obuf, false );
+	}
+return 0;	// benchmark only: skip normal program start-up
+#endif
 	bool core_only = FALSE;

 	for( int i = 1; i < argc; ++i )
--- a/src/core/mixer.cpp
+++ b/src/core/mixer.cpp
@@ -41,6 +41,7 @@
 #include "sample_play_handle.h"
 #include "piano_roll.h"
 #include "micro_timer.h"
+#include "basic_ops.h"

 #include "audio_device.h"
 #include "midi_client.h"
@@ -61,40 +62,15 @@
 #include "midi_winmm.h"
 #include "midi_dummy.h"

+#ifdef LMMS_HAVE_PTHREAD_H
+#include <pthread.h>
+#endif
+

 static QVector<fx_ch_t> __fx_channel_jobs( NumFxChannels );



-static void aligned_free( void * _buf )
-{
-	if( _buf != NULL )
-	{
-		int *ptr2=(int *)_buf - 1;
-		_buf = (char *)_buf- *ptr2;
-		free(_buf);
-	}
-}
-
-static void * aligned_malloc( int _bytes )
-{
-	char *ptr,*ptr2,*aligned_ptr;
-	int align_mask = ALIGN_SIZE- 1;
-	ptr=(char *)malloc(_bytes +ALIGN_SIZE+ sizeof(int));
-	if(ptr==NULL) return(NULL);
-
-	ptr2 = ptr + sizeof(int);
-	aligned_ptr = ptr2 + (ALIGN_SIZE- ((size_t)ptr2 & align_mask));
-
-
-	ptr2 = aligned_ptr - sizeof(int);
-	*((int *)ptr2)=(int)(aligned_ptr - ptr);
-
-	return(aligned_ptr);
-}
-
-
-
 class mixerWorkerThread : public QThread
 {
 public:
@@ -152,9 +128,7 @@ public:

 	mixerWorkerThread( int _worker_num, mixer * _mixer ) :
 		QThread( _mixer ),
-		m_workingBuf( (sampleFrame *) aligned_malloc(
-					_mixer->framesPerPeriod() *
-						sizeof( sampleFrame ) ) ),
+		m_workingBuf( alignedAllocFrames( _mixer->framesPerPeriod() ) ),	// per-worker buffer of one period; released with alignedFreeFrames() in the destructor
 		m_workerNum( _worker_num ),
 		m_quit( false ),
 		m_mixer( _mixer ),
@@ -165,7 +139,7 @@ public:

 	virtual ~mixerWorkerThread()
 	{
-		aligned_free( m_workingBuf );
+		alignedFreeFrames( m_workingBuf );	// counterpart of alignedAllocFrames() in the constructor
 	}

 	virtual void quit( void )
@@ -234,11 +208,11 @@ private:
 	{
 #if 0
 #ifdef LMMS_BUILD_LINUX
-#ifdef LMMS_HAVE_SCHED_H
+#ifdef LMMS_HAVE_PTHREAD_H	// currently inactive: the surrounding block is wrapped in #if 0
 		cpu_set_t mask;
 		CPU_ZERO( &mask );
 		CPU_SET( m_workerNum, &mask );
-		sched_setaffinity( 0, sizeof( mask ), &mask );
+		pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );	// pin the calling worker thread to the CPU whose index is m_workerNum
 #endif
 #endif
 #endif
@@ -310,7 +284,8 @@ mixer::mixer( void ) :
 	{
 		m_inputBufferFrames[i] = 0;
 		m_inputBufferSize[i] = DEFAULT_BUFFER_SIZE * 100;
-		m_inputBuffer[i] = new sampleFrame[ DEFAULT_BUFFER_SIZE * 100 ];
+		m_inputBuffer[i] = alignedAllocFrames( 
+						DEFAULT_BUFFER_SIZE * 100 );	// same frame count as m_inputBufferSize[i] set just above
 		clearAudioBuffer( m_inputBuffer[i], m_inputBufferSize[i] );
 	}

@@ -351,14 +326,10 @@ mixer::mixer( void ) :
 		m_fifo = new fifo( 1 );
 	}

-	m_workingBuf = (sampleFrame*) aligned_malloc( m_framesPerPeriod *
-							sizeof( sampleFrame ) );
+	m_workingBuf = alignedAllocFrames( m_framesPerPeriod );	// released with alignedFreeFrames() in ~mixer()
 	for( Uint8 i = 0; i < 3; i++ )
 	{
-		m_readBuf = (surroundSampleFrame*)
-			aligned_malloc( m_framesPerPeriod *
-						sizeof( surroundSampleFrame ) );
-
+		m_readBuf = alignedAllocFrames( m_framesPerPeriod );	// one of three pooled buffers; NOTE(review): was sized by sizeof( surroundSampleFrame ) before - confirm the frame types have the same size
 		clearAudioBuffer( m_readBuf, m_framesPerPeriod );
 		m_bufferPool.push_back( m_readBuf );
 	}
@@ -409,10 +380,10 @@ mixer::~mixer()

 	for( Uint8 i = 0; i < 3; i++ )
 	{
-		aligned_free( m_bufferPool[i] );
+		alignedFreeFrames( m_bufferPool[i] );
 	}

-	aligned_free( m_workingBuf );
+	alignedFreeFrames( m_workingBuf );
 }


@@ -524,9 +495,9 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )
 	if( frames + _frames > size )
 	{
 		size = qMax( size * 2, frames + _frames );
-		sampleFrame * ab = new sampleFrame[ size ];
-		memcpy( ab, buf, frames * sizeof( sampleFrame ) );
-		delete [] buf;
+		sampleFrame * ab = alignedAllocFrames( size );
+		alignedMemCpy( ab, buf, frames * sizeof( sampleFrame ) );
+		alignedFreeFrames( buf );

 		m_inputBufferSize[ m_inputBufferWrite ] = size;
 		m_inputBuffer[ m_inputBufferWrite ] = ab;
@@ -534,7 +505,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )
 		buf = ab;
 	}
 	
-	memcpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) );
+	alignedMemCpy( &buf[ frames ], _ab, _frames * sizeof( sampleFrame ) );
 	m_inputBufferFrames[ m_inputBufferWrite ] += _frames;
 	
 	unlockInputFrames();
@@ -543,7 +514,7 @@ void mixer::pushInputFrames( sampleFrame * _ab, const f_cnt_t _frames )



-const surroundSampleFrame * mixer::renderNextBuffer( void )
+sampleFrameA * mixer::renderNextBuffer( void )
 {
 	microTimer timer;
 	static song::playPos last_metro_pos = -1;
@@ -709,12 +680,9 @@ void mixer::bufferToPort( const sampleFrame * _buf,
 	const int loop1_frame = qMin<int>( end_frame, m_framesPerPeriod );

 	_port->lockFirstBuffer();
-	sampleFrame * obuf = _port->firstBuffer()+start_frame;
-	for( int frame = 0; frame < loop1_frame-start_frame; ++frame )
-	{
-		obuf[frame][0] += _buf[frame][0] * _vv.vol[0];
-		obuf[frame][1] += _buf[frame][1] * _vv.vol[1];
-	}
+	unalignedBufMixLRCoeff( _port->firstBuffer() + start_frame,
+					_buf, _vv.vol[0], _vv.vol[1],
+						loop1_frame - start_frame );
 	_port->unlockFirstBuffer();

 	_port->lockSecondBuffer();
@@ -723,14 +691,10 @@ void mixer::bufferToPort( const sampleFrame * _buf,
 		const int frames_done = m_framesPerPeriod - start_frame;
 		end_frame -= m_framesPerPeriod;
 		end_frame = qMin<int>( end_frame, m_framesPerPeriod );
-		sampleFrame * obuf = _port->secondBuffer();
-		for( fpp_t frame = 0; frame < end_frame; ++frame )
-		{
-			obuf[frame][0] += _buf[frames_done + frame][0] *
-								_vv.vol[0];
-			obuf[frame][1] += _buf[frames_done + frame][1] *
-								_vv.vol[1];
-		}
+		unalignedBufMixLRCoeff( _port->secondBuffer(),
+						_buf+frames_done,
+						_vv.vol[0], _vv.vol[1],
+						end_frame );
 		// we used both buffers so set flags
 		_port->m_bufferUsage = audioPort::BothBuffers;
 	}
@@ -748,7 +712,14 @@ void mixer::bufferToPort( const sampleFrame * _buf,
 void mixer::clearAudioBuffer( sampleFrame * _ab, const f_cnt_t _frames,
 							const f_cnt_t _offset )
 {
-	memset( _ab+_offset, 0, sizeof( *_ab ) * _frames );
+	if( likely( (size_t)( _ab+_offset ) % 16 == 0 && _frames % 8 == 0 ) )
+	{	// address tested as size_t: a pointer does not fit in int on x86_64
+		alignedMemClear( _ab+_offset, sizeof( *_ab ) * _frames );
+	}
+	else
+	{	// start not 16-byte aligned or frame count not a multiple of 8
+		memset( _ab+_offset, 0, sizeof( *_ab ) * _frames );
+	}
 }


@@ -1166,11 +1137,11 @@ void mixer::fifoWriter::run( void )
 {
 #if 0
 #ifdef LMMS_BUILD_LINUX
-#ifdef LMMS_HAVE_SCHED_H
+#ifdef LMMS_HAVE_PTHREAD_H
 	cpu_set_t mask;
 	CPU_ZERO( &mask );
 	CPU_SET( 0, &mask );
-	sched_setaffinity( 0, sizeof( mask ), &mask );
+	pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );
 #endif
 #endif
 #endif
@@ -1178,9 +1149,9 @@ void mixer::fifoWriter::run( void )
 	const fpp_t frames = m_mixer->framesPerPeriod();
 	while( m_writing )
 	{
-		surroundSampleFrame * buffer = new surroundSampleFrame[frames];
-		const surroundSampleFrame * b = m_mixer->renderNextBuffer();
-		memcpy( buffer, b, frames * sizeof( surroundSampleFrame ) );
+		sampleFrameA * buffer = alignedAllocFrames( frames );
+		const sampleFrameA * b = m_mixer->renderNextBuffer();
+		alignedMemCpy( buffer, b, frames * sizeof( sampleFrameA ) );
 		m_fifo->write( buffer );
 	}

--- a/src/core/project_renderer.cpp
+++ b/src/core/project_renderer.cpp
@@ -32,11 +32,12 @@
 #include "audio_file_wave.h"
 #include "audio_file_ogg.h"

-#ifdef LMMS_HAVE_SCHED_H
-#include <sched.h>
+#ifdef LMMS_HAVE_PTHREAD_H
+#include <pthread.h>
 #endif


+
 fileEncodeDevice __fileEncodeDevices[] =
 {

@@ -148,11 +149,11 @@ void projectRenderer::run( void )
 {
 #if 0
 #ifdef LMMS_BUILD_LINUX
-#ifdef LMMS_HAVE_SCHED_H
+#ifdef LMMS_HAVE_PTHREAD_H
 	cpu_set_t mask;
 	CPU_ZERO( &mask );
 	CPU_SET( 0, &mask );
-	sched_setaffinity( 0, sizeof( mask ), &mask );
+	pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask );
 #endif
 #endif
 #endif