From 019d2089bbadf70d73ba85aa8ea51490b071262c Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Thu, 17 Aug 2017 11:09:20 +0100
Subject: [PATCH] Stripped down version of SoundTouch 2.0.0

---
 .gitignore                  |   1 +
 CMakeLists.txt              |  26 ++-
 include/BPMDetect.h         | 158 ++++++++++++++
 include/FIFOSampleBuffer.h  |   2 +-
 include/FIFOSamplePipe.h    |   2 +-
 include/STTypes.h           |  18 +-
 include/SoundTouch.h        |  86 ++++++--
 include/soundtouch_config.h |   5 +
 src/AAFilter.cpp            |   6 +-
 src/AAFilter.h              |   2 +-
 src/BPMDetect.cpp           | 318 ++++++++++++++++++++++++++++
 src/FIFOSampleBuffer.cpp    |   2 +-
 src/FIRFilter.cpp           |  12 +-
 src/FIRFilter.h             |   2 +-
 src/InterpolateCubic.cpp    | 200 ++++++++++++++++++
 src/InterpolateCubic.h      |  67 ++++++
 src/InterpolateLinear.cpp   |   0
 src/InterpolateLinear.h     |   0
 src/InterpolateShannon.cpp  | 185 +++++++++++++++++
 src/InterpolateShannon.h    |  72 +++++++
 src/PeakFinder.cpp          | 286 ++++++++++++++++++++++++++
 src/PeakFinder.h            |  97 +++++++++
 src/RateTransposer.cpp      |  13 +-
 src/RateTransposer.h        |   7 +-
 src/SoundTouch.cpp          | 116 ++++++++---
 src/TDStretch.cpp           | 104 +++++++---
 src/TDStretch.h             |  12 +-
 src/cpu_detect.h            |   2 +-
 src/cpu_detect_x86.cpp      |   2 +-
 src/mmx_optimized.cpp       | 400 ++++++++++++++++++++++++++++++++++++
 src/sse_optimized.cpp       | 372 +++++++++++++++++++++++++++++++++
 31 files changed, 2456 insertions(+), 119 deletions(-)
 create mode 100644 .gitignore
 create mode 100755 include/BPMDetect.h
 mode change 100644 => 100755 include/FIFOSampleBuffer.h
 mode change 100644 => 100755 include/FIFOSamplePipe.h
 mode change 100644 => 100755 include/STTypes.h
 mode change 100644 => 100755 include/SoundTouch.h
 create mode 100644 include/soundtouch_config.h
 mode change 100644 => 100755 src/AAFilter.cpp
 mode change 100644 => 100755 src/AAFilter.h
 create mode 100755 src/BPMDetect.cpp
 mode change 100644 => 100755 src/FIFOSampleBuffer.cpp
 mode change 100644 => 100755 src/FIRFilter.cpp
 mode change 100644 => 100755 src/FIRFilter.h
 create mode 100755 src/InterpolateCubic.cpp
 create mode 100755 src/InterpolateCubic.h
 mode change 100644 => 100755 src/InterpolateLinear.cpp
 mode change 100644 => 100755 src/InterpolateLinear.h
 create mode 100755 src/InterpolateShannon.cpp
 create mode 100755 src/InterpolateShannon.h
 create mode 100755 src/PeakFinder.cpp
 create mode 100755 src/PeakFinder.h
 mode change 100644 => 100755 src/RateTransposer.cpp
 mode change 100644 => 100755 src/RateTransposer.h
 mode change 100644 => 100755 src/SoundTouch.cpp
 mode change 100644 => 100755 src/TDStretch.cpp
 mode change 100644 => 100755 src/TDStretch.h
 mode change 100644 => 100755 src/cpu_detect.h
 mode change 100644 => 100755 src/cpu_detect_x86.cpp
 create mode 100755 src/mmx_optimized.cpp
 create mode 100755 src/sse_optimized.cpp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..496ee2c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30cc4e9..17379f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,14 +1,18 @@
 set(SRCS
-    src/AAFilter.cpp
-    src/cpu_detect_x86.cpp
-    src/FIFOSampleBuffer.cpp
-    src/FIRFilter.cpp
-    src/InterpolateLinear.cpp
-    src/RateTransposer.cpp
-    src/SoundTouch.cpp
-    src/TDStretch.cpp
-    )
-
-include_directories(src include)
+	src/AAFilter.cpp
+	src/BPMDetect.cpp
+	src/cpu_detect_x86.cpp
+	src/FIFOSampleBuffer.cpp
+	src/FIRFilter.cpp
+	src/InterpolateCubic.cpp
+	src/InterpolateLinear.cpp
+	src/InterpolateShannon.cpp
+	src/mmx_optimized.cpp
+	src/PeakFinder.cpp
+	src/RateTransposer.cpp
+	src/SoundTouch.cpp
+	src/sse_optimized.cpp
+	src/TDStretch.cpp)
 
 add_library(SoundTouch STATIC ${SRCS})
+target_include_directories(SoundTouch PUBLIC include PRIVATE src)
diff --git a/include/BPMDetect.h b/include/BPMDetect.h
new file mode 100755
index 0000000..a452ed1
--- /dev/null
+++ b/include/BPMDetect.h
@@ -0,0 +1,158 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Beats-per-minute (BPM) detection routine.
+///
+/// The beat detection algorithm works as follows:
+/// - Use function 'inputSamples' to input a chunks of samples to the class for
+///   analysis. It's a good idea to enter a large sound file or stream in smallish
+///   chunks of around few kilosamples in order not to extinguish too much RAM memory.
+/// - Input sound data is decimated to approx 500 Hz to reduce calculation burden,
+///   which is basically ok as low (bass) frequencies mostly determine the beat rate.
+///   Simple averaging is used for anti-alias filtering because the resulting signal
+///   quality isn't of that high importance.
+/// - Decimated sound data is enveloped, i.e. the amplitude shape is detected by
+///   taking absolute value that's smoothed by sliding average. Signal levels that
+///   are below a couple of times the general RMS amplitude level are cut away to
+///   leave only notable peaks there.
+/// - Repeating sound patterns (e.g. beats) are detected by calculating short-term 
+///   autocorrelation function of the enveloped signal.
+/// - After whole sound data file has been analyzed as above, the bpm level is 
+///   detected by function 'getBpm' that finds the highest peak of the autocorrelation 
+///   function, calculates it's precise location and converts this reading to bpm's.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2016-01-12 19:24:46 +0200 (ti, 12 tammi 2016) $
+// File revision : $Revision: 4 $
+//
+// $Id: BPMDetect.h 239 2016-01-12 17:24:46Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _BPMDetect_H_
+#define _BPMDetect_H_
+
+#include "STTypes.h"
+#include "FIFOSampleBuffer.h"
+
+namespace soundtouch
+{
+
+/// Minimum allowed BPM rate. Used to restrict accepted result above a reasonable limit.
+#define MIN_BPM 29
+
+/// Maximum allowed BPM rate. Used to restrict accepted result below a reasonable limit.
+#define MAX_BPM 200
+
+
+/// Class for calculating BPM rate for audio data.
+class BPMDetect
+{
+protected:
+    /// Auto-correlation accumulator bins.
+    float *xcorr;
+    
+    /// Sample average counter.
+    int decimateCount;
+
+    /// Sample average accumulator for FIFO-like decimation.
+    soundtouch::LONG_SAMPLETYPE decimateSum;
+
+    /// Decimate sound by this coefficient to reach approx. 500 Hz.
+    int decimateBy;
+
+    /// Auto-correlation window length
+    int windowLen;
+
+    /// Number of channels (1 = mono, 2 = stereo)
+    int channels;
+
+    /// sample rate
+    int sampleRate;
+
+    /// Beginning of auto-correlation window: Autocorrelation isn't being updated for
+    /// the first these many correlation bins.
+    int windowStart;
+ 
+    /// FIFO-buffer for decimated processing samples.
+    soundtouch::FIFOSampleBuffer *buffer;
+
+    /// Updates auto-correlation function for given number of decimated samples that 
+    /// are read from the internal 'buffer' pipe (samples aren't removed from the pipe 
+    /// though).
+    void updateXCorr(int process_samples      /// How many samples are processed.
+                     );
+
+    /// Decimates samples to approx. 500 Hz.
+    ///
+    /// \return Number of output samples.
+    int decimate(soundtouch::SAMPLETYPE *dest,      ///< Destination buffer
+                 const soundtouch::SAMPLETYPE *src, ///< Source sample buffer
+                 int numsamples                     ///< Number of source samples.
+                 );
+
+    /// Calculates amplitude envelope for the buffer of samples.
+    /// Result is output to 'samples'.
+    void calcEnvelope(soundtouch::SAMPLETYPE *samples,  ///< Pointer to input/output data buffer
+                      int numsamples                    ///< Number of samples in buffer
+                      );
+
+    /// remove constant bias from xcorr data
+    void removeBias();
+
+public:
+    /// Constructor.
+    BPMDetect(int numChannels,  ///< Number of channels in sample data.
+              int sampleRate    ///< Sample rate in Hz.
+              );
+
+    /// Destructor.
+    virtual ~BPMDetect();
+
+    /// Inputs a block of samples for analyzing: Envelopes the samples and then
+    /// updates the autocorrelation estimation. When whole song data has been input
+    /// in smaller blocks using this function, read the resulting bpm with 'getBpm' 
+    /// function. 
+    /// 
+    /// Notice that data in 'samples' array can be disrupted in processing.
+    void inputSamples(const soundtouch::SAMPLETYPE *samples,    ///< Pointer to input/working data buffer
+                      int numSamples                            ///< Number of samples in buffer
+                      );
+
+
+    /// Analyzes the results and returns the BPM rate. Use this function to read result
+    /// after whole song data has been input to the class by consecutive calls of
+    /// 'inputSamples' function.
+    ///
+    /// \return Beats-per-minute rate, or zero if detection failed.
+    float getBpm();
+};
+
+}
+
+#endif // _BPMDetect_H_
diff --git a/include/FIFOSampleBuffer.h b/include/FIFOSampleBuffer.h
old mode 100644
new mode 100755
index 4b22a51..691ff1a
--- a/include/FIFOSampleBuffer.h
+++ b/include/FIFOSampleBuffer.h
@@ -15,7 +15,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-05 23:40:22 +0200 (Sun, 05 Jan 2014) $
+// Last changed  : $Date: 2014-01-05 23:40:22 +0200 (su, 05 tammi 2014) $
 // File revision : $Revision: 4 $
 //
 // $Id: FIFOSampleBuffer.h 177 2014-01-05 21:40:22Z oparviai $
diff --git a/include/FIFOSamplePipe.h b/include/FIFOSamplePipe.h
old mode 100644
new mode 100755
index f26c57b..56af60c
--- a/include/FIFOSamplePipe.h
+++ b/include/FIFOSamplePipe.h
@@ -17,7 +17,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2012-06-13 22:29:53 +0300 (Wed, 13 Jun 2012) $
+// Last changed  : $Date: 2012-06-13 22:29:53 +0300 (ke, 13 kesä 2012) $
 // File revision : $Revision: 4 $
 //
 // $Id: FIFOSamplePipe.h 143 2012-06-13 19:29:53Z oparviai $
diff --git a/include/STTypes.h b/include/STTypes.h
old mode 100644
new mode 100755
index b257974..337e11c
--- a/include/STTypes.h
+++ b/include/STTypes.h
@@ -8,10 +8,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-05-18 18:25:07 +0300 (Mon, 18 May 2015) $
+// Last changed  : $Date: 2017-07-30 12:28:06 +0300 (su, 30 heinä 2017) $
 // File revision : $Revision: 3 $
 //
-// $Id: STTypes.h 215 2015-05-18 15:25:07Z oparviai $
+// $Id: STTypes.h 252 2017-07-30 09:28:06Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -53,6 +53,8 @@ typedef unsigned long   ulong;
 // Helper macro for aligning pointer up to next 16-byte boundary
 #define SOUNDTOUCH_ALIGN_POINTER_16(x)      ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 )
 
+#include "soundtouch_config.h"
+
 namespace soundtouch
 {
     /// Activate these undef's to overrule the possible sampletype 
@@ -90,8 +92,8 @@ namespace soundtouch
         ///   However, if you still prefer to select the sample format here 
         ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
         ///   and FLOAT_SAMPLE defines first as in comments above.
-        #define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
-        //#define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
+        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
+        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
      
     #endif
 
@@ -102,7 +104,7 @@ namespace soundtouch
         /// routines compiled for whatever reason, you may disable these optimizations 
         /// to make the library compile.
 
-        //#define SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS     1
+        #define SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS     1
 
         /// In GNU environment, allow the user to override this setting by
         /// giving the following switch to the configure script:
@@ -135,8 +137,10 @@ namespace soundtouch
         #endif // SOUNDTOUCH_FLOAT_SAMPLES
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
-            // Allow MMX optimizations
-            #define SOUNDTOUCH_ALLOW_MMX   1
+            // Allow MMX optimizations (not available in X64 mode)
+            #if (!_M_X64)
+                #define SOUNDTOUCH_ALLOW_MMX   1
+            #endif
         #endif
 
     #else
diff --git a/include/SoundTouch.h b/include/SoundTouch.h
old mode 100644
new mode 100755
index b03855e..c97b393
--- a/include/SoundTouch.h
+++ b/include/SoundTouch.h
@@ -41,10 +41,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-09-20 10:38:32 +0300 (Sun, 20 Sep 2015) $
+// Last changed  : $Date: 2017-07-30 12:35:00 +0300 (su, 30 heinä 2017) $
 // File revision : $Revision: 4 $
 //
-// $Id: SoundTouch.h 230 2015-09-20 07:38:32Z oparviai $
+// $Id: SoundTouch.h 253 2017-07-30 09:35:00Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -79,10 +79,10 @@ namespace soundtouch
 {
 
 /// Soundtouch library version string
-#define SOUNDTOUCH_VERSION          "1.9.2"
+#define SOUNDTOUCH_VERSION          "2.0.0"
 
 /// SoundTouch library version id
-#define SOUNDTOUCH_VERSION_ID       (10902)
+#define SOUNDTOUCH_VERSION_ID       (20000)
 
 //
 // Available setting IDs for the 'setSetting' & 'get_setting' functions:
@@ -116,30 +116,61 @@ namespace soundtouch
 #define SETTING_OVERLAP_MS          5
 
 
-/// Call "getSetting" with this ID to query nominal average processing sequence
-/// size in samples. This value tells approcimate value how many input samples 
-/// SoundTouch needs to gather before it does DSP processing run for the sample batch.
+/// Call "getSetting" with this ID to query processing sequence size in samples. 
+/// This value gives approximate value of how many input samples you'll need to 
+/// feed into SoundTouch after initial buffering to get out a new batch of
+/// output samples. 
+///
+/// This value does not include initial buffering at beginning of a new processing 
+/// stream, use SETTING_INITIAL_LATENCY to get the initial buffering size.
 ///
 /// Notices: 
 /// - This is read-only parameter, i.e. setSetting ignores this parameter
-/// - Returned value is approximate average value, exact processing batch
-///   size may wary from time to time
-/// - This parameter value is not constant but may change depending on 
+/// - This parameter value is not constant but change depending on 
 ///   tempo/pitch/rate/samplerate settings.
-#define SETTING_NOMINAL_INPUT_SEQUENCE		6
+#define SETTING_NOMINAL_INPUT_SEQUENCE      6
 
 
 /// Call "getSetting" with this ID to query nominal average processing output 
 /// size in samples. This value tells approcimate value how many output samples 
 /// SoundTouch outputs once it does DSP processing run for a batch of input samples.
-///	
+///
 /// Notices: 
 /// - This is read-only parameter, i.e. setSetting ignores this parameter
-/// - Returned value is approximate average value, exact processing batch
-///   size may wary from time to time
-/// - This parameter value is not constant but may change depending on 
+/// - This parameter value is not constant but change depending on 
 ///   tempo/pitch/rate/samplerate settings.
-#define SETTING_NOMINAL_OUTPUT_SEQUENCE		7
+#define SETTING_NOMINAL_OUTPUT_SEQUENCE     7
+
+
+/// Call "getSetting" with this ID to query initial processing latency, i.e.
+/// approx. how many samples you'll need to enter to SoundTouch pipeline before 
+/// you can expect to get first batch of ready output samples out. 
+///
+/// After the first output batch, you can then expect to get approx. 
+/// SETTING_NOMINAL_OUTPUT_SEQUENCE ready samples out for every
+/// SETTING_NOMINAL_INPUT_SEQUENCE samples that you enter into SoundTouch.
+///
+/// Example:
+///     processing with parameter -tempo=5
+///     => initial latency = 5509 samples
+///        input sequence  = 4167 samples
+///        output sequence = 3969 samples
+///
+/// Accordingly, you can expect to feed in approx. 5509 samples at beginning of 
+/// the stream, and then you'll get out the first 3969 samples. After that, for 
+/// every approx. 4167 samples that you'll put in, you'll receive again approx. 
+/// 3969 samples out.
+///
+/// This also means that average latency during stream processing is 
+/// INITIAL_LATENCY-OUTPUT_SEQUENCE/2, in the above example case 5509-3969/2 
+/// = 3524 samples
+/// 
+/// Notices: 
+/// - This is read-only parameter, i.e. setSetting ignores this parameter
+/// - This parameter value is not constant but change depending on 
+///   tempo/pitch/rate/samplerate settings.
+#define SETTING_INITIAL_LATENCY             8
+
 
 class SoundTouch : public FIFOProcessor
 {
@@ -228,6 +259,24 @@ public:
     /// Sets sample rate.
     void setSampleRate(uint srate);
 
+    /// Get ratio between input and output audio durations, useful for calculating
+    /// processed output duration: if you'll process a stream of N samples, then 
+    /// you can expect to get out N * getInputOutputSampleRatio() samples.
+    ///
+    /// This ratio will give accurate target duration ratio for a full audio track, 
+    /// given that the the whole track is processed with same processing parameters.
+    /// 
+    /// If this ratio is applied to calculate intermediate offsets inside a processing
+    /// stream, then this ratio is approximate and can deviate +- some tens of milliseconds 
+    /// from ideal offset, yet by end of the audio stream the duration ratio will become
+    /// exact.
+    ///
+    /// Example: if processing with parameters "-tempo=15 -pitch=-3", the function
+    /// will return value 0.8695652... Now, if processing an audio stream whose duration
+    /// is exactly one million audio samples, then you can expect the processed 
+    /// output duration  be 0.869565 * 1000000 = 869565 samples.
+    double getInputOutputSampleRatio();
+
     /// Flushes the last samples from the processing pipeline to the output.
     /// Clears also the internal processing buffers.
     //
@@ -286,6 +335,11 @@ public:
     /// Returns number of samples currently unprocessed.
     virtual uint numUnprocessedSamples() const;
 
+    /// Return number of channels
+    uint numChannels() const
+    {
+        return channels;
+    }
 
     /// Other handy functions that are implemented in the ancestor classes (see
     /// classes 'FIFOProcessor' and 'FIFOSamplePipe')
diff --git a/include/soundtouch_config.h b/include/soundtouch_config.h
new file mode 100644
index 0000000..2e65377
--- /dev/null
+++ b/include/soundtouch_config.h
@@ -0,0 +1,5 @@
+/* Use Float as Sample type */
+#undef SOUNDTOUCH_FLOAT_SAMPLES
+
+/* Use Integer as Sample type */
+#define SOUNDTOUCH_INTEGER_SAMPLES 1
diff --git a/src/AAFilter.cpp b/src/AAFilter.cpp
old mode 100644
new mode 100755
index a254300..66114ab
--- a/src/AAFilter.cpp
+++ b/src/AAFilter.cpp
@@ -12,10 +12,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-05 23:40:22 +0200 (Sun, 05 Jan 2014) $
+// Last changed  : $Date: 2016-01-12 19:26:21 +0200 (ti, 12 tammi 2016) $
 // File revision : $Revision: 4 $
 //
-// $Id: AAFilter.cpp 177 2014-01-05 21:40:22Z oparviai $
+// $Id: AAFilter.cpp 240 2016-01-12 17:26:21Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -49,7 +49,7 @@
 
 using namespace soundtouch;
 
-#define PI        3.141592655357989
+#define PI       3.14159265358979323846
 #define TWOPI    (2 * PI)
 
 // define this to save AA filter coefficients to a file
diff --git a/src/AAFilter.h b/src/AAFilter.h
old mode 100644
new mode 100755
index 8c34b6b..fc06481
--- a/src/AAFilter.h
+++ b/src/AAFilter.h
@@ -13,7 +13,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-07 21:41:23 +0200 (Tue, 07 Jan 2014) $
+// Last changed  : $Date: 2014-01-07 21:41:23 +0200 (ti, 07 tammi 2014) $
 // File revision : $Revision: 4 $
 //
 // $Id: AAFilter.h 187 2014-01-07 19:41:23Z oparviai $
diff --git a/src/BPMDetect.cpp b/src/BPMDetect.cpp
new file mode 100755
index 0000000..66a7d05
--- /dev/null
+++ b/src/BPMDetect.cpp
@@ -0,0 +1,318 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Beats-per-minute (BPM) detection routine.
+///
+/// The beat detection algorithm works as follows:
+/// - Use function 'inputSamples' to input a chunks of samples to the class for
+///   analysis. It's a good idea to enter a large sound file or stream in smallish
+///   chunks of around few kilosamples in order not to extinguish too much RAM memory.
+/// - Inputted sound data is decimated to approx 500 Hz to reduce calculation burden,
+///   which is basically ok as low (bass) frequencies mostly determine the beat rate.
+///   Simple averaging is used for anti-alias filtering because the resulting signal
+///   quality isn't of that high importance.
+/// - Decimated sound data is enveloped, i.e. the amplitude shape is detected by
+///   taking absolute value that's smoothed by sliding average. Signal levels that
+///   are below a couple of times the general RMS amplitude level are cut away to
+///   leave only notable peaks there.
+/// - Repeating sound patterns (e.g. beats) are detected by calculating short-term 
+///   autocorrelation function of the enveloped signal.
+/// - After whole sound data file has been analyzed as above, the bpm level is 
+///   detected by function 'getBpm' that finds the highest peak of the autocorrelation 
+///   function, calculates it's precise location and converts this reading to bpm's.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2016-01-05 22:59:57 +0200 (ti, 05 tammi 2016) $
+// File revision : $Revision: 4 $
+//
+// $Id: BPMDetect.cpp 237 2016-01-05 20:59:57Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include "FIFOSampleBuffer.h"
+#include "PeakFinder.h"
+#include "BPMDetect.h"
+
+using namespace soundtouch;
+
+#define INPUT_BLOCK_SAMPLES       2048
+#define DECIMATED_BLOCK_SAMPLES   256
+
+/// Target sample rate after decimation
+const int target_srate = 1000;
+
+/// XCorr update sequence size, update in about 200msec chunks
+const int xcorr_update_sequence = 200;
+
+/// XCorr decay time constant, decay to half in 30 seconds
+/// If it's desired to have the system adapt quicker to beat rate 
+/// changes within a continuing music stream, then the 
+/// 'xcorr_decay_time_constant' value can be reduced, yet that
+/// can increase possibility of glitches in bpm detection.
+const double xcorr_decay_time_constant = 30.0;
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Enable following define to create bpm analysis file:
+
+// #define _CREATE_BPM_DEBUG_FILE
+
+#ifdef _CREATE_BPM_DEBUG_FILE
+
+    #define DEBUGFILE_NAME  "c:\\temp\\soundtouch-bpm-debug.txt"
+
+    static void _SaveDebugData(const float *data, int minpos, int maxpos, double coeff)
+    {
+        FILE *fptr = fopen(DEBUGFILE_NAME, "wt");
+        int i;
+
+        if (fptr)
+        {
+            printf("\n\nWriting BPM debug data into file " DEBUGFILE_NAME "\n\n");
+            for (i = minpos; i < maxpos; i ++)
+            {
+                fprintf(fptr, "%d\t%.1lf\t%f\n", i, coeff / (double)i, data[i]);
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    #define _SaveDebugData(a,b,c,d)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+BPMDetect::BPMDetect(int numChannels, int aSampleRate)
+{
+    this->sampleRate = aSampleRate;
+    this->channels = numChannels;
+
+    decimateSum = 0;
+    decimateCount = 0;
+
+    // choose decimation factor so that result is approx. 1000 Hz
+    decimateBy = sampleRate / target_srate;
+    assert(decimateBy > 0);
+    assert(INPUT_BLOCK_SAMPLES < decimateBy * DECIMATED_BLOCK_SAMPLES);
+
+    // Calculate window length & starting item according to desired min & max bpms
+    windowLen = (60 * sampleRate) / (decimateBy * MIN_BPM);
+    windowStart = (60 * sampleRate) / (decimateBy * MAX_BPM);
+
+    assert(windowLen > windowStart);
+
+    // allocate new working objects
+    xcorr = new float[windowLen];
+    memset(xcorr, 0, windowLen * sizeof(float));
+
+    // allocate processing buffer
+    buffer = new FIFOSampleBuffer();
+    // we do processing in mono mode
+    buffer->setChannels(1);
+    buffer->clear();
+}
+
+
+
+BPMDetect::~BPMDetect()
+{
+    delete[] xcorr;
+    delete buffer;
+}
+
+
+
+/// convert to mono, low-pass filter & decimate to about 500 Hz. 
+/// return number of outputted samples.
+///
+/// Decimation is used to remove the unnecessary frequencies and thus to reduce 
+/// the amount of data needed to be processed as calculating autocorrelation 
+/// function is a very-very heavy operation.
+///
+/// Anti-alias filtering is done simply by averaging the samples. This is really a 
+/// poor-man's anti-alias filtering, but it's not so critical in this kind of application
+/// (it'd also be difficult to design a high-quality filter with steep cut-off at very 
+/// narrow band)
+int BPMDetect::decimate(SAMPLETYPE *dest, const SAMPLETYPE *src, int numsamples)
+{
+    int count, outcount;
+    LONG_SAMPLETYPE out;
+
+    assert(channels > 0);
+    assert(decimateBy > 0);
+    outcount = 0;
+    for (count = 0; count < numsamples; count ++) 
+    {
+        int j;
+
+        // convert to mono and accumulate
+        for (j = 0; j < channels; j ++)
+        {
+            decimateSum += src[j];
+        }
+        src += j;
+
+        decimateCount ++;
+        if (decimateCount >= decimateBy) 
+        {
+            // Store every Nth sample only
+            out = (LONG_SAMPLETYPE)(decimateSum / (decimateBy * channels));
+            decimateSum = 0;
+            decimateCount = 0;
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+            // check ranges for sure (shouldn't actually be necessary)
+            if (out > 32767) 
+            {
+                out = 32767;
+            } 
+            else if (out < -32768) 
+            {
+                out = -32768;
+            }
+#endif // SOUNDTOUCH_INTEGER_SAMPLES
+            dest[outcount] = (SAMPLETYPE)out;
+            outcount ++;
+        }
+    }
+    return outcount;
+}
+
+
+
+// Calculates autocorrelation function of the sample history buffer
+void BPMDetect::updateXCorr(int process_samples)
+{
+    int offs;
+    SAMPLETYPE *pBuffer;
+    
+    assert(buffer->numSamples() >= (uint)(process_samples + windowLen));
+
+    pBuffer = buffer->ptrBegin();
+
+    // calculate decay factor for xcorr filtering
+    float xcorr_decay = (float)pow(0.5, 1.0 / (xcorr_decay_time_constant * target_srate / process_samples));
+
+    #pragma omp parallel for
+    for (offs = windowStart; offs < windowLen; offs ++) 
+    {
+        LONG_SAMPLETYPE sum;
+        int i;
+
+        sum = 0;
+        for (i = 0; i < process_samples; i ++) 
+        {
+            sum += pBuffer[i] * pBuffer[i + offs];    // scaling the sub-result shouldn't be necessary
+        }
+        xcorr[offs] *= xcorr_decay;   // decay 'xcorr' here with suitable time constant.
+
+        xcorr[offs] += (float)fabs(sum);
+    }
+}
+
+
+
+void BPMDetect::inputSamples(const SAMPLETYPE *samples, int numSamples)
+{
+    SAMPLETYPE decimated[DECIMATED_BLOCK_SAMPLES];
+
+    // iterate so that max INPUT_BLOCK_SAMPLES processed per iteration
+    while (numSamples > 0)
+    {
+        int block;
+        int decSamples;
+
+        block = (numSamples > INPUT_BLOCK_SAMPLES) ? INPUT_BLOCK_SAMPLES : numSamples;
+
+        // decimate. note that converts to mono at the same time
+        decSamples = decimate(decimated, samples, block);
+        samples += block * channels;
+        numSamples -= block;
+
+        buffer->putSamples(decimated, decSamples);
+    }
+
+    // when the buffer has enought samples for processing...
+    while ((int)buffer->numSamples() >= windowLen + xcorr_update_sequence) 
+    {
+        // ... calculate autocorrelations for oldest samples...
+        updateXCorr(xcorr_update_sequence);
+        // ... and remove these from the buffer
+        buffer->receiveSamples(xcorr_update_sequence);
+    }
+}
+
+
+
+void BPMDetect::removeBias()
+{
+    int i;
+    float minval = 1e12f;   // arbitrary large number
+
+    for (i = windowStart; i < windowLen; i ++)
+    {
+        if (xcorr[i] < minval)
+        {
+            minval = xcorr[i];
+        }
+    }
+
+    for (i = windowStart; i < windowLen; i ++)
+    {
+        xcorr[i] -= minval;
+    }
+}
+
+
+float BPMDetect::getBpm()
+{
+    double peakPos;
+    double coeff;
+    PeakFinder peakFinder;
+
+    coeff = 60.0 * ((double)sampleRate / (double)decimateBy);
+
+    // save bpm debug analysis data if debug data enabled
+    _SaveDebugData(xcorr, windowStart, windowLen, coeff);
+
+    // remove bias from xcorr data
+    removeBias();
+
+    // find peak position
+    peakPos = peakFinder.detectPeak(xcorr, windowStart, windowLen);
+
+    assert(decimateBy != 0);
+    if (peakPos < 1e-9) return 0.0; // detection failed.
+
+    // calculate BPM
+    return (float) (coeff / peakPos);
+}
diff --git a/src/FIFOSampleBuffer.cpp b/src/FIFOSampleBuffer.cpp
old mode 100644
new mode 100755
index 4d9740a..4f9b9a7
--- a/src/FIFOSampleBuffer.cpp
+++ b/src/FIFOSampleBuffer.cpp
@@ -15,7 +15,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2012-11-08 20:53:01 +0200 (Thu, 08 Nov 2012) $
+// Last changed  : $Date: 2012-11-08 20:53:01 +0200 (to, 08 marras 2012) $
 // File revision : $Revision: 4 $
 //
 // $Id: FIFOSampleBuffer.cpp 160 2012-11-08 18:53:01Z oparviai $
diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp
old mode 100644
new mode 100755
index 456418a..41ebc78
--- a/src/FIRFilter.cpp
+++ b/src/FIRFilter.cpp
@@ -2,19 +2,25 @@
 ///
 /// General FIR digital filter routines with MMX optimization. 
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific file, 
+/// Notes : MMX optimized functions reside in a separate, platform-specific file, 
 /// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
 ///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores 
+/// in parallel. See the following article link for more detailed discussion 
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
+///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
 /// SoundTouch WWW: http://www.surina.net/soundtouch
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-21 23:24:29 +0200 (Sat, 21 Feb 2015) $
+// Last changed  : $Date: 2015-11-05 19:46:08 +0200 (to, 05 marras 2015) $
 // File revision : $Revision: 4 $
 //
-// $Id: FIRFilter.cpp 202 2015-02-21 21:24:29Z oparviai $
+// $Id: FIRFilter.cpp 234 2015-11-05 17:46:08Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
diff --git a/src/FIRFilter.h b/src/FIRFilter.h
old mode 100644
new mode 100755
index 158cdd2..3a27d0a
--- a/src/FIRFilter.h
+++ b/src/FIRFilter.h
@@ -11,7 +11,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-21 23:24:29 +0200 (Sat, 21 Feb 2015) $
+// Last changed  : $Date: 2015-02-21 23:24:29 +0200 (la, 21 helmi 2015) $
 // File revision : $Revision: 4 $
 //
 // $Id: FIRFilter.h 202 2015-02-21 21:24:29Z oparviai $
diff --git a/src/InterpolateCubic.cpp b/src/InterpolateCubic.cpp
new file mode 100755
index 0000000..8aa7374
--- /dev/null
+++ b/src/InterpolateCubic.cpp
@@ -0,0 +1,200 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Cubic interpolation routine.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// $Id: InterpolateCubic.cpp 179 2014-01-06 18:41:42Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <stddef.h>
+#include <math.h>
+#include "InterpolateCubic.h"
+#include "STTypes.h"
+
+using namespace soundtouch;
+
+// cubic interpolation coefficients
+static const float _coeffs[]= 
+{ -0.5f,  1.0f, -0.5f, 0.0f,
+   1.5f, -2.5f,  0.0f, 1.0f,
+  -1.5f,  2.0f,  0.5f, 0.0f,
+   0.5f, -0.5f,  0.0f, 0.0f};
+
+
+InterpolateCubic::InterpolateCubic()
+{
+    fract = 0;
+}
+
+
+void InterpolateCubic::resetRegisters()
+{
+    fract = 0;
+}
+
+
+/// Transpose mono audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateCubic::transposeMono(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    int i;
+    int srcSampleEnd = srcSamples - 4;
+    int srcCount = 0;
+
+    i = 0;
+    while (srcCount < srcSampleEnd)
+    {
+        float out;
+        const float x3 = 1.0f;
+        const float x2 = (float)fract;    // x
+        const float x1 = x2*x2;           // x^2
+        const float x0 = x1*x2;           // x^3
+        float y0, y1, y2, y3;
+
+        assert(fract < 1.0);
+
+        y0 =  _coeffs[0] * x0 +  _coeffs[1] * x1 +  _coeffs[2] * x2 +  _coeffs[3] * x3;
+        y1 =  _coeffs[4] * x0 +  _coeffs[5] * x1 +  _coeffs[6] * x2 +  _coeffs[7] * x3;
+        y2 =  _coeffs[8] * x0 +  _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3;
+        y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3;
+
+        out = y0 * psrc[0] + y1 * psrc[1] + y2 * psrc[2] + y3 * psrc[3];
+
+        pdest[i] = (SAMPLETYPE)out;
+        i ++;
+
+        // update position fraction
+        fract += rate;
+        // update whole positions
+        int whole = (int)fract;
+        fract -= whole;
+        psrc += whole;
+        srcCount += whole;
+    }
+    srcSamples = srcCount;
+    return i;
+}
+
+
+/// Transpose stereo audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateCubic::transposeStereo(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    int i;
+    int srcSampleEnd = srcSamples - 4;
+    int srcCount = 0;
+
+    i = 0;
+    while (srcCount < srcSampleEnd)
+    {
+        const float x3 = 1.0f;
+        const float x2 = (float)fract;    // x
+        const float x1 = x2*x2;           // x^2
+        const float x0 = x1*x2;           // x^3
+        float y0, y1, y2, y3;
+        float out0, out1;
+
+        assert(fract < 1.0);
+
+        y0 =  _coeffs[0] * x0 +  _coeffs[1] * x1 +  _coeffs[2] * x2 +  _coeffs[3] * x3;
+        y1 =  _coeffs[4] * x0 +  _coeffs[5] * x1 +  _coeffs[6] * x2 +  _coeffs[7] * x3;
+        y2 =  _coeffs[8] * x0 +  _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3;
+        y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3;
+
+        out0 = y0 * psrc[0] + y1 * psrc[2] + y2 * psrc[4] + y3 * psrc[6];
+        out1 = y0 * psrc[1] + y1 * psrc[3] + y2 * psrc[5] + y3 * psrc[7];
+
+        pdest[2*i]   = (SAMPLETYPE)out0;
+        pdest[2*i+1] = (SAMPLETYPE)out1;
+        i ++;
+
+        // update position fraction
+        fract += rate;
+        // update whole positions
+        int whole = (int)fract;
+        fract -= whole;
+        psrc += 2*whole;
+        srcCount += whole;
+    }
+    srcSamples = srcCount;
+    return i;
+}
+
+
+/// Transpose multi-channel audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateCubic::transposeMulti(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    int i;
+    int srcSampleEnd = srcSamples - 4;
+    int srcCount = 0;
+
+    i = 0;
+    while (srcCount < srcSampleEnd)
+    {
+        const float x3 = 1.0f;
+        const float x2 = (float)fract;    // x
+        const float x1 = x2*x2;           // x^2
+        const float x0 = x1*x2;           // x^3
+        float y0, y1, y2, y3;
+
+        assert(fract < 1.0);
+
+        y0 =  _coeffs[0] * x0 +  _coeffs[1] * x1 +  _coeffs[2] * x2 +  _coeffs[3] * x3;
+        y1 =  _coeffs[4] * x0 +  _coeffs[5] * x1 +  _coeffs[6] * x2 +  _coeffs[7] * x3;
+        y2 =  _coeffs[8] * x0 +  _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3;
+        y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3;
+
+        for (int c = 0; c < numChannels; c ++)
+        {
+            float out;
+            out = y0 * psrc[c] + y1 * psrc[c + numChannels] + y2 * psrc[c + 2 * numChannels] + y3 * psrc[c + 3 * numChannels];
+            pdest[0] = (SAMPLETYPE)out;
+            pdest ++;
+        }
+        i ++;
+
+        // update position fraction
+        fract += rate;
+        // update whole positions
+        int whole = (int)fract;
+        fract -= whole;
+        psrc += numChannels*whole;
+        srcCount += whole;
+    }
+    srcSamples = srcCount;
+    return i;
+}
diff --git a/src/InterpolateCubic.h b/src/InterpolateCubic.h
new file mode 100755
index 0000000..f98e701
--- /dev/null
+++ b/src/InterpolateCubic.h
@@ -0,0 +1,67 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Cubic interpolation routine.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// $Id: InterpolateCubic.h 225 2015-07-26 14:45:48Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _InterpolateCubic_H_
+#define _InterpolateCubic_H_
+
+#include "RateTransposer.h"
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+class InterpolateCubic : public TransposerBase
+{
+protected:
+    virtual void resetRegisters();
+    virtual int transposeMono(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+    virtual int transposeStereo(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+    virtual int transposeMulti(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+
+    double fract;
+
+public:
+    InterpolateCubic();
+};
+
+}
+
+#endif
diff --git a/src/InterpolateLinear.cpp b/src/InterpolateLinear.cpp
old mode 100644
new mode 100755
diff --git a/src/InterpolateLinear.h b/src/InterpolateLinear.h
old mode 100644
new mode 100755
diff --git a/src/InterpolateShannon.cpp b/src/InterpolateShannon.cpp
new file mode 100755
index 0000000..1085fd1
--- /dev/null
+++ b/src/InterpolateShannon.cpp
@@ -0,0 +1,185 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sample interpolation routine using 8-tap band-limited Shannon interpolation 
+/// with kaiser window.
+///
+/// Notice. This algorithm is remarkably much heavier than linear or cubic
+/// interpolation, and not remarkably better than cubic algorithm. Thus mostly
+/// for experimental purposes
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// $Id: InterpolateShannon.cpp 195 2014-04-06 15:57:21Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <math.h>
+#include "InterpolateShannon.h"
+#include "STTypes.h"
+
+using namespace soundtouch;
+
+
+/// Kaiser window with beta = 2.0
+/// Values scaled down by 5% to avoid overflows
+static const double _kaiser8[8] = 
+{
+   0.41778693317814,
+   0.64888025049173,
+   0.83508562409944,
+   0.93887857733412,
+   0.93887857733412,
+   0.83508562409944,
+   0.64888025049173,
+   0.41778693317814
+};
+
+
+InterpolateShannon::InterpolateShannon()
+{
+    fract = 0;
+}
+
+
+void InterpolateShannon::resetRegisters()
+{
+    fract = 0;
+}
+
+
+#define PI 3.1415926536
+#define sinc(x) (sin(PI * (x)) / (PI * (x)))
+
+/// Transpose mono audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateShannon::transposeMono(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    int i;
+    int srcSampleEnd = srcSamples - 8;
+    int srcCount = 0;
+
+    i = 0;
+    while (srcCount < srcSampleEnd)
+    {
+        double out;
+        assert(fract < 1.0);
+
+        out  = psrc[0] * sinc(-3.0 - fract) * _kaiser8[0];
+        out += psrc[1] * sinc(-2.0 - fract) * _kaiser8[1];
+        out += psrc[2] * sinc(-1.0 - fract) * _kaiser8[2];
+        if (fract < 1e-6)
+        {
+            out += psrc[3] * _kaiser8[3];     // sinc(0) = 1
+        }
+        else
+        {
+            out += psrc[3] * sinc(- fract) * _kaiser8[3];
+        }
+        out += psrc[4] * sinc( 1.0 - fract) * _kaiser8[4];
+        out += psrc[5] * sinc( 2.0 - fract) * _kaiser8[5];
+        out += psrc[6] * sinc( 3.0 - fract) * _kaiser8[6];
+        out += psrc[7] * sinc( 4.0 - fract) * _kaiser8[7];
+
+        pdest[i] = (SAMPLETYPE)out;
+        i ++;
+
+        // update position fraction
+        fract += rate;
+        // update whole positions
+        int whole = (int)fract;
+        fract -= whole;
+        psrc += whole;
+        srcCount += whole;
+    }
+    srcSamples = srcCount;
+    return i;
+}
+
+
+/// Transpose stereo audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateShannon::transposeStereo(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    int i;
+    int srcSampleEnd = srcSamples - 8;
+    int srcCount = 0;
+
+    i = 0;
+    while (srcCount < srcSampleEnd)
+    {
+        double out0, out1, w;
+        assert(fract < 1.0);
+
+        w = sinc(-3.0 - fract) * _kaiser8[0];
+        out0 = psrc[0] * w; out1 = psrc[1] * w;
+        w = sinc(-2.0 - fract) * _kaiser8[1];
+        out0 += psrc[2] * w; out1 += psrc[3] * w;
+        w = sinc(-1.0 - fract) * _kaiser8[2];
+        out0 += psrc[4] * w; out1 += psrc[5] * w;
+        w = _kaiser8[3] * ((fract < 1e-5) ? 1.0 : sinc(- fract));   // sinc(0) = 1
+        out0 += psrc[6] * w; out1 += psrc[7] * w;
+        w = sinc( 1.0 - fract) * _kaiser8[4];
+        out0 += psrc[8] * w; out1 += psrc[9] * w;
+        w = sinc( 2.0 - fract) * _kaiser8[5];
+        out0 += psrc[10] * w; out1 += psrc[11] * w;
+        w = sinc( 3.0 - fract) * _kaiser8[6];
+        out0 += psrc[12] * w; out1 += psrc[13] * w;
+        w = sinc( 4.0 - fract) * _kaiser8[7];
+        out0 += psrc[14] * w; out1 += psrc[15] * w;
+
+        pdest[2*i]   = (SAMPLETYPE)out0;
+        pdest[2*i+1] = (SAMPLETYPE)out1;
+        i ++;
+
+        // update position fraction
+        fract += rate;
+        // update whole positions
+        int whole = (int)fract;
+        fract -= whole;
+        psrc += 2*whole;
+        srcCount += whole;
+    }
+    srcSamples = srcCount;
+    return i;
+}
+
+
+/// Transpose stereo audio. Returns number of produced output samples, and 
+/// updates "srcSamples" to amount of consumed source samples
+int InterpolateShannon::transposeMulti(SAMPLETYPE *pdest, 
+                    const SAMPLETYPE *psrc, 
+                    int &srcSamples)
+{
+    // not implemented
+    assert(false);
+    return 0;
+}
diff --git a/src/InterpolateShannon.h b/src/InterpolateShannon.h
new file mode 100755
index 0000000..54703d9
--- /dev/null
+++ b/src/InterpolateShannon.h
@@ -0,0 +1,72 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sample interpolation routine using 8-tap band-limited Shannon interpolation 
+/// with kaiser window.
+///
+/// Notice. This algorithm is remarkably much heavier than linear or cubic
+/// interpolation, and not remarkably better than cubic algorithm. Thus mostly
+/// for experimental purposes
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// $Id: InterpolateShannon.h 225 2015-07-26 14:45:48Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _InterpolateShannon_H_
+#define _InterpolateShannon_H_
+
+#include "RateTransposer.h"
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+class InterpolateShannon : public TransposerBase
+{
+protected:
+    void resetRegisters();
+    int transposeMono(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+    int transposeStereo(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+    int transposeMulti(SAMPLETYPE *dest, 
+                        const SAMPLETYPE *src, 
+                        int &srcSamples);
+
+    double fract;
+
+public:
+    InterpolateShannon();
+};
+
+}
+
+#endif
diff --git a/src/PeakFinder.cpp b/src/PeakFinder.cpp
new file mode 100755
index 0000000..20aba31
--- /dev/null
+++ b/src/PeakFinder.cpp
@@ -0,0 +1,286 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Peak detection routine. 
+///
+/// The routine detects highest value on an array of values and calculates the 
+/// precise peak location as a mass-center of the 'hump' around the peak value.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2015-05-18 18:22:02 +0300 (ma, 18 touko 2015) $
+// File revision : $Revision: 4 $
+//
+// $Id: PeakFinder.cpp 213 2015-05-18 15:22:02Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <math.h>
+#include <assert.h>
+
+#include "PeakFinder.h"
+
+using namespace soundtouch;
+
+#define max(x, y) (((x) > (y)) ? (x) : (y))
+
+
+PeakFinder::PeakFinder()
+{
+    minPos = maxPos = 0;
+}
+
+
+// Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'.
+int PeakFinder::findTop(const float *data, int peakpos) const
+{
+    int i;
+    int start, end;
+    float refvalue;
+
+    refvalue = data[peakpos];
+
+    // seek within �10 points
+    start = peakpos - 10;
+    if (start < minPos) start = minPos;
+    end = peakpos + 10;
+    if (end > maxPos) end = maxPos;
+
+    for (i = start; i <= end; i ++)
+    {
+        if (data[i] > refvalue)
+        {
+            peakpos = i;
+            refvalue = data[i];
+        }
+    }
+
+    // failure if max value is at edges of seek range => it's not peak, it's at slope.
+    if ((peakpos == start) || (peakpos == end)) return 0;
+
+    return peakpos;
+}
+
+
+// Finds 'ground level' of a peak hump by starting from 'peakpos' and proceeding
+// to direction defined by 'direction' until next 'hump' after minimum value will 
+// begin
+int PeakFinder::findGround(const float *data, int peakpos, int direction) const
+{
+    int lowpos;
+    int pos;
+    int climb_count;
+    float refvalue;
+    float delta;
+
+    climb_count = 0;
+    refvalue = data[peakpos];
+    lowpos = peakpos;
+
+    pos = peakpos;
+
+    while ((pos > minPos+1) && (pos < maxPos-1))
+    {
+        int prevpos;
+
+        prevpos = pos;
+        pos += direction;
+
+        // calculate derivate
+        delta = data[pos] - data[prevpos];
+        if (delta <= 0)
+        {
+            // going downhill, ok
+            if (climb_count)
+            {
+                climb_count --;  // decrease climb count
+            }
+
+            // check if new minimum found
+            if (data[pos] < refvalue)
+            {
+                // new minimum found
+                lowpos = pos;
+                refvalue = data[pos];
+            }
+        }
+        else
+        {
+            // going uphill, increase climbing counter
+            climb_count ++;
+            if (climb_count > 5) break;    // we've been climbing too long => it's next uphill => quit
+        }
+    }
+    return lowpos;
+}
+
+
+// Find offset where the value crosses the given level, when starting from 'peakpos' and
+// proceeds to direction defined in 'direction'
+int PeakFinder::findCrossingLevel(const float *data, float level, int peakpos, int direction) const
+{
+    float peaklevel;
+    int pos;
+
+    peaklevel = data[peakpos];
+    assert(peaklevel >= level);
+    pos = peakpos;
+    while ((pos >= minPos) && (pos < maxPos))
+    {
+        if (data[pos + direction] < level) return pos;   // crossing found
+        pos += direction;
+    }
+    return -1;  // not found
+}
+
+
+// Calculates the center of mass location of 'data' array items between 'firstPos' and 'lastPos'
+double PeakFinder::calcMassCenter(const float *data, int firstPos, int lastPos) const
+{
+    int i;
+    float sum;
+    float wsum;
+
+    sum = 0;
+    wsum = 0;
+    for (i = firstPos; i <= lastPos; i ++)
+    {
+        sum += (float)i * data[i];
+        wsum += data[i];
+    }
+
+    if (wsum < 1e-6) return 0;
+    return sum / wsum;
+}
+
+
+
+/// get exact center of peak near given position by calculating local mass of center
+double PeakFinder::getPeakCenter(const float *data, int peakpos) const
+{
+    float peakLevel;            // peak level
+    int crosspos1, crosspos2;   // position where the peak 'hump' crosses cutting level
+    float cutLevel;             // cutting value
+    float groundLevel;          // ground level of the peak
+    int gp1, gp2;               // bottom positions of the peak 'hump'
+
+    // find ground positions.
+    gp1 = findGround(data, peakpos, -1);
+    gp2 = findGround(data, peakpos, 1);
+
+    peakLevel = data[peakpos];
+
+    if (gp1 == gp2) 
+    {
+        // avoid rounding errors when all are equal
+        assert(gp1 == peakpos);
+        cutLevel = groundLevel = peakLevel;
+    } else {
+        // get average of the ground levels
+        groundLevel = 0.5f * (data[gp1] + data[gp2]);
+
+        // calculate 70%-level of the peak
+        cutLevel = 0.70f * peakLevel + 0.30f * groundLevel;
+    }
+
+    // find mid-level crossings
+    crosspos1 = findCrossingLevel(data, cutLevel, peakpos, -1);
+    crosspos2 = findCrossingLevel(data, cutLevel, peakpos, 1);
+
+    if ((crosspos1 < 0) || (crosspos2 < 0)) return 0;   // no crossing, no peak..
+
+    // calculate mass center of the peak surroundings
+    return calcMassCenter(data, crosspos1, crosspos2);
+}
+
+
+
+double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos) 
+{
+
+    int i;
+    int peakpos;                // position of peak level
+    double highPeak, peak;
+
+    this->minPos = aminPos;
+    this->maxPos = amaxPos;
+
+    // find absolute peak
+    peakpos = minPos;
+    peak = data[minPos];
+    for (i = minPos + 1; i < maxPos; i ++)
+    {
+        if (data[i] > peak) 
+        {
+            peak = data[i];
+            peakpos = i;
+        }
+    }
+    
+    // Calculate exact location of the highest peak mass center
+    highPeak = getPeakCenter(data, peakpos);
+    peak = highPeak;
+
+    // Now check if the highest peak were in fact harmonic of the true base beat peak 
+    // - sometimes the highest peak can be Nth harmonic of the true base peak yet 
+    // just a slightly higher than the true base
+
+    for (i = 3; i < 10; i ++)
+    {
+        double peaktmp, harmonic;
+        int i1,i2;
+
+        harmonic = (double)i * 0.5;
+        peakpos = (int)(highPeak / harmonic + 0.5f);
+        if (peakpos < minPos) break;
+        peakpos = findTop(data, peakpos);   // seek true local maximum index
+        if (peakpos == 0) continue;         // no local max here
+
+        // calculate mass-center of possible harmonic peak
+        peaktmp = getPeakCenter(data, peakpos);
+
+        // accept harmonic peak if 
+        // (a) it is found
+        // (b) is within �4% of the expected harmonic interval
+        // (c) has at least half x-corr value of the max. peak
+
+        double diff = harmonic * peaktmp / highPeak;
+        if ((diff < 0.96) || (diff > 1.04)) continue;   // peak too afar from expected
+
+        // now compare to highest detected peak
+        i1 = (int)(highPeak + 0.5);
+        i2 = (int)(peaktmp + 0.5);
+        if (data[i2] >= 0.4*data[i1])
+        {
+            // The harmonic is at least half as high primary peak,
+            // thus use the harmonic peak instead
+            peak = peaktmp;
+        }
+    }
+
+    return peak;
+}
diff --git a/src/PeakFinder.h b/src/PeakFinder.h
new file mode 100755
index 0000000..bd2d9bf
--- /dev/null
+++ b/src/PeakFinder.h
@@ -0,0 +1,97 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// The routine detects highest value on an array of values and calculates the 
+/// precise peak location as a mass-center of the 'hump' around the peak value.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2011-12-30 22:33:46 +0200 (pe, 30 joulu 2011) $
+// File revision : $Revision: 4 $
+//
+// $Id: PeakFinder.h 132 2011-12-30 20:33:46Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _PeakFinder_H_
+#define _PeakFinder_H_
+
+namespace soundtouch
+{
+
+class PeakFinder
+{
+protected:
+    /// Min, max allowed peak positions within the data vector
+    int minPos, maxPos;
+
+    /// Calculates the mass center between given vector items.
+    double calcMassCenter(const float *data, ///< Data vector.
+                         int firstPos,      ///< Index of first vector item beloging to the peak.
+                         int lastPos        ///< Index of last vector item beloging to the peak.
+                         ) const;
+
+    /// Finds the data vector index where the monotoniously decreasing signal crosses the
+    /// given level.
+    int   findCrossingLevel(const float *data,  ///< Data vector.
+                            float level,        ///< Goal crossing level.
+                            int peakpos,        ///< Peak position index within the data vector.
+                            int direction       /// Direction where to proceed from the peak: 1 = right, -1 = left.
+                            ) const;
+
+    // Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'.
+    int findTop(const float *data, int peakpos) const;
+
+
+    /// Finds the 'ground' level, i.e. smallest level between two neighbouring peaks, to right- 
+    /// or left-hand side of the given peak position.
+    int   findGround(const float *data,     /// Data vector.
+                     int peakpos,           /// Peak position index within the data vector.
+                     int direction          /// Direction where to proceed from the peak: 1 = right, -1 = left.
+                     ) const;
+
+    /// get exact center of peak near given position by calculating local mass of center
+    double getPeakCenter(const float *data, int peakpos) const;
+
+public:
+    /// Constructor. 
+    PeakFinder();
+
+    /// Detect exact peak position of the data vector by finding the largest peak 'hump'
+    /// and calculating the mass-center location of the peak hump.
+    ///
+    /// \return The location of the largest base harmonic peak hump.
+    double detectPeak(const float *data, /// Data vector to be analyzed. The data vector has
+                                        /// to be at least 'maxPos' items long.
+                     int minPos,        ///< Min allowed peak location within the vector data.
+                     int maxPos         ///< Max allowed peak location within the vector data.
+                     );
+};
+
+}
+
+#endif // _PeakFinder_H_
diff --git a/src/RateTransposer.cpp b/src/RateTransposer.cpp
old mode 100644
new mode 100755
index 6e0f03e..5d3cee1
--- a/src/RateTransposer.cpp
+++ b/src/RateTransposer.cpp
@@ -10,10 +10,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $
+// Last changed  : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $
 // File revision : $Revision: 4 $
 //
-// $Id: RateTransposer.cpp 225 2015-07-26 14:45:48Z oparviai $
+// $Id: RateTransposer.cpp 243 2016-10-15 19:34:59Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -44,6 +44,8 @@
 #include <stdio.h>
 #include "RateTransposer.h"
 #include "InterpolateLinear.h"
+#include "InterpolateCubic.h"
+#include "InterpolateShannon.h"
 #include "AAFilter.h"
 
 using namespace soundtouch;
@@ -206,6 +208,13 @@ int RateTransposer::isEmpty() const
 }
 
 
+/// Return approximate initial input-output latency
+int RateTransposer::getLatency() const
+{
+    return (bUseAAFilter) ? pAAFilter->getLength() : 0;
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // TransposerBase - Base class for interpolation
diff --git a/src/RateTransposer.h b/src/RateTransposer.h
old mode 100644
new mode 100755
index 8d0eab8..561f490
--- a/src/RateTransposer.h
+++ b/src/RateTransposer.h
@@ -14,10 +14,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $
+// Last changed  : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $
 // File revision : $Revision: 4 $
 //
-// $Id: RateTransposer.h 225 2015-07-26 14:45:48Z oparviai $
+// $Id: RateTransposer.h 243 2016-10-15 19:34:59Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -172,6 +172,9 @@ public:
 
     /// Returns nonzero if there aren't any samples available for outputting.
     int isEmpty() const;
+
+    /// Return approximate initial input-output latency
+    int getLatency() const;
 };
 
 }
diff --git a/src/SoundTouch.cpp b/src/SoundTouch.cpp
old mode 100644
new mode 100755
index 2c715e2..e6ad0a0
--- a/src/SoundTouch.cpp
+++ b/src/SoundTouch.cpp
@@ -41,10 +41,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $
+// Last changed  : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $
 // File revision : $Revision: 4 $
 //
-// $Id: SoundTouch.cpp 225 2015-07-26 14:45:48Z oparviai $
+// $Id: SoundTouch.cpp 243 2016-10-15 19:34:59Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -110,8 +110,8 @@ SoundTouch::SoundTouch()
 
     calcEffectiveRateAndTempo();
 
-	samplesExpectedOut = 0;
-	samplesOutput = 0;
+    samplesExpectedOut = 0;
+    samplesOutput = 0;
 
     channels = 0;
     bSrateSet = false;
@@ -149,7 +149,7 @@ void SoundTouch::setChannels(uint numChannels)
     /*if (numChannels != 1 && numChannels != 2) 
     {
         //ST_THROW_RT_ERROR("Illegal number of channels");
-		return;
+        return;
     }*/
     channels = numChannels;
     pRateTransposer->setChannels((int)numChannels);
@@ -240,11 +240,11 @@ void SoundTouch::calcEffectiveRateAndTempo()
     double oldTempo = tempo;
     double oldRate = rate;
 
-	tempo = virtualTempo / virtualPitch;
-	rate = virtualPitch * virtualRate;
+    tempo = virtualTempo / virtualPitch;
+    rate = virtualPitch * virtualRate;
 
     if (!TEST_FLOAT_EQUAL(rate,oldRate)) pRateTransposer->setRate(rate);
-	if (!TEST_FLOAT_EQUAL(tempo, oldTempo)) pTDStretch->setTempo(tempo);
+    if (!TEST_FLOAT_EQUAL(tempo, oldTempo)) pTDStretch->setTempo(tempo);
 
 #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
     if (rate <= 1.0f) 
@@ -321,9 +321,9 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples)
     } 
     */
 
-	// accumulate how many samples are expected out from processing, given the current 
-	// processing setting
-	samplesExpectedOut += (double)nSamples / ((double)rate * (double)tempo);
+    // accumulate how many samples are expected out from processing, given the current 
+    // processing setting
+    samplesExpectedOut += (double)nSamples / ((double)rate * (double)tempo);
 
 #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
     if (rate <= 1.0f) 
@@ -354,23 +354,24 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples)
 void SoundTouch::flush()
 {
     int i;
-	int numStillExpected;
+    int numStillExpected;
     SAMPLETYPE *buff = new SAMPLETYPE[128 * channels];
 
-	// how many samples are still expected to output
-	numStillExpected = (int)((long)(samplesExpectedOut + 0.5) - samplesOutput);
+    // how many samples are still expected to output
+    numStillExpected = (int)((long)(samplesExpectedOut + 0.5) - samplesOutput);
+    if (numStillExpected < 0) numStillExpected = 0;
 
     memset(buff, 0, 128 * channels * sizeof(SAMPLETYPE));
     // "Push" the last active samples out from the processing pipeline by
     // feeding blank samples into the processing pipeline until new, 
     // processed samples appear in the output (not however, more than 
     // 24ksamples in any case)
-	for (i = 0; (numStillExpected > (int)numSamples()) && (i < 200); i ++)
-	{
-		putSamples(buff, 128);
-	}
+    for (i = 0; (numStillExpected > (int)numSamples()) && (i < 200); i ++)
+    {
+        putSamples(buff, 128);
+    }
 
-	adjustAmountOfSamples(numStillExpected);
+    adjustAmountOfSamples(numStillExpected);
 
     delete[] buff;
 
@@ -446,7 +447,7 @@ int SoundTouch::getSetting(int settingId) const
             return pRateTransposer->getAAFilter()->getLength();
 
         case SETTING_USE_QUICKSEEK :
-            return (uint)   pTDStretch->isQuickSeekEnabled();
+            return (uint)pTDStretch->isQuickSeekEnabled();
 
         case SETTING_SEQUENCE_MS:
             pTDStretch->getParameters(NULL, &temp, NULL, NULL);
@@ -460,23 +461,65 @@ int SoundTouch::getSetting(int settingId) const
             pTDStretch->getParameters(NULL, NULL, NULL, &temp);
             return temp;
 
-		case SETTING_NOMINAL_INPUT_SEQUENCE :
-			return pTDStretch->getInputSampleReq();
+        case SETTING_NOMINAL_INPUT_SEQUENCE :
+        {
+            int size = pTDStretch->getInputSampleReq();
 
-		case SETTING_NOMINAL_OUTPUT_SEQUENCE :
-			return pTDStretch->getOutputBatchSize();
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+            if (rate <= 1.0)
+            {
+                // transposing done before timestretch, which impacts latency
+                return (int)(size * rate + 0.5);
+            }
+#endif
+            return size;
+        }
 
-		default :
+        case SETTING_NOMINAL_OUTPUT_SEQUENCE :
+        {
+            int size = pTDStretch->getOutputBatchSize();
+
+            if (rate > 1.0)
+            {
+                // transposing done after timestretch, which impacts latency
+                return (int)(size / rate + 0.5);
+            }
+            return size;
+        }
+
+        case SETTING_INITIAL_LATENCY:
+        {
+            double latency = pTDStretch->getLatency();
+            int latency_tr = pRateTransposer->getLatency();
+
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+            if (rate <= 1.0)
+            {
+                // transposing done before timestretch, which impacts latency
+                latency = (latency + latency_tr) * rate;
+            }
+            else
+#endif
+            {
+                latency += (double)latency_tr / rate;
+            }
+
+            return (int)(latency + 0.5);
+        }
+
+        default :
             return 0;
     }
 }
 
 
+
 // Clears all the samples in the object's output and internal processing
 // buffers.
 void SoundTouch::clear()
 {
-	samplesExpectedOut = 0;
+    samplesExpectedOut = 0;
+    samplesOutput = 0;
     pRateTransposer->clear();
     pTDStretch->clear();
 }
@@ -507,9 +550,9 @@ uint SoundTouch::numUnprocessedSamples() const
 /// \return Number of samples returned.
 uint SoundTouch::receiveSamples(SAMPLETYPE *output, uint maxSamples)
 {
-	uint ret = FIFOProcessor::receiveSamples(output, maxSamples);
-	samplesOutput += (long)ret;
-	return ret;
+    uint ret = FIFOProcessor::receiveSamples(output, maxSamples);
+    samplesOutput += (long)ret;
+    return ret;
 }
 
 
@@ -520,7 +563,16 @@ uint SoundTouch::receiveSamples(SAMPLETYPE *output, uint maxSamples)
 /// with 'ptrBegin' function.
 uint SoundTouch::receiveSamples(uint maxSamples)
 {
-	uint ret = FIFOProcessor::receiveSamples(maxSamples);
-	samplesOutput += (long)ret;
-	return ret;
+    uint ret = FIFOProcessor::receiveSamples(maxSamples);
+    samplesOutput += (long)ret;
+    return ret;
+}
+
+
+/// Get ratio between input and output audio durations, useful for calculating
+/// processed output duration: if you'll process a stream of N samples, then 
+/// you can expect to get out N * getInputOutputSampleRatio() samples.
+double SoundTouch::getInputOutputSampleRatio()
+{
+    return 1.0 / (tempo * rate);
 }
diff --git a/src/TDStretch.cpp b/src/TDStretch.cpp
old mode 100644
new mode 100755
index d030558..f3a4636
--- a/src/TDStretch.cpp
+++ b/src/TDStretch.cpp
@@ -4,8 +4,14 @@
 /// while maintaining the original pitch by using a time domain WSOLA-like 
 /// method with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific 
-/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Notes : MMX optimized functions reside in a separate, platform-specific 
+/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
+///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores 
+/// in parallel. See the following article link for more detailed discussion 
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -13,10 +19,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $
+// Last changed  : $Date: 2017-04-07 22:01:22 +0300 (pe, 07 huhti 2017) $
 // File revision : $Revision: 1.12 $
 //
-// $Id: TDStretch.cpp 226 2015-08-08 21:00:15Z oparviai $
+// $Id: TDStretch.cpp 249 2017-04-07 19:01:22Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -219,6 +225,7 @@ void TDStretch::clearInput()
 {
     inputBuffer.clear();
     clearMidBuffer();
+    isBeginning = true;
 }
 
 
@@ -297,12 +304,13 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
     int i;
     double norm;
 
-    bestCorr = FLT_MIN;
+    bestCorr = -FLT_MAX;
     bestOffs = 0;
 
     // Scans for the best correlation value by testing each possible position
     // over the permitted range.
     bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
+    bestCorr = (bestCorr + 0.1) * 0.75;
 
     #pragma omp parallel for
     for (i = 1; i < seekLength; i ++) 
@@ -373,12 +381,10 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
 
     // note: 'float' types used in this function in case that the platform would need to use software-fp
 
-    bestCorr = FLT_MIN;
-    bestOffs = SCANWIND;
-    bestCorr2 = FLT_MIN;
-    bestOffs2 = 0;
-
-    int best = 0;
+    bestCorr =
+    bestCorr2 = -FLT_MAX;
+    bestOffs = 
+    bestOffs2 = SCANWIND;
 
     // Scans for the best correlation value by testing each possible position
     // over the permitted range. Look for two best matches on the first pass to
@@ -436,7 +442,6 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
         {
             bestCorr = corr;
             bestOffs = i;
-            best = 1;
         }
     }
 
@@ -458,7 +463,6 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
         {
             bestCorr = corr;
             bestOffs = i;
-            best = 2;
         }
     }
 
@@ -520,13 +524,13 @@ void TDStretch::calcSeqParameters()
     #define AUTOSEQ_TEMPO_TOP   2.0     // auto setting top tempo range (+100%)
 
     // sequence-ms setting values at above low & top tempo
-    #define AUTOSEQ_AT_MIN      125.0
-    #define AUTOSEQ_AT_MAX      50.0
+    #define AUTOSEQ_AT_MIN      90.0
+    #define AUTOSEQ_AT_MAX      40.0
     #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
 
     // seek-window-ms setting values at above low & top tempoq
-    #define AUTOSEEK_AT_MIN     25.0
+    #define AUTOSEEK_AT_MIN     20.0
     #define AUTOSEEK_AT_MAX     15.0
     #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEEK_C          (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
@@ -637,7 +641,8 @@ void TDStretch::processNominalTempo()
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
 {
-    int ovlSkip, offset;
+    int ovlSkip;
+    int offset = 0;
     int temp;
 
     /* Removed this small optimization - can introduce a click to sound when tempo setting
@@ -654,35 +659,61 @@ void TDStretch::processSamples()
     // to form a processing frame.
     while ((int)inputBuffer.numSamples() >= sampleReq) 
     {
-        // If tempo differs from the normal ('SCALE'), scan for the best overlapping
-        // position
-        offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
+        if (isBeginning == false)
+        {
+            // apart from the very beginning of the track, 
+            // scan for the best overlapping position & do overlap-add
+            offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
 
-        // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
-        // samples in 'midBuffer' using sliding overlapping
-        // ... first partially overlap with the end of the previous sequence
-        // (that's in 'midBuffer')
-        overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
-        outputBuffer.putSamples((uint)overlapLength);
+            // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
+            // samples in 'midBuffer' using sliding overlapping
+            // ... first partially overlap with the end of the previous sequence
+            // (that's in 'midBuffer')
+            overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
+            outputBuffer.putSamples((uint)overlapLength);
+            offset += overlapLength;
+        }
+        else
+        {
+            // Adjust processing offset at beginning of track by not perform initial overlapping
+            // and compensating that in the 'input buffer skip' calculation
+            isBeginning = false;
+            int skip = (int)(tempo * overlapLength + 0.5);
+
+            #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+                #ifdef SOUNDTOUCH_ALLOW_SSE
+                // if SSE mode, round the skip amount to value corresponding to aligned memory address
+                if (channels == 1)
+                {
+                    skip &= -4;
+                }
+                else if (channels == 2)
+                {
+                    skip &= -2;
+                }
+                #endif
+            #endif
+            skipFract -= skip;
+            assert(nominalSkip >= -skipFract);
+        }
 
         // ... then copy sequence samples from 'inputBuffer' to output:
 
-        // length of sequence
-        temp = (seekWindowLength - 2 * overlapLength);
-
         // crosscheck that we don't have buffer overflow...
-        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
+        if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
         {
             continue;    // just in case, shouldn't really happen
         }
 
-        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+        // length of sequence
+        temp = (seekWindowLength - 2 * overlapLength);
+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
 
         // Copies the end of the current sequence from 'inputBuffer' to 
         // 'midBuffer' for being mixed with the beginning of the next 
         // processing sequence and so on
-        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
+        assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), 
             channels * sizeof(SAMPLETYPE) * overlapLength);
 
         // Remove the processed samples from the input buffer. Update
@@ -879,7 +910,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
 
     if (lnorm > maxnorm)
     {
-        maxnorm = lnorm;
+        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+        #pragma omp critical
+        if (lnorm > maxnorm)
+        {
+            maxnorm = lnorm;
+        }
     }
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
diff --git a/src/TDStretch.h b/src/TDStretch.h
old mode 100644
new mode 100755
index e6d75aa..bde8282
--- a/src/TDStretch.h
+++ b/src/TDStretch.h
@@ -13,10 +13,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $
+// Last changed  : $Date: 2016-10-20 19:30:11 +0300 (to, 20 loka 2016) $
 // File revision : $Revision: 4 $
 //
-// $Id: TDStretch.h 226 2015-08-08 21:00:15Z oparviai $
+// $Id: TDStretch.h 244 2016-10-20 16:30:11Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -134,6 +134,7 @@ protected:
     bool bQuickSeek;
     bool bAutoSeqSetting;
     bool bAutoSeekSetting;
+    bool isBeginning;
 
     SAMPLETYPE *pMidBuffer;
     SAMPLETYPE *pMidBufferUnaligned;
@@ -247,6 +248,13 @@ public:
     {
         return seekWindowLength - overlapLength;
     }
+
+
+	/// return approximate initial input-output latency
+	int getLatency() const
+	{
+		return sampleReq;
+	}
 };
 
 
diff --git a/src/cpu_detect.h b/src/cpu_detect.h
old mode 100644
new mode 100755
index 7859ffb..0023c14
--- a/src/cpu_detect.h
+++ b/src/cpu_detect.h
@@ -12,7 +12,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2008-02-10 18:26:55 +0200 (Sun, 10 Feb 2008) $
+// Last changed  : $Date: 2008-02-10 18:26:55 +0200 (su, 10 helmi 2008) $
 // File revision : $Revision: 4 $
 //
 // $Id: cpu_detect.h 11 2008-02-10 16:26:55Z oparviai $
diff --git a/src/cpu_detect_x86.cpp b/src/cpu_detect_x86.cpp
old mode 100644
new mode 100755
index 00f22ab..98799a6
--- a/src/cpu_detect_x86.cpp
+++ b/src/cpu_detect_x86.cpp
@@ -11,7 +11,7 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-07 20:24:28 +0200 (Tue, 07 Jan 2014) $
+// Last changed  : $Date: 2014-01-07 20:24:28 +0200 (ti, 07 tammi 2014) $
 // File revision : $Revision: 4 $
 //
 // $Id: cpu_detect_x86.cpp 183 2014-01-07 18:24:28Z oparviai $
diff --git a/src/mmx_optimized.cpp b/src/mmx_optimized.cpp
new file mode 100755
index 0000000..16557b7
--- /dev/null
+++ b/src/mmx_optimized.cpp
@@ -0,0 +1,400 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// MMX optimized routines. All MMX optimized functions have been gathered into 
+/// this single source code file, regardless to their class or original source 
+/// code file, in order to ease porting the library to other compiler and 
+/// processor platforms.
+///
+/// The MMX-optimizations are programmed using MMX compiler intrinsics that
+/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
+/// should compile with both toolsets.
+///
+/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
+/// 6.0 processor pack" update to support compiler intrinsic syntax. The update
+/// is available for download at Microsoft Developers Network, see here:
+/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2017-03-05 15:56:03 +0200 (su, 05 maalis 2017) $
+// File revision : $Revision: 4 $
+//
+// $Id: mmx_optimized.cpp 247 2017-03-05 13:56:03Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "STTypes.h"
+
+#ifdef SOUNDTOUCH_ALLOW_MMX
+// MMX routines available only with integer sample type
+
+using namespace soundtouch;
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of MMX optimized functions of class 'TDStretchMMX'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <mmintrin.h>
+#include <limits.h>
+#include <math.h>
+
+
+// Calculates cross correlation of two buffers
+double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
+{
+    const __m64 *pVec1, *pVec2;
+    __m64 shifter;
+    __m64 accu, normaccu;
+    long corr, norm;
+    int i;
+   
+    pVec1 = (__m64*)pV1;
+    pVec2 = (__m64*)pV2;
+
+    shifter = _m_from_int(overlapDividerBitsNorm);
+    normaccu = accu = _mm_setzero_si64();
+
+    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
+    // during each round for improved CPU-level parallellization.
+    for (i = 0; i < channels * overlapLength / 16; i ++)
+    {
+        __m64 temp, temp2;
+
+        // dictionary of instructions:
+        // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
+        // _mm_add_pi32 : 2*32bit add
+        // _m_psrad     : 32bit right-shift
+
+        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
+        temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter));
+        accu = _mm_add_pi32(accu, temp);
+        normaccu = _mm_add_pi32(normaccu, temp2);
+
+        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
+        temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter));
+        accu = _mm_add_pi32(accu, temp);
+        normaccu = _mm_add_pi32(normaccu, temp2);
+
+        pVec1 += 4;
+        pVec2 += 4;
+    }
+
+    // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
+    // and finally store the result into the variable "corr"
+
+    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
+    corr = _m_to_int(accu);
+
+    normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
+    norm = _m_to_int(normaccu);
+
+    // Clear MMS state
+    _m_empty();
+
+    if (norm > (long)maxnorm)
+    {
+        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+        #pragma omp critical
+        if (norm > (long)maxnorm)
+        {
+            maxnorm = norm;
+        }
+    }
+
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    dnorm = (double)norm;
+
+    return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm);
+    // Note: Warning about the missing EMMS instruction is harmless
+    // as it'll be called elsewhere.
+}
+
+
+/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
+double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
+{
+    const __m64 *pVec1, *pVec2;
+    __m64 shifter;
+    __m64 accu;
+    long corr, lnorm;
+    int i;
+   
+    // cancel first normalizer tap from previous round
+    lnorm = 0;
+    for (i = 1; i <= channels; i ++)
+    {
+        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBitsNorm;
+    }
+
+    pVec1 = (__m64*)pV1;
+    pVec2 = (__m64*)pV2;
+
+    shifter = _m_from_int(overlapDividerBitsNorm);
+    accu = _mm_setzero_si64();
+
+    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
+    // during each round for improved CPU-level parallellization.
+    for (i = 0; i < channels * overlapLength / 16; i ++)
+    {
+        __m64 temp;
+
+        // dictionary of instructions:
+        // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
+        // _mm_add_pi32 : 2*32bit add
+        // _m_psrad     : 32bit right-shift
+
+        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
+        accu = _mm_add_pi32(accu, temp);
+
+        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
+                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
+        accu = _mm_add_pi32(accu, temp);
+
+        pVec1 += 4;
+        pVec2 += 4;
+    }
+
+    // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
+    // and finally store the result into the variable "corr"
+
+    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
+    corr = _m_to_int(accu);
+
+    // Clear MMS state
+    _m_empty();
+
+    // update normalizer with last samples of this round
+    pV1 = (short *)pVec1;
+    for (int j = 1; j <= channels; j ++)
+    {
+        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBitsNorm;
+    }
+    dnorm += (double)lnorm;
+
+    if (lnorm > (long)maxnorm)
+    {
+        maxnorm = lnorm;
+    }
+
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm);
+}
+
+
+void TDStretchMMX::clearCrossCorrState()
+{
+    // Clear MMS state
+    _m_empty();
+    //_asm EMMS;
+}
+
+
+
+// MMX-optimized version of the function overlapStereo
+void TDStretchMMX::overlapStereo(short *output, const short *input) const
+{
+    const __m64 *pVinput, *pVMidBuf;
+    __m64 *pVdest;
+    __m64 mix1, mix2, adder, shifter;
+    int i;
+
+    pVinput  = (const __m64*)input;
+    pVMidBuf = (const __m64*)pMidBuffer;
+    pVdest   = (__m64*)output;
+
+    // mix1  = mixer values for 1st stereo sample
+    // mix1  = mixer values for 2nd stereo sample
+    // adder = adder for updating mixer values after each round
+    
+    mix1  = _mm_set_pi16(0, overlapLength,   0, overlapLength);
+    adder = _mm_set_pi16(1, -1, 1, -1);
+    mix2  = _mm_add_pi16(mix1, adder);
+    adder = _mm_add_pi16(adder, adder);
+
+    // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
+    // overlapDividerBits calculation earlier.
+    shifter = _m_from_int(overlapDividerBitsPure + 1);
+
+    for (i = 0; i < overlapLength / 4; i ++)
+    {
+        __m64 temp1, temp2;
+                
+        // load & shuffle data so that input & mixbuffer data samples are paired
+        temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]);     // = i0l m0l i0r m0r
+        temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]);     // = i1l m1l i1r m1r
+
+        // temp = (temp .* mix) >> shifter
+        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
+        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
+        pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
+
+        // update mix += adder
+        mix1 = _mm_add_pi16(mix1, adder);
+        mix2 = _mm_add_pi16(mix2, adder);
+
+        // --- second round begins here ---
+
+        // load & shuffle data so that input & mixbuffer data samples are paired
+        temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]);       // = i2l m2l i2r m2r
+        temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]);       // = i3l m3l i3r m3r
+
+        // temp = (temp .* mix) >> shifter
+        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
+        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
+        pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
+
+        // update mix += adder
+        mix1 = _mm_add_pi16(mix1, adder);
+        mix2 = _mm_add_pi16(mix2, adder);
+
+        pVinput  += 2;
+        pVMidBuf += 2;
+        pVdest   += 2;
+    }
+
+    _m_empty(); // clear MMS state
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of MMX optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+
+FIRFilterMMX::FIRFilterMMX() : FIRFilter()
+{
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+FIRFilterMMX::~FIRFilterMMX()
+{
+    delete[] filterCoeffsUnalign;
+}
+
+
+// (overloaded) Calculates filter coefficients for MMX routine
+void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
+{
+    uint i;
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new short[2 * newLength + 8];
+    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
+
+    // rearrange the filter coefficients for mmx routines 
+    for (i = 0;i < length; i += 4) 
+    {
+        filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
+        filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
+        filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
+        filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
+
+        filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
+        filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
+        filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
+        filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
+    }
+}
+
+
+
+// mmx-optimized version of the filter routine for stereo sound
+uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
+{
+    // Create stack copies of the needed member variables for asm routines :
+    uint i, j;
+    __m64 *pVdest = (__m64*)dest;
+
+    if (length < 2) return 0;
+
+    for (i = 0; i < (numSamples - length) / 2; i ++)
+    {
+        __m64 accu1;
+        __m64 accu2;
+        const __m64 *pVsrc = (const __m64*)src;
+        const __m64 *pVfilter = (const __m64*)filterCoeffsAlign;
+
+        accu1 = accu2 = _mm_setzero_si64();
+        for (j = 0; j < lengthDiv8 * 2; j ++)
+        {
+            __m64 temp1, temp2;
+
+            temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]);  // = l2 l0 r2 r0
+            temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]);  // = l3 l1 r3 r1
+
+            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0]));  // += l2*f2+l0*f0 r2*f2+r0*f0
+            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1]));  // += l3*f3+l1*f1 r3*f3+r1*f1
+
+            temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]);  // = l4 l2 r4 r2
+
+            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0]));  // += l3*f2+l1*f0 r3*f2+r1*f0
+            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1]));  // += l4*f3+l2*f1 r4*f3+r2*f1
+
+            // accu1 += l2*f2+l0*f0 r2*f2+r0*f0
+            //       += l3*f3+l1*f1 r3*f3+r1*f1
+
+            // accu2 += l3*f2+l1*f0 r3*f2+r1*f0
+            //          l4*f3+l2*f1 r4*f3+r2*f1
+
+            pVfilter += 2;
+            pVsrc += 2;
+        }
+        // accu >>= resultDivFactor
+        accu1 = _mm_srai_pi32(accu1, resultDivFactor);
+        accu2 = _mm_srai_pi32(accu2, resultDivFactor);
+
+        // pack 2*2*32bits => 4*16 bits
+        pVdest[0] = _mm_packs_pi32(accu1, accu2);
+        src += 4;
+        pVdest ++;
+    }
+
+   _m_empty();  // clear emms state
+
+    return (numSamples & 0xfffffffe) - length;
+}
+
+#endif  // SOUNDTOUCH_ALLOW_MMX
diff --git a/src/sse_optimized.cpp b/src/sse_optimized.cpp
new file mode 100755
index 0000000..c4b594f
--- /dev/null
+++ b/src/sse_optimized.cpp
@@ -0,0 +1,372 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE 
+/// optimized functions have been gathered into this single source 
+/// code file, regardless to their class or original source code file, in order 
+/// to ease porting the library to other compiler and processor platforms.
+///
+/// The SSE-optimizations are programmed using SSE compiler intrinsics that
+/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
+/// should compile with both toolsets.
+///
+/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
+/// 6.0 processor pack" update to support SSE instruction set. The update is 
+/// available for download at Microsoft Developers Network, see here:
+/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
+///
+/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
+/// perform a search with keywords "processor pack".
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2015-08-09 00:00:15 +0300 (su, 09 elo 2015) $
+// File revision : $Revision: 4 $
+//
+// $Id: sse_optimized.cpp 226 2015-08-08 21:00:15Z oparviai $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cpu_detect.h"
+#include "STTypes.h"
+
+using namespace soundtouch;
+
+#ifdef SOUNDTOUCH_ALLOW_SSE
+
+// SSE routines available only with float sample type    
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE optimized functions of class 'TDStretchSSE'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <xmmintrin.h>
+#include <math.h>
+
+// Calculates cross correlation of two buffers
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm)
+{
+    int i;
+    const float *pVec1;
+    const __m128 *pVec2;
+    __m128 vSum, vNorm;
+
+    // Note. It means a major slow-down if the routine needs to tolerate 
+    // unaligned __m128 memory accesses. It's way faster if we can skip 
+    // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
+    // This can mean up to ~ 10-fold difference (incl. part of which is
+    // due to skipping every second round for stereo sound though).
+    //
+    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
+    // for choosing if this little cheating is allowed.
+
+#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+    // Little cheating allowed, return valid correlation only for 
+    // aligned locations, meaning every second round for stereo sound.
+
+    #define _MM_LOAD    _mm_load_ps
+
+    if (((ulongptr)pV1) & 15) return -1e50;    // skip unaligned locations
+
+#else
+    // No cheating allowed, use unaligned load & take the resulting
+    // performance hit.
+    #define _MM_LOAD    _mm_loadu_ps
+#endif 
+
+    // ensure overlapLength is divisible by 8
+    assert((overlapLength % 8) == 0);
+
+    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+    // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
+    pVec1 = (const float*)pV1;
+    pVec2 = (const __m128*)pV2;
+    vSum = vNorm = _mm_setzero_ps();
+
+    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
+    // stereo & mono, for mono it just means twice the amount of unrolling.
+    for (i = 0; i < channels * overlapLength / 16; i ++) 
+    {
+        __m128 vTemp;
+        // vSum += pV1[0..3] * pV2[0..3]
+        vTemp = _MM_LOAD(pVec1);
+        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
+
+        // vSum += pV1[4..7] * pV2[4..7]
+        vTemp = _MM_LOAD(pVec1 + 4);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
+
+        // vSum += pV1[8..11] * pV2[8..11]
+        vTemp = _MM_LOAD(pVec1 + 8);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
+
+        // vSum += pV1[12..15] * pV2[12..15]
+        vTemp = _MM_LOAD(pVec1 + 12);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
+
+        pVec1 += 16;
+        pVec2 += 4;
+    }
+
+    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
+    float *pvNorm = (float*)&vNorm;
+    float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+    anorm = norm;
+
+    float *pvSum = (float*)&vSum;
+    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
+
+    /* This is approximately corresponding routine in C-language yet without normalization:
+    double corr, norm;
+    uint i;
+
+    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+    corr = norm = 0.0;
+    for (i = 0; i < channels * overlapLength / 16; i ++) 
+    {
+        corr += pV1[0] * pV2[0] +
+                pV1[1] * pV2[1] +
+                pV1[2] * pV2[2] +
+                pV1[3] * pV2[3] +
+                pV1[4] * pV2[4] +
+                pV1[5] * pV2[5] +
+                pV1[6] * pV2[6] +
+                pV1[7] * pV2[7] +
+                pV1[8] * pV2[8] +
+                pV1[9] * pV2[9] +
+                pV1[10] * pV2[10] +
+                pV1[11] * pV2[11] +
+                pV1[12] * pV2[12] +
+                pV1[13] * pV2[13] +
+                pV1[14] * pV2[14] +
+                pV1[15] * pV2[15];
+
+    for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j];
+
+        pV1 += 16;
+        pV2 += 16;
+    }
+    return corr / sqrt(norm);
+    */
+}
+
+
+
+double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm)
+{
+    // call usual calcCrossCorr function because SSE does not show big benefit of 
+    // accumulating "norm" value, and also the "norm" rolling algorithm would get 
+    // complicated due to SSE-specific alignment-vs-nonexact correlation rules.
+    return calcCrossCorr(pV1, pV2, norm);
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+FIRFilterSSE::FIRFilterSSE() : FIRFilter()
+{
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+FIRFilterSSE::~FIRFilterSSE()
+{
+    delete[] filterCoeffsUnalign;
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+// (overloaded) Calculates filter coefficients for SSE routine
+void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
+{
+    uint i;
+    float fDivider;
+
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
+    // also rearrange coefficients suitably for SSE
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new float[2 * newLength + 4];
+    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
+
+    fDivider = (float)resultDivider;
+
+    // rearrange the filter coefficients for mmx routines 
+    for (i = 0; i < newLength; i ++)
+    {
+        filterCoeffsAlign[2 * i + 0] =
+        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
+    }
+}
+
+
+
+// SSE-optimized version of the filter routine for stereo sound
+uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
+{
+    int count = (int)((numSamples - length) & (uint)-2);
+    int j;
+
+    assert(count % 2 == 0);
+
+    if (count < 2) return 0;
+
+    assert(source != NULL);
+    assert(dest != NULL);
+    assert((length % 8) == 0);
+    assert(filterCoeffsAlign != NULL);
+    assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
+
+    // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
+    #pragma omp parallel for
+    for (j = 0; j < count; j += 2)
+    {
+        const float *pSrc;
+        float *pDest;
+        const __m128 *pFil;
+        __m128 sum1, sum2;
+        uint i;
+
+        pSrc = (const float*)source + j * 2;      // source audio data
+        pDest = dest + j * 2;                     // destination audio data
+        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
+                                                  // are aligned to 16-byte boundary
+        sum1 = sum2 = _mm_setzero_ps();
+
+        for (i = 0; i < length / 8; i ++) 
+        {
+            // Unroll loop for efficiency & calculate filter for 2*2 stereo samples 
+            // at each pass
+
+            // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset
+            // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset.
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc)    , pFil[0]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) ,  pFil[2]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));
+
+            pSrc += 16;
+            pFil += 4;
+        }
+
+        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
+        // to sum the two hi- and lo-floats of these registers together.
+
+        // post-shuffle & add the filtered values and store to dest.
+        _mm_storeu_ps(pDest, _mm_add_ps(
+                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
+                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
+                    ));
+    }
+
+    // Ideas for further improvement:
+    // 1. If it could be guaranteed that 'source' were always aligned to 16-byte 
+    //    boundary, a faster aligned '_mm_load_ps' instruction could be used.
+    // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte 
+    //    boundary, a faster '_mm_store_ps' instruction could be used.
+
+    return (uint)count;
+
+    /* original routine in C-language. please notice the C-version has differently 
+       organized coefficients though.
+    double suml1, suml2;
+    double sumr1, sumr2;
+    uint i, j;
+
+    for (j = 0; j < count; j += 2)
+    {
+        const float *ptr;
+        const float *pFil;
+
+        suml1 = sumr1 = 0.0;
+        suml2 = sumr2 = 0.0;
+        ptr = src;
+        pFil = filterCoeffs;
+        for (i = 0; i < lengthLocal; i ++) 
+        {
+            // unroll loop for efficiency.
+
+            suml1 += ptr[0] * pFil[0] + 
+                     ptr[2] * pFil[2] +
+                     ptr[4] * pFil[4] +
+                     ptr[6] * pFil[6];
+
+            sumr1 += ptr[1] * pFil[1] + 
+                     ptr[3] * pFil[3] +
+                     ptr[5] * pFil[5] +
+                     ptr[7] * pFil[7];
+
+            suml2 += ptr[8] * pFil[0] + 
+                     ptr[10] * pFil[2] +
+                     ptr[12] * pFil[4] +
+                     ptr[14] * pFil[6];
+
+            sumr2 += ptr[9] * pFil[1] + 
+                     ptr[11] * pFil[3] +
+                     ptr[13] * pFil[5] +
+                     ptr[15] * pFil[7];
+
+            ptr += 16;
+            pFil += 8;
+        }
+        dest[0] = (float)suml1;
+        dest[1] = (float)sumr1;
+        dest[2] = (float)suml2;
+        dest[3] = (float)sumr2;
+
+        src += 4;
+        dest += 4;
+    }
+    */
+}
+
+#endif  // SOUNDTOUCH_ALLOW_SSE