From 019d2089bbadf70d73ba85aa8ea51490b071262c Mon Sep 17 00:00:00 2001 From: MerryMage Date: Thu, 17 Aug 2017 11:09:20 +0100 Subject: [PATCH] Stripped down version of SoundTouch 2.0.0 --- .gitignore | 1 + CMakeLists.txt | 26 ++- include/BPMDetect.h | 158 ++++++++++++++ include/FIFOSampleBuffer.h | 2 +- include/FIFOSamplePipe.h | 2 +- include/STTypes.h | 18 +- include/SoundTouch.h | 86 ++++++-- include/soundtouch_config.h | 5 + src/AAFilter.cpp | 6 +- src/AAFilter.h | 2 +- src/BPMDetect.cpp | 318 ++++++++++++++++++++++++++++ src/FIFOSampleBuffer.cpp | 2 +- src/FIRFilter.cpp | 12 +- src/FIRFilter.h | 2 +- src/InterpolateCubic.cpp | 200 ++++++++++++++++++ src/InterpolateCubic.h | 67 ++++++ src/InterpolateLinear.cpp | 0 src/InterpolateLinear.h | 0 src/InterpolateShannon.cpp | 185 +++++++++++++++++ src/InterpolateShannon.h | 72 +++++++ src/PeakFinder.cpp | 286 ++++++++++++++++++++++++++ src/PeakFinder.h | 97 +++++++++ src/RateTransposer.cpp | 13 +- src/RateTransposer.h | 7 +- src/SoundTouch.cpp | 116 ++++++++--- src/TDStretch.cpp | 104 +++++++--- src/TDStretch.h | 12 +- src/cpu_detect.h | 2 +- src/cpu_detect_x86.cpp | 2 +- src/mmx_optimized.cpp | 400 ++++++++++++++++++++++++++++++++++++ src/sse_optimized.cpp | 372 +++++++++++++++++++++++++++++++++ 31 files changed, 2456 insertions(+), 119 deletions(-) create mode 100644 .gitignore create mode 100755 include/BPMDetect.h mode change 100644 => 100755 include/FIFOSampleBuffer.h mode change 100644 => 100755 include/FIFOSamplePipe.h mode change 100644 => 100755 include/STTypes.h mode change 100644 => 100755 include/SoundTouch.h create mode 100644 include/soundtouch_config.h mode change 100644 => 100755 src/AAFilter.cpp mode change 100644 => 100755 src/AAFilter.h create mode 100755 src/BPMDetect.cpp mode change 100644 => 100755 src/FIFOSampleBuffer.cpp mode change 100644 => 100755 src/FIRFilter.cpp mode change 100644 => 100755 src/FIRFilter.h create mode 100755 src/InterpolateCubic.cpp create mode 100755 src/InterpolateCubic.h mode change 100644 => 100755 src/InterpolateLinear.cpp mode change 100644 => 100755 src/InterpolateLinear.h create mode 100755 src/InterpolateShannon.cpp create mode 100755 src/InterpolateShannon.h create mode 100755 src/PeakFinder.cpp create mode 100755 src/PeakFinder.h mode change 100644 => 100755 src/RateTransposer.cpp mode change 100644 => 100755 src/RateTransposer.h mode change 100644 => 100755 src/SoundTouch.cpp mode change 100644 => 100755 src/TDStretch.cpp mode change 100644 => 100755 src/TDStretch.h mode change 100644 => 100755 src/cpu_detect.h mode change 100644 => 100755 src/cpu_detect_x86.cpp create mode 100755 src/mmx_optimized.cpp create mode 100755 src/sse_optimized.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..496ee2c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 30cc4e9..17379f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,18 @@ set(SRCS - src/AAFilter.cpp - src/cpu_detect_x86.cpp - src/FIFOSampleBuffer.cpp - src/FIRFilter.cpp - src/InterpolateLinear.cpp - src/RateTransposer.cpp - src/SoundTouch.cpp - src/TDStretch.cpp - ) - -include_directories(src include) + src/AAFilter.cpp + src/BPMDetect.cpp + src/cpu_detect_x86.cpp + src/FIFOSampleBuffer.cpp + src/FIRFilter.cpp + src/InterpolateCubic.cpp + src/InterpolateLinear.cpp + src/InterpolateShannon.cpp + src/mmx_optimized.cpp + src/PeakFinder.cpp + src/RateTransposer.cpp + src/SoundTouch.cpp + src/sse_optimized.cpp + src/TDStretch.cpp) add_library(SoundTouch STATIC ${SRCS}) +target_include_directories(SoundTouch PUBLIC include PRIVATE src) diff --git a/include/BPMDetect.h b/include/BPMDetect.h new file mode 100755 index 0000000..a452ed1 --- /dev/null +++ b/include/BPMDetect.h @@ -0,0 +1,158 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Beats-per-minute (BPM) detection routine. +/// +/// The beat detection algorithm works as follows: +/// - Use function 'inputSamples' to input a chunks of samples to the class for +/// analysis. It's a good idea to enter a large sound file or stream in smallish +/// chunks of around few kilosamples in order not to extinguish too much RAM memory. +/// - Input sound data is decimated to approx 500 Hz to reduce calculation burden, +/// which is basically ok as low (bass) frequencies mostly determine the beat rate. +/// Simple averaging is used for anti-alias filtering because the resulting signal +/// quality isn't of that high importance. +/// - Decimated sound data is enveloped, i.e. the amplitude shape is detected by +/// taking absolute value that's smoothed by sliding average. Signal levels that +/// are below a couple of times the general RMS amplitude level are cut away to +/// leave only notable peaks there. +/// - Repeating sound patterns (e.g. beats) are detected by calculating short-term +/// autocorrelation function of the enveloped signal. +/// - After whole sound data file has been analyzed as above, the bpm level is +/// detected by function 'getBpm' that finds the highest peak of the autocorrelation +/// function, calculates it's precise location and converts this reading to bpm's. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2016-01-12 19:24:46 +0200 (ti, 12 tammi 2016) $ +// File revision : $Revision: 4 $ +// +// $Id: BPMDetect.h 239 2016-01-12 17:24:46Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _BPMDetect_H_ +#define _BPMDetect_H_ + +#include "STTypes.h" +#include "FIFOSampleBuffer.h" + +namespace soundtouch +{ + +/// Minimum allowed BPM rate. Used to restrict accepted result above a reasonable limit. +#define MIN_BPM 29 + +/// Maximum allowed BPM rate. Used to restrict accepted result below a reasonable limit. +#define MAX_BPM 200 + + +/// Class for calculating BPM rate for audio data. +class BPMDetect +{ +protected: + /// Auto-correlation accumulator bins. + float *xcorr; + + /// Sample average counter. + int decimateCount; + + /// Sample average accumulator for FIFO-like decimation. + soundtouch::LONG_SAMPLETYPE decimateSum; + + /// Decimate sound by this coefficient to reach approx. 500 Hz. + int decimateBy; + + /// Auto-correlation window length + int windowLen; + + /// Number of channels (1 = mono, 2 = stereo) + int channels; + + /// sample rate + int sampleRate; + + /// Beginning of auto-correlation window: Autocorrelation isn't being updated for + /// the first these many correlation bins. + int windowStart; + + /// FIFO-buffer for decimated processing samples. + soundtouch::FIFOSampleBuffer *buffer; + + /// Updates auto-correlation function for given number of decimated samples that + /// are read from the internal 'buffer' pipe (samples aren't removed from the pipe + /// though). + void updateXCorr(int process_samples /// How many samples are processed. + ); + + /// Decimates samples to approx. 500 Hz. + /// + /// \return Number of output samples. + int decimate(soundtouch::SAMPLETYPE *dest, ///< Destination buffer + const soundtouch::SAMPLETYPE *src, ///< Source sample buffer + int numsamples ///< Number of source samples. + ); + + /// Calculates amplitude envelope for the buffer of samples. + /// Result is output to 'samples'. + void calcEnvelope(soundtouch::SAMPLETYPE *samples, ///< Pointer to input/output data buffer + int numsamples ///< Number of samples in buffer + ); + + /// remove constant bias from xcorr data + void removeBias(); + +public: + /// Constructor. + BPMDetect(int numChannels, ///< Number of channels in sample data. + int sampleRate ///< Sample rate in Hz. + ); + + /// Destructor. + virtual ~BPMDetect(); + + /// Inputs a block of samples for analyzing: Envelopes the samples and then + /// updates the autocorrelation estimation. When whole song data has been input + /// in smaller blocks using this function, read the resulting bpm with 'getBpm' + /// function. + /// + /// Notice that data in 'samples' array can be disrupted in processing. + void inputSamples(const soundtouch::SAMPLETYPE *samples, ///< Pointer to input/working data buffer + int numSamples ///< Number of samples in buffer + ); + + + /// Analyzes the results and returns the BPM rate. Use this function to read result + /// after whole song data has been input to the class by consecutive calls of + /// 'inputSamples' function. + /// + /// \return Beats-per-minute rate, or zero if detection failed. + float getBpm(); +}; + +} + +#endif // _BPMDetect_H_ diff --git a/include/FIFOSampleBuffer.h b/include/FIFOSampleBuffer.h old mode 100644 new mode 100755 index 4b22a51..691ff1a --- a/include/FIFOSampleBuffer.h +++ b/include/FIFOSampleBuffer.h @@ -15,7 +15,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2014-01-05 23:40:22 +0200 (Sun, 05 Jan 2014) $ +// Last changed : $Date: 2014-01-05 23:40:22 +0200 (su, 05 tammi 2014) $ // File revision : $Revision: 4 $ // // $Id: FIFOSampleBuffer.h 177 2014-01-05 21:40:22Z oparviai $ diff --git a/include/FIFOSamplePipe.h b/include/FIFOSamplePipe.h old mode 100644 new mode 100755 index f26c57b..56af60c --- a/include/FIFOSamplePipe.h +++ b/include/FIFOSamplePipe.h @@ -17,7 +17,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2012-06-13 22:29:53 +0300 (Wed, 13 Jun 2012) $ +// Last changed : $Date: 2012-06-13 22:29:53 +0300 (ke, 13 kesä 2012) $ // File revision : $Revision: 4 $ // // $Id: FIFOSamplePipe.h 143 2012-06-13 19:29:53Z oparviai $ diff --git a/include/STTypes.h b/include/STTypes.h old mode 100644 new mode 100755 index b257974..337e11c --- a/include/STTypes.h +++ b/include/STTypes.h @@ -8,10 +8,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-05-18 18:25:07 +0300 (Mon, 18 May 2015) $ +// Last changed : $Date: 2017-07-30 12:28:06 +0300 (su, 30 heinä 2017) $ // File revision : $Revision: 3 $ // -// $Id: STTypes.h 215 2015-05-18 15:25:07Z oparviai $ +// $Id: STTypes.h 252 2017-07-30 09:28:06Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -53,6 +53,8 @@ typedef unsigned long ulong; // Helper macro for aligning pointer up to next 16-byte boundary #define SOUNDTOUCH_ALIGN_POINTER_16(x) ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 ) +#include "soundtouch_config.h" + namespace soundtouch { /// Activate these undef's to overrule the possible sampletype @@ -90,8 +92,8 @@ namespace soundtouch /// However, if you still prefer to select the sample format here /// also in GNU environment, then please #undef the INTEGER_SAMPLE /// and FLOAT_SAMPLE defines first as in comments above. - #define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples - //#define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples + //#define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples + #define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples #endif @@ -102,7 +104,7 @@ namespace soundtouch /// routines compiled for whatever reason, you may disable these optimizations /// to make the library compile. - //#define SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS 1 + #define SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS 1 /// In GNU environment, allow the user to override this setting by /// giving the following switch to the configure script: @@ -135,8 +137,10 @@ namespace soundtouch #endif // SOUNDTOUCH_FLOAT_SAMPLES #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS - // Allow MMX optimizations - #define SOUNDTOUCH_ALLOW_MMX 1 + // Allow MMX optimizations (not available in X64 mode) + #if (!_M_X64) + #define SOUNDTOUCH_ALLOW_MMX 1 + #endif #endif #else diff --git a/include/SoundTouch.h b/include/SoundTouch.h old mode 100644 new mode 100755 index b03855e..c97b393 --- a/include/SoundTouch.h +++ b/include/SoundTouch.h @@ -41,10 +41,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-09-20 10:38:32 +0300 (Sun, 20 Sep 2015) $ +// Last changed : $Date: 2017-07-30 12:35:00 +0300 (su, 30 heinä 2017) $ // File revision : $Revision: 4 $ // -// $Id: SoundTouch.h 230 2015-09-20 07:38:32Z oparviai $ +// $Id: SoundTouch.h 253 2017-07-30 09:35:00Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -79,10 +79,10 @@ namespace soundtouch { /// Soundtouch library version string -#define SOUNDTOUCH_VERSION "1.9.2" +#define SOUNDTOUCH_VERSION "2.0.0" /// SoundTouch library version id -#define SOUNDTOUCH_VERSION_ID (10902) +#define SOUNDTOUCH_VERSION_ID (20000) // // Available setting IDs for the 'setSetting' & 'get_setting' functions: @@ -116,30 +116,61 @@ namespace soundtouch #define SETTING_OVERLAP_MS 5 -/// Call "getSetting" with this ID to query nominal average processing sequence -/// size in samples. This value tells approcimate value how many input samples -/// SoundTouch needs to gather before it does DSP processing run for the sample batch. +/// Call "getSetting" with this ID to query processing sequence size in samples. +/// This value gives approximate value of how many input samples you'll need to +/// feed into SoundTouch after initial buffering to get out a new batch of +/// output samples. +/// +/// This value does not include initial buffering at beginning of a new processing +/// stream, use SETTING_INITIAL_LATENCY to get the initial buffering size. /// /// Notices: /// - This is read-only parameter, i.e. setSetting ignores this parameter -/// - Returned value is approximate average value, exact processing batch -/// size may wary from time to time -/// - This parameter value is not constant but may change depending on +/// - This parameter value is not constant but change depending on /// tempo/pitch/rate/samplerate settings. -#define SETTING_NOMINAL_INPUT_SEQUENCE 6 +#define SETTING_NOMINAL_INPUT_SEQUENCE 6 /// Call "getSetting" with this ID to query nominal average processing output /// size in samples. This value tells approcimate value how many output samples /// SoundTouch outputs once it does DSP processing run for a batch of input samples. -/// +/// /// Notices: /// - This is read-only parameter, i.e. setSetting ignores this parameter -/// - Returned value is approximate average value, exact processing batch -/// size may wary from time to time -/// - This parameter value is not constant but may change depending on +/// - This parameter value is not constant but change depending on /// tempo/pitch/rate/samplerate settings. -#define SETTING_NOMINAL_OUTPUT_SEQUENCE 7 +#define SETTING_NOMINAL_OUTPUT_SEQUENCE 7 + + +/// Call "getSetting" with this ID to query initial processing latency, i.e. +/// approx. how many samples you'll need to enter to SoundTouch pipeline before +/// you can expect to get first batch of ready output samples out. +/// +/// After the first output batch, you can then expect to get approx. +/// SETTING_NOMINAL_OUTPUT_SEQUENCE ready samples out for every +/// SETTING_NOMINAL_INPUT_SEQUENCE samples that you enter into SoundTouch. +/// +/// Example: +/// processing with parameter -tempo=5 +/// => initial latency = 5509 samples +/// input sequence = 4167 samples +/// output sequence = 3969 samples +/// +/// Accordingly, you can expect to feed in approx. 5509 samples at beginning of +/// the stream, and then you'll get out the first 3969 samples. After that, for +/// every approx. 4167 samples that you'll put in, you'll receive again approx. +/// 3969 samples out. +/// +/// This also means that average latency during stream processing is +/// INITIAL_LATENCY-OUTPUT_SEQUENCE/2, in the above example case 5509-3969/2 +/// = 3524 samples +/// +/// Notices: +/// - This is read-only parameter, i.e. setSetting ignores this parameter +/// - This parameter value is not constant but change depending on +/// tempo/pitch/rate/samplerate settings. +#define SETTING_INITIAL_LATENCY 8 + class SoundTouch : public FIFOProcessor { @@ -228,6 +259,24 @@ public: /// Sets sample rate. void setSampleRate(uint srate); + /// Get ratio between input and output audio durations, useful for calculating + /// processed output duration: if you'll process a stream of N samples, then + /// you can expect to get out N * getInputOutputSampleRatio() samples. + /// + /// This ratio will give accurate target duration ratio for a full audio track, + /// given that the the whole track is processed with same processing parameters. + /// + /// If this ratio is applied to calculate intermediate offsets inside a processing + /// stream, then this ratio is approximate and can deviate +- some tens of milliseconds + /// from ideal offset, yet by end of the audio stream the duration ratio will become + /// exact. + /// + /// Example: if processing with parameters "-tempo=15 -pitch=-3", the function + /// will return value 0.8695652... Now, if processing an audio stream whose duration + /// is exactly one million audio samples, then you can expect the processed + /// output duration be 0.869565 * 1000000 = 869565 samples. + double getInputOutputSampleRatio(); + /// Flushes the last samples from the processing pipeline to the output. /// Clears also the internal processing buffers. // @@ -286,6 +335,11 @@ public: /// Returns number of samples currently unprocessed. virtual uint numUnprocessedSamples() const; + /// Return number of channels + uint numChannels() const + { + return channels; + } /// Other handy functions that are implemented in the ancestor classes (see /// classes 'FIFOProcessor' and 'FIFOSamplePipe') diff --git a/include/soundtouch_config.h b/include/soundtouch_config.h new file mode 100644 index 0000000..2e65377 --- /dev/null +++ b/include/soundtouch_config.h @@ -0,0 +1,5 @@ +/* Use Float as Sample type */ +#undef SOUNDTOUCH_FLOAT_SAMPLES + +/* Use Integer as Sample type */ +#define SOUNDTOUCH_INTEGER_SAMPLES 1 diff --git a/src/AAFilter.cpp b/src/AAFilter.cpp old mode 100644 new mode 100755 index a254300..66114ab --- a/src/AAFilter.cpp +++ b/src/AAFilter.cpp @@ -12,10 +12,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2014-01-05 23:40:22 +0200 (Sun, 05 Jan 2014) $ +// Last changed : $Date: 2016-01-12 19:26:21 +0200 (ti, 12 tammi 2016) $ // File revision : $Revision: 4 $ // -// $Id: AAFilter.cpp 177 2014-01-05 21:40:22Z oparviai $ +// $Id: AAFilter.cpp 240 2016-01-12 17:26:21Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -49,7 +49,7 @@ using namespace soundtouch; -#define PI 3.141592655357989 +#define PI 3.14159265358979323846 #define TWOPI (2 * PI) // define this to save AA filter coefficients to a file diff --git a/src/AAFilter.h b/src/AAFilter.h old mode 100644 new mode 100755 index 8c34b6b..fc06481 --- a/src/AAFilter.h +++ b/src/AAFilter.h @@ -13,7 +13,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2014-01-07 21:41:23 +0200 (Tue, 07 Jan 2014) $ +// Last changed : $Date: 2014-01-07 21:41:23 +0200 (ti, 07 tammi 2014) $ // File revision : $Revision: 4 $ // // $Id: AAFilter.h 187 2014-01-07 19:41:23Z oparviai $ diff --git a/src/BPMDetect.cpp b/src/BPMDetect.cpp new file mode 100755 index 0000000..66a7d05 --- /dev/null +++ b/src/BPMDetect.cpp @@ -0,0 +1,318 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Beats-per-minute (BPM) detection routine. +/// +/// The beat detection algorithm works as follows: +/// - Use function 'inputSamples' to input a chunks of samples to the class for +/// analysis. It's a good idea to enter a large sound file or stream in smallish +/// chunks of around few kilosamples in order not to extinguish too much RAM memory. +/// - Inputted sound data is decimated to approx 500 Hz to reduce calculation burden, +/// which is basically ok as low (bass) frequencies mostly determine the beat rate. +/// Simple averaging is used for anti-alias filtering because the resulting signal +/// quality isn't of that high importance. +/// - Decimated sound data is enveloped, i.e. the amplitude shape is detected by +/// taking absolute value that's smoothed by sliding average. Signal levels that +/// are below a couple of times the general RMS amplitude level are cut away to +/// leave only notable peaks there. +/// - Repeating sound patterns (e.g. beats) are detected by calculating short-term +/// autocorrelation function of the enveloped signal. +/// - After whole sound data file has been analyzed as above, the bpm level is +/// detected by function 'getBpm' that finds the highest peak of the autocorrelation +/// function, calculates it's precise location and converts this reading to bpm's. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2016-01-05 22:59:57 +0200 (ti, 05 tammi 2016) $ +// File revision : $Revision: 4 $ +// +// $Id: BPMDetect.cpp 237 2016-01-05 20:59:57Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include "FIFOSampleBuffer.h" +#include "PeakFinder.h" +#include "BPMDetect.h" + +using namespace soundtouch; + +#define INPUT_BLOCK_SAMPLES 2048 +#define DECIMATED_BLOCK_SAMPLES 256 + +/// Target sample rate after decimation +const int target_srate = 1000; + +/// XCorr update sequence size, update in about 200msec chunks +const int xcorr_update_sequence = 200; + +/// XCorr decay time constant, decay to half in 30 seconds +/// If it's desired to have the system adapt quicker to beat rate +/// changes within a continuing music stream, then the +/// 'xcorr_decay_time_constant' value can be reduced, yet that +/// can increase possibility of glitches in bpm detection. +const double xcorr_decay_time_constant = 30.0; + +//////////////////////////////////////////////////////////////////////////////// + +// Enable following define to create bpm analysis file: + +// #define _CREATE_BPM_DEBUG_FILE + +#ifdef _CREATE_BPM_DEBUG_FILE + + #define DEBUGFILE_NAME "c:\\temp\\soundtouch-bpm-debug.txt" + + static void _SaveDebugData(const float *data, int minpos, int maxpos, double coeff) + { + FILE *fptr = fopen(DEBUGFILE_NAME, "wt"); + int i; + + if (fptr) + { + printf("\n\nWriting BPM debug data into file " DEBUGFILE_NAME "\n\n"); + for (i = minpos; i < maxpos; i ++) + { + fprintf(fptr, "%d\t%.1lf\t%f\n", i, coeff / (double)i, data[i]); + } + fclose(fptr); + } + } +#else + #define _SaveDebugData(a,b,c,d) +#endif + +//////////////////////////////////////////////////////////////////////////////// + + +BPMDetect::BPMDetect(int numChannels, int aSampleRate) +{ + this->sampleRate = aSampleRate; + this->channels = numChannels; + + decimateSum = 0; + decimateCount = 0; + + // choose decimation factor so that result is approx. 1000 Hz + decimateBy = sampleRate / target_srate; + assert(decimateBy > 0); + assert(INPUT_BLOCK_SAMPLES < decimateBy * DECIMATED_BLOCK_SAMPLES); + + // Calculate window length & starting item according to desired min & max bpms + windowLen = (60 * sampleRate) / (decimateBy * MIN_BPM); + windowStart = (60 * sampleRate) / (decimateBy * MAX_BPM); + + assert(windowLen > windowStart); + + // allocate new working objects + xcorr = new float[windowLen]; + memset(xcorr, 0, windowLen * sizeof(float)); + + // allocate processing buffer + buffer = new FIFOSampleBuffer(); + // we do processing in mono mode + buffer->setChannels(1); + buffer->clear(); +} + + + +BPMDetect::~BPMDetect() +{ + delete[] xcorr; + delete buffer; +} + + + +/// convert to mono, low-pass filter & decimate to about 500 Hz. +/// return number of outputted samples. +/// +/// Decimation is used to remove the unnecessary frequencies and thus to reduce +/// the amount of data needed to be processed as calculating autocorrelation +/// function is a very-very heavy operation. +/// +/// Anti-alias filtering is done simply by averaging the samples. This is really a +/// poor-man's anti-alias filtering, but it's not so critical in this kind of application +/// (it'd also be difficult to design a high-quality filter with steep cut-off at very +/// narrow band) +int BPMDetect::decimate(SAMPLETYPE *dest, const SAMPLETYPE *src, int numsamples) +{ + int count, outcount; + LONG_SAMPLETYPE out; + + assert(channels > 0); + assert(decimateBy > 0); + outcount = 0; + for (count = 0; count < numsamples; count ++) + { + int j; + + // convert to mono and accumulate + for (j = 0; j < channels; j ++) + { + decimateSum += src[j]; + } + src += j; + + decimateCount ++; + if (decimateCount >= decimateBy) + { + // Store every Nth sample only + out = (LONG_SAMPLETYPE)(decimateSum / (decimateBy * channels)); + decimateSum = 0; + decimateCount = 0; +#ifdef SOUNDTOUCH_INTEGER_SAMPLES + // check ranges for sure (shouldn't actually be necessary) + if (out > 32767) + { + out = 32767; + } + else if (out < -32768) + { + out = -32768; + } +#endif // SOUNDTOUCH_INTEGER_SAMPLES + dest[outcount] = (SAMPLETYPE)out; + outcount ++; + } + } + return outcount; +} + + + +// Calculates autocorrelation function of the sample history buffer +void BPMDetect::updateXCorr(int process_samples) +{ + int offs; + SAMPLETYPE *pBuffer; + + assert(buffer->numSamples() >= (uint)(process_samples + windowLen)); + + pBuffer = buffer->ptrBegin(); + + // calculate decay factor for xcorr filtering + float xcorr_decay = (float)pow(0.5, 1.0 / (xcorr_decay_time_constant * target_srate / process_samples)); + + #pragma omp parallel for + for (offs = windowStart; offs < windowLen; offs ++) + { + LONG_SAMPLETYPE sum; + int i; + + sum = 0; + for (i = 0; i < process_samples; i ++) + { + sum += pBuffer[i] * pBuffer[i + offs]; // scaling the sub-result shouldn't be necessary + } + xcorr[offs] *= xcorr_decay; // decay 'xcorr' here with suitable time constant. + + xcorr[offs] += (float)fabs(sum); + } +} + + + +void BPMDetect::inputSamples(const SAMPLETYPE *samples, int numSamples) +{ + SAMPLETYPE decimated[DECIMATED_BLOCK_SAMPLES]; + + // iterate so that max INPUT_BLOCK_SAMPLES processed per iteration + while (numSamples > 0) + { + int block; + int decSamples; + + block = (numSamples > INPUT_BLOCK_SAMPLES) ? INPUT_BLOCK_SAMPLES : numSamples; + + // decimate. note that converts to mono at the same time + decSamples = decimate(decimated, samples, block); + samples += block * channels; + numSamples -= block; + + buffer->putSamples(decimated, decSamples); + } + + // when the buffer has enought samples for processing... + while ((int)buffer->numSamples() >= windowLen + xcorr_update_sequence) + { + // ... calculate autocorrelations for oldest samples... + updateXCorr(xcorr_update_sequence); + // ... and remove these from the buffer + buffer->receiveSamples(xcorr_update_sequence); + } +} + + + +void BPMDetect::removeBias() +{ + int i; + float minval = 1e12f; // arbitrary large number + + for (i = windowStart; i < windowLen; i ++) + { + if (xcorr[i] < minval) + { + minval = xcorr[i]; + } + } + + for (i = windowStart; i < windowLen; i ++) + { + xcorr[i] -= minval; + } +} + + +float BPMDetect::getBpm() +{ + double peakPos; + double coeff; + PeakFinder peakFinder; + + coeff = 60.0 * ((double)sampleRate / (double)decimateBy); + + // save bpm debug analysis data if debug data enabled + _SaveDebugData(xcorr, windowStart, windowLen, coeff); + + // remove bias from xcorr data + removeBias(); + + // find peak position + peakPos = peakFinder.detectPeak(xcorr, windowStart, windowLen); + + assert(decimateBy != 0); + if (peakPos < 1e-9) return 0.0; // detection failed. + + // calculate BPM + return (float) (coeff / peakPos); +} diff --git a/src/FIFOSampleBuffer.cpp b/src/FIFOSampleBuffer.cpp old mode 100644 new mode 100755 index 4d9740a..4f9b9a7 --- a/src/FIFOSampleBuffer.cpp +++ b/src/FIFOSampleBuffer.cpp @@ -15,7 +15,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2012-11-08 20:53:01 +0200 (Thu, 08 Nov 2012) $ +// Last changed : $Date: 2012-11-08 20:53:01 +0200 (to, 08 marras 2012) $ // File revision : $Revision: 4 $ // // $Id: FIFOSampleBuffer.cpp 160 2012-11-08 18:53:01Z oparviai $ diff --git a/src/FIRFilter.cpp b/src/FIRFilter.cpp old mode 100644 new mode 100755 index 456418a..41ebc78 --- a/src/FIRFilter.cpp +++ b/src/FIRFilter.cpp @@ -2,19 +2,25 @@ /// /// General FIR digital filter routines with MMX optimization. /// -/// Note : MMX optimized functions reside in a separate, platform-specific file, +/// Notes : MMX optimized functions reside in a separate, platform-specific file, /// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp' /// +/// This source file contains OpenMP optimizations that allow speeding up the +/// corss-correlation algorithm by executing it in several threads / CPU cores +/// in parallel. See the following article link for more detailed discussion +/// about SoundTouch OpenMP optimizations: +/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices +/// /// Author : Copyright (c) Olli Parviainen /// Author e-mail : oparviai 'at' iki.fi /// SoundTouch WWW: http://www.surina.net/soundtouch /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-02-21 23:24:29 +0200 (Sat, 21 Feb 2015) $ +// Last changed : $Date: 2015-11-05 19:46:08 +0200 (to, 05 marras 2015) $ // File revision : $Revision: 4 $ // -// $Id: FIRFilter.cpp 202 2015-02-21 21:24:29Z oparviai $ +// $Id: FIRFilter.cpp 234 2015-11-05 17:46:08Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // diff --git a/src/FIRFilter.h b/src/FIRFilter.h old mode 100644 new mode 100755 index 158cdd2..3a27d0a --- a/src/FIRFilter.h +++ b/src/FIRFilter.h @@ -11,7 +11,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-02-21 23:24:29 +0200 (Sat, 21 Feb 2015) $ +// Last changed : $Date: 2015-02-21 23:24:29 +0200 (la, 21 helmi 2015) $ // File revision : $Revision: 4 $ // // $Id: FIRFilter.h 202 2015-02-21 21:24:29Z oparviai $ diff --git a/src/InterpolateCubic.cpp b/src/InterpolateCubic.cpp new file mode 100755 index 0000000..8aa7374 --- /dev/null +++ b/src/InterpolateCubic.cpp @@ -0,0 +1,200 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Cubic interpolation routine. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// $Id: InterpolateCubic.cpp 179 2014-01-06 18:41:42Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include "InterpolateCubic.h" +#include "STTypes.h" + +using namespace soundtouch; + +// cubic interpolation coefficients +static const float _coeffs[]= +{ -0.5f, 1.0f, -0.5f, 0.0f, + 1.5f, -2.5f, 0.0f, 1.0f, + -1.5f, 2.0f, 0.5f, 0.0f, + 0.5f, -0.5f, 0.0f, 0.0f}; + + +InterpolateCubic::InterpolateCubic() +{ + fract = 0; +} + + +void InterpolateCubic::resetRegisters() +{ + fract = 0; +} + + +/// Transpose mono audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateCubic::transposeMono(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + int i; + int srcSampleEnd = srcSamples - 4; + int srcCount = 0; + + i = 0; + while (srcCount < srcSampleEnd) + { + float out; + const float x3 = 1.0f; + const float x2 = (float)fract; // x + const float x1 = x2*x2; // x^2 + const float x0 = x1*x2; // x^3 + float y0, y1, y2, y3; + + assert(fract < 1.0); + + y0 = _coeffs[0] * x0 + _coeffs[1] * x1 + _coeffs[2] * x2 + _coeffs[3] * x3; + y1 = _coeffs[4] * x0 + _coeffs[5] * x1 + _coeffs[6] * x2 + _coeffs[7] * x3; + y2 = _coeffs[8] * x0 + _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3; + y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3; + + out = y0 * psrc[0] + y1 * psrc[1] + y2 * psrc[2] + y3 * psrc[3]; + + pdest[i] = (SAMPLETYPE)out; + i ++; + + // update position fraction + fract += rate; + // update whole positions + int whole = (int)fract; + fract -= whole; + psrc += whole; + srcCount += whole; + } + srcSamples = srcCount; + return i; +} + + +/// Transpose stereo audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateCubic::transposeStereo(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + int i; + int srcSampleEnd = srcSamples - 4; + int srcCount = 0; + + i = 0; + while (srcCount < srcSampleEnd) + { + const float x3 = 1.0f; + const float x2 = (float)fract; // x + const float x1 = x2*x2; // x^2 + const float x0 = x1*x2; // x^3 + float y0, y1, y2, y3; + float out0, out1; + + assert(fract < 1.0); + + y0 = _coeffs[0] * x0 + _coeffs[1] * x1 + _coeffs[2] * x2 + _coeffs[3] * x3; + y1 = _coeffs[4] * x0 + _coeffs[5] * x1 + _coeffs[6] * x2 + _coeffs[7] * x3; + y2 = _coeffs[8] * x0 + _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3; + y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3; + + out0 = y0 * psrc[0] + y1 * psrc[2] + y2 * psrc[4] + y3 * psrc[6]; + out1 = y0 * psrc[1] + y1 * psrc[3] + y2 * psrc[5] + y3 * psrc[7]; + + pdest[2*i] = (SAMPLETYPE)out0; + pdest[2*i+1] = (SAMPLETYPE)out1; + i ++; + + // update position fraction + fract += rate; + // update whole positions + int whole = (int)fract; + fract -= whole; + psrc += 2*whole; + srcCount += whole; + } + srcSamples = srcCount; + return i; +} + + +/// Transpose multi-channel audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateCubic::transposeMulti(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + int i; + int srcSampleEnd = srcSamples - 4; + int srcCount = 0; + + i = 0; + while (srcCount < srcSampleEnd) + { + const float x3 = 1.0f; + const float x2 = (float)fract; // x + const float x1 = x2*x2; // x^2 + const float x0 = x1*x2; // x^3 + float y0, y1, y2, y3; + + assert(fract < 1.0); + + y0 = _coeffs[0] * x0 + _coeffs[1] * x1 + _coeffs[2] * x2 + _coeffs[3] * x3; + y1 = _coeffs[4] * x0 + _coeffs[5] * x1 + _coeffs[6] * x2 + _coeffs[7] * x3; + y2 = _coeffs[8] * x0 + _coeffs[9] * x1 + _coeffs[10] * x2 + _coeffs[11] * x3; + y3 = _coeffs[12] * x0 + _coeffs[13] * x1 + _coeffs[14] * x2 + _coeffs[15] * x3; + + for (int c = 0; c < numChannels; c ++) + { + float out; + out = y0 * psrc[c] + y1 * psrc[c + numChannels] + y2 * psrc[c + 2 * numChannels] + y3 * psrc[c + 3 * numChannels]; + pdest[0] = (SAMPLETYPE)out; + pdest ++; + } + i ++; + + // update position fraction + fract += rate; + // update whole positions + int whole = (int)fract; + fract -= whole; + psrc += numChannels*whole; + srcCount += whole; + } + srcSamples = srcCount; + return i; +} diff --git a/src/InterpolateCubic.h b/src/InterpolateCubic.h new file mode 100755 index 0000000..f98e701 --- /dev/null +++ b/src/InterpolateCubic.h @@ -0,0 +1,67 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Cubic interpolation routine. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// $Id: InterpolateCubic.h 225 2015-07-26 14:45:48Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _InterpolateCubic_H_ +#define _InterpolateCubic_H_ + +#include "RateTransposer.h" +#include "STTypes.h" + +namespace soundtouch +{ + +class InterpolateCubic : public TransposerBase +{ +protected: + virtual void resetRegisters(); + virtual int transposeMono(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + virtual int transposeStereo(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + virtual int transposeMulti(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + + double fract; + +public: + InterpolateCubic(); +}; + +} + +#endif diff --git a/src/InterpolateLinear.cpp b/src/InterpolateLinear.cpp old mode 100644 new mode 100755 diff --git a/src/InterpolateLinear.h b/src/InterpolateLinear.h old mode 100644 new mode 100755 diff --git a/src/InterpolateShannon.cpp b/src/InterpolateShannon.cpp new file mode 100755 index 0000000..1085fd1 --- /dev/null +++ b/src/InterpolateShannon.cpp @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Sample interpolation routine using 8-tap band-limited Shannon interpolation +/// with kaiser window. +/// +/// Notice. This algorithm is remarkably much heavier than linear or cubic +/// interpolation, and not remarkably better than cubic algorithm. Thus mostly +/// for experimental purposes +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// $Id: InterpolateShannon.cpp 195 2014-04-06 15:57:21Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "InterpolateShannon.h" +#include "STTypes.h" + +using namespace soundtouch; + + +/// Kaiser window with beta = 2.0 +/// Values scaled down by 5% to avoid overflows +static const double _kaiser8[8] = +{ + 0.41778693317814, + 0.64888025049173, + 0.83508562409944, + 0.93887857733412, + 0.93887857733412, + 0.83508562409944, + 0.64888025049173, + 0.41778693317814 +}; + + +InterpolateShannon::InterpolateShannon() +{ + fract = 0; +} + + +void InterpolateShannon::resetRegisters() +{ + fract = 0; +} + + +#define PI 3.1415926536 +#define sinc(x) (sin(PI * (x)) / (PI * (x))) + +/// Transpose mono audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateShannon::transposeMono(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + int i; + int srcSampleEnd = srcSamples - 8; + int srcCount = 0; + + i = 0; + while (srcCount < srcSampleEnd) + { + double out; + assert(fract < 1.0); + + out = psrc[0] * sinc(-3.0 - fract) * _kaiser8[0]; + out += psrc[1] * sinc(-2.0 - fract) * _kaiser8[1]; + out += psrc[2] * sinc(-1.0 - fract) * _kaiser8[2]; + if (fract < 1e-6) + { + out += psrc[3] * _kaiser8[3]; // sinc(0) = 1 + } + else + { + out += psrc[3] * sinc(- fract) * _kaiser8[3]; + } + out += psrc[4] * sinc( 1.0 - fract) * _kaiser8[4]; + out += psrc[5] * sinc( 2.0 - fract) * _kaiser8[5]; + out += psrc[6] * sinc( 3.0 - fract) * _kaiser8[6]; + out += psrc[7] * sinc( 4.0 - fract) * _kaiser8[7]; + + pdest[i] = (SAMPLETYPE)out; + i ++; + + // update position fraction + fract += rate; + // update whole positions + int whole = (int)fract; + fract -= whole; + psrc += whole; + srcCount += whole; + } + srcSamples = srcCount; + return i; +} + + +/// Transpose stereo audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateShannon::transposeStereo(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + int i; + int srcSampleEnd = srcSamples - 8; + int srcCount = 0; + + i = 0; + while (srcCount < srcSampleEnd) + { + double out0, out1, w; + assert(fract < 1.0); + + w = sinc(-3.0 - fract) * _kaiser8[0]; + out0 = psrc[0] * w; out1 = psrc[1] * w; + w = sinc(-2.0 - fract) * _kaiser8[1]; + out0 += psrc[2] * w; out1 += psrc[3] * w; + w = sinc(-1.0 - fract) * _kaiser8[2]; + out0 += psrc[4] * w; out1 += psrc[5] * w; + w = _kaiser8[3] * ((fract < 1e-5) ? 1.0 : sinc(- fract)); // sinc(0) = 1 + out0 += psrc[6] * w; out1 += psrc[7] * w; + w = sinc( 1.0 - fract) * _kaiser8[4]; + out0 += psrc[8] * w; out1 += psrc[9] * w; + w = sinc( 2.0 - fract) * _kaiser8[5]; + out0 += psrc[10] * w; out1 += psrc[11] * w; + w = sinc( 3.0 - fract) * _kaiser8[6]; + out0 += psrc[12] * w; out1 += psrc[13] * w; + w = sinc( 4.0 - fract) * _kaiser8[7]; + out0 += psrc[14] * w; out1 += psrc[15] * w; + + pdest[2*i] = (SAMPLETYPE)out0; + pdest[2*i+1] = (SAMPLETYPE)out1; + i ++; + + // update position fraction + fract += rate; + // update whole positions + int whole = (int)fract; + fract -= whole; + psrc += 2*whole; + srcCount += whole; + } + srcSamples = srcCount; + return i; +} + + +/// Transpose stereo audio. Returns number of produced output samples, and +/// updates "srcSamples" to amount of consumed source samples +int InterpolateShannon::transposeMulti(SAMPLETYPE *pdest, + const SAMPLETYPE *psrc, + int &srcSamples) +{ + // not implemented + assert(false); + return 0; +} diff --git a/src/InterpolateShannon.h b/src/InterpolateShannon.h new file mode 100755 index 0000000..54703d9 --- /dev/null +++ b/src/InterpolateShannon.h @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Sample interpolation routine using 8-tap band-limited Shannon interpolation +/// with kaiser window. +/// +/// Notice. This algorithm is remarkably much heavier than linear or cubic +/// interpolation, and not remarkably better than cubic algorithm. Thus mostly +/// for experimental purposes +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// $Id: InterpolateShannon.h 225 2015-07-26 14:45:48Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _InterpolateShannon_H_ +#define _InterpolateShannon_H_ + +#include "RateTransposer.h" +#include "STTypes.h" + +namespace soundtouch +{ + +class InterpolateShannon : public TransposerBase +{ +protected: + void resetRegisters(); + int transposeMono(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + int transposeStereo(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + int transposeMulti(SAMPLETYPE *dest, + const SAMPLETYPE *src, + int &srcSamples); + + double fract; + +public: + InterpolateShannon(); +}; + +} + +#endif diff --git a/src/PeakFinder.cpp b/src/PeakFinder.cpp new file mode 100755 index 0000000..20aba31 --- /dev/null +++ b/src/PeakFinder.cpp @@ -0,0 +1,286 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Peak detection routine. +/// +/// The routine detects highest value on an array of values and calculates the +/// precise peak location as a mass-center of the 'hump' around the peak value. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2015-05-18 18:22:02 +0300 (ma, 18 touko 2015) $ +// File revision : $Revision: 4 $ +// +// $Id: PeakFinder.cpp 213 2015-05-18 15:22:02Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include + +#include "PeakFinder.h" + +using namespace soundtouch; + +#define max(x, y) (((x) > (y)) ? (x) : (y)) + + +PeakFinder::PeakFinder() +{ + minPos = maxPos = 0; +} + + +// Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'. +int PeakFinder::findTop(const float *data, int peakpos) const +{ + int i; + int start, end; + float refvalue; + + refvalue = data[peakpos]; + + // seek within ±10 points + start = peakpos - 10; + if (start < minPos) start = minPos; + end = peakpos + 10; + if (end > maxPos) end = maxPos; + + for (i = start; i <= end; i ++) + { + if (data[i] > refvalue) + { + peakpos = i; + refvalue = data[i]; + } + } + + // failure if max value is at edges of seek range => it's not peak, it's at slope. + if ((peakpos == start) || (peakpos == end)) return 0; + + return peakpos; +} + + +// Finds 'ground level' of a peak hump by starting from 'peakpos' and proceeding +// to direction defined by 'direction' until next 'hump' after minimum value will +// begin +int PeakFinder::findGround(const float *data, int peakpos, int direction) const +{ + int lowpos; + int pos; + int climb_count; + float refvalue; + float delta; + + climb_count = 0; + refvalue = data[peakpos]; + lowpos = peakpos; + + pos = peakpos; + + while ((pos > minPos+1) && (pos < maxPos-1)) + { + int prevpos; + + prevpos = pos; + pos += direction; + + // calculate derivate + delta = data[pos] - data[prevpos]; + if (delta <= 0) + { + // going downhill, ok + if (climb_count) + { + climb_count --; // decrease climb count + } + + // check if new minimum found + if (data[pos] < refvalue) + { + // new minimum found + lowpos = pos; + refvalue = data[pos]; + } + } + else + { + // going uphill, increase climbing counter + climb_count ++; + if (climb_count > 5) break; // we've been climbing too long => it's next uphill => quit + } + } + return lowpos; +} + + +// Find offset where the value crosses the given level, when starting from 'peakpos' and +// proceeds to direction defined in 'direction' +int PeakFinder::findCrossingLevel(const float *data, float level, int peakpos, int direction) const +{ + float peaklevel; + int pos; + + peaklevel = data[peakpos]; + assert(peaklevel >= level); + pos = peakpos; + while ((pos >= minPos) && (pos < maxPos)) + { + if (data[pos + direction] < level) return pos; // crossing found + pos += direction; + } + return -1; // not found +} + + +// Calculates the center of mass location of 'data' array items between 'firstPos' and 'lastPos' +double PeakFinder::calcMassCenter(const float *data, int firstPos, int lastPos) const +{ + int i; + float sum; + float wsum; + + sum = 0; + wsum = 0; + for (i = firstPos; i <= lastPos; i ++) + { + sum += (float)i * data[i]; + wsum += data[i]; + } + + if (wsum < 1e-6) return 0; + return sum / wsum; +} + + + +/// get exact center of peak near given position by calculating local mass of center +double PeakFinder::getPeakCenter(const float *data, int peakpos) const +{ + float peakLevel; // peak level + int crosspos1, crosspos2; // position where the peak 'hump' crosses cutting level + float cutLevel; // cutting value + float groundLevel; // ground level of the peak + int gp1, gp2; // bottom positions of the peak 'hump' + + // find ground positions. + gp1 = findGround(data, peakpos, -1); + gp2 = findGround(data, peakpos, 1); + + peakLevel = data[peakpos]; + + if (gp1 == gp2) + { + // avoid rounding errors when all are equal + assert(gp1 == peakpos); + cutLevel = groundLevel = peakLevel; + } else { + // get average of the ground levels + groundLevel = 0.5f * (data[gp1] + data[gp2]); + + // calculate 70%-level of the peak + cutLevel = 0.70f * peakLevel + 0.30f * groundLevel; + } + + // find mid-level crossings + crosspos1 = findCrossingLevel(data, cutLevel, peakpos, -1); + crosspos2 = findCrossingLevel(data, cutLevel, peakpos, 1); + + if ((crosspos1 < 0) || (crosspos2 < 0)) return 0; // no crossing, no peak.. + + // calculate mass center of the peak surroundings + return calcMassCenter(data, crosspos1, crosspos2); +} + + + +double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos) +{ + + int i; + int peakpos; // position of peak level + double highPeak, peak; + + this->minPos = aminPos; + this->maxPos = amaxPos; + + // find absolute peak + peakpos = minPos; + peak = data[minPos]; + for (i = minPos + 1; i < maxPos; i ++) + { + if (data[i] > peak) + { + peak = data[i]; + peakpos = i; + } + } + + // Calculate exact location of the highest peak mass center + highPeak = getPeakCenter(data, peakpos); + peak = highPeak; + + // Now check if the highest peak were in fact harmonic of the true base beat peak + // - sometimes the highest peak can be Nth harmonic of the true base peak yet + // just a slightly higher than the true base + + for (i = 3; i < 10; i ++) + { + double peaktmp, harmonic; + int i1,i2; + + harmonic = (double)i * 0.5; + peakpos = (int)(highPeak / harmonic + 0.5f); + if (peakpos < minPos) break; + peakpos = findTop(data, peakpos); // seek true local maximum index + if (peakpos == 0) continue; // no local max here + + // calculate mass-center of possible harmonic peak + peaktmp = getPeakCenter(data, peakpos); + + // accept harmonic peak if + // (a) it is found + // (b) is within ±4% of the expected harmonic interval + // (c) has at least half x-corr value of the max. peak + + double diff = harmonic * peaktmp / highPeak; + if ((diff < 0.96) || (diff > 1.04)) continue; // peak too afar from expected + + // now compare to highest detected peak + i1 = (int)(highPeak + 0.5); + i2 = (int)(peaktmp + 0.5); + if (data[i2] >= 0.4*data[i1]) + { + // The harmonic is at least half as high primary peak, + // thus use the harmonic peak instead + peak = peaktmp; + } + } + + return peak; +} diff --git a/src/PeakFinder.h b/src/PeakFinder.h new file mode 100755 index 0000000..bd2d9bf --- /dev/null +++ b/src/PeakFinder.h @@ -0,0 +1,97 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// The routine detects highest value on an array of values and calculates the +/// precise peak location as a mass-center of the 'hump' around the peak value. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2011-12-30 22:33:46 +0200 (pe, 30 joulu 2011) $ +// File revision : $Revision: 4 $ +// +// $Id: PeakFinder.h 132 2011-12-30 20:33:46Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _PeakFinder_H_ +#define _PeakFinder_H_ + +namespace soundtouch +{ + +class PeakFinder +{ +protected: + /// Min, max allowed peak positions within the data vector + int minPos, maxPos; + + /// Calculates the mass center between given vector items. + double calcMassCenter(const float *data, ///< Data vector. + int firstPos, ///< Index of first vector item beloging to the peak. + int lastPos ///< Index of last vector item beloging to the peak. + ) const; + + /// Finds the data vector index where the monotoniously decreasing signal crosses the + /// given level. + int findCrossingLevel(const float *data, ///< Data vector. + float level, ///< Goal crossing level. + int peakpos, ///< Peak position index within the data vector. + int direction /// Direction where to proceed from the peak: 1 = right, -1 = left. + ) const; + + // Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'. + int findTop(const float *data, int peakpos) const; + + + /// Finds the 'ground' level, i.e. smallest level between two neighbouring peaks, to right- + /// or left-hand side of the given peak position. + int findGround(const float *data, /// Data vector. + int peakpos, /// Peak position index within the data vector. + int direction /// Direction where to proceed from the peak: 1 = right, -1 = left. + ) const; + + /// get exact center of peak near given position by calculating local mass of center + double getPeakCenter(const float *data, int peakpos) const; + +public: + /// Constructor. + PeakFinder(); + + /// Detect exact peak position of the data vector by finding the largest peak 'hump' + /// and calculating the mass-center location of the peak hump. + /// + /// \return The location of the largest base harmonic peak hump. + double detectPeak(const float *data, /// Data vector to be analyzed. The data vector has + /// to be at least 'maxPos' items long. + int minPos, ///< Min allowed peak location within the vector data. + int maxPos ///< Max allowed peak location within the vector data. + ); +}; + +} + +#endif // _PeakFinder_H_ diff --git a/src/RateTransposer.cpp b/src/RateTransposer.cpp old mode 100644 new mode 100755 index 6e0f03e..5d3cee1 --- a/src/RateTransposer.cpp +++ b/src/RateTransposer.cpp @@ -10,10 +10,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $ +// Last changed : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $ // File revision : $Revision: 4 $ // -// $Id: RateTransposer.cpp 225 2015-07-26 14:45:48Z oparviai $ +// $Id: RateTransposer.cpp 243 2016-10-15 19:34:59Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -44,6 +44,8 @@ #include #include "RateTransposer.h" #include "InterpolateLinear.h" +#include "InterpolateCubic.h" +#include "InterpolateShannon.h" #include "AAFilter.h" using namespace soundtouch; @@ -206,6 +208,13 @@ int RateTransposer::isEmpty() const } +/// Return approximate initial input-output latency +int RateTransposer::getLatency() const +{ + return (bUseAAFilter) ? pAAFilter->getLength() : 0; +} + + ////////////////////////////////////////////////////////////////////////////// // // TransposerBase - Base class for interpolation diff --git a/src/RateTransposer.h b/src/RateTransposer.h old mode 100644 new mode 100755 index 8d0eab8..561f490 --- a/src/RateTransposer.h +++ b/src/RateTransposer.h @@ -14,10 +14,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $ +// Last changed : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $ // File revision : $Revision: 4 $ // -// $Id: RateTransposer.h 225 2015-07-26 14:45:48Z oparviai $ +// $Id: RateTransposer.h 243 2016-10-15 19:34:59Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -172,6 +172,9 @@ public: /// Returns nonzero if there aren't any samples available for outputting. int isEmpty() const; + + /// Return approximate initial input-output latency + int getLatency() const; }; } diff --git a/src/SoundTouch.cpp b/src/SoundTouch.cpp old mode 100644 new mode 100755 index 2c715e2..e6ad0a0 --- a/src/SoundTouch.cpp +++ b/src/SoundTouch.cpp @@ -41,10 +41,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-07-26 17:45:48 +0300 (Sun, 26 Jul 2015) $ +// Last changed : $Date: 2016-10-15 22:34:59 +0300 (la, 15 loka 2016) $ // File revision : $Revision: 4 $ // -// $Id: SoundTouch.cpp 225 2015-07-26 14:45:48Z oparviai $ +// $Id: SoundTouch.cpp 243 2016-10-15 19:34:59Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -110,8 +110,8 @@ SoundTouch::SoundTouch() calcEffectiveRateAndTempo(); - samplesExpectedOut = 0; - samplesOutput = 0; + samplesExpectedOut = 0; + samplesOutput = 0; channels = 0; bSrateSet = false; @@ -149,7 +149,7 @@ void SoundTouch::setChannels(uint numChannels) /*if (numChannels != 1 && numChannels != 2) { //ST_THROW_RT_ERROR("Illegal number of channels"); - return; + return; }*/ channels = numChannels; pRateTransposer->setChannels((int)numChannels); @@ -240,11 +240,11 @@ void SoundTouch::calcEffectiveRateAndTempo() double oldTempo = tempo; double oldRate = rate; - tempo = virtualTempo / virtualPitch; - rate = virtualPitch * virtualRate; + tempo = virtualTempo / virtualPitch; + rate = virtualPitch * virtualRate; if (!TEST_FLOAT_EQUAL(rate,oldRate)) pRateTransposer->setRate(rate); - if (!TEST_FLOAT_EQUAL(tempo, oldTempo)) pTDStretch->setTempo(tempo); + if (!TEST_FLOAT_EQUAL(tempo, oldTempo)) pTDStretch->setTempo(tempo); #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER if (rate <= 1.0f) @@ -321,9 +321,9 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples) } */ - // accumulate how many samples are expected out from processing, given the current - // processing setting - samplesExpectedOut += (double)nSamples / ((double)rate * (double)tempo); + // accumulate how many samples are expected out from processing, given the current + // processing setting + samplesExpectedOut += (double)nSamples / ((double)rate * (double)tempo); #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER if (rate <= 1.0f) @@ -354,23 +354,24 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples) void SoundTouch::flush() { int i; - int numStillExpected; + int numStillExpected; SAMPLETYPE *buff = new SAMPLETYPE[128 * channels]; - // how many samples are still expected to output - numStillExpected = (int)((long)(samplesExpectedOut + 0.5) - samplesOutput); + // how many samples are still expected to output + numStillExpected = (int)((long)(samplesExpectedOut + 0.5) - samplesOutput); + if (numStillExpected < 0) numStillExpected = 0; memset(buff, 0, 128 * channels * sizeof(SAMPLETYPE)); // "Push" the last active samples out from the processing pipeline by // feeding blank samples into the processing pipeline until new, // processed samples appear in the output (not however, more than // 24ksamples in any case) - for (i = 0; (numStillExpected > (int)numSamples()) && (i < 200); i ++) - { - putSamples(buff, 128); - } + for (i = 0; (numStillExpected > (int)numSamples()) && (i < 200); i ++) + { + putSamples(buff, 128); + } - adjustAmountOfSamples(numStillExpected); + adjustAmountOfSamples(numStillExpected); delete[] buff; @@ -446,7 +447,7 @@ int SoundTouch::getSetting(int settingId) const return pRateTransposer->getAAFilter()->getLength(); case SETTING_USE_QUICKSEEK : - return (uint) pTDStretch->isQuickSeekEnabled(); + return (uint)pTDStretch->isQuickSeekEnabled(); case SETTING_SEQUENCE_MS: pTDStretch->getParameters(NULL, &temp, NULL, NULL); @@ -460,23 +461,65 @@ int SoundTouch::getSetting(int settingId) const pTDStretch->getParameters(NULL, NULL, NULL, &temp); return temp; - case SETTING_NOMINAL_INPUT_SEQUENCE : - return pTDStretch->getInputSampleReq(); + case SETTING_NOMINAL_INPUT_SEQUENCE : + { + int size = pTDStretch->getInputSampleReq(); - case SETTING_NOMINAL_OUTPUT_SEQUENCE : - return pTDStretch->getOutputBatchSize(); +#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER + if (rate <= 1.0) + { + // transposing done before timestretch, which impacts latency + return (int)(size * rate + 0.5); + } +#endif + return size; + } - default : + case SETTING_NOMINAL_OUTPUT_SEQUENCE : + { + int size = pTDStretch->getOutputBatchSize(); + + if (rate > 1.0) + { + // transposing done after timestretch, which impacts latency + return (int)(size / rate + 0.5); + } + return size; + } + + case SETTING_INITIAL_LATENCY: + { + double latency = pTDStretch->getLatency(); + int latency_tr = pRateTransposer->getLatency(); + +#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER + if (rate <= 1.0) + { + // transposing done before timestretch, which impacts latency + latency = (latency + latency_tr) * rate; + } + else +#endif + { + latency += (double)latency_tr / rate; + } + + return (int)(latency + 0.5); + } + + default : return 0; } } + // Clears all the samples in the object's output and internal processing // buffers. void SoundTouch::clear() { - samplesExpectedOut = 0; + samplesExpectedOut = 0; + samplesOutput = 0; pRateTransposer->clear(); pTDStretch->clear(); } @@ -507,9 +550,9 @@ uint SoundTouch::numUnprocessedSamples() const /// \return Number of samples returned. uint SoundTouch::receiveSamples(SAMPLETYPE *output, uint maxSamples) { - uint ret = FIFOProcessor::receiveSamples(output, maxSamples); - samplesOutput += (long)ret; - return ret; + uint ret = FIFOProcessor::receiveSamples(output, maxSamples); + samplesOutput += (long)ret; + return ret; } @@ -520,7 +563,16 @@ uint SoundTouch::receiveSamples(SAMPLETYPE *output, uint maxSamples) /// with 'ptrBegin' function. uint SoundTouch::receiveSamples(uint maxSamples) { - uint ret = FIFOProcessor::receiveSamples(maxSamples); - samplesOutput += (long)ret; - return ret; + uint ret = FIFOProcessor::receiveSamples(maxSamples); + samplesOutput += (long)ret; + return ret; +} + + +/// Get ratio between input and output audio durations, useful for calculating +/// processed output duration: if you'll process a stream of N samples, then +/// you can expect to get out N * getInputOutputSampleRatio() samples. +double SoundTouch::getInputOutputSampleRatio() +{ + return 1.0 / (tempo * rate); } diff --git a/src/TDStretch.cpp b/src/TDStretch.cpp old mode 100644 new mode 100755 index d030558..f3a4636 --- a/src/TDStretch.cpp +++ b/src/TDStretch.cpp @@ -4,8 +4,14 @@ /// while maintaining the original pitch by using a time domain WSOLA-like /// method with several performance-increasing tweaks. /// -/// Note : MMX optimized functions reside in a separate, platform-specific -/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp' +/// Notes : MMX optimized functions reside in a separate, platform-specific +/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'. +/// +/// This source file contains OpenMP optimizations that allow speeding up the +/// corss-correlation algorithm by executing it in several threads / CPU cores +/// in parallel. See the following article link for more detailed discussion +/// about SoundTouch OpenMP optimizations: +/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices /// /// Author : Copyright (c) Olli Parviainen /// Author e-mail : oparviai 'at' iki.fi @@ -13,10 +19,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $ +// Last changed : $Date: 2017-04-07 22:01:22 +0300 (pe, 07 huhti 2017) $ // File revision : $Revision: 1.12 $ // -// $Id: TDStretch.cpp 226 2015-08-08 21:00:15Z oparviai $ +// $Id: TDStretch.cpp 249 2017-04-07 19:01:22Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -219,6 +225,7 @@ void TDStretch::clearInput() { inputBuffer.clear(); clearMidBuffer(); + isBeginning = true; } @@ -297,12 +304,13 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) int i; double norm; - bestCorr = FLT_MIN; + bestCorr = -FLT_MAX; bestOffs = 0; // Scans for the best correlation value by testing each possible position // over the permitted range. bestCorr = calcCrossCorr(refPos, pMidBuffer, norm); + bestCorr = (bestCorr + 0.1) * 0.75; #pragma omp parallel for for (i = 1; i < seekLength; i ++) @@ -373,12 +381,10 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) // note: 'float' types used in this function in case that the platform would need to use software-fp - bestCorr = FLT_MIN; - bestOffs = SCANWIND; - bestCorr2 = FLT_MIN; - bestOffs2 = 0; - - int best = 0; + bestCorr = + bestCorr2 = -FLT_MAX; + bestOffs = + bestOffs2 = SCANWIND; // Scans for the best correlation value by testing each possible position // over the permitted range. Look for two best matches on the first pass to @@ -436,7 +442,6 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) { bestCorr = corr; bestOffs = i; - best = 1; } } @@ -458,7 +463,6 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) { bestCorr = corr; bestOffs = i; - best = 2; } } @@ -520,13 +524,13 @@ void TDStretch::calcSeqParameters() #define AUTOSEQ_TEMPO_TOP 2.0 // auto setting top tempo range (+100%) // sequence-ms setting values at above low & top tempo - #define AUTOSEQ_AT_MIN 125.0 - #define AUTOSEQ_AT_MAX 50.0 + #define AUTOSEQ_AT_MIN 90.0 + #define AUTOSEQ_AT_MAX 40.0 #define AUTOSEQ_K ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW)) #define AUTOSEQ_C (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW)) // seek-window-ms setting values at above low & top tempoq - #define AUTOSEEK_AT_MIN 25.0 + #define AUTOSEEK_AT_MIN 20.0 #define AUTOSEEK_AT_MAX 15.0 #define AUTOSEEK_K ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW)) #define AUTOSEEK_C (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW)) @@ -637,7 +641,8 @@ void TDStretch::processNominalTempo() // the result into 'outputBuffer' void TDStretch::processSamples() { - int ovlSkip, offset; + int ovlSkip; + int offset = 0; int temp; /* Removed this small optimization - can introduce a click to sound when tempo setting @@ -654,35 +659,61 @@ void TDStretch::processSamples() // to form a processing frame. while ((int)inputBuffer.numSamples() >= sampleReq) { - // If tempo differs from the normal ('SCALE'), scan for the best overlapping - // position - offset = seekBestOverlapPosition(inputBuffer.ptrBegin()); + if (isBeginning == false) + { + // apart from the very beginning of the track, + // scan for the best overlapping position & do overlap-add + offset = seekBestOverlapPosition(inputBuffer.ptrBegin()); - // Mix the samples in the 'inputBuffer' at position of 'offset' with the - // samples in 'midBuffer' using sliding overlapping - // ... first partially overlap with the end of the previous sequence - // (that's in 'midBuffer') - overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset); - outputBuffer.putSamples((uint)overlapLength); + // Mix the samples in the 'inputBuffer' at position of 'offset' with the + // samples in 'midBuffer' using sliding overlapping + // ... first partially overlap with the end of the previous sequence + // (that's in 'midBuffer') + overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset); + outputBuffer.putSamples((uint)overlapLength); + offset += overlapLength; + } + else + { + // Adjust processing offset at beginning of track by not perform initial overlapping + // and compensating that in the 'input buffer skip' calculation + isBeginning = false; + int skip = (int)(tempo * overlapLength + 0.5); + + #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION + #ifdef SOUNDTOUCH_ALLOW_SSE + // if SSE mode, round the skip amount to value corresponding to aligned memory address + if (channels == 1) + { + skip &= -4; + } + else if (channels == 2) + { + skip &= -2; + } + #endif + #endif + skipFract -= skip; + assert(nominalSkip >= -skipFract); + } // ... then copy sequence samples from 'inputBuffer' to output: - // length of sequence - temp = (seekWindowLength - 2 * overlapLength); - // crosscheck that we don't have buffer overflow... - if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2)) + if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength)) { continue; // just in case, shouldn't really happen } - outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp); + // length of sequence + temp = (seekWindowLength - 2 * overlapLength); + outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp); // Copies the end of the current sequence from 'inputBuffer' to // 'midBuffer' for being mixed with the beginning of the next // processing sequence and so on - assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples()); - memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), + assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples()); + memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), channels * sizeof(SAMPLETYPE) * overlapLength); // Remove the processed samples from the input buffer. Update @@ -879,7 +910,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do if (lnorm > maxnorm) { - maxnorm = lnorm; + // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode + #pragma omp critical + if (lnorm > maxnorm) + { + maxnorm = lnorm; + } } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation diff --git a/src/TDStretch.h b/src/TDStretch.h old mode 100644 new mode 100755 index e6d75aa..bde8282 --- a/src/TDStretch.h +++ b/src/TDStretch.h @@ -13,10 +13,10 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $ +// Last changed : $Date: 2016-10-20 19:30:11 +0300 (to, 20 loka 2016) $ // File revision : $Revision: 4 $ // -// $Id: TDStretch.h 226 2015-08-08 21:00:15Z oparviai $ +// $Id: TDStretch.h 244 2016-10-20 16:30:11Z oparviai $ // //////////////////////////////////////////////////////////////////////////////// // @@ -134,6 +134,7 @@ protected: bool bQuickSeek; bool bAutoSeqSetting; bool bAutoSeekSetting; + bool isBeginning; SAMPLETYPE *pMidBuffer; SAMPLETYPE *pMidBufferUnaligned; @@ -247,6 +248,13 @@ public: { return seekWindowLength - overlapLength; } + + + /// return approximate initial input-output latency + int getLatency() const + { + return sampleReq; + } }; diff --git a/src/cpu_detect.h b/src/cpu_detect.h old mode 100644 new mode 100755 index 7859ffb..0023c14 --- a/src/cpu_detect.h +++ b/src/cpu_detect.h @@ -12,7 +12,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2008-02-10 18:26:55 +0200 (Sun, 10 Feb 2008) $ +// Last changed : $Date: 2008-02-10 18:26:55 +0200 (su, 10 helmi 2008) $ // File revision : $Revision: 4 $ // // $Id: cpu_detect.h 11 2008-02-10 16:26:55Z oparviai $ diff --git a/src/cpu_detect_x86.cpp b/src/cpu_detect_x86.cpp old mode 100644 new mode 100755 index 00f22ab..98799a6 --- a/src/cpu_detect_x86.cpp +++ b/src/cpu_detect_x86.cpp @@ -11,7 +11,7 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2014-01-07 20:24:28 +0200 (Tue, 07 Jan 2014) $ +// Last changed : $Date: 2014-01-07 20:24:28 +0200 (ti, 07 tammi 2014) $ // File revision : $Revision: 4 $ // // $Id: cpu_detect_x86.cpp 183 2014-01-07 18:24:28Z oparviai $ diff --git a/src/mmx_optimized.cpp b/src/mmx_optimized.cpp new file mode 100755 index 0000000..16557b7 --- /dev/null +++ b/src/mmx_optimized.cpp @@ -0,0 +1,400 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// MMX optimized routines. All MMX optimized functions have been gathered into +/// this single source code file, regardless to their class or original source +/// code file, in order to ease porting the library to other compiler and +/// processor platforms. +/// +/// The MMX-optimizations are programmed using MMX compiler intrinsics that +/// are supported both by Microsoft Visual C++ and GCC compilers, so this file +/// should compile with both toolsets. +/// +/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ +/// 6.0 processor pack" update to support compiler intrinsic syntax. The update +/// is available for download at Microsoft Developers Network, see here: +/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2017-03-05 15:56:03 +0200 (su, 05 maalis 2017) $ +// File revision : $Revision: 4 $ +// +// $Id: mmx_optimized.cpp 247 2017-03-05 13:56:03Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include "STTypes.h" + +#ifdef SOUNDTOUCH_ALLOW_MMX +// MMX routines available only with integer sample type + +using namespace soundtouch; + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of MMX optimized functions of class 'TDStretchMMX' +// +////////////////////////////////////////////////////////////////////////////// + +#include "TDStretch.h" +#include +#include +#include + + +// Calculates cross correlation of two buffers +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) +{ + const __m64 *pVec1, *pVec2; + __m64 shifter; + __m64 accu, normaccu; + long corr, norm; + int i; + + pVec1 = (__m64*)pV1; + pVec2 = (__m64*)pV2; + + shifter = _m_from_int(overlapDividerBitsNorm); + normaccu = accu = _mm_setzero_si64(); + + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples + // during each round for improved CPU-level parallellization. + for (i = 0; i < channels * overlapLength / 16; i ++) + { + __m64 temp, temp2; + + // dictionary of instructions: + // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] + // _mm_add_pi32 : 2*32bit add + // _m_psrad : 32bit right-shift + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter)); + accu = _mm_add_pi32(accu, temp); + normaccu = _mm_add_pi32(normaccu, temp2); + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter)); + accu = _mm_add_pi32(accu, temp); + normaccu = _mm_add_pi32(normaccu, temp2); + + pVec1 += 4; + pVec2 += 4; + } + + // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 + // and finally store the result into the variable "corr" + + accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); + corr = _m_to_int(accu); + + normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32)); + norm = _m_to_int(normaccu); + + // Clear MMS state + _m_empty(); + + if (norm > (long)maxnorm) + { + // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode + #pragma omp critical + if (norm > (long)maxnorm) + { + maxnorm = norm; + } + } + + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + dnorm = (double)norm; + + return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm); + // Note: Warning about the missing EMMS instruction is harmless + // as it'll be called elsewhere. +} + + +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value +double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) +{ + const __m64 *pVec1, *pVec2; + __m64 shifter; + __m64 accu; + long corr, lnorm; + int i; + + // cancel first normalizer tap from previous round + lnorm = 0; + for (i = 1; i <= channels; i ++) + { + lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBitsNorm; + } + + pVec1 = (__m64*)pV1; + pVec2 = (__m64*)pV2; + + shifter = _m_from_int(overlapDividerBitsNorm); + accu = _mm_setzero_si64(); + + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples + // during each round for improved CPU-level parallellization. + for (i = 0; i < channels * overlapLength / 16; i ++) + { + __m64 temp; + + // dictionary of instructions: + // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] + // _mm_add_pi32 : 2*32bit add + // _m_psrad : 32bit right-shift + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); + accu = _mm_add_pi32(accu, temp); + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); + accu = _mm_add_pi32(accu, temp); + + pVec1 += 4; + pVec2 += 4; + } + + // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 + // and finally store the result into the variable "corr" + + accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); + corr = _m_to_int(accu); + + // Clear MMS state + _m_empty(); + + // update normalizer with last samples of this round + pV1 = (short *)pVec1; + for (int j = 1; j <= channels; j ++) + { + lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBitsNorm; + } + dnorm += (double)lnorm; + + if (lnorm > (long)maxnorm) + { + maxnorm = lnorm; + } + + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm); +} + + +void TDStretchMMX::clearCrossCorrState() +{ + // Clear MMS state + _m_empty(); + //_asm EMMS; +} + + + +// MMX-optimized version of the function overlapStereo +void TDStretchMMX::overlapStereo(short *output, const short *input) const +{ + const __m64 *pVinput, *pVMidBuf; + __m64 *pVdest; + __m64 mix1, mix2, adder, shifter; + int i; + + pVinput = (const __m64*)input; + pVMidBuf = (const __m64*)pMidBuffer; + pVdest = (__m64*)output; + + // mix1 = mixer values for 1st stereo sample + // mix1 = mixer values for 2nd stereo sample + // adder = adder for updating mixer values after each round + + mix1 = _mm_set_pi16(0, overlapLength, 0, overlapLength); + adder = _mm_set_pi16(1, -1, 1, -1); + mix2 = _mm_add_pi16(mix1, adder); + adder = _mm_add_pi16(adder, adder); + + // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in + // overlapDividerBits calculation earlier. + shifter = _m_from_int(overlapDividerBitsPure + 1); + + for (i = 0; i < overlapLength / 4; i ++) + { + __m64 temp1, temp2; + + // load & shuffle data so that input & mixbuffer data samples are paired + temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]); // = i0l m0l i0r m0r + temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]); // = i1l m1l i1r m1r + + // temp = (temp .* mix) >> shifter + temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter); + temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter); + pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit + + // update mix += adder + mix1 = _mm_add_pi16(mix1, adder); + mix2 = _mm_add_pi16(mix2, adder); + + // --- second round begins here --- + + // load & shuffle data so that input & mixbuffer data samples are paired + temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]); // = i2l m2l i2r m2r + temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]); // = i3l m3l i3r m3r + + // temp = (temp .* mix) >> shifter + temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter); + temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter); + pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit + + // update mix += adder + mix1 = _mm_add_pi16(mix1, adder); + mix2 = _mm_add_pi16(mix2, adder); + + pVinput += 2; + pVMidBuf += 2; + pVdest += 2; + } + + _m_empty(); // clear MMS state +} + + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of MMX optimized functions of class 'FIRFilter' +// +////////////////////////////////////////////////////////////////////////////// + +#include "FIRFilter.h" + + +FIRFilterMMX::FIRFilterMMX() : FIRFilter() +{ + filterCoeffsAlign = NULL; + filterCoeffsUnalign = NULL; +} + + +FIRFilterMMX::~FIRFilterMMX() +{ + delete[] filterCoeffsUnalign; +} + + +// (overloaded) Calculates filter coefficients for MMX routine +void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor) +{ + uint i; + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); + + // Ensure that filter coeffs array is aligned to 16-byte boundary + delete[] filterCoeffsUnalign; + filterCoeffsUnalign = new short[2 * newLength + 8]; + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign); + + // rearrange the filter coefficients for mmx routines + for (i = 0;i < length; i += 4) + { + filterCoeffsAlign[2 * i + 0] = coeffs[i + 0]; + filterCoeffsAlign[2 * i + 1] = coeffs[i + 2]; + filterCoeffsAlign[2 * i + 2] = coeffs[i + 0]; + filterCoeffsAlign[2 * i + 3] = coeffs[i + 2]; + + filterCoeffsAlign[2 * i + 4] = coeffs[i + 1]; + filterCoeffsAlign[2 * i + 5] = coeffs[i + 3]; + filterCoeffsAlign[2 * i + 6] = coeffs[i + 1]; + filterCoeffsAlign[2 * i + 7] = coeffs[i + 3]; + } +} + + + +// mmx-optimized version of the filter routine for stereo sound +uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const +{ + // Create stack copies of the needed member variables for asm routines : + uint i, j; + __m64 *pVdest = (__m64*)dest; + + if (length < 2) return 0; + + for (i = 0; i < (numSamples - length) / 2; i ++) + { + __m64 accu1; + __m64 accu2; + const __m64 *pVsrc = (const __m64*)src; + const __m64 *pVfilter = (const __m64*)filterCoeffsAlign; + + accu1 = accu2 = _mm_setzero_si64(); + for (j = 0; j < lengthDiv8 * 2; j ++) + { + __m64 temp1, temp2; + + temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]); // = l2 l0 r2 r0 + temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]); // = l3 l1 r3 r1 + + accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0])); // += l2*f2+l0*f0 r2*f2+r0*f0 + accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1])); // += l3*f3+l1*f1 r3*f3+r1*f1 + + temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]); // = l4 l2 r4 r2 + + accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0])); // += l3*f2+l1*f0 r3*f2+r1*f0 + accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1])); // += l4*f3+l2*f1 r4*f3+r2*f1 + + // accu1 += l2*f2+l0*f0 r2*f2+r0*f0 + // += l3*f3+l1*f1 r3*f3+r1*f1 + + // accu2 += l3*f2+l1*f0 r3*f2+r1*f0 + // l4*f3+l2*f1 r4*f3+r2*f1 + + pVfilter += 2; + pVsrc += 2; + } + // accu >>= resultDivFactor + accu1 = _mm_srai_pi32(accu1, resultDivFactor); + accu2 = _mm_srai_pi32(accu2, resultDivFactor); + + // pack 2*2*32bits => 4*16 bits + pVdest[0] = _mm_packs_pi32(accu1, accu2); + src += 4; + pVdest ++; + } + + _m_empty(); // clear emms state + + return (numSamples & 0xfffffffe) - length; +} + +#endif // SOUNDTOUCH_ALLOW_MMX diff --git a/src/sse_optimized.cpp b/src/sse_optimized.cpp new file mode 100755 index 0000000..c4b594f --- /dev/null +++ b/src/sse_optimized.cpp @@ -0,0 +1,372 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE +/// optimized functions have been gathered into this single source +/// code file, regardless to their class or original source code file, in order +/// to ease porting the library to other compiler and processor platforms. +/// +/// The SSE-optimizations are programmed using SSE compiler intrinsics that +/// are supported both by Microsoft Visual C++ and GCC compilers, so this file +/// should compile with both toolsets. +/// +/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ +/// 6.0 processor pack" update to support SSE instruction set. The update is +/// available for download at Microsoft Developers Network, see here: +/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx +/// +/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and +/// perform a search with keywords "processor pack". +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date: 2015-08-09 00:00:15 +0300 (su, 09 elo 2015) $ +// File revision : $Revision: 4 $ +// +// $Id: sse_optimized.cpp 226 2015-08-08 21:00:15Z oparviai $ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include "cpu_detect.h" +#include "STTypes.h" + +using namespace soundtouch; + +#ifdef SOUNDTOUCH_ALLOW_SSE + +// SSE routines available only with float sample type + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of SSE optimized functions of class 'TDStretchSSE' +// +////////////////////////////////////////////////////////////////////////////// + +#include "TDStretch.h" +#include +#include + +// Calculates cross correlation of two buffers +double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) +{ + int i; + const float *pVec1; + const __m128 *pVec2; + __m128 vSum, vNorm; + + // Note. It means a major slow-down if the routine needs to tolerate + // unaligned __m128 memory accesses. It's way faster if we can skip + // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps. + // This can mean up to ~ 10-fold difference (incl. part of which is + // due to skipping every second round for stereo sound though). + // + // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided + // for choosing if this little cheating is allowed. + +#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION + // Little cheating allowed, return valid correlation only for + // aligned locations, meaning every second round for stereo sound. + + #define _MM_LOAD _mm_load_ps + + if (((ulongptr)pV1) & 15) return -1e50; // skip unaligned locations + +#else + // No cheating allowed, use unaligned load & take the resulting + // performance hit. + #define _MM_LOAD _mm_loadu_ps +#endif + + // ensure overlapLength is divisible by 8 + assert((overlapLength % 8) == 0); + + // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors + // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not. + pVec1 = (const float*)pV1; + pVec2 = (const __m128*)pV2; + vSum = vNorm = _mm_setzero_ps(); + + // Unroll the loop by factor of 4 * 4 operations. Use same routine for + // stereo & mono, for mono it just means twice the amount of unrolling. + for (i = 0; i < channels * overlapLength / 16; i ++) + { + __m128 vTemp; + // vSum += pV1[0..3] * pV2[0..3] + vTemp = _MM_LOAD(pVec1); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); + + // vSum += pV1[4..7] * pV2[4..7] + vTemp = _MM_LOAD(pVec1 + 4); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); + + // vSum += pV1[8..11] * pV2[8..11] + vTemp = _MM_LOAD(pVec1 + 8); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); + + // vSum += pV1[12..15] * pV2[12..15] + vTemp = _MM_LOAD(pVec1 + 12); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); + + pVec1 += 16; + pVec2 += 4; + } + + // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3] + float *pvNorm = (float*)&vNorm; + float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]); + anorm = norm; + + float *pvSum = (float*)&vSum; + return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm); + + /* This is approximately corresponding routine in C-language yet without normalization: + double corr, norm; + uint i; + + // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors + corr = norm = 0.0; + for (i = 0; i < channels * overlapLength / 16; i ++) + { + corr += pV1[0] * pV2[0] + + pV1[1] * pV2[1] + + pV1[2] * pV2[2] + + pV1[3] * pV2[3] + + pV1[4] * pV2[4] + + pV1[5] * pV2[5] + + pV1[6] * pV2[6] + + pV1[7] * pV2[7] + + pV1[8] * pV2[8] + + pV1[9] * pV2[9] + + pV1[10] * pV2[10] + + pV1[11] * pV2[11] + + pV1[12] * pV2[12] + + pV1[13] * pV2[13] + + pV1[14] * pV2[14] + + pV1[15] * pV2[15]; + + for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; + + pV1 += 16; + pV2 += 16; + } + return corr / sqrt(norm); + */ +} + + + +double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) +{ + // call usual calcCrossCorr function because SSE does not show big benefit of + // accumulating "norm" value, and also the "norm" rolling algorithm would get + // complicated due to SSE-specific alignment-vs-nonexact correlation rules. + return calcCrossCorr(pV1, pV2, norm); +} + + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of SSE optimized functions of class 'FIRFilter' +// +////////////////////////////////////////////////////////////////////////////// + +#include "FIRFilter.h" + +FIRFilterSSE::FIRFilterSSE() : FIRFilter() +{ + filterCoeffsAlign = NULL; + filterCoeffsUnalign = NULL; +} + + +FIRFilterSSE::~FIRFilterSSE() +{ + delete[] filterCoeffsUnalign; + filterCoeffsAlign = NULL; + filterCoeffsUnalign = NULL; +} + + +// (overloaded) Calculates filter coefficients for SSE routine +void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor) +{ + uint i; + float fDivider; + + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); + + // Scale the filter coefficients so that it won't be necessary to scale the filtering result + // also rearrange coefficients suitably for SSE + // Ensure that filter coeffs array is aligned to 16-byte boundary + delete[] filterCoeffsUnalign; + filterCoeffsUnalign = new float[2 * newLength + 4]; + filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign); + + fDivider = (float)resultDivider; + + // rearrange the filter coefficients for mmx routines + for (i = 0; i < newLength; i ++) + { + filterCoeffsAlign[2 * i + 0] = + filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider; + } +} + + + +// SSE-optimized version of the filter routine for stereo sound +uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const +{ + int count = (int)((numSamples - length) & (uint)-2); + int j; + + assert(count % 2 == 0); + + if (count < 2) return 0; + + assert(source != NULL); + assert(dest != NULL); + assert((length % 8) == 0); + assert(filterCoeffsAlign != NULL); + assert(((ulongptr)filterCoeffsAlign) % 16 == 0); + + // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2' + #pragma omp parallel for + for (j = 0; j < count; j += 2) + { + const float *pSrc; + float *pDest; + const __m128 *pFil; + __m128 sum1, sum2; + uint i; + + pSrc = (const float*)source + j * 2; // source audio data + pDest = dest + j * 2; // destination audio data + pFil = (const __m128*)filterCoeffsAlign; // filter coefficients. NOTE: Assumes coefficients + // are aligned to 16-byte boundary + sum1 = sum2 = _mm_setzero_ps(); + + for (i = 0; i < length / 8; i ++) + { + // Unroll loop for efficiency & calculate filter for 2*2 stereo samples + // at each pass + + // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset + // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset. + + sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc) , pFil[0])); + sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0])); + + sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1])); + sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1])); + + sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) , pFil[2])); + sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2])); + + sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3])); + sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3])); + + pSrc += 16; + pFil += 4; + } + + // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need + // to sum the two hi- and lo-floats of these registers together. + + // post-shuffle & add the filtered values and store to dest. + _mm_storeu_ps(pDest, _mm_add_ps( + _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)), // s2_1 s2_0 s1_3 s1_2 + _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0)) // s2_3 s2_2 s1_1 s1_0 + )); + } + + // Ideas for further improvement: + // 1. If it could be guaranteed that 'source' were always aligned to 16-byte + // boundary, a faster aligned '_mm_load_ps' instruction could be used. + // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte + // boundary, a faster '_mm_store_ps' instruction could be used. + + return (uint)count; + + /* original routine in C-language. please notice the C-version has differently + organized coefficients though. + double suml1, suml2; + double sumr1, sumr2; + uint i, j; + + for (j = 0; j < count; j += 2) + { + const float *ptr; + const float *pFil; + + suml1 = sumr1 = 0.0; + suml2 = sumr2 = 0.0; + ptr = src; + pFil = filterCoeffs; + for (i = 0; i < lengthLocal; i ++) + { + // unroll loop for efficiency. + + suml1 += ptr[0] * pFil[0] + + ptr[2] * pFil[2] + + ptr[4] * pFil[4] + + ptr[6] * pFil[6]; + + sumr1 += ptr[1] * pFil[1] + + ptr[3] * pFil[3] + + ptr[5] * pFil[5] + + ptr[7] * pFil[7]; + + suml2 += ptr[8] * pFil[0] + + ptr[10] * pFil[2] + + ptr[12] * pFil[4] + + ptr[14] * pFil[6]; + + sumr2 += ptr[9] * pFil[1] + + ptr[11] * pFil[3] + + ptr[13] * pFil[5] + + ptr[15] * pFil[7]; + + ptr += 16; + pFil += 8; + } + dest[0] = (float)suml1; + dest[1] = (float)sumr1; + dest[2] = (float)suml2; + dest[3] = (float)sumr2; + + src += 4; + dest += 4; + } + */ +} + +#endif // SOUNDTOUCH_ALLOW_SSE