diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5431dab..ef2fb6ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.1..3.18)
 
+if (BUILD_MULTI_THREADING)  # opt-in switch; no option() for it is declared in the visible part of this file
+  set(MULTI_THREADING 1)  # tested by if(MULTI_THREADING) in src/CMakeLists.txt
+  add_definitions(-DMULTI_THREADING)  # defines the macro tested by #ifdef MULTI_THREADING in the C sources
+endif()
+
 # Policies
 
 # Include file check macros honor CMAKE_REQUIRED_LIBRARIES, CMake >= 3.12
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a0e6c87..02306312 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -43,6 +43,16 @@ add_library(samplerate
 # ALIAS to use if libsamplerate is included from other project with add_subdirectory()
 add_library(SampleRate::samplerate ALIAS samplerate)
 
+if(MULTI_THREADING)  # set by the top-level CMakeLists.txt when BUILD_MULTI_THREADING is on
+  if(MSVC)
+    target_link_libraries(samplerate PRIVATE libomp)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp:llvm")
+  else()
+    find_package(OpenMP REQUIRED)  # selects the runtime that matches the compiler instead of hard-coding -lomp
+    target_link_libraries(samplerate PRIVATE OpenMP::OpenMP_C)  # carries compile and link flags; needs CMake >= 3.9
+  endif()
+endif()
+
 # CMake generates wrong DLL names for MinGW and Cygwin, fix it
 if(BUILD_SHARED_LIBS AND WIN32)
   if(LIBSAMPLERATE_COMPATIBLE_NAME)
diff --git a/src/src_sinc.c b/src/src_sinc.c
index 716c4a40..9fee6fb0 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -130,6 +130,18 @@ static SRC_STATE_VT sinc_mono_state_vt =
 	sinc_close
 } ;
 
+#ifdef MULTI_THREADING
+static SRC_ERROR sinc_multithread_vari_process (SRC_STATE *state, SRC_DATA *data) ;
+static SRC_STATE_VT sinc_multithread_state_vt =	/* installed by sinc_state_new () when MULTI_THREADING is defined */
+{
+	sinc_multithread_vari_process,	/* the same entry point fills the first two slots */
+	sinc_multithread_vari_process,	/* NOTE(review): presumably the vari/const process slots -- confirm against SRC_STATE_VT */
+	sinc_reset,
+	sinc_copy,
+	sinc_close
+} ;
+#endif
+
 static inline increment_t
 double_to_fp (double x)
 {	return (increment_t) (psf_lrint ((x) * FP_ONE)) ;
@@ -203,6 +215,642 @@ sinc_get_description (int src_enum)
 	return NULL ;
 } /* sinc_get_descrition */
 
+#ifdef MULTI_THREADING
+
+#include <omp.h>
+
+#ifdef _MSC_VER
+	#define ALWAYS_INLINE static __forceinline
+	#include <xmmintrin.h>
+	/* Ask the CPU to start loading the cache line that holds *ptr. */
+	#define mem_prefetch(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+
+#elif defined(__GNUC__) || defined(__clang__)
+	#define ALWAYS_INLINE static inline __attribute__((always_inline))
+
+	#define mem_prefetch(ptr) __builtin_prefetch(ptr)
+
+#else
+	#define ALWAYS_INLINE static inline
+	/* No portable prefetch available: expand to a harmless no-op expression. */
+	#define mem_prefetch(ptr) ((void) 0)
+#endif
+
+/* Inputs with fewer samples (input_frames * channels) than this are processed on a single thread to avoid overheads. */
+#define MULTI_THREADING_THRESHOLD (256)
+/* Non-zero compiles in the per-call cache of interpolated coefficients (see the #if MT_COEFFS_CACHING sections). */
+#define MT_COEFFS_CACHING 1
+/* Life cycle of one cache slot: NONE (empty) -> WRITE (being filled) -> READ (filled, may be replayed). */
+enum MT_CACHE_MODE
+{
+	MT_CACHE_NONE = 0,
+	MT_CACHE_READ,
+	MT_CACHE_WRITE
+};
+/* One cache slot: the coefficient sequence recorded for a single start_filter_index. */
+typedef struct mt_cache_t
+{
+	enum MT_CACHE_MODE cache_state;	/* where the slot is in the life cycle above */
+	increment_t start_filter_index;	/* start index the stored sequence was computed for */
+	double *coeffs;					/* heap array of coefficients, in the order the core routine consumed them */
+} mt_cache_t;
+/* The set of cache slots owned by one processing call. */
+typedef struct mt_cache_array_t
+{
+	int len;			/* number of entries in caches; 0 means caching is off */
+	int len2;			/* capacity, in doubles, of every caches[i].coeffs */
+	mt_cache_t *caches;	/* heap array of len slots */
+} mt_cache_array_t;
+
+ALWAYS_INLINE void
+calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len,
+						  const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter,
+						  const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+{
+	/* Compute one output frame: apply both halves of the filter around filter->b_current and write
+	 * `channels` scaled floats to `output`.  Each coefficient is computed (MT_CACHE_NONE), computed and
+	 * stored in cache->coeffs (MT_CACHE_WRITE) or replayed from there (MT_CACHE_READ).  skip_fraction
+	 * drops the interpolation between neighbouring table entries; enable_prefetch turns mem_prefetch on. */
+	double left[MAX_CHANNELS] = {0};
+	double right[MAX_CHANNELS] = {0};
+	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); /* fixed point */
+	const int prefetch_increment = 8; /* how many filter steps ahead to prefetch */
+	int cache_idx = 0; /* next entry of cache->coeffs to read or write */
+
+	{
+		/* First apply the left half of the filter. */
+		increment_t filter_index1 = start_filter_index;
+		const int coeff_count1 = (max_filter_index - filter_index1) / increment;
+		filter_index1 = filter_index1 + coeff_count1 * increment;
+		int data_index1 = filter->b_current - channels * coeff_count1;
+
+		if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */
+		{
+			int steps = int_div_ceil(-data_index1, channels);
+			/* If the assert triggers we would have to take care not to underflow/overflow */
+			assert(steps <= int_div_ceil(filter_index1, increment));
+			filter_index1 -= increment * steps;
+			data_index1 += steps * channels;
+		}
+
+		/* Walk from the outermost tap in towards the centre of the filter. */
+		while (filter_index1 >= MAKE_INCREMENT_T(0))
+		{
+			double coeff;
+
+			if (use_cache == MT_CACHE_READ)
+			{
+				/* Check the entry before reading it, not after. */
+				assert(cache_idx < cache_len);
+				coeff = cache->coeffs[cache_idx++];
+			}
+			else
+			{
+				if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0))
+				{
+					const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx + 1]);
+				}
+
+				const double fraction = fp_to_double(filter_index1);
+				const int indx = fp_to_int(filter_index1);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const coeff_t coeff_val = filter->coeffs[indx];
+				const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx];
+				coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction;
+				assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
+				assert(data_index1 + channels - 1 < filter->b_end);
+
+				if (use_cache == MT_CACHE_WRITE)
+				{
+					/* Check the entry before writing it, not after. */
+					assert(cache_idx < cache_len);
+					cache->coeffs[cache_idx++] = coeff;
+				}
+			}
+
+			/* Accumulate this tap for every channel of the input frame
+			 * that starts at data_index1. */
+			for (int ch = 0; ch < channels; ch++)
+				left[ch] += coeff * filter->buffer[data_index1 + ch];
+
+			filter_index1 -= increment;
+			data_index1 = data_index1 + channels;
+		}
+	}
+
+	{
+		/* Now apply the right half of the filter. */
+		increment_t filter_index2 = increment - start_filter_index;
+		const int coeff_count2 = (max_filter_index - filter_index2) / increment;
+		filter_index2 = filter_index2 + coeff_count2 * increment;
+		int data_index2 = filter->b_current + channels * (1 + coeff_count2);
+		/* Walk from the outermost tap in towards the centre; the body always runs at least once. */
+
+		do
+		{
+			double coeff;
+
+			if (use_cache == MT_CACHE_READ)
+			{
+				/* Check the entry before reading it, not after. */
+				assert(cache_idx < cache_len);
+				coeff = cache->coeffs[cache_idx++];
+			}
+			else
+			{
+				if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0))
+				{
+					const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx + 1]);
+				}
+
+				const double fraction = fp_to_double(filter_index2);
+				const int indx = fp_to_int(filter_index2);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const coeff_t coeff_val = filter->coeffs[indx];
+				const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx];
+				coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction;
+				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
+				assert(data_index2 + channels - 1 < filter->b_end);
+
+				if (use_cache == MT_CACHE_WRITE)
+				{
+					/* Check the entry before writing it, not after. */
+					assert(cache_idx < cache_len);
+					cache->coeffs[cache_idx++] = coeff;
+				}
+			}
+
+			/* Accumulate this tap for every channel of the input frame
+			 * that starts at data_index2. */
+			for (int ch = 0; ch < channels; ch++)
+				right[ch] += coeff * filter->buffer[data_index2 + ch];
+
+			/* The right half moves backwards through the input buffer. */
+			filter_index2 -= increment;
+			data_index2 = data_index2 - channels;
+
+		} while (filter_index2 > MAKE_INCREMENT_T(0));
+	}
+
+	if (use_cache == MT_CACHE_WRITE)
+	{
+		cache->start_filter_index = start_filter_index; /* remember what the sequence belongs to */
+		cache->cache_state = MT_CACHE_READ;             /* the slot may be replayed from now on */
+	}
+
+	for (int ch = 0; ch < channels; ch++)
+		output[ch] = (float)(scale * (left[ch] + right[ch]));
+} /* calc_output_multi_mt_core */
+
+ALWAYS_INLINE void
+calc_output_multi_mt_3(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter,
+					   const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+{
+	/* Pick one of the four specialisations of calc_output_multi_mt_core.  The prefetch and
+	 * skip-fraction switches are handed over as literal 0/1 arguments at each call site. */
+	/* True when neither value has a bit set below SHIFT_BITS; the core routine then uses the
+	 * table entry as it is instead of interpolating towards the next one. */
+	const int whole_steps = (increment == ((increment >> SHIFT_BITS) << SHIFT_BITS))
+							&& (start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS));
+
+	/* Prefetch only for coefficient tables longer than slow_mid_qual_coeffs. */
+	const int long_table = (filter->coeff_half_len > ARRAY_LEN(slow_mid_qual_coeffs.coeffs));
+
+	if (whole_steps && long_table)
+	{
+		calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 1, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else if (whole_steps)
+	{
+		calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 1, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else if (long_table)
+	{
+		calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 0, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else
+	{
+		calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 0, filter, increment, start_filter_index, channels, scale, output);
+	}
+}
+
+ALWAYS_INLINE void
+calc_output_multi_mt_2(mt_cache_array_t *cache_array, const SINC_FILTER *const filter,
+					   const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+{
+	/* Look up the cache slot that belongs to start_filter_index and decide whether this frame replays
+	 * it, fills it, or bypasses caching.  A replay goes straight to the core routine because the
+	 * skip_fraction/enable_prefetch switches have no effect when coefficients come from the cache. */
+	mt_cache_t *slot = NULL;
+	enum MT_CACHE_MODE mode = MT_CACHE_NONE;
+
+	if (cache_array->len)
+	{
+		const int slot_no = (int)(start_filter_index / (increment / (cache_array->len - 1)));
+		if (slot_no < cache_array->len)
+			slot = &cache_array->caches[slot_no];
+	}
+
+	if (slot != NULL)
+	{
+		switch (slot->cache_state)
+		{
+		case MT_CACHE_READ: /* filled: only usable if it was recorded for this very start index */
+			if (slot->start_filter_index == start_filter_index)
+				mode = MT_CACHE_READ;
+			break;
+		case MT_CACHE_NONE: /* empty: claim it and record the coefficients of this frame */
+			slot->cache_state = MT_CACHE_WRITE;
+			mode = MT_CACHE_WRITE;
+			break;
+		default:
+			break;
+		}
+	}
+
+	switch (mode)
+	{
+	case MT_CACHE_READ:
+		calc_output_multi_mt_core(MT_CACHE_READ, slot, cache_array->len2, 0, 0, filter, increment, start_filter_index, channels, scale, output);
+		break;
+	case MT_CACHE_WRITE:
+		calc_output_multi_mt_3(MT_CACHE_WRITE, slot, cache_array->len2, filter, increment, start_filter_index, channels, scale, output);
+		break;
+	default:
+		calc_output_multi_mt_3(MT_CACHE_NONE, slot, cache_array->len2, filter, increment, start_filter_index, channels, scale, output);
+		break;
+	}
+}
+
+ALWAYS_INLINE void
+calc_output_multi_mt(mt_cache_array_t *cache_array,
+					 const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+{
+	/* Forward to calc_output_multi_mt_2, passing the channel count as a literal for 1..16 channels
+	 * (to kick the compile-time optimizer) and as the run-time value for anything else. */
+#define FIXED_CHANNEL_CASE(n) \
+	case (n): calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, (n), scale, output); break
+
+	switch (channels)
+	{
+		FIXED_CHANNEL_CASE(1);
+		FIXED_CHANNEL_CASE(2);
+		FIXED_CHANNEL_CASE(3);
+		FIXED_CHANNEL_CASE(4);
+		FIXED_CHANNEL_CASE(5);
+		FIXED_CHANNEL_CASE(6);
+		FIXED_CHANNEL_CASE(7);
+		FIXED_CHANNEL_CASE(8);
+		FIXED_CHANNEL_CASE(9);
+		FIXED_CHANNEL_CASE(10);
+		FIXED_CHANNEL_CASE(11);
+		FIXED_CHANNEL_CASE(12);
+		FIXED_CHANNEL_CASE(13);
+		FIXED_CHANNEL_CASE(14);
+		FIXED_CHANNEL_CASE(15);
+		FIXED_CHANNEL_CASE(16);
+	default:
+		calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, channels, scale, output);
+		break;
+	}
+#undef FIXED_CHANNEL_CASE
+}
+
+static SRC_ERROR
+_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
+								SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state)
+{	/* Worker run by each of num_of_threads threads: steps through every output frame but computes only those owned by child_no. */
+	if (state->private_data == NULL)
+		return SRC_ERR_NO_PRIVATE;
+	/* `filter` is the filter attached to `state`; `main_filter` is the one attached to main_state on entry. */
+	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
+	SINC_FILTER *main_filter = (SINC_FILTER *)main_state->private_data;
+
+	/* If there is not a problem, this will be optimised out. */
+	if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0]))
+		return SRC_ERR_SIZE_INCOMPATIBILITY;
+
+	const int channels = state->channels;
+	filter->in_count = data->input_frames * channels;
+	filter->out_count = data->output_frames * channels;
+	filter->in_used = filter->out_gen = 0;
+
+	double src_ratio = state->last_ratio;
+
+	if (is_bad_src_ratio(src_ratio))
+		return SRC_ERR_BAD_INTERNAL_STATE;
+
+	/* Check the sample rate ratio wrt the buffer len. */
+	double count = (filter->coeff_half_len + 2.0) / filter->index_inc;
+	if (MIN(state->last_ratio, data->src_ratio) < 1.0)
+		count /= MIN(state->last_ratio, data->src_ratio);
+
+	/* Maximum coefficients on either side of center point. */
+	const int half_filter_chan_len = channels * (int)(psf_lrint(count) + 1);
+
+	double input_index = state->last_position;
+
+	double rem = fmod_one(input_index);
+	filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;
+	input_index = rem;
+
+	const double terminate = 1.0 / src_ratio + 1e-20;
+
+	const long out_count = filter->out_count;
+	const int index_inc = filter->index_inc;
+	/* Per-frame quantities computed once; the loop uses them whenever is_constant_ratio is set. */
+	const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0;
+	const double constant_input_index_inc = 1.0 / src_ratio;
+	const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+	const increment_t constant_increment = double_to_fp(constant_float_increment);
+	const double constant_scale = constant_float_increment / index_inc;
+
+	/* Round-robin owner of the next output frame, and the output buffer the frames are written to. */
+	int interleave_counter = 0;
+	float *const data_out = data->data_out;
+	/* Coefficient cache local to this call; it stays empty (len == 0) unless the block below fills it in. */
+	mt_cache_array_t _cache_array = {0};
+	mt_cache_array_t *const cache_array = &_cache_array;
+
+#if MT_COEFFS_CACHING
+	// Caching once-calculated (interpolated) coeffs in memory is reasonable if the src_ratio is an integer
+	// because only limited number of coeffs are cyclically used in those cases.
+	// Drawback is that the processing speed can fluctuate if the condition (src_ratio) changes.
+
+	if (is_constant_ratio && src_ratio == (int)src_ratio)
+	{
+		/* do { } while (0) only exists so that `break` can abandon the set-up; on failure caching stays off. */
+		do
+		{
+			int len = (int)src_ratio + 1;
+			int len2 = 0;
+			cache_array->caches = (mt_cache_t *)calloc(len, sizeof(mt_cache_t));
+			if (!cache_array->caches)
+			{
+				break;
+			}
+
+			for (int i = 0; i < len; i++)
+			{
+				len2 = ((filter->coeff_half_len + 2) / filter->index_inc + filter->index_inc);
+				cache_array->caches[i].coeffs = (double *)calloc(len2, sizeof(double));
+				if (!cache_array->caches[i].coeffs)
+				{
+					for (int j = 0; j < i; j++)
+					{
+						free(cache_array->caches[j].coeffs);
+						cache_array->caches[j].coeffs = NULL;
+					}
+					free(cache_array->caches);
+					cache_array->caches = NULL;
+					break;
+				}
+			}
+			/* len/len2 are only published when every allocation succeeded. */
+			if (cache_array->caches)
+			{
+				cache_array->len = len;
+				cache_array->len2 = len2;
+			}
+
+		} while (0);
+	}
+#endif
+
+	int rtn = SRC_ERR_NO_ERROR;
+	/* Main processing loop: one iteration per output frame. */
+	while (filter->out_gen < out_count)
+	{
+		/* Need to reload buffer? */
+		int samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
+
+		if (samples_in_hand <= half_filter_chan_len)
+		{
+			/* One thread refills the sample buffer; the barriers hold the rest of the team until it is done. */
+			{
+				#pragma omp barrier
+				#pragma omp single
+				{
+					state->error = prepare_data(filter, channels, data, half_filter_chan_len);
+					/* Publish the refreshed state and filter through main_state / main_filter. */
+					*main_state = *state;
+					*main_filter = *filter;
+				}
+				#pragma omp barrier
+				{	/* every thread adopts the published copies */
+					*state = *main_state;
+					*filter = *main_filter;
+				}
+			}
+			/* NOTE(review): the struct copies above also copy private_data -- confirm the caller re-attaches the right filter. */
+			if (state->error != 0)
+			{
+				rtn = state->error;
+				break;
+			}
+
+			samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
+			if (samples_in_hand <= half_filter_chan_len)
+				break;
+		};
+
+		/* This is the termination condition. */
+		if (filter->b_real_end >= 0)
+		{
+			// This switching is necessary to match the outputs to the current (0.22) single-thread implementation.
+			// However, the (single-thread) implementation may have some underlying bug because the number of output frames is seemingly
+			// inconsistent depending on the combinations of src_ratio, input frames, and number of channels.
+			if (channels == 1)
+			{
+				if (filter->b_current + input_index + terminate > filter->b_real_end)
+					break;
+			}
+			else
+			{
+				if (filter->b_current + input_index + terminate >= filter->b_real_end)
+					break;
+			}
+		};
+
+		double scale, float_increment;
+		increment_t increment;
+		if (!is_constant_ratio)
+		{
+			if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
+				src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
+
+			float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+			increment = double_to_fp(float_increment);
+			scale = float_increment / index_inc;
+		}
+		else
+		{
+			float_increment = constant_float_increment;
+			increment = constant_increment;
+			scale = constant_scale;
+		}
+
+		increment_t start_filter_index = double_to_fp(input_index * float_increment);
+		/* Only the owner computes the frame; the bookkeeping below is done by every thread. */
+		if (child_no == interleave_counter)
+		{
+			calc_output_multi_mt(cache_array, filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
+		}
+		if (++interleave_counter == num_of_threads)
+			interleave_counter = 0;
+		filter->out_gen += channels;
+
+		/* Figure out the next index. */
+		input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio;
+		rem = fmod_one(input_index);
+
+		filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem));
+		if (filter->b_current >= filter->b_len)
+			filter->b_current -= filter->b_len;
+		input_index = rem;
+	};
+
+#if MT_COEFFS_CACHING
+	{
+		if (cache_array->len)
+		{
+			for (int i = 0; i < cache_array->len; i++)
+			{
+				free(cache_array->caches[i].coeffs);
+				cache_array->caches[i].coeffs = NULL;
+			}
+			free(cache_array->caches);
+			cache_array->caches = NULL;
+			cache_array->len = 0;
+		}
+	}
+#endif
+
+	if (rtn)
+		return rtn;
+	/* No error: store the resume position and the progress counters. */
+	state->last_position = input_index;
+
+	/* Save current ratio rather then target ratio. */
+	state->last_ratio = src_ratio;
+
+	data->input_frames_used = filter->in_used / channels;
+	data->output_frames_gen = filter->out_gen / channels;
+
+	return SRC_ERR_NO_ERROR;
+}
+
+static SRC_ERROR
+sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
+{
+	/*
+	 * Process entry point of the multi-threaded sinc converter.
+	 *
+	 * Small inputs (fewer than MULTI_THREADING_THRESHOLD samples) and single-core machines are handled
+	 * by one direct call to the worker.  Otherwise an OpenMP team is started in which every thread runs
+	 * the worker on private copies of *state, *data and the SINC_FILTER; the copies share the filter's
+	 * sample buffer and the caller's output buffer.  On success the copies of thread 0 are written back.
+	 *
+	 * Returns SRC_ERR_NO_PRIVATE if the state has no filter, SRC_ERR_MALLOC_FAILED if the per-thread
+	 * copies cannot be allocated, the first error reported by a worker, or SRC_ERR_NO_ERROR.
+	 */
+	if (state->private_data == NULL)
+		return SRC_ERR_NO_PRIVATE;
+
+	SINC_FILTER *const filter = (SINC_FILTER *)state->private_data;
+	float *const shared_buffer = filter->buffer;
+	const long in_count = data->input_frames * state->channels;
+
+	const int N_OF_CORES = omp_get_num_procs();
+	const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD);
+	const int num_of_threads = should_be_single_thread ? 1 : N_OF_CORES;
+
+	/* Nothing to parallelise (or the OpenMP runtime is limited to one thread): run the worker in place. */
+	if (num_of_threads == 1 || omp_get_max_threads() == 1)
+		return _sinc_multichan_vari_process_mt(1, 0, state, data, state);
+
+	SRC_STATE *per_thread_state = malloc(num_of_threads * sizeof *per_thread_state);
+	SRC_DATA *per_thread_data = malloc(num_of_threads * sizeof *per_thread_data);
+	SINC_FILTER *per_thread_filter = malloc(num_of_threads * sizeof *per_thread_filter);
+	SRC_ERROR *per_thread_retval = malloc(num_of_threads * sizeof *per_thread_retval);
+
+	SRC_ERROR retval = SRC_ERR_MALLOC_FAILED;
+
+	if (!per_thread_state || !per_thread_data || !per_thread_filter || !per_thread_retval)
+		goto cleanup_and_return;
+
+	/* A team smaller than requested leaves some slots unused; they must not look like failures. */
+	for (int slot = 0; slot < num_of_threads; slot++)
+		per_thread_retval[slot] = SRC_ERR_NO_ERROR;
+
+	/*
+	 * A plain parallel region (one worker call per thread) is used instead of `parallel for`:
+	 * the worker executes `omp barrier` and `omp single`, which OpenMP does not allow inside a
+	 * worksharing loop.  The team size is requested with the num_threads clause so that the
+	 * process-wide OpenMP settings of the host application are left untouched, and the workers are
+	 * told the size of the team that was actually created so that every output frame has an owner.
+	 */
+	#pragma omp parallel num_threads(num_of_threads)
+	{
+		const int team_size = omp_get_num_threads();
+		const int child_no = omp_get_thread_num();
+
+		memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
+		memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
+
+		memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
+		per_thread_state[child_no].private_data = &per_thread_filter[child_no];
+
+		/* Every copy keeps pointing at the one shared sample buffer. */
+		per_thread_filter[child_no].buffer = shared_buffer;
+
+		per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
+			team_size, child_no,
+			&per_thread_state[child_no], &per_thread_data[child_no], state);
+	}
+
+	// error checking for each worker
+	for (int child_no = 0; child_no < num_of_threads; child_no++)
+	{
+		if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
+		{
+			retval = per_thread_retval[child_no];
+			goto cleanup_and_return;
+		}
+	}
+
+	/* Every worker steps through all frames, so the copies of thread 0 hold the final state. */
+	memcpy(filter, &per_thread_filter[0], sizeof(SINC_FILTER));
+	filter->buffer = shared_buffer;
+
+	memcpy(state, &per_thread_state[0], sizeof(SRC_STATE));
+	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
+
+	retval = SRC_ERR_NO_ERROR;
+
+cleanup_and_return:
+
+	/*
+	 * While reloading the buffer a worker copies its private SRC_STATE over *state, which leaves
+	 * state->private_data pointing into per_thread_filter.  Re-attach the real filter on every
+	 * path (including worker errors) before that array is freed.
+	 */
+	state->private_data = filter;
+
+	free(per_thread_state);
+	free(per_thread_data);
+	free(per_thread_filter);
+	free(per_thread_retval);
+
+	return retval;
+}
+
+#endif /* MULTI_THREADING*/
+
 static SINC_FILTER *
 sinc_filter_new (int converter_type, int channels)
 {
@@ -282,6 +930,9 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error)
 	state->channels = channels ;
 	state->mode = SRC_MODE_PROCESS ;
 
+	#ifdef MULTI_THREADING
+		state->vt = &sinc_multithread_state_vt ;
+	#else
 	if (state->channels == 1)
 		state->vt = &sinc_mono_state_vt ;
 	else if (state->channels == 2)
@@ -292,6 +943,7 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error)
 		state->vt = &sinc_hex_state_vt ;
 	else
 		state->vt = &sinc_multichan_state_vt ;
+	#endif
 
 	state->private_data = sinc_filter_new (converter_type, state->channels) ;
 	if (!state->private_data)
diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c
index 5cab44a1..b0aa2ea5 100644
--- a/tests/multichan_throughput_test.c
+++ b/tests/multichan_throughput_test.c
@@ -36,17 +36,21 @@ static float input [BUFFER_LEN] ;
 
 #if (defined(ENABLE_SINC_FAST_CONVERTER) || defined(ENABLE_SINC_MEDIUM_CONVERTER) || \
 	defined(ENABLE_SINC_BEST_CONVERTER))
-static float output [BUFFER_LEN] ;
+static float output [BUFFER_LEN*2] ;
 
 static void
-throughput_test (int converter, int channels, long *best_throughput)
+throughput_test (int converter, int channels, long *best_throughput, double src_ratio)
 {	SRC_DATA src_data ;
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+	struct timespec start_gettime, finish_gettime;
+#else
 	clock_t start_time, clock_time ;
+#endif
 	double duration ;
 	long total_frames = 0, throughput ;
 	int error ;
 
-	printf ("    %-30s     %2d         ", src_get_name (converter), channels) ;
+	printf ("    %-30s   %2d     ", src_get_name (converter), channels) ;
 	fflush (stdout) ;
 
 	src_data.data_in = input ;
@@ -55,7 +59,7 @@ throughput_test (int converter, int channels, long *best_throughput)
 	src_data.data_out = output ;
 	src_data.output_frames = ARRAY_LEN (output) / channels ;
 
-	src_data.src_ratio = 0.99 ;
+	src_data.src_ratio = src_ratio ;
 
 #ifdef _WIN32
 	Sleep (2000) ;
@@ -63,7 +67,11 @@ throughput_test (int converter, int channels, long *best_throughput)
 	sleep (2) ;
 #endif
 
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
+#else
 	start_time = clock () ;
+#endif
 
 	do
 	{
@@ -74,28 +82,37 @@ throughput_test (int converter, int channels, long *best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
+
+		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);
+		duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0;
+#else
 		clock_time = clock () - start_time ;
 		duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+#endif
 	}
 	while (duration < 5.0) ;
 
-	if (src_data.input_frames_used != src_data.input_frames)
-	{	printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
-		exit (1) ;
-		} ;
-
-	if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
-	{	printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
-		printf ("    input len  : %d\n", ARRAY_LEN (input) / channels) ;
-		printf ("    output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
-				floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
-		exit (1) ;
-		} ;
+	if ( src_ratio <= 1.0 ){
+		if (src_data.input_frames_used != src_data.input_frames)
+		{	printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
+			exit (1) ;
+			} ;
+	
+		if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+		{	printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+			printf ("    input len  : %d\n", ARRAY_LEN (input) / channels) ;
+			printf ("    output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+					floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+			exit (1) ;
+			} ;
+	}
 
 	throughput = lrint (floor (total_frames / duration)) ;
 
 	if (!best_throughput)
-	{	printf ("%5.2f      %10ld\n", duration, throughput) ;
+	{	printf ("%5.2f      %10ld (x%7.2f)\n", duration, throughput, (throughput/src_ratio/44100)) ;
 		}
 	else
 	{	*best_throughput = MAX (throughput, *best_throughput) ;
@@ -116,31 +133,38 @@ single_run (void)
 
 	printf ("\n    CPU name : %s\n", get_cpu_name ()) ;
 
+	double src_ratio[] = {0.99, 2.0, 7.0, 0.25};
+
+for( int i=0 ; i<sizeof(src_ratio)/sizeof(double) ; i++ ){
+	printf ("\n    SRC_RATIO : %.2lf\n", src_ratio[i]) ;
+	
 	puts (
 		"\n"
-		"    Converter                        Channels    Duration      Throughput\n"
-		"    ---------------------------------------------------------------------"
+		"    Converter                        Ch    Duration  Throughput (times faster than realtime if 44.1k in)\n"
+		"    -------------------------------------------------------------------------"
 		) ;
 
 #ifdef ENABLE_SINC_FAST_CONVERTER
 	for (k = 1 ; k <= max_channels / 2 ; k++)
-		throughput_test (SRC_SINC_FASTEST, k, 0) ;
+		throughput_test (SRC_SINC_FASTEST, k, 0, src_ratio[i]) ;
 
 	puts ("") ;
 #endif
 
 #ifdef ENABLE_SINC_MEDIUM_CONVERTER
 	for (k = 1 ; k <= max_channels / 2 ; k++)
-		throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0) ;
+		throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0, src_ratio[i]) ;
 
 	puts ("") ;
 #endif
 
 #ifdef ENABLE_SINC_BEST_CONVERTER
 	for (k = 1 ; k <= max_channels ; k++)
-		throughput_test (SRC_SINC_BEST_QUALITY, k, 0) ;
+		throughput_test (SRC_SINC_BEST_QUALITY, k, 0, src_ratio[i]) ;
 	puts ("") ;
 #endif
+
+}
 	return ;
 } /* single_run */
 
@@ -152,7 +176,7 @@ multi_run (int run_count)
 
 	puts (
 		"\n"
-		"    Converter                        Channels    Duration      Throughput    Best Throughput\n"
+		"    Converter                        Ch    Duration      Throughput    Best Throughput\n"
 		"    ----------------------------------------------------------------------------------------"
 		) ;
 
@@ -172,13 +196,13 @@ multi_run (int run_count)
 		for (int k = 0 ; k < run_count ; k++)
 		{
 #ifdef ENABLE_SINC_FAST_CONVERTER
-			throughput_test (SRC_SINC_FASTEST, ch, &sinc_fastest) ;
+			throughput_test (SRC_SINC_FASTEST, ch, &sinc_fastest, 0.99) ;
 #endif
 #ifdef ENABLE_SINC_MEDIUM_CONVERTER
-			throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, &sinc_medium) ;
+			throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, &sinc_medium, 0.99) ;
 #endif
 #ifdef ENABLE_SINC_BEST_CONVERTER
-			throughput_test (SRC_SINC_BEST_QUALITY, ch, &sinc_best) ;
+			throughput_test (SRC_SINC_BEST_QUALITY, ch, &sinc_best, 0.99) ;
 #endif
 
 			puts ("") ;
diff --git a/tests/throughput_test.c b/tests/throughput_test.c
index e9974800..f4181897 100644
--- a/tests/throughput_test.c
+++ b/tests/throughput_test.c
@@ -38,7 +38,11 @@ static float output [BUFFER_LEN] ;
 static long
 throughput_test (int converter, long best_throughput)
 {	SRC_DATA src_data ;
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+	struct timespec start_gettime, finish_gettime;
+#else
 	clock_t start_time, clock_time ;
+#endif
 	double duration ;
 	long total_frames = 0, throughput ;
 	int error ;
@@ -60,7 +64,11 @@ throughput_test (int converter, long best_throughput)
 	sleep (2) ;
 #endif
 
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
+#else
 	start_time = clock () ;
+#endif
 
 	do
 	{
@@ -71,11 +79,18 @@ throughput_test (int converter, long best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
+#if !defined(_WIN32) && defined(MULTI_THREADING)
+		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
+
+		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);
+		duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0;
+#else
 		clock_time = clock () - start_time ;
 #ifdef __GNU__ /* Clock resolution is 10ms on GNU/Hurd */
 		duration = (10000.0 * clock_time) / CLOCKS_PER_SEC ;
 #else
 		duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+#endif
 #endif
 	}
 	while (duration < 3.0) ;