From 14fa5f923f0736cd27510578921749bb9ebfe087 Mon Sep 17 00:00:00 2001
From: selim mustafaev <selim@fastmail.fm>
Date: Mon, 6 Nov 2017 17:05:51 +0300
Subject: [PATCH] Added some code to support playing audio

---
 CMakeLists.txt                    |   5 +-
 examples/CMakeLists.txt           |  10 +
 examples/ffConv.cpp               |   2 +-
 examples/ffPlayer.cpp             |  66 ++-
 include/ffcpp/Frame.h             |   2 +
 include/ffcpp/Player.h            |  39 +-
 include/ffcpp/Resampler.h         |   7 +-
 include/ffcpp/atomicops.h         | 665 +++++++++++++++++++++++
 include/ffcpp/readerwriterqueue.h | 854 ++++++++++++++++++++++++++++++
 src/CMakeLists.txt                |   4 +-
 src/Codec.cpp                     |   4 +-
 src/Frame.cpp                     |  10 +
 src/MediaFile.cpp                 |   4 +
 src/Player.cpp                    |  98 +++-
 src/Resampler.cpp                 |  18 +-
 15 files changed, 1745 insertions(+), 43 deletions(-)
 create mode 100644 include/ffcpp/atomicops.h
 create mode 100644 include/ffcpp/readerwriterqueue.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34161c5..dae08a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,9 @@
 cmake_minimum_required(VERSION 3.5)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14  -ggdb -O2")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++14 -ggdb -O0")
+set(CMAKE_CXX_FLAGS "-std=c++14 -g -O2 -pthread")
+set(CMAKE_CXX_FLAGS_DEBUG "-ggdb -O0 -pthread")
+SET(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -pthread")
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/lib)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f9a1253..6038e81 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,3 +1,13 @@
+option(BUILD_WITH_TSAN "Build with thread sanitizer" OFF)
+option(BUILD_WITH_ASAN "Build with address sanitizer" OFF)
+
+if(BUILD_WITH_TSAN)
+    SET(THREAD_SANITIZER_FLAG "-fsanitize=thread")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${THREAD_SANITIZER_FLAG}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${THREAD_SANITIZER_FLAG}")
+    SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${THREAD_SANITIZER_FLAG} -ltsan" )
+endif()
+
 project(ffConv)
 add_executable(ffConv ffConv.cpp)
 add_dependencies(ffConv ffcpp)
diff --git a/examples/ffConv.cpp b/examples/ffConv.cpp
index 46ca8e7..c6cd474 100644
--- a/examples/ffConv.cpp
+++ b/examples/ffConv.cpp
@@ -43,7 +43,7 @@ int main(int argc, char** argv) {
 	auto outVStream = output.addVideoStream(AV_CODEC_ID_H264, VIDEO_WIDTH, outHeight, vDecoder->timeBase(), AV_PIX_FMT_YUV420P);
 	auto vEncoder = outVStream->codec();
 
-	auto outAStream = output.addAudioStream(AV_CODEC_ID_VORBIS, 2, 44100, AV_SAMPLE_FMT_FLTP);
+	auto outAStream = output.addAudioStream(AV_CODEC_ID_AC3, 2, 44100, AV_SAMPLE_FMT_FLTP);
 	auto aEncoder = outAStream->codec();
 
 	output.writeHeader();
diff --git a/examples/ffPlayer.cpp b/examples/ffPlayer.cpp
index b9f7b4b..f87355b 100644
--- a/examples/ffPlayer.cpp
+++ b/examples/ffPlayer.cpp
@@ -11,7 +11,7 @@ namespace ff = ffcpp;
 #define WINDOW_WIDTH    640
 #define WINDOW_HEIGHT   480
 
-class SDLWindow: public ff::IVideoSink {
+class SDLWindow: public ff::IVideoSink, public ff::IAudioSink {
 private:
     template<typename T> using SDLUniquePtr = std::unique_ptr<T, void(*)(T*)>;
     using SDLWindowPtr = SDLUniquePtr<SDL_Window>;
@@ -26,9 +26,10 @@ private:
     SDL_AudioDeviceID _aDevId;
 
     std::packaged_task<void()> _renderTask;
+    ff::IAudioSource* _audioSrc;
 
 public:
-    SDLWindow(): _wnd(nullptr, SDL_DestroyWindow), _renderer(nullptr, SDL_DestroyRenderer), _texture(nullptr, SDL_DestroyTexture) {
+    SDLWindow(): _wnd(nullptr, SDL_DestroyWindow), _renderer(nullptr, SDL_DestroyRenderer), _texture(nullptr, SDL_DestroyTexture), _audioSrc(nullptr) {
         int res = SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_TIMER);
         if(res < 0) throw std::runtime_error("Error initializing SDL");
 
@@ -44,13 +45,16 @@ public:
         SDL_AudioSpec want;
         SDL_zero(want);
         want.freq = 44100;
-        want.format = AUDIO_S16;
+        want.format = AUDIO_F32;
         want.channels = 2;
-        want.samples = 4096;
+        want.samples = 8192;
         want.callback = SDLWindow::audioCallback;
+        want.userdata = this;
 
         _aDevId = SDL_OpenAudioDevice(nullptr, 0, &want, &_audioSpec, SDL_AUDIO_ALLOW_ANY_CHANGE);
         if(_aDevId == 0) throw std::runtime_error("Error opening audio device");
+
+        SDL_PauseAudioDevice(_aDevId, 0);
     }
 
     void handleEvents() {
@@ -71,10 +75,26 @@ public:
 
 private:
     static void audioCallback(void* userdata, Uint8* stream, int len) {
+        auto* self = static_cast<SDLWindow*>(userdata);
 
+        // SDL2 hands the callback an uninitialized buffer, so pre-fill it with
+        // silence; otherwise garbage is played while no source is attached.
+        SDL_memset(stream, self->_audioSpec.silence, len);
+        if(ff::IAudioSource* src = self->_audioSrc) src->fillSampleBuffer(stream, len);
     }
 
-public:
+    AVSampleFormat sdlToFFMpeg(SDL_AudioFormat format) {
+        switch (format) {
+            case AUDIO_S16: return AV_SAMPLE_FMT_S16;
+            case AUDIO_S32: return AV_SAMPLE_FMT_S32;
+            case AUDIO_F32: return AV_SAMPLE_FMT_FLT;
+            default:
+                throw std::runtime_error("unknown audio sample format: " + std::to_string(format));
+        }
+    }
+
+    // IVideoSink implementation
+private:
     virtual AVPixelFormat getPixelFormat() const noexcept override {
         return AV_PIX_FMT_YUV420P;
     }
@@ -102,17 +122,41 @@ public:
 
         future.get();
     }
+
+    // IAudioSink implementation
+private:
+    void setAudioSource(ff::IAudioSource* audioSrc) override {
+        std::cout << "set audio source" << std::endl;
+        _audioSrc = audioSrc;
+    }
+
+    AVSampleFormat getSampleFormat() override {
+        return sdlToFFMpeg(_audioSpec.format);
+    }
+
+    int getChannelsCount() override {
+        return _audioSpec.channels;
+    }
+
+    int getSampleRate() override {
+        return _audioSpec.freq;
+    }
 };
 
 int main(int argc, char** argv) {
-    auto wnd = std::make_shared<SDLWindow>();
+    try {
+        auto wnd = std::make_shared<SDLWindow>();
 
-    ff::Player player(wnd);
-    player.setMedia(argv[1]);
-    player.setVideoSize(WINDOW_WIDTH, WINDOW_HEIGHT);
-    player.play();
+        ff::Player player(wnd, wnd);
+        player.setMedia(argv[1]);
+        player.setVideoSize(WINDOW_WIDTH, WINDOW_HEIGHT);
+        player.play();
 
-    wnd->handleEvents();
+        wnd->handleEvents();
+    } catch (...) {
+        std::cerr << "exception" << std::endl;  // diagnostics belong on stderr
+        return 1;  // non-zero status so callers/scripts can detect the failure
+    }
 
     return 0;
 }
\ No newline at end of file
diff --git a/include/ffcpp/Frame.h b/include/ffcpp/Frame.h
index c035c48..e264fc1 100644
--- a/include/ffcpp/Frame.h
+++ b/include/ffcpp/Frame.h
@@ -35,6 +35,8 @@ namespace ffcpp {
 		void setPts(int pts);
 		bool isKeyFrame() const;
         int pts() const;
+		void guessChannelLayout();
+        int size() const;
 	};
 
 }
diff --git a/include/ffcpp/Player.h b/include/ffcpp/Player.h
index e4bbd4f..3272f4c 100644
--- a/include/ffcpp/Player.h
+++ b/include/ffcpp/Player.h
@@ -4,11 +4,14 @@
 #include "ffcpp/MediaFile.h"
 #include "ffcpp/Scaler.h"
 #include "TSQueue.h"
+#include "Resampler.h"
+#include "readerwriterqueue.h"
 #include <memory>
 #include <thread>
 #include <condition_variable>
 #include <mutex>
 #include <cstdint>
+#include <cstdio>
 
 namespace ffcpp {
 
@@ -19,9 +22,15 @@ namespace ffcpp {
                                         int uPitch, int vPitch) = 0;
     };
 
+    struct IAudioSource {
+        virtual void fillSampleBuffer(uint8_t *data, int length) = 0;
+    };
+
     struct IAudioSink {
-        virtual void setPauseCallback(std::function<void(bool)> callback) = 0;
-        virtual void setAudioDataCallback(std::function<void(uint8_t*,size_t)> callback) = 0;
+        virtual void setAudioSource(IAudioSource* audioSrc) = 0;
+        virtual AVSampleFormat getSampleFormat() = 0;
+        virtual int getChannelsCount() = 0;
+        virtual int getSampleRate() = 0;
     };
 
     enum class PlayerState {
@@ -31,23 +40,36 @@ namespace ffcpp {
         Paused
     };
 
-    class Player {
+    class Player: private IAudioSource {
+    private:
+        static constexpr size_t AUDIO_BUFFER_LENGTH = 16*1024;
+
+    private:
+        typedef moodycamel::ReaderWriterQueue<FramePtr> FrameQueue;
+
     private:
         std::shared_ptr<IVideoSink> _vSink;
+        std::shared_ptr<IAudioSink> _aSink;
         std::unique_ptr<MediaFile> _curMedia;
         StreamPtr _aStream;
         StreamPtr _vStream;
         ScalerPtr _scaler;
+        ResamplerPtr _resampler;
         PlayerState _state;
 
-        TSQueue<Frame> _decodedFrames;
-        std::thread _decodeThread;
-        std::thread _vPlayThread;
+        std::unique_ptr<uint8_t[]> _aSamplesBuffer;
+        int _samplesInBuffer;
+        FILE* _asFile;
+
         std::mutex _mutex;
         std::condition_variable _stateCond;
+        FrameQueue _videoFrames;
+        FrameQueue _audioFrames;
+        std::thread _decodeThread;
+        std::thread _vPlayThread;
 
     public:
-        Player(std::shared_ptr<IVideoSink> vSink);
+        Player(std::shared_ptr<IVideoSink> vSink, std::shared_ptr<IAudioSink> aSink);
         ~Player();
 
         void setMedia(std::string path);
@@ -57,6 +79,9 @@ namespace ffcpp {
     private:
         void decode();
         void displayFrames();
+
+    private:
+        void fillSampleBuffer(uint8_t *data, int length) override;
     };
 
 }
diff --git a/include/ffcpp/Resampler.h b/include/ffcpp/Resampler.h
index 2e0e3c8..3c518d2 100644
--- a/include/ffcpp/Resampler.h
+++ b/include/ffcpp/Resampler.h
@@ -10,16 +10,19 @@ extern "C" {
 
 namespace ffcpp {
 
+	typedef std::shared_ptr<class Resampler> ResamplerPtr;
+
 	class Resampler {
 	private:
 		SwrContext* _swrContext;
+		int _dstChannelCount;
 		int _dstChannelLayout;
 		AVSampleFormat _dstSampleFormat;
 		int _dstSampleRate;
 
 	public:
-		Resampler(int inChannelLayout, int inSampleRate, AVSampleFormat inSampleFormat,
-		          int outChannelLayout, int outSampleRate, AVSampleFormat outSampleFormat);
+		Resampler(int inChannelCount, int inChannelLayout, int inSampleRate, AVSampleFormat inSampleFormat,
+				  int outChannelCount, int outChannelLayout, int outSampleRate, AVSampleFormat outSampleFormat);
 		Resampler(CodecPtr decoder, CodecPtr encoder);
 		~Resampler();
 
diff --git a/include/ffcpp/atomicops.h b/include/ffcpp/atomicops.h
new file mode 100644
index 0000000..47c76b8
--- /dev/null
+++ b/include/ffcpp/atomicops.h
@@ -0,0 +1,665 @@
+﻿// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
+// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
+// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
+// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
+
+#include <cassert>
+#include <type_traits>
+#include <cerrno>
+#include <cstdint>
+#include <ctime>
+
+// Platform detection
+#if defined(__INTEL_COMPILER)
+#define AE_ICC
+#elif defined(_MSC_VER)
+#define AE_VCPP
+#elif defined(__GNUC__)
+#define AE_GCC
+#endif
+
+#if defined(_M_IA64) || defined(__ia64__)
+#define AE_ARCH_IA64
+#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
+#define AE_ARCH_X64
+#elif defined(_M_IX86) || defined(__i386__)
+#define AE_ARCH_X86
+#elif defined(_M_PPC) || defined(__powerpc__)
+#define AE_ARCH_PPC
+#else
+#define AE_ARCH_UNKNOWN
+#endif
+
+
+// AE_UNUSED
+#define AE_UNUSED(x) ((void)x)
+
+
+// AE_FORCEINLINE
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_FORCEINLINE __forceinline
+#elif defined(AE_GCC)
+//#define AE_FORCEINLINE __attribute__((always_inline)) 
+#define AE_FORCEINLINE inline
+#else
+#define AE_FORCEINLINE inline
+#endif
+
+
+// AE_ALIGN
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_ALIGN(x) __declspec(align(x))
+#elif defined(AE_GCC)
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#else
+// Assume GCC compliant syntax...
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+// Portable atomic fences implemented below:
+
+namespace moodycamel {
+
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+
+	// memory_order_sync: Forces a full sync:
+	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
+	memory_order_sync = memory_order_seq_cst
+};
+
+}    // end namespace moodycamel
+
+#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || defined(AE_ICC)
+// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
+
+#include <intrin.h>
+
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+#define AeFullSync _mm_mfence
+#define AeLiteSync _mm_mfence
+#elif defined(AE_ARCH_IA64)
+#define AeFullSync __mf
+#define AeLiteSync __mf
+#elif defined(AE_ARCH_PPC)
+#include <ppcintrinsics.h>
+#define AeFullSync __sync
+#define AeLiteSync __lwsync
+#endif
+
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
+#ifdef __cplusplus_cli
+#pragma managed(push, off)
+#endif
+#endif
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst: _ReadWriteBarrier(); break;
+		default: assert(false);
+	}
+}
+
+// x86/x64 have a strong memory model -- all loads and stores have
+// acquire and release semantics automatically (so only need compiler
+// barriers for those).
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#else
+AE_FORCEINLINE void fence(memory_order order)
+{
+	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
+	switch (order) {
+		case memory_order_relaxed:
+			break;
+		case memory_order_acquire:
+			_ReadBarrier();
+			AeLiteSync();
+			_ReadBarrier();
+			break;
+		case memory_order_release:
+			_WriteBarrier();
+			AeLiteSync();
+			_WriteBarrier();
+			break;
+		case memory_order_acq_rel:
+			_ReadWriteBarrier();
+			AeLiteSync();
+			_ReadWriteBarrier();
+			break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#endif
+}    // end namespace moodycamel
+#else
+// Use standard library of atomics
+#include <atomic>
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+AE_FORCEINLINE void fence(memory_order order)
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+}    // end namespace moodycamel
+
+#endif
+
+
+#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
+#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#endif
+
+#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#include <atomic>
+#endif
+#include <utility>
+
+// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
+// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
+// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
+// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
+namespace moodycamel {
+template<typename T>
+class weak_atomic
+{
+public:
+	weak_atomic() { }
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
+#endif
+	template<typename U> weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
+#ifdef __cplusplus_cli
+	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
+	weak_atomic(nullptr_t) : value(nullptr) {  }
+#endif
+	weak_atomic(weak_atomic const& other) : value(other.value) {  }
+	weak_atomic(weak_atomic&& other) : value(std::move(other.value)) {  }
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
+
+	AE_FORCEINLINE operator T() const { return load(); }
+
+	
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) { value = std::forward<U>(x); return *this; }
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) { value = other.value; return *this; }
+	
+	AE_FORCEINLINE T load() const { return value; }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment)
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment)
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+#else
+	template<typename U>
+	AE_FORCEINLINE weak_atomic const& operator=(U&& x)
+	{
+		value.store(std::forward<U>(x), std::memory_order_relaxed);
+		return *this;
+	}
+	
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other)
+	{
+		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		return *this;
+	}
+
+	AE_FORCEINLINE T load() const { return value.load(std::memory_order_relaxed); }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment)
+	{
+		return value.fetch_add(increment, std::memory_order_acquire);
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment)
+	{
+		return value.fetch_add(increment, std::memory_order_release);
+	}
+#endif
+	
+
+private:
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	// No std::atomic support, but still need to circumvent compiler optimizations.
+	// `volatile` will make memory access slow, but is guaranteed to be reliable.
+	volatile T value;
+#else
+	std::atomic<T> value;
+#endif
+};
+
+}	// end namespace moodycamel
+
+
+
+// Portable single-producer, single-consumer semaphore below:
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
+	// portable + lightweight semaphore implementations, originally from
+	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+	// LICENSE:
+	// Copyright (c) 2015 Jeff Preshing
+	//
+	// This software is provided 'as-is', without any express or implied
+	// warranty. In no event will the authors be held liable for any damages
+	// arising from the use of this software.
+	//
+	// Permission is granted to anyone to use this software for any purpose,
+	// including commercial applications, and to alter it and redistribute it
+	// freely, subject to the following restrictions:
+	//
+	// 1. The origin of this software must not be misrepresented; you must not
+	//    claim that you wrote the original software. If you use this software
+	//    in a product, an acknowledgement in the product documentation would be
+	//    appreciated but is not required.
+	// 2. Altered source versions must be plainly marked as such, and must not be
+	//    misrepresented as being the original software.
+	// 3. This notice may not be removed or altered from any source distribution.
+	namespace spsc_sema
+	{
+#if defined(_WIN32)
+		class Semaphore
+		{
+		private:
+		    void* m_hSema;
+		    
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        const long maxLong = 0x7fffffff;
+		        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		    }
+
+		    ~Semaphore()
+		    {
+		        CloseHandle(m_hSema);
+		    }
+
+		    void wait()
+		    {
+		    	const unsigned long infinite = 0xffffffff;
+		        WaitForSingleObject(m_hSema, infinite);
+		    }
+
+			bool try_wait()
+			{
+				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
+				return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT;
+			}
+
+			bool timed_wait(std::uint64_t usecs)
+			{
+				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
+				return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT;
+			}
+
+		    void signal(int count = 1)
+		    {
+		        ReleaseSemaphore(m_hSema, count, nullptr);
+		    }
+		};
+#elif defined(__MACH__)
+		//---------------------------------------------------------
+		// Semaphore (Apple iOS and OSX)
+		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    semaphore_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		    }
+
+		    ~Semaphore()
+		    {
+		        semaphore_destroy(mach_task_self(), m_sema);
+		    }
+
+		    void wait()
+		    {
+		        semaphore_wait(m_sema);
+		    }
+
+			bool try_wait()
+			{
+				return timed_wait(0);
+			}
+
+			bool timed_wait(std::int64_t timeout_usecs)
+			{
+				mach_timespec_t ts;
+				ts.tv_sec = timeout_usecs / 1000000;
+				ts.tv_nsec = (timeout_usecs % 1000000) * 1000;
+
+				// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+				kern_return_t rc = semaphore_timedwait(m_sema, ts);
+
+				return rc != KERN_OPERATION_TIMED_OUT;
+			}
+
+		    void signal()
+		    {
+		        semaphore_signal(m_sema);
+		    }
+
+		    void signal(int count)
+		    {
+		        while (count-- > 0)
+		        {
+		            semaphore_signal(m_sema);
+		        }
+		    }
+		};
+#elif defined(__unix__)
+		//---------------------------------------------------------
+		// Semaphore (POSIX, Linux)
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    sem_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        sem_init(&m_sema, 0, initialCount);
+		    }
+
+		    ~Semaphore()
+		    {
+		        sem_destroy(&m_sema);
+		    }
+
+		    void wait()
+		    {
+		        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		        int rc;
+		        do
+		        {
+		            rc = sem_wait(&m_sema);
+		        }
+		        while (rc == -1 && errno == EINTR);
+		    }
+
+			bool try_wait()
+			{
+				int rc;
+				do {
+					rc = sem_trywait(&m_sema);
+				} while (rc == -1 && errno == EINTR);
+				return !(rc == -1 && errno == EAGAIN);
+			}
+
+			bool timed_wait(std::uint64_t usecs)
+			{
+				struct timespec ts;
+				const int usecs_in_1_sec = 1000000;
+				const int nsecs_in_1_sec = 1000000000;
+				clock_gettime(CLOCK_REALTIME, &ts);
+				ts.tv_sec += usecs / usecs_in_1_sec;
+				ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000;
+				// sem_timedwait bombs if you have more than 1e9 in tv_nsec
+				// so we have to clean things up before passing it in
+				if (ts.tv_nsec >= nsecs_in_1_sec) {
+					ts.tv_nsec -= nsecs_in_1_sec;
+					++ts.tv_sec;
+				}
+
+				int rc;
+				do {
+					rc = sem_timedwait(&m_sema, &ts);
+				} while (rc == -1 && errno == EINTR);
+				return !(rc == -1 && errno == ETIMEDOUT);
+			}
+
+		    void signal()
+		    {
+		        sem_post(&m_sema);
+		    }
+
+		    void signal(int count)
+		    {
+		        while (count-- > 0)
+		        {
+		            sem_post(&m_sema);
+		        }
+		    }
+		};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+		//---------------------------------------------------------
+		// LightweightSemaphore
+		//---------------------------------------------------------
+		class LightweightSemaphore
+		{
+		public:
+			typedef std::make_signed<std::size_t>::type ssize_t;
+			
+		private:
+		    weak_atomic<ssize_t> m_count;
+		    Semaphore m_sema;
+
+		    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
+		    {
+		        ssize_t oldCount;
+		        // Is there a better way to set the initial spin count?
+		        // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
+		        // as threads start hitting the kernel semaphore.
+		        int spin = 10000;
+		        while (--spin >= 0)
+		        {
+		            if (m_count.load() > 0)
+		            {
+		                m_count.fetch_add_acquire(-1);
+		                return true;
+		            }
+		            compiler_fence(memory_order_acquire);     // Prevent the compiler from collapsing the loop.
+		        }
+		        oldCount = m_count.fetch_add_acquire(-1);
+				if (oldCount > 0)
+					return true;
+		        if (timeout_usecs < 0)
+				{
+					m_sema.wait();
+					return true;
+				}
+				if (m_sema.timed_wait(timeout_usecs))
+					return true;
+				// At this point, we've timed out waiting for the semaphore, but the
+				// count is still decremented indicating we may still be waiting on
+				// it. So we have to re-adjust the count, but only if the semaphore
+				// wasn't signaled enough times for us too since then. If it was, we
+				// need to release the semaphore too.
+				while (true)
+				{
+					oldCount = m_count.fetch_add_release(1);
+					if (oldCount < 0)
+						return false;    // successfully restored things to the way they were
+					// Oh, the producer thread just signaled the semaphore after all. Try again:
+					oldCount = m_count.fetch_add_acquire(-1);
+					if (oldCount > 0 && m_sema.try_wait())
+						return true;
+				}
+		    }
+
+		public:
+		    LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
+		    {
+		        assert(initialCount >= 0);
+		    }
+
+		    bool tryWait()
+		    {
+		        if (m_count.load() > 0)
+		        {
+		        	m_count.fetch_add_acquire(-1);
+		        	return true;
+		        }
+		        return false;
+		    }
+
+		    void wait()
+		    {
+		        if (!tryWait())
+		            waitWithPartialSpinning();
+		    }
+
+			bool wait(std::int64_t timeout_usecs)
+			{
+				return tryWait() || waitWithPartialSpinning(timeout_usecs);
+			}
+
+		    void signal(ssize_t count = 1)
+		    {
+		    	assert(count >= 0);
+		        ssize_t oldCount = m_count.fetch_add_release(count);
+		        assert(oldCount >= -1);
+		        if (oldCount < 0)
+		        {
+		            m_sema.signal(1);
+		        }
+		    }
+		    
+		    ssize_t availableApprox() const
+		    {
+		    	ssize_t count = m_count.load();
+		    	return count > 0 ? count : 0;
+		    }
+		};
+	}	// end namespace spsc_sema
+}	// end namespace moodycamel
+
+#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
+#pragma warning(pop)
+#ifdef __cplusplus_cli
+#pragma managed(pop)
+#endif
+#endif
diff --git a/include/ffcpp/readerwriterqueue.h b/include/ffcpp/readerwriterqueue.h
new file mode 100644
index 0000000..9cf2ad4
--- /dev/null
+++ b/include/ffcpp/readerwriterqueue.h
@@ -0,0 +1,854 @@
+// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+#include "atomicops.h"
+#include <type_traits>
+#include <utility>
+#include <cassert>
+#include <stdexcept>
+#include <new>
+#include <cstdint>
+#include <cstdlib>		// For malloc/free/abort & size_t
+#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012
+#include <chrono>
+#endif
+
+
+// A lock-free queue for a single-consumer, single-producer architecture.
+// The queue is also wait-free in the common path (except if more memory
+// needs to be allocated, in which case malloc is called).
+// Allocates memory sparingly (O(lg(n)) times, amortized), and only once if
+// the original maximum size estimate is never exceeded.
+// Tested on x86/x64 processors, but semantics should be correct for all
+// architectures (given the right implementations in atomicops.h), provided
+// that aligned integer and pointer accesses are naturally atomic.
+// Note that there should only be one consumer thread and producer thread;
+// Switching roles of the threads, or using multiple consecutive threads for
+// one role, is not safe unless properly synchronized.
+// Using the queue exclusively from one thread is fine, though a bit silly.
+
+#ifndef MOODYCAMEL_CACHE_LINE_SIZE
+#define MOODYCAMEL_CACHE_LINE_SIZE 64
+#endif
+
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
+#pragma warning(disable: 4820)	// padding was added
+#pragma warning(disable: 4127)	// conditional expression is constant
+#endif
+
+namespace moodycamel {
+
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class ReaderWriterQueue
+{
+	// Design: Based on a queue-of-queues. The low-level queues are just
+	// circular buffers with front and tail indices indicating where the
+	// next element to dequeue is and where the next element can be enqueued,
+	// respectively. Each low-level queue is called a "block". Each block
+	// wastes exactly one element's worth of space to keep the design simple
+	// (if front == tail then the queue is empty, and can't be full).
+	// The high-level queue is a circular linked list of blocks; again there
+	// is a front and tail, but this time they are pointers to the blocks.
+	// The front block is where the next element to be dequeued is, provided
+	// the block is not empty. The back block is where elements are to be
+	// enqueued, provided the block is not full.
+	// The producer thread owns all the tail indices/pointers. The consumer
+	// thread owns all the front indices/pointers. Both threads read each
+	// other's variables, but only the owning thread updates them. E.g. After
+	// the consumer reads the producer's tail, the tail may change before the
+	// consumer is done dequeuing an object, but the consumer knows the tail
+	// will never go backwards, only forwards.
+	// If there is no room to enqueue an object, an additional block (of
+	// equal size to the last block) is added. Blocks are never removed.
+
+public:
+	// Constructs a queue that can hold maxSize elements without further
+	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
+	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
+	// at least one extra buffer block).
+	explicit ReaderWriterQueue(size_t maxSize = 15)
+#ifndef NDEBUG
+		: enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		assert(maxSize > 0);
+		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
+		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
+		
+		Block* firstBlock = nullptr;
+		
+		largestBlockSize = ceilToPow2(maxSize + 1);		// We need a spare slot to fit maxSize elements in the block
+		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {
+			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
+			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
+			// between front == tail meaning "empty" and "full".
+			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
+			// number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying):
+			size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
+			largestBlockSize = MAX_BLOCK_SIZE;
+			Block* lastBlock = nullptr;
+			for (size_t i = 0; i != initialBlockCount; ++i) {
+				auto block = make_block(largestBlockSize);
+				if (block == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+					throw std::bad_alloc();
+#else
+					abort();
+#endif
+				}
+				if (firstBlock == nullptr) {
+					firstBlock = block;
+				}
+				else {
+					lastBlock->next = block;
+				}
+				lastBlock = block;
+				block->next = firstBlock;
+			}
+		}
+		else {
+			firstBlock = make_block(largestBlockSize);
+			if (firstBlock == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+				throw std::bad_alloc();
+#else
+				abort();
+#endif
+			}
+			firstBlock->next = firstBlock;
+		}
+		frontBlock = firstBlock;
+		tailBlock = firstBlock;
+		
+		// Make sure the reader/writer threads will have the initialized memory setup above:
+		fence(memory_order_sync);
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being moved. It's up to the user to synchronize this.
+	ReaderWriterQueue(ReaderWriterQueue&& other)
+		: frontBlock(other.frontBlock.load()),
+		tailBlock(other.tailBlock.load()),
+		largestBlockSize(other.largestBlockSize)
+#ifndef NDEBUG
+		,enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		other.largestBlockSize = 32;
+		Block* b = other.make_block(other.largestBlockSize);
+		if (b == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+			throw std::bad_alloc();
+#else
+			abort();
+#endif
+		}
+		b->next = b;
+		other.frontBlock = b;
+		other.tailBlock = b;
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being moved. It's up to the user to synchronize this.
+	ReaderWriterQueue& operator=(ReaderWriterQueue&& other)
+	{
+		Block* b = frontBlock.load();
+		frontBlock = other.frontBlock.load();
+		other.frontBlock = b;
+		b = tailBlock.load();
+		tailBlock = other.tailBlock.load();
+		other.tailBlock = b;
+		std::swap(largestBlockSize, other.largestBlockSize);
+		return *this;
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	~ReaderWriterQueue()
+	{
+		// Make sure we get the latest version of all variables from other CPUs:
+		fence(memory_order_sync);
+
+		// Destroy any remaining objects in queue and free memory
+		Block* frontBlock_ = frontBlock;
+		Block* block = frontBlock_;
+		do {
+			Block* nextBlock = block->next;
+			size_t blockFront = block->front;
+			size_t blockTail = block->tail;
+
+			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {
+				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
+				element->~T();
+				(void)element;
+			}
+			
+			auto rawBlock = block->rawThis;
+			block->~Block();
+			std::free(rawBlock);
+			block = nextBlock;
+		} while (block != frontBlock_);
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element)
+	{
+		return inner_enqueue<CannotAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element)
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element)
+	{
+		return inner_enqueue<CanAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element)
+	{
+		return inner_enqueue<CanAlloc>(std::forward<T>(element));
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+
+		// High-level pseudocode:
+		// Remember where the tail block is
+		// If the front block has an element in it, dequeue it
+		// Else
+		//     If front block was the tail block when we entered the function, return false
+		//     Else advance to next block and dequeue the item there
+
+		// Note that we have to use the value of the tail block from before we check if the front
+		// block is full or not, in case the front block is empty and then, before we check if the
+		// tail block is at the front block or not, the producer fills up the front block *and
+		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
+		// reproducible in practice.
+		// In order to avoid overhead in the common case, though, we do a double-checked pattern
+		// where we have the fast path if the front block is not empty, then read the tail block,
+		// then re-read the front block and check if it's not empty again, then check if the tail
+		// block has advanced.
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			// Front block not empty, dequeue from here
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			result = std::move(*element);
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				// Oh look, the front block isn't empty after all
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
+			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
+			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
+
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			// Since the tailBlock is only ever advanced after being written to,
+			// we know there's for sure an element to dequeue on it
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			// We're done with this block, let the producer use it if it needs
+			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);	// Not strictly needed
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			
+			result = std::move(*element);
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	T* peek()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+		non_empty_front_block:
+			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlock->tail.load());
+			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));
+		}
+		
+		return nullptr;
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	bool pop()
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			fence(memory_order_release);
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	inline size_t size_approx() const
+	{
+		size_t result = 0;
+		Block* frontBlock_ = frontBlock.load();
+		Block* block = frontBlock_;
+		do {
+			fence(memory_order_acquire);
+			size_t blockFront = block->front.load();
+			size_t blockTail = block->tail.load();
+			result += (blockTail - blockFront) & block->sizeMask;
+			block = block->next.load();
+		} while (block != frontBlock_);
+		return result;
+	}
+
+
+private:
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+	template<AllocationMode canAlloc, typename U>
+	bool inner_enqueue(U&& element)
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->enqueuing);
+#endif
+
+		// High-level pseudocode (assuming we're allowed to alloc a new block):
+		// If room in tail block, add to tail
+		// Else check next block
+		//     If next block is not the head block, enqueue on next block
+		//     Else create a new block and enqueue there
+		//     Advance tail to the block we just enqueued to
+
+		Block* tailBlock_ = tailBlock.load();
+		size_t blockFront = tailBlock_->localFront;
+		size_t blockTail = tailBlock_->tail.load();
+
+		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;
+		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
+			fence(memory_order_acquire);
+			// This block has room for at least one more element
+			char* location = tailBlock_->data + blockTail * sizeof(T);
+			new (location) T(std::forward<U>(element));
+
+			fence(memory_order_release);
+			tailBlock_->tail = nextBlockTail;
+		}
+		else {
+			fence(memory_order_acquire);
+			if (tailBlock_->next.load() != frontBlock) {
+				// Note that the reason we can't advance to the frontBlock and start adding new entries there
+				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
+				// instead of advancing to the next full block (whose values were enqueued first and so should be
+				// consumed first).
+				
+				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
+
+				// tailBlock is full, but there's a free block ahead, use it
+				Block* tailBlockNext = tailBlock_->next.load();
+				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
+				nextBlockTail = tailBlockNext->tail.load();
+				fence(memory_order_acquire);
+
+				// This block must be empty since it's not the head block and we
+				// go through the blocks in a circle
+				assert(nextBlockFront == nextBlockTail);
+				tailBlockNext->localFront = nextBlockFront;
+
+				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
+				new (location) T(std::forward<U>(element));
+
+				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
+
+				fence(memory_order_release);
+				tailBlock = tailBlockNext;
+			}
+			else if (canAlloc == CanAlloc) {
+				// tailBlock is full and there's no free block ahead; create a new block
+				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;
+				auto newBlock = make_block(newBlockSize);
+				if (newBlock == nullptr) {
+					// Could not allocate a block!
+					return false;
+				}
+				largestBlockSize = newBlockSize;
+
+				new (newBlock->data) T(std::forward<U>(element));
+
+				assert(newBlock->front == 0);
+				newBlock->tail = newBlock->localTail = 1;
+
+				newBlock->next = tailBlock_->next.load();
+				tailBlock_->next = newBlock;
+
+				// Might be possible for the dequeue thread to see the new tailBlock->next
+				// *without* seeing the new tailBlock value, but this is OK since it can't
+				// advance to the next block until tailBlock is set anyway (because the only
+				// case where it could try to read the next is if it's already at the tailBlock,
+				// and it won't advance past tailBlock in any circumstance).
+				
+				fence(memory_order_release);
+				tailBlock = newBlock;
+			}
+			else if (canAlloc == CannotAlloc) {
+				// Would have had to allocate a new block to enqueue, but not allowed
+				return false;
+			}
+			else {
+				assert(false && "Should be unreachable code");
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+
+	// Disable copying
+	ReaderWriterQueue(ReaderWriterQueue const&) {  }
+
+	// Disable assignment
+	ReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
+
+
+
+	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
+	{
+		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (size_t i = 1; i < sizeof(size_t); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
+	
+	template<typename U>
+	static AE_FORCEINLINE char* align_for(char* ptr)
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+private:
+#ifndef NDEBUG
+	struct ReentrantGuard
+	{
+		ReentrantGuard(bool& _inSection)
+			: inSection(_inSection)
+		{
+			assert(!inSection && "ReaderWriterQueue does not support enqueuing or dequeuing elements from other elements' ctors and dtors");
+			inSection = true;
+		}
+
+		~ReentrantGuard() { inSection = false; }
+
+	private:
+		ReentrantGuard& operator=(ReentrantGuard const&);
+
+	private:
+		bool& inSection;
+	};
+#endif
+
+	struct Block
+	{
+		// Avoid false-sharing by putting highly contended variables on their own cache lines
+		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
+		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
+		
+		char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
+		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
+		size_t localFront;
+		
+		char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
+		weak_atomic<Block*> next;	// (Atomic)
+		
+		char* data;		// Contents (on heap) are aligned to T's alignment
+
+		const size_t sizeMask;
+
+
+		// size must be a power of two (and greater than 0)
+		Block(size_t const& _size, char* _rawThis, char* _data)
+			: front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
+		{
+		}
+
+	private:
+		// C4512 - Assignment operator could not be generated
+		Block& operator=(Block const&);
+
+	public:
+		char* rawThis;
+	};
+	
+	
+	static Block* make_block(size_t capacity)
+	{
+		// Allocate enough memory for the block itself, as well as all the elements it will contain
+		auto size = sizeof(Block) + std::alignment_of<Block>::value - 1;
+		size += sizeof(T) * capacity + std::alignment_of<T>::value - 1;
+		auto newBlockRaw = static_cast<char*>(std::malloc(size));
+		if (newBlockRaw == nullptr) {
+			return nullptr;
+		}
+		
+		auto newBlockAligned = align_for<Block>(newBlockRaw);
+		auto newBlockData = align_for<T>(newBlockAligned + sizeof(Block));
+		return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData);
+	}
+
+private:
+	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are dequeued from this block
+	
+	char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];	// Pads frontBlock out to MOODYCAMEL_CACHE_LINE_SIZE bytes before tailBlock
+	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are enqueued to this block
+
+	size_t largestBlockSize;
+
+#ifndef NDEBUG
+	bool enqueuing;
+	bool dequeuing;
+#endif
+};
+
+// Like ReaderWriterQueue, but also provides blocking operations
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class BlockingReaderWriterQueue
+{
+private:
+	typedef ::moodycamel::ReaderWriterQueue<T, MAX_BLOCK_SIZE> ReaderWriterQueue;
+	
+public:
+	explicit BlockingReaderWriterQueue(size_t maxSize = 15)
+		: inner(maxSize)
+	{ }
+
+	
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element)
+	{
+		if (inner.try_enqueue(element)) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element)
+	{
+		if (inner.try_enqueue(std::forward<T>(element))) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element)
+	{
+		if (inner.enqueue(element)) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element)
+	{
+		if (inner.enqueue(std::forward<T>(element))) {
+			sema.signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result)
+	{
+		if (sema.tryWait()) {
+			bool success = inner.try_dequeue(result);
+			assert(success);
+			AE_UNUSED(success);
+			return true;
+		}
+		return false;
+	}
+	
+	
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available, then dequeues it.
+	template<typename U>
+	void wait_dequeue(U& result)
+	{
+		sema.wait();
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U>
+	bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs)
+	{
+		if (!sema.wait(timeout_usecs)) {
+			return false;
+		}
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+		return true;
+	}
+
+
+#if __cplusplus > 199711L || _MSC_VER >= 1700
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& result, std::chrono::duration<Rep, Period> const& timeout)
+	{
+        return wait_dequeue_timed(result, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+	}
+#endif
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	AE_FORCEINLINE T* peek()
+	{
+		return inner.peek();
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	AE_FORCEINLINE bool pop()
+	{
+		if (sema.tryWait()) {
+			bool result = inner.pop();
+			assert(result);
+			AE_UNUSED(result);
+			return true;
+		}
+		return false;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	AE_FORCEINLINE size_t size_approx() const
+	{
+		return sema.availableApprox();
+	}
+
+
+private:
+	// Disable copying & assignment
+	BlockingReaderWriterQueue(ReaderWriterQueue const&) {  }
+	BlockingReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
+	
+private:
+	ReaderWriterQueue inner;
+	spsc_sema::LightweightSemaphore sema;
+};
+
+}    // end namespace moodycamel
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f37d3ee..728e8a1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -30,7 +30,9 @@ set(SOURCE_FILES    MediaFile.cpp
                     ../include/ffcpp/Resampler.h
                     Player.cpp
                     ../include/ffcpp/Player.h
-                    ../include/ffcpp/TSQueue.h)
+                    ../include/ffcpp/TSQueue.h
+                    ../include/ffcpp/atomicops.h
+                    ../include/ffcpp/readerwriterqueue.h)
                     
 add_library(ffcpp ${SOURCE_FILES})
 target_link_libraries(ffcpp ${FFMPEG_LIBRARIES})
diff --git a/src/Codec.cpp b/src/Codec.cpp
index 0a42e4b..6b5d2f1 100644
--- a/src/Codec.cpp
+++ b/src/Codec.cpp
@@ -115,6 +115,8 @@ namespace ffcpp {
 
 		if(_codecCtx->codec_type == AVMEDIA_TYPE_VIDEO) {
 			frame->guessPts();
+		} else if(_codecCtx->codec_type == AVMEDIA_TYPE_AUDIO) {
+			frame->guessChannelLayout();
 		}
 
 		return frame;
@@ -125,7 +127,7 @@ namespace ffcpp {
 		int gotPacket = 0;
 		auto encFunc = (_codecCtx->codec_type == AVMEDIA_TYPE_VIDEO ? avcodec_encode_video2 : avcodec_encode_audio2);
 
-        int res = encFunc(_codecCtx, packet, frame->nativePtr(), &gotPacket);
+        int res = encFunc(_codecCtx, packet, frame ? frame->nativePtr() : nullptr, &gotPacket);
 		if(res < 0) throw std::runtime_error("cannot encode frame");
 
 		return packet;
diff --git a/src/Frame.cpp b/src/Frame.cpp
index fb11fd5..e4489b6 100644
--- a/src/Frame.cpp
+++ b/src/Frame.cpp
@@ -86,4 +86,14 @@ namespace ffcpp {
 		return _frame->pts;
 	}
 
+	void Frame::guessChannelLayout() {
+		if(_frame->channel_layout == 0) {
+			_frame->channel_layout = (uint64_t)av_get_default_channel_layout(_frame->channels);
+		}
+	}
+
+    int Frame::size() const {
+        return _frame->pkt_size >= 0 ? _frame->pkt_size : _frame->linesize[0];
+    }
+
 }
diff --git a/src/MediaFile.cpp b/src/MediaFile.cpp
index eed4bff..7fe9737 100644
--- a/src/MediaFile.cpp
+++ b/src/MediaFile.cpp
@@ -16,6 +16,10 @@ namespace ffcpp {
 
 			_streams.reserve(_formatCtx->nb_streams);
 			for(size_t i = 0; i < _formatCtx->nb_streams; ++i) {
+				auto codecType = _formatCtx->streams[i]->codec->codec_type;
+				if(codecType != AVMEDIA_TYPE_VIDEO && codecType != AVMEDIA_TYPE_AUDIO)
+					continue;
+
 				auto stream = std::make_shared<Stream>(_formatCtx->streams[i]);
 				_streams.emplace_back(stream);
 			}
diff --git a/src/Player.cpp b/src/Player.cpp
index 7d54cb8..fe694d1 100644
--- a/src/Player.cpp
+++ b/src/Player.cpp
@@ -1,24 +1,33 @@
 #include "ffcpp/Player.h"
 #include "ffcpp/Stream.h"
 #include "ffcpp/Scaler.h"
+#include "ffcpp/Resampler.h"
 #include <iostream>
 #include <chrono>
+#include <ctime>
 
 namespace ffcpp {
 
-    Player::Player(std::shared_ptr<IVideoSink> vSink): _vSink(vSink),
+    Player::Player(std::shared_ptr<IVideoSink> vSink,
+                   std::shared_ptr<IAudioSink> aSink): _vSink(vSink),
+                                                       _aSink(aSink),
                                                        _curMedia(nullptr),
                                                        _aStream(nullptr),
                                                        _vStream(nullptr),
                                                        _state(PlayerState::Stopped),
+                                                       _aSamplesBuffer(new uint8_t[AUDIO_BUFFER_LENGTH]),
+                                                       _samplesInBuffer(0),
                                                        _decodeThread(&Player::decode, this),
                                                        _vPlayThread(&Player::displayFrames, this),
-                                                       _decodedFrames(10)
+                                                       _videoFrames(100),
+                                                       _audioFrames(100)
     {
         init();
+        _aSink->setAudioSource(this);
     }
 
     Player::~Player() {
+        std::cout << "Player destructor" << std::endl;
 //        _state = PlayerState::Shutdown;
 //        std::cout << "destructor" << std::endl;
 //        _stateCond.notify_all();
@@ -37,11 +46,24 @@ namespace ffcpp {
         _curMedia = std::make_unique<MediaFile>(path, Mode::Read);
         _vStream = _curMedia->videoStream();
         _aStream = _curMedia->audioStream();
+
+        auto codec = _aStream->codec().get();
+
+        _resampler = std::make_shared<Resampler>(_aStream->codec()->channels(),
+                                                 _aStream->codec()->channelLayout(),
+                                                 _aStream->codec()->sampleRate(),
+                                                 _aStream->codec()->sampleFormat(),
+                                                 _aSink->getChannelsCount(),
+                                                 av_get_default_channel_layout(_aSink->getChannelsCount()),
+                                                 _aSink->getSampleRate(),
+                                                 _aSink->getSampleFormat());
     }
 
     void Player::setVideoSize(size_t width, size_t height) {
         std::lock_guard<std::mutex> lock(_mutex);
-        _scaler = std::make_shared<Scaler>(_vStream->codec()->width(), _vStream->codec()->height(), _vStream->codec()->pixelFormat(),
+        _scaler = std::make_shared<Scaler>(_vStream->codec()->width(), // NOTE(review): _vStream is nullptr until a media file has been opened - calling this earlier dereferences null
+                                           _vStream->codec()->height(), // first three arguments come from the video stream's codec
+                                           _vStream->codec()->pixelFormat(), // last three: requested width/height and the sink's pixel format
                                            width, height, _vSink->getPixelFormat());
     }
 
@@ -74,7 +96,18 @@ namespace ffcpp {
                 auto frame = _vStream->codec()->decode(packet);
                 frame = _scaler->scale(frame);
                 lock.unlock();
-                _decodedFrames.pushOrWait(frame);
+                while(!_videoFrames.try_enqueue(frame)) {
+                    std::cout << "waiting for enqueue video frame" << std::endl;
+                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+                }
+            } else if(packetType == AVMEDIA_TYPE_AUDIO) {
+                auto frame = _aStream->codec()->decode(packet);
+                frame = _resampler->resample(frame);
+                lock.unlock();
+                while(!_audioFrames.try_enqueue(frame)) {
+                    std::cout << "waiting for enqueue audio frame" << std::endl;
+                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+                }
             }
         }
     }
@@ -91,15 +124,58 @@ namespace ffcpp {
             }
 
             lock.unlock();
-            auto frame = _decodedFrames.popOrWait();
-            lock.lock();
-            AVFrame* f = frame->nativePtr();
-            _vSink->drawPlanarYUVFrame(f->data[0], f->data[1], f->data[2],
-                                       f->linesize[0], f->linesize[1], f->linesize[2]);
-            int fps = _vStream->fps();
 
-            lock.unlock();
+            int fps = _vStream->fps();
+            FramePtr frame;
+            if(_videoFrames.try_dequeue(frame)) {
+                lock.lock();
+                AVFrame* f = frame->nativePtr();
+                _vSink->drawPlanarYUVFrame(f->data[0], f->data[1], f->data[2],
+                                           f->linesize[0], f->linesize[1], f->linesize[2]);
+
+                lock.unlock();
+            } else {
+                std::cout << "=============== skip video frame" << std::endl;
+            }
+
             std::this_thread::sleep_for(std::chrono::milliseconds(1000/fps));
         }
     }
+
+    uint64_t time = 0;
+    // Writes exactly `length` bytes into `data`: bytes left over from the previous
+    // call first, then frames from _audioFrames (polled every 10 ms until available).
+    void Player::fillSampleBuffer(uint8_t *data, int length) {
+        int copied = 0;
+        if(_samplesInBuffer > 0) {
+            // The carry-over may exceed this request: hand out at most `length`,
+            // then slide whatever is still unread to the front of the buffer.
+            copied = _samplesInBuffer < length ? _samplesInBuffer : length;
+            memcpy(data, _aSamplesBuffer.get(), copied);
+            _samplesInBuffer -= copied;
+            memmove(_aSamplesBuffer.get(), _aSamplesBuffer.get() + copied, _samplesInBuffer);
+        }
+        while (copied < length) {
+            FramePtr frame;
+            while(!_audioFrames.try_dequeue(frame)) {
+                std::cout << "waiting for audio frame" << std::endl;
+                std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            }
+            AVFrame* f = frame->nativePtr();
+            int frameSize = frame->size();
+            int remaining = length - copied;
+            if(frameSize > remaining) {
+                // Frame overshoots the request: emit its head, stash its tail. The tail
+                // starts inside plane 0 (f->data[0]); f->data itself is the pointer array.
+                memcpy(data + copied, f->data[0], remaining);
+                _samplesInBuffer = frameSize - remaining;
+                memcpy(_aSamplesBuffer.get(), f->data[0] + remaining, _samplesInBuffer);
+                copied = length;
+            } else {
+                memcpy(data + copied, f->data[0], frameSize);
+                copied += frameSize;
+            }
+        }
+    }
+
 }
\ No newline at end of file
diff --git a/src/Resampler.cpp b/src/Resampler.cpp
index cebc0d0..286826a 100644
--- a/src/Resampler.cpp
+++ b/src/Resampler.cpp
@@ -9,8 +9,9 @@ extern "C" {
 
 namespace ffcpp {
 
-	Resampler::Resampler(int inChannelLayout, int inSampleRate, AVSampleFormat inSampleFormat, int outChannelLayout,
-	                     int outSampleRate, AVSampleFormat outSampleFormat) {
+	Resampler::Resampler(int inChannelCount, int inChannelLayout, int inSampleRate, AVSampleFormat inSampleFormat,
+                         int outChannelCount, int outChannelLayout, int outSampleRate, AVSampleFormat outSampleFormat) {
+		_dstChannelCount = outChannelCount;
 		_dstChannelLayout = outChannelLayout;
 		_dstSampleFormat = outSampleFormat;
 		_dstSampleRate = outSampleRate;
@@ -20,10 +21,12 @@ namespace ffcpp {
 			throw new std::runtime_error("cannot create resampler");
 		}
 
+        av_opt_set_int(_swrContext, "in_channel_count",     inChannelCount, 0);
 		av_opt_set_int(_swrContext, "in_channel_layout",    inChannelLayout, 0);
 		av_opt_set_int(_swrContext, "in_sample_rate",       inSampleRate, 0);
 		av_opt_set_sample_fmt(_swrContext, "in_sample_fmt", inSampleFormat, 0);
 
+        av_opt_set_int(_swrContext, "out_channel_count",     outChannelCount, 0);
 		av_opt_set_int(_swrContext, "out_channel_layout",    outChannelLayout, 0);
 		av_opt_set_int(_swrContext, "out_sample_rate",       outSampleRate, 0);
 		av_opt_set_sample_fmt(_swrContext, "out_sample_fmt", outSampleFormat, 0);
@@ -33,8 +36,8 @@ namespace ffcpp {
 	}
 
 	Resampler::Resampler(CodecPtr decoder, CodecPtr encoder)
-			: Resampler(decoder->channelLayout(), decoder->sampleRate(), decoder->sampleFormat(),
-			            encoder->channelLayout(), encoder->sampleRate(), encoder->sampleFormat()) {
+			: Resampler(decoder->channels(), decoder->channelLayout(), decoder->sampleRate(), decoder->sampleFormat(), // input side: taken from the decoder
+			            encoder->channels(), encoder->channelLayout(), encoder->sampleRate(), encoder->sampleFormat()) { // output side: taken from the encoder
 	}
 
 	Resampler::~Resampler() {
@@ -44,11 +47,12 @@ namespace ffcpp {
 	}
 
     FramePtr Resampler::resample(FramePtr inFrame) {
-		int channelsCount = av_get_channel_layout_nb_channels(_dstChannelLayout);
         int outSamples = swr_get_out_samples(_swrContext, inFrame->samplesCount());
 
-        FramePtr outFrame = std::make_shared<Frame>(outSamples, channelsCount, _dstSampleFormat, _dstSampleRate);
-        int res = swr_convert_frame(_swrContext, outFrame->nativePtr(), inFrame->nativePtr());
+        FramePtr outFrame = std::make_shared<Frame>(outSamples, _dstChannelCount, _dstSampleFormat, _dstSampleRate);
+		AVFrame *out = outFrame->nativePtr(), *in = inFrame->nativePtr();
+
+        int res = swr_convert_frame(_swrContext, out, in);
 		throwIfError(res, "cannot convert audio frame");
 
 		return outFrame;