diff --git a/Makefile.gc b/Makefile.gc
index d593d8f..f86e9dd 100644
--- a/Makefile.gc
+++ b/Makefile.gc
@@ -27,7 +27,7 @@ INCLUDES	:=	core core/m68k core/z80 core/sound core/tremor core/ntsc core/input_
 # options for code generation
 #---------------------------------------------------------------------------------
 
-CFLAGS  = -O3 -fomit-frame-pointer -Wall -Wno-strict-aliasing $(MACHDEP) $(INCLUDE) -DUSE_LIBTREMOR -DDISABLE_MANY_OGG_OPEN_FILES -DUSE_16BPP_RENDERING -DALT_RENDERER
+CFLAGS  = -O3 -fomit-frame-pointer -Wall -Wno-strict-aliasing $(MACHDEP) $(INCLUDE) -DUSE_LIBTREMOR -DDISABLE_MANY_OGG_OPEN_FILES -DUSE_16BPP_RENDERING -DALT_RENDERER -DBLIP_INVERT
 CXXFLAGS	=	$(CFLAGS)
 
 LDFLAGS	=	$(MACHDEP) -Wl,-Map,$(notdir $@).map
diff --git a/Makefile.wii b/Makefile.wii
index aff1cd7..1febf4c 100644
--- a/Makefile.wii
+++ b/Makefile.wii
@@ -27,7 +27,7 @@ INCLUDES	:=	core core/m68k core/z80 core/sound core/tremor core/ntsc core/input_
 # options for code generation
 #---------------------------------------------------------------------------------
 
-CFLAGS  = -O3 -fomit-frame-pointer -Wall -Wno-strict-aliasing $(MACHDEP) $(INCLUDE) -DUSE_LIBTREMOR -DUSE_16BPP_RENDERING -DALT_RENDERER -DHW_RVL
+CFLAGS  = -O3 -fomit-frame-pointer -Wall -Wno-strict-aliasing $(MACHDEP) $(INCLUDE) -DUSE_LIBTREMOR -DUSE_16BPP_RENDERING -DALT_RENDERER -DBLIP_INVERT -DHW_RVL
 CXXFLAGS	=	$(CFLAGS)
 
 LDFLAGS	=	$(MACHDEP) -Wl,-Map,$(notdir $@).map,-wrap,wiiuse_set_ir,-wrap,wiiuse_handshake,-wrap,classic_ctrl_handshake,-wrap,classic_ctrl_event
diff --git a/core/cd_hw/cdd.c b/core/cd_hw/cdd.c
index 673e725..d20aa87 100644
--- a/core/cd_hw/cdd.c
+++ b/core/cd_hw/cdd.c
@@ -179,8 +179,7 @@ void cdd_init(int samplerate)
 {
   /* CD-DA is running by default at 44100 Hz */
   /* Audio stream is resampled to desired rate using Blip Buffer */
-  blip_set_rates(snd.blips[2][0], 44100, samplerate);
-  blip_set_rates(snd.blips[2][1], 44100, samplerate);
+  blip_set_rates(snd.blips[2], 44100, samplerate);
 }
 
 void cdd_reset(void)
@@ -1059,16 +1058,16 @@ void cdd_read_data(uint8 *dst)
 void cdd_read_audio(unsigned int samples)
 {
   /* previous audio outputs */
-  int16 l = cdd.audio[0];
-  int16 r = cdd.audio[1];
+  int prev_l = cdd.audio[0];
+  int prev_r = cdd.audio[1];
 
   /* get number of internal clocks (samples) needed */
-  samples = blip_clocks_needed(snd.blips[2][0], samples);
+  samples = blip_clocks_needed(snd.blips[2], samples);
 
   /* audio track playing ? */
   if (!scd.regs[0x36>>1].byte.h && cdd.toc.tracks[cdd.index].fd)
   {
-    int i, mul, delta;
+    int i, mul, l, r;
 
     /* current CD-DA fader volume */
     int curVol = cdd.volume;
@@ -1106,17 +1105,13 @@ void cdd_read_audio(unsigned int samples)
         /* (MIN) 0,1,2,3,4,8,12,16,20...,1020,1024 (MAX) */
         mul = (curVol & 0x7fc) ? (curVol & 0x7fc) : (curVol & 0x03);
 
-        /* left channel */
-        delta = ((ptr[0] * mul) / 1024) - l;
-        ptr++;
-        l += delta;
-        blip_add_delta_fast(snd.blips[2][0], i, delta);
-
-        /* right channel */
-        delta = ((ptr[0] * mul) / 1024) - r;
-        ptr++;
-        r += delta;
-        blip_add_delta_fast(snd.blips[2][1], i, delta);
+        /* left & right channels */
+        l = ((ptr[0] * mul) / 1024);
+        r = ((ptr[1] * mul) / 1024);
+        blip_add_delta_fast(snd.blips[2], i, l-prev_l, r-prev_r);
+        prev_l = l;
+        prev_r = r;
+        ptr+=2;
 
         /* update CD-DA fader volume (one step/sample) */
         if (curVol < endVol)
@@ -1153,27 +1148,19 @@ void cdd_read_audio(unsigned int samples)
         /* (MIN) 0,1,2,3,4,8,12,16,20...,1020,1024 (MAX) */
         mul = (curVol & 0x7fc) ? (curVol & 0x7fc) : (curVol & 0x03);
 
-        /* left channel */
+        /* left & right channels */
 #ifdef LSB_FIRST
-        delta = ((ptr[0] * mul) / 1024) - l;
-        ptr++;
+        l = ((ptr[0] * mul) / 1024);
+        r = ((ptr[1] * mul) / 1024);
+        ptr+=2;
 #else
-        delta = (((int16)((ptr[0] + ptr[1]*256)) * mul) / 1024) - l;
-        ptr += 2;
+        l = (((int16)((ptr[0] + ptr[1]*256)) * mul) / 1024);
+        r = (((int16)((ptr[2] + ptr[3]*256)) * mul) / 1024);
+        ptr+=4;
 #endif
-        l += delta;
-        blip_add_delta_fast(snd.blips[2][0], i, delta);
-
-        /* right channel */
-#ifdef LSB_FIRST
-        delta = ((ptr[0] * mul) / 1024) - r;
-        ptr++;
-#else
-        delta = (((int16)((ptr[0] + ptr[1]*256)) * mul) / 1024) - r;
-        ptr += 2;
-#endif
-        r += delta;
-        blip_add_delta_fast(snd.blips[2][1], i, delta);
+        blip_add_delta_fast(snd.blips[2], i, l-prev_l, r-prev_r);
+        prev_l = l;
+        prev_r = r;
 
         /* update CD-DA fader volume (one step/sample) */
         if (curVol < endVol)
@@ -1198,23 +1185,24 @@ void cdd_read_audio(unsigned int samples)
     cdd.volume = curVol;
 
     /* save last audio output for next frame */
-    cdd.audio[0] = l;
-    cdd.audio[1] = r;
+    cdd.audio[0] = prev_l;
+    cdd.audio[1] = prev_r;
   }
   else
   {
     /* no audio output */
-    if (l) blip_add_delta_fast(snd.blips[2][0], 0, -l);
-    if (r) blip_add_delta_fast(snd.blips[2][1], 0, -r);
+    if (prev_l | prev_r)
+    {
+      blip_add_delta_fast(snd.blips[2], 0, -prev_l, -prev_r);
 
-    /* save audio output for next frame */
-    cdd.audio[0] = 0;
-    cdd.audio[1] = 0;
+      /* save audio output for next frame */
+      cdd.audio[0] = 0;
+      cdd.audio[1] = 0;
+    }
   }
 
   /* end of Blip Buffer timeframe */
-  blip_end_frame(snd.blips[2][0], samples);
-  blip_end_frame(snd.blips[2][1], samples);
+  blip_end_frame(snd.blips[2], samples);
 }
 
 static void cdd_read_subcode(void)
diff --git a/core/cd_hw/pcm.c b/core/cd_hw/pcm.c
index 09f99d3..e068d70 100644
--- a/core/cd_hw/pcm.c
+++ b/core/cd_hw/pcm.c
@@ -2,7 +2,7 @@
  *  Genesis Plus
  *  PCM sound chip (315-5476A) (RF5C164 compatible)
  *
- *  Copyright (C) 2012-2014  Eke-Eke (Genesis Plus GX)
+ *  Copyright (C) 2012-2016  Eke-Eke (Genesis Plus GX)
  *
  *  Redistribution and use of this code or any derivative works are permitted
  *  provided that the following conditions are met:
@@ -45,8 +45,7 @@ void pcm_init(double clock, int samplerate)
 {
   /* PCM chip is running at original rate and is synchronized with SUB-CPU  */
   /* Chip output is resampled to desired rate using Blip Buffer. */
-  blip_set_rates(snd.blips[1][0], clock / PCM_SCYCLES_RATIO, samplerate);
-  blip_set_rates(snd.blips[1][1], clock / PCM_SCYCLES_RATIO, samplerate);
+  blip_set_rates(snd.blips[1], clock / PCM_SCYCLES_RATIO, samplerate);
 }
 
 void pcm_reset(void)
@@ -71,8 +70,7 @@ void pcm_reset(void)
   pcm.cycles = 0;
 
   /* clear blip buffers */
-  blip_clear(snd.blips[1][0]);
-  blip_clear(snd.blips[1][1]);
+  blip_clear(snd.blips[1]);
 }
 
 int pcm_context_save(uint8 *state)
@@ -117,6 +115,11 @@ void pcm_run(unsigned int length)
 #ifdef LOG_PCM
   error("[%d][%d]run %d PCM samples (from %d)\n", v_counter, s68k.cycles, length, pcm.cycles);
 #endif
+
+  /* previous audio outputs */
+  int prev_l = pcm.out[0];
+  int prev_r = pcm.out[1];
+
   /* check if PCM chip is running */
   if (pcm.enabled)
   {
@@ -180,41 +183,29 @@ void pcm_run(unsigned int length)
       if (r < -32768) r = -32768;
       else if (r > 32767) r = 32767;
 
-      /* check if PCM left output changed */
-      if (pcm.out[0] != l)
-      {
-        blip_add_delta_fast(snd.blips[1][0], i, l-pcm.out[0]);
-        pcm.out[0] = l;
-      }
-
-      /* check if PCM right output changed */
-      if (pcm.out[1] != r)
-      {
-        blip_add_delta_fast(snd.blips[1][1], i, r-pcm.out[1]);
-        pcm.out[1] = r;
-      }
+      /* update Blip Buffer */
+      blip_add_delta_fast(snd.blips[1], i, l-prev_l, r-prev_r);
+      prev_l = l;
+      prev_r = r;
     }
+
+    /* save last audio outputs */
+    pcm.out[0] = prev_l;
+    pcm.out[1] = prev_r;
   }
   else
   {
-    /* check if PCM left output changed */
-    if (pcm.out[0])
+    /* check if PCM output was not muted */
+    if (prev_l | prev_r)
     {
-      blip_add_delta_fast(snd.blips[1][0], 0, -pcm.out[0]);
+      blip_add_delta_fast(snd.blips[1], 0, -prev_l, -prev_r);
       pcm.out[0] = 0;
-    }
-
-    /* check if PCM right output changed */
-    if (pcm.out[1])
-    {
-      blip_add_delta_fast(snd.blips[1][1], 0, -pcm.out[1]);
       pcm.out[1] = 0;
     }
   }
 
   /* end of blip buffer frame */
-  blip_end_frame(snd.blips[1][0], length);
-  blip_end_frame(snd.blips[1][1], length);
+  blip_end_frame(snd.blips[1], length);
 
   /* update PCM master clock counter */
   pcm.cycles += length * PCM_SCYCLES_RATIO;
@@ -223,7 +214,7 @@ void pcm_run(unsigned int length)
 void pcm_update(unsigned int samples)
 {
   /* get number of internal clocks (samples) needed */
-  unsigned int clocks = blip_clocks_needed(snd.blips[1][0], samples);
+  unsigned int clocks = blip_clocks_needed(snd.blips[1], samples);
 
   /* run PCM chip */
   if (clocks > 0)
diff --git a/core/cd_hw/pcm.h b/core/cd_hw/pcm.h
index ee1a5c5..eef847d 100644
--- a/core/cd_hw/pcm.h
+++ b/core/cd_hw/pcm.h
@@ -2,7 +2,7 @@
  *  Genesis Plus
  *  PCM sound chip (315-5476A) (RF5C164 compatible)
  *
- *  Copyright (C) 2012-2014  Eke-Eke (Genesis Plus GX)
+ *  Copyright (C) 2012-2016  Eke-Eke (Genesis Plus GX)
  *
  *  Redistribution and use of this code or any derivative works are permitted
  *  provided that the following conditions are met:
diff --git a/core/sound/blip_buf.c b/core/sound/blip_buf.c
index 2f48536..ae0db45 100644
--- a/core/sound/blip_buf.c
+++ b/core/sound/blip_buf.c
@@ -1,10 +1,11 @@
-/* blip_buf $vers. http://www.slack.net/~ant/                         */
+/* blip_buf $vers. http://www.slack.net/~ant/                       */
 
-/*  Modified for Genesis Plus GX by EkeEke (01/09/12)                 */
-/*    - disabled assertions checks (define #BLIP_ASSERT to re-enable) */
-/*    - fixed multiple time-frames support & removed m->avail         */
-/*    - modified blip_read_samples to always output to stereo streams */
-/*    - added blip_mix_samples function (see blip_buf.h)              */
+/* Modified for Genesis Plus GX by EkeEke                           */
+/*  - disabled assertions checks (define #BLIP_ASSERT to re-enable) */
+/*  - fixed multiple time-frames support & removed m->avail         */
+/*  - added blip_mix_samples function (see blip_buf.h)              */
+/*  - added stereo buffer support (define #BLIP_MONO to disable)    */
+/*  - added inverted stereo output (define #BLIP_INVERT to enable)  */
 
 #include "blip_buf.h"
 
@@ -61,24 +62,32 @@ enum { phase_count = 1 << phase_bits };
 enum { delta_bits  = 15 };
 enum { delta_unit  = 1 << delta_bits };
 enum { frac_bits = time_bits - pre_shift };
+enum { phase_shift = frac_bits - phase_bits };
 
 /* We could eliminate avail and encode whole samples in offset, but that would
 limit the total buffered samples to blip_max_frame. That could only be
 increased by decreasing time_bits, which would reduce resample ratio accuracy.
 */
 
+typedef int buf_t;
+
 struct blip_t
 {
 	fixed_t factor;
 	fixed_t offset;
 	int size;
+#ifdef BLIP_MONO
 	int integrator;
+#else
+  int integrator[2];
+  buf_t* buffer[2];
+#endif
 };
 
-typedef int buf_t;
-
+#ifdef BLIP_MONO
 /* probably not totally portable */
-#define SAMPLES( buf ) ((buf_t*) ((buf) + 1))
+#define SAMPLES( blip ) ((buf_t*) ((blip) + 1))
+#endif
 
 /* Arithmetic (sign-preserving) right shift */
 #define ARITH_SHIFT( n, shift ) \
@@ -124,9 +133,23 @@ blip_t* blip_new( int size )
 	assert( size >= 0 );
 #endif
   
+#ifdef BLIP_MONO
 	m = (blip_t*) malloc( sizeof *m + (size + buf_extra) * sizeof (buf_t) );
+#else
+	m = (blip_t*) malloc( sizeof *m );
+#endif
+
 	if ( m )
 	{
+#ifndef BLIP_MONO
+    m->buffer[0] = (buf_t*) malloc( (size + buf_extra) * sizeof (buf_t));
+    m->buffer[1] = (buf_t*) malloc( (size + buf_extra) * sizeof (buf_t));
+    if ((m->buffer[0] == NULL) || (m->buffer[1] == NULL))
+    {
+      blip_delete(m);
+      return 0;
+    }
+#endif
 		m->factor = time_unit / blip_max_ratio;
 		m->size   = size;
 		blip_clear( m );
@@ -141,7 +164,13 @@ void blip_delete( blip_t* m )
 {
 	if ( m != NULL )
 	{
-		/* Clear fields in case user tries to use after freeing */
+#ifndef BLIP_MONO
+    if (m->buffer[0] != NULL)
+      free(m->buffer[0]);
+    if (m->buffer[1] != NULL)
+      free(m->buffer[1]);
+#endif
+    /* Clear fields in case user tries to use after freeing */
 		memset( m, 0, sizeof *m );
 		free( m );
 	}
@@ -173,16 +202,23 @@ void blip_clear( blip_t* m )
 	Since we don't know rounding direction, factor/2 accommodates either,
 	with the slight loss of showing an error in half the time. Since for
 	a 64-bit factor this is years, the halving isn't a problem. */
-	
-	m->offset     = m->factor / 2;
+
+	m->offset = m->factor / 2;
+#ifdef BLIP_MONO
 	m->integrator = 0;
 	memset( SAMPLES( m ), 0, (m->size + buf_extra) * sizeof (buf_t) );
+#else
+	m->integrator[0] = 0;
+	m->integrator[1] = 0;
+	memset( m->buffer[0], 0, (m->size + buf_extra) * sizeof (buf_t) );
+	memset( m->buffer[1], 0, (m->size + buf_extra) * sizeof (buf_t) );
+#endif
 }
 
 int blip_clocks_needed( const blip_t* m, int samples )
 {
 	fixed_t needed;
-	
+
 #ifdef BLIP_ASSERT
 	/* Fails if buffer can't hold that many more samples */
 	assert( (samples >= 0) && (((m->offset >> time_bits) + samples) <= m->size) );
@@ -191,14 +227,14 @@ int blip_clocks_needed( const blip_t* m, int samples )
   needed = (fixed_t) samples * time_unit;
 	if ( needed < m->offset )
 		return 0;
-	
+
 	return (needed - m->offset + m->factor - 1) / m->factor;
 }
 
 void blip_end_frame( blip_t* m, unsigned t )
 {
 	m->offset += t * m->factor;
-	
+
 #ifdef BLIP_ASSERT
 	/* Fails if buffer size was exceeded */
   assert( (m->offset >> time_bits) <= m->size );
@@ -212,91 +248,167 @@ int blip_samples_avail( const blip_t* m )
 
 static void remove_samples( blip_t* m, int count )
 {
+#ifdef BLIP_MONO
 	buf_t* buf = SAMPLES( m );
-	int remain = (m->offset >> time_bits) + buf_extra - count;
+#else
+	buf_t* buf = m->buffer[0];
+#endif
+  int remain = (m->offset >> time_bits) + buf_extra - count;
   m->offset -= count * time_unit;
-  
-	memmove( &buf [0], &buf [count], remain * sizeof buf [0] );
-	memset( &buf [remain], 0, count * sizeof buf [0] );
+
+	memmove( &buf [0], &buf [count], remain * sizeof (buf_t) );
+	memset( &buf [remain], 0, count * sizeof (buf_t) );
+#ifndef BLIP_MONO
+	buf = m->buffer[1];
+	memmove( &buf [0], &buf [count], remain * sizeof (buf_t) );
+	memset( &buf [remain], 0, count * sizeof (buf_t) );
+#endif
 }
 
 int blip_read_samples( blip_t* m, short out [], int count)
 {
 #ifdef BLIP_ASSERT
 	assert( count >= 0 );
-	
+
 	if ( count > (m->offset >> time_bits) )
 		count = m->offset >> time_bits;
-	
+
 	if ( count )
 #endif
   {
-		buf_t const* in  = SAMPLES( m );
-		buf_t const* end = in + count;
+#ifdef BLIP_MONO
+		buf_t const* in = SAMPLES( m );
 		int sum = m->integrator;
+#else
+		buf_t const* in = m->buffer[0];
+		buf_t const* in2 = m->buffer[1];
+		int sum = m->integrator[0];
+		int sum2 = m->integrator[1];
+#endif
+		buf_t const* end = in + count;
 		do
 		{
 			/* Eliminate fraction */
 			int s = ARITH_SHIFT( sum, delta_bits );
-			
+
 			sum += *in++;
-			
+
 			CLAMP( s );
-			
-			*out = s;
-			out += 2;
-			
+
+			*out++ = s;
+
 			/* High-pass filter */
 			sum -= s << (delta_bits - bass_shift);
+
+#ifndef BLIP_MONO
+			/* Eliminate fraction */
+			s = ARITH_SHIFT( sum2, delta_bits );
+
+			sum2 += *in2++;
+
+			CLAMP( s );
+
+			*out++ = s;
+
+			/* High-pass filter */
+			sum2 -= s << (delta_bits - bass_shift);
+#endif
 		}
 		while ( in != end );
+
+#ifdef BLIP_MONO
 		m->integrator = sum;
-		
+#else
+		m->integrator[0] = sum;
+		m->integrator[1] = sum2;
+#endif
 		remove_samples( m, count );
 	}
-	
+
 	return count;
 }
 
-int blip_mix_samples( blip_t* m, short out [], int count)
+int blip_mix_samples( blip_t* m1, blip_t* m2, blip_t* m3, short out [], int count)
 {
 #ifdef BLIP_ASSERT
-	assert( count >= 0 );
-	
-	if ( count > (m->offset >> time_bits) )
-		count = m->offset >> time_bits;
-	
-	if ( count )
+  assert( count >= 0 );
+
+  if ( count > (m1->offset >> time_bits) )
+    count = m1->offset >> time_bits;
+  if ( count > (m2->offset >> time_bits) )
+    count = m2->offset >> time_bits;
+  if ( count > (m3->offset >> time_bits) )
+    count = m3->offset >> time_bits;
+
+  if ( count )
 #endif
   {
-		buf_t const* in  = SAMPLES( m );
-		buf_t const* end = in + count;
-		int sum = m->integrator;
-		do
-		{
-			/* Eliminate fraction */
-			int s = ARITH_SHIFT( sum, delta_bits );
-			
-			sum += *in++;
-			
-			/* High-pass filter */
-			sum -= s << (delta_bits - bass_shift);
+    buf_t const* end;
+    buf_t const* in[3];
+#ifdef BLIP_MONO
+    int sum = m1->integrator;
+    in[0] = SAMPLES( m1 );
+    in[1] = SAMPLES( m2 );
+    in[2] = SAMPLES( m3 );
+#else
+    int sum = m1->integrator[0];
+    int sum2 = m1->integrator[1];
+    buf_t const* in2[3];
+    in[0] = m1->buffer[0];
+    in[1] = m2->buffer[0];
+    in[2] = m3->buffer[0];
+    in2[0] = m1->buffer[1];
+    in2[1] = m2->buffer[1];
+    in2[2] = m3->buffer[1];
+#endif
 
-            /* Add current buffer value */
-            s += *out;
-			
-			CLAMP( s );
-			
-			*out = s;
-			out += 2;
-		}
-		while ( in != end );
-		m->integrator = sum;
-		
-		remove_samples( m, count );
-	}
-	
-	return count;
+    end = in[0] + count;
+    do
+    {
+      /* Eliminate fraction */
+      int s = ARITH_SHIFT( sum, delta_bits );
+
+      sum += *in[0]++;
+      sum += *in[1]++;
+      sum += *in[2]++;
+
+      CLAMP( s );
+
+      *out++ = s;
+
+      /* High-pass filter */
+      sum -= s << (delta_bits - bass_shift);
+
+#ifndef BLIP_MONO
+      /* Eliminate fraction */
+      s = ARITH_SHIFT( sum2, delta_bits );
+
+      sum2 += *in2[0]++;
+      sum2 += *in2[1]++;
+      sum2 += *in2[2]++;
+
+      CLAMP( s );
+
+      *out++ = s;
+
+      /* High-pass filter */
+      sum2 -= s << (delta_bits - bass_shift);
+#endif
+    }
+    while ( in[0] != end );
+
+#ifdef BLIP_MONO
+    m1->integrator = sum;
+#else
+    m1->integrator[0] = sum;
+    m1->integrator[1] = sum2;
+#endif
+    remove_samples( m1, count );
+    remove_samples( m2, count );
+    remove_samples( m3, count );
+  }
+
+  return count;
 }
 
 /* Things that didn't help performance on x86:
@@ -348,12 +460,180 @@ possibly-wider fixed_t. On 32-bit platforms, this is likely more efficient.
 And by having pre_shift 32, a 32-bit platform can easily do the shift by
 simply ignoring the low half. */
 
+#ifndef BLIP_MONO
+/* Stereo variant: adds the left and right deltas into both channel buffers at clock 'time'. */
+void blip_add_delta( blip_t* m, unsigned time, int delta_l, int delta_r )
+{
+  if (delta_l | delta_r) /* nothing to add when both deltas are zero */
+  {
+    unsigned fixed = (unsigned) ((time * m->factor + m->offset) >> pre_shift);
+    int phase = fixed >> phase_shift & (phase_count - 1);
+    short const* in  = bl_step [phase];
+    short const* rev = bl_step [phase_count - phase];
+    int interp = fixed >> (phase_shift - delta_bits) & (delta_unit - 1);
+    int pos = fixed >> frac_bits;
+    /* BLIP_INVERT swaps which internal buffer receives the left / right delta */
+#ifdef BLIP_INVERT
+    buf_t* out_l = m->buffer[1] + pos;
+    buf_t* out_r = m->buffer[0] + pos;
+#else
+    buf_t* out_l = m->buffer[0] + pos;
+    buf_t* out_r = m->buffer[1] + pos;
+#endif
+
+    int delta;
+
+#ifdef BLIP_ASSERT
+    /* Fails if buffer size was exceeded */
+    assert( pos <= m->size + end_frame_extra );
+#endif
+    /* equal deltas: compute each of the 16 output values once and add it to both channels */
+    if (delta_l == delta_r)
+    {
+      buf_t out;
+      delta = (delta_l * interp) >> delta_bits;
+      delta_l -= delta;
+      out = in[0]*delta_l + in[half_width+0]*delta;
+      out_l[0] += out;
+      out_r[0] += out;
+      out = in[1]*delta_l + in[half_width+1]*delta;
+      out_l[1] += out;
+      out_r[1] += out;
+      out = in[2]*delta_l + in[half_width+2]*delta;
+      out_l[2] += out;
+      out_r[2] += out;
+      out = in[3]*delta_l + in[half_width+3]*delta;
+      out_l[3] += out;
+      out_r[3] += out;
+      out = in[4]*delta_l + in[half_width+4]*delta;
+      out_l[4] += out;
+      out_r[4] += out;
+      out = in[5]*delta_l + in[half_width+5]*delta;
+      out_l[5] += out;
+      out_r[5] += out;
+      out = in[6]*delta_l + in[half_width+6]*delta;
+      out_l[6] += out;
+      out_r[6] += out;
+      out = in[7]*delta_l + in[half_width+7]*delta;
+      out_l[7] += out;
+      out_r[7] += out;
+      out = rev[7]*delta_l + rev[7-half_width]*delta;
+      out_l[8] += out;
+      out_r[8] += out;
+      out = rev[6]*delta_l + rev[6-half_width]*delta;
+      out_l[9] += out;
+      out_r[9] += out;
+      out = rev[5]*delta_l + rev[5-half_width]*delta;
+      out_l[10] += out;
+      out_r[10] += out;
+      out = rev[4]*delta_l + rev[4-half_width]*delta;
+      out_l[11] += out;
+      out_r[11] += out;
+      out = rev[3]*delta_l + rev[3-half_width]*delta;
+      out_l[12] += out;
+      out_r[12] += out;
+      out = rev[2]*delta_l + rev[2-half_width]*delta;
+      out_l[13] += out;
+      out_r[13] += out;
+      out = rev[1]*delta_l + rev[1-half_width]*delta;
+      out_l[14] += out;
+      out_r[14] += out;
+      out = rev[0]*delta_l + rev[0-half_width]*delta;
+      out_l[15] += out;
+      out_r[15] += out;
+    }
+    else
+    {
+      delta = (delta_l * interp) >> delta_bits;
+      delta_l -= delta;
+      out_l [0] += in[0]*delta_l + in[half_width+0]*delta;
+      out_l [1] += in[1]*delta_l + in[half_width+1]*delta;
+      out_l [2] += in[2]*delta_l + in[half_width+2]*delta;
+      out_l [3] += in[3]*delta_l + in[half_width+3]*delta;
+      out_l [4] += in[4]*delta_l + in[half_width+4]*delta;
+      out_l [5] += in[5]*delta_l + in[half_width+5]*delta;
+      out_l [6] += in[6]*delta_l + in[half_width+6]*delta;
+      out_l [7] += in[7]*delta_l + in[half_width+7]*delta;
+      out_l [8] += rev[7]*delta_l + rev[7-half_width]*delta;
+      out_l [9] += rev[6]*delta_l + rev[6-half_width]*delta;
+      out_l [10] += rev[5]*delta_l + rev[5-half_width]*delta;
+      out_l [11] += rev[4]*delta_l + rev[4-half_width]*delta;
+      out_l [12] += rev[3]*delta_l + rev[3-half_width]*delta;
+      out_l [13] += rev[2]*delta_l + rev[2-half_width]*delta;
+      out_l [14] += rev[1]*delta_l + rev[1-half_width]*delta;
+      out_l [15] += rev[0]*delta_l + rev[0-half_width]*delta;
+      /* right channel: same table rows, with its own interpolated delta */
+      delta = (delta_r * interp) >> delta_bits;
+      delta_r -= delta;
+      out_r [0] += in[0]*delta_r + in[half_width+0]*delta;
+      out_r [1] += in[1]*delta_r + in[half_width+1]*delta;
+      out_r [2] += in[2]*delta_r + in[half_width+2]*delta;
+      out_r [3] += in[3]*delta_r + in[half_width+3]*delta;
+      out_r [4] += in[4]*delta_r + in[half_width+4]*delta;
+      out_r [5] += in[5]*delta_r + in[half_width+5]*delta;
+      out_r [6] += in[6]*delta_r + in[half_width+6]*delta;
+      out_r [7] += in[7]*delta_r + in[half_width+7]*delta;
+      out_r [8] += rev[7]*delta_r + rev[7-half_width]*delta;
+      out_r [9] += rev[6]*delta_r + rev[6-half_width]*delta;
+      out_r [10] += rev[5]*delta_r + rev[5-half_width]*delta;
+      out_r [11] += rev[4]*delta_r + rev[4-half_width]*delta;
+      out_r [12] += rev[3]*delta_r + rev[3-half_width]*delta;
+      out_r [13] += rev[2]*delta_r + rev[2-half_width]*delta;
+      out_r [14] += rev[1]*delta_r + rev[1-half_width]*delta;
+      out_r [15] += rev[0]*delta_r + rev[0-half_width]*delta;
+    }
+  }
+}
+/* Stereo fast variant: linear interpolation only, each delta is split between out[7] and out[8]. */
+void blip_add_delta_fast( blip_t* m, unsigned time, int delta_l, int delta_r )
+{
+  if (delta_l | delta_r) /* nothing to add when both deltas are zero */
+  {
+    unsigned fixed = (unsigned) ((time * m->factor + m->offset) >> pre_shift);
+    int interp = fixed >> (frac_bits - delta_bits) & (delta_unit - 1);
+    int pos = fixed >> frac_bits;
+    /* channel swap is keyed on BLIP_INVERT, the macro the build defines and blip_add_delta() tests */
+#ifdef BLIP_INVERT
+    buf_t* out_l = m->buffer[1] + pos;
+    buf_t* out_r = m->buffer[0] + pos;
+#else
+    buf_t* out_l = m->buffer[0] + pos;
+    buf_t* out_r = m->buffer[1] + pos;
+#endif
+
+    int delta = delta_l * interp;
+
+#ifdef BLIP_ASSERT
+    /* Fails if buffer size was exceeded */
+    assert( pos <= m->size + end_frame_extra );
+#endif
+    /* equal deltas: reuse the same two values for both channels */
+    if (delta_l == delta_r)
+    {
+      delta_l = delta_l * delta_unit - delta;
+      out_l[7] += delta_l;
+      out_l[8] += delta;
+      out_r[7] += delta_l;
+      out_r[8] += delta;
+    }
+    else
+    {
+      out_l[7] += delta_l * delta_unit - delta;
+      out_l[8] += delta;
+      delta = delta_r * interp;
+      out_r[7] += delta_r * delta_unit - delta;
+      out_r[8] += delta;
+    }
+  }
+}
+
+#else
+
 void blip_add_delta( blip_t* m, unsigned time, int delta )
 {
 	unsigned fixed = (unsigned) ((time * m->factor + m->offset) >> pre_shift);
 	buf_t* out = SAMPLES( m ) + (fixed >> frac_bits);
 	
-	int const phase_shift = frac_bits - phase_bits;
 	int phase = fixed >> phase_shift & (phase_count - 1);
 	short const* in  = bl_step [phase];
 	short const* rev = bl_step [phase_count - phase];
@@ -403,3 +683,4 @@ void blip_add_delta_fast( blip_t* m, unsigned time, int delta )
 	out [7] += delta * delta_unit - delta2;
 	out [8] += delta2;
 }
+#endif
diff --git a/core/sound/blip_buf.h b/core/sound/blip_buf.h
index 21c45d0..81b986b 100644
--- a/core/sound/blip_buf.h
+++ b/core/sound/blip_buf.h
@@ -28,12 +28,24 @@ blip_max_ratio = 1 << 20 };
 /** Clears entire buffer. Afterwards, blip_samples_avail() == 0. */
 void blip_clear( blip_t* );
 
+#ifndef BLIP_MONO
+
+/** Adds positive/negative left and right deltas into stereo buffers at specified clock time. */
+void blip_add_delta( blip_t*, unsigned int clock_time, int delta_l, int delta_r );
+
+/** Same as blip_add_delta(), but uses faster, lower-quality synthesis. */
+void blip_add_delta_fast( blip_t*, unsigned int clock_time, int delta_l, int delta_r );
+
+#else
+
 /** Adds positive/negative delta into buffer at specified clock time. */
 void blip_add_delta( blip_t*, unsigned int clock_time, int delta );
 
 /** Same as blip_add_delta(), but uses faster, lower-quality synthesis. */
 void blip_add_delta_fast( blip_t*, unsigned int clock_time, int delta );
 
+#endif
+
 /** Length of time frame, in clocks, needed to make sample_count additional
 samples available. */
 int blip_clocks_needed( const blip_t*, int sample_count );
@@ -56,9 +68,8 @@ element of 'out', allowing easy interleaving of two buffers into a stereo sample
 stream. Outputs 16-bit signed samples. Returns number of samples actually read.  */
 int blip_read_samples( blip_t*, short out [], int count);
 
-/* Same as above function except sample is added to output buffer previous value */
-/* This allows easy mixing of different blip buffers into a single output stream */
-int blip_mix_samples( blip_t* m, short out [], int count);
+/* Same as above function, except that samples are mixed from three source blip buffers */
+int blip_mix_samples( blip_t* m1, blip_t* m2, blip_t* m3, short out [], int count);
 
 /** Frees buffer. No effect if NULL is passed. */
 void blip_delete( blip_t* );
diff --git a/core/sound/sn76489.c b/core/sound/sn76489.c
index 27ea516..da0a1b6 100644
--- a/core/sound/sn76489.c
+++ b/core/sound/sn76489.c
@@ -167,45 +167,23 @@ int SN76489_GetContextSize(void)
 /* Updates tone amplitude in delta buffer. Call whenever amplitude might have changed. */
 INLINE void UpdateToneAmplitude(int i, int time)
 {
-  int delta;
-
-  /* left output */
-  delta = (SN76489.Channel[i][0] * SN76489.ToneFreqPos[i]) - SN76489.ChanOut[i][0];
-  if (delta != 0)
-  {
-    SN76489.ChanOut[i][0] += delta;
-    blip_add_delta(snd.blips[0][0], time, delta);
-  }
-
-  /* right output */
-  delta = (SN76489.Channel[i][1] * SN76489.ToneFreqPos[i]) - SN76489.ChanOut[i][1];
-  if (delta != 0)
-  {
-    SN76489.ChanOut[i][1] += delta;
-    blip_add_delta(snd.blips[0][1], time, delta);
-  }
+  /* left & right output deltas (stereo blip_add_delta() does nothing when both are zero) */
+  int delta_l = (SN76489.Channel[i][0] * SN76489.ToneFreqPos[i]) - SN76489.ChanOut[i][0];
+  int delta_r = (SN76489.Channel[i][1] * SN76489.ToneFreqPos[i]) - SN76489.ChanOut[i][1];
+  blip_add_delta(snd.blips[0], time, delta_l, delta_r);
+  SN76489.ChanOut[i][0] += delta_l;
+  SN76489.ChanOut[i][1] += delta_r;
 }
 
 /* Updates noise amplitude in delta buffer. Call whenever amplitude might have changed. */
 INLINE void UpdateNoiseAmplitude(int time)
 {
-  int delta;
-
-  /* left output */
-  delta = (SN76489.Channel[3][0] * ( SN76489.NoiseShiftRegister & 0x1 )) - SN76489.ChanOut[3][0];
-  if (delta != 0)
-  {
-    SN76489.ChanOut[3][0] += delta;
-    blip_add_delta(snd.blips[0][0], time, delta);
-  }
-
-  /* right output */
-  delta = (SN76489.Channel[3][1] * ( SN76489.NoiseShiftRegister & 0x1 )) - SN76489.ChanOut[3][1];
-  if (delta != 0)
-  {
-    SN76489.ChanOut[3][1] += delta;
-    blip_add_delta(snd.blips[0][1], time, delta);
-  }
+  /* left & right output deltas (stereo blip_add_delta() does nothing when both are zero) */
+  int delta_l = (SN76489.Channel[3][0] * ( SN76489.NoiseShiftRegister & 0x1 )) - SN76489.ChanOut[3][0];
+  int delta_r = (SN76489.Channel[3][1] * ( SN76489.NoiseShiftRegister & 0x1 )) - SN76489.ChanOut[3][1];
+  blip_add_delta(snd.blips[0], time, delta_l, delta_r);
+  SN76489.ChanOut[3][0] += delta_l;
+  SN76489.ChanOut[3][1] += delta_r;
 }
 
 /* Runs tone channel for clock_length clocks */
diff --git a/core/sound/sound.c b/core/sound/sound.c
index 06bea1a..f1862ff 100644
--- a/core/sound/sound.c
+++ b/core/sound/sound.c
@@ -2,8 +2,8 @@
  *  Genesis Plus
  *  Sound Hardware
  *
- *  Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Charles Mac Donald (original code)
- *  Copyright (C) 2007-2013  Eke-Eke (Genesis Plus GX)
+ *  Copyright (C) 1998-2003  Charles Mac Donald (original code)
+ *  Copyright (C) 2007-2016  Eke-Eke (Genesis Plus GX)
  *
  *  Redistribution and use of this code or any derivative works are permitted
  *  provided that the following conditions are met:
@@ -125,21 +125,21 @@ void sound_reset(void)
 
 int sound_update(unsigned int cycles)
 {
-  int delta, preamp, time, l, r, *ptr;
+  int prev_l, prev_r, preamp, time, l, r, *ptr;
 
   /* Run PSG & FM chips until end of frame */
   SN76489_Update(cycles);
   fm_update(cycles);
 
-	/* FM output pre-amplification */
+  /* FM output pre-amplification */
   preamp = config.fm_preamp;
 
   /* FM frame initial timestamp */
   time = fm_cycles_start;
 
   /* Restore last FM outputs from previous frame */
-  l = fm_last[0];
-  r = fm_last[1];
+  prev_l = fm_last[0];
+  prev_r = fm_last[1];
 
   /* FM buffer start pointer */
   ptr = fm_buffer;
@@ -150,15 +150,12 @@ int sound_update(unsigned int cycles)
     /* high-quality Band-Limited synthesis */
     do
     {
-      /* left channel */
-      delta = ((*ptr++ * preamp) / 100) - l;
-      l += delta;
-      blip_add_delta(snd.blips[0][0], time, delta);
-      
-      /* right channel */
-      delta = ((*ptr++ * preamp) / 100) - r;
-      r += delta;
-      blip_add_delta(snd.blips[0][1], time, delta);
+      /* left & right channels */
+      l = ((*ptr++ * preamp) / 100);
+      r = ((*ptr++ * preamp) / 100);
+      blip_add_delta(snd.blips[0], time, l-prev_l, r-prev_r);
+      prev_l = l;
+      prev_r = r;
 
       /* increment time counter */
       time += fm_cycles_ratio;
@@ -170,15 +167,12 @@ int sound_update(unsigned int cycles)
     /* faster Linear Interpolation */
     do
     {
-      /* left channel */
-      delta = ((*ptr++ * preamp) / 100) - l;
-      l += delta;
-      blip_add_delta_fast(snd.blips[0][0], time, delta);
-      
-      /* right channel */
-      delta = ((*ptr++ * preamp) / 100) - r;
-      r += delta;
-      blip_add_delta_fast(snd.blips[0][1], time, delta);
+      /* left & right channels */
+      l = ((*ptr++ * preamp) / 100);
+      r = ((*ptr++ * preamp) / 100);
+      blip_add_delta_fast(snd.blips[0], time, l-prev_l, r-prev_r);
+      prev_l = l;
+      prev_r = r;
 
       /* increment time counter */
       time += fm_cycles_ratio;
@@ -190,18 +184,17 @@ int sound_update(unsigned int cycles)
   fm_ptr = fm_buffer;
 
   /* save last FM output for next frame */
-  fm_last[0] = l;
-  fm_last[1] = r;
+  fm_last[0] = prev_l;
+  fm_last[1] = prev_r;
 
   /* adjust FM cycle counters for next frame */
   fm_cycles_count = fm_cycles_start = time - cycles;
-	
-  /* end of blip buffers time frame */
-  blip_end_frame(snd.blips[0][0], cycles);
-  blip_end_frame(snd.blips[0][1], cycles);
+
+  /* end of blip buffer time frame */
+  blip_end_frame(snd.blips[0], cycles);
 
   /* return number of available samples */
-  return blip_samples_avail(snd.blips[0][0]);
+  return blip_samples_avail(snd.blips[0]);
 }
 
 int sound_context_save(uint8 *state)
diff --git a/core/sound/sound.h b/core/sound/sound.h
index a4284a7..ae7cc56 100644
--- a/core/sound/sound.h
+++ b/core/sound/sound.h
@@ -2,8 +2,8 @@
  *  Genesis Plus
  *  Sound Hardware
  *
- *  Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Charles Mac Donald (original code)
- *  Copyright (C) 2007-2013  Eke-Eke (Genesis Plus GX)
+ *  Copyright (C) 1998-2003  Charles Mac Donald (original code)
+ *  Copyright (C) 2007-2016  Eke-Eke (Genesis Plus GX)
  *
  *  Redistribution and use of this code or any derivative works are permitted
  *  provided that the following conditions are met:
diff --git a/core/system.c b/core/system.c
index e6134b2..07d86ed 100644
--- a/core/system.c
+++ b/core/system.c
@@ -68,11 +68,9 @@ int audio_init(int samplerate, double framerate)
   memset(&snd, 0, sizeof (snd));
 
   /* Initialize Blip Buffers */
-  snd.blips[0][0] = blip_new(samplerate / 10);
-  snd.blips[0][1] = blip_new(samplerate / 10);
-  if (!snd.blips[0][0] || !snd.blips[0][1])
+  snd.blips[0] = blip_new(samplerate / 10);
+  if (!snd.blips[0])
   {
-    audio_shutdown();
     return -1;
   }
 
@@ -80,11 +78,9 @@ int audio_init(int samplerate, double framerate)
   if (system_hw == SYSTEM_MCD)
   {
     /* allocate blip buffers */
-    snd.blips[1][0] = blip_new(samplerate / 10);
-    snd.blips[1][1] = blip_new(samplerate / 10);
-    snd.blips[2][0] = blip_new(samplerate / 10);
-    snd.blips[2][1] = blip_new(samplerate / 10);
-    if (!snd.blips[1][0] || !snd.blips[1][1] || !snd.blips[2][0] || !snd.blips[2][1])
+    snd.blips[1] = blip_new(samplerate / 10);
+    snd.blips[2] = blip_new(samplerate / 10);
+    if (!snd.blips[1] || !snd.blips[2])
     {
       audio_shutdown();
       return -1;
@@ -132,8 +128,7 @@ void audio_set_rate(int samplerate, double framerate)
   /* master clock timebase so they remain perfectly synchronized together, while still */
   /* being synchronized with 68K and Z80 CPUs as well. Mixed sound chip output is then */
   /* resampled to desired rate at the end of each frame, using Blip Buffer.            */
-  blip_set_rates(snd.blips[0][0], mclk, samplerate);
-  blip_set_rates(snd.blips[0][1], mclk, samplerate);
+  blip_set_rates(snd.blips[0], mclk, samplerate);
 
   /* Mega CD sound hardware */
   if (system_hw == SYSTEM_MCD)
@@ -155,17 +150,14 @@ void audio_set_rate(int samplerate, double framerate)
 
 void audio_reset(void)
 {
-  int i,j;
+  int i;
   
   /* Clear blip buffers */
   for (i=0; i<3; i++)
   {
-    for (j=0; j<2; j++)
+    if (snd.blips[i])
     {
-      if (snd.blips[i][j])
-      {
-        blip_clear(snd.blips[i][j]);
-      }
+      blip_clear(snd.blips[i]);
     }
   }
 
@@ -187,16 +179,13 @@ void audio_set_equalizer(void)
 
 void audio_shutdown(void)
 {
-  int i,j;
+  int i;
   
   /* Delete blip buffers */
   for (i=0; i<3; i++)
   {
-    for (j=0; j<2; j++)
-    {
-      blip_delete(snd.blips[i][j]);
-      snd.blips[i][j] = 0;
-    }
+    blip_delete(snd.blips[i]);
+    snd.blips[i] = 0;
   }
 }
 
@@ -213,37 +202,24 @@ int audio_update(int16 *buffer)
 
     /* read CDDA samples */
     cdd_read_audio(size);
-  }
 
 #ifdef ALIGN_SND
-  /* return an aligned number of samples if required */
-  size &= ALIGN_SND;
+    /* return an aligned number of samples if required */
+    size &= ALIGN_SND;
 #endif
 
-  /* resample FM & PSG mixed stream to output buffer */
-#ifdef LSB_FIRST
-  blip_read_samples(snd.blips[0][0], buffer, size);
-  blip_read_samples(snd.blips[0][1], buffer + 1, size);
-#else
-  blip_read_samples(snd.blips[0][0], buffer + 1, size);
-  blip_read_samples(snd.blips[0][1], buffer, size);
-#endif
-
-  /* Mega CD specific */
-  if (system_hw == SYSTEM_MCD)
+    /* resample & mix FM/PSG, PCM & CD-DA streams to output buffer */
+    blip_mix_samples(snd.blips[0], snd.blips[1], snd.blips[2], buffer, size);
+  }
+  else
   {
-    /* resample PCM & CD-DA streams to output buffer */
-#ifdef LSB_FIRST
-    blip_mix_samples(snd.blips[1][0], buffer, size);
-    blip_mix_samples(snd.blips[1][1], buffer + 1, size);
-    blip_mix_samples(snd.blips[2][0], buffer, size);
-    blip_mix_samples(snd.blips[2][1], buffer + 1, size);
-#else
-    blip_mix_samples(snd.blips[1][0], buffer + 1, size);
-    blip_mix_samples(snd.blips[1][1], buffer, size);
-    blip_mix_samples(snd.blips[2][0], buffer + 1, size);
-    blip_mix_samples(snd.blips[2][1], buffer, size);
+#ifdef ALIGN_SND
+    /* return an aligned number of samples if required */
+    size &= ALIGN_SND;
 #endif
+
+    /* resample FM/PSG mixed stream to output buffer */
+    blip_read_samples(snd.blips[0], buffer, size);
   }
 
   /* Audio filtering */
diff --git a/core/system.h b/core/system.h
index 2a5a6d3..e6fe50d 100644
--- a/core/system.h
+++ b/core/system.h
@@ -91,7 +91,7 @@ typedef struct
   int sample_rate;      /* Output Sample rate (8000-48000) */
   double frame_rate;    /* Output Frame rate (usually 50 or 60 frames per second) */
   int enabled;          /* 1= sound emulation is enabled */
-  blip_t* blips[3][2];  /* Blip Buffer resampling */
+  blip_t* blips[3];     /* Stereo Blip Buffers used for resampling ([0] = FM/PSG, [1] = PCM, [2] = CD-DA) */
 } t_snd;