// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.

// Dolby Pro Logic 2 decoder from ffdshow-tryout
//  * Copyright 2001 Anders Johansson ajh@atri.curtin.edu.au
//  * Copyright (c) 2004-2006 Milan Cutka
//  * based on mplayer HRTF plugin by ylai

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <functional>
#include <string.h>
#include <vector>

#include "AudioCommon/DPL2Decoder.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_SQRT1_2
#define M_SQRT1_2 0.70710678118654752440
#endif

static int olddelay = -1;
static unsigned int oldfreq = 0;
static unsigned int dlbuflen;
static int cyc_pos;
static float l_fwr, r_fwr, lpr_fwr, lmr_fwr;
static std::vector<float> fwrbuf_l, fwrbuf_r;
static float adapt_l_gain, adapt_r_gain, adapt_lpr_gain, adapt_lmr_gain;
static std::vector<float> lf, rf, lr, rr, cf, cr;
static float LFE_buf[256];
static unsigned int lfe_pos;
static float *filter_coefs_lfe;
static unsigned int len125;

template<class T, class _ftype_t>
static _ftype_t DotProduct(int count,const T *buf,const _ftype_t *coefficients)
{
	float sum0=0,sum1=0,sum2=0,sum3=0;
	for (;count>=4;buf+=4,coefficients+=4,count-=4)
	{
		sum0+=buf[0]*coefficients[0];
		sum1+=buf[1]*coefficients[1];
		sum2+=buf[2]*coefficients[2];
		sum3+=buf[3]*coefficients[3];
	}

	while (count--)
		sum0+= *buf++ * *coefficients++;

	return sum0+sum1+sum2+sum3;
}

template<class T>
static T FIRFilter(const T *buf, int pos, int len, int count, const float *coefficients)
{
	int count1, count2;

	if (pos >= count)
	{
		pos -= count;
		count1 = count; count2 = 0;
	}
	else
	{
		count2 = pos;
		count1 = count - pos;
		pos = len - count1;
	}

	// high part of window
	const T *ptr = &buf[pos];

	float r1=DotProduct(count1,ptr,coefficients);coefficients+=count1;
	float r2=DotProduct(count2,buf,coefficients);
	return T(r1+r2);
}

/*
// Hamming
//                        2*pi*k
// w(k) = 0.54 - 0.46*cos(------), where 0 <= k < N
//                         N-1
//
// n window length
// w buffer for the window parameters
*/
static void Hamming(int n, float* w)
{
	float k = float(2*M_PI/((float)(n-1))); // 2*pi/(N-1)

	// Calculate window coefficients
	for (int i = 0; i < n; i++)
		*w++ = float(0.54 - 0.46*cos(k*(float)i));
}

/******************************************************************************
*  FIR filter design
******************************************************************************/

/* Design FIR filter using the Window method

n     filter length must be odd for HP and BS filters
w     buffer for the filter taps (must be n long)
fc    cutoff frequencies (1 for LP and HP, 2 for BP and BS)
0 < fc < 1 where 1 <=> Fs/2
flags window and filter type as defined in filter.h
variables are ored together: i.e. LP|HAMMING will give a
low pass filter designed using a hamming window
opt   beta constant used only when designing using kaiser windows

returns 0 if OK, -1 if fail
*/
static float* DesignFIR(unsigned int *n, float* fc, float opt)
{
	unsigned int  o   = *n & 1;              // Indicator for odd filter length
	unsigned int  end = ((*n + 1) >> 1) - o; // Loop end

	float k1 = 2 * float(M_PI);              // 2*pi*fc1
	float k2 = 0.5f * (float)(1 - o);        // Constant used if the filter has even length
	float g  = 0.0f;                         // Gain
	float t1;                                // Temporary variables
	float fc1;                               // Cutoff frequencies

	// Sanity check
	if (*n==0) return nullptr;
	MathUtil::Clamp(&fc[0],float(0.001),float(1));

	float *w=(float*)calloc(sizeof(float),*n);

	// Get window coefficients
	Hamming(*n,w);

	fc1=*fc;
	// Cutoff frequency must be < 0.5 where 0.5 <=> Fs/2
	fc1 = ((fc1 <= 1.0) && (fc1 > 0.0)) ? fc1/2 : 0.25f;
	k1 *= fc1;

	// Low pass filter

	// If the filter length is odd, there is one point which is exactly
	// in the middle. The value at this point is 2*fCutoff*sin(x)/x,
	// where x is zero. To make sure nothing strange happens, we set this
	// value separately.
	if (o)
	{
		w[end] = fc1 * w[end] * 2.0f;
		g=w[end];
	}

	// Create filter
	for (u32 i = 0; i < end; i++)
	{
		t1 = (float)(i+1) - k2;
		w[end-i-1] = w[*n-end+i] = float(w[end-i-1] * sin(k1 * t1)/(M_PI * t1)); // Sinc
		g += 2*w[end-i-1]; // Total gain in filter
	}


	// Normalize gain
	g=1/g;
	for (u32 i = 0; i < *n; i++)
		w[i] *= g;

	return w;
}

static void OnSeek()
{
	l_fwr = r_fwr = lpr_fwr = lmr_fwr = 0;
	std::fill(fwrbuf_l.begin(), fwrbuf_l.end(), 0.0f);
	std::fill(fwrbuf_r.begin(), fwrbuf_r.end(), 0.0f);
	adapt_l_gain = adapt_r_gain = adapt_lpr_gain = adapt_lmr_gain = 0;
	std::fill(lf.begin(), lf.end(), 0.0f);
	std::fill(rf.begin(), rf.end(), 0.0f);
	std::fill(lr.begin(), lr.end(), 0.0f);
	std::fill(rr.begin(), rr.end(), 0.0f);
	std::fill(cf.begin(), cf.end(), 0.0f);
	std::fill(cr.begin(), cr.end(), 0.0f);
	lfe_pos = 0;
	memset(LFE_buf, 0, sizeof(LFE_buf));
}

static void Done()
{
	OnSeek();

	if (filter_coefs_lfe)
	{
		free(filter_coefs_lfe);
	}

	filter_coefs_lfe = nullptr;
}

static float* CalculateCoefficients125HzLowpass(int rate)
{
	len125 = 256;
	float f = 125.0f / (rate / 2);
	float *coeffs = DesignFIR(&len125, &f, 0);
	static const float M3_01DB = 0.7071067812f;
	for (unsigned int i = 0; i < len125; i++)
	{
		coeffs[i] *= M3_01DB;
	}
	return coeffs;
}

static float PassiveLock(float x)
{
	static const float MATAGCLOCK = 0.2f;  /* AGC range (around 1) where the matrix behaves passively */
	const float x1 = x - 1;
	const float ax1s = fabs(x - 1) * (1.0f / MATAGCLOCK);
	return x1 - x1 / (1 + ax1s * ax1s) + 1;
}

static void MatrixDecode(const float *in, const int k, const int il,
	const int ir, bool decode_rear,
	const int _dlbuflen,
	float _l_fwr, float _r_fwr,
	float _lpr_fwr, float _lmr_fwr,
	float *_adapt_l_gain, float *_adapt_r_gain,
	float *_adapt_lpr_gain, float *_adapt_lmr_gain,
	float *_lf, float *_rf, float *_lr,
	float *_rr, float *_cf)
{
	static const float M9_03DB = 0.3535533906f;
	static const float MATAGCTRIG = 8.0f;   /* (Fuzzy) AGC trigger */
	static const float MATAGCDECAY = 1.0f;  /* AGC baseline decay rate (1/samp.) */
	static const float MATCOMPGAIN = 0.37f; /* Cross talk compensation gain,  0.50 - 0.55 is full cancellation. */

	const int kr = (k + olddelay) % _dlbuflen;
	float l_gain = (_l_fwr + _r_fwr) / (1 + _l_fwr + _l_fwr);
	float r_gain = (_l_fwr + _r_fwr) / (1 + _r_fwr + _r_fwr);
	// The 2nd axis has strong gain fluctuations, and therefore require
	// limits.  The factor corresponds to the 1 / amplification of (Lt
	// - Rt) when (Lt, Rt) is strongly correlated. (e.g. during
	// dialogues).  It should be bigger than -12 dB to prevent
	// distortion.
	float lmr_lim_fwr = _lmr_fwr > M9_03DB * _lpr_fwr ? _lmr_fwr : M9_03DB * _lpr_fwr;
	float lpr_gain = (_lpr_fwr + lmr_lim_fwr) / (1 + _lpr_fwr + _lpr_fwr);
	float lmr_gain = (_lpr_fwr + lmr_lim_fwr) / (1 + lmr_lim_fwr + lmr_lim_fwr);
	float lmr_unlim_gain = (_lpr_fwr + _lmr_fwr) / (1 + _lmr_fwr + _lmr_fwr);
	float lpr, lmr;
	float l_agc, r_agc, lpr_agc, lmr_agc;
	float f, d_gain, c_gain, c_agc_cfk;

	/*** AXIS NO. 1: (Lt, Rt) -> (C, Ls, Rs) ***/
	/* AGC adaption */
	d_gain = (fabs(l_gain - *_adapt_l_gain) + fabs(r_gain - *_adapt_r_gain)) * 0.5f;
	f = d_gain * (1.0f / MATAGCTRIG);
	f = MATAGCDECAY - MATAGCDECAY / (1 + f * f);
	*_adapt_l_gain = (1 - f) * *_adapt_l_gain + f * l_gain;
	*_adapt_r_gain = (1 - f) * *_adapt_r_gain + f * r_gain;
	/* Matrix */
	l_agc = in[il] * PassiveLock(*_adapt_l_gain);
	r_agc = in[ir] * PassiveLock(*_adapt_r_gain);
	_cf[k] = (l_agc + r_agc) * (float)M_SQRT1_2;
	if (decode_rear)
	{
		_lr[kr] = _rr[kr] = (l_agc - r_agc) * (float)M_SQRT1_2;
		// Stereo rear channel is steered with the same AGC steering as
		// the decoding matrix. Note this requires a fast updating AGC
		// at the order of 20 ms (which is the case here).
		_lr[kr] *= (_l_fwr + _l_fwr) / (1 + _l_fwr + _r_fwr);
		_rr[kr] *= (_r_fwr + _r_fwr) / (1 + _l_fwr + _r_fwr);
	}

	/*** AXIS NO. 2: (Lt + Rt, Lt - Rt) -> (L, R) ***/
	lpr = (in[il] + in[ir]) * (float)M_SQRT1_2;
	lmr = (in[il] - in[ir]) * (float)M_SQRT1_2;
	/* AGC adaption */
	d_gain = fabs(lmr_unlim_gain - *_adapt_lmr_gain);
	f = d_gain * (1.0f / MATAGCTRIG);
	f = MATAGCDECAY - MATAGCDECAY / (1 + f * f);
	*_adapt_lpr_gain = (1 - f) * *_adapt_lpr_gain + f * lpr_gain;
	*_adapt_lmr_gain = (1 - f) * *_adapt_lmr_gain + f * lmr_gain;
	/* Matrix */
	lpr_agc = lpr * PassiveLock(*_adapt_lpr_gain);
	lmr_agc = lmr * PassiveLock(*_adapt_lmr_gain);
	_lf[k] = (lpr_agc + lmr_agc) * (float)M_SQRT1_2;
	_rf[k] = (lpr_agc - lmr_agc) * (float)M_SQRT1_2;

	/*** CENTER FRONT CANCELLATION ***/
	// A heuristic approach exploits that Lt + Rt gain contains the
	// information about Lt, Rt correlation.  This effectively reshapes
	// the front and rear "cones" to concentrate Lt + Rt to C and
	// introduce Lt - Rt in L, R.
	/* 0.67677 is the empirical lower bound for lpr_gain. */
	c_gain = 8 * (*_adapt_lpr_gain - 0.67677f);
	c_gain = c_gain > 0 ? c_gain : 0;
	// c_gain should not be too high, not even reaching full
	// cancellation (~ 0.50 - 0.55 at current AGC implementation), or
	// the center will sound too narrow. */
	c_gain = MATCOMPGAIN / (1 + c_gain * c_gain);
	c_agc_cfk = c_gain * _cf[k];
	_lf[k] -= c_agc_cfk;
	_rf[k] -= c_agc_cfk;
	_cf[k] += c_agc_cfk + c_agc_cfk;
}

void DPL2Decode(float *samples, int numsamples, float *out)
{
	static const unsigned int FWRDURATION = 240; // FWR average duration (samples)
	static const int cfg_delay = 0;
	static const unsigned int fmt_freq = 48000;
	static const unsigned int fmt_nchannels = 2; // input channels

	int cur = 0;

	if (olddelay != cfg_delay || oldfreq != fmt_freq)
	{
		Done();
		olddelay = cfg_delay;
		oldfreq = fmt_freq;
		dlbuflen = std::max(FWRDURATION, (fmt_freq * cfg_delay / 1000)); //+(len7000-1);
		cyc_pos = dlbuflen - 1;
		fwrbuf_l.resize(dlbuflen);
		fwrbuf_r.resize(dlbuflen);
		lf.resize(dlbuflen);
		rf.resize(dlbuflen);
		lr.resize(dlbuflen);
		rr.resize(dlbuflen);
		cf.resize(dlbuflen);
		cr.resize(dlbuflen);
		filter_coefs_lfe = CalculateCoefficients125HzLowpass(fmt_freq);
		lfe_pos = 0;
		memset(LFE_buf, 0, sizeof(LFE_buf));
	}

	float *in = samples; // Input audio data
	float *end = in + numsamples * fmt_nchannels; // Loop end

	while (in < end)
	{
		const int k = cyc_pos;

		const int fwr_pos = (k + FWRDURATION) % dlbuflen;
		/* Update the full wave rectified total amplitude */
		/* Input matrix decoder */
		l_fwr += fabs(in[0]) - fabs(fwrbuf_l[fwr_pos]);
		r_fwr += fabs(in[1]) - fabs(fwrbuf_r[fwr_pos]);
		lpr_fwr += fabs(in[0] + in[1]) - fabs(fwrbuf_l[fwr_pos] + fwrbuf_r[fwr_pos]);
		lmr_fwr += fabs(in[0] - in[1]) - fabs(fwrbuf_l[fwr_pos] - fwrbuf_r[fwr_pos]);

		/* Matrix encoded 2 channel sources */
		fwrbuf_l[k] = in[0];
		fwrbuf_r[k] = in[1];
		MatrixDecode(in, k, 0, 1, true, dlbuflen,
			l_fwr, r_fwr,
			lpr_fwr, lmr_fwr,
			&adapt_l_gain, &adapt_r_gain,
			&adapt_lpr_gain, &adapt_lmr_gain,
			&lf[0], &rf[0], &lr[0], &rr[0], &cf[0]);

		out[cur + 0] = lf[k];
		out[cur + 1] = rf[k];
		out[cur + 2] = cf[k];
		LFE_buf[lfe_pos] = (out[0] + out[1]) / 2;
		out[cur + 3] = FIRFilter(LFE_buf, lfe_pos, len125, len125, filter_coefs_lfe);
		lfe_pos++;
		if (lfe_pos == len125)
		{
			lfe_pos = 0;
		}
		out[cur + 4] = lr[k];
		out[cur + 5] = rr[k];
		// Next sample...
		in += 2;
		cur += 6;
		cyc_pos--;
		if (cyc_pos < 0)
		{
			cyc_pos += dlbuflen;
		}
	}
}

void DPL2Reset()
{
	olddelay = -1;
	oldfreq = 0;
	filter_coefs_lfe = nullptr;
}