Blame - src/third_party/opus/silk/VAD.c - cobalt

blob: 541e5056ff697ac3f918c5b0f5e4d13133fb4671 [file] [log] [blame]

Andrew Top	8b6b16e	2018-07-25 17:44:41 -0700	[diff] [blame]	1	/***********************************************************************
				2	Copyright (c) 2006-2011, Skype Limited. All rights reserved.
				3	Redistribution and use in source and binary forms, with or without
				4	modification, are permitted provided that the following conditions
				5	are met:
				6	- Redistributions of source code must retain the above copyright notice,
				7	this list of conditions and the following disclaimer.
				8	- Redistributions in binary form must reproduce the above copyright
				9	notice, this list of conditions and the following disclaimer in the
				10	documentation and/or other materials provided with the distribution.
				11	- Neither the name of Internet Society, IETF or IETF Trust, nor the
				12	names of specific contributors, may be used to endorse or promote
				13	products derived from this software without specific prior written
				14	permission.
				15	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				16	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				17	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				18	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				19	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				20	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				21	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				22	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				23	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				24	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				25	POSSIBILITY OF SUCH DAMAGE.
				26	***********************************************************************/
				27
				28	#ifdef HAVE_CONFIG_H
				29	#include "config.h"
				30	#endif
				31
				32	#include "main.h"
				33	#include "stack_alloc.h"
				34
				35	/* Silk VAD noise level estimation */
				36	# if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
				37	static OPUS_INLINE void silk_VAD_GetNoiseLevels(
				38	const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
				39	silk_VAD_state psSilk_VAD / I/O Pointer to Silk VAD state */
				40	);
				41	#endif
				42
				43	/**********************************/
				44	/* Initialization of the Silk VAD */
				45	/**********************************/
				46	opus_int silk_VAD_Init( /* O Return value, 0 if success */
				47	silk_VAD_state psSilk_VAD / I/O Pointer to Silk VAD state */
				48	)
				49	{
				50	opus_int b, ret = 0;
				51
				52	/* reset state memory */
				53	silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
				54
				55	/* init noise levels */
				56	/* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
				57	for( b = 0; b < VAD_N_BANDS; b++ ) {
				58	psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
				59	}
				60
				61	/* Initialize state */
				62	for( b = 0; b < VAD_N_BANDS; b++ ) {
				63	psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
				64	psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
				65	}
				66	psSilk_VAD->counter = 15;
				67
				68	/* init smoothed energy-to-noise ratio*/
				69	for( b = 0; b < VAD_N_BANDS; b++ ) {
				70	psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */
				71	}
				72
				73	return( ret );
				74	}
				75
				76	/* Weighting factors for tilt measure */
				77	static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
				78
				79	/***************************************/
				80	/* Get the speech activity level in Q8 */
				81	/***************************************/
				82	opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success */
				83	silk_encoder_state psEncC, / I/O Encoder state */
				84	const opus_int16 pIn[] /* I PCM input */
				85	)
				86	{
				87	opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
				88	opus_int decimated_framelength1, decimated_framelength2;
				89	opus_int decimated_framelength;
				90	opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
				91	opus_int32 sumSquared, smooth_coef_Q16;
				92	opus_int16 HPstateTmp;
				93	VARDECL( opus_int16, X );
				94	opus_int32 Xnrg[ VAD_N_BANDS ];
				95	opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
				96	opus_int32 speech_nrg, x_tmp;
				97	opus_int X_offset[ VAD_N_BANDS ];
				98	opus_int ret = 0;
				99	silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
				100	SAVE_STACK;
				101
				102	/* Safety checks */
				103	silk_assert( VAD_N_BANDS == 4 );
				104	celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
				105	celt_assert( psEncC->frame_length <= 512 );
				106	celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
				107
				108	/***********************/
				109	/* Filter and Decimate */
				110	/***********************/
				111	decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
				112	decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
				113	decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
				114	/* Decimate into 4 bands:
				115	0 L 3L L 3L 5L
				116	- -- - -- --
				117	8 8 2 4 4
				118
				119	[0-1 kHz\| temp. \|1-2 kHz\| 2-4 kHz \| 4-8 kHz \|
				120
				121	They're arranged to allow the minimal ( frame_length / 4 ) extra
				122	scratch space during the downsampling process */
				123	X_offset[ 0 ] = 0;
				124	X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
				125	X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
				126	X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
				127	ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
				128
				129	/* 0-8 kHz to 0-4 kHz and 4-8 kHz */
				130	silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ],
				131	X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
				132
				133	/* 0-4 kHz to 0-2 kHz and 2-4 kHz */
				134	silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
				135	X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
				136
				137	/* 0-2 kHz to 0-1 kHz and 1-2 kHz */
				138	silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
				139	X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
				140
				141	/*********************************************/
				142	/* HP filter on lowest band (differentiator) */
				143	/*********************************************/
				144	X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
				145	HPstateTmp = X[ decimated_framelength - 1 ];
				146	for( i = decimated_framelength - 1; i > 0; i-- ) {
				147	X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 );
				148	X[ i ] -= X[ i - 1 ];
				149	}
				150	X[ 0 ] -= psSilk_VAD->HPstate;
				151	psSilk_VAD->HPstate = HPstateTmp;
				152
				153	/*************************************/
				154	/* Calculate the energy in each band */
				155	/*************************************/
				156	for( b = 0; b < VAD_N_BANDS; b++ ) {
				157	/* Find the decimated framelength in the non-uniformly divided bands */
				158	decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
				159
				160	/* Split length into subframe lengths */
				161	dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
				162	dec_subframe_offset = 0;
				163
				164	/* Compute energy per sub-frame */
				165	/* initialize with summed energy of last subframe */
				166	Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
				167	for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
				168	sumSquared = 0;
				169	for( i = 0; i < dec_subframe_length; i++ ) {
				170	/* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
				171	/* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
				172	x_tmp = silk_RSHIFT(
				173	X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
				174	sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
				175
				176	/* Safety check */
				177	silk_assert( sumSquared >= 0 );
				178	}
				179
				180	/* Add/saturate summed energy of current subframe */
				181	if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
				182	Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
				183	} else {
				184	/* Look-ahead subframe */
				185	Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
				186	}
				187
				188	dec_subframe_offset += dec_subframe_length;
				189	}
				190	psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
				191	}
				192
				193	/********************/
				194	/* Noise estimation */
				195	/********************/
				196	silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
				197
				198	/***********************************************/
				199	/* Signal-plus-noise to noise ratio estimation */
				200	/***********************************************/
				201	sumSquared = 0;
				202	input_tilt = 0;
				203	for( b = 0; b < VAD_N_BANDS; b++ ) {
				204	speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
				205	if( speech_nrg > 0 ) {
				206	/* Divide, with sufficient resolution */
				207	if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
				208	NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
				209	} else {
				210	NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
				211	}
				212
				213	/* Convert to log domain */
				214	SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
				215
				216	/* Sum-of-squares */
				217	sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
				218
				219	/* Tilt measure */
				220	if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
				221	/* Scale down SNR value for small subband speech energies */
				222	SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
				223	}
				224	input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
				225	} else {
				226	NrgToNoiseRatio_Q8[ b ] = 256;
				227	}
				228	}
				229
				230	/* Mean-of-squares */
				231	sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
				232
				233	/* Root-mean-square approximation, scale to dBs, and write to output pointer */
				234	pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
				235
				236	/*********************************/
				237	/* Speech Probability Estimation */
				238	/*********************************/
				239	SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
				240
				241	/**************************/
				242	/* Frequency Tilt Measure */
				243	/**************************/
				244	psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
				245
				246	/**************************************************/
				247	/* Scale the sigmoid output based on power levels */
				248	/**************************************************/
				249	speech_nrg = 0;
				250	for( b = 0; b < VAD_N_BANDS; b++ ) {
				251	/* Accumulate signal-without-noise energies, higher frequency bands have more weight */
				252	speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
				253	}
				254
				255	if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
				256	speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
				257	}
				258	/* Power scaling */
				259	if( speech_nrg <= 0 ) {
				260	SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
				261	} else if( speech_nrg < 16384 ) {
				262	speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
				263
				264	/* square-root */
				265	speech_nrg = silk_SQRT_APPROX( speech_nrg );
				266	SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
				267	}
				268
				269	/* Copy the resulting speech activity in Q8 */
				270	psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
				271
				272	/***********************************/
				273	/* Energy Level and SNR estimation */
				274	/***********************************/
				275	/* Smoothing coefficient */
				276	smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
				277
				278	if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
				279	smooth_coef_Q16 >>= 1;
				280	}
				281
				282	for( b = 0; b < VAD_N_BANDS; b++ ) {
				283	/* compute smoothed energy-to-noise ratio per band */
				284	psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
				285	NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
				286
				287	/* signal to noise ratio in dB per band */
				288	SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
				289	/* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
				290	psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
				291	}
				292
				293	RESTORE_STACK;
				294	return( ret );
				295	}
				296
				297	/**************************/
				298	/* Noise level estimation */
				299	/**************************/
				300	# if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
				301	static OPUS_INLINE
				302	#endif
				303	void silk_VAD_GetNoiseLevels(
				304	const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
				305	silk_VAD_state psSilk_VAD / I/O Pointer to Silk VAD state */
				306	)
				307	{
				308	opus_int k;
				309	opus_int32 nl, nrg, inv_nrg;
				310	opus_int coef, min_coef;
				311
				312	/* Initially faster smoothing */
				313	if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
				314	min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
				315	} else {
				316	min_coef = 0;
				317	}
				318
				319	for( k = 0; k < VAD_N_BANDS; k++ ) {
				320	/* Get old noise level estimate for current band */
				321	nl = psSilk_VAD->NL[ k ];
				322	silk_assert( nl >= 0 );
				323
				324	/* Add bias */
				325	nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
				326	silk_assert( nrg > 0 );
				327
				328	/* Invert energies */
				329	inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
				330	silk_assert( inv_nrg >= 0 );
				331
				332	/* Less update when subband energy is high */
				333	if( nrg > silk_LSHIFT( nl, 3 ) ) {
				334	coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
				335	} else if( nrg < nl ) {
				336	coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
				337	} else {
				338	coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
				339	}
				340
				341	/* Initially faster smoothing */
				342	coef = silk_max_int( coef, min_coef );
				343
				344	/* Smooth inverse energies */
				345	psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
				346	silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
				347
				348	/* Compute noise level by inverting again */
				349	nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
				350	silk_assert( nl >= 0 );
				351
				352	/* Limit noise levels (guarantee 7 bits of head room) */
				353	nl = silk_min( nl, 0x00FFFFFF );
				354
				355	/* Store as part of state */
				356	psSilk_VAD->NL[ k ] = nl;
				357	}
				358
				359	/* Increment frame counter */
				360	psSilk_VAD->counter++;
				361	}