Overte C++ Documentation
AudioDynamics.h
1 //
2 // AudioDynamics.h
3 // libraries/audio/src
4 //
5 // Created by Ken Cooke on 5/5/17.
6 // Copyright 2017 High Fidelity, Inc.
7 //
8 
9 //
10 // Inline functions to implement audio dynamics processing
11 //
12 
13 #include <math.h>
14 #include <stdint.h>
15 #include <stddef.h>
16 
17 #ifndef MAX
18 #define MAX(a,b) ((a) > (b) ? (a) : (b))
19 #endif
20 #ifndef MIN
21 #define MIN(a,b) ((a) < (b) ? (a) : (b))
22 #endif
23 
24 #if defined(_MSC_VER)
25 #define FORCEINLINE __forceinline
26 #elif defined(__GNUC__)
27 #define FORCEINLINE inline __attribute__((always_inline))
28 #else
29 #define FORCEINLINE inline
30 #endif
31 
32 #if defined(_MSC_VER)
33 #include <intrin.h>
34 #define MUL64(a,b) __emul((a), (b))
35 #else
36 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
37 #endif
38 
39 #define MULHI(a,b) ((int32_t)(MUL64(a, b) >> 32))
40 #define MULQ31(a,b) ((int32_t)(MUL64(a, b) >> 31))
41 #define MULDIV64(a,b,c) (int32_t)(MUL64(a, b) / (c))
42 
43 #define ADDMOD32(a,b) (int32_t)((uint32_t)(a) + (uint32_t)(b))
44 #define SUBMOD32(a,b) (int32_t)((uint32_t)(a) - (uint32_t)(b))
45 
46 //
47 // on x86 architecture, assume that SSE2 is present
48 //
49 #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
50 
51 #include <xmmintrin.h>
52 // convert float to int using round-to-nearest
53 FORCEINLINE static int32_t floatToInt(float x) {
54  return _mm_cvt_ss2si(_mm_set_ss(x));
55 }
56 
57 #else
58 
59 // convert float to int using round-to-nearest
60 FORCEINLINE static int32_t floatToInt(float x) {
61  x += (x < 0.0f ? -0.5f : 0.5f); // round
62  return (int32_t)x;
63 }
64 
65 #endif // _M_IX86
66 
67 static const double FIXQ31 = 2147483648.0; // convert float to Q31
68 static const double DB_TO_LOG2 = 0.16609640474436813; // convert dB to log2
69 
70 // convert dB to amplitude
71 FORCEINLINE static double dBToGain(double dB) {
72  return pow(10.0, dB / 20.0);
73 }
74 
75 // convert milliseconds to first-order time constant
76 FORCEINLINE static int32_t msToTc(double ms, double sampleRate) {
77  double tc = exp(-1000.0 / (ms * sampleRate));
78  return (int32_t)(FIXQ31 * tc); // Q31
79 }
80 
81 // log2 domain values are Q26
82 static const int LOG2_INTBITS = 5;
83 static const int LOG2_FRACBITS = 31 - LOG2_INTBITS;
84 
85 // log2 domain headroom bits above 0dB
86 static const int LOG2_HEADROOM = 15;
87 
88 // log2 domain offsets so error < 0
89 static const int32_t LOG2_BIAS = 347;
90 static const int32_t EXP2_BIAS = 64;
91 
92 //
93 // P(x) = log2(1+x) for x=[0,1]
94 // scaled by 1, 0.5, 0.25
95 //
96 // |error| < 347 ulp, smooth
97 //
98 static const int LOG2_TABBITS = 4;
99 static const int32_t log2Table[1 << LOG2_TABBITS][3] = {
100  { -0x56dfe26d, 0x5c46daff, 0x00000000 },
101  { -0x4d397571, 0x5bae58e7, 0x00025a75 },
102  { -0x4518f84b, 0x5aabcac4, 0x000a62db },
103  { -0x3e3075ec, 0x596168c0, 0x0019d0e6 },
104  { -0x384486e9, 0x57e769c7, 0x00316109 },
105  { -0x332742ba, 0x564f1461, 0x00513776 },
106  { -0x2eb4bad4, 0x54a4cdfe, 0x00791de2 },
107  { -0x2ad07c6c, 0x52f18320, 0x00a8aa46 },
108  { -0x2763c4d6, 0x513ba123, 0x00df574c },
109  { -0x245c319b, 0x4f87c5c4, 0x011c9399 },
110  { -0x21aac79f, 0x4dd93bef, 0x015fcb52 },
111  { -0x1f433872, 0x4c325584, 0x01a86ddc },
112  { -0x1d1b54b4, 0x4a94ac6e, 0x01f5f13e },
113  { -0x1b2a9f81, 0x4901524f, 0x0247d3f2 },
114  { -0x1969fa57, 0x4778f3a7, 0x029d9dbf },
115  { -0x17d36370, 0x45fbf1e8, 0x02f6dfe8 },
116 };
117 
118 //
119 // P(x) = exp2(x) for x=[0,1]
120 // scaled by 2, 1, 0.5
121 // Uses exp2(-x) = exp2(1-x)/2
122 //
123 // |error| < 1387 ulp, smooth
124 //
125 static const int EXP2_TABBITS = 4;
126 static const int32_t exp2Table[1 << EXP2_TABBITS][3] = {
127  { 0x3ed838c8, 0x58b574b7, 0x40000000 },
128  { 0x41a0821c, 0x5888db8f, 0x4000b2b7 },
129  { 0x4488548d, 0x582bcbc6, 0x40039be1 },
130  { 0x4791158a, 0x579a1128, 0x400a71ae },
131  { 0x4abc3a53, 0x56cf3089, 0x4017212e },
132  { 0x4e0b48af, 0x55c66396, 0x402bd31b },
133  { 0x517fd7a7, 0x547a946d, 0x404af0ec },
134  { 0x551b9049, 0x52e658f9, 0x40772a57 },
135  { 0x58e02e75, 0x5103ee08, 0x40b37b31 },
136  { 0x5ccf81b1, 0x4ecd321f, 0x410331b5 },
137  { 0x60eb6e09, 0x4c3ba007, 0x4169f548 },
138  { 0x6535ecf9, 0x49484909, 0x41ebcdaf },
139  { 0x69b10e5b, 0x45ebcede, 0x428d2acd },
140  { 0x6e5ef96c, 0x421e5d48, 0x4352ece7 },
141  { 0x7341edcb, 0x3dd7a354, 0x44426d7b },
142  { 0x785c4499, 0x390ecc3a, 0x456188bd },
143 };
144 
145 static const int IEEE754_FABS_MASK = 0x7fffffff;
146 static const int IEEE754_MANT_BITS = 23;
147 static const int IEEE754_EXPN_BITS = 8;
148 static const int IEEE754_EXPN_BIAS = 127;
149 
150 //
151 // Peak detection and -log2(x) for float input (mono)
152 // x < 2^(31-LOG2_HEADROOM) returns 0x7fffffff
153 // x > 2^LOG2_HEADROOM returns 0
154 //
155 FORCEINLINE static int32_t peaklog2(float* input) {
156 
157  // float as integer bits
158  uint32_t u = *(uint32_t*)input;
159 
160  // absolute value
161  uint32_t peak = u & IEEE754_FABS_MASK;
162 
163  // split into e and x - 1.0
164  int32_t e = IEEE754_EXPN_BIAS - (peak >> IEEE754_MANT_BITS) + LOG2_HEADROOM;
165  int32_t x = (peak << IEEE754_EXPN_BITS) & 0x7fffffff;
166 
167  // saturate when e > 31 or e < 0
168  if ((uint32_t)e > 31) {
169  return 0x7fffffff & ~(e >> 31);
170  }
171 
172  int k = x >> (31 - LOG2_TABBITS);
173 
174  // polynomial for log2(1+x) over x=[0,1]
175  int32_t c0 = log2Table[k][0];
176  int32_t c1 = log2Table[k][1];
177  int32_t c2 = log2Table[k][2];
178 
179  c1 += MULHI(c0, x);
180  c2 += MULHI(c1, x);
181 
182  // reconstruct result in Q26
183  return (e << LOG2_FRACBITS) - (c2 >> 3);
184 }
185 
186 //
187 // Peak detection and -log2(x) for float input (stereo)
188 // x < 2^(31-LOG2_HEADROOM) returns 0x7fffffff
189 // x > 2^LOG2_HEADROOM returns 0
190 //
191 FORCEINLINE static int32_t peaklog2(float* input0, float* input1) {
192 
193  // float as integer bits
194  uint32_t u0 = *(uint32_t*)input0;
195  uint32_t u1 = *(uint32_t*)input1;
196 
197  // max absolute value
198  u0 &= IEEE754_FABS_MASK;
199  u1 &= IEEE754_FABS_MASK;
200  uint32_t peak = MAX(u0, u1);
201 
202  // split into e and x - 1.0
203  int32_t e = IEEE754_EXPN_BIAS - (peak >> IEEE754_MANT_BITS) + LOG2_HEADROOM;
204  int32_t x = (peak << IEEE754_EXPN_BITS) & 0x7fffffff;
205 
206  // saturate when e > 31 or e < 0
207  if ((uint32_t)e > 31) {
208  return 0x7fffffff & ~(e >> 31);
209  }
210 
211  int k = x >> (31 - LOG2_TABBITS);
212 
213  // polynomial for log2(1+x) over x=[0,1]
214  int32_t c0 = log2Table[k][0];
215  int32_t c1 = log2Table[k][1];
216  int32_t c2 = log2Table[k][2];
217 
218  c1 += MULHI(c0, x);
219  c2 += MULHI(c1, x);
220 
221  // reconstruct result in Q26
222  return (e << LOG2_FRACBITS) - (c2 >> 3);
223 }
224 
225 //
226 // Peak detection and -log2(x) for float input (quad)
227 // x < 2^(31-LOG2_HEADROOM) returns 0x7fffffff
228 // x > 2^LOG2_HEADROOM returns 0
229 //
230 FORCEINLINE static int32_t peaklog2(float* input0, float* input1, float* input2, float* input3) {
231 
232  // float as integer bits
233  uint32_t u0 = *(uint32_t*)input0;
234  uint32_t u1 = *(uint32_t*)input1;
235  uint32_t u2 = *(uint32_t*)input2;
236  uint32_t u3 = *(uint32_t*)input3;
237 
238  // max absolute value
239  u0 &= IEEE754_FABS_MASK;
240  u1 &= IEEE754_FABS_MASK;
241  u2 &= IEEE754_FABS_MASK;
242  u3 &= IEEE754_FABS_MASK;
243  uint32_t peak = MAX(MAX(u0, u1), MAX(u2, u3));
244 
245  // split into e and x - 1.0
246  int32_t e = IEEE754_EXPN_BIAS - (peak >> IEEE754_MANT_BITS) + LOG2_HEADROOM;
247  int32_t x = (peak << IEEE754_EXPN_BITS) & 0x7fffffff;
248 
249  // saturate when e > 31 or e < 0
250  if ((uint32_t)e > 31) {
251  return 0x7fffffff & ~(e >> 31);
252  }
253 
254  int k = x >> (31 - LOG2_TABBITS);
255 
256  // polynomial for log2(1+x) over x=[0,1]
257  int32_t c0 = log2Table[k][0];
258  int32_t c1 = log2Table[k][1];
259  int32_t c2 = log2Table[k][2];
260 
261  c1 += MULHI(c0, x);
262  c2 += MULHI(c1, x);
263 
264  // reconstruct result in Q26
265  return (e << LOG2_FRACBITS) - (c2 >> 3);
266 }
267 
268 //
269 // Count Leading Zeros
270 // Emulates the CLZ (ARM) and LZCNT (x86) instruction
271 //
272 FORCEINLINE static int CLZ(uint32_t u) {
273 
274  if (u == 0) {
275  return 32;
276  }
277 
278  int e = 0;
279  if (u < 0x00010000) {
280  u <<= 16;
281  e += 16;
282  }
283  if (u < 0x01000000) {
284  u <<= 8;
285  e += 8;
286  }
287  if (u < 0x10000000) {
288  u <<= 4;
289  e += 4;
290  }
291  if (u < 0x40000000) {
292  u <<= 2;
293  e += 2;
294  }
295  if (u < 0x80000000) {
296  e += 1;
297  }
298  return e;
299 }
300 
301 //
302 // Compute -log2(x) for x=[0,1] in Q31, result in Q26
303 // x <= 0 returns 0x7fffffff
304 //
305 FORCEINLINE static int32_t fixlog2(int32_t x) {
306 
307  if (x <= 0) {
308  return 0x7fffffff;
309  }
310 
311  // split into e and x - 1.0
312  uint32_t u = (uint32_t)x;
313  int e = CLZ(u);
314  x = (u << e) & 0x7fffffff;
315 
316  int k = x >> (31 - LOG2_TABBITS);
317 
318  // polynomial for log2(1+x) over x=[0,1]
319  int32_t c0 = log2Table[k][0];
320  int32_t c1 = log2Table[k][1];
321  int32_t c2 = log2Table[k][2];
322 
323  c1 += MULHI(c0, x);
324  c2 += MULHI(c1, x);
325 
326  // reconstruct result in Q26
327  return (e << LOG2_FRACBITS) - (c2 >> 3);
328 }
329 
330 //
331 // Compute exp2(-x) for x=[0,32] in Q26, result in Q31
332 // x <= 0 returns 0x7fffffff
333 //
334 FORCEINLINE static int32_t fixexp2(int32_t x) {
335 
336  if (x <= 0) {
337  return 0x7fffffff;
338  }
339 
340  // split into e and 1.0 - x
341  uint32_t u = (uint32_t)x;
342  int e = u >> LOG2_FRACBITS;
343  x = ~(u << LOG2_INTBITS) & 0x7fffffff;
344 
345  int k = x >> (31 - EXP2_TABBITS);
346 
347  // polynomial for exp2(x)
348  int32_t c0 = exp2Table[k][0];
349  int32_t c1 = exp2Table[k][1];
350  int32_t c2 = exp2Table[k][2];
351 
352  c1 += MULHI(c0, x);
353  c2 += MULHI(c1, x);
354 
355  // reconstruct result in Q31
356  return c2 >> e;
357 }
358 
359 // fast TPDF dither in [-1.0f, 1.0f]
360 FORCEINLINE static float dither() {
361  static uint32_t rz = 0;
362  rz = rz * 69069 + 1;
363  int32_t r0 = rz & 0xffff;
364  int32_t r1 = rz >> 16;
365  return (r0 - r1) * (1/65536.0f);
366 }
367 
368 //
369 // Min-hold lowpass filter
370 //
371 // Bandlimits the gain control signal to greatly reduce the modulation distortion,
372 // while still reaching the peak attenuation after exactly N-1 samples of delay.
373 // N completely determines the attack time.
374 //
375 template<int N, int CIC1, int CIC2>
376 class MinFilterT {
377 
378  static_assert((N & (N - 1)) == 0, "N must be a power of 2");
379  static_assert((CIC1 - 1) + (CIC2 - 1) == (N - 1), "Total CIC delay must be N-1");
380 
381  int32_t _buffer[2*N] = {}; // shared FIFO
382  size_t _index = 0;
383 
384  int32_t _acc1 = 0; // CIC1 integrator
385  int32_t _acc2 = 0; // CIC2 integrator
386 
387 public:
388  MinFilterT() {
389 
390  // fill history
391  for (size_t n = 0; n < N-1; n++) {
392  process(0x7fffffff);
393  }
394  }
395 
396  int32_t process(int32_t x) {
397 
398  const size_t MASK = 2*N - 1; // buffer wrap
399  size_t i = _index;
400 
401  // Fast min-hold using a running-min filter. Finds the peak (min) value
402  // in the sliding window of N-1 samples, using only log2(N) comparisons.
403  // Hold time of N-1 samples exactly cancels the step response of FIR filter.
404 
405  for (size_t n = 1; n < N; n <<= 1) {
406 
407  _buffer[i] = x;
408  i = (i + n) & MASK;
409  x = MIN(x, _buffer[i]);
410  }
411 
412  // Fast FIR attack/lowpass filter using a 2-stage CIC filter.
413  // The step response reaches final value after N-1 samples.
414  // NOTE: CIC integrators intentionally overflow, using modulo arithmetic.
415  // See E. B. Hogenauer, "An economical class of digital filters for decimation and interpolation"
416 
417  const int32_t CICGAIN = 0xffffffff / (CIC1 * CIC2); // Q32
418  x = MULHI(x, CICGAIN);
419 
420  _buffer[i] = _acc1;
421  _acc1 = ADDMOD32(_acc1, x); // integrator
422  i = (i + CIC1 - 1) & MASK;
423  x = SUBMOD32(_acc1, _buffer[i]); // comb
424 
425  _buffer[i] = _acc2;
426  _acc2 = ADDMOD32(_acc2, x); // integrator
427  i = (i + CIC2 - 1) & MASK;
428  x = SUBMOD32(_acc2, _buffer[i]); // comb
429 
430  _index = (i + 1) & MASK; // skip unused tap
431  return x;
432  }
433 };
434 
435 //
436 // Max-hold lowpass filter
437 //
438 // Bandlimits the gain control signal to greatly reduce the modulation distortion,
439 // while still reaching the peak attenuation after exactly N-1 samples of delay.
440 // N completely determines the attack time.
441 //
442 template<int N, int CIC1, int CIC2>
443 class MaxFilterT {
444 
445  static_assert((N & (N - 1)) == 0, "N must be a power of 2");
446  static_assert((CIC1 - 1) + (CIC2 - 1) == (N - 1), "Total CIC delay must be N-1");
447 
448  int32_t _buffer[2*N] = {}; // shared FIFO
449  size_t _index = 0;
450 
451  int32_t _acc1 = 0; // CIC1 integrator
452  int32_t _acc2 = 0; // CIC2 integrator
453 
454 public:
455  MaxFilterT() {
456 
457  // fill history
458  for (size_t n = 0; n < N-1; n++) {
459  process(0);
460  }
461  }
462 
463  int32_t process(int32_t x) {
464 
465  const size_t MASK = 2*N - 1; // buffer wrap
466  size_t i = _index;
467 
468  // Fast max-hold using a running-max filter. Finds the peak (max) value
469  // in the sliding window of N-1 samples, using only log2(N) comparisons.
470  // Hold time of N-1 samples exactly cancels the step response of FIR filter.
471 
472  for (size_t n = 1; n < N; n <<= 1) {
473 
474  _buffer[i] = x;
475  i = (i + n) & MASK;
476  x = MAX(x, _buffer[i]);
477  }
478 
479  // Fast FIR attack/lowpass filter using a 2-stage CIC filter.
480  // The step response reaches final value after N-1 samples.
481  // NOTE: CIC integrators intentionally overflow, using modulo arithmetic.
482  // See E. B. Hogenauer, "An economical class of digital filters for decimation and interpolation"
483 
484  const int32_t CICGAIN = 0xffffffff / (CIC1 * CIC2); // Q32
485  x = MULHI(x, CICGAIN);
486 
487  _buffer[i] = _acc1;
488  _acc1 = ADDMOD32(_acc1, x); // integrator
489  i = (i + CIC1 - 1) & MASK;
490  x = SUBMOD32(_acc1, _buffer[i]); // comb
491 
492  _buffer[i] = _acc2;
493  _acc2 = ADDMOD32(_acc2, x); // integrator
494  i = (i + CIC2 - 1) & MASK;
495  x = SUBMOD32(_acc2, _buffer[i]); // comb
496 
497  _index = (i + 1) & MASK; // skip unused tap
498  return x;
499  }
500 };
501 
502 //
503 // Specializations that define the optimum lowpass filter for each length.
504 //
505 template<int N> class MinFilter;
506 template<> class MinFilter< 16> : public MinFilterT< 16, 7, 10> {};
507 template<> class MinFilter< 32> : public MinFilterT< 32, 14, 19> {};
508 template<> class MinFilter< 64> : public MinFilterT< 64, 27, 38> {};
509 template<> class MinFilter<128> : public MinFilterT<128, 53, 76> {};
510 template<> class MinFilter<256> : public MinFilterT<256, 106, 151> {};
511 
512 template<int N> class MaxFilter;
513 template<> class MaxFilter< 16> : public MaxFilterT< 16, 7, 10> {};
514 template<> class MaxFilter< 32> : public MaxFilterT< 32, 14, 19> {};
515 template<> class MaxFilter< 64> : public MaxFilterT< 64, 27, 38> {};
516 template<> class MaxFilter<128> : public MaxFilterT<128, 53, 76> {};
517 template<> class MaxFilter<256> : public MaxFilterT<256, 106, 151> {};
518 
519 //
520 // N-1 sample delay (mono)
521 //
522 template<int N, typename T = float>
523 class MonoDelay {
524 
525  static_assert((N & (N - 1)) == 0, "N must be a power of 2");
526 
527  T _buffer[N] = {};
528  size_t _index = 0;
529 
530 public:
531  void process(T& x) {
532 
533  const size_t MASK = N - 1; // buffer wrap
534  size_t i = _index;
535 
536  _buffer[i] = x;
537 
538  i = (i + (N - 1)) & MASK;
539 
540  x = _buffer[i];
541 
542  _index = i;
543  }
544 };
545 
546 //
547 // N-1 sample delay (stereo)
548 //
549 template<int N, typename T = float>
550 class StereoDelay {
551 
552  static_assert((N & (N - 1)) == 0, "N must be a power of 2");
553 
554  T _buffer[2*N] = {};
555  size_t _index = 0;
556 
557 public:
558  void process(T& x0, T& x1) {
559 
560  const size_t MASK = 2*N - 1; // buffer wrap
561  size_t i = _index;
562 
563  _buffer[i+0] = x0;
564  _buffer[i+1] = x1;
565 
566  i = (i + 2*(N - 1)) & MASK;
567 
568  x0 = _buffer[i+0];
569  x1 = _buffer[i+1];
570 
571  _index = i;
572  }
573 };
574 
575 //
576 // N-1 sample delay (quad)
577 //
578 template<int N, typename T = float>
579 class QuadDelay {
580 
581  static_assert((N & (N - 1)) == 0, "N must be a power of 2");
582 
583  T _buffer[4*N] = {};
584  size_t _index = 0;
585 
586 public:
587  void process(T& x0, T& x1, T& x2, T& x3) {
588 
589  const size_t MASK = 4*N - 1; // buffer wrap
590  size_t i = _index;
591 
592  _buffer[i+0] = x0;
593  _buffer[i+1] = x1;
594  _buffer[i+2] = x2;
595  _buffer[i+3] = x3;
596 
597  i = (i + 4*(N - 1)) & MASK;
598 
599  x0 = _buffer[i+0];
600  x1 = _buffer[i+1];
601  x2 = _buffer[i+2];
602  x3 = _buffer[i+3];
603 
604  _index = i;
605  }
606 };