import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Dict, List, Tuple, Union
from ..base import ProcessorsBase, EffectParam
from ..base_utils import check_params
from ..core.phase import unwrap_phase
class PingPongDelay(ProcessorsBase):
    """Differentiable implementation of a stereo ping-pong delay effect.

    This processor implements a stereo delay effect where echoes alternate between
    left and right channels, creating a "ping-pong" spatial pattern. The implementation
    uses a cross-coupled feedback structure in the frequency domain for precise timing
    and smooth transitions.

    Implementation is based on:

    ..  [1] Reiss, Joshua D., and Andrew McPherson.
            Audio effects: theory, implementation and application. CRC Press, 2014.
    ..  [2] Smith, Julius O. "Digital Audio Effects."
            https://ccrma.stanford.edu/~jos/fp3/Phase_Unwrapping.html

    The system is described by coupled transfer functions:

    .. math::

        H_{11}(z) = \\frac{1}{1 - b_1b_2z^{-2N}}

        H_{12}(z) = \\frac{b_1z^{-N}}{1 - b_1b_2z^{-2N}}

        H_{21}(z) = \\frac{b_2z^{-N}}{1 - b_1b_2z^{-2N}}

        H_{22}(z) = \\frac{b_1b_2z^{-2N}}{1 - b_1b_2z^{-2N}}

    where:
        - z^(-N) represents the base delay
        - b1, b2 are feedback gains for each channel
        - System stability ensured by |b1*b2| < 1

    Processing Chain:
        1. Zero-pad stereo input for delay buffer
        2. Convert to frequency domain
        3. Calculate cross-coupled transfer functions
        4. Apply transfers to each channel
        5. Convert back to time domain
        6. Mix processed signal with original

    Args:
        sample_rate (int): Audio sample rate in Hz
        param_range (Dict[str, EffectParam], optional): Parameter ranges.

    Parameters Details:
        delay_ms: Base delay time
            - Range: 0.1 to 3000.0 milliseconds
            - Controls time between alternating echoes
            - Each bounce takes this amount of time
        feedback_ch1: Left channel feedback gain
            - Range: 0.0 to 0.99
            - Controls decay of left-to-right echoes
            - Higher values create longer decay times
        feedback_ch2: Right channel feedback gain
            - Range: 0.0 to 0.99
            - Controls decay of right-to-left echoes
            - Can differ from ch1 for asymmetric patterns
        mix: Wet/dry mix ratio
            - Range: 0.0 to 1.0
            - 0.0: Only original signal
            - 1.0: Only processed signal

    Note:
        - Uses FFT-based delay for precise time shifting
        - Phase unwrapping prevents discontinuities
        - Automatic padding handles all delay times
        - Particularly effective for:
            - Creating rhythmic spatial patterns
            - Adding stereo width and movement
            - Building complex stereo textures
        - System stability is maintained by gain limits

    Examples:
        Basic DSP Usage:
            >>> # Create a ping-pong delay
            >>> delay = PingPongDelay(sample_rate=44100)
            >>> # Process with rhythmic spatial echoes
            >>> output = delay(input_audio, dsp_params={
            ...     'delay_ms': 250.0,     # Quarter note at 120 BPM
            ...     'feedback_ch1': 0.7,   # Left to right decay
            ...     'feedback_ch2': 0.7,   # Right to left decay
            ...     'mix': 0.5             # Equal mix of dry and wet
            ... })

        Neural Network Control:
            >>> # 1. Simple parameter prediction
            >>> class PingPongController(nn.Module):
            ...     def __init__(self, input_size):
            ...         super().__init__()
            ...         self.net = nn.Sequential(
            ...             nn.Linear(input_size, 32),
            ...             nn.ReLU(),
            ...             nn.Linear(32, 4),  # 4 parameters
            ...             nn.Sigmoid()       # Ensures output is in [0,1] range
            ...         )
            ...
            ...     def forward(self, x):
            ...         return self.net(x)
            >>>
            >>> # Process with features
            >>> controller = PingPongController(input_size=16)
            >>> features = torch.randn(batch_size, 16)
            >>> norm_params = controller(features)
            >>> output = delay(input_audio, norm_params=norm_params)
    """

    def _register_default_parameters(self):
        """Register delay time, feedback, and mix parameters.

        Sets up four parameters:
            - delay_ms: Base delay time (0.1 to 3000.0 ms)
            - feedback_ch1: Left channel feedback (0.0 to 0.99)
            - feedback_ch2: Right channel feedback (0.0 to 0.99)
            - mix: Wet/dry mix ratio (0.0 to 1.0)
        """
        self.params = {
            'delay_ms': EffectParam(min_val=0.1, max_val=3000.0),
            'feedback_ch1': EffectParam(min_val=0.0, max_val=0.99),
            'feedback_ch2': EffectParam(min_val=0.0, max_val=0.99),
            'mix': EffectParam(min_val=0.0, max_val=1.0),
        }

    @staticmethod
    def _to_batch_param(value, reference: torch.Tensor) -> torch.Tensor:
        """Coerce one parameter value to a ``(-1, 1, 1)`` tensor on ``reference``'s device.

        Args:
            value: Python float/int, 0D tensor, or 1D tensor.
            reference (torch.Tensor): Tensor whose device (and, for Python
                scalars, dtype) the result should follow.

        Returns:
            torch.Tensor: ``value`` reshaped to ``(-1, 1, 1)`` so that it
            broadcasts against ``(batch, channels, bins)`` tensors.
        """
        if torch.is_tensor(value):
            # Keep the caller's dtype; only relocate to the input device.
            value = value.to(reference.device)
        else:
            value = torch.as_tensor(value, dtype=reference.dtype, device=reference.device)
        return value.reshape(-1, 1, 1)

    def process(
        self,
        x: torch.Tensor,
        norm_params: Union[Dict[str, torch.Tensor], None] = None,
        dsp_params: Union[Dict[str, Union[float, torch.Tensor]], None] = None,
    ) -> torch.Tensor:
        """Process input signal through the ping-pong delay.

        Args:
            x (torch.Tensor): Input audio tensor. Shape: (batch, 2, samples)
            norm_params (Dict[str, torch.Tensor]): Normalized parameters (0 to 1)
                Must contain the following keys:
                    - 'delay_ms': Base delay time in milliseconds (0 to 1)
                    - 'feedback_ch1': Left channel feedback (0 to 1)
                    - 'feedback_ch2': Right channel feedback (0 to 1)
                    - 'mix': Wet/dry balance (0 to 1)
                Each value should be a tensor of shape (batch_size,)
            dsp_params (Dict[str, Union[float, torch.Tensor]], optional): Direct DSP parameters.
                Can specify ping-pong parameters as:
                    - float/int: Single value applied to entire batch
                    - 0D tensor: Single value applied to entire batch
                    - 1D tensor: Batch of values matching input batch size
                Values are converted to tensors, moved to the input device and
                broadcast over the batch.
                If provided, norm_params must be None.

        Returns:
            torch.Tensor: Processed stereo audio tensor of same shape as input. Shape: (batch, 2, samples)

        Raises:
            AssertionError: If input is not stereo (2 channels)
        """
        check_params(norm_params, dsp_params)

        # Normalized parameters are mapped to DSP ranges; DSP parameters are used as given.
        if norm_params is not None:
            params = self.map_parameters(norm_params)
        else:
            params = dsp_params

        # Shape every parameter as (-1, 1, 1) so it broadcasts over channels and bins.
        delay_ms = self._to_batch_param(params['delay_ms'], x)
        b1 = self._to_batch_param(params['feedback_ch1'], x)
        b2 = self._to_batch_param(params['feedback_ch2'], x)
        mix = self._to_batch_param(params['mix'], x)

        _, num_channels, num_samples = x.shape
        # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
        if num_channels != 2:
            raise AssertionError("Input must be stereo")

        # Longest delay in the batch, in whole samples (at least one sample).
        max_delay_samples = max(
            1,
            int(torch.max(delay_ms) * self.sample_rate / 1000)
        )

        # Calculate FFT size (next power of 2 for efficiency)
        fft_size = 2 ** int(np.ceil(np.log2(num_samples + max_delay_samples)))

        # Pad input signal to FFT size: max_delay_samples zeros in front,
        # the remainder behind.
        pad_right = fft_size - (num_samples + max_delay_samples)
        x_padded = torch.nn.functional.pad(x, (max_delay_samples, pad_right))

        X = torch.fft.rfft(x_padded, n=fft_size)

        # Per-bin phase of a delay of `delay_ms` milliseconds: -2*pi*f*tau.
        freqs = torch.fft.rfftfreq(x_padded.shape[-1], 1 / self.sample_rate).to(x.device)
        phase = -2 * np.pi * freqs * delay_ms / 1000
        # unwrap_phase is a project helper; presumably it removes 2*pi jumps
        # along the frequency axis -- confirm against ..core.phase.
        phase = unwrap_phase(phase, dim=-1)
        z_n = torch.exp(1j * phase).to(X.dtype)

        # Shared denominator of all four transfer functions; eps is a small
        # constant offset added to it.
        eps = 1e-6
        den = 1 - b1 * b2 * z_n * z_n + eps

        # Modified transfer functions for ping-pong behavior
        H11 = 1 / den                      # Direct path for left
        H12 = b1 * z_n / den               # Left to right (single delay)
        H21 = b2 * z_n / den               # Right to left (single delay)
        H22 = b1 * b2 * z_n * z_n / den    # Right to right (double delay through feedback)

        # Channel slices keep their channel axis (0:1, 1:2) so the results
        # can be concatenated back into a stereo spectrum.
        Y1 = H11 * X[:, 0:1] + H12 * X[:, 1:2]
        Y2 = H21 * X[:, 0:1] + H22 * X[:, 1:2]
        Y = torch.cat([Y1, Y2], dim=1)

        # Back to the time domain, then drop the leading pad and trim to the
        # original length.
        y = torch.fft.irfft(Y, n=fft_size)
        y = y[..., max_delay_samples:max_delay_samples + num_samples]

        # Wet/dry mix.
        return (1 - mix) * x + mix * y