# Source code for diffFx_pytorch.processors.delay.basic

import torch 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
from typing import Dict, List, Tuple, Union
from ..base import ProcessorsBase, EffectParam
from ..base_utils import check_params
from ..core.phase import unwrap_phase

# Basic Delay 
class BasicDelay(ProcessorsBase):
    """Differentiable implementation of a single-tap delay line.
    
    This processor implements a basic digital delay line using frequency-domain processing
    for precise, artifact-free time delays. It creates a single echo of the input signal
    with controllable delay time and mix level.

    Implementation is based on: 
    
    ..  [1] Reiss, Joshua D., and Andrew McPherson. 
            Audio effects: theory, implementation and application. CRC Press, 2014.
    ..  [2] Smith, Julius O. "Digital Audio Effects." 
            https://ccrma.stanford.edu/~jos/fp3/Phase_Unwrapping.html
    
    The delay is implemented in the frequency domain using the time-shift property:

    .. math::
    
        Y(\\omega) = X(\\omega)e^{-j\\omega\\tau}

    where:
        - X(ω) is the input spectrum
        - Y(ω) is the delayed spectrum
        - τ is the delay time in seconds
        - Phase is unwrapped to ensure continuous delay response

    Processing Chain:
        1. Zero-pad input for delay buffer
        2. Convert to frequency domain
        3. Calculate phase shift (z^-N term)
        4. Apply phase shift to spectrum
        5. Convert back to time domain
        6. Mix processed signal with original

    Args:
        sample_rate (int): Audio sample rate in Hz
        param_range (Dict[str, EffectParam], optional): Parameter ranges.

    Parameters Details:
        delay_ms: Echo delay time
            - Range: 10.0 to 1000.0 milliseconds
            - Controls time offset between original and delayed signal
            - Minimum value ensures stable processing
            - Maximum value set for practical buffer sizes
            
        mix: Wet/dry mix ratio
            - Range: 0.0 to 1.0
            - 0.0: Only original signal
            - 1.0: Only delayed signal
            - Linear crossfade between original and delayed signals

    Note:
        - Uses FFT-based delay for precise time shifting
        - Phase unwrapping prevents discontinuities in delay
        - Automatic padding handles all delay times
        - Particularly effective for:
            - Creating simple echoes
            - Adding space to dry signals
            - Basic time-based effects

    Examples:
        Basic DSP Usage:
            >>> # Create a basic delay
            >>> delay = BasicDelay(sample_rate=44100)
            >>> # Process audio
            >>> output = delay(input_audio, dsp_params={
            ...     'delay_ms': 500.0,  # Half-second delay
            ...     'mix': 0.5          # Equal mix of dry and wet
            ... })

        Neural Network Control:
            >>> # 1. Simple parameter prediction
            >>> class DelayController(nn.Module):
            ...     def __init__(self, input_size):
            ...         super().__init__()
            ...         self.net = nn.Sequential(
            ...             nn.Linear(input_size, 32),
            ...             nn.ReLU(),
            ...             nn.Linear(32, 2),  # 2 parameters: delay and mix
            ...             nn.Sigmoid()  # Ensures output is in [0,1] range
            ...         )
            ...     
            ...     def forward(self, x):
            ...         return self.net(x)
            >>> 
            >>> # Process with features
            >>> controller = DelayController(input_size=16)
            >>> features = torch.randn(batch_size, 16)
            >>> norm_params = controller(features)
            >>> output = delay(input_audio, norm_params=norm_params)
    """

    def _register_default_parameters(self):
        """Register delay time and mix parameters.
        
        Sets up two parameters:
            - delay_ms: Delay time in milliseconds (10.0 to 1000.0)
            - mix: Wet/dry mix ratio (0.0 to 1.0)
        """
        self.params = {
            'delay_ms': EffectParam(min_val=10, max_val=1000.0),
            'mix': EffectParam(min_val=0.0, max_val=1.0)
        }

    def process(self, x: torch.Tensor, norm_params: Union[Dict[str, torch.Tensor], None] = None, dsp_params: Union[Dict[str, torch.Tensor], None] = None) -> torch.Tensor:
        """Process input signal through the delay line.
        
        Args:
            x (torch.Tensor): Input audio tensor. Shape: (batch, channels, samples)
            norm_params (Dict[str, torch.Tensor]): Normalized parameters (0 to 1)
                Must contain the following keys:
                - 'delay_ms': Delay time in milliseconds (0 to 1)
                - 'mix': Wet/dry balance (0 to 1)
                Each value should be a tensor of shape (batch_size,)
            dsp_params (Dict[str, Union[float, torch.Tensor]], optional): Direct DSP parameters.
                Can specify delay parameters as:
                - float/int: Single value applied to entire batch
                - 0D tensor: Single value applied to entire batch
                - 1D tensor: Batch of values matching input batch size
                Values are converted to tensors on the input device and
                reshaped to broadcast over (batch, channels, samples).
                If provided, norm_params must be None.

        Returns:
            torch.Tensor: Processed audio tensor of same shape as input
        """
        # Exactly one of the two parameter sources is used.
        check_params(norm_params, dsp_params)
        if norm_params is not None:
            params = self.map_parameters(norm_params)
        else:
            params = dsp_params

        # Normalise every parameter to a (batch, 1, 1) tensor on x's device so
        # that Python scalars, 0-D and 1-D tensors all broadcast the same way.
        delay_ms = torch.as_tensor(params['delay_ms'], device=x.device).reshape(-1, 1, 1)
        mix = torch.as_tensor(params['mix'], device=x.device).reshape(-1, 1, 1)

        num_samples = x.shape[-1]

        # Longest delay in the batch, in whole samples (at least one).
        max_delay_samples = max(
            1,
            int(torch.max(delay_ms) * self.sample_rate / 1000)
        )
        # FFT size: next power of two that holds signal plus delay buffer.
        fft_size = 2 ** int(np.ceil(np.log2(num_samples + max_delay_samples)))
        # Left padding is the delay buffer; right padding fills up to fft_size.
        pad_right = fft_size - (num_samples + max_delay_samples)
        x_padded = torch.nn.functional.pad(x, (max_delay_samples, pad_right))

        # Convert to frequency domain
        X = torch.fft.rfft(x_padded, n=fft_size)

        # Linear phase ramp -2*pi*f*tau (tau in seconds), one ramp per batch item.
        freqs = torch.fft.rfftfreq(fft_size, d=1 / self.sample_rate, device=x.device)
        phase = -2 * np.pi * freqs * delay_ms / 1000
        phase = unwrap_phase(phase, dim=-1)

        # Time-shift property: multiplying by e^{j*phase} delays the signal.
        X_delayed = X * torch.exp(1j * phase).to(X.dtype)

        # Back to the time domain, then drop the delay buffer and the tail
        # padding so the output lines up with (and is as long as) the input.
        x_delayed = torch.fft.irfft(X_delayed, n=fft_size)
        x_delayed = x_delayed[..., max_delay_samples:max_delay_samples + num_samples]

        # Linear dry/wet crossfade.
        return (1 - mix) * x + mix * x_delayed

# Add feedback 
class BasicFeedbackDelay(ProcessorsBase):
    """Differentiable implementation of a feedback delay line.
    
    This processor implements a delay line with feedback and feedforward paths, creating
    multiple decaying echoes. The implementation uses frequency-domain processing and 
    a feedback-feedforward structure for flexible echo patterns.

    Implementation is based on: 
    
    ..  [1] Reiss, Joshua D., and Andrew McPherson. 
            Audio effects: theory, implementation and application. CRC Press, 2014.
    ..  [2] Smith, Julius O. "Digital Audio Effects." 
            https://ccrma.stanford.edu/~jos/fp3/Phase_Unwrapping.html
    
    The transfer function of the system is from [1]:

    .. math::

        H(z) = \\frac{z^{N} + g_{ff} - g_{fb}}{z^{N} - g_{fb}}
             = \\frac{1 + (g_{ff} - g_{fb}) z^{-N}}{1 - g_{fb} z^{-N}}

    where:
        - z^(-N) represents the delay of N samples
        - g_ff is the feedforward gain
        - g_fb is the feedback gain
        - System stability is ensured by limiting |g_fb| < 1

    Processing Chain:
        1. Zero-pad input for delay buffer
        2. Convert to frequency domain
        3. Calculate phase shift (z^-N term)
        4. Apply transfer function H(z)
        5. Convert back to time domain
        6. Mix processed signal with original

    Args:
        sample_rate (int): Audio sample rate in Hz
        param_range (Dict[str, EffectParam], optional): Parameter ranges.

    Parameters Details:
        delay_ms: Echo delay time
            - Range: 0.1 to 1000.0 milliseconds
            - Controls time between successive echoes
            - Determines rhythmic pattern of echoes
            
        mix: Wet/dry mix ratio
            - Range: 0.0 to 1.0
            - 0.0: Only original signal
            - 1.0: Only processed signal
            
        fb_gain: Feedback gain
            - Range: 0.0 to 0.99
            - Controls decay rate of echoes
            - Higher values create longer decay times
            - Clamped to ±0.99 for stability
            
        ff_gain: Feedforward gain
            - Range: 0.0 to 0.99
            - Controls level of direct delayed signal
            - Shapes initial echo response
            - Independent of feedback path

    Note:
        - Uses FFT-based delay for precise time shifting
        - Phase unwrapping prevents discontinuities
        - Automatic padding handles all delay times
        - The filter is applied by multiplying finite FFTs, so the
          convolution is circular: echoes that outlast the padded frame
          wrap around instead of being discarded
        - Particularly effective for:
            - Creating rhythmic echo patterns
            - Adding depth and space
            - Building complex delay textures
        - System stability is maintained by gain limits

    Examples:
        Basic DSP Usage:
            >>> # Create a feedback delay
            >>> delay = BasicFeedbackDelay(sample_rate=44100)
            >>> # Process with rhythmic echoes
            >>> output = delay(input_audio, dsp_params={
            ...     'delay_ms': 250.0,  # Quarter note at 120 BPM
            ...     'mix': 0.5,         # Equal mix
            ...     'fb_gain': 0.7,     # Moderate feedback
            ...     'ff_gain': 0.8      # Strong initial echo
            ... })

        Neural Network Control:
            >>> # 1. Simple parameter prediction
            >>> class FeedbackDelayController(nn.Module):
            ...     def __init__(self, input_size):
            ...         super().__init__()
            ...         self.net = nn.Sequential(
            ...             nn.Linear(input_size, 32),
            ...             nn.ReLU(),
            ...             nn.Linear(32, 4),  # 4 parameters
            ...             nn.Sigmoid()  # Ensures output is in [0,1] range
            ...         )
            ...     
            ...     def forward(self, x):
            ...         return self.net(x)
            >>> 
            >>> # Process with features
            >>> controller = FeedbackDelayController(input_size=16)
            >>> features = torch.randn(batch_size, 16)
            >>> norm_params = controller(features)
            >>> output = delay(input_audio, norm_params=norm_params)
    """

    def _register_default_parameters(self):
        """Register delay, mix, and gain parameters.
        
        Sets up four parameters:
            - delay_ms: Delay time in milliseconds (0.1 to 1000.0)
            - mix: Wet/dry mix ratio (0.0 to 1.0)
            - fb_gain: Feedback gain (0.0 to 0.99)
            - ff_gain: Feedforward gain (0.0 to 0.99)
        """
        self.params = {
            'delay_ms': EffectParam(min_val=0.1, max_val=1000.0),
            'mix': EffectParam(min_val=0, max_val=1.0),
            'fb_gain': EffectParam(min_val=0.0, max_val=0.99),
            'ff_gain': EffectParam(min_val=0.0, max_val=0.99)
        }

    def process(self, x: torch.Tensor, norm_params: Union[Dict[str, torch.Tensor], None] = None , dsp_params: Union[Dict[str, torch.Tensor], None] = None) -> torch.Tensor:
        """Process input signal through the feedback delay line.

        Args:
            x (torch.Tensor): Input audio tensor. Shape: (batch, channels, samples)
            norm_params (Dict[str, torch.Tensor]): Normalized parameters (0 to 1)
                Must contain the following keys:
                - 'delay_ms': Base delay time in milliseconds (0 to 1)
                - 'fb_gain': Amount of signal fed back through delay line (0 to 1)
                - 'ff_gain': Feedforward gain (0 to 1)
                - 'mix': Wet/dry balance (0 to 1)
                Each value should be a tensor of shape (batch_size,)
            dsp_params (Dict[str, Union[float, torch.Tensor]], optional): Direct DSP parameters.
                Can specify feedback delay parameters as:
                - float/int: Single value applied to entire batch
                - 0D tensor: Single value applied to entire batch
                - 1D tensor: Batch of values matching input batch size
                Values are converted to tensors on the input device and
                reshaped to broadcast over (batch, channels, samples).
                If provided, norm_params must be None.

        Returns:
            torch.Tensor: Processed audio tensor of same shape as input
        """
        # Exactly one of the two parameter sources is used.
        check_params(norm_params, dsp_params)
        if norm_params is not None:
            params = self.map_parameters(norm_params)
        else:
            params = dsp_params

        def as_batch(value):
            # Scalars, 0-D and 1-D tensors all become (batch, 1, 1) on x's device.
            return torch.as_tensor(value, device=x.device).reshape(-1, 1, 1)

        delay_ms = as_batch(params['delay_ms'])
        # Keep the pole strictly inside the unit circle regardless of input.
        g_fb = torch.clamp(as_batch(params['fb_gain']), -0.99, 0.99)
        g_ff = as_batch(params['ff_gain'])
        mix = as_batch(params['mix'])

        num_samples = x.shape[-1]

        # Longest delay in the batch, in whole samples (at least one).
        max_delay_samples = max(
            1,
            int(torch.max(delay_ms) * self.sample_rate / 1000)
        )
        # FFT size: next power of two that holds signal plus delay buffer.
        fft_size = 2 ** int(np.ceil(np.log2(num_samples + max_delay_samples)))
        # Left padding is the delay buffer; right padding fills up to fft_size.
        pad_right = fft_size - (num_samples + max_delay_samples)
        x_padded = torch.nn.functional.pad(x, (max_delay_samples, pad_right))

        # Frequency-domain representation and the pure-delay term z^-N.
        X = torch.fft.rfft(x_padded, n=fft_size)
        freqs = torch.fft.rfftfreq(fft_size, d=1 / self.sample_rate, device=x.device)
        phase = -2 * np.pi * freqs * delay_ms / 1000
        phase = unwrap_phase(phase, dim=-1)
        z_inv = torch.exp(1j * phase).to(X.dtype)  # e^{-j*omega*tau} == z^-N

        # H(z) = (z^N + g_FF - g_FB) / (z^N - g_FB), written in powers of z^-N.
        # Substituting z^-N where the formula has z^N yields the time-reversed
        # filter, whose echoes precede the input instead of following it.
        # |g_FB| <= 0.99 keeps the denominator at least 0.01 away from zero, so
        # no epsilon is needed.
        H = (1 + (g_ff - g_fb) * z_inv) / (1 - g_fb * z_inv)
        X_delayed = X * H

        # Back to the time domain, then drop the delay buffer and the tail
        # padding so the output lines up with (and is as long as) the input.
        x_delayed = torch.fft.irfft(X_delayed, n=fft_size)
        x_delayed = x_delayed[..., max_delay_samples:max_delay_samples + num_samples]

        # Linear dry/wet crossfade.
        return (1 - mix) * x + mix * x_delayed

# Identical to the basic delay but the delay_ms is much shorter 
class SlapbackDelay(BasicDelay):
    """Differentiable implementation of a slapback delay effect.
    
    The implementation is based on: 
    
    ..  [1] Reiss, Joshua D., and Andrew McPherson. 
            Audio effects: theory, implementation and application. CRC Press, 2014.
    ..  [2] Smith, Julius O. "Digital Audio Effects." 
            https://ccrma.stanford.edu/~jos/fp3/Phase_Unwrapping.html
    
    This processor extends BasicDelay to create a specialized short delay effect
    that emulates the distinctive "doubling" sound popularized in 1950s recordings.
    The delay time range is specifically restricted to create the characteristic
    slapback effect.

    The processor uses the same frequency-domain implementation as BasicDelay:

    .. math::

        Y(\\omega) = X(\\omega)e^{-j\\omega\\tau}

    where τ is restricted to 40-120ms for the slapback effect.

    Delay Time Ranges:
        - 40-80ms: Tight doubling effect
        - 80-120ms: Subtle ambience
        These ranges are chosen based on psychoacoustic research
        and historical usage in classic recordings.

    Args:
        sample_rate (int): Audio sample rate in Hz
        param_range (Dict[str, EffectParam], optional): Parameter ranges.

    Parameters Details:
        delay_ms: Slapback delay time
            - Range: 40.0 to 120.0 milliseconds
            - Shorter range than BasicDelay for specific effect
            - 40-80ms: Creates tight doubling
            - 80-120ms: Adds natural space
            
        mix: Wet/dry mix ratio
            - Range: 0.0 to 1.0
            - 0.0: Only original signal
            - 1.0: Only delayed signal
            - Typical settings: 0.3-0.5 for classic sound

    Note:
        - Inherits all processing methods from BasicDelay
        - Only modifies parameter ranges for specialized use
        - Particularly effective on:
            - Vocals (creates natural doubling)
            - Electric guitar (adds depth)
            - Snare drums (enhances attack)
        - No feedback to maintain clarity of effect

    Examples:
        Basic DSP Usage:
            >>> # Create a slapback delay
            >>> delay = SlapbackDelay(sample_rate=44100)
            >>> # Process with classic settings
            >>> output = delay(input_audio, dsp_params={
            ...     'delay_ms': 60.0,  # Tight doubling effect
            ...     'mix': 0.4         # Subtle enhancement
            ... })

        Neural Network Control:
            >>> # 1. Simple parameter prediction
            >>> class SlapbackController(nn.Module):
            ...     def __init__(self, input_size):
            ...         super().__init__()
            ...         self.net = nn.Sequential(
            ...             nn.Linear(input_size, 32),
            ...             nn.ReLU(),
            ...             nn.Linear(32, 2),  # 2 parameters: delay and mix
            ...             nn.Sigmoid()  # Ensures output is in [0,1] range
            ...         )
            ...     
            ...     def forward(self, x):
            ...         return self.net(x)
            >>> 
            >>> # Process with features
            >>> controller = SlapbackController(input_size=16)
            >>> features = torch.randn(batch_size, 16)
            >>> norm_params = controller(features)
            >>> output = delay(input_audio, norm_params=norm_params)
    """

    def _register_default_parameters(self):
        """Register parameters with slapback-specific ranges.
        
        Overrides the BasicDelay registration with:
            - delay_ms: 40.0 to 120.0 ms (slapback range)
            - mix: 0.0 to 1.0
            
        Note:
            These ranges are specifically chosen for the
            characteristic slapback doubling effect.
        """
        self.params = {
            'delay_ms': EffectParam(min_val=40.0, max_val=120.0),
            'mix': EffectParam(min_val=0.0, max_val=1.0)
        }