BeatRootSpectralFluxOnsetDetector.java (9265B)
/*
*      _______                       _____   _____ _____
*     |__   __|                     |  __ \ / ____|  __ \
*        | | __ _ _ __ ___  ___  ___| |  | | (___ | |__) |
*        | |/ _` | '__/ __|/ _ \/ __| |  | |\___ \|  ___/
*        | | (_| | |  \__ \ (_) \__ \ |__| |____) | |
*        |_|\__,_|_|  |___/\___/|___/_____/|_____/|_|
*
* -------------------------------------------------------------
*
* TarsosDSP is developed by Joren Six at IPEM, University Ghent
*
* -------------------------------------------------------------
*
*  Info: http://0110.be/tag/TarsosDSP
*  Github: https://github.com/JorenSix/TarsosDSP
*  Releases: http://0110.be/releases/TarsosDSP/
*
*  TarsosDSP includes modified source code by various authors,
*  for credits and info, see README.
*
*/

package be.tarsos.dsp.onsets;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;

import be.tarsos.dsp.AudioDispatcher;
import be.tarsos.dsp.AudioEvent;
import be.tarsos.dsp.AudioProcessor;
import be.tarsos.dsp.beatroot.Peaks;
import be.tarsos.dsp.util.fft.FFT;
import be.tarsos.dsp.util.fft.ScaledHammingWindow;

/**
 * <p>
 * A non real-time spectral flux onset detection method, as implemented in the
 * BeatRoot system of Centre for Digital Music, Queen Mary, University of
 * London.
 * </p>
 *
 * <p>
 * This onset detection function does NOT work in real-time. It accumulates a
 * spectral-flux value per audio frame while the stream is processed, and only
 * detects onsets during a post-processing step (see
 * {@link #processingFinished()}), which performs peak picking on the whole
 * accumulated flux function at once.
 * </p>
 *
 * <p>
 * NOTE(review): the constructor prints a warning recommending the
 * ComplexOnsetDetector instead, and hard-codes a negative stream duration
 * (see the FIXME there), so the pre-allocated per-frame arrays are far too
 * small for real streams — this class looks unusable as-is for streaming
 * input; confirm before relying on it.
 * </p>
 *
 * @author Joren Six
 * @author Simon Dixon
 */
public class BeatRootSpectralFluxOnsetDetector implements AudioProcessor, OnsetDetector {
	/** RMS amplitude of the current frame (half of {@link AudioEvent#getRMS()}; see process()). */
	private double frameRMS;

	/** The number of overlapping frames of audio data which have been read. */
	private int frameCount;

	/** Long term average frame energy (in frequency domain representation).
	 *  Updated in process() as an exponential moving average of frameRMS. */
	private double ltAverage;

	/** The real part of the data for the in-place FFT computation.
	 * Since input data is real, this initially contains the input data.
	 * After the FFT call it holds the magnitude spectrum; each frame it is
	 * swapped with {@link #prevFrame} instead of being copied. */
	private float[] reBuffer;

	/** The imaginary part of the data for the in-place FFT computation.
	 * Since input data is real, this initially contains zeros
	 * (re-zeroed at the start of every process() call). */
	private float[] imBuffer;

	/** Spectral flux onset detection function, indexed by frame. */
	private double[] spectralFlux;

	/** A mapping function for mapping FFT bins to final frequency bins.
	 * The mapping is linear (1-1) until the resolution reaches 2 points per
	 * semitone, then logarithmic with a semitone resolution. e.g. for
	 * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
	 * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
	 * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
	 * 83), where all energy above note 127 is mapped into the final bin. */
	private int[] freqMap;

	/** The number of entries in <code>freqMap</code>. Note that the length of
	 * the array is greater, because its size is not known at creation time. */
	private int freqMapSize;

	/** The magnitude spectrum of the most recent frame.
	 * Used for calculating the spectral flux. */
	private float[] prevFrame;

	/** The magnitude spectrum of the current frame, after mapping through
	 * {@link #freqMap} into comparison bins. */
	private double[] newFrame;

	/** The magnitude spectra of all frames, used for plotting the spectrogram. */
	private double[][] frames;

	/** The RMS energy of all frames, sampled energyOversampleFactor times per hop. */
	private double[] energy;

	/** Spacing of audio frames in samples (see <code>hopTime</code>) */
	protected int hopSize;

	/** The size of an FFT frame in samples (see <code>fftTime</code>) */
	protected int fftSize;

	/** Total number of audio frames if known, or -1 for live or compressed input.
	 * NOTE(review): currently derived from a hard-coded negative duration in
	 * the constructor (see FIXME there), so its value is bogus. */
	private int totalFrames;

	/** RMS frame energy below this value results in the frame being set to zero,
	 * so that normalization does not have undesired side-effects.
	 * NOTE(review): public static and mutable — shared across all instances. */
	public static double silenceThreshold = 0.0004;

	/** For dynamic range compression, this value is added to the log magnitude
	 * in each frequency bin and any remaining negative values are then set to zero.
	 * NOTE(review): public static and mutable — shared across all instances.
	 */
	public static double rangeThreshold = 10;

	/** Determines method of normalization. Values can be:<ul>
	 * <li>0: no normalization</li>
	 * <li>1: normalization by current frame energy</li>
	 * <li>2: normalization by exponential average of frame energy</li>
	 * </ul>
	 * NOTE(review): public static and mutable — shared across all instances.
	 */
	public static int normaliseMode = 2;

	/** Ratio between rate of sampling the signal energy (for the amplitude envelope) and the hop size */
	public static int energyOversampleFactor = 2;

	/** Receives the detected onsets after post-processing; defaults to PrintOnsetHandler. */
	private OnsetHandler handler;

	/** Spacing of audio frames in seconds (hopSize / sampleRate). */
	private double hopTime;

	/** FFT helper configured with a scaled Hamming window. */
	private final FFT fft;

	/**
	 * Creates a new onset detector for the given dispatcher's audio format.
	 *
	 * @param d       the dispatcher providing the audio format (sample rate)
	 * @param fftSize the size of an FFT frame, in samples
	 * @param hopSize the spacing between successive frames, in samples
	 */
	public BeatRootSpectralFluxOnsetDetector(AudioDispatcher d,int fftSize, int hopSize){

		this.hopSize = hopSize;
		// int / float promotes to float division: hop duration in seconds.
		this.hopTime = hopSize/d.getFormat().getSampleRate();
		this.fftSize = fftSize;

		System.err.println("Please use the ComplexOnset detector: BeatRootSpectralFluxOnsetDetector does currenlty not support streaming");
		//no overlap
		//FIXME:
		// NOTE(review): the stream duration is not known here, so a bogus
		// negative value is used; with e.g. hopSize=512 this makes
		// totalFrames tiny (3) and process() will overrun these arrays after
		// a few frames. This is the acknowledged streaming limitation above.
		int durationInFrames = -1000;
		totalFrames = (int)(durationInFrames / hopSize) + 4;
		energy = new double[totalFrames*energyOversampleFactor];
		spectralFlux = new double[totalFrames];

		reBuffer = new float[fftSize/2];
		imBuffer = new float[fftSize/2];
		prevFrame = new float[fftSize/2];

		// Builds freqMap and sets freqMapSize as a side effect; must run
		// before newFrame/frames are allocated below.
		makeFreqMap(fftSize, d.getFormat().getSampleRate());

		newFrame = new double[freqMapSize];
		frames = new double[totalFrames][freqMapSize];
		handler = new PrintOnsetHandler();
		fft = new FFT(fftSize,new ScaledHammingWindow());
	}

	/**
	 * Processes one frame of audio: computes the magnitude spectrum, adds the
	 * positive spectral differences to the flux function for this frame,
	 * accumulates the mapped spectrogram frame and sampled energy envelope,
	 * and updates the long-term average used for normalization.
	 *
	 * @param audioEvent the current (overlapping) audio frame
	 * @return always true, so the dispatcher keeps streaming
	 */
	@Override
	public boolean process(AudioEvent audioEvent) {
		frameRMS = audioEvent.getRMS()/2.0;

		// Clone: the FFT call below destroys its input buffer in place.
		float[] audioBuffer = audioEvent.getFloatBuffer().clone();

		// Real input: imaginary part starts at zero each frame.
		Arrays.fill(imBuffer, 0);
		// After this call reBuffer holds the magnitude spectrum of the frame.
		fft.powerPhaseFFTBeatRootOnset(audioBuffer, reBuffer, imBuffer);
		Arrays.fill(newFrame, 0);

		// Spectral flux: sum of positive magnitude increases per bin,
		// while simultaneously folding bins into the freqMap comparison bins.
		double flux = 0;
		for (int i = 0; i < fftSize/2; i++) {
			if (reBuffer[i] > prevFrame[i])
				flux += reBuffer[i] - prevFrame[i];
			newFrame[freqMap[i]] += reBuffer[i];
		}
		spectralFlux[frameCount] = flux;
		for (int i = 0; i < freqMapSize; i++)
			frames[frameCount][i] = newFrame[i];

		// Sample the log energy envelope energyOversampleFactor times per hop,
		// wrapping the read index around the frame buffer.
		// NOTE(review): index starts at hopSize and wraps at fftSize; the
		// 13.816 offset is -log(1e-6), clamping silence to 0 — TODO confirm
		// against upstream BeatRoot.
		int sz = (fftSize - hopSize) / energyOversampleFactor;
		int index = hopSize;
		for (int j = 0; j < energyOversampleFactor; j++) {
			double newEnergy = 0;
			for (int i = 0; i < sz; i++) {
				newEnergy += audioBuffer[index] * audioBuffer[index];
				if (++index == fftSize)
					index = 0;
			}
			energy[frameCount * energyOversampleFactor + j] =
				newEnergy / sz <= 1e-6? 0: Math.log(newEnergy / sz) + 13.816;
		}

		// Long-term average: no smoothing for the first 100 frames, then the
		// decay ramps linearly up to 0.99 at frame 200.
		double decay = frameCount >= 200? 0.99:
				(frameCount < 100? 0: (frameCount - 100) / 100.0);
		if (ltAverage == 0)
			ltAverage = frameRMS;
		else
			ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);

		// Near-silent frames are zeroed so normalization below cannot blow
		// them up; otherwise normalize and apply log dynamic-range compression.
		if (frameRMS <= silenceThreshold)
			for (int i = 0; i < freqMapSize; i++)
				frames[frameCount][i] = 0;
		else {
			if (normaliseMode == 1)
				for (int i = 0; i < freqMapSize; i++)
					frames[frameCount][i] /= frameRMS;
			else if (normaliseMode == 2)
				for (int i = 0; i < freqMapSize; i++)
					frames[frameCount][i] /= ltAverage;
			for (int i = 0; i < freqMapSize; i++) {
				frames[frameCount][i] = Math.log(frames[frameCount][i]) + rangeThreshold;
				if (frames[frameCount][i] < 0)
					frames[frameCount][i] = 0;
			}
		}

		// Swap (not copy) the spectrum buffers: the current spectrum becomes
		// prevFrame and the old prevFrame is recycled as next frame's reBuffer.
		float[] tmp = prevFrame;
		prevFrame = reBuffer;
		reBuffer = tmp;
		frameCount++;
		return true;
	}

	/**
	 * Creates a map of FFT frequency bins to comparison bins.
	 * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
	 * one to one. Where the spacing is greater than 0.5 semitones, the FFT
	 * energy is mapped into semitone-wide bins. No scaling is performed; that
	 * is the energy is summed into the comparison bins. See also
	 * processFrame()
	 *
	 * @param fftSize    FFT frame size in samples (freqMap gets fftSize/2+1 entries)
	 * @param sampleRate sampling rate in Hz, used to compute the bin width
	 */
	protected void makeFreqMap(int fftSize, float sampleRate) {
		freqMap = new int[fftSize/2+1];
		double binWidth = sampleRate / fftSize;
		// Bin index where bin spacing reaches half a semitone.
		int crossoverBin = (int)(2 / (Math.pow(2, 1/12.0) - 1));
		int crossoverMidi = (int)Math.round(Math.log(crossoverBin*binWidth/440)/
				Math.log(2) * 12 + 69);
		// freq = 440 * Math.pow(2, (midi-69)/12.0) / binWidth;
		int i = 0;
		// NOTE(review): `freqMap[i++] = i` stores i+1 at index i (the index is
		// evaluated before the RHS in Java), so the "1-1" region is shifted by
		// one bin. This matches the upstream BeatRoot port — confirm before
		// changing.
		while (i <= crossoverBin)
			freqMap[i++] = i;
		// Above the crossover, collapse bins into semitone-wide MIDI bins;
		// everything above MIDI note 127 goes into the final bin.
		while (i <= fftSize/2) {
			double midi = Math.log(i*binWidth/440) / Math.log(2) * 12 + 69;
			if (midi > 127)
				midi = 127;
			freqMap[i++] = crossoverBin + (int)Math.round(midi) - crossoverMidi;
		}
		// Number of distinct comparison bins = highest mapped index + 1.
		freqMapSize = freqMap[i-1] + 1;
	} // makeFreqMap()

	/**
	 * Runs BeatRoot peak picking over the accumulated spectral flux and hands
	 * every detected peak to the registered {@link OnsetHandler}.
	 *
	 * @param p1 first peak-picking threshold parameter (passed to Peaks.findPeaks)
	 * @param p2 second peak-picking threshold parameter (passed to Peaks.findPeaks)
	 */
	private void findOnsets(double p1, double p2){
		// Minimum peak spacing of 60ms, expressed in frames.
		LinkedList<Integer> peaks = Peaks.findPeaks(spectralFlux, (int)Math.round(0.06 / hopTime), p1, p2, true);
		Iterator<Integer> it = peaks.iterator();

		// Salience is reported relative to the global flux minimum.
		double minSalience = Peaks.min(spectralFlux);
		for (int i = 0; i < peaks.size(); i++) {
			int index = it.next();
			double time = index * hopTime;
			double salience = spectralFlux[index] - minSalience;
			handler.handleOnset(time,salience);
		}
	}

	/**
	 * Replaces the onset handler that receives the detected onsets.
	 *
	 * @param handler the new handler
	 */
	public void setHandler(OnsetHandler handler) {
		this.handler = handler;
	}

	/**
	 * Post-processing step: normalizes the accumulated flux function and
	 * performs the actual onset detection (this detector is not real-time).
	 */
	@Override
	public void processingFinished() {
		// Peak-picking thresholds as used by BeatRoot.
		double p1 = 0.35;
		double p2 = 0.84;
		Peaks.normalise(spectralFlux);
		findOnsets(p1, p2);
	}
}