McLeodPitchMethod.java (12005B)
1 /* 2 * _______ _____ _____ _____ 3 * |__ __| | __ \ / ____| __ \ 4 * | | __ _ _ __ ___ ___ ___| | | | (___ | |__) | 5 * | |/ _` | '__/ __|/ _ \/ __| | | |\___ \| ___/ 6 * | | (_| | | \__ \ (_) \__ \ |__| |____) | | 7 * |_|\__,_|_| |___/\___/|___/_____/|_____/|_| 8 * 9 * ------------------------------------------------------------- 10 * 11 * TarsosDSP is developed by Joren Six at IPEM, University Ghent 12 * 13 * ------------------------------------------------------------- 14 * 15 * Info: http://0110.be/tag/TarsosDSP 16 * Github: https://github.com/JorenSix/TarsosDSP 17 * Releases: http://0110.be/releases/TarsosDSP/ 18 * 19 * TarsosDSP includes modified source code by various authors, 20 * for credits and info, see README. 21 * 22 */ 23 24 25 /** 26 */ 27 package be.tarsos.dsp.pitch; 28 29 import java.util.ArrayList; 30 import java.util.List; 31 32 /** 33 * <p> 34 * Implementation of The McLeod Pitch Method (MPM). It is described in the 35 * article <a href= 36 * "http://miracle.otago.ac.nz/tartini/papers/A_Smarter_Way_to_Find_Pitch.pdf" 37 * >A Smarter Way to Find Pitch</a>. According to the article: 38 * </p> 39 * <blockquote> <bufferCount> 40 * <p> 41 * A fast, accurate and robust method for finding the continuous pitch in 42 * monophonic musical sounds. [It uses] a special normalized version of the 43 * Squared Difference Function (SDF) coupled with a peak picking algorithm. 44 * </p> 45 * <p> 46 * MPM runs in real time with a standard 44.1 kHz sampling rate. It operates 47 * without using low-pass filtering so it can work on sound with high harmonic 48 * frequencies such as a violin and it can display pitch changes of one cent 49 * reliably. MPM works well without any post processing to correct the pitch. 50 * </p> 51 * </bufferCount> </blockquote> 52 * <p> 53 * For the moment this implementation uses the inefficient way of calculating 54 * the pitch. It uses <code>O(Ww)</code> with W the window size in samples and w 55 * the desired number of ACF coefficients. The implementation can be optimized 56 * to <code>O((W+w)log(W+w))</code> by using an <abbr 57 * title="Fast Fourier Transform">FFT</abbr> to calculate the <abbr 58 * title="Auto-Correlation Function">ACF</abbr>. But I am still afraid of the 59 * dark magic of the FFT and clinging to the familiar, friendly, laggard time 60 * domain. 61 * </p> 62 * 63 * @author Phillip McLeod 64 * @author Joren Six 65 */ 66 public final class McLeodPitchMethod implements PitchDetector { 67 68 /** 69 * The expected size of an audio buffer (in samples). 70 */ 71 public static final int DEFAULT_BUFFER_SIZE = 1024; 72 73 /** 74 * Overlap defines how much two audio buffers following each other should 75 * overlap (in samples). 75% overlap is advised in the MPM article. 76 */ 77 public static final int DEFAULT_OVERLAP = 768; 78 79 /** 80 * Defines the relative size the chosen peak (pitch) has. 0.93 means: choose 81 * the first peak that is higher than 93% of the highest peak detected. 93% 82 * is the default value used in the Tartini user interface. 83 */ 84 private static final double DEFAULT_CUTOFF = 0.97; 85 /** 86 * For performance reasons, peaks below this cutoff are not even considered. 87 */ 88 private static final double SMALL_CUTOFF = 0.5; 89 90 /** 91 * Pitch annotations below this threshold are considered invalid, they are 92 * ignored. 93 */ 94 private static final double LOWER_PITCH_CUTOFF = 80.0; // Hz 95 96 /** 97 * Defines the relative size the chosen peak (pitch) has. 98 */ 99 private final double cutoff; 100 101 /** 102 * The audio sample rate. Most audio has a sample rate of 44.1kHz. 103 */ 104 private final float sampleRate; 105 106 /** 107 * Contains a normalized square difference function value for each delay 108 * (tau). 109 */ 110 private final float[] nsdf; 111 112 /** 113 * The x and y coordinate of the top of the curve (nsdf). 114 */ 115 private float turningPointX, turningPointY; 116 117 /** 118 * A list with minimum and maximum values of the nsdf curve. 119 */ 120 private final List<Integer> maxPositions = new ArrayList<Integer>(); 121 122 /** 123 * A list of estimates of the period of the signal (in samples). 124 */ 125 private final List<Float> periodEstimates = new ArrayList<Float>(); 126 127 /** 128 * A list of estimates of the amplitudes corresponding with the period 129 * estimates. 130 */ 131 private final List<Float> ampEstimates = new ArrayList<Float>(); 132 133 /** 134 * The result of the pitch detection iteration. 135 */ 136 private final PitchDetectionResult result; 137 138 /** 139 * Initializes the normalized square difference value array and stores the 140 * sample rate. 141 * 142 * @param audioSampleRate 143 * The sample rate of the audio to check. 144 */ 145 public McLeodPitchMethod(final float audioSampleRate) { 146 this(audioSampleRate, DEFAULT_BUFFER_SIZE, DEFAULT_CUTOFF); 147 } 148 149 /** 150 * Create a new pitch detector. 151 * 152 * @param audioSampleRate 153 * The sample rate of the audio. 154 * @param audioBufferSize 155 * The size of one audio buffer 1024 samples is common. 156 */ 157 public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize) { 158 this(audioSampleRate, audioBufferSize, DEFAULT_CUTOFF); 159 } 160 161 /** 162 * Create a new pitch detector. 163 * 164 * @param audioSampleRate 165 * The sample rate of the audio. 166 * @param audioBufferSize 167 * The size of one audio buffer 1024 samples is common. 168 * @param cutoffMPM 169 * The cutoff (similar to the YIN threshold). In the Tartini 170 * paper 0.93 is used. 171 */ 172 public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize, final double cutoffMPM) { 173 this.sampleRate = audioSampleRate; 174 nsdf = new float[audioBufferSize]; 175 this.cutoff = cutoffMPM; 176 result = new PitchDetectionResult(); 177 } 178 179 /** 180 * Implements the normalized square difference function. See section 4 (and 181 * the explanation before) in the MPM article. This calculation can be 182 * optimized by using an FFT. The results should remain the same. 183 * 184 * @param audioBuffer 185 * The buffer with audio information. 186 */ 187 private void normalizedSquareDifference(final float[] audioBuffer) { 188 for (int tau = 0; tau < audioBuffer.length; tau++) { 189 float acf = 0; 190 float divisorM = 0; 191 for (int i = 0; i < audioBuffer.length - tau; i++) { 192 acf += audioBuffer[i] * audioBuffer[i + tau]; 193 divisorM += audioBuffer[i] * audioBuffer[i] + audioBuffer[i + tau] * audioBuffer[i + tau]; 194 } 195 nsdf[tau] = 2 * acf / divisorM; 196 } 197 } 198 199 /* 200 * (non-Javadoc) 201 * 202 * @see be.tarsos.pitch.pure.PurePitchDetector#getPitch(float[]) 203 */ 204 public PitchDetectionResult getPitch(final float[] audioBuffer) { 205 final float pitch; 206 207 // 0. Clear previous results (Is this faster than initializing a list 208 // again and again?) 209 maxPositions.clear(); 210 periodEstimates.clear(); 211 ampEstimates.clear(); 212 213 // 1. Calculate the normalized square difference for each Tau value. 214 normalizedSquareDifference(audioBuffer); 215 // 2. Peak picking time: time to pick some peaks. 216 peakPicking(); 217 218 double highestAmplitude = Double.NEGATIVE_INFINITY; 219 220 for (final Integer tau : maxPositions) { 221 // make sure every annotation has a probability attached 222 highestAmplitude = Math.max(highestAmplitude, nsdf[tau]); 223 224 if (nsdf[tau] > SMALL_CUTOFF) { 225 // calculates turningPointX and Y 226 parabolicInterpolation(tau); 227 // store the turning points 228 ampEstimates.add(turningPointY); 229 periodEstimates.add(turningPointX); 230 // remember the highest amplitude 231 highestAmplitude = Math.max(highestAmplitude, turningPointY); 232 } 233 } 234 235 if (periodEstimates.isEmpty()) { 236 pitch = -1; 237 } else { 238 // use the overall maximum to calculate a cutoff. 239 // The cutoff value is based on the highest value and a relative 240 // threshold. 241 final double actualCutoff = cutoff * highestAmplitude; 242 243 // find first period above or equal to cutoff 244 int periodIndex = 0; 245 for (int i = 0; i < ampEstimates.size(); i++) { 246 if (ampEstimates.get(i) >= actualCutoff) { 247 periodIndex = i; 248 break; 249 } 250 } 251 252 final double period = periodEstimates.get(periodIndex); 253 final float pitchEstimate = (float) (sampleRate / period); 254 if (pitchEstimate > LOWER_PITCH_CUTOFF) { 255 pitch = pitchEstimate; 256 } else { 257 pitch = -1; 258 } 259 260 } 261 result.setProbability((float) highestAmplitude); 262 result.setPitch(pitch); 263 result.setPitched(pitch != -1); 264 265 return result; 266 } 267 268 /** 269 * <p> 270 * Finds the x value corresponding with the peak of a parabola. 271 * </p> 272 * <p> 273 * a,b,c are three samples that follow each other. E.g. a is at 511, b at 274 * 512 and c at 513; f(a), f(b) and f(c) are the normalized square 275 * difference values for those samples; x is the peak of the parabola and is 276 * what we are looking for. Because the samples follow each other 277 * <code>b - a = 1</code> the formula for <a 278 * href="http://fizyka.umk.pl/nrbook/c10-2.pdf">parabolic interpolation</a> 279 * can be simplified a lot. 280 * </p> 281 * <p> 282 * The following ASCII ART shows it a bit more clear, imagine this to be a 283 * bit more curvaceous. 284 * </p> 285 * 286 * <pre> 287 * nsdf(x) 288 * ^ 289 * | 290 * f(x) |------ ^ 291 * f(b) | / |\ 292 * f(a) | / | \ 293 * | / | \ 294 * | / | \ 295 * f(c) | / | \ 296 * |_____________________> x 297 * a x b c 298 * </pre> 299 * 300 * @param tau 301 * The delay tau, b value in the drawing is the tau value. 302 */ 303 private void parabolicInterpolation(final int tau) { 304 final float nsdfa = nsdf[tau - 1]; 305 final float nsdfb = nsdf[tau]; 306 final float nsdfc = nsdf[tau + 1]; 307 final float bValue = tau; 308 final float bottom = nsdfc + nsdfa - 2 * nsdfb; 309 if (bottom == 0.0) { 310 turningPointX = bValue; 311 turningPointY = nsdfb; 312 } else { 313 final float delta = nsdfa - nsdfc; 314 turningPointX = bValue + delta / (2 * bottom); 315 turningPointY = nsdfb - delta * delta / (8 * bottom); 316 } 317 } 318 319 /** 320 * <p> 321 * Implementation based on the GPL'ED code of <a 322 * href="http://tartini.net">Tartini</a> This code can be found in the file 323 * <code>general/mytransforms.cpp</code>. 324 * </p> 325 * <p> 326 * Finds the highest value between each pair of positive zero crossings. 327 * Including the highest value between the last positive zero crossing and 328 * the end (if any). Ignoring the first maximum (which is at zero). In this 329 * diagram the desired values are marked with a + 330 * </p> 331 * 332 * <pre> 333 * f(x) 334 * ^ 335 * | 336 * 1| + 337 * | \ + /\ + /\ 338 * 0| _\____/\____/__\/\__/\____/_______> x 339 * | \ / \ / \/ \ / 340 * -1| \/ \/ \/ 341 * | 342 * </pre> 343 * 344 * @param nsdf 345 * The array to look for maximum values in. It should contain 346 * values between -1 and 1 347 * @author Phillip McLeod 348 */ 349 private void peakPicking() { 350 351 int pos = 0; 352 int curMaxPos = 0; 353 354 // find the first negative zero crossing 355 while (pos < (nsdf.length - 1) / 3 && nsdf[pos] > 0) { 356 pos++; 357 } 358 359 // loop over all the values below zero 360 while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0) { 361 pos++; 362 } 363 364 // can happen if output[0] is NAN 365 if (pos == 0) { 366 pos = 1; 367 } 368 369 while (pos < nsdf.length - 1) { 370 assert nsdf[pos] >= 0; 371 if (nsdf[pos] > nsdf[pos - 1] && nsdf[pos] >= nsdf[pos + 1]) { 372 if (curMaxPos == 0) { 373 // the first max (between zero crossings) 374 curMaxPos = pos; 375 } else if (nsdf[pos] > nsdf[curMaxPos]) { 376 // a higher max (between the zero crossings) 377 curMaxPos = pos; 378 } 379 } 380 pos++; 381 // a negative zero crossing 382 if (pos < nsdf.length - 1 && nsdf[pos] <= 0) { 383 // if there was a maximum add it to the list of maxima 384 if (curMaxPos > 0) { 385 maxPositions.add(curMaxPos); 386 curMaxPos = 0; // clear the maximum position, so we start 387 // looking for a new ones 388 } 389 while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0f) { 390 pos++; // loop over all the values below zero 391 } 392 } 393 } 394 if (curMaxPos > 0) { // if there was a maximum in the last part 395 maxPositions.add(curMaxPos); // add it to the vector of maxima 396 } 397 } 398 }