plectrum

Plectrum: instrument tuner for Android
Log | Files | Refs | README | LICENSE

McLeodPitchMethod.java (12005B)


      1 /*
      2 *      _______                       _____   _____ _____  
      3 *     |__   __|                     |  __ \ / ____|  __ \ 
      4 *        | | __ _ _ __ ___  ___  ___| |  | | (___ | |__) |
      5 *        | |/ _` | '__/ __|/ _ \/ __| |  | |\___ \|  ___/ 
      6 *        | | (_| | |  \__ \ (_) \__ \ |__| |____) | |     
      7 *        |_|\__,_|_|  |___/\___/|___/_____/|_____/|_|     
      8 *                                                         
      9 * -------------------------------------------------------------
     10 *
     11 * TarsosDSP is developed by Joren Six at IPEM, University Ghent
     12 *  
     13 * -------------------------------------------------------------
     14 *
     15 *  Info: http://0110.be/tag/TarsosDSP
     16 *  Github: https://github.com/JorenSix/TarsosDSP
     17 *  Releases: http://0110.be/releases/TarsosDSP/
     18 *  
     19 *  TarsosDSP includes modified source code by various authors,
     20 *  for credits and info, see README.
     21 * 
     22 */
     23 
     24 
     25 /**
     26  */
     27 package be.tarsos.dsp.pitch;
     28 
     29 import java.util.ArrayList;
     30 import java.util.List;
     31 
     32 /**
     33  * <p>
     34  * Implementation of The McLeod Pitch Method (MPM). It is described in the
     35  * article <a href=
     36  * "http://miracle.otago.ac.nz/tartini/papers/A_Smarter_Way_to_Find_Pitch.pdf"
     37  * >A Smarter Way to Find Pitch</a>. According to the article:
     38  * </p>
     39  * <blockquote> <bufferCount>
     40  * <p>
     41  * A fast, accurate and robust method for finding the continuous pitch in
     42  * monophonic musical sounds. [It uses] a special normalized version of the
     43  * Squared Difference Function (SDF) coupled with a peak picking algorithm.
     44  * </p>
     45  * <p>
     46  * MPM runs in real time with a standard 44.1 kHz sampling rate. It operates
     47  * without using low-pass filtering so it can work on sound with high harmonic
     48  * frequencies such as a violin and it can display pitch changes of one cent
     49  * reliably. MPM works well without any post processing to correct the pitch.
     50  * </p>
     51  * </bufferCount> </blockquote>
     52  * <p>
     53  * For the moment this implementation uses the inefficient way of calculating
     54  * the pitch. It uses <code>O(Ww)</code> with W the window size in samples and w
     55  * the desired number of ACF coefficients. The implementation can be optimized
     56  * to <code>O((W+w)log(W+w))</code> by using an <abbr
     57  * title="Fast Fourier Transform">FFT</abbr> to calculate the <abbr
     58  * title="Auto-Correlation Function">ACF</abbr>. But I am still afraid of the
     59  * dark magic of the FFT and clinging to the familiar, friendly, laggard time
     60  * domain.
     61  * </p>
     62  * 
     63  * @author Phillip McLeod
     64  * @author Joren Six
     65  */
     66 public final class McLeodPitchMethod implements PitchDetector {
     67 
     68 	/**
     69 	 * The expected size of an audio buffer (in samples).
     70 	 */
     71 	public static final int DEFAULT_BUFFER_SIZE = 1024;
     72 
     73 	/**
     74 	 * Overlap defines how much two audio buffers following each other should
     75 	 * overlap (in samples). 75% overlap is advised in the MPM article.
     76 	 */
     77 	public static final int DEFAULT_OVERLAP = 768;
     78 
     79 	/**
     80 	 * Defines the relative size the chosen peak (pitch) has. 0.93 means: choose
     81 	 * the first peak that is higher than 93% of the highest peak detected. 93%
     82 	 * is the default value used in the Tartini user interface.
     83 	 */
     84 	private static final double DEFAULT_CUTOFF = 0.97;
     85 	/**
     86 	 * For performance reasons, peaks below this cutoff are not even considered.
     87 	 */
     88 	private static final double SMALL_CUTOFF = 0.5;
     89 
     90 	/**
     91 	 * Pitch annotations below this threshold are considered invalid, they are
     92 	 * ignored.
     93 	 */
     94 	private static final double LOWER_PITCH_CUTOFF = 80.0; // Hz
     95 
     96 	/**
     97 	 * Defines the relative size the chosen peak (pitch) has.
     98 	 */
     99 	private final double cutoff;
    100 
    101 	/**
    102 	 * The audio sample rate. Most audio has a sample rate of 44.1kHz.
    103 	 */
    104 	private final float sampleRate;
    105 
    106 	/**
    107 	 * Contains a normalized square difference function value for each delay
    108 	 * (tau).
    109 	 */
    110 	private final float[] nsdf;
    111 
    112 	/**
    113 	 * The x and y coordinate of the top of the curve (nsdf).
    114 	 */
    115 	private float turningPointX, turningPointY;
    116 
    117 	/**
    118 	 * A list with minimum and maximum values of the nsdf curve.
    119 	 */
    120 	private final List<Integer> maxPositions = new ArrayList<Integer>();
    121 
    122 	/**
    123 	 * A list of estimates of the period of the signal (in samples).
    124 	 */
    125 	private final List<Float> periodEstimates = new ArrayList<Float>();
    126 
    127 	/**
    128 	 * A list of estimates of the amplitudes corresponding with the period
    129 	 * estimates.
    130 	 */
    131 	private final List<Float> ampEstimates = new ArrayList<Float>();
    132 
    133 	/**
    134 	 * The result of the pitch detection iteration.
    135 	 */
    136 	private final PitchDetectionResult result;
    137 
    138 	/**
    139 	 * Initializes the normalized square difference value array and stores the
    140 	 * sample rate.
    141 	 * 
    142 	 * @param audioSampleRate
    143 	 *            The sample rate of the audio to check.
    144 	 */
    145 	public McLeodPitchMethod(final float audioSampleRate) {
    146 		this(audioSampleRate, DEFAULT_BUFFER_SIZE, DEFAULT_CUTOFF);
    147 	}
    148 
    149 	/**
    150 	 * Create a new pitch detector.
    151 	 * 
    152 	 * @param audioSampleRate
    153 	 *            The sample rate of the audio.
    154 	 * @param audioBufferSize
    155 	 *            The size of one audio buffer 1024 samples is common.
    156 	 */
    157 	public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize) {
    158 		this(audioSampleRate, audioBufferSize, DEFAULT_CUTOFF);
    159 	}
    160 
    161 	/**
    162 	 * Create a new pitch detector.
    163 	 * 
    164 	 * @param audioSampleRate
    165 	 *            The sample rate of the audio.
    166 	 * @param audioBufferSize
    167 	 *            The size of one audio buffer 1024 samples is common.
    168 	 * @param cutoffMPM
    169 	 *            The cutoff (similar to the YIN threshold). In the Tartini
    170 	 *            paper 0.93 is used.
    171 	 */
    172 	public McLeodPitchMethod(final float audioSampleRate, final int audioBufferSize, final double cutoffMPM) {
    173 		this.sampleRate = audioSampleRate;
    174 		nsdf = new float[audioBufferSize];
    175 		this.cutoff = cutoffMPM;
    176 		result = new PitchDetectionResult();
    177 	}
    178 
    179 	/**
    180 	 * Implements the normalized square difference function. See section 4 (and
    181 	 * the explanation before) in the MPM article. This calculation can be
    182 	 * optimized by using an FFT. The results should remain the same.
    183 	 * 
    184 	 * @param audioBuffer
    185 	 *            The buffer with audio information.
    186 	 */
    187 	private void normalizedSquareDifference(final float[] audioBuffer) {
    188 		for (int tau = 0; tau < audioBuffer.length; tau++) {
    189 			float acf = 0;
    190 			float divisorM = 0;
    191 			for (int i = 0; i < audioBuffer.length - tau; i++) {
    192 				acf += audioBuffer[i] * audioBuffer[i + tau];
    193 				divisorM += audioBuffer[i] * audioBuffer[i] + audioBuffer[i + tau] * audioBuffer[i + tau];
    194 			}
    195 			nsdf[tau] = 2 * acf / divisorM;
    196 		}
    197 	}
    198 
    199 	/*
    200 	 * (non-Javadoc)
    201 	 * 
    202 	 * @see be.tarsos.pitch.pure.PurePitchDetector#getPitch(float[])
    203 	 */
    204 	public PitchDetectionResult getPitch(final float[] audioBuffer) {
    205 		final float pitch;
    206 
    207 		// 0. Clear previous results (Is this faster than initializing a list
    208 		// again and again?)
    209 		maxPositions.clear();
    210 		periodEstimates.clear();
    211 		ampEstimates.clear();
    212 
    213 		// 1. Calculate the normalized square difference for each Tau value.
    214 		normalizedSquareDifference(audioBuffer);
    215 		// 2. Peak picking time: time to pick some peaks.
    216 		peakPicking();
    217 
    218 		double highestAmplitude = Double.NEGATIVE_INFINITY;
    219 
    220 		for (final Integer tau : maxPositions) {
    221 			// make sure every annotation has a probability attached
    222 			highestAmplitude = Math.max(highestAmplitude, nsdf[tau]);
    223 
    224 			if (nsdf[tau] > SMALL_CUTOFF) {
    225 				// calculates turningPointX and Y
    226 				parabolicInterpolation(tau);
    227 				// store the turning points
    228 				ampEstimates.add(turningPointY);
    229 				periodEstimates.add(turningPointX);
    230 				// remember the highest amplitude
    231 				highestAmplitude = Math.max(highestAmplitude, turningPointY);
    232 			}
    233 		}
    234 
    235 		if (periodEstimates.isEmpty()) {
    236 			pitch = -1;
    237 		} else {
    238 			// use the overall maximum to calculate a cutoff.
    239 			// The cutoff value is based on the highest value and a relative
    240 			// threshold.
    241 			final double actualCutoff = cutoff * highestAmplitude;
    242 
    243 			// find first period above or equal to cutoff
    244 			int periodIndex = 0;
    245 			for (int i = 0; i < ampEstimates.size(); i++) {
    246 				if (ampEstimates.get(i) >= actualCutoff) {
    247 					periodIndex = i;
    248 					break;
    249 				}
    250 			}
    251 
    252 			final double period = periodEstimates.get(periodIndex);
    253 			final float pitchEstimate = (float) (sampleRate / period);
    254 			if (pitchEstimate > LOWER_PITCH_CUTOFF) {
    255 				pitch = pitchEstimate;
    256 			} else {
    257 				pitch = -1;
    258 			}
    259 
    260 		}
    261 		result.setProbability((float) highestAmplitude);
    262 		result.setPitch(pitch);
    263 		result.setPitched(pitch != -1);
    264 		
    265 		return result;
    266 	}
    267 
    268 	/**
    269 	 * <p>
    270 	 * Finds the x value corresponding with the peak of a parabola.
    271 	 * </p>
    272 	 * <p>
    273 	 * a,b,c are three samples that follow each other. E.g. a is at 511, b at
    274 	 * 512 and c at 513; f(a), f(b) and f(c) are the normalized square
    275 	 * difference values for those samples; x is the peak of the parabola and is
    276 	 * what we are looking for. Because the samples follow each other
    277 	 * <code>b - a = 1</code> the formula for <a
    278 	 * href="http://fizyka.umk.pl/nrbook/c10-2.pdf">parabolic interpolation</a>
    279 	 * can be simplified a lot.
    280 	 * </p>
    281 	 * <p>
    282 	 * The following ASCII ART shows it a bit more clear, imagine this to be a
    283 	 * bit more curvaceous.
    284 	 * </p>
    285 	 * 
    286 	 * <pre>
    287 	 *     nsdf(x)
    288 	 *       ^
    289 	 *       |
    290 	 * f(x)  |------ ^
    291 	 * f(b)  |     / |\
    292 	 * f(a)  |    /  | \
    293 	 *       |   /   |  \
    294 	 *       |  /    |   \
    295 	 * f(c)  | /     |    \
    296 	 *       |_____________________> x
    297 	 *            a  x b  c
    298 	 * </pre>
    299 	 * 
    300 	 * @param tau
    301 	 *            The delay tau, b value in the drawing is the tau value.
    302 	 */
    303 	private void parabolicInterpolation(final int tau) {
    304 		final float nsdfa = nsdf[tau - 1];
    305 		final float nsdfb = nsdf[tau];
    306 		final float nsdfc = nsdf[tau + 1];
    307 		final float bValue = tau;
    308 		final float bottom = nsdfc + nsdfa - 2 * nsdfb;
    309 		if (bottom == 0.0) {
    310 			turningPointX = bValue;
    311 			turningPointY = nsdfb;
    312 		} else {
    313 			final float delta = nsdfa - nsdfc;
    314 			turningPointX = bValue + delta / (2 * bottom);
    315 			turningPointY = nsdfb - delta * delta / (8 * bottom);
    316 		}
    317 	}
    318 
    319 	/**
    320 	 * <p>
    321 	 * Implementation based on the GPL'ED code of <a
    322 	 * href="http://tartini.net">Tartini</a> This code can be found in the file
    323 	 * <code>general/mytransforms.cpp</code>.
    324 	 * </p>
    325 	 * <p>
    326 	 * Finds the highest value between each pair of positive zero crossings.
    327 	 * Including the highest value between the last positive zero crossing and
    328 	 * the end (if any). Ignoring the first maximum (which is at zero). In this
    329 	 * diagram the desired values are marked with a +
    330 	 * </p>
    331 	 * 
    332 	 * <pre>
    333 	 *  f(x)
    334 	 *   ^
    335 	 *   |
    336 	 *  1|               +
    337 	 *   | \      +     /\      +     /\
    338 	 *  0| _\____/\____/__\/\__/\____/_______> x
    339 	 *   |   \  /  \  /      \/  \  /
    340 	 * -1|    \/    \/            \/
    341 	 *   |
    342 	 * </pre>
    343 	 * 
    344 	 * @param nsdf
    345 	 *            The array to look for maximum values in. It should contain
    346 	 *            values between -1 and 1
    347 	 * @author Phillip McLeod
    348 	 */
    349 	private void peakPicking() {
    350 
    351 		int pos = 0;
    352 		int curMaxPos = 0;
    353 
    354 		// find the first negative zero crossing
    355 		while (pos < (nsdf.length - 1) / 3 && nsdf[pos] > 0) {
    356 			pos++;
    357 		}
    358 
    359 		// loop over all the values below zero
    360 		while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0) {
    361 			pos++;
    362 		}
    363 
    364 		// can happen if output[0] is NAN
    365 		if (pos == 0) {
    366 			pos = 1;
    367 		}
    368 
    369 		while (pos < nsdf.length - 1) {
    370 			assert nsdf[pos] >= 0;
    371 			if (nsdf[pos] > nsdf[pos - 1] && nsdf[pos] >= nsdf[pos + 1]) {
    372 				if (curMaxPos == 0) {
    373 					// the first max (between zero crossings)
    374 					curMaxPos = pos;
    375 				} else if (nsdf[pos] > nsdf[curMaxPos]) {
    376 					// a higher max (between the zero crossings)
    377 					curMaxPos = pos;
    378 				}
    379 			}
    380 			pos++;
    381 			// a negative zero crossing
    382 			if (pos < nsdf.length - 1 && nsdf[pos] <= 0) {
    383 				// if there was a maximum add it to the list of maxima
    384 				if (curMaxPos > 0) {
    385 					maxPositions.add(curMaxPos);
    386 					curMaxPos = 0; // clear the maximum position, so we start
    387 					// looking for a new ones
    388 				}
    389 				while (pos < nsdf.length - 1 && nsdf[pos] <= 0.0f) {
    390 					pos++; // loop over all the values below zero
    391 				}
    392 			}
    393 		}
    394 		if (curMaxPos > 0) { // if there was a maximum in the last part
    395 			maxPositions.add(curMaxPos); // add it to the vector of maxima
    396 		}
    397 	}
    398 }