WaveformSimilarityBasedOverlapAdd.java (13307B)
1 /* 2 * _______ _____ _____ _____ 3 * |__ __| | __ \ / ____| __ \ 4 * | | __ _ _ __ ___ ___ ___| | | | (___ | |__) | 5 * | |/ _` | '__/ __|/ _ \/ __| | | |\___ \| ___/ 6 * | | (_| | | \__ \ (_) \__ \ |__| |____) | | 7 * |_|\__,_|_| |___/\___/|___/_____/|_____/|_| 8 * 9 * ------------------------------------------------------------- 10 * 11 * TarsosDSP is developed by Joren Six at IPEM, University Ghent 12 * 13 * ------------------------------------------------------------- 14 * 15 * Info: http://0110.be/tag/TarsosDSP 16 * Github: https://github.com/JorenSix/TarsosDSP 17 * Releases: http://0110.be/releases/TarsosDSP/ 18 * 19 * TarsosDSP includes modified source code by various authors, 20 * for credits and info, see README. 21 * 22 */ 23 24 25 package be.tarsos.dsp; 26 27 28 /** 29 * 30 * <p> 31 * An overlap-add technique based on waveform similarity (WSOLA) for high 32 * quality time-scale modification of speech 33 * </p> 34 * <p> 35 * A concept of waveform similarity for tackling the problem of time-scale 36 * modification of speech is proposed. It is worked out in the context of 37 * short-time Fourier transform representations. The resulting WSOLA 38 * (waveform-similarity-based synchronized overlap-add) algorithm produces 39 * high-quality speech output, is algorithmically and computationally efficient 40 * and robust, and allows for online processing with arbitrary time-scaling 41 * factors that may be specified in a time-varying fashion and can be chosen 42 * over a wide continuous range of values. 43 * </p> 44 * <p> 45 * Inspired by the work soundtouch by Olli Parviainen, 46 * http://www.surina.net/soundtouch, especially the TDStrech.cpp file. 47 * </p> 48 * @author Joren Six 49 * @author Olli Parviainen 50 */ 51 public class WaveformSimilarityBasedOverlapAdd implements AudioProcessor { 52 private int seekWindowLength; 53 private int seekLength; 54 private int overlapLength; 55 56 private float[] pMidBuffer; 57 private float[] pRefMidBuffer; 58 private float[] outputFloatBuffer; 59 60 private int intskip; 61 private int sampleReq; 62 63 private double tempo; 64 65 private AudioDispatcher dispatcher; 66 67 private Parameters newParameters; 68 69 /** 70 * Create a new instance based on algorithm parameters for a certain audio format. 71 * @param params The parameters for the algorithm. 72 */ 73 public WaveformSimilarityBasedOverlapAdd(Parameters params){ 74 setParameters(params); 75 applyNewParameters(); 76 } 77 78 public void setParameters(Parameters params){ 79 newParameters = params; 80 } 81 82 public void setDispatcher(AudioDispatcher newDispatcher){ 83 this.dispatcher = newDispatcher; 84 } 85 86 private void applyNewParameters(){ 87 Parameters params = newParameters; 88 int oldOverlapLength = overlapLength; 89 overlapLength = (int) ((params.getSampleRate() * params.getOverlapMs())/1000); 90 seekWindowLength = (int) ((params.getSampleRate() * params.getSequenceMs())/1000); 91 seekLength = (int) ((params.getSampleRate() * params.getSeekWindowMs())/1000); 92 93 tempo = params.getTempo(); 94 95 //pMidBuffer and pRefBuffer are initialized with 8 times the needed length to prevent a reset 96 //of the arrays when overlapLength changes. 97 98 if(overlapLength > oldOverlapLength * 8 && pMidBuffer==null){ 99 pMidBuffer = new float[overlapLength * 8]; //overlapLengthx2? 100 pRefMidBuffer = new float[overlapLength * 8];//overlapLengthx2? 101 System.out.println("New overlapLength" + overlapLength); 102 } 103 104 double nominalSkip = tempo * (seekWindowLength - overlapLength); 105 intskip = (int) (nominalSkip + 0.5); 106 107 sampleReq = Math.max(intskip + overlapLength, seekWindowLength) + seekLength; 108 109 float[] prevOutputBuffer = outputFloatBuffer; 110 outputFloatBuffer = new float[getOutputBufferSize()]; 111 if(prevOutputBuffer!=null){ 112 System.out.println("Copy outputFloatBuffer contents"); 113 for(int i = 0 ; i < prevOutputBuffer.length && i < outputFloatBuffer.length ; i++){ 114 outputFloatBuffer[i] = prevOutputBuffer[i]; 115 } 116 } 117 118 newParameters = null; 119 } 120 121 public int getInputBufferSize(){ 122 return sampleReq; 123 } 124 125 private int getOutputBufferSize(){ 126 return seekWindowLength - overlapLength; 127 } 128 129 public int getOverlap(){ 130 return sampleReq-intskip; 131 } 132 133 134 /** 135 * Overlaps the sample in output with the samples in input. 136 * @param output The output buffer. 137 * @param input The input buffer. 138 */ 139 private void overlap(final float[] output, int outputOffset, float[] input,int inputOffset){ 140 for(int i = 0 ; i < overlapLength ; i++){ 141 int itemp = overlapLength - i; 142 output[i + outputOffset] = (input[i + inputOffset] * i + pMidBuffer[i] * itemp ) / overlapLength; 143 } 144 } 145 146 147 /** 148 * Seeks for the optimal overlap-mixing position. 149 * 150 * The best position is determined as the position where the two overlapped 151 * sample sequences are 'most alike', in terms of the highest 152 * cross-correlation value over the overlapping period 153 * 154 * @param inputBuffer The input buffer 155 * @param postion The position where to start the seek operation, in the input buffer. 156 * @return The best position. 157 */ 158 private int seekBestOverlapPosition(float[] inputBuffer, int postion) { 159 int bestOffset; 160 double bestCorrelation, currentCorrelation; 161 int tempOffset; 162 163 int comparePosition; 164 165 // Slopes the amplitude of the 'midBuffer' samples 166 precalcCorrReferenceMono(); 167 168 bestCorrelation = -10; 169 bestOffset = 0; 170 171 // Scans for the best correlation value by testing each possible 172 // position 173 // over the permitted range. 174 for (tempOffset = 0; tempOffset < seekLength; tempOffset++) { 175 176 comparePosition = postion + tempOffset; 177 178 // Calculates correlation value for the mixing position 179 // corresponding 180 // to 'tempOffset' 181 currentCorrelation = (double) calcCrossCorr(pRefMidBuffer, inputBuffer,comparePosition); 182 // heuristic rule to slightly favor values close to mid of the 183 // range 184 double tmp = (double) (2 * tempOffset - seekLength) / seekLength; 185 currentCorrelation = ((currentCorrelation + 0.1) * (1.0 - 0.25 * tmp * tmp)); 186 187 // Checks for the highest correlation value 188 if (currentCorrelation > bestCorrelation) { 189 bestCorrelation = currentCorrelation; 190 bestOffset = tempOffset; 191 } 192 } 193 194 return bestOffset; 195 196 } 197 198 /** 199 * Slopes the amplitude of the 'midBuffer' samples so that cross correlation 200 * is faster to calculate. Why is this faster? 201 */ 202 void precalcCorrReferenceMono() 203 { 204 for (int i = 0; i < overlapLength; i++){ 205 float temp = i * (overlapLength - i); 206 pRefMidBuffer[i] = pMidBuffer[i] * temp; 207 } 208 } 209 210 211 double calcCrossCorr(float[] mixingPos, float[] compare, int offset){ 212 double corr = 0; 213 double norm = 0; 214 for (int i = 1; i < overlapLength; i ++){ 215 corr += mixingPos[i] * compare[i + offset]; 216 norm += mixingPos[i] * mixingPos[i]; 217 } 218 // To avoid division by zero. 219 if (norm < 1e-8){ 220 norm = 1.0; 221 } 222 return corr / Math.pow(norm,0.5); 223 } 224 225 226 @Override 227 public boolean process(AudioEvent audioEvent) { 228 float[] audioFloatBuffer = audioEvent.getFloatBuffer(); 229 assert audioFloatBuffer.length == getInputBufferSize(); 230 231 //Search for the best overlapping position. 232 int offset = seekBestOverlapPosition(audioFloatBuffer,0); 233 234 // Mix the samples in the 'inputBuffer' at position of 'offset' with the 235 // samples in 'midBuffer' using sliding overlapping 236 // ... first partially overlap with the end of the previous sequence 237 // (that's in 'midBuffer') 238 overlap(outputFloatBuffer,0,audioFloatBuffer,offset); 239 240 //copy sequence samples from input to output 241 int sequenceLength = seekWindowLength - 2 * overlapLength; 242 System.arraycopy(audioFloatBuffer, offset + overlapLength, outputFloatBuffer, overlapLength, sequenceLength); 243 244 // Copies the end of the current sequence from 'inputBuffer' to 245 // 'midBuffer' for being mixed with the beginning of the next 246 // processing sequence and so on 247 System.arraycopy(audioFloatBuffer, offset + sequenceLength + overlapLength, pMidBuffer, 0, overlapLength); 248 249 assert outputFloatBuffer.length == getOutputBufferSize(); 250 251 audioEvent.setFloatBuffer(outputFloatBuffer); 252 audioEvent.setOverlap(0); 253 254 if(newParameters!=null){ 255 applyNewParameters(); 256 dispatcher.setStepSizeAndOverlap(getInputBufferSize(),getOverlap()); 257 } 258 259 return true; 260 } 261 262 @Override 263 public void processingFinished() { 264 // NOOP 265 } 266 267 268 269 /** 270 * An object to encapsulate some of the parameters for 271 * WSOLA, together with a couple of practical helper functions. 272 * 273 * @author Joren Six 274 */ 275 public static class Parameters { 276 private final int sequenceMs; 277 private final int seekWindowMs; 278 private final int overlapMs; 279 280 private final double tempo; 281 private final double sampleRate; 282 283 /** 284 * @param tempo 285 * The tempo change 1.0 means unchanged, 2.0 is + 100% , 0.5 286 * is half of the speed. 287 * @param sampleRate 288 * The sample rate of the audio 44.1kHz is common. 289 * @param newSequenceMs 290 * Length of a single processing sequence, in milliseconds. 291 * This determines to how long sequences the original sound 292 * is chopped in the time-stretch algorithm. 293 * 294 * The larger this value is, the lesser sequences are used in 295 * processing. In principle a bigger value sounds better when 296 * slowing down tempo, but worse when increasing tempo and 297 * vice versa. 298 * 299 * Increasing this value reduces computational burden & vice 300 * versa. 301 * @param newSeekWindowMs 302 * Seeking window length in milliseconds for algorithm that 303 * finds the best possible overlapping location. This 304 * determines from how wide window the algorithm may look for 305 * an optimal joining location when mixing the sound 306 * sequences back together. 307 * 308 * The bigger this window setting is, the higher the 309 * possibility to find a better mixing position will become, 310 * but at the same time large values may cause a "drifting" 311 * artifact because consequent sequences will be taken at 312 * more uneven intervals. 313 * 314 * If there's a disturbing artifact that sounds as if a 315 * constant frequency was drifting around, try reducing this 316 * setting. 317 * 318 * Increasing this value increases computational burden & 319 * vice versa. 320 * @param newOverlapMs 321 * Overlap length in milliseconds. When the chopped sound 322 * sequences are mixed back together, to form a continuous 323 * sound stream, this parameter defines over how long period 324 * the two consecutive sequences are let to overlap each 325 * other. 326 * 327 * This shouldn't be that critical parameter. If you reduce 328 * the DEFAULT_SEQUENCE_MS setting by a large amount, you 329 * might wish to try a smaller value on this. 330 * 331 * Increasing this value increases computational burden & 332 * vice versa. 333 */ 334 public Parameters(double tempo, double sampleRate, int newSequenceMs, int newSeekWindowMs, int newOverlapMs) { 335 this.tempo = tempo; 336 this.sampleRate = sampleRate; 337 this.overlapMs = newOverlapMs; 338 this.seekWindowMs = newSeekWindowMs; 339 this.sequenceMs = newSequenceMs; 340 } 341 342 public static Parameters speechDefaults(double tempo, double sampleRate){ 343 int sequenceMs = 40; 344 int seekWindowMs = 15; 345 int overlapMs = 12; 346 return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); 347 } 348 349 public static Parameters musicDefaults(double tempo, double sampleRate){ 350 int sequenceMs = 82; 351 int seekWindowMs = 28; 352 int overlapMs = 12; 353 return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); 354 } 355 356 public static Parameters slowdownDefaults(double tempo, double sampleRate){ 357 int sequenceMs = 100; 358 int seekWindowMs = 35; 359 int overlapMs = 20; 360 return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); 361 } 362 363 public static Parameters automaticDefaults(double tempo, double sampleRate){ 364 double tempoLow = 0.5; // -50% speed 365 double tempoHigh = 2.0; // +100% speed 366 367 double sequenceMsLow = 125; //ms 368 double sequenceMsHigh = 50; //ms 369 double sequenceK = ((sequenceMsHigh - sequenceMsLow) / (tempoHigh - tempoLow)); 370 double sequenceC = sequenceMsLow - sequenceK * tempoLow; 371 372 double seekLow = 25;// ms 373 double seekHigh = 15;// ms 374 double seekK =((seekHigh - seekLow) / (tempoHigh-tempoLow)); 375 double seekC = seekLow - seekK * seekLow; 376 377 int sequenceMs = (int) (sequenceC + sequenceK * tempo + 0.5); 378 int seekWindowMs = (int) (seekC + seekK * tempo + 0.5); 379 int overlapMs = 12; 380 return new Parameters(tempo,sampleRate,sequenceMs, seekWindowMs,overlapMs); 381 } 382 383 public double getOverlapMs() { 384 return overlapMs; 385 } 386 387 public double getSequenceMs() { 388 return sequenceMs; 389 } 390 391 public double getSeekWindowMs() { 392 return seekWindowMs; 393 } 394 395 public double getSampleRate() { 396 return sampleRate; 397 } 398 399 public double getTempo(){ 400 return tempo; 401 } 402 } 403 }