From 049ee0235e7f1450484d0a009602743e8fda8091 Mon Sep 17 00:00:00 2001
From: Mikael Finstad <finstaden@gmail.com>
Date: Thu, 29 Oct 2020 00:11:27 +0100
Subject: [PATCH] Audio improvements #62 #30

- Implement audio normalization #30
- Implement ducking
- Implement arbitrary audio tracks with offset #86 #62 #10
---
 README.md                       | 102 ++++++++++----
 audio.js                        | 239 +++++++++++++++++++++-----------
 examples/audio-transition.json5 |  24 +++-
 examples/audio2.json5           |  26 ++--
 examples/audio3.json5           |  15 ++
 examples/audioLoop.json5        |   1 +
 index.js                        |  34 ++---
 parseConfig.js                  |  89 +++++++++++-
 8 files changed, 380 insertions(+), 150 deletions(-)
 create mode 100644 examples/audio3.json5

diff --git a/README.md b/README.md
index 04c7b2a..c94826c 100644
--- a/README.md
+++ b/README.md
@@ -23,11 +23,12 @@ Inspired by [ffmpeg-concat](https://github.com/transitive-bullshit/ffmpeg-concat
 - Accepts custom HTML5 Canvas / Fabric.js JavaScript code for custom screens or dynamic overlays
 - Render custom GL shaders (for example from [shadertoy](https://www.shadertoy.com/))
 - Can output GIF
-- Preserve audio sources or mix multiple
 - Overlay transparent images or even videos with alpha channel
 - Show different sub-clips for parts of a clips duration (B-roll)
-- Automatic audio crossfading
 - Picture-in-picture
+- Preserve/mix multiple audio sources
+- Automatic audio crossfading
+- Automatic audio ducking and normalization
 
 ## Use cases
 
@@ -35,20 +36,19 @@ Inspired by [ffmpeg-concat](https://github.com/transitive-bullshit/ffmpeg-concat
 - Create a fast-paced trailer or promo video
 - Create a tutorial video with help text
 - Create news stories
-- Simply convert a video to a GIF
+- Create an animated GIF
 - Resize video to any size or framerate and with automatic letterboxing/cropping (e.g. if you need to upload a video somewhere but the site complains `Video must be 1337x1000 30fps`)
+- Create a podcast with multiple mixed tracks
 
 See [examples](https://github.com/mifi/editly/tree/master/examples)
 
 ## Requirements
 
-- [Node.js installed](https://nodejs.org/en/) (Use of the latest stable version is recommended)
-- Should work on Windows, MacOS and Linux. Needs at least Node.js v12.16.2 on MacOS ([see issue](https://github.com/sindresorhus/meow/issues/144)).
-  See also: https://github.com/stackgl/headless-gl#system-dependencies
+- Windows, MacOS or Linux
+- [Node.js installed](https://nodejs.org/en/) (Use of the latest LTS version is recommended, [v12.16.2 or newer on MacOS](https://github.com/sindresorhus/meow/issues/144).)
+- `ffmpeg` (and `ffprobe`) [installed](http://ffmpeg.org/) and available in `PATH`
 - (Linux) may require some extra steps. See [headless-gl](https://github.com/stackgl/headless-gl#system-dependencies).
 
-Make sure you have `ffmpeg` and `ffprobe` installed and available in `PATH`
-
 ## Installing
 
 `npm i -g editly`
@@ -72,21 +72,18 @@ editly \
   --audio-file-path /path/to/music.mp3
 ```
 
-Or create an MP4 (or GIF) from a JSON or JSON5 edit spec *(JSON5 is just a more friendly JSON format)*:
+Or create an MP4 (or GIF) from a JSON or JSON5 edit spec *(JSON5 is just a more user-friendly JSON format)*:
 
 ```sh
-editly my-editly.json5 --fast --out output.gif
+editly my-spec.json5 --fast --keep-source-audio --out output.gif
 ```
 
-For examples of how to make a JSON edit spec, see below or https://github.com/mifi/editly/tree/master/examples
-
-When you run with `--fast` or `fast: true`, it will render a much quicker low-resolution preview ⏩
-
-Without `--fast` it will default to using the **width**, **height** and **frame rate** from the **first** input video. **All other clips will be converted to these dimensions.** You can of course override any or all of these parameters.
+For examples of how to make a JSON edit spec, see below or [examples](https://github.com/mifi/editly/tree/master/examples).
 
-**TIP:** Use this tool in conjunction with [LosslessCut](https://github.com/mifi/lossless-cut)
+Without `--fast`, it will default to using the **width**, **height** and **frame rate** from the **first** input video. **All other clips will be converted to these dimensions.** You can of course override any or all of these parameters.
 
-**TIP:** If you need catchy music for your video, have a look at [this YouTube](https://www.youtube.com/channel/UCht8qITGkBvXKsR1Byln-wA) or the [YouTube audio library](https://www.youtube.com/audiolibrary/music?nv=1). Then use [youtube-dl](https://github.com/ytdl-org/youtube-dl) to download the video, and then point `--audio-file-path` at the video file. *Be sure to respect their license!*
+- **TIP:** Use this tool in conjunction with [LosslessCut](https://github.com/mifi/lossless-cut)
+- **TIP:** If you need catchy music for your video, have a look at [this YouTube](https://www.youtube.com/channel/UCht8qITGkBvXKsR1Byln-wA) or the [YouTube audio library](https://www.youtube.com/audiolibrary/music?nv=1). Then use [youtube-dl](https://github.com/ytdl-org/youtube-dl) to download the video, and then point `--audio-file-path` at the video file. *Be sure to respect their license!*
 
 ## JavaScript library
 
@@ -108,6 +105,7 @@ Edit specs are JavaScript / JSON objects describing the whole edit operation wit
   width,
   height,
   fps,
+  allowRemoteRequests: false,
   defaults: {
     duration: 4,
     transition: {
@@ -127,10 +125,6 @@ Edit specs are JavaScript / JSON objects describing the whole edit operation wit
       // ...more per-layer-type defaults
     },
   },
-  audioFilePath,
-  loopAudio: false,
-  keepSourceAudio: false,
-  allowRemoteRequests: false,
   clips: [
     {
       transition,
@@ -145,6 +139,25 @@ Edit specs are JavaScript / JSON objects describing the whole edit operation wit
     }
     // ...more clips
   ],
+  audioFilePath,
+  loopAudio: false,
+  keepSourceAudio: false,
+  clipsAudioVolume: 1,
+  audioTracks: [
+    {
+      path,
+      mixVolume: 1,
+      cutFrom: 0,
+      cutTo,
+      start: 0,
+    },
+    // ...more audio tracks
+  ],
+  audioNorm: {
+    enable: false,
+    gaussSize: 5,
+    maxGain: 30,
+  },
 
   // Testing options:
   enableFfmpegLog: false,
@@ -161,11 +174,8 @@ Edit specs are JavaScript / JSON objects describing the whole edit operation wit
 | `width` | `--width` | Width which all media will be converted to | `640` | |
 | `height` | `--height` | Height which all media will be converted to | auto based on `width` and aspect ratio of **first video** | |
 | `fps` | `--fps` | FPS which all videos will be converted to | First video FPS or `25` | |
-| `audioFilePath` | `--audio-file-path` | Set an audio track for the whole video | | |
-| `loopAudio` | `--loop-audio` | Loop the audio track if it is shorter than video? | `false` | |
-| `keepSourceAudio` | `--keep-source-audio` | Keep audio from source files | `false` | |
 | `allowRemoteRequests` | `--allow-remote-requests` | Allow remote URLs as paths | `false` | |
-| `fast` | `--fast`, `-f` | Fast mode (low resolution and FPS, useful for getting a quick preview) | `false` | |
+| `fast` | `--fast`, `-f` | Fast mode (low resolution and FPS, useful for getting a quick preview ⏩) | `false` | |
 | `defaults.layer.fontPath` | `--font-path` | Set default font to a .ttf | System font | |
 | `defaults.layer.*` | | Set any layer parameter that all layers will inherit | | |
 | `defaults.duration` | `--clip-duration` | Set default clip duration for clips that don't have an own duration | `4` | sec |
@@ -181,6 +191,14 @@ Edit specs are JavaScript / JSON objects describing the whole edit operation wit
 | `clips[].layers[].type` | | Layer type, see below | | |
 | `clips[].layers[].visibleFrom` | | What time into the clip should this layer start | | sec |
 | `clips[].layers[].visibleUntil` | | What time into the clip should this layer stop | | sec |
+| `audioTracks[]` | | List of arbitrary audio tracks. See [audio tracks](#arbitrary-audio-tracks). | `[]` | |
+| `audioFilePath` | `--audio-file-path` | Set an audio track for the whole video. See also [audio tracks](#arbitrary-audio-tracks) | | |
+| `loopAudio` | `--loop-audio` | Loop the audio track if it is shorter than video? | `false` | |
+| `keepSourceAudio` | `--keep-source-audio` | Keep source audio from `clips`? | `false` | |
+| `clipsAudioVolume` | | Volume of audio from `clips` relative to `audioTracks`. See [audio tracks](#arbitrary-audio-tracks). | `1` | |
+| `audioNorm.enable` | | Enable audio normalization? See [audio normalization](#audio-normalization). | `false` | |
+| `audioNorm.gaussSize` | | Audio normalization gauss size. See [audio normalization](#audio-normalization). | `5` | |
+| `audioNorm.maxGain` | | Audio normalization max gain. See [audio normalization](#audio-normalization). | `30` | |
 
 ### Transition types
 
@@ -220,6 +238,14 @@ Audio layers will be mixed together. If `cutFrom`/`cutTo` is set, the resulting
 | `cutTo` | Time value to cut to | `clip.duration` | sec |
 | `mixVolume` | Relative volume when mixing this audio track with others | `1` | |
 
+#### Layer type 'detached-audio'
+
+This is a special case of `audioTracks` that makes it easier to start the audio relative to `clips` start times without having to calculate global start times.
+
+`detached-audio` has the exact same properties as [audioTracks](#arbitrary-audio-tracks), except `start` time is relative to the clip's start.
+
+[Example of detached audio tracks](https://github.com/mifi/editly/blob/master/examples/audio3.json5)
+
 #### Layer type 'image'
 
 Full screen image
@@ -313,6 +339,32 @@ Loads a GLSL shader. See [gl.json5](https://github.com/mifi/editly/blob/master/e
 - `fragmentPath`
 - `vertexPath` (optional)
 
+#### Arbitrary audio tracks
+
+The `audioTracks` property can optionally contain a list of objects which specify audio tracks that can be started at arbitrary times in the final video. These tracks will be mixed (`mixVolume` specifying a relative number for how loud each track is compared to the other tracks). `clipsAudioVolume` specifies the volume of **all** the audio from `clips` relative to the volume of **all** the `audioTracks`.
+
+| Parameter | Description | Default | |
+|-|-|-|-|
+| `audioTracks[].path` | File path for this track | | |
+| `audioTracks[].mixVolume` | Relative volume for this track | `1` | |
+| `audioTracks[].cutFrom` | Time value to cut source file **from** | `0` | sec |
+| `audioTracks[].cutTo` | Time value to cut source file **to** | | sec |
+| `audioTracks[].start` | How many seconds into video to start this audio track | `0` | sec |
+
+The difference between `audioTracks` and **Layer type 'audio'** is that `audioTracks` will continue to play across multiple `clips` and can start and stop whenever needed.
+
+See `audioTracks` [example](https://github.com/mifi/editly/blob/master/examples/audio2.json5)
+
+See also **Layer type 'detached-audio'**.
+
+#### Audio normalization
+
+You can enable audio normalization of the final output audio. This is useful if you want to achieve Audio Ducking (e.g. automatically lower volume of all other tracks when voice-over speaks).
+
+`audioNorm` parameters are [documented here.](https://ffmpeg.org/ffmpeg-filters.html#dynaudnorm)
+
+[Example of audio ducking](https://github.com/mifi/editly/blob/master/examples/audio2.json5)
+
 ### Resize modes
 
 `resizeMode` - How to fit image to screen. Can be one of:
diff --git a/audio.js b/audio.js
index ffd859d..5133729 100644
--- a/audio.js
+++ b/audio.js
@@ -2,44 +2,44 @@ const pMap = require('p-map');
 const { join, basename, resolve } = require('path');
 const execa = require('execa');
 const flatMap = require('lodash/flatMap');
-const fs = require('fs-extra');
 
 const { getFfmpegCommonArgs, getCutFromArgs } = require('./ffmpeg');
 const { readFileStreams } = require('./util');
 
-module.exports = ({ ffmpegPath, ffprobePath, enableFfmpegLog, verbose }) => {
-  async function editAudio({ clips, tmpDir }) {
-    if (clips.length === 0) return undefined;
+module.exports = ({ ffmpegPath, ffprobePath, enableFfmpegLog, verbose, tmpDir }) => {
+  async function createMixedAudioClips({ clips, keepSourceAudio }) {
+    return pMap(clips, async (clip, i) => {
+      const { duration, layers, transition } = clip;
 
-    console.log('Extracting audio or creating silence from all clips');
+      async function runInner() {
+        const clipAudioPath = join(tmpDir, `clip${i}-audio.flac`);
 
-    const mergedAudioPath = join(tmpDir, 'audio-merged.flac');
+        async function createSilence() {
+          if (verbose) console.log('create silence', duration);
+          const args = [
+            '-f', 'lavfi', '-i', 'anullsrc=channel_layout=stereo:sample_rate=44100',
+            '-sample_fmt', 's32',
+            '-ar', '48000',
+            '-t', duration,
+            '-c:a', 'flac',
+            '-y',
+            clipAudioPath,
+          ];
+          await execa(ffmpegPath, args);
 
-    const clipsOut = await pMap(clips, async (clip, i) => {
-      const clipAudioPath = join(tmpDir, `clip${i}-audio.flac`);
+          return { silent: true, clipAudioPath };
+        }
 
-      const { duration, layers, transition } = clip;
+        // Has user enabled keep source audio?
+        if (!keepSourceAudio) return createSilence();
 
-      const audioLayers = layers.filter(({ type, visibleFrom, visibleUntil }) => (
-        ['audio', 'video'].includes(type)
-        // TODO We don't support audio for visibleFrom/visibleUntil layers
-        && !visibleFrom && visibleUntil == null));
+        const audioLayers = layers.filter(({ type, visibleFrom, visibleUntil }) => (
+          ['audio', 'video'].includes(type)
+          // TODO: We don't support audio for visibleFrom/visibleUntil layers
+          && !visibleFrom && visibleUntil == null));
 
-      async function createSilence(outPath) {
-        if (verbose) console.log('create silence', duration);
-        const args = [
-          '-f', 'lavfi', '-i', 'anullsrc=channel_layout=stereo:sample_rate=44100',
-          '-sample_fmt', 's32',
-          '-ar', '48000',
-          '-t', duration,
-          '-c:a', 'flac',
-          '-y',
-          outPath,
-        ];
-        await execa(ffmpegPath, args);
-      }
+        if (audioLayers.length === 0) return createSilence();
 
-      if (audioLayers.length > 0) {
         const processedAudioLayersRaw = await pMap(audioLayers, async (audioLayer, j) => {
           const { path, cutFrom, cutTo, speedFactor } = audioLayer;
 
@@ -77,78 +77,159 @@ module.exports = ({ ffmpegPath, ffprobePath, enableFfmpegLog, verbose }) => {
 
             // console.log(args);
             await execa(ffmpegPath, args);
+
+            return {
+              layerAudioPath,
+              audioLayer,
+            };
           } catch (err) {
             if (verbose) console.error('Cannot extract audio from video', path, err);
             // Fall back to silence
-            await createSilence(layerAudioPath);
+            return undefined;
           }
-
-          return { layerAudioPath, audioLayer };
         }, { concurrency: 4 });
 
         const processedAudioLayers = processedAudioLayersRaw.filter((p) => p);
 
-        if (processedAudioLayers.length > 1) {
-          // Merge/mix all layer's audio
+        if (processedAudioLayers.length < 1) return createSilence();
 
-          const weights = processedAudioLayers.map(({ audioLayer }) => (audioLayer.mixVolume != null ? audioLayer.mixVolume : 1));
-          const args = [
-            ...getFfmpegCommonArgs({ enableFfmpegLog }),
-            ...flatMap(processedAudioLayers, ({ layerAudioPath }) => ['-i', layerAudioPath]),
-            '-filter_complex', `amix=inputs=${processedAudioLayers.length}:duration=longest:weights=${weights.join(' ')}`,
-            '-c:a', 'flac',
-            '-y',
-            clipAudioPath,
-          ];
+        if (processedAudioLayers.length === 1) return { clipAudioPath: processedAudioLayers[0].layerAudioPath };
 
-          await execa(ffmpegPath, args);
-        } else if (processedAudioLayers.length > 0) {
-          await fs.rename(processedAudioLayers[0].layerAudioPath, clipAudioPath);
-        } else {
-          await createSilence(clipAudioPath);
-        }
-      } else {
-        await createSilence(clipAudioPath);
+        // Merge/mix all layer's audio
+        const weights = processedAudioLayers.map(({ audioLayer }) => (audioLayer.mixVolume != null ? audioLayer.mixVolume : 1));
+        const args = [
+          ...getFfmpegCommonArgs({ enableFfmpegLog }),
+          ...flatMap(processedAudioLayers, ({ layerAudioPath }) => ['-i', layerAudioPath]),
+          '-filter_complex', `amix=inputs=${processedAudioLayers.length}:duration=longest:weights=${weights.join(' ')}`,
+          '-c:a', 'flac',
+          '-y',
+          clipAudioPath,
+        ];
+
+        await execa(ffmpegPath, args);
+        return { clipAudioPath };
       }
 
+      const { clipAudioPath, silent } = await runInner();
+
       return {
         path: resolve(clipAudioPath), // https://superuser.com/a/853262/658247
         transition,
+        silent,
       };
     }, { concurrency: 4 });
+  }
 
-    if (clipsOut.length < 2) {
-      await fs.rename(clipsOut[0].path, mergedAudioPath);
-    } else {
-      console.log('Combining audio', clipsOut.map(({ path }) => basename(path)));
-
-      let inStream = '[0:a]';
-      const filterGraph = clipsOut.slice(0, -1).map(({ transition }, i) => {
-        const outStream = `[concat${i}]`;
-
-        const epsilon = 0.0001; // If duration is 0, ffmpeg seems to default to 1 sec instead, hence epsilon.
-        let ret = `${inStream}[${i + 1}:a]acrossfade=d=${Math.max(epsilon, transition.duration)}:c1=${transition.audioOutCurve || 'tri'}:c2=${transition.audioInCurve || 'tri'}`;
-
-        inStream = outStream;
-
-        if (i < clipsOut.length - 2) ret += outStream;
-        return ret;
-      }).join(',');
-
-      const args = [
-        ...getFfmpegCommonArgs({ enableFfmpegLog }),
-        ...(flatMap(clipsOut, ({ path }) => ['-i', path])),
-        '-filter_complex',
-        filterGraph,
-        '-c', 'flac',
-        '-y',
-        mergedAudioPath,
-      ];
-      await execa(ffmpegPath, args);
+  async function mergeFadeClipAudio(clipAudio) {
+    if (clipAudio.length < 2) {
+      return clipAudio[0].path;
     }
 
-    // TODO don't return audio if only silence?
-    return mergedAudioPath;
+    const mergedClipAudioPath = join(tmpDir, 'audio-merged.flac');
+
+    if (verbose) console.log('Combining audio', clipAudio.map(({ path }) => basename(path)));
+
+    let inStream = '[0:a]';
+    const filterGraph = clipAudio.slice(0, -1).map(({ transition }, i) => {
+      const outStream = `[concat${i}]`;
+
+      const epsilon = 0.0001; // If duration is 0, ffmpeg seems to default to 1 sec instead, hence epsilon.
+      let ret = `${inStream}[${i + 1}:a]acrossfade=d=${Math.max(epsilon, transition.duration)}:c1=${transition.audioOutCurve || 'tri'}:c2=${transition.audioInCurve || 'tri'}`;
+
+      inStream = outStream;
+
+      if (i < clipAudio.length - 2) ret += outStream;
+      return ret;
+    }).join(',');
+
+    const args = [
+      ...getFfmpegCommonArgs({ enableFfmpegLog }),
+      ...(flatMap(clipAudio, ({ path }) => ['-i', path])),
+      '-filter_complex',
+      filterGraph,
+      '-c', 'flac',
+      '-y',
+      mergedClipAudioPath,
+    ];
+    await execa(ffmpegPath, args);
+
+    return mergedClipAudioPath;
+  }
+
+  async function mixArbitraryAudio({ streams, audioNorm }) {
+    let maxGain = 30;
+    let gaussSize = 5;
+    if (audioNorm) {
+      if (audioNorm.gaussSize != null) gaussSize = audioNorm.gaussSize;
+      if (audioNorm.maxGain != null) maxGain = audioNorm.maxGain;
+    }
+    const enableAudioNorm = audioNorm && audioNorm.enable;
+
+    // https://stackoverflow.com/questions/35509147/ffmpeg-amix-filter-volume-issue-with-inputs-of-different-duration
+    let filterComplex = streams.map(({ start, cutFrom, cutTo }, i) => {
+      const cutToArg = (cutTo != null ? `:end=${cutTo}` : '');
+      const apadArg = i > 0 ? ',apad' : ''; // Don't pad the first track (audio from video clips with correct duration)
+
+      return `[${i}]atrim=start=${cutFrom || 0}${cutToArg},adelay=delays=${Math.floor((start || 0) * 1000)}:all=1${apadArg}[a${i}]`;
+    }).join(';');
+
+    const audioNormArg = enableAudioNorm ? `,dynaudnorm=g=${gaussSize}:maxgain=${maxGain}` : '';
+    filterComplex += `;${streams.map((s, i) => `[a${i}]`).join('')}amix=inputs=${streams.length}:duration=first:dropout_transition=0:weights=${streams.map((s) => (s.mixVolume != null ? s.mixVolume : 1)).join(' ')}${audioNormArg}`;
+
+    const mixedAudioPath = join(tmpDir, 'audio-mixed.flac');
+
+    const args = [
+      ...getFfmpegCommonArgs({ enableFfmpegLog }),
+      ...(flatMap(streams, ({ path, loop }) => ([
+        '-stream_loop', (loop || 0),
+        '-i', path,
+      ]))),
+      '-filter_complex', filterComplex,
+      '-c:a', 'flac',
+      '-y',
+      mixedAudioPath,
+    ];
+
+    if (verbose) console.log(args.join(' '));
+
+    await execa(ffmpegPath, args);
+
+    return mixedAudioPath;
+  }
+
+
+  async function editAudio({ keepSourceAudio, clips, arbitraryAudio, clipsAudioVolume, audioNorm }) {
+    // We need clips to process audio, because we need to know duration
+    if (clips.length === 0) return undefined;
+
+    // No need to process audio if none of these are satisfied
+    if (!(keepSourceAudio || arbitraryAudio.length > 0)) return undefined;
+
+    console.log('Extracting audio/silence from all clips');
+
+    // Mix audio from each clip as separate files (or silent audio of appropriate length for clips with no audio)
+    const clipAudio = await createMixedAudioClips({ clips, keepSourceAudio });
+
+    // Return no audio if only silent clips and no arbitrary audio
+    if (clipAudio.every((ca) => ca.silent) && arbitraryAudio.length === 0) return undefined;
+
+    // Merge & fade the clip audio files
+    const mergedClipAudioPath = await mergeFadeClipAudio(clipAudio);
+
+    const streams = [
+      // The first stream is required, and it determines the length of the output audio.
+      // All other streams will be truncated to this length
+      { path: mergedClipAudioPath, mixVolume: clipsAudioVolume },
+
+      ...arbitraryAudio,
+    ];
+
+    console.log('Mixing clip audio with arbitrary audio');
+
+    if (streams.length < 2) return mergedClipAudioPath;
+
+    const mixedFile = await mixArbitraryAudio({ streams, audioNorm });
+    return mixedFile;
   }
 
   return {
diff --git a/examples/audio-transition.json5 b/examples/audio-transition.json5
index 01fd8f9..b696de7 100644
--- a/examples/audio-transition.json5
+++ b/examples/audio-transition.json5
@@ -8,27 +8,39 @@
   },
   clips: [
     { layers: [
-      { type: 'title-background', text: 'Clip 1' },
+      { type: 'title-background', text: 'Default transition' },
       { type: 'audio', path: './assets/sample1.m4a' }
     ] },
     { transition: { duration: 0.2 }, layers: [
-      { type: 'title-background', text: 'Clip 2' },
+      { type: 'title-background', text: 'Fast transition' },
       { type: 'audio', path: './assets/sample2.m4a' }
     ] },
     { transition: { duration: 0 }, layers: [
-      { type: 'title-background', text: 'Clip 3' },
+      { type: 'title-background', text: 'No transition' },
       { type: 'audio', path: './assets/sample1.m4a' }
     ] },
     { transition: { audioInCurve: 'exp', audioOutCurve: 'exp' }, layers: [
-      { type: 'title-background', text: 'Clip 4' },
+      { type: 'title-background', text: 'Exp curve' },
       { type: 'audio', path: './assets/sample2.m4a' }
     ] },
     { transition: { name: 'dummy' }, layers: [
-      { type: 'title-background', text: 'Clip 5' },
+      { type: 'title-background', text: 'Dummy' },
       { type: 'audio', path: './assets/sample1.m4a' }
     ] },
+    { transition: { duration: 2 }, layers: [
+      { type: 'title-background', text: 'Too short' },
+      { type: 'audio', path: './assets/sample2.m4a' }
+    ] },
+    { duration: 1, transition: { duration: 2 }, layers: [
+      { type: 'title-background', text: 'Too short' },
+      { type: 'audio', path: './assets/sample2.m4a' }
+    ] },
+    { duration: 1, transition: { duration: 2 }, layers: [
+      { type: 'title-background', text: 'Too short' },
+      { type: 'audio', path: './assets/sample2.m4a' }
+    ] },
     { layers: [
-      { type: 'title-background', text: 'Clip 6' },
+      { type: 'title-background', text: 'THE END' },
       { type: 'audio', path: './assets/sample2.m4a' }
     ] },
   ],
diff --git a/examples/audio2.json5 b/examples/audio2.json5
index bebd44c..ed886aa 100644
--- a/examples/audio2.json5
+++ b/examples/audio2.json5
@@ -1,22 +1,16 @@
 {
   // enableFfmpegLog: true,
   outPath: './audio2.mp4',
-  keepSourceAudio: true,
+  width: 200, height: 200,
   clips: [
-    { duration: 0.5, layers: [{ type: 'video', path: './assets/lofoten.mp4', cutFrom: 0.4, cutTo: 2 }] },
-
-    { layers: [
-      { type: 'title-background', text: 'Audio track' },
-      { type: 'audio', path: './assets/High [NCS Release] - JPB  (No Copyright Music)-R8ZRCXy5vhA.m4a' }] },
-
-    { layers: [
-      { type: 'video', path: './assets/lofoten.mp4', cutFrom: 0, cutTo: 2, mixVolume: 0.7 },
-      { type: 'audio', path: './assets/High [NCS Release] - JPB  (No Copyright Music)-R8ZRCXy5vhA.m4a', mixVolume: 0.3 }] },
-
-    { layers: [
-      { type: 'video', path: './assets/lofoten.mp4', cutFrom: 0.4, cutTo: 2 },
-      { type: 'audio', path: './assets/High [NCS Release] - JPB  (No Copyright Music)-R8ZRCXy5vhA.m4a' }] },
-
     { layers: [{ type: 'video', path: './assets/lofoten.mp4', cutFrom: 1, cutTo: 2 }] },
+    { duration: 15, layers: { type: 'title-background', text: 'Audio track' } },
   ],
-}
+  audioNorm: { enable: true, gaussSize: 3, maxGain: 100 },
+  clipsAudioVolume: 50,
+  audioTracks: [
+    { path: './assets/High [NCS Release] - JPB  (No Copyright Music)-R8ZRCXy5vhA.m4a', cutFrom: 18 },
+    { path: './assets/winxp.mp3', mixVolume: 10, cutFrom: 1, cutTo: 2, start: 2 },
+    { path: './assets/Julen_ribas.m4a', mixVolume: 50, cutTo: 7, start: 5 },
+  ],
+}
\ No newline at end of file
diff --git a/examples/audio3.json5 b/examples/audio3.json5
new file mode 100644
index 0000000..7a2294e
--- /dev/null
+++ b/examples/audio3.json5
@@ -0,0 +1,15 @@
+{
+  outPath: './audio3.mp4',
+  width: 200, height: 200,
+  clips: [
+    { layers: [{ type: 'video', path: './assets/lofoten.mp4', cutTo: 2 }, { type: 'title', text: 'Arbitrary audio' }] },
+    { duration: 3, layers: [{ type: 'title-background', text: 'Voice starts in 1 sec' }, { type: 'detached-audio', path: './assets/Julen_ribas.m4a', mixVolume: 50, cutFrom: 2, start: 1 }] },
+    { duration: 1, layers: [{ type: 'title-background', text: 'Voice continues over clip 2' }] }, 
+    { duration: 3, layers: [{ type: 'title-background', text: 'Voice continues over clip 3' }] },
+    { duration: 2, layers: [{ type: 'title-background', text: 'XP sound starts' }, { type: 'detached-audio', path: './assets/winxp.mp3', mixVolume: 10, cutFrom: 0.5 }] },
+  ],
+  audioNorm: { enable: true, gaussSize: 3, maxGain: 100 },
+  audioTracks: [
+    { path: './assets/High [NCS Release] - JPB  (No Copyright Music)-R8ZRCXy5vhA.m4a', cutFrom: 18 },
+  ],
+}
\ No newline at end of file
diff --git a/examples/audioLoop.json5 b/examples/audioLoop.json5
index 56c0505..c258101 100644
--- a/examples/audioLoop.json5
+++ b/examples/audioLoop.json5
@@ -1,5 +1,6 @@
 {
   outPath: './audioLoop.mp4',
+  width: 200, height: 200,
   audioFilePath: './assets/winxp.mp3',
   loopAudio: true,
   // Should properly cut off and not crash with EPIPE if loopAudio=false and audio duration is shorter than total duration
diff --git a/index.js b/index.js
index 42b306a..1542b99 100644
--- a/index.js
+++ b/index.js
@@ -8,7 +8,7 @@ const { nanoid } = require('nanoid');
 const { parseFps, multipleOf2 } = require('./util');
 const { createFabricCanvas, rgbaToFabricImage, getNodeCanvasFromFabricCanvas } = require('./sources/fabric');
 const { createFrameSource } = require('./sources/frameSource');
-const parseConfig = require('./parseConfig');
+const { parseConfig } = require('./parseConfig');
 const GlTransitions = require('./glTransitions');
 const Audio = require('./audio');
 const { assertFileValid, checkTransition } = require('./util');
@@ -22,18 +22,22 @@ const Editly = async (config = {}) => {
     enableFfmpegLog = false,
     verbose = false,
     logTimes = false,
+    keepTmp = false,
     fast,
 
     outPath,
     clips: clipsIn,
+    clipsAudioVolume = 1,
+    audioTracks: arbitraryAudioIn = [],
     width: requestedWidth,
     height: requestedHeight,
     fps: requestedFps,
     defaults = {},
-    audioFilePath: audioFilePathIn,
+    audioFilePath: backgroundAudioPath,
     loopAudio,
     keepSourceAudio,
     allowRemoteRequests,
+    audioNorm,
 
     ffmpegPath = 'ffmpeg',
     ffprobePath = 'ffprobe',
@@ -41,10 +45,7 @@ const Editly = async (config = {}) => {
 
   const isGif = outPath.toLowerCase().endsWith('.gif');
 
-  let audioFilePath;
-  if (!isGif) audioFilePath = audioFilePathIn;
-
-  if (audioFilePath) await assertFileValid(audioFilePath, allowRemoteRequests);
+  if (backgroundAudioPath) await assertFileValid(backgroundAudioPath, allowRemoteRequests);
 
   checkTransition(defaults.transition);
 
@@ -53,21 +54,17 @@ const Editly = async (config = {}) => {
   assert(outPath, 'Please provide an output path');
   assert(clipsIn.length > 0, 'Please provide at least 1 clip');
 
-  const clips = await parseConfig({ defaults, clips: clipsIn, allowRemoteRequests, ffprobePath });
-
-  const { editAudio } = Audio({ ffmpegPath, ffprobePath, enableFfmpegLog, verbose });
+  const { clips, arbitraryAudio } = await parseConfig({ defaults, clips: clipsIn, arbitraryAudio: arbitraryAudioIn, backgroundAudioPath, loopAudio, allowRemoteRequests, ffprobePath });
+  if (verbose) console.log('Calculated', JSON5.stringify({ clips, arbitraryAudio }, null, 2));
 
   const outDir = dirname(outPath);
   const tmpDir = join(outDir, `editly-tmp-${nanoid()}`);
   if (verbose) console.log({ tmpDir });
-  await fs.remove(tmpDir);
   await fs.mkdirp(tmpDir);
 
-  if (!audioFilePath && keepSourceAudio) {
-    audioFilePath = await editAudio({ clips, tmpDir });
-  }
+  const { editAudio } = Audio({ ffmpegPath, ffprobePath, enableFfmpegLog, verbose, tmpDir });
 
-  if (verbose) console.log(JSON5.stringify(clips, null, 2));
+  const audioFilePath = !isGif ? await editAudio({ keepSourceAudio, arbitraryAudio, clipsAudioVolume, clips, audioNorm }) : undefined;
 
   // Try to detect parameters from first video
   let firstVideoWidth;
@@ -193,8 +190,6 @@ const Editly = async (config = {}) => {
       '-y', outPath,
     ];
 
-    const loopAudioArgs = loopAudio ? ['-stream_loop', '-1'] : [];
-
     const args = [
       ...(enableFfmpegLog ? [] : ['-hide_banner', '-loglevel', 'error']),
 
@@ -205,7 +200,7 @@ const Editly = async (config = {}) => {
       '-r', framerateStr,
       '-i', '-',
 
-      ...(audioFilePath ? [...loopAudioArgs, '-i', audioFilePath, '-shortest'] : []),
+      ...(audioFilePath ? ['-i', audioFilePath] : []),
 
       ...(!isGif ? ['-map', '0:v:0'] : []),
       ...(audioFilePath ? ['-map', '1:a:0'] : []),
@@ -374,7 +369,7 @@ const Editly = async (config = {}) => {
     if (verbose) console.log('Cleanup');
     if (frameSource1) await frameSource1.close();
     if (frameSource2) await frameSource2.close();
-    await fs.remove(tmpDir);
+    if (!keepTmp) await fs.remove(tmpDir);
   }
 
   try {
@@ -389,7 +384,8 @@ const Editly = async (config = {}) => {
   console.log(outPath);
 };
 
-// Pure function to get a frame at a certain time (excluding transitions)
+// Pure function to get a frame at a certain time
+// TODO I think this does not respect transition durations
 async function renderSingleFrame({
   time = 0,
   defaults,
diff --git a/parseConfig.js b/parseConfig.js
index 6fde470..e121e08 100644
--- a/parseConfig.js
+++ b/parseConfig.js
@@ -13,7 +13,23 @@ const { assertFileValid, checkTransition } = require('./util');
 const loadedFonts = [];
 
 
-async function parseConfig({ defaults: defaultsIn = {}, clips, allowRemoteRequests, ffprobePath }) {
+// Validates each audio track (path via assertFileValid, plus cutFrom/cutTo/start); rejects on the first failure.
+async function validateArbitraryAudio(audio) {
+  assert(audio === undefined || Array.isArray(audio), 'Audio tracks must be an array');
+  if (audio) {
+    // eslint-disable-next-line no-restricted-syntax
+    for (const { path, cutFrom, cutTo, start } of audio) {
+      await assertFileValid(path, false);
+      // Options that are null/undefined are treated as "not specified" and are not range-checked
+      if (cutFrom != null && cutTo != null) assert(cutTo > cutFrom, `"cutTo" ${cutTo} must be greater than "cutFrom" ${cutFrom}`);
+      if (cutFrom != null) assert(cutFrom >= 0, `Invalid "cutFrom" ${cutFrom}`);
+      if (cutTo != null) assert(cutTo >= 0, `Invalid "cutTo" ${cutTo}`);
+      assert(start == null || start >= 0, `Invalid "start" ${start}`);
+    }
+  }
+}
+
+async function parseConfig({ defaults: defaultsIn = {}, clips, arbitraryAudio: arbitraryAudioIn, backgroundAudioPath, loopAudio, allowRemoteRequests, ffprobePath }) {
   const defaults = {
     duration: 4,
     ...defaultsIn,
@@ -88,7 +104,9 @@ async function parseConfig({ defaults: defaultsIn = {}, clips, allowRemoteReques
     throw new Error(`Invalid layer type ${type}`);
   }
 
-  return pMap(clips, async (clip, clipIndex) => {
+  const detachedAudioByClip = {};
+
+  let clipsOut = await pMap(clips, async (clip, clipIndex) => {
     assert(typeof clip === 'object', '"clips" must contain objects with one or more layers');
     const { transition: userTransition, duration: userClipDuration, layers: layersIn } = clip;
 
@@ -136,7 +154,7 @@ async function parseConfig({ defaults: defaultsIn = {}, clips, allowRemoteReques
       }
 
       // Audio is handled later
-      if (type === 'audio') return layer;
+      if (['audio', 'detached-audio'].includes(type)) return layer;
 
       return handleLayer(layer);
     }, { concurrency: 1 }));
@@ -181,7 +199,7 @@ async function parseConfig({ defaults: defaultsIn = {}, clips, allowRemoteReques
         return { ...layer, cutFrom, cutTo, speedFactor };
       }
 
-      if (layer.type === 'video') {
+      if (type === 'video') {
         const { inputDuration } = layer;
 
         let speedFactor;
@@ -197,15 +215,76 @@ async function parseConfig({ defaults: defaultsIn = {}, clips, allowRemoteReques
         return { ...layer, speedFactor };
       }
 
+      // These audio tracks are detached from the clips (can run over multiple clips)
+      // This is useful so we can have audio start relative to clip start time
+      if (type === 'detached-audio') {
+        const { cutFrom, cutTo, mixVolume, start } = layer;
+        if (!detachedAudioByClip[clipIndex]) detachedAudioByClip[clipIndex] = [];
+        detachedAudioByClip[clipIndex].push({ path, cutFrom, cutTo, mixVolume, start });
+        return undefined;
+      }
+
       return layer;
     });
 
+    layersOut = layersOut.filter((l) => l);
+
     return {
       transition,
       duration: clipDuration,
       layers: layersOut,
     };
   }, { concurrency: 1 });
+
+
+  let totalClipDuration = 0;
+  const clipDetachedAudio = [];
+
+  // Need to map again because now we know all clip durations
+  clipsOut = await pMap(clipsOut, async (clip, i) => {
+    const nextClip = clipsOut[i + 1];
+
+    // We clamp each transition to at most half the duration of both this clip and the next clip
+    // NOTE: similar logic is duplicated in index.js
+    let safeTransitionDuration = 0;
+    if (nextClip) {
+      // Each clip can have two transitions, make sure we leave enough room:
+      safeTransitionDuration = Math.min(clip.duration / 2, nextClip.duration / 2, clip.transition.duration);
+    }
+
+    // We now know all clip durations so we can calculate the offset for detached audio tracks
+    // eslint-disable-next-line no-restricted-syntax
+    for (const { start, ...rest } of (detachedAudioByClip[i] || [])) {
+      clipDetachedAudio.push({ ...rest, start: totalClipDuration + (start || 0) });
+    }
+
+    totalClipDuration += clip.duration - safeTransitionDuration;
+
+    return {
+      ...clip,
+      transition: {
+        ...clip.transition,
+        duration: safeTransitionDuration,
+      },
+    };
+  });
+
+  // Audio can come from the background audio file, caller-supplied arbitrary audio tracks, or "detached" audio layers in clips
+  const arbitraryAudio = [
+    // Background audio is treated just like arbitrary audio
+    ...(backgroundAudioPath ? [{ path: backgroundAudioPath, mixVolume: 1, loop: loopAudio ? -1 : 0 }] : []),
+    ...(arbitraryAudioIn || []), // Tolerate callers that omit `arbitraryAudio` (spreading undefined would throw a TypeError)
+    ...clipDetachedAudio,
+  ];
+
+  await validateArbitraryAudio(arbitraryAudio);
+
+  return {
+    clips: clipsOut,
+    arbitraryAudio,
+  };
 }
 
-module.exports = parseConfig;
+module.exports = {
+  parseConfig,
+};