Augment audio to a video (#133)

Adib234 · facebook-github-bot · commit 8d623dfd6440 · 2021-10-18T12:30:33.000-07:00
Summary: ## Related Issue Fixes #130 - [X] I have read CONTRIBUTING.md to understand how to contribute to this repository :) <Please summarize what you are trying to achieve, what changes you made, and how they achieve the desired result.> I'm trying to extract the audio to a temporary file and then apply some audio augmentation. After that I swap the current audio in the video with the augmented audio. I also gather metadata for the augmented audio and video. It either returns the output path (if it was specified) or the video path. The only file that was changed was `/AugLy/augly/video/functional.py`. A question I have is what should be the description for the `audio_aug_function` param? ## Unit Tests If your changes touch the `audio` module, please run all of the `audio` tests and paste the output here. Likewise for `image`, `text`, & `video`. If your changes could affect behavior in multiple modules, please run the tests for all potentially affected modules. If you are unsure of which modules might be affected by your changes, please just run all the unit tests. ### Video ```bash python -m unittest discover -s augly/tests/video_tests/ -p "*" ``` Output of test suite for video ``` ...../Users/admin/AugLy/augly/image/utils/utils.py:51: ResourceWarning: unclosed file <_io.BufferedReader name='/Users/admin/AugLy/augly/assets/screenshot_templates/bboxes.json'> bbox = json.load(open(local_bbox_path, "rb"))[template_key] ResourceWarning: Enable tracemalloc to get the object allocation traceback /Users/admin/AugLy/augly/video/helpers/metadata.py:357: ResourceWarning: unclosed file <_io.BufferedReader name='/Users/admin/AugLy/augly/assets/screenshot_templates/web.png'> metadata[-1]["intensity"] = getattr( ResourceWarning: Enable tracemalloc to get the object allocation traceback ./Users/admin/AugLy/augly/image/utils/utils.py:184: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. 
Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations A = np.matrix(matrix, dtype=np.float) /Users/admin/AugLy/augly/image/utils/utils.py:184: PendingDeprecationWarning: the matrix subclass is not the recommended way to represent matrices or deal with linear algebra (see https://docs.scipy.org/doc/numpy/user/numpy-for-matlab-users.html). Please adjust your code to use regular ndarray. A = np.matrix(matrix, dtype=np.float) /Users/admin/opt/anaconda3/envs/augly/lib/python3.9/site-packages/numpy/matrixlib/defmatrix.py:69: PendingDeprecationWarning: the matrix subclass is not the recommended way to represent matrices or deal with linear algebra (see https://docs.scipy.org/doc/numpy/user/numpy-for-matlab-users.html). Please adjust your code to use regular ndarray. return matrix(data, dtype=dtype, copy=False) .. ====================================================================== FAIL: test_ReplaceWithBackground (transforms.composite_test.TransformsVideoUnitTest) ---------------------------------------------------------------------- Traceback (most recent call last): File "/Users/admin/AugLy/augly/tests/video_tests/transforms/composite_test.py", line 75, in test_ReplaceWithBackground self.evaluate_class( File "/Users/admin/AugLy/augly/tests/video_tests/base_unit_test.py", line 122, in evaluate_class self.assertTrue( AssertionError: False is not true ---------------------------------------------------------------------- Ran 63 tests in 813.840s FAILED (failures=1) ``` ## Other testing N/A only looking for some feedback on the work I did so far If applicable, test your changes and paste the output here. 
For example, if your changes affect the requirements/installation, then test installing augly in a fresh conda env, then make sure you are able to import augly & run the unit test Pull Request resolved: #133 Reviewed By: zpapakipos Differential Revision: D31368459 Pulled By: jbitton fbshipit-source-id: 9c76057d1a2057ed25b317f2eb3fd1808c92bd8d
diff --git a/augly/tests/video_tests/transforms/composite_test.py b/augly/tests/video_tests/transforms/composite_test.py
@@ -5,6 +5,7 @@
 import random
 import unittest
 
+import augly.audio as audaugs
 import augly.video as vidaugs
 from augly.tests.base_configs import VideoAugConfig
 from augly.tests.video_tests.base_unit_test import BaseVideoUnitTest
@@ -23,6 +24,14 @@ def setUpClass(cls):
     def test_ApplyLambda(self):
         self.evaluate_class(vidaugs.ApplyLambda(), fname="apply_lambda")
 
+    def test_AugmentAudio(self):
+        self.evaluate_class(
+            vidaugs.AugmentAudio(
+                audio_aug_function=audaugs.PitchShift(),
+            ),
+            fname="augment_audio",  # presumably the expected-metadata key; confirm
+        )
+
     def test_InsertInBackground(self):
         self.evaluate_class(
             vidaugs.InsertInBackground(offset_factor=0.25),
@@ -79,6 +88,7 @@ def test_ReplaceWithBackground(self):
                 source_percentage=0.7,
             ),
             fname="replace_with_background",
+            metadata_exclude_keys=["dst_duration", "dst_fps"],
         )
 
     def test_ReplaceWithColorFrames(self):
diff --git a/augly/utils/expected_output/video_tests/expected_metadata.json b/augly/utils/expected_output/video_tests/expected_metadata.json
@@ -80,6 +80,60 @@
             "src_width": 1920
         }
     ],
+    "augment_audio": [
+        {
+            "audio_aug_function": "PitchShift",
+            "audio_aug_kwargs": {},
+            "audio_metadata": [
+                {
+                    "dst_duration": 10.005333333333333,
+                    "dst_num_channels": 1,
+                    "dst_sample_rate": 48000,
+                    "dst_segments": [
+                        {
+                            "end": 10.005333333333333,
+                            "start": 0
+                        }
+                    ],
+                    "intensity": 1.1904761904761905,
+                    "n_steps": 1,
+                    "name": "pitch_shift",
+                    "output_path": null,
+                    "src_duration": 10.005333333333333,
+                    "src_num_channels": 1,
+                    "src_sample_rate": 48000,
+                    "src_segments": [
+                        {
+                            "end": 10.005333333333333,
+                            "start": 0
+                        }
+                    ]
+                }
+            ],
+            "dst_duration": 10.027855,
+            "dst_fps": 29.916666666666668,
+            "dst_height": 1080,
+            "dst_segments": [
+                {
+                    "end": 10.027855,
+                    "start": 0
+                }
+            ],
+            "dst_width": 1920,
+            "intensity": 1.1904761904761905,
+            "name": "augment_audio",
+            "src_duration": 10.027855,
+            "src_fps": 29.916666666666668,
+            "src_height": 1080,
+            "src_segments": [
+                {
+                    "end": 10.027855,
+                    "start": 0
+                }
+            ],
+            "src_width": 1920
+        }
+    ],
     "blend_videos": [
         {
             "dst_duration": 10.027855,
diff --git a/augly/video/__init__.py b/augly/video/__init__.py
@@ -6,6 +6,7 @@
     add_noise,
     apply_lambda,
     audio_swap,
+    augment_audio,
     blend_videos,
     blur,
     brightness,
@@ -21,20 +22,20 @@
     hflip,
     hstack,
     insert_in_background,
-    replace_with_background,
     loop,
     meme_format,
     overlay,
     overlay_dots,
     overlay_emoji,
-    overlay_onto_screenshot,
     overlay_onto_background_video,
+    overlay_onto_screenshot,
     overlay_shapes,
     overlay_text,
     pad,
     perspective_transform_and_shake,
     pixelization,
     remove_audio,
+    replace_with_background,
     replace_with_color_frames,
     resize,
     rotate,
@@ -50,6 +51,7 @@
     AddNoise,
     ApplyLambda,
     AudioSwap,
+    AugmentAudio,
     BlendVideos,
     Blur,
     Brightness,
@@ -106,6 +108,7 @@
     "AddNoise",
     "ApplyLambda",
     "AudioSwap",
+    "AugmentAudio",
     "BlendVideos",
     "Blur",
     "Brightness",
@@ -161,6 +164,7 @@
     "add_noise",
     "apply_lambda",
     "audio_swap",
+    "augment_audio",
     "blend_videos",
     "blur",
     "brightness",
diff --git a/augly/video/functional.py b/augly/video/functional.py
@@ -8,6 +8,8 @@
 import tempfile
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import augly.audio as audaugs
+import augly.audio.utils as audutils
 import augly.image as imaugs
 import augly.utils as utils
 import augly.video.augmenters.cv2 as ac
@@ -138,6 +140,69 @@ def audio_swap(
     return output_path or video_path
 
 
+def augment_audio(
+    video_path: str,
+    output_path: Optional[str] = None,
+    audio_aug_function: Callable[..., Tuple[np.ndarray, int]] = audaugs.apply_lambda,
+    metadata: Optional[List[Dict[str, Any]]] = None,
+    **audio_aug_kwargs,
+) -> str:
+    """
+    Augments the audio track of the input video using a given AugLy audio augmentation
+
+    @param video_path: the path to the video to be augmented
+
+    @param output_path: the path in which the resulting video will be stored.
+        If not passed in, the original video file will be overwritten
+
+    @param audio_aug_function: the augmentation function to be applied onto the video's
+        audio track. It is called with the extracted audio plus `sample_rate`,
+        `metadata` & `audio_aug_kwargs` as keyword arguments, and must return the
+        augmented audio & its sample rate as a tuple
+
+    @param metadata: if set to be a list, metadata about the function execution
+        including its name, the source & dest duration, fps, etc. will be appended
+        to the inputted list. If set to None, no metadata will be appended or returned
+
+    @param audio_aug_kwargs: the input attributes to be passed into `audio_aug_function`
+
+    @returns: the path to the augmented video
+    """
+    assert callable(audio_aug_function), (
+        repr(type(audio_aug_function).__name__) + " object is not callable"
+    )
+
+    func_kwargs = helpers.get_func_kwargs(
+        metadata, locals(), video_path, audio_aug_function=audio_aug_function
+    )
+
+    if audio_aug_function is not None:  # always true when the assert above ran
+        try:  # plain functions have __name__; other callables use their class name
+            func_kwargs["audio_aug_function"] = audio_aug_function.__name__
+        except AttributeError:
+            func_kwargs["audio_aug_function"] = type(audio_aug_function).__name__
+
+    audio_metadata = []  # passed to audio_aug_function as its metadata list
+    with tempfile.NamedTemporaryFile(suffix=".wav") as tmpfile:  # scratch wav file
+        helpers.extract_audio_to_file(video_path, tmpfile.name)
+        audio, sr = audutils.validate_and_load_audio(tmpfile.name)
+        aug_audio, aug_sr = audio_aug_function(
+            audio, sample_rate=sr, metadata=audio_metadata, **audio_aug_kwargs
+        )
+        audutils.ret_and_save_audio(aug_audio, tmpfile.name, aug_sr)  # reuses tmpfile
+        audio_swap(video_path, tmpfile.name, output_path=output_path or video_path)
+
+    if metadata is not None:
+        helpers.get_metadata(
+            metadata=metadata,
+            audio_metadata=audio_metadata,
+            function_name="augment_audio",
+            **func_kwargs,
+        )
+
+    return output_path or video_path
+
+
 def blend_videos(
     video_path: str,
     overlay_path: str,
diff --git a/augly/video/helpers/__init__.py b/augly/video/helpers/__init__.py
@@ -15,6 +15,7 @@
     add_noise_intensity,
     apply_lambda_intensity,
     audio_swap_intensity,
+    augment_audio_intensity,
     blend_videos_intensity,
     blur_intensity,
     brightness_intensity,
@@ -85,6 +86,7 @@
     "add_noise_intensity",
     "apply_lambda_intensity",
     "audio_swap_intensity",
+    "augment_audio_intensity",
     "blend_videos_intensity",
     "blur_intensity",
     "brightness_intensity",
diff --git a/augly/video/helpers/intensity.py b/augly/video/helpers/intensity.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates.
 
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import augly.image.intensity as imint
 import augly.image.utils as imutils
@@ -42,6 +42,10 @@ def audio_swap_intensity(offset: float, **kwargs) -> float:
     return (1.0 - offset) * 100.0
 
 
+def augment_audio_intensity(audio_metadata: List[Dict[str, Any]], **kwargs) -> float:
+    return audio_metadata[0]["intensity"]  # first entry only; IndexError if empty
+
+
 def blend_videos_intensity(opacity: float, overlay_size: float, **kwargs) -> float:
     return imint.overlay_media_intensity_helper(opacity, overlay_size)
 
@@ -209,9 +213,7 @@ def overlay_emoji_intensity(
 
 
 def overlay_onto_background_video_intensity(
-    overlay_size: Optional[float],
-    metadata: Dict[str, Any],
-    **kwargs,
+    overlay_size: Optional[float], metadata: Dict[str, Any], **kwargs
 ) -> float:
     if overlay_size is not None:
         return (1 - overlay_size ** 2) * 100.0
diff --git a/augly/video/transforms.py b/augly/video/transforms.py
@@ -5,8 +5,10 @@
 import random
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
+import augly.audio as audaugs
 import augly.utils as utils
 import augly.video.functional as F
+import numpy as np
 from augly.video.helpers import identity_function
 
 
@@ -270,6 +272,60 @@ def apply_transform(
         )
 
 
+class AugmentAudio(BaseTransform):
+    def __init__(
+        self,
+        audio_aug_function: Callable[
+            ..., Tuple[np.ndarray, int]
+        ] = audaugs.apply_lambda,
+        p: float = 1.0,
+        **audio_aug_kwargs,
+    ):
+        """
+        @param audio_aug_function: the augmentation function to be applied onto the
+            video's audio track; forwarded as-is to `F.augment_audio`, which calls it
+            on the extracted audio with `sample_rate`, `metadata` & `audio_aug_kwargs`
+            and expects the augmented audio & its sample rate back as a tuple
+
+        @param p: the probability of the transform being applied; default value is 1.0
+
+        @param audio_aug_kwargs: the keyword args passed to `audio_aug_function`
+        """
+        super().__init__(p)
+        self.audio_aug_function = audio_aug_function
+        self.audio_aug_kwargs = audio_aug_kwargs
+
+    def apply_transform(
+        self,
+        video_path: str,
+        output_path: str,
+        metadata: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """
+        Augments the audio track of the input video using a given AugLy audio
+        augmentation
+
+        @param video_path: the path to the video to be augmented
+
+        @param output_path: the path in which the resulting video will be stored.
+            If not passed in, the original video file will be overwritten
+
+        @param metadata: if set to be a list, metadata about the function execution
+            including its name, the source & dest duration, fps, etc. will be appended
+            to the inputted list. If set to None, no metadata will be appended or
+            returned
+
+        @returns: the path to the augmented video
+        """
+        return F.augment_audio(
+            video_path=video_path,
+            audio_aug_function=self.audio_aug_function,
+            output_path=output_path,
+            metadata=metadata,
+            **self.audio_aug_kwargs,
+        )
+
+
 class BlendVideos(BaseTransform):
     def __init__(
         self,
@@ -405,7 +461,12 @@ def apply_transform(
 
         @returns: the path to the augmented video
         """
-        return F.brightness(video_path, output_path, self.level, metadata=metadata)
+        return F.brightness(
+            video_path,
+            output_path,
+            level=self.level,
+            metadata=metadata,
+        )
 
 
 class ChangeAspectRatio(BaseTransform):
@@ -713,7 +774,10 @@ def apply_transform(
         @returns: the path to the augmented video
         """
         return F.encoding_quality(
-            video_path, output_path, self.quality, metadata=metadata
+            video_path,
+            output_path,
+            quality=int(self.quality),
+            metadata=metadata,
         )
 
 
@@ -1622,7 +1686,12 @@ def apply_transform(
 
         @returns: the path to the augmented video
         """
-        return F.pixelization(video_path, output_path, self.ratio, metadata=metadata)
+        return F.pixelization(
+            video_path,
+            output_path,
+            ratio=self.ratio,
+            metadata=metadata,
+        )
 
 
 class RemoveAudio(BaseTransform):