Skip to content

Commit 4cbd3cd

Browse files
YLouWashU and facebook-github-bot
authored and committed
{BugFix} Fixing audio issue in VrsToMp4.
Summary: This diff fixes the "missing" audio issue in `VrsToMp4`. It makes 3 changes: 1. The max_amplitude of audio signals differs between encoded and non-encoded audio signals (16 bits vs. 32 bits), so this parameter can no longer be hard-coded as before. 2. Pick the 2 mics that are closest to the lower frame, since they have the best audio quality. 3. Incorporate fixes from the Opus decoder in PAT. Reviewed By: kongchen1992 Differential Revision: D83388440 fbshipit-source-id: 23c9dbabeb4c8a679d023be488bbe1fb7fcd055d
1 parent b59ec9b commit 4cbd3cd

File tree

2 files changed

+39
-8
lines changed

2 files changed

+39
-8
lines changed

projectaria_tools/tools/vrs_to_mp4/vrs_to_mp4.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ def parse_args():
5252
default=1,
5353
help="Downsampling factor on VRS images (Must be >=1)",
5454
)
55+
parser.add_argument(
56+
"--audio_channels",
57+
type=int,
58+
nargs="+",
59+
required=False,
60+
default=[0, 2],
61+
help="Audio channel indices to output to the MP4 file, e.g., '--audio_channels 0 1 2'. Default: the 2 mic on the lower frame of the glasses [0, 2]",
62+
)
5563
return parser.parse_args()
5664

5765

@@ -63,6 +71,7 @@ def main():
6371
stream_id=args.stream_id,
6472
log_folder=args.log_folder,
6573
down_sample_factor=args.downsample,
74+
audio_channels=args.audio_channels,
6675
)
6776

6877

projectaria_tools/tools/vrs_to_mp4/vrs_to_mp4_utils.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,12 @@ def convert_vrs_to_mp4(
126126
log_folder: str = None,
127127
stream_id: str = "214-1",
128128
down_sample_factor: int = 1,
129+
audio_channels: List[int] = None,
129130
):
130131
"""Convert a VRS file to MP4 video."""
132+
if audio_channels is None:
133+
audio_channels = [0, 2]
134+
131135
use_temp_folder = False
132136
if log_folder is None:
133137
# If no log folder is provided, we will create a temporary one
@@ -137,7 +141,9 @@ def convert_vrs_to_mp4(
137141
os.mkdir(log_folder)
138142

139143
# Create a vrs to mp4 converter
140-
converter = Vrs2Mp4Converter(vrs_file, stream_id, down_sample_factor)
144+
converter = Vrs2Mp4Converter(
145+
vrs_file, stream_id, down_sample_factor, audio_channels
146+
)
141147
duration_ns = converter.duration_ns()
142148
duration_in_second = duration_ns * 1e-9
143149
video_writer_clip = VideoClip(converter.make_frame, duration=duration_in_second)
@@ -151,7 +157,7 @@ def convert_vrs_to_mp4(
151157
duration=duration_in_second,
152158
fps=converter.audio_config.sample_rate,
153159
)
154-
audio_writer_clip.nchannels = converter.audio_config.num_channels
160+
audio_writer_clip.nchannels = len(converter.audio_channels_)
155161
audio_writer_clip.write_audiofile(
156162
temp_audio_file,
157163
fps=converter.audio_config.sample_rate,
@@ -204,10 +210,21 @@ class Vrs2Mp4Converter:
204210
make_audio_data(t)->np.ndarray is called to insert audio stream into MP4
205211
"""
206212

213+
# Select audio channels to output to MP4
214+
# TODO: support user-selected audio channels.
215+
OUTPUT_AUDIO_CHANNELS = [0, 2]
216+
207217
def __init__(
208-
self, vrs_path: str, stream_id: str = "214-1", down_sampling_factor: int = 1
218+
self,
219+
vrs_path: str,
220+
stream_id: str = "214-1",
221+
down_sampling_factor: int = 1,
222+
audio_channels: List[int] = None,
209223
):
210224
self.down_sampling_factor_ = down_sampling_factor
225+
if audio_channels is None:
226+
audio_channels = [0, 2]
227+
self.audio_channels_ = audio_channels
211228

212229
self.provider_ = data_provider.create_vrs_data_provider(vrs_path)
213230
if not self.provider_:
@@ -242,7 +259,7 @@ def __init__(
242259
self.audio_config = self.provider_.get_audio_configuration(
243260
self.audio_streamid_
244261
)
245-
self.audio_max_value_ = max_signed_value_for_bytes(4)
262+
self.audio_num_channels = self.audio_config.num_channels
246263

247264
# RECORD_TIME for audio is the START of a Record
248265
# DEVICE_TIME for audio is the END of a Record
@@ -415,8 +432,13 @@ def make_audio_data(self, t) -> np.ndarray:
415432
TimeQueryOptions.CLOSEST,
416433
)
417434
audio_data = np.array(audio_data_and_config[0].data)
418-
audio_data = audio_data.astype(np.float64) / self.audio_max_value_
419435

420-
# return all channels
421-
# a subset of channels create crackle sounds
422-
return audio_data
436+
num_samples = int(audio_data.shape[0] / self.audio_num_channels)
437+
max_amplitude = audio_data_and_config[0].max_amplitude
438+
audio_data = audio_data.astype(np.float64) / max_amplitude
439+
440+
# Select only the specified channels
441+
audio_data = audio_data.reshape(num_samples, self.audio_num_channels)
442+
selected_audio_data = audio_data[:, self.audio_channels_]
443+
444+
return selected_audio_data

0 commit comments

Comments
 (0)