@@ -204,9 +204,8 @@ def __call__(
204204 return_token_timestamps : Optional [bool ] = None ,
205205 ** kwargs ,
206206 ) -> BatchFeature :
207- """
208- Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
209- the STFT computation if available, otherwise a slower NumPy based one.
207+ """Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch
208+ for the STFT computation if available, otherwise a slower NumPy based one.
210209
211210 Args:
212211 raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
@@ -220,6 +219,11 @@ def __call__(
220219
221220 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
222221 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
222+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
223+ If set, will return tensors instead of list of python integers. Acceptable values are:
224+
225+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
226+ - `'np'`: Return Numpy `np.ndarray` objects.
223227 return_attention_mask (`bool`, *optional*):
224228 Whether to return the attention mask. If left to the default, will return the attention mask according
225229 to the specific feature_extractor's default.
@@ -232,18 +236,24 @@ def __call__(
232236 bugs.
233237
234238 </Tip>
235-
236- return_tensors (`str` or [`~utils.TensorType`], *optional*):
237- If set, will return tensors instead of list of python integers. Acceptable values are:
238-
239- - `'pt'`: Return PyTorch `torch.Tensor` objects.
240- - `'np'`: Return Numpy `np.ndarray` objects.
239+ padding (`str` or [`~utils.PaddingStrategy`], *optional*, defaults to `'max_length'`):
240+ Activates and controls padding. Accepts the following values:
241+
242+ - `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is
243+ provided).
244+ - `'max_length'` (default): Pad to a maximum length specified with the argument `max_length` or to the
245+ maximum acceptable input length for the model if that argument is not provided.
246+ - `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
247+ max_length (`int`, *optional*):
248+                 Controls the maximum length to use with one of the truncation/padding parameters.
249+
250+                 If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
251+                 is required by one of the truncation/padding parameters. If the model has no specific maximum input
252+                 length (like XLNet), truncation/padding to a maximum length will be deactivated.
241253 sampling_rate (`int`, *optional*):
242254 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
243255 `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
244256 pipeline.
245- padding_value (`float`, *optional*, defaults to 0.0):
246- The value that is used to fill the padding values / vectors.
247257 do_normalize (`bool`, *optional*, defaults to `False`):
248258 Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
249259 improve the performance of the model.
@@ -255,6 +265,7 @@ def __call__(
255265
256266 Whether or not to return the number of frames of the input raw_speech.
257267 These num_frames can be used by the model to compute word level timestamps.
268+             **kwargs: Additional keyword arguments, which are not used by `WhisperFeatureExtractor.__call__` and are ignored.
258269 """
259270 if sampling_rate is not None :
260271 if sampling_rate != self .sampling_rate :
0 commit comments