improve restoration strategy in mode 0

haoheliu · haoheliu · commit bbbe3f055f71 · 2021-09-30T15:40:02.000+08:00
diff --git a/README.md b/README.md
@@ -26,10 +26,25 @@ from voicefixer import Vocoder
 # Initialize model
 voicefixer = VoiceFixer()
 # Speech restoration
+
+# Mode 0: part of the higher-frequency content is removed from the input before restoration
+voicefixer.restore(input="", # input wav file path
+                   output="", # output wav file path
+                   cuda=False, # whether to use gpu acceleration
+                   mode = 0) # You can try out mode 0, 1, 2 to find out the best result
+# Mode 1
+voicefixer.restore(input="", # input wav file path
+                   output="", # output wav file path
+                   cuda=False, # whether to use gpu acceleration
+                   mode = 1) # You can try out mode 0, 1, 2 to find out the best result
+# Mode 2: can be more effective on seriously damaged speech
 voicefixer.restore(input="", # input wav file path
                    output="", # output wav file path
                    cuda=False, # whether to use gpu acceleration
-                   mode = 0) # You can try out mode 0, 1 to find out the best result
+                   mode = 2) # You can try out mode 0, 1, 2 to find out the best result
+
+
+
 
 # Universal speaker independent vocoder
 vocoder = Vocoder(sample_rate=44100) # Only 44100 sampling rate is supported.
diff --git a/setup.py b/setup.py
@@ -31,7 +31,7 @@
 EMAIL = 'haoheliu@gmail.com'
 AUTHOR = 'Haohe Liu'
 REQUIRES_PYTHON = '>=3.7.0'
-VERSION = '0.0.7'
+VERSION = '0.0.8'
 
 # What packages are required for this module to be executed?
 REQUIRED = [
diff --git a/test/test.py b/test/test.py
@@ -14,6 +14,6 @@
 
 voicefixer = VoiceFixer()
 
-voicefixer.restore(input="/Users/liuhaohe/Desktop/test_song.wav",
-                   output="/Users/liuhaohe/Desktop/test_song_out_2.wav",
-                   cuda=False,mode=1)
+voicefixer.restore(input="/Users/liuhaohe/Downloads/lieshi_short.wav",
+                   output="/Users/liuhaohe/Downloads/lieshi_short.wav",
+                   cuda=False,mode=2)
diff --git a/voicefixer/base.py b/voicefixer/base.py
@@ -61,19 +61,41 @@ def _pre(self, model, input, cuda):
         # return models.to_log(sp), models.to_log(mel_orig)
         return sp, mel_orig
 
+    def remove_higher_frequency(self, wav, ratio=0.95):
+        stft = librosa.stft(wav)
+        real, img = np.real(stft), np.imag(stft)
+        mag = (real ** 2 + img ** 2) ** 0.5
+        cos, sin = real / mag, img / mag
+        spec = np.abs(stft)  # [1025,T]
+        feature = spec.copy()
+        feature = np.log10(feature)
+        feature[feature < 0] = 0
+        energy_level = np.sum(feature, axis=1)
+        threshold = np.sum(energy_level) * ratio
+        curent_level, i = energy_level[0], 0
+        while (i < energy_level.shape[0] and curent_level < threshold):
+            curent_level += energy_level[i + 1, ...]
+            i += 1
+        spec[i:, ...] = np.zeros_like(spec[i:, ...])
+        stft = spec * cos + 1j * spec * sin
+        return librosa.istft(stft)
+
     def restore(self, input, output, cuda=False, mode=0):
         if(cuda and torch.cuda.is_available()):
             self._model = self._model.cuda()
         # metrics = {}
-        if(mode == 1):
-            self._model.train() # More effective on seriously demaged speech
-        elif(mode == 2):
-            self._model.generator.denoiser.train() # Another option worth trying
-        else:
+        if(mode == 0):
             self._model.eval()
+        elif(mode == 1):
+            self._model.eval()
+        elif(mode == 2):
+            self._model.train() # More effective on seriously demaged speech
 
         with torch.no_grad():
             wav_10k = self._load_wav(input, sample_rate=44100)
+            if(mode == 0):
+                # print("In mode 0, we will remove part of the higher frequency part before processing")
+                wav_10k = self.remove_higher_frequency(wav_10k)
             res = []
             seg_length = 44100*60
             break_point = seg_length