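update readme and mode handling: document modes 0/1/2, apply high-frequency removal per segment in mode 1 instead of mode 0, add EPS guards, shorten segments to 30 s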

haoheliu · haoheliu · commit 7ac62992aff2 · 2021-10-01T23:14:10.000+08:00
diff --git a/README.md b/README.md
@@ -54,17 +54,17 @@ from voicefixer import Vocoder
 voicefixer = VoiceFixer()
 # Speech restoration
 
-# Mode 0
+# Mode 0: Original Model (suggested by default)
 voicefixer.restore(input="", # input wav file path
                    output="", # output wav file path
                    cuda=False, # whether to use gpu acceleration
                    mode = 0) # You can try out mode 0, 1, 2 to find out the best result
-# Mode 1
+# Mode 1: Add preprocessing module (remove higher frequency)
 voicefixer.restore(input="", # input wav file path
                    output="", # output wav file path
                    cuda=False, # whether to use gpu acceleration
                    mode = 1) # You can try out mode 0, 1, 2 to find out the best result
-# Mode 2
+# Mode 2: Train mode (might work sometimes on seriously degraded real speech)
 voicefixer.restore(input="", # input wav file path
                    output="", # output wav file path
                    cuda=False, # whether to use gpu acceleration
diff --git a/test/streamlit.py b/test/streamlit.py
@@ -31,7 +31,7 @@ def init_voicefixer():
     st.write('Inference : ')
     
     # choose options
-    mode = st.radio('Voice fixer mode (0: rm high frequency, 1: none, 2: train fixer)', [0, 1, 2])
+    mode = st.radio('Voice fixer modes (0: original mode, 1: Add preprocessing module 2: Train mode (may work sometimes on seriously degraded speech))', [0, 1, 2])
     if torch.cuda.is_available():
         is_cuda = st.radio('Turn on GPU', [True, False])
         if is_cuda != list(voice_fixer._model.parameters())[0].is_cuda:
@@ -62,5 +62,5 @@ def init_voicefixer():
     # make buffer
     with BytesIO() as buffer:
         soundfile.write(buffer, pred_wav.T, samplerate=sample_rate, format='WAV')
-        st.write("Time: {:.3f}".format(pred_time))
+        st.write("Time: {:.3f}s".format(pred_time))
         st.audio(buffer.getvalue(), format='audio/wav')
diff --git a/voicefixer/base.py b/voicefixer/base.py
@@ -65,17 +65,18 @@ def remove_higher_frequency(self, wav, ratio=0.95):
         stft = librosa.stft(wav)
         real, img = np.real(stft), np.imag(stft)
         mag = (real ** 2 + img ** 2) ** 0.5
-        cos, sin = real / mag, img / mag
+        cos, sin = real / (mag+EPS), img / (mag+EPS)
         spec = np.abs(stft)  # [1025,T]
         feature = spec.copy()
-        feature = np.log10(feature)
+        feature = np.log10(feature+EPS)
         feature[feature < 0] = 0
         energy_level = np.sum(feature, axis=1)
         threshold = np.sum(energy_level) * ratio
         curent_level, i = energy_level[0], 0
         while (i < energy_level.shape[0] and curent_level < threshold):
             curent_level += energy_level[i + 1, ...]
             i += 1
+        print(i)
         spec[i:, ...] = np.zeros_like(spec[i:, ...])
         stft = spec * cos + 1j * spec * sin
         return librosa.istft(stft)
@@ -92,13 +93,13 @@ def restore_inmem(self, wav_10k, cuda=False, mode=0, your_vocoder_func=None):
         elif(mode == 2):
             self._model.train() # More effective on seriously demaged speech
 
-        if(mode == 0):
-            wav_10k = self.remove_higher_frequency(wav_10k)
         res = []
-        seg_length = 44100*60
+        seg_length = 44100*30
         break_point = seg_length
         while break_point < wav_10k.shape[0]+seg_length:
             segment = wav_10k[break_point-seg_length:break_point]
+            if (mode == 1):
+                segment = self.remove_higher_frequency(segment)
             sp,mel_noisy = self._pre(self._model, segment, cuda)
             out_model = self._model(sp, mel_noisy)
             denoised_mel = from_log(out_model['mel'])