Skip to content

Commit 45d1fdc

Browse files
committed
v0.1709.1 - increased fallbacks for safe splits
1 parent f2f683b commit 45d1fdc

File tree

3 files changed

+37
-12
lines changed

3 files changed

+37
-12
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,14 +196,17 @@ After launching your bot successfully, you can interact with it via Telegram (se
196196
- `/language` - set the model's transcription language (`auto` = autodetect); if you know the language spoken in the audio, setting the transcription language manually with this command may improve both transcription speed and accuracy.
197197

198198
## Changes
199+
- v0.1709.1 - increased the safety buffers below the maximum message length when splitting messages, to prevent chunks from exceeding the limit
200+
- added a further safeguard to fall back on character-level splitting if no whitespace is found
201+
- please refer to issues (and reopen if necessary) if the problem persists
199202
- v0.1709 - Added `config.ini` option to ping users (i.e. the owner) on startup (when the service is online)
200203
- startup notifications (true/false), user IDs, the environment variable, and fallbacks can be defined in `config.ini`
201204
- v0.1708.4 - Better error catching
202205
- Fixed the description and catching of e.g. YouTube's 403 errors
203206
- v0.1708.3 - Enforced chunk size double-check when sending transcripts as messages
204207
- This is to ensure we're staying under the message length cap in Telegram
205208
- v0.1708.2 - Added buffer for chunking
206-
- Changed the chunk sizes from `4096` to `4000` to avoid edge case
209+
- Changed the chunk sizes from `4096` to `4000` to avoid edge cases
207210
- v0.1708.1 - Small bug fixes in the output
208211
- Note that running the program within `firejail` using Nvidia driver v.560.xx or newer requires e.g.:
209212
```

src/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# openai-whisper transcriber-bot for Telegram
44

55
# version of this program
6-
version_number = "0.1709"
6+
version_number = "0.1709.1"
77

88
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
99
# https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/

src/transcription_handler.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# transcription_handler.py
22
# ~~~
33
# openai-whisper transcriber-bot for Telegram
4-
# v0.12
54
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
65
# https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/
76
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -172,7 +171,7 @@ def get_transcription_settings():
172171
}
173172

174173
# split long messages
175-
def split_message(message, max_length=4096):
174+
def split_message(message, max_length=3500):
176175
return [message[i:i+max_length] for i in range(0, len(message), max_length)]
177176

178177
# // audio download (new method)
@@ -620,22 +619,45 @@ async def process_url_message(message_text, bot, update, model, language):
620619
try:
621620
logger.info(f"Preparing to send plain text message from raw content")
622621
content = transcription_note + raw_content # Add transcription note to the raw content
622+
623623
# Just to be safe, reduce the chunk even more if needed
624-
safe_max = 3500 # even safer limit
624+
safe_max = 3000 # even safer limit
625+
625626
for i in range(0, len(content), safe_max):
626627
chunk = content[i:i+safe_max]
627628

628629
# Optional: Make sure chunk length is safely under 4096 (should already be)
629-
if len(chunk) > 4096:
630-
chunk = chunk[:4096]
631-
632-
# OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed.
633-
# For example, if chunk ends with '<', we might remove that character or find the previous space:
630+
if len(chunk) > 4000:
631+
chunk = chunk[:4000]
632+
633+
# # OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed.
634+
# # For example, if chunk ends with '<', we might remove that character or find the previous space:
635+
# if '<' in chunk[-5:]: # crude check for partial tag at end
636+
# # Try to backtrack to a space before the '<'
637+
# last_space = chunk.rfind(' ')
638+
# if last_space != -1:
639+
# chunk = chunk[:last_space]
640+
641+
# Check if we end on a partial HTML tag
634642
if '<' in chunk[-5:]: # crude check for partial tag at end
635-
# Try to backtrack to a space before the '<'
636643
last_space = chunk.rfind(' ')
637644
if last_space != -1:
638645
chunk = chunk[:last_space]
646+
647+
# Attempt to find last whitespace so we don’t split in the middle of a word
648+
# BUT if there's NO whitespace at all, we forcibly break anyway:
649+
last_space = chunk.rfind(' ')
650+
if last_space == -1 and len(chunk) == safe_max:
651+
# This means no space was found in the entire chunk,
652+
# so we forcibly keep the chunk at safe_max (which likely breaks a word).
653+
logger.warning("No whitespace found. Forcibly splitting mid-word at position %d.", i + safe_max)
654+
# chunk remains chunk[:safe_max], i.e. as-is
655+
elif last_space > -1 and last_space > 0:
656+
# We found a space within the chunk
657+
chunk = chunk[:last_space]
658+
# Note: If you do this, you might want to adjust `i` accordingly
659+
# or treat the leftover text on the next iteration.
660+
# But for a simple approach, this is enough to keep the code short.
639661

640662
# Now send the message
641663
await bot.send_message(chat_id=update.effective_chat.id, text=chunk, parse_mode='HTML')
@@ -680,7 +702,7 @@ async def process_url_message(message_text, bot, update, model, language):
680702
await bot.send_message(chat_id=update.effective_chat.id, text="An error occurred during processing.")
681703

682704
# create video info
683-
def create_video_info_message(details, max_length=4000):
705+
def create_video_info_message(details):
684706
header_separator = "=" * 10
685707
video_info_message = f"""{header_separator}
686708
Title: {details.get('title', 'No title available')}

0 commit comments

Comments
 (0)