Skip to content

Commit 2a1a1bf

Browse files
committed
v0.1710 - chunking logic rewrite w/ fallbacks+logging
1 parent 3cb7bfc commit 2a1a1bf

File tree

3 files changed

+114
-29
lines changed

3 files changed

+114
-29
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ After launching your bot successfully, you can interact with it via Telegram (se
196196
- `/language` - set the model's transcription language (`auto` = autodetect); if you know the language spoken in the audio, setting the transcription language manually with this command may improve both transcription speed and accuracy.
197197

198198
## Changes
199+
- v0.1710 - rewrote the chunking logic used when sending transcriptions as messages
200+
- better step-by-step logging, better error catching, better fitting into TG message limits with fallbacks
201+
- again, if the problem persists, please refer to e.g. [Issue #7](https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/issues/7) (and open a new issue if necessary)
199202
- v0.1709.2 - up & running greeting is now more prominent w/ both UTC+local times
200203
- v0.1709.1 - increased split message maximum character safe zone buffers to prevent chunk exceeding
201204
- added a further safeguard to fall back on character-level splitting if no whitespace is found

src/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# openai-whisper transcriber-bot for Telegram
44

55
# version of this program
6-
version_number = "0.1709.2"
6+
version_number = "0.1710"
77

88
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
99
# https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/

src/transcription_handler.py

Lines changed: 110 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -614,58 +614,140 @@ async def process_url_message(message_text, bot, update, model, language):
614614
# note_length = len(transcription_note)
615615
# max_message_length = 4000 - note_length # Adjust max length to account for transcription note
616616

617-
# message sending and chunking logic; revised
617+
# message sending and chunking logic; revised for v0.1710
618618
if transcription_settings['send_as_messages'] and 'txt' in transcription_paths:
619619
try:
620-
logger.info(f"Preparing to send plain text message from raw content")
620+
logger.info("Preparing to send plain text message from raw content")
621621
content = transcription_note + raw_content # Add transcription note to the raw content
622622

623623
# Just to be safe, reduce the chunk even more if needed
624-
safe_max = 3000 # even safer limit
624+
safe_max = 3500 # even safer limit
625+
i = 0
625626

626-
for i in range(0, len(content), safe_max):
627-
chunk = content[i:i+safe_max]
627+
# Replacing the old `for i in range(0, len(content), safe_max):` approach
628+
# with a `while` loop that ensures leftover text isn’t lost if we trim.
629+
while i < len(content):
630+
# Slice up to safe_max or the end of the string
631+
chunk = content[i:i + safe_max]
628632

629-
# Optional: Make sure chunk length is safely under 4096 (should already be)
633+
# Optional: Make sure chunk length is safely under 4096 (Telegram limit is 4096)
630634
if len(chunk) > 4000:
631635
chunk = chunk[:4000]
632636

633-
# # OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed.
634-
# # For example, if chunk ends with '<', we might remove that character or find the previous space:
635-
# if '<' in chunk[-5:]: # crude check for partial tag at end
636-
# # Try to backtrack to a space before the '<'
637-
# last_space = chunk.rfind(' ')
638-
# if last_space != -1:
639-
# chunk = chunk[:last_space]
637+
# If this chunk is smaller than safe_max, we’re near the end
638+
# We'll still do the partial-tag and whitespace checks, but after sending, we break
639+
if len(chunk) < safe_max:
640+
# Check partial HTML near the end (optional).
641+
if '<' in chunk[-5:]: # crude check for partial tag at end
642+
last_space = chunk.rfind(' ')
643+
if last_space != -1:
644+
chunk = chunk[:last_space]
640645

641-
# Check if we end on a partial HTML tag
646+
# Check if we’re splitting a word
647+
last_space = chunk.rfind(' ')
648+
if last_space == -1 and len(chunk) == safe_max:
649+
logger.warning("No whitespace found. Forcibly splitting mid-word near end.")
650+
elif last_space > 0:
651+
chunk = chunk[:last_space]
652+
653+
# Send the last chunk
654+
await bot.send_message(
655+
chat_id=update.effective_chat.id,
656+
text=chunk,
657+
parse_mode='HTML'
658+
)
659+
logger.info(f"Sent message chunk: {(i // safe_max) + 1}")
660+
break
661+
662+
# If chunk is exactly safe_max in length, do partial tag / partial word checks
663+
664+
# 1) Attempt to avoid splitting an HTML tag
642665
if '<' in chunk[-5:]: # crude check for partial tag at end
643666
last_space = chunk.rfind(' ')
644667
if last_space != -1:
645668
chunk = chunk[:last_space]
646-
647-
# Attempt to find last whitespace so we don’t split in the middle of a word
648-
# BUT if there's NO whitespace at all, we forcibly break anyway:
669+
670+
# 2) Attempt to find last whitespace so we don’t split in the middle of a word
649671
last_space = chunk.rfind(' ')
650672
if last_space == -1 and len(chunk) == safe_max:
651-
# This means no space was found in the entire chunk,
652-
# so we forcibly keep the chunk at safe_max (which likely breaks a word).
653-
logger.warning("No whitespace found. Forcibly splitting mid-word at position %d.", i + safe_max)
654-
# chunk remains chunk[:safe_max], i.e. as-is
655-
elif last_space > -1 and last_space > 0:
656-
# We found a space within the chunk
673+
# This means no space found => forcibly keep chunk as-is
674+
logger.warning(
675+
"No whitespace found. Forcibly splitting mid-word at position %d.",
676+
i + safe_max
677+
)
678+
elif last_space > 0:
657679
chunk = chunk[:last_space]
658-
# Note: If you do this, you might want to adjust `i` accordingly
659-
# or treat the leftover text on the next iteration.
660-
# But for a simple approach, this is enough to keep the code short.
661680

662-
# Now send the message
663-
await bot.send_message(chat_id=update.effective_chat.id, text=chunk, parse_mode='HTML')
681+
# Now send the chunk
682+
await bot.send_message(
683+
chat_id=update.effective_chat.id,
684+
text=chunk,
685+
parse_mode='HTML'
686+
)
664687
logger.info(f"Sent message chunk: {(i // safe_max) + 1}")
688+
689+
# Advance i by the length of the chunk we actually sent
690+
# That leftover beyond `chunk` is still unsent, so the next loop iteration handles it
691+
i += len(chunk)
692+
665693
except Exception as e:
666694
logger.error(f"Error in sending plain text message: {e}")
667695
else:
668696
logger.info("Condition for sending plain text message not met.")
697+
698+
# # // old method; used up until v0.1709.2
699+
# # message sending and chunking logic; revised
700+
# if transcription_settings['send_as_messages'] and 'txt' in transcription_paths:
701+
# try:
702+
# logger.info(f"Preparing to send plain text message from raw content")
703+
# content = transcription_note + raw_content # Add transcription note to the raw content
704+
705+
# # Just to be safe, reduce the chunk even more if needed
706+
# safe_max = 3500 # even safer limit
707+
708+
# for i in range(0, len(content), safe_max):
709+
# chunk = content[i:i+safe_max]
710+
711+
# # Optional: Make sure chunk length is safely under 4096 (should already be)
712+
# if len(chunk) > 4000:
713+
# chunk = chunk[:4000]
714+
715+
# # # OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed.
716+
# # # For example, if chunk ends with '<', we might remove that character or find the previous space:
717+
# # if '<' in chunk[-5:]: # crude check for partial tag at end
718+
# # # Try to backtrack to a space before the '<'
719+
# # last_space = chunk.rfind(' ')
720+
# # if last_space != -1:
721+
# # chunk = chunk[:last_space]
722+
723+
# # Check if we end on a partial HTML tag
724+
# if '<' in chunk[-5:]: # crude check for partial tag at end
725+
# last_space = chunk.rfind(' ')
726+
# if last_space != -1:
727+
# chunk = chunk[:last_space]
728+
729+
# # Attempt to find last whitespace so we don’t split in the middle of a word
730+
# # BUT if there's NO whitespace at all, we forcibly break anyway:
731+
# last_space = chunk.rfind(' ')
732+
# if last_space == -1 and len(chunk) == safe_max:
733+
# # This means no space was found in the entire chunk,
734+
# # so we forcibly keep the chunk at safe_max (which likely breaks a word).
735+
# logger.warning("No whitespace found. Forcibly splitting mid-word at position %d.", i + safe_max)
736+
# # chunk remains chunk[:safe_max], i.e. as-is
737+
# elif last_space > -1 and last_space > 0:
738+
# # We found a space within the chunk
739+
# chunk = chunk[:last_space]
740+
# # Note: If you do this, you might want to adjust `i` accordingly
741+
# # or treat the leftover text on the next iteration.
742+
# # But for a simple approach, this is enough to keep the code short.
743+
744+
# # Now send the message
745+
# await bot.send_message(chat_id=update.effective_chat.id, text=chunk, parse_mode='HTML')
746+
# logger.info(f"Sent message chunk: {(i // safe_max) + 1}")
747+
# except Exception as e:
748+
# logger.error(f"Error in sending plain text message: {e}")
749+
# else:
750+
# logger.info("Condition for sending plain text message not met.")
669751

670752
# // old method
671753
# if transcription_settings['send_as_messages'] and 'txt' in transcription_paths:

0 commit comments

Comments
 (0)