|
1 | 1 | # transcription_handler.py |
2 | 2 | # ~~~ |
3 | 3 | # openai-whisper transcriber-bot for Telegram |
4 | | -# v0.12 |
5 | 4 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
6 | 5 | # https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/ |
7 | 6 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
@@ -172,7 +171,7 @@ def get_transcription_settings(): |
172 | 171 | } |
173 | 172 |
|
174 | 173 | # split long messages |
175 | | -def split_message(message, max_length=4096): |
| 174 | +def split_message(message, max_length=3500): |
176 | 175 | return [message[i:i+max_length] for i in range(0, len(message), max_length)] |
177 | 176 |
|
178 | 177 | # // audio download (new method) |
@@ -620,22 +619,45 @@ async def process_url_message(message_text, bot, update, model, language): |
620 | 619 | try: |
621 | 620 | logger.info(f"Preparing to send plain text message from raw content") |
622 | 621 | content = transcription_note + raw_content # Add transcription note to the raw content |
| 622 | + |
623 | 623 | # Just to be safe, reduce the chunk even more if needed |
624 | | - safe_max = 3500 # even safer limit |
| 624 | + safe_max = 3000 # even safer limit |
| 625 | + |
625 | 626 | for i in range(0, len(content), safe_max): |
626 | 627 | chunk = content[i:i+safe_max] |
627 | 628 |
|
628 | 629 | # Optional: make sure the chunk length is safely under 4096 (it should already be, since each chunk is sliced to at most safe_max characters) |
629 | | - if len(chunk) > 4096: |
630 | | - chunk = chunk[:4096] |
631 | | - |
632 | | - # OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed. |
633 | | - # For example, if chunk ends with '<', we might remove that character or find the previous space: |
| 630 | + if len(chunk) > 4000: |
| 631 | + chunk = chunk[:4000] |
| 632 | + |
| 633 | + # # OPTIONAL: Check if we end in the middle of an HTML tag and adjust if needed. |
| 634 | + # # For example, if chunk ends with '<', we might remove that character or find the previous space: |
| 635 | + # if '<' in chunk[-5:]: # crude check for partial tag at end |
| 636 | + # # Try to backtrack to a space before the '<' |
| 637 | + # last_space = chunk.rfind(' ') |
| 638 | + # if last_space != -1: |
| 639 | + # chunk = chunk[:last_space] |
| 640 | + |
| 641 | + # Check if we end on a partial HTML tag |
634 | 642 | if '<' in chunk[-5:]: # crude check for partial tag at end |
635 | | - # Try to backtrack to a space before the '<' |
636 | 643 | last_space = chunk.rfind(' ') |
637 | 644 | if last_space != -1: |
638 | 645 | chunk = chunk[:last_space] |
| 646 | + |
| 647 | + # Attempt to find last whitespace so we don’t split in the middle of a word |
| 648 | + # BUT if there's NO whitespace at all, we forcibly break anyway: |
| 649 | + last_space = chunk.rfind(' ') |
| 650 | + if last_space == -1 and len(chunk) == safe_max: |
| 651 | + # This means no space was found in the entire chunk, |
| 652 | + # so we forcibly keep the chunk at safe_max (which likely breaks a word). |
| 653 | + logger.warning("No whitespace found. Forcibly splitting mid-word at position %d.", i + safe_max) |
| 654 | + # chunk remains chunk[:safe_max], i.e. as-is |
| 655 | + elif last_space > -1 and last_space > 0: |
| 656 | + # We found a space within the chunk |
| 657 | + chunk = chunk[:last_space] |
| 658 | + # NOTE: `i` is not adjusted here, so the text after `last_space` is dropped: |
| 659 | + # the next iteration still starts at `i + safe_max`. Carry the leftover over |
| 660 | + # to the next chunk to avoid losing it; left as-is here to keep the code short. |
639 | 661 |
|
640 | 662 | # Now send the message |
641 | 663 | await bot.send_message(chat_id=update.effective_chat.id, text=chunk, parse_mode='HTML') |
@@ -680,7 +702,7 @@ async def process_url_message(message_text, bot, update, model, language): |
680 | 702 | await bot.send_message(chat_id=update.effective_chat.id, text="An error occurred during processing.") |
681 | 703 |
|
682 | 704 | # create video info |
683 | | -def create_video_info_message(details, max_length=4000): |
| 705 | +def create_video_info_message(details): |
684 | 706 | header_separator = "=" * 10 |
685 | 707 | video_info_message = f"""{header_separator} |
686 | 708 | Title: {details.get('title', 'No title available')} |
|
0 commit comments