forked from qodo-ai/pr-agent
Add Unit Tests and Improve Documentation for utils.py clip_tokens Function #3
Open
TaskerJang wants to merge 2 commits into main from feature/clip-tokens-tests-and-docs
Changes from all commits (2 commits): test_clip_tokens.py
```python
# Generated by CodiumAI

import pytest

from unittest.mock import patch, MagicMock

from pr_agent.algo.utils import clip_tokens
from pr_agent.algo.token_handler import TokenEncoder


class TestClipTokens:
    """Comprehensive test suite for the clip_tokens function."""

    def test_empty_input_text(self):
        """Test that empty input is returned unchanged."""
        assert clip_tokens("", 10) == ""
        assert clip_tokens(None, 10) is None

    def test_text_under_token_limit(self):
        """Test that text under the token limit is returned unchanged."""
        text = "Short text"
        max_tokens = 100
        result = clip_tokens(text, max_tokens)
        assert result == text

    def test_text_exactly_at_token_limit(self):
        """Test text that is exactly at the token limit."""
        text = "This is exactly at the limit"
        # Mock the token encoder to return exactly the limit
        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 10  # Exactly 10 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, 10)
            assert result == text

    def test_text_over_token_limit_with_three_dots(self):
        """Test text over the token limit with the truncation suffix added."""
        text = "This is a longer text that should be clipped when it exceeds the token limit"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            assert result.endswith("\n...(truncated)")
            assert len(result) < len(text)

    def test_text_over_token_limit_without_three_dots(self):
        """Test text over the token limit without the truncation suffix."""
        text = "This is a longer text that should be clipped"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens, add_three_dots=False)
            assert not result.endswith("\n...(truncated)")
            assert len(result) < len(text)

    def test_negative_max_tokens(self):
        """Test that negative max_tokens returns an empty string."""
        text = "Some text"
        result = clip_tokens(text, -1)
        assert result == ""

        result = clip_tokens(text, -100)
        assert result == ""

    def test_zero_max_tokens(self):
        """Test that zero max_tokens returns an empty string."""
        text = "Some text"
        result = clip_tokens(text, 0)
        assert result == ""

    def test_delete_last_line_functionality(self):
        """Test the delete_last_line parameter."""
        text = "Line 1\nLine 2\nLine 3\nLine 4"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            # Without delete_last_line
            result_normal = clip_tokens(text, max_tokens, delete_last_line=False)

            # With delete_last_line
            result_deleted = clip_tokens(text, max_tokens, delete_last_line=True)

            # The result with delete_last_line should be shorter or equal
            assert len(result_deleted) <= len(result_normal)

    def test_pre_computed_num_input_tokens(self):
        """Test using the pre-computed num_input_tokens parameter."""
        text = "This is a test text"
        max_tokens = 10
        num_input_tokens = 15

        # Should not call the encoder when num_input_tokens is provided
        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_encoder.return_value = None  # Should not be called

            result = clip_tokens(text, max_tokens, num_input_tokens=num_input_tokens)
            assert result.endswith("\n...(truncated)")
            mock_encoder.assert_not_called()

    def test_pre_computed_tokens_under_limit(self):
        """Test pre-computed tokens under the limit."""
        text = "Short text"
        max_tokens = 20
        num_input_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_encoder.return_value = None  # Should not be called

            result = clip_tokens(text, max_tokens, num_input_tokens=num_input_tokens)
            assert result == text
            mock_encoder.assert_not_called()

    def test_special_characters_and_unicode(self):
        """Test text with special characters and Unicode content."""
        text = "Special chars: @#$%^&*()_+ ñéíóú 中文 🚀 emoji"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            assert isinstance(result, str)
            assert len(result) < len(text)

    def test_multiline_text_handling(self):
        """Test handling of multiline text."""
        text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            assert isinstance(result, str)

    def test_very_long_text(self):
        """Test with very long text."""
        text = "A" * 10000  # Very long text
        max_tokens = 10

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 5000  # Many tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            assert len(result) < len(text)
            assert result.endswith("\n...(truncated)")

    def test_encoder_exception_handling(self):
        """Test handling of encoder exceptions."""
        text = "Test text"
        max_tokens = 10

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_encoder.side_effect = Exception("Encoder error")

            # Should return the original text when the encoder fails
            result = clip_tokens(text, max_tokens)
            assert result == text

    def test_zero_division_scenario(self):
        """Test a scenario that could lead to division by zero."""
        text = "Test"
        max_tokens = 10

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = []  # Empty tokens (could cause division by zero)
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            # Should handle gracefully and return the original text
            assert result == text

    def test_various_edge_cases(self):
        """Test various edge cases."""
        # Single character
        assert clip_tokens("A", 1000) == "A"

        # Only whitespace
        text = " \n \t "
        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 10
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, 5)
            assert isinstance(result, str)

        # Text with only newlines
        text = "\n\n\n\n"
        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 10
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, 2, delete_last_line=True)
            assert isinstance(result, str)

    def test_parameter_combinations(self):
        """Test different parameter combinations."""
        text = "Multi\nline\ntext\nfor\ntesting"
        max_tokens = 5

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20
            mock_encoder.return_value = mock_tokenizer

            # Test all combinations
            combinations = [
                (True, True),    # add_three_dots=True, delete_last_line=True
                (True, False),   # add_three_dots=True, delete_last_line=False
                (False, True),   # add_three_dots=False, delete_last_line=True
                (False, False),  # add_three_dots=False, delete_last_line=False
            ]

            for add_dots, delete_line in combinations:
                result = clip_tokens(text, max_tokens,
                                     add_three_dots=add_dots,
                                     delete_last_line=delete_line)
                assert isinstance(result, str)
                if add_dots and len(result) > 0:
                    assert result.endswith("\n...(truncated)") or result == text

    def test_num_output_chars_zero_scenario(self):
        """Test a scenario where num_output_chars becomes zero or negative."""
        text = "Short"
        max_tokens = 1

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 1000  # Many tokens for short text
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)
            # When num_output_chars is 0 or negative, should return an empty string
            assert result == ""

    def test_logging_on_exception(self):
        """Test that exceptions are properly logged."""
        text = "Test text"
        max_tokens = 10

        # Patch the logger at the module level where it is imported
        with patch('pr_agent.algo.utils.get_logger') as mock_logger:
            mock_log_instance = MagicMock()
            mock_logger.return_value = mock_log_instance

            with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
                mock_encoder.side_effect = Exception("Test exception")

                result = clip_tokens(text, max_tokens)

                # Should log the warning
                mock_log_instance.warning.assert_called_once()
                # Should return the original text
                assert result == text

    def test_factor_safety_calculation(self):
        """Test that the 0.9 factor (10% reduction) works correctly."""
        text = "Test text that should be reduced by 10 percent for safety"
        max_tokens = 10

        with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
            mock_tokenizer = MagicMock()
            mock_tokenizer.encode.return_value = [1] * 20  # 20 tokens
            mock_encoder.return_value = mock_tokenizer

            result = clip_tokens(text, max_tokens)

            # The result should be shorter due to the 0.9 factor:
            # characters per token = len(text) / 20,
            # expected chars = int(0.9 * (len(text) / 20) * 10)
            expected_chars = int(0.9 * (len(text) / 20) * 10)

            # Result should be around expected_chars in length (plus the truncation suffix)
            if result.endswith("\n...(truncated)"):
                actual_content = result[:-len("\n...(truncated)")]
                assert len(actual_content) <= expected_chars + 5  # Some tolerance

    # Test the original basic functionality to ensure backward compatibility
    def test_clip_original_functionality(self):
        """Test original functionality from the existing test."""
        text = "line1\nline2\nline3\nline4\nline5\nline6"
        max_tokens = 25
        result = clip_tokens(text, max_tokens)
        assert result == text

        max_tokens = 10
        result = clip_tokens(text, max_tokens)
        expected_results = 'line1\nline2\nline3\n\n...(truncated)'
        assert result == expected_results
```
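The suite should run with plain pytest, e.g. `pytest test_clip_tokens.py -v` (the exact path within the repo is not shown on this page). For context, the behavior these tests pin down implies roughly the following shape for `clip_tokens`. This is a simplified sketch inferred only from the assertions above, not the actual `pr_agent.algo.utils` implementation, which differs in details such as logging and encoder caching:

```python
# Simplified sketch of the behavior asserted by the tests above;
# NOT the real pr_agent.algo.utils.clip_tokens.
def clip_tokens_sketch(text, max_tokens, add_three_dots=True,
                       num_input_tokens=None, delete_last_line=False):
    if not text:
        return text                      # "" -> "", None -> None
    if max_tokens <= 0:
        return ""                        # zero or negative budget -> empty string
    if num_input_tokens is None:
        # Count tokens with the encoder; on any failure fall back to the input
        try:
            from pr_agent.algo.token_handler import TokenEncoder
            encoder = TokenEncoder.get_token_encoder()
            num_input_tokens = len(encoder.encode(text))
        except Exception:
            return text
    if num_input_tokens <= max_tokens:
        return text                      # already within budget (also avoids /0)
    # 0.9 safety factor: keep ~90% of the proportional character budget
    chars_per_token = len(text) / num_input_tokens
    num_output_chars = int(0.9 * chars_per_token * max_tokens)
    if num_output_chars <= 0:
        return ""
    clipped = text[:num_output_chars]
    if delete_last_line:
        clipped = clipped.rsplit("\n", 1)[0]  # drop the (possibly cut) last line
    if add_three_dots:
        clipped += "\n...(truncated)"
    return clipped
```

The 0.9 factor the last mocked test exercises trades precision for safety: the characters-per-token estimate is only an average, so clipping keeps roughly 10% headroom rather than risking an overshoot of the token budget.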
It's a small thing, but please mind the newline at EOF too.. just leaving a comment :)
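For anyone wondering why the EOF newline matters: POSIX defines a line as ending with a newline, and git marks a missing one with "\ No newline at end of file", which is why the final assert appears twice in the diff. Tools like EditorConfig's `insert_final_newline` or pre-commit's `end-of-file-fixer` hook apply the fix automatically; a minimal sketch of the same idea in Python (the file path is an assumption for illustration):

```python
# Minimal sketch: ensure a file ends with exactly one trailing newline.
from pathlib import Path

def ensure_final_newline(path: str) -> None:
    p = Path(path)
    text = p.read_text(encoding="utf-8")
    if text and not text.endswith("\n"):
        p.write_text(text + "\n", encoding="utf-8")

ensure_final_newline("test_clip_tokens.py")  # hypothetical path
```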
Takser..God, while reading the code a personal question came to mind :)
Looking at a meticulous set of test cases like test_clip_tokens.py,
I'm curious how you build them up when you design them: what do you start from, and how do you flesh them out?
Do you pin down the function's core behavior first,
then think through the input values,
and then work systematically through per-option behavior and exception handling?
Or do you have a personal approach you usually follow, etc...
Could you share a little of your thought process for building these up?
(I don't have much hands-on experience with this, so I'm curious how the thinking flows!)
I always learn a lot from you 🙏🔥🔥🔥
1. First, look at the function: "what does this function do?"
It clips tokens. So start by testing the basics.
2. Looked at the existing test file and there were only 2 checks.
That felt far too thin, so I went back and re-read the Issue.
3. Checked off what the Issue actually asked for.
4. In practice I built the tests one by one, adding another whenever I thought "oh, this needs a test too."
Building them one at a time, cases kept surfacing: "what about Unicode?", "what about division by zero?", and so on.
5. Honestly, I didn't expect to end up with 21 of them, haha.
At first I figured around 10, but as I went it kept growing with every "this needs a test too."
The mock tests in particular... honestly, I got stuck for a while. I tried several approaches while working out how to mock TokenEncoder (the pattern I landed on is distilled right after this list).
So it wasn't a perfectly laid-out plan; I bolted things on as I went: "oh, this too; oh, that too" 😅
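For readers following along, the mocking pattern the suite settles on is worth pulling out on its own. This is a minimal distillation of the `patch.object` approach already used in the file above, not new API; in these tests only the length of `encode()`'s return value matters:

```python
from unittest.mock import MagicMock, patch

from pr_agent.algo.token_handler import TokenEncoder
from pr_agent.algo.utils import clip_tokens

# Patch the method that hands out the tokenizer, and substitute a MagicMock
# whose encode() returns a token list of whatever length the scenario needs.
with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder:
    mock_tokenizer = MagicMock()
    mock_tokenizer.encode.return_value = [1] * 20  # pretend the text is 20 tokens
    mock_encoder.return_value = mock_tokenizer

    # With a 5-token budget against a "20-token" input, clipping must occur
    result = clip_tokens("some text that now looks too long", 5)
    assert result.endswith("\n...(truncated)")
```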
Following your build-up 1->2->3->4->5 step by step, it makes sense now..
Checking off what the Issue asked for matters too, and in the end adding things one by one was the way!
(That's the key point, haha) ->
"Honestly, I didn't expect 21": whether the clipped text comes out right, whether the truncation suffix gets attached, the division-by-zero error test, and so on...
They didn't come out all at once; each one took repeated thought and several attempts.
Thanks to you, I'm learning a lot! ✌️
(I'll quietly go check your blog posts later..)