@@ -8,6 +8,8 @@ class DocStripper {
88 removePageNumbers : options . removePageNumbers !== false ,
99 removeHeadersFooters : options . removeHeadersFooters !== false ,
1010 removeDuplicates : options . removeDuplicates !== false ,
11+ removePunctuationLines : options . removePunctuationLines !== false ,
12+ preserveParagraphSpacing : options . preserveParagraphSpacing !== false ,
1113 } ;
1214
1315 // Enhanced header/footer patterns
@@ -30,14 +32,36 @@ class DocStripper {
3032 / ^ C L A S S I F I E D $ / i,
3133 / ^ C l a s s i f i e d $ / i,
3234 / ^ F O R I N T E R N A L U S E O N L Y $ / i,
35+ / ^ F O R \s + I N T E R N A L \s + U S E \s + O N L Y $ / i,
36+ / ^ I N T E R N A L U S E O N L Y $ / i,
3337 / ^ D O N O T D I S T R I B U T E $ / i,
38+ / ^ P R O P R I E T A R Y $ / i,
39+ / ^ P r o p r i e t a r y $ / i,
3440 ] ;
3541 }
3642
3743 isPageNumber ( line ) {
3844 const stripped = line . trim ( ) ;
3945 if ( ! stripped ) return false ;
40- return / ^ \s * \d + \s * $ / . test ( stripped ) ;
46+
47+ // Regular numbers: 1, 2, 3, etc.
48+ if ( / ^ \d + $ / . test ( stripped ) ) return true ;
49+
50+ // Roman numerals: I, II, III, IV, etc.
51+ if ( / ^ [ I V X L C D M ] + $ / i. test ( stripped ) && stripped . length <= 10 ) return true ;
52+
53+ // Single letters: A, B, C, etc. (common in appendices)
54+ if ( / ^ [ A - Z ] $ / i. test ( stripped ) ) return true ;
55+
56+ return false ;
57+ }
58+
59+ isPunctuationOnly ( line ) {
60+ const stripped = line . trim ( ) ;
61+ if ( ! stripped ) return false ;
62+
63+ // Lines with only punctuation characters: ---, ***, ===, etc.
64+ return / ^ [ ^ \w \s ] + $ / . test ( stripped ) && stripped . length <= 50 ;
4165 }
4266
4367 isHeaderFooter ( line ) {
@@ -51,11 +75,13 @@ class DocStripper {
5175 const lines = text . split ( '\n' ) ;
5276 const cleanedLines = [ ] ;
5377 let prevLine = null ;
78+ let prevNonEmptyLine = null ;
5479 const stats = {
5580 linesRemoved : 0 ,
5681 duplicatesCollapsed : 0 ,
5782 emptyLinesRemoved : 0 ,
5883 headerFooterRemoved : 0 ,
84+ punctuationLinesRemoved : 0 ,
5985 } ;
6086
6187 for ( let i = 0 ; i < lines . length ; i ++ ) {
@@ -65,6 +91,22 @@ class DocStripper {
6591 // Skip empty or whitespace-only lines (if enabled)
6692 if ( ! stripped ) {
6793 if ( this . options . removeEmptyLines ) {
94+ // If preserving paragraph spacing, keep one empty line after non-empty lines
95+ if ( this . options . preserveParagraphSpacing && prevNonEmptyLine !== null ) {
96+ // Check if next line is non-empty
97+ let nextNonEmptyIdx = i + 1 ;
98+ while ( nextNonEmptyIdx < lines . length && ! lines [ nextNonEmptyIdx ] . trim ( ) ) {
99+ nextNonEmptyIdx ++ ;
100+ }
101+
102+ // If there's a non-empty line after this empty line, keep one empty line
103+ if ( nextNonEmptyIdx < lines . length && lines [ nextNonEmptyIdx ] . trim ( ) ) {
104+ cleanedLines . push ( line ) ;
105+ prevNonEmptyLine = null ; // Reset to prevent multiple empty lines
106+ continue ;
107+ }
108+ }
109+
68110 stats . emptyLinesRemoved ++ ;
69111 continue ;
70112 } else {
@@ -73,6 +115,12 @@ class DocStripper {
73115 }
74116 }
75117
118+ // Skip punctuation-only lines (if enabled)
119+ if ( this . options . removePunctuationLines && this . isPunctuationOnly ( stripped ) ) {
120+ stats . punctuationLinesRemoved ++ ;
121+ continue ;
122+ }
123+
76124 // Skip page numbers (if enabled)
77125 if ( this . options . removePageNumbers && this . isPageNumber ( stripped ) ) {
78126 stats . headerFooterRemoved ++ ;
@@ -93,6 +141,7 @@ class DocStripper {
93141
94142 cleanedLines . push ( line ) ;
95143 prevLine = line ;
144+ prevNonEmptyLine = line ;
96145 }
97146
98147 stats . linesRemoved = lines . length - cleanedLines . length ;
@@ -180,6 +229,7 @@ class DocStripper {
180229 duplicatesCollapsed : 0 ,
181230 emptyLinesRemoved : 0 ,
182231 headerFooterRemoved : 0 ,
232+ punctuationLinesRemoved : 0 ,
183233 } ;
184234 }
185235}
@@ -435,6 +485,8 @@ class App {
435485 removePageNumbers : this . removePageNumbers ? this . removePageNumbers . checked : true ,
436486 removeHeadersFooters : this . removeHeadersFooters ? this . removeHeadersFooters . checked : true ,
437487 removeDuplicates : this . removeDuplicates ? this . removeDuplicates . checked : true ,
488+ removePunctuationLines : this . removePunctuationLines ? this . removePunctuationLines . checked : true ,
489+ preserveParagraphSpacing : this . preserveParagraphSpacing ? this . preserveParagraphSpacing . checked : true ,
438490 } ;
439491
440492 // Create new stripper instance with current settings
@@ -455,6 +507,7 @@ class App {
455507 duplicatesCollapsed : 0 ,
456508 emptyLinesRemoved : 0 ,
457509 headerFooterRemoved : 0 ,
510+ punctuationLinesRemoved : 0 ,
458511 } ;
459512
460513 for ( const file of this . files ) {
@@ -467,6 +520,7 @@ class App {
467520 totalStats . duplicatesCollapsed += result . stats . duplicatesCollapsed ;
468521 totalStats . emptyLinesRemoved += result . stats . emptyLinesRemoved ;
469522 totalStats . headerFooterRemoved += result . stats . headerFooterRemoved ;
523+ totalStats . punctuationLinesRemoved += result . stats . punctuationLinesRemoved || 0 ;
470524 }
471525 }
472526
@@ -507,6 +561,12 @@ class App {
507561 <span class="stat-value">${ totalStats . headerFooterRemoved } </span>
508562 <span class="stat-label">Headers/Footers Removed</span>
509563 </div>
564+ ${ totalStats . punctuationLinesRemoved > 0 ? `
565+ <div class="stat-item">
566+ <span class="stat-value">${ totalStats . punctuationLinesRemoved } </span>
567+ <span class="stat-label">Punctuation Lines Removed</span>
568+ </div>
569+ ` : '' }
510570 </div>
511571 </div>
512572 ` ;
0 commit comments