Skip to content

Commit a5b5038

Browse files
committed
✨ Improve cleaning algorithm with advanced features
Features:
- Add punctuation-only line detection (---, ***, ===)
- Add Roman numeral and letter page number support (I, II, III, A, B, C)
- Add preserve paragraph spacing option (keeps one empty line between paragraphs)
- Improve header/footer patterns (INTERNAL USE ONLY detection)
- Add new settings UI options for punctuation and spacing
- Enhanced statistics tracking

Improvements:
- Better page number detection (Roman numerals, single letters)
- Smarter empty line handling with paragraph spacing option
- More comprehensive header/footer patterns

Update cache-busting to v=11
1 parent d7c5a66 commit a5b5038

File tree

2 files changed

+77
-3
lines changed

2 files changed

+77
-3
lines changed

docs/assets/app.js

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ class DocStripper {
88
removePageNumbers: options.removePageNumbers !== false,
99
removeHeadersFooters: options.removeHeadersFooters !== false,
1010
removeDuplicates: options.removeDuplicates !== false,
11+
removePunctuationLines: options.removePunctuationLines !== false,
12+
preserveParagraphSpacing: options.preserveParagraphSpacing !== false,
1113
};
1214

1315
// Enhanced header/footer patterns
@@ -30,14 +32,36 @@ class DocStripper {
3032
/^CLASSIFIED$/i,
3133
/^Classified$/i,
3234
/^FOR INTERNAL USE ONLY$/i,
35+
/^FOR\s+INTERNAL\s+USE\s+ONLY$/i,
36+
/^INTERNAL USE ONLY$/i,
3337
/^DO NOT DISTRIBUTE$/i,
38+
/^PROPRIETARY$/i,
39+
/^Proprietary$/i,
3440
];
3541
}
3642

3743
isPageNumber(line) {
3844
const stripped = line.trim();
3945
if (!stripped) return false;
40-
return /^\s*\d+\s*$/.test(stripped);
46+
47+
// Regular numbers: 1, 2, 3, etc.
48+
if (/^\d+$/.test(stripped)) return true;
49+
50+
// Roman numerals: I, II, III, IV, etc.
51+
if (/^[IVXLCDM]+$/i.test(stripped) && stripped.length <= 10) return true;
52+
53+
// Single letters: A, B, C, etc. (common in appendices)
54+
if (/^[A-Z]$/i.test(stripped)) return true;
55+
56+
return false;
57+
}
58+
59+
isPunctuationOnly(line) {
60+
const stripped = line.trim();
61+
if (!stripped) return false;
62+
63+
// Lines with only punctuation characters: ---, ***, ===, etc.
64+
return /^[^\w\s]+$/.test(stripped) && stripped.length <= 50;
4165
}
4266

4367
isHeaderFooter(line) {
@@ -51,11 +75,13 @@ class DocStripper {
5175
const lines = text.split('\n');
5276
const cleanedLines = [];
5377
let prevLine = null;
78+
let prevNonEmptyLine = null;
5479
const stats = {
5580
linesRemoved: 0,
5681
duplicatesCollapsed: 0,
5782
emptyLinesRemoved: 0,
5883
headerFooterRemoved: 0,
84+
punctuationLinesRemoved: 0,
5985
};
6086

6187
for (let i = 0; i < lines.length; i++) {
@@ -65,6 +91,22 @@ class DocStripper {
6591
// Skip empty or whitespace-only lines (if enabled)
6692
if (!stripped) {
6793
if (this.options.removeEmptyLines) {
94+
// If preserving paragraph spacing, keep one empty line after non-empty lines
95+
if (this.options.preserveParagraphSpacing && prevNonEmptyLine !== null) {
96+
// Check if next line is non-empty
97+
let nextNonEmptyIdx = i + 1;
98+
while (nextNonEmptyIdx < lines.length && !lines[nextNonEmptyIdx].trim()) {
99+
nextNonEmptyIdx++;
100+
}
101+
102+
// If there's a non-empty line after this empty line, keep one empty line
103+
if (nextNonEmptyIdx < lines.length && lines[nextNonEmptyIdx].trim()) {
104+
cleanedLines.push(line);
105+
prevNonEmptyLine = null; // Reset to prevent multiple empty lines
106+
continue;
107+
}
108+
}
109+
68110
stats.emptyLinesRemoved++;
69111
continue;
70112
} else {
@@ -73,6 +115,12 @@ class DocStripper {
73115
}
74116
}
75117

118+
// Skip punctuation-only lines (if enabled)
119+
if (this.options.removePunctuationLines && this.isPunctuationOnly(stripped)) {
120+
stats.punctuationLinesRemoved++;
121+
continue;
122+
}
123+
76124
// Skip page numbers (if enabled)
77125
if (this.options.removePageNumbers && this.isPageNumber(stripped)) {
78126
stats.headerFooterRemoved++;
@@ -93,6 +141,7 @@ class DocStripper {
93141

94142
cleanedLines.push(line);
95143
prevLine = line;
144+
prevNonEmptyLine = line;
96145
}
97146

98147
stats.linesRemoved = lines.length - cleanedLines.length;
@@ -180,6 +229,7 @@ class DocStripper {
180229
duplicatesCollapsed: 0,
181230
emptyLinesRemoved: 0,
182231
headerFooterRemoved: 0,
232+
punctuationLinesRemoved: 0,
183233
};
184234
}
185235
}
@@ -435,6 +485,8 @@ class App {
435485
removePageNumbers: this.removePageNumbers ? this.removePageNumbers.checked : true,
436486
removeHeadersFooters: this.removeHeadersFooters ? this.removeHeadersFooters.checked : true,
437487
removeDuplicates: this.removeDuplicates ? this.removeDuplicates.checked : true,
488+
removePunctuationLines: this.removePunctuationLines ? this.removePunctuationLines.checked : true,
489+
preserveParagraphSpacing: this.preserveParagraphSpacing ? this.preserveParagraphSpacing.checked : true,
438490
};
439491

440492
// Create new stripper instance with current settings
@@ -455,6 +507,7 @@ class App {
455507
duplicatesCollapsed: 0,
456508
emptyLinesRemoved: 0,
457509
headerFooterRemoved: 0,
510+
punctuationLinesRemoved: 0,
458511
};
459512

460513
for (const file of this.files) {
@@ -467,6 +520,7 @@ class App {
467520
totalStats.duplicatesCollapsed += result.stats.duplicatesCollapsed;
468521
totalStats.emptyLinesRemoved += result.stats.emptyLinesRemoved;
469522
totalStats.headerFooterRemoved += result.stats.headerFooterRemoved;
523+
totalStats.punctuationLinesRemoved += result.stats.punctuationLinesRemoved || 0;
470524
}
471525
}
472526

@@ -507,6 +561,12 @@ class App {
507561
<span class="stat-value">${totalStats.headerFooterRemoved}</span>
508562
<span class="stat-label">Headers/Footers Removed</span>
509563
</div>
564+
${totalStats.punctuationLinesRemoved > 0 ? `
565+
<div class="stat-item">
566+
<span class="stat-value">${totalStats.punctuationLinesRemoved}</span>
567+
<span class="stat-label">Punctuation Lines Removed</span>
568+
</div>
569+
` : ''}
510570
</div>
511571
</div>
512572
`;

docs/index.html

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<meta name="viewport" content="width=device-width, initial-scale=1.0">
66
<title>DocStripper - Batch Document Cleaner</title>
77
<meta name="description" content="DocStripper - Remove noise from text documents automatically. Clean page numbers, headers, footers, duplicates, and empty lines.">
8-
<link rel="stylesheet" href="assets/style.css?v=10">
8+
<link rel="stylesheet" href="assets/style.css?v=11">
99
<link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><text y='.9em' font-size='90'>🧹</text></svg>">
1010
</head>
1111
<body>
@@ -78,6 +78,20 @@ <h3>Cleaning Options</h3>
7878
<span class="setting-desc">Collapse consecutive identical lines</span>
7979
</span>
8080
</label>
81+
<label class="setting-item">
82+
<input type="checkbox" id="removePunctuationLines" checked>
83+
<span class="setting-label">
84+
<span class="setting-title">Remove Punctuation Lines</span>
85+
<span class="setting-desc">Remove lines with only symbols (---, ***, ===)</span>
86+
</span>
87+
</label>
88+
<label class="setting-item">
89+
<input type="checkbox" id="preserveParagraphSpacing" checked>
90+
<span class="setting-label">
91+
<span class="setting-title">Preserve Paragraph Spacing</span>
92+
<span class="setting-desc">Keep one empty line between paragraphs</span>
93+
</span>
94+
</label>
8195
</div>
8296
</div>
8397

@@ -259,6 +273,6 @@ <h3>Clean Output</h3>
259273
</div>
260274
</footer>
261275

262-
<script src="assets/app.js?v=10"></script>
276+
<script src="assets/app.js?v=11"></script>
263277
</body>
264278
</html>

0 commit comments

Comments
 (0)