diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractor.java index 320fc03c..f3db703e 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractor.java @@ -31,6 +31,8 @@ import java.io.PrintStream; import java.io.Reader; import java.io.Writer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -156,7 +158,11 @@ public class TikaExtractor extends PasswordBasedExtractor { private static final String FILE_PASSWORD = "fess.file.password"; /** - * Output encoding. + * Output encoding used when materializing extracted content into a byte + * stream (e.g. read-as-text fallbacks). This setter intentionally does + * not influence the JVM-stream capture/replay path used by + * {@link #muteSystemStreams()} / {@link #replayCapturedBytes(BoundedByteArrayOutputStream, boolean)}; + * see those methods for the rationale. */ protected String outputEncoding = Constants.UTF_8; @@ -210,6 +216,50 @@ public class TikaExtractor extends PasswordBasedExtractor { */ protected TikaConfig tikaConfig; + /** + * If true, System.out/System.err are muted during Tika parsing to suppress + * stray output produced by some bundled parsers. Defaults to {@code true} + * to preserve existing behavior. Disable when debugging or when callers + * want to keep the original streams intact. + */ + protected boolean muteSystemStreams = true; + + /** + * Class-wide lock guarding {@link #muteRefCount}, {@link #savedOut}, + * {@link #savedErr} and the capture buffers. We never hold this lock during + * extraction itself, so concurrent extractions are not serialized; we only + * synchronize the (cheap) swap and restore of the JVM streams plus the + * post-run replay of captured bytes. + */ + private static final Object SYSTEM_STREAM_LOCK = new Object(); + + /** + * Hard cap on the number of bytes captured from each muted stream + * (System.out and System.err) while extractions are running. Once the cap + * is reached, additional writes are discarded and a single truncation + * marker is appended to the buffer so the operator can tell the output was + * cut short. + */ + private static final int CAPTURE_BUFFER_SIZE = 64 * 1024; + + /** Truncation marker appended once when a capture buffer fills up. */ + private static final byte[] CAPTURE_TRUNCATION_MARKER = "\n[truncated; further output discarded]\n".getBytes(StandardCharsets.UTF_8); + + /** Number of extractions currently running with muted streams. */ + private static int muteRefCount = 0; + + /** Original {@link System#out}, captured by the first muting thread. */ + private static PrintStream savedOut; + + /** Original {@link System#err}, captured by the first muting thread. */ + private static PrintStream savedErr; + + /** Buffer capturing bytes written to the muted {@link System#out}. */ + private static BoundedByteArrayOutputStream capturedOut; + + /** Buffer capturing bytes written to the muted {@link System#err}. */ + private static BoundedByteArrayOutputStream capturedErr; + private final Map tesseractOCRConfigMap = new ConcurrentHashMap<>(); private final Map pdfParserConfigMap = new ConcurrentHashMap<>(); @@ -272,13 +322,11 @@ protected ExtractData getText(final InputStream inputStream, final Map { - InputStream in = null; - try { - if (!isByteStream) { - try (OutputStream out = new FileOutputStream(tempFile)) { - CopyUtil.copy(inputStream, out); - } - in = new FileInputStream(tempFile); - } else { - in = inputStream; - } + try (InputStream in = openMaterializedInput(inputStream, tempFile, isByteStream)) { parser.parse(in, new BodyContentHandler(writer), metadata, parseContext); - } finally { - CloseableUtil.closeQuietly(in); } }, contentEncoding, normalizeText); if (StringUtil.isBlank(content)) { if (resourceName != null) { if (logger.isDebugEnabled()) { - logger.debug("retry without a resource name: {}", resourceName); + logger.debug("retry without a resource name: resourceName={}", resourceName); } final Metadata metadata2 = createMetadata(null, contentType, contentEncoding, password); content = getContent(writer -> { - InputStream in = null; - try { - if (isByteStream) { - inputStream.reset(); - in = inputStream; - } else { - in = new FileInputStream(tempFile); - } + try (InputStream in = openMaterializedInput(inputStream, tempFile, isByteStream)) { parser.parse(in, new BodyContentHandler(writer), metadata2, parseContext); - } finally { - CloseableUtil.closeQuietly(in); } }, contentEncoding, normalizeText); } if (StringUtil.isBlank(content) && contentType != null) { if (logger.isDebugEnabled()) { - logger.debug("retry without a content type: {}", contentType); + logger.debug("retry without a content type: contentType={}", contentType); } final Metadata metadata3 = createMetadata(null, null, contentEncoding, password); content = getContent(writer -> { - InputStream in = null; - try { - if (isByteStream) { - inputStream.reset(); - in = inputStream; - } else { - in = new FileInputStream(tempFile); - } + try (InputStream in = openMaterializedInput(inputStream, tempFile, isByteStream)) { parser.parse(in, new BodyContentHandler(writer), metadata3, parseContext); - } finally { - CloseableUtil.closeQuietly(in); } }, contentEncoding, normalizeText); } @@ -388,7 +417,8 @@ protected ExtractData getText(final InputStream inputStream, final Map= 0) { - throw new ExtractException("Zip bomb detected.", e); + if (e.getMessage() != null && e.getMessage().indexOf("bomb") >= 0) { + throw new ExtractException("Failed to extract via Tika: reason=zipBombDetected", e); } final Throwable cause = e.getCause(); if (cause instanceof SAXException) { final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor"); if (xmlExtractor != null) { - InputStream in = null; - try { - if (isByteStream) { - inputStream.reset(); - in = inputStream; - } else { - in = new FileInputStream(tempFile); - } + try (InputStream in = openMaterializedInput(inputStream, tempFile, isByteStream)) { return xmlExtractor.getText(in, params); - } finally { - CloseableUtil.closeQuietly(in); } } } throw e; + } + } catch (final ExtractException e) { + throw e; + } catch (final Exception e) { + throw new ExtractException("Failed to extract via Tika: reason=unexpectedError", e); + } finally { + try { + FileUtil.deleteInBackground(tempFile); } finally { - if (originalOutStream != null) { - try { - System.setOut(originalOutStream); - } catch (Exception e) { - logger.warn("Failed to set originalOutStream.", e); - } + if (muted) { + unmuteSystemStreams(); } - if (originalErrStream != null) { + } + } + } + + /** + * Opens an {@link InputStream} backed by the already-materialized source (the byte + * stream marked at the start, or the on-disk {@code tempFile}). When a temp file is + * present we wrap it as a {@link TikaInputStream} so that + * {@code TikaInputStream.get(stream, ...)} inside {@link TikaDetectParser#parse} + * can reuse the existing file path instead of spooling the bytes a second time. + * + * @param inputStream the original byte stream (used when {@code isByteStream}) + * @param tempFile the materialized on-disk copy (used otherwise); may be {@code null} + * @param isByteStream {@code true} iff the caller passed a {@link ByteArrayInputStream} + * @return a fresh, ready-to-read input stream + * @throws IOException if the on-disk file cannot be opened + */ + protected InputStream openMaterializedInput(final InputStream inputStream, final File tempFile, final boolean isByteStream) + throws IOException { + if (isByteStream) { + inputStream.reset(); + return inputStream; + } + return TikaInputStream.get(tempFile.toPath()); + } + + /** + * Mutes {@link System#out} and {@link System#err} for the duration of the current + * extraction. Concurrent extractions share a single muted state via a reference + * count — the first caller saves the original streams and swaps in bounded + * capture buffers; subsequent callers just bump the count. Captured bytes are + * not silently discarded: when the last outstanding mute is released the + * captured contents are emitted via the configured logger so operators still + * see Tika parser warnings (PDFBox font warnings, JBIG2 warnings, legacy POI + * debug, etc.). The lock is released as soon as the swap is recorded so that + * extractions never serialize on it. + * + *

The replacement {@link PrintStream}s are constructed with the auto-flush + * two-arg form, which the JVM wraps using {@link Charset#defaultCharset()}. + * The configured {@link #outputEncoding} is intentionally not applied + * here: third-party libraries (Tika, PDFBox, POI, JBIG2-ImageIO) write + * diagnostics through {@link System#out}/{@link System#err} using the JVM + * default charset, and forcing a different encoder would risk character + * substitution for code points not representable in that encoding. See + * {@link #replayCapturedBytes(BoundedByteArrayOutputStream, boolean)} for the + * matching decode side. + */ + protected void muteSystemStreams() { + synchronized (SYSTEM_STREAM_LOCK) { + if (muteRefCount == 0) { + savedOut = System.out; + savedErr = System.err; + capturedOut = new BoundedByteArrayOutputStream(CAPTURE_BUFFER_SIZE); + capturedErr = new BoundedByteArrayOutputStream(CAPTURE_BUFFER_SIZE); + System.setOut(new PrintStream(capturedOut, true)); + System.setErr(new PrintStream(capturedErr, true)); + } + muteRefCount++; + } + } + + /** + * Releases one reference acquired by {@link #muteSystemStreams()}. When the last + * outstanding mute is released, the original streams are restored and any + * bytes captured while the streams were muted are replayed via the logger + * (info for stdout, warn for stderr) so they are not silently lost. Calling + * this without a matching {@link #muteSystemStreams()} is a programming + * error and is tolerated only defensively (the count is clamped at zero). + */ + protected void unmuteSystemStreams() { + BoundedByteArrayOutputStream outToReplay = null; + BoundedByteArrayOutputStream errToReplay = null; + synchronized (SYSTEM_STREAM_LOCK) { + if (muteRefCount <= 0) { + logger.warn("unmuteSystemStreams called without a matching mute; muteRefCount={}", muteRefCount); + return; + } + muteRefCount--; + if (muteRefCount == 0) { + if (savedOut != null) { try { - System.setErr(originalErrStream); - } catch (Exception e) { - logger.warn("Failed to set originalErrStream.", e); + System.setOut(savedOut); + } catch (final Exception e) { + logger.warn("Failed to restore System.out.", e); } } - try { - if (logger.isInfoEnabled()) { - final byte[] bs = outStream.toByteArray(); - if (bs.length != 0) { - logger.info(new String(bs, outputEncoding)); - } - } - if (logger.isWarnEnabled()) { - final byte[] bs = errStream.toByteArray(); - if (bs.length != 0) { - logger.warn(new String(bs, outputEncoding)); - } + if (savedErr != null) { + try { + System.setErr(savedErr); + } catch (final Exception e) { + logger.warn("Failed to restore System.err.", e); } - } catch (final Exception e) { - // NOP } + savedOut = null; + savedErr = null; + outToReplay = capturedOut; + errToReplay = capturedErr; + capturedOut = null; + capturedErr = null; } - } catch (final Exception e) { - throw new ExtractException("Could not extract a content.", e); - } finally { - FileUtil.deleteInBackground(tempFile); + } + // Emit captured output outside the lock so logging back-pressure + // never blocks future muteSystemStreams() callers. + if (outToReplay != null) { + replayCapturedBytes(outToReplay, false); + } + if (errToReplay != null) { + replayCapturedBytes(errToReplay, true); + } + } + + /** + * Decodes the bytes captured from a muted JVM stream and re-emits them via + * the logger. Stdout-origin bytes are logged at INFO; stderr-origin bytes at + * WARN, matching the severity of the original Tika diagnostic channels. + * Visible for testing so that subclasses can intercept the replayed text. + * + *

The decode charset is deliberately {@link Charset#defaultCharset()} and + * not the configurable {@link #outputEncoding}. {@link #muteSystemStreams()} + * installs {@code PrintStream}s that wrap the JVM default charset, so this + * choice guarantees a lossless round-trip of whatever Tika/PDFBox/POI wrote + * to {@link System#out}/{@link System#err}. + * + * @param buffer captured byte buffer; ignored when empty + * @param fromStderr whether the buffer came from {@link System#err} + */ + protected void replayCapturedBytes(final BoundedByteArrayOutputStream buffer, final boolean fromStderr) { + if (buffer == null || buffer.size() == 0) { + return; + } + // Must match the encoder used by muteSystemStreams() — PrintStream(out, true) + // wraps Charset.defaultCharset(). outputEncoding is intentionally not used + // here; see the Javadoc above. + final Charset charset = Charset.defaultCharset(); + final String text = new String(buffer.toByteArray(), charset); + if (text.isEmpty()) { + return; + } + onReplayCaptured(text, fromStderr); + if (fromStderr) { + logger.warn("Captured System.err output during Tika extraction:\n{}", text); + } else { + logger.info("Captured System.out output during Tika extraction:\n{}", text); + } + } + + /** + * Hook invoked just before the captured bytes from a muted JVM stream are + * emitted via the logger. Default implementation is a no-op; tests and + * subclasses can override to observe (or otherwise act on) the replayed + * text. + * + * @param text captured text decoded with the JVM default charset + * @param fromStderr {@code true} if the bytes were captured from + * {@link System#err}, {@code false} for {@link System#out} + */ + protected void onReplayCaptured(final String text, final boolean fromStderr) { + // hook for subclasses; intentionally empty + } + + /** + * A {@link ByteArrayOutputStream} with a hard upper bound. Once the bound is + * reached a single truncation marker is appended and any further writes are + * silently dropped. This prevents a runaway parser from consuming arbitrary + * heap while still preserving an operator-visible record that output was + * truncated. + */ + static final class BoundedByteArrayOutputStream extends ByteArrayOutputStream { + private final int capacity; + private boolean truncated; + + BoundedByteArrayOutputStream(final int capacity) { + super(Math.min(1024, capacity)); + this.capacity = capacity; + } + + @Override + public synchronized void write(final int b) { + if (truncated) { + return; + } + if (size() >= capacity) { + appendTruncationMarker(); + return; + } + super.write(b); + } + + @Override + public synchronized void write(final byte[] b, final int off, final int len) { + if (truncated) { + return; + } + final int remaining = capacity - size(); + if (remaining <= 0) { + appendTruncationMarker(); + return; + } + if (len <= remaining) { + super.write(b, off, len); + return; + } + super.write(b, off, remaining); + appendTruncationMarker(); + } + + private void appendTruncationMarker() { + if (truncated) { + return; + } + truncated = true; + // Direct super.write to bypass our cap (the marker itself is small). + super.write(CAPTURE_TRUNCATION_MARKER, 0, CAPTURE_TRUNCATION_MARKER.length); } } @@ -724,7 +925,16 @@ protected interface ContentWriter { } /** - * Sets the output encoding. + * Sets the output encoding used for materializing extracted content. + * + *

Note: this setting does not affect how bytes captured from muted + * {@link System#out}/{@link System#err} are decoded for replay through the + * logger. Those bytes are written by Tika/PDFBox/POI through + * {@link PrintStream}s the JVM wraps with {@link Charset#defaultCharset()}, + * so the capture/replay path always uses the JVM default charset to avoid + * lossy substitution of non-ASCII diagnostic text. See + * {@link #replayCapturedBytes(BoundedByteArrayOutputStream, boolean)}. + * * @param outputEncoding The output encoding. */ public void setOutputEncoding(final String outputEncoding) { @@ -811,6 +1021,18 @@ public void setTikaConfig(final TikaConfig tikaConfig) { this.tikaConfig = tikaConfig; } + /** + * Sets whether to mute {@link System#out} and {@link System#err} during extraction. + * Some Tika-bundled parsers print to the JVM streams; muting suppresses that noise. + * Defaults to {@code true}. Disable when debugging or when callers depend on + * application output remaining on the original streams. + * + * @param muteSystemStreams {@code true} to mute, {@code false} to leave streams intact + */ + public void setMuteSystemStreams(final boolean muteSystemStreams) { + this.muteSystemStreams = muteSystemStreams; + } + /** * Strips HTML tags from the given content using regex. * diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java index faed5aac..2ab11de7 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java @@ -16,10 +16,22 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.io.InputStream; +import java.io.PrintStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -33,6 +45,7 @@ import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; import org.dbflute.utflute.core.PlainTestCase; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; @@ -702,4 +715,360 @@ public void test_getPdfPassword() { resourceName = "fuga.pdf"; assertEquals("PASSWORD", tikaExtractor.getPassword(createParams(url, resourceName))); } + + /** + * Verifies that running many concurrent extractions does not leave + * {@link System#out}/{@link System#err} permanently redirected. The previous + * implementation swapped the JVM streams without synchronization, so two + * threads racing through the swap could lose the original references. + */ + @Test + public void test_concurrentExtraction_doesNotCorruptSystemStreams() throws Exception { + final PrintStream originalOut = System.out; + final PrintStream originalErr = System.err; + + final int threadCount = 16; + final int iterations = 8; + final ExecutorService pool = Executors.newFixedThreadPool(threadCount); + final CountDownLatch start = new CountDownLatch(1); + final List failures = java.util.Collections.synchronizedList(new ArrayList<>()); + try { + for (int i = 0; i < threadCount; i++) { + pool.submit(() -> { + try { + start.await(); + for (int j = 0; j < iterations; j++) { + final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt"); + try { + tikaExtractor.getText(in, null); + } finally { + CloseableUtil.closeQuietly(in); + } + } + } catch (final Throwable t) { + failures.add(t); + } + }); + } + start.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(60, TimeUnit.SECONDS)); + } finally { + pool.shutdownNow(); + } + Assertions.assertTrue(failures.isEmpty(), "concurrent extractions threw: " + failures); + Assertions.assertSame(originalOut, System.out, "System.out must be restored to its original reference"); + Assertions.assertSame(originalErr, System.err, "System.err must be restored to its original reference"); + } + + /** + * Verifies that an exception thrown during extraction still restores + * {@link System#out}/{@link System#err}. + */ + @Test + public void test_systemStreamsRestoredOnException() { + final PrintStream originalOut = System.out; + final PrintStream originalErr = System.err; + try { + tikaExtractor.getText(null, null); + Assertions.fail("expected CrawlerSystemException"); + } catch (final CrawlerSystemException expected) { + // null input always throws synchronously, before muting; this exercises + // the simplest restore path. + } + Assertions.assertSame(originalOut, System.out); + Assertions.assertSame(originalErr, System.err); + + // Also exercise the muted path: feed a non-byte stream so the muting branch + // runs, but force the underlying stream to throw mid-read. + final InputStream broken = new InputStream() { + @Override + public int read() throws IOException { + throw new IOException("boom"); + } + }; + try { + tikaExtractor.getText(broken, null); + Assertions.fail("expected ExtractException"); + } catch (final ExtractException expected) { + // ok + } + Assertions.assertSame(originalOut, System.out, "System.out must be restored after a thrown extraction"); + Assertions.assertSame(originalErr, System.err, "System.err must be restored after a thrown extraction"); + } + + /** + * When the input is large enough for the on-disk staging file to be created, the + * old implementation also let Tika spool the same bytes into a second temp file + * (apache-tika-*). With the fix only the outer staging file should appear. + */ + @Test + public void test_dfosSpilledToDisk_noDoubleTempFile() throws Exception { + final Path tempDir = Files.createTempDirectory("tikaExtractor-test-"); + final String originalTmp = System.getProperty("java.io.tmpdir"); + System.setProperty("java.io.tmpdir", tempDir.toAbsolutePath().toString()); + try { + // Build an input large enough to ensure the staging file is created. The + // payload is plain text repeated to about 4 MB. + final byte[] chunk = "Concurrent extraction stress payload テスト 1234567890\n".getBytes(StandardCharsets.UTF_8); + final int targetBytes = 4 * 1024 * 1024; + final int repeats = (targetBytes / chunk.length) + 1; + final byte[] data = new byte[chunk.length * repeats]; + for (int i = 0; i < repeats; i++) { + System.arraycopy(chunk, 0, data, i * chunk.length, chunk.length); + } + + // BufferedInputStream is not a ByteArrayInputStream, so the on-disk path runs. + final InputStream in = new java.io.BufferedInputStream(new java.io.ByteArrayInputStream(data)); + final ExtractData result = tikaExtractor.getText(in, null); + assertNotNull(result); + CloseableUtil.closeQuietly(in); + + final List tikaTempFilesSeen = new ArrayList<>(); + try (DirectoryStream stream = Files.newDirectoryStream(tempDir)) { + for (final Path p : stream) { + final String name = p.getFileName().toString(); + if (name.startsWith("apache-tika-")) { + tikaTempFilesSeen.add(name); + } + } + } + Assertions.assertTrue(tikaTempFilesSeen.isEmpty(), + "Tika should not have spooled a second temp file, but saw: " + tikaTempFilesSeen); + } finally { + if (originalTmp != null) { + System.setProperty("java.io.tmpdir", originalTmp); + } + // Clean up any leftover files (deleteInBackground may still be racing). + Thread.sleep(200); + try (DirectoryStream stream = Files.newDirectoryStream(tempDir)) { + for (final Path p : stream) { + try { + Files.deleteIfExists(p); + } catch (final IOException ignore) { + // best effort + } + } + } + Files.deleteIfExists(tempDir); + } + } + + /** + * For a small {@link ByteArrayInputStream}, neither the outer staging file nor + * Tika's internal spool should be created. + */ + @Test + public void test_dfosInMemory_noTempFileCreated() throws Exception { + final Path tempDir = Files.createTempDirectory("tikaExtractor-test-"); + final String originalTmp = System.getProperty("java.io.tmpdir"); + System.setProperty("java.io.tmpdir", tempDir.toAbsolutePath().toString()); + try { + final byte[] data = "Small inline payload テスト".getBytes(StandardCharsets.UTF_8); + final ByteArrayInputStream in = new ByteArrayInputStream(data); + final ExtractData result = tikaExtractor.getText(in, null); + assertNotNull(result); + CloseableUtil.closeQuietly(in); + + final List leftover = new ArrayList<>(); + try (DirectoryStream stream = Files.newDirectoryStream(tempDir)) { + for (final Path p : stream) { + final String name = p.getFileName().toString(); + if (name.startsWith("tikaExtractor-") || name.startsWith("apache-tika-")) { + leftover.add(name); + } + } + } + Assertions.assertTrue(leftover.isEmpty(), "In-memory path must not create temp files, but saw: " + leftover); + } finally { + if (originalTmp != null) { + System.setProperty("java.io.tmpdir", originalTmp); + } + Thread.sleep(100); + try (DirectoryStream stream = Files.newDirectoryStream(tempDir)) { + for (final Path p : stream) { + try { + Files.deleteIfExists(p); + } catch (final IOException ignore) { + // best effort + } + } + } + Files.deleteIfExists(tempDir); + } + } + + /** + * Verifies that bytes written to {@link System#out}/{@link System#err} while the + * streams are muted are not silently discarded. The previous fix swapped the + * streams to a {@code NullOutputStream}; we now buffer them and re-emit through + * the logger when extraction finishes so operators still see Tika parser + * diagnostics (PDFBox font warnings, JBIG2 warnings, legacy POI debug, ...). + */ + @Test + public void test_systemStreamsCapturedAndReplayed() throws Exception { + final List replayedOut = java.util.Collections.synchronizedList(new ArrayList<>()); + final List replayedErr = java.util.Collections.synchronizedList(new ArrayList<>()); + + // Subclass that injects writes to System.out/System.err while the muting is + // active and observes the replayed bytes through the test hook. + final TikaExtractor spy = new TikaExtractor() { + @Override + protected InputStream openMaterializedInput(final InputStream inputStream, final java.io.File tempFile, + final boolean isByteStream) throws IOException { + System.out.println("HELLO_FROM_PARSER_OUT"); + System.err.println("HELLO_FROM_PARSER_ERR"); + return super.openMaterializedInput(inputStream, tempFile, isByteStream); + } + + @Override + protected void onReplayCaptured(final String text, final boolean fromStderr) { + if (fromStderr) { + replayedErr.add(text); + } else { + replayedOut.add(text); + } + } + }; + // wire up the same Tika config the regular fixture uses so detection works + spy.setTikaConfig(org.apache.tika.config.TikaConfig.getDefaultConfig()); + spy.init(); + + final PrintStream originalOut = System.out; + final PrintStream originalErr = System.err; + + final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt"); + try { + final ExtractData result = spy.getText(in, null); + assertNotNull(result); + } finally { + CloseableUtil.closeQuietly(in); + } + + // streams must be restored + Assertions.assertSame(originalOut, System.out); + Assertions.assertSame(originalErr, System.err); + + // captured stdout/stderr must have been replayed via the hook + Assertions.assertFalse(replayedOut.isEmpty(), "captured stdout was not replayed"); + Assertions.assertFalse(replayedErr.isEmpty(), "captured stderr was not replayed"); + Assertions.assertTrue(String.join("\n", replayedOut).contains("HELLO_FROM_PARSER_OUT"), + "expected captured stdout to contain the marker, got: " + replayedOut); + Assertions.assertTrue(String.join("\n", replayedErr).contains("HELLO_FROM_PARSER_ERR"), + "expected captured stderr to contain the marker, got: " + replayedErr); + } + + /** + * Locks down the encoding contract for the muted-stream capture/replay path: + * regardless of {@link TikaExtractor#setOutputEncoding(String)}, bytes + * written to {@link System#out}/{@link System#err} during extraction must be + * decoded with {@link java.nio.charset.Charset#defaultCharset()} so that the + * replayed text is a lossless round-trip of what Tika/PDFBox/POI emitted + * (which they emit through {@link PrintStream}s the JVM wraps with the JVM + * default charset). A different output encoding here would risk character + * substitution for diagnostics containing non-ASCII text. + */ + @Test + public void test_capturedSystemStreams_useDefaultCharsetRegardlessOfOutputEncoding() throws Exception { + final String marker = "Türkçe-日本語-✓"; + // Round-trip is only meaningful when the JVM default charset can actually + // represent the marker. On platforms whose default is e.g. ISO-8859-1 + // (rare on Java 21+), skip rather than report a false failure. + org.junit.jupiter.api.Assumptions.assumeTrue(java.nio.charset.Charset.defaultCharset().newEncoder().canEncode(marker), + "skipping: JVM default charset " + java.nio.charset.Charset.defaultCharset() + " cannot represent the test marker"); + final List replayedOut = java.util.Collections.synchronizedList(new ArrayList<>()); + + final TikaExtractor spy = new TikaExtractor() { + @Override + protected InputStream openMaterializedInput(final InputStream inputStream, final java.io.File tempFile, + final boolean isByteStream) throws IOException { + System.out.println(marker); + return super.openMaterializedInput(inputStream, tempFile, isByteStream); + } + + @Override + protected void onReplayCaptured(final String text, final boolean fromStderr) { + if (!fromStderr) { + replayedOut.add(text); + } + } + }; + spy.setTikaConfig(org.apache.tika.config.TikaConfig.getDefaultConfig()); + // Pick an outputEncoding that cannot represent every code point in the + // marker; if the capture/replay path were (incorrectly) using + // outputEncoding, characters such as '日' would be substituted with '?'. + spy.setOutputEncoding("ISO-8859-1"); + spy.init(); + + final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt"); + try { + spy.getText(in, null); + } finally { + CloseableUtil.closeQuietly(in); + } + + Assertions.assertFalse(replayedOut.isEmpty(), "captured stdout was not replayed"); + final String joined = String.join("\n", replayedOut); + Assertions.assertTrue(joined.contains(marker), + "replayed stdout must round-trip non-ASCII text via the JVM default charset, got: " + joined); + } + + /** + * Verifies that the bounded capture buffer caps at its configured size and + * appends a truncation marker once instead of growing without bound. + */ + @Test + public void test_boundedCaptureBuffer_truncatesAtCapacity() throws Exception { + final TikaExtractor.BoundedByteArrayOutputStream buf = new TikaExtractor.BoundedByteArrayOutputStream(16); + // Write more than the cap in a single call. + final byte[] payload = new byte[64]; + java.util.Arrays.fill(payload, (byte) 'x'); + buf.write(payload, 0, payload.length); + // Subsequent writes are dropped. + buf.write("MORE".getBytes(StandardCharsets.UTF_8)); + buf.write('!'); + + final String text = new String(buf.toByteArray(), StandardCharsets.UTF_8); + // 16 'x' bytes + the truncation marker; "MORE" / '!' must be discarded. + Assertions.assertTrue(text.startsWith("xxxxxxxxxxxxxxxx"), "expected 16 bytes of payload, got: " + text); + Assertions.assertTrue(text.contains("truncated"), "expected truncation marker, got: " + text); + Assertions.assertFalse(text.contains("MORE"), "post-cap writes must be dropped"); + Assertions.assertFalse(text.contains("!"), "post-cap byte writes must be dropped"); + } + + /** + * When muting is disabled, {@link System#out}/{@link System#err} must remain the + * caller-visible streams throughout extraction. + */ + @Test + public void test_setMuteSystemStreams_false_doesNotMute() throws Exception { + final PrintStream originalOut = System.out; + final PrintStream originalErr = System.err; + tikaExtractor.setMuteSystemStreams(false); + try { + final java.io.ByteArrayOutputStream capture = new java.io.ByteArrayOutputStream(); + final PrintStream tap = new PrintStream(capture, true); + System.setOut(tap); + System.setErr(tap); + try { + final PrintStream beforeOut = System.out; + final PrintStream beforeErr = System.err; + final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt"); + try { + tikaExtractor.getText(in, null); + } finally { + CloseableUtil.closeQuietly(in); + } + Assertions.assertSame(beforeOut, System.out, "System.out must not be swapped when muting is disabled"); + Assertions.assertSame(beforeErr, System.err, "System.err must not be swapped when muting is disabled"); + } finally { + System.setOut(originalOut); + System.setErr(originalErr); + } + } finally { + tikaExtractor.setMuteSystemStreams(true); + } + Assertions.assertSame(originalOut, System.out); + Assertions.assertSame(originalErr, System.err); + } }