= 0) {
- throw new ExtractException("Zip bomb detected.", e);
+ if (e.getMessage() != null && e.getMessage().indexOf("bomb") >= 0) {
+ throw new ExtractException("Failed to extract via Tika: reason=zipBombDetected", e);
}
final Throwable cause = e.getCause();
if (cause instanceof SAXException) {
final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor");
if (xmlExtractor != null) {
- InputStream in = null;
- try {
- if (isByteStream) {
- inputStream.reset();
- in = inputStream;
- } else {
- in = new FileInputStream(tempFile);
- }
+ try (InputStream in = openMaterializedInput(inputStream, tempFile, isByteStream)) {
return xmlExtractor.getText(in, params);
- } finally {
- CloseableUtil.closeQuietly(in);
}
}
}
throw e;
+ }
+ } catch (final ExtractException e) {
+ throw e;
+ } catch (final Exception e) {
+ throw new ExtractException("Failed to extract via Tika: reason=unexpectedError", e);
+ } finally {
+ try {
+ FileUtil.deleteInBackground(tempFile);
} finally {
- if (originalOutStream != null) {
- try {
- System.setOut(originalOutStream);
- } catch (Exception e) {
- logger.warn("Failed to set originalOutStream.", e);
- }
+ if (muted) {
+ unmuteSystemStreams();
}
- if (originalErrStream != null) {
+ }
+ }
+ }
+
+ /**
+ * Opens an {@link InputStream} backed by the already-materialized source (the byte
+ * stream marked at the start, or the on-disk {@code tempFile}). When a temp file is
+ * present we wrap it as a {@link TikaInputStream} so that
+ * {@code TikaInputStream.get(stream, ...)} inside {@link TikaDetectParser#parse}
+ * can reuse the existing file path instead of spooling the bytes a second time.
+ *
+ * @param inputStream the original byte stream (used when {@code isByteStream})
+ * @param tempFile the materialized on-disk copy (used otherwise); may be {@code null}
+ * @param isByteStream {@code true} iff the caller passed a {@link ByteArrayInputStream}
+ * @return a fresh, ready-to-read input stream
+ * @throws IOException if the on-disk file cannot be opened
+ */
+ protected InputStream openMaterializedInput(final InputStream inputStream, final File tempFile, final boolean isByteStream)
+ throws IOException {
+ if (isByteStream) {
+ inputStream.reset();
+ return inputStream;
+ }
+ return TikaInputStream.get(tempFile.toPath());
+ }
+
+ /**
+ * Mutes {@link System#out} and {@link System#err} for the duration of the current
+ * extraction. Concurrent extractions share a single muted state via a reference
+ * count — the first caller saves the original streams and swaps in bounded
+ * capture buffers; subsequent callers just bump the count. Captured bytes are
+ * not silently discarded: when the last outstanding mute is released the
+ * captured contents are emitted via the configured logger so operators still
+ * see Tika parser warnings (PDFBox font warnings, JBIG2 warnings, legacy POI
+ * debug, etc.). The lock is released as soon as the swap is recorded so that
+ * extractions never serialize on it.
+ *
+ * The replacement {@link PrintStream}s are constructed with the auto-flush
+ * two-arg form, which the JVM wraps using {@link Charset#defaultCharset()}.
+ * The configured {@link #outputEncoding} is intentionally not applied
+ * here: third-party libraries (Tika, PDFBox, POI, JBIG2-ImageIO) write
+ * diagnostics through {@link System#out}/{@link System#err} using the JVM
+ * default charset, and forcing a different encoder would risk character
+ * substitution for code points not representable in that encoding. See
+ * {@link #replayCapturedBytes(BoundedByteArrayOutputStream, boolean)} for the
+ * matching decode side.
+ */
+ protected void muteSystemStreams() {
+ synchronized (SYSTEM_STREAM_LOCK) {
+ if (muteRefCount == 0) {
+ savedOut = System.out;
+ savedErr = System.err;
+ capturedOut = new BoundedByteArrayOutputStream(CAPTURE_BUFFER_SIZE);
+ capturedErr = new BoundedByteArrayOutputStream(CAPTURE_BUFFER_SIZE);
+ System.setOut(new PrintStream(capturedOut, true));
+ System.setErr(new PrintStream(capturedErr, true));
+ }
+ muteRefCount++;
+ }
+ }
+
+ /**
+ * Releases one reference acquired by {@link #muteSystemStreams()}. When the last
+ * outstanding mute is released, the original streams are restored and any
+ * bytes captured while the streams were muted are replayed via the logger
+ * (info for stdout, warn for stderr) so they are not silently lost. Calling
+ * this without a matching {@link #muteSystemStreams()} is a programming
+ * error and is tolerated only defensively (the count is clamped at zero).
+ */
+ protected void unmuteSystemStreams() {
+ BoundedByteArrayOutputStream outToReplay = null;
+ BoundedByteArrayOutputStream errToReplay = null;
+ synchronized (SYSTEM_STREAM_LOCK) {
+ if (muteRefCount <= 0) {
+ logger.warn("unmuteSystemStreams called without a matching mute; muteRefCount={}", muteRefCount);
+ return;
+ }
+ muteRefCount--;
+ if (muteRefCount == 0) {
+ if (savedOut != null) {
try {
- System.setErr(originalErrStream);
- } catch (Exception e) {
- logger.warn("Failed to set originalErrStream.", e);
+ System.setOut(savedOut);
+ } catch (final Exception e) {
+ logger.warn("Failed to restore System.out.", e);
}
}
- try {
- if (logger.isInfoEnabled()) {
- final byte[] bs = outStream.toByteArray();
- if (bs.length != 0) {
- logger.info(new String(bs, outputEncoding));
- }
- }
- if (logger.isWarnEnabled()) {
- final byte[] bs = errStream.toByteArray();
- if (bs.length != 0) {
- logger.warn(new String(bs, outputEncoding));
- }
+ if (savedErr != null) {
+ try {
+ System.setErr(savedErr);
+ } catch (final Exception e) {
+ logger.warn("Failed to restore System.err.", e);
}
- } catch (final Exception e) {
- // NOP
}
+ savedOut = null;
+ savedErr = null;
+ outToReplay = capturedOut;
+ errToReplay = capturedErr;
+ capturedOut = null;
+ capturedErr = null;
}
- } catch (final Exception e) {
- throw new ExtractException("Could not extract a content.", e);
- } finally {
- FileUtil.deleteInBackground(tempFile);
+ }
+ // Emit captured output outside the lock so logging back-pressure
+ // never blocks future muteSystemStreams() callers.
+ if (outToReplay != null) {
+ replayCapturedBytes(outToReplay, false);
+ }
+ if (errToReplay != null) {
+ replayCapturedBytes(errToReplay, true);
+ }
+ }
+
+ /**
+ * Decodes the bytes captured from a muted JVM stream and re-emits them via
+ * the logger. Stdout-origin bytes are logged at INFO; stderr-origin bytes at
+ * WARN, matching the severity of the original Tika diagnostic channels.
+ * Visible for testing so that subclasses can intercept the replayed text.
+ *
+ * <p>The decode charset is deliberately {@link Charset#defaultCharset()} and
+ * not the configurable {@link #outputEncoding}. {@link #muteSystemStreams()}
+ * installs {@code PrintStream}s that wrap the JVM default charset, so this
+ * choice guarantees a lossless round-trip of whatever Tika/PDFBox/POI wrote
+ * to {@link System#out}/{@link System#err}.
+ *
+ * @param buffer captured byte buffer; ignored when empty
+ * @param fromStderr whether the buffer came from {@link System#err}
+ */
+ protected void replayCapturedBytes(final BoundedByteArrayOutputStream buffer, final boolean fromStderr) {
+ if (buffer == null || buffer.size() == 0) {
+ return;
+ }
+ // Must match the encoder used by muteSystemStreams() — PrintStream(out, true)
+ // wraps Charset.defaultCharset(). outputEncoding is intentionally not used
+ // here; see the Javadoc above.
+ final Charset charset = Charset.defaultCharset();
+ final String text = new String(buffer.toByteArray(), charset);
+ if (text.isEmpty()) {
+ return;
+ }
+ onReplayCaptured(text, fromStderr);
+ if (fromStderr) {
+ logger.warn("Captured System.err output during Tika extraction:\n{}", text);
+ } else {
+ logger.info("Captured System.out output during Tika extraction:\n{}", text);
+ }
+ }
+
+ /**
+ * Hook invoked just before the captured bytes from a muted JVM stream are
+ * emitted via the logger. Default implementation is a no-op; tests and
+ * subclasses can override to observe (or otherwise act on) the replayed
+ * text.
+ *
+ * @param text captured text decoded with the JVM default charset
+ * @param fromStderr {@code true} if the bytes were captured from
+ * {@link System#err}, {@code false} for {@link System#out}
+ */
+ protected void onReplayCaptured(final String text, final boolean fromStderr) {
+ // hook for subclasses; intentionally empty
+ }
+
+ /**
+ * A {@link ByteArrayOutputStream} with a hard upper bound. Once the bound is
+ * reached a single truncation marker is appended and any further writes are
+ * silently dropped. This prevents a runaway parser from consuming arbitrary
+ * heap while still preserving an operator-visible record that output was
+ * truncated.
+ */
+ static final class BoundedByteArrayOutputStream extends ByteArrayOutputStream {
+ private final int capacity;
+ private boolean truncated;
+
+ BoundedByteArrayOutputStream(final int capacity) {
+ super(Math.min(1024, capacity));
+ this.capacity = capacity;
+ }
+
+ @Override
+ public synchronized void write(final int b) {
+ if (truncated) {
+ return;
+ }
+ if (size() >= capacity) {
+ appendTruncationMarker();
+ return;
+ }
+ super.write(b);
+ }
+
+ @Override
+ public synchronized void write(final byte[] b, final int off, final int len) {
+ if (truncated) {
+ return;
+ }
+ final int remaining = capacity - size();
+ if (remaining <= 0) {
+ appendTruncationMarker();
+ return;
+ }
+ if (len <= remaining) {
+ super.write(b, off, len);
+ return;
+ }
+ super.write(b, off, remaining);
+ appendTruncationMarker();
+ }
+
+ private void appendTruncationMarker() {
+ if (truncated) {
+ return;
+ }
+ truncated = true;
+ // Direct super.write to bypass our cap (the marker itself is small).
+ super.write(CAPTURE_TRUNCATION_MARKER, 0, CAPTURE_TRUNCATION_MARKER.length);
}
}
@@ -724,7 +925,16 @@ protected interface ContentWriter {
}
/**
- * Sets the output encoding.
+ * Sets the output encoding used for materializing extracted content.
+ *
+ * <p>Note: this setting does not affect how bytes captured from muted
+ * {@link System#out}/{@link System#err} are decoded for replay through the
+ * logger. Those bytes are written by Tika/PDFBox/POI through
+ * {@link PrintStream}s the JVM wraps with {@link Charset#defaultCharset()},
+ * so the capture/replay path always uses the JVM default charset to avoid
+ * lossy substitution of non-ASCII diagnostic text. See
+ * {@link #replayCapturedBytes(BoundedByteArrayOutputStream, boolean)}.
+ *
* @param outputEncoding The output encoding.
*/
public void setOutputEncoding(final String outputEncoding) {
@@ -811,6 +1021,18 @@ public void setTikaConfig(final TikaConfig tikaConfig) {
this.tikaConfig = tikaConfig;
}
+ /**
+ * Sets whether to mute {@link System#out} and {@link System#err} during extraction.
+ * Some Tika-bundled parsers print to the JVM streams; muting suppresses that noise.
+ * Defaults to {@code true}. Disable when debugging or when callers depend on
+ * application output remaining on the original streams.
+ *
+ * @param muteSystemStreams {@code true} to mute, {@code false} to leave streams intact
+ */
+ public void setMuteSystemStreams(final boolean muteSystemStreams) {
+ this.muteSystemStreams = muteSystemStreams;
+ }
+
/**
* Strips HTML tags from the given content using regex.
*
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java
index faed5aac..2ab11de7 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java
@@ -16,10 +16,22 @@
package org.codelibs.fess.crawler.extractor.impl;
import java.io.ByteArrayInputStream;
+import java.io.IOException;
import java.io.InputStream;
+import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -33,6 +45,7 @@
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl;
import org.dbflute.utflute.core.PlainTestCase;
+import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
@@ -702,4 +715,360 @@ public void test_getPdfPassword() {
resourceName = "fuga.pdf";
assertEquals("PASSWORD", tikaExtractor.getPassword(createParams(url, resourceName)));
}
+
+ /**
+ * Verifies that running many concurrent extractions does not leave
+ * {@link System#out}/{@link System#err} permanently redirected. The previous
+ * implementation swapped the JVM streams without synchronization, so two
+ * threads racing through the swap could lose the original references.
+ */
+ @Test
+ public void test_concurrentExtraction_doesNotCorruptSystemStreams() throws Exception {
+ final PrintStream originalOut = System.out;
+ final PrintStream originalErr = System.err;
+
+ final int threadCount = 16;
+ final int iterations = 8;
+ final ExecutorService pool = Executors.newFixedThreadPool(threadCount);
+ final CountDownLatch start = new CountDownLatch(1);
+ final List<Throwable> failures = java.util.Collections.synchronizedList(new ArrayList<>());
+ try {
+ for (int i = 0; i < threadCount; i++) {
+ pool.submit(() -> {
+ try {
+ start.await();
+ for (int j = 0; j < iterations; j++) {
+ final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt");
+ try {
+ tikaExtractor.getText(in, null);
+ } finally {
+ CloseableUtil.closeQuietly(in);
+ }
+ }
+ } catch (final Throwable t) {
+ failures.add(t);
+ }
+ });
+ }
+ start.countDown();
+ pool.shutdown();
+ assertTrue(pool.awaitTermination(60, TimeUnit.SECONDS));
+ } finally {
+ pool.shutdownNow();
+ }
+ Assertions.assertTrue(failures.isEmpty(), "concurrent extractions threw: " + failures);
+ Assertions.assertSame(originalOut, System.out, "System.out must be restored to its original reference");
+ Assertions.assertSame(originalErr, System.err, "System.err must be restored to its original reference");
+ }
+
+ /**
+ * Verifies that an exception thrown during extraction still restores
+ * {@link System#out}/{@link System#err}.
+ */
+ @Test
+ public void test_systemStreamsRestoredOnException() {
+ final PrintStream originalOut = System.out;
+ final PrintStream originalErr = System.err;
+ try {
+ tikaExtractor.getText(null, null);
+ Assertions.fail("expected CrawlerSystemException");
+ } catch (final CrawlerSystemException expected) {
+ // null input always throws synchronously, before muting; this exercises
+ // the simplest restore path.
+ }
+ Assertions.assertSame(originalOut, System.out);
+ Assertions.assertSame(originalErr, System.err);
+
+ // Also exercise the muted path: feed a non-byte stream so the muting branch
+ // runs, but force the underlying stream to throw mid-read.
+ final InputStream broken = new InputStream() {
+ @Override
+ public int read() throws IOException {
+ throw new IOException("boom");
+ }
+ };
+ try {
+ tikaExtractor.getText(broken, null);
+ Assertions.fail("expected ExtractException");
+ } catch (final ExtractException expected) {
+ // ok
+ }
+ Assertions.assertSame(originalOut, System.out, "System.out must be restored after a thrown extraction");
+ Assertions.assertSame(originalErr, System.err, "System.err must be restored after a thrown extraction");
+ }
+
+ /**
+ * When the input is large enough for the on-disk staging file to be created, the
+ * old implementation also let Tika spool the same bytes into a second temp file
+ * (apache-tika-*). With the fix only the outer staging file should appear.
+ */
+ @Test
+ public void test_dfosSpilledToDisk_noDoubleTempFile() throws Exception {
+ final Path tempDir = Files.createTempDirectory("tikaExtractor-test-");
+ final String originalTmp = System.getProperty("java.io.tmpdir");
+ System.setProperty("java.io.tmpdir", tempDir.toAbsolutePath().toString());
+ try {
+ // Build an input large enough to ensure the staging file is created. The
+ // payload is plain text repeated to about 4 MB.
+ final byte[] chunk = "Concurrent extraction stress payload テスト 1234567890\n".getBytes(StandardCharsets.UTF_8);
+ final int targetBytes = 4 * 1024 * 1024;
+ final int repeats = (targetBytes / chunk.length) + 1;
+ final byte[] data = new byte[chunk.length * repeats];
+ for (int i = 0; i < repeats; i++) {
+ System.arraycopy(chunk, 0, data, i * chunk.length, chunk.length);
+ }
+
+ // BufferedInputStream is not a ByteArrayInputStream, so the on-disk path runs.
+ final InputStream in = new java.io.BufferedInputStream(new java.io.ByteArrayInputStream(data));
+ final ExtractData result = tikaExtractor.getText(in, null);
+ assertNotNull(result);
+ CloseableUtil.closeQuietly(in);
+
+ final List<String> tikaTempFilesSeen = new ArrayList<>();
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(tempDir)) {
+ for (final Path p : stream) {
+ final String name = p.getFileName().toString();
+ if (name.startsWith("apache-tika-")) {
+ tikaTempFilesSeen.add(name);
+ }
+ }
+ }
+ Assertions.assertTrue(tikaTempFilesSeen.isEmpty(),
+ "Tika should not have spooled a second temp file, but saw: " + tikaTempFilesSeen);
+ } finally {
+ if (originalTmp != null) {
+ System.setProperty("java.io.tmpdir", originalTmp);
+ }
+ // Clean up any leftover files (deleteInBackground may still be racing).
+ Thread.sleep(200);
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(tempDir)) {
+ for (final Path p : stream) {
+ try {
+ Files.deleteIfExists(p);
+ } catch (final IOException ignore) {
+ // best effort
+ }
+ }
+ }
+ Files.deleteIfExists(tempDir);
+ }
+ }
+
+ /**
+ * For a small {@link ByteArrayInputStream}, neither the outer staging file nor
+ * Tika's internal spool should be created.
+ */
+ @Test
+ public void test_dfosInMemory_noTempFileCreated() throws Exception {
+ final Path tempDir = Files.createTempDirectory("tikaExtractor-test-");
+ final String originalTmp = System.getProperty("java.io.tmpdir");
+ System.setProperty("java.io.tmpdir", tempDir.toAbsolutePath().toString());
+ try {
+ final byte[] data = "Small inline payload テスト".getBytes(StandardCharsets.UTF_8);
+ final ByteArrayInputStream in = new ByteArrayInputStream(data);
+ final ExtractData result = tikaExtractor.getText(in, null);
+ assertNotNull(result);
+ CloseableUtil.closeQuietly(in);
+
+ final List<String> leftover = new ArrayList<>();
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(tempDir)) {
+ for (final Path p : stream) {
+ final String name = p.getFileName().toString();
+ if (name.startsWith("tikaExtractor-") || name.startsWith("apache-tika-")) {
+ leftover.add(name);
+ }
+ }
+ }
+ Assertions.assertTrue(leftover.isEmpty(), "In-memory path must not create temp files, but saw: " + leftover);
+ } finally {
+ if (originalTmp != null) {
+ System.setProperty("java.io.tmpdir", originalTmp);
+ }
+ Thread.sleep(100);
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(tempDir)) {
+ for (final Path p : stream) {
+ try {
+ Files.deleteIfExists(p);
+ } catch (final IOException ignore) {
+ // best effort
+ }
+ }
+ }
+ Files.deleteIfExists(tempDir);
+ }
+ }
+
+ /**
+ * Verifies that bytes written to {@link System#out}/{@link System#err} while the
+ * streams are muted are not silently discarded. The previous fix swapped the
+ * streams to a {@code NullOutputStream}; we now buffer them and re-emit through
+ * the logger when extraction finishes so operators still see Tika parser
+ * diagnostics (PDFBox font warnings, JBIG2 warnings, legacy POI debug, ...).
+ */
+ @Test
+ public void test_systemStreamsCapturedAndReplayed() throws Exception {
+ final List<String> replayedOut = java.util.Collections.synchronizedList(new ArrayList<>());
+ final List<String> replayedErr = java.util.Collections.synchronizedList(new ArrayList<>());
+
+ // Subclass that injects writes to System.out/System.err while the muting is
+ // active and observes the replayed bytes through the test hook.
+ final TikaExtractor spy = new TikaExtractor() {
+ @Override
+ protected InputStream openMaterializedInput(final InputStream inputStream, final java.io.File tempFile,
+ final boolean isByteStream) throws IOException {
+ System.out.println("HELLO_FROM_PARSER_OUT");
+ System.err.println("HELLO_FROM_PARSER_ERR");
+ return super.openMaterializedInput(inputStream, tempFile, isByteStream);
+ }
+
+ @Override
+ protected void onReplayCaptured(final String text, final boolean fromStderr) {
+ if (fromStderr) {
+ replayedErr.add(text);
+ } else {
+ replayedOut.add(text);
+ }
+ }
+ };
+ // wire up the same Tika config the regular fixture uses so detection works
+ spy.setTikaConfig(org.apache.tika.config.TikaConfig.getDefaultConfig());
+ spy.init();
+
+ final PrintStream originalOut = System.out;
+ final PrintStream originalErr = System.err;
+
+ final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt");
+ try {
+ final ExtractData result = spy.getText(in, null);
+ assertNotNull(result);
+ } finally {
+ CloseableUtil.closeQuietly(in);
+ }
+
+ // streams must be restored
+ Assertions.assertSame(originalOut, System.out);
+ Assertions.assertSame(originalErr, System.err);
+
+ // captured stdout/stderr must have been replayed via the hook
+ Assertions.assertFalse(replayedOut.isEmpty(), "captured stdout was not replayed");
+ Assertions.assertFalse(replayedErr.isEmpty(), "captured stderr was not replayed");
+ Assertions.assertTrue(String.join("\n", replayedOut).contains("HELLO_FROM_PARSER_OUT"),
+ "expected captured stdout to contain the marker, got: " + replayedOut);
+ Assertions.assertTrue(String.join("\n", replayedErr).contains("HELLO_FROM_PARSER_ERR"),
+ "expected captured stderr to contain the marker, got: " + replayedErr);
+ }
+
+ /**
+ * Locks down the encoding contract for the muted-stream capture/replay path:
+ * regardless of {@link TikaExtractor#setOutputEncoding(String)}, bytes
+ * written to {@link System#out}/{@link System#err} during extraction must be
+ * decoded with {@link java.nio.charset.Charset#defaultCharset()} so that the
+ * replayed text is a lossless round-trip of what Tika/PDFBox/POI emitted
+ * (which they emit through {@link PrintStream}s the JVM wraps with the JVM
+ * default charset). A different output encoding here would risk character
+ * substitution for diagnostics containing non-ASCII text.
+ */
+ @Test
+ public void test_capturedSystemStreams_useDefaultCharsetRegardlessOfOutputEncoding() throws Exception {
+ final String marker = "Türkçe-日本語-✓";
+ // Round-trip is only meaningful when the JVM default charset can actually
+ // represent the marker. On platforms whose default is e.g. ISO-8859-1
+ // (rare on Java 21+), skip rather than report a false failure.
+ org.junit.jupiter.api.Assumptions.assumeTrue(java.nio.charset.Charset.defaultCharset().newEncoder().canEncode(marker),
+ "skipping: JVM default charset " + java.nio.charset.Charset.defaultCharset() + " cannot represent the test marker");
+ final List<String> replayedOut = java.util.Collections.synchronizedList(new ArrayList<>());
+
+ final TikaExtractor spy = new TikaExtractor() {
+ @Override
+ protected InputStream openMaterializedInput(final InputStream inputStream, final java.io.File tempFile,
+ final boolean isByteStream) throws IOException {
+ System.out.println(marker);
+ return super.openMaterializedInput(inputStream, tempFile, isByteStream);
+ }
+
+ @Override
+ protected void onReplayCaptured(final String text, final boolean fromStderr) {
+ if (!fromStderr) {
+ replayedOut.add(text);
+ }
+ }
+ };
+ spy.setTikaConfig(org.apache.tika.config.TikaConfig.getDefaultConfig());
+ // Pick an outputEncoding that cannot represent every code point in the
+ // marker; if the capture/replay path were (incorrectly) using
+ // outputEncoding, characters such as '日' would be substituted with '?'.
+ spy.setOutputEncoding("ISO-8859-1");
+ spy.init();
+
+ final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt");
+ try {
+ spy.getText(in, null);
+ } finally {
+ CloseableUtil.closeQuietly(in);
+ }
+
+ Assertions.assertFalse(replayedOut.isEmpty(), "captured stdout was not replayed");
+ final String joined = String.join("\n", replayedOut);
+ Assertions.assertTrue(joined.contains(marker),
+ "replayed stdout must round-trip non-ASCII text via the JVM default charset, got: " + joined);
+ }
+
+ /**
+ * Verifies that the bounded capture buffer caps at its configured size and
+ * appends a truncation marker once instead of growing without bound.
+ */
+ @Test
+ public void test_boundedCaptureBuffer_truncatesAtCapacity() throws Exception {
+ final TikaExtractor.BoundedByteArrayOutputStream buf = new TikaExtractor.BoundedByteArrayOutputStream(16);
+ // Write more than the cap in a single call.
+ final byte[] payload = new byte[64];
+ java.util.Arrays.fill(payload, (byte) 'x');
+ buf.write(payload, 0, payload.length);
+ // Subsequent writes are dropped.
+ buf.write("MORE".getBytes(StandardCharsets.UTF_8));
+ buf.write('!');
+
+ final String text = new String(buf.toByteArray(), StandardCharsets.UTF_8);
+ // 16 'x' bytes + the truncation marker; "MORE" / '!' must be discarded.
+ Assertions.assertTrue(text.startsWith("xxxxxxxxxxxxxxxx"), "expected 16 bytes of payload, got: " + text);
+ Assertions.assertTrue(text.contains("truncated"), "expected truncation marker, got: " + text);
+ Assertions.assertFalse(text.contains("MORE"), "post-cap writes must be dropped");
+ Assertions.assertFalse(text.contains("!"), "post-cap byte writes must be dropped");
+ }
+
+ /**
+ * When muting is disabled, {@link System#out}/{@link System#err} must remain the
+ * caller-visible streams throughout extraction.
+ */
+ @Test
+ public void test_setMuteSystemStreams_false_doesNotMute() throws Exception {
+ final PrintStream originalOut = System.out;
+ final PrintStream originalErr = System.err;
+ tikaExtractor.setMuteSystemStreams(false);
+ try {
+ final java.io.ByteArrayOutputStream capture = new java.io.ByteArrayOutputStream();
+ final PrintStream tap = new PrintStream(capture, true);
+ System.setOut(tap);
+ System.setErr(tap);
+ try {
+ final PrintStream beforeOut = System.out;
+ final PrintStream beforeErr = System.err;
+ final InputStream in = ResourceUtil.getResourceAsStream("extractor/test.txt");
+ try {
+ tikaExtractor.getText(in, null);
+ } finally {
+ CloseableUtil.closeQuietly(in);
+ }
+ Assertions.assertSame(beforeOut, System.out, "System.out must not be swapped when muting is disabled");
+ Assertions.assertSame(beforeErr, System.err, "System.err must not be swapped when muting is disabled");
+ } finally {
+ System.setOut(originalOut);
+ System.setErr(originalErr);
+ }
+ } finally {
+ tikaExtractor.setMuteSystemStreams(true);
+ }
+ Assertions.assertSame(originalOut, System.out);
+ Assertions.assertSame(originalErr, System.err);
+ }
}