diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java
index 20f3dca0..b757d5da 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java
@@ -21,7 +21,6 @@
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.codelibs.fess.crawler.entity.ExtractData;
-import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
/**
@@ -41,21 +40,23 @@ public MsPublisherExtractor() {
/**
* Extracts text from the Publisher input stream.
+ *
+ * The {@link PublisherTextExtractor} is wrapped in a try-with-resources
+ * block so that the underlying {@code POIFSFileSystem} (and therefore the
+ * provided input stream) is always closed even when extraction fails
+ * partway through.
+ *
* @param in The input stream.
* @param params The parameters.
* @return The extracted data.
*/
@Override
public ExtractData getText(final InputStream in, final Map params) {
- if (in == null) {
- throw new CrawlerSystemException("Microsoft Publisher input stream is null. Cannot extract text from null input.");
- }
- try {
- @SuppressWarnings("resource")
- final PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in);
+ validateInputStream(in);
+ try (PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in)) {
return new ExtractData(publisherTextExtractor.getText());
} catch (final IOException e) {
- throw new ExtractException(e);
+ throw new ExtractException("Failed to extract text from Publisher document.", e);
}
}
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java
index b6c2c6b6..d29a72a5 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java
@@ -21,7 +21,6 @@
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.codelibs.fess.crawler.entity.ExtractData;
-import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
/**
@@ -41,21 +40,23 @@ public MsVisioExtractor() {
/**
* Extracts text from the Visio input stream.
+ *
+ * The {@link VisioTextExtractor} is wrapped in a try-with-resources block
+ * so that the underlying {@code POIFSFileSystem} (and therefore the
+ * provided input stream) is always closed even when extraction fails
+ * partway through.
+ *
* @param in The input stream.
* @param params The parameters.
* @return The extracted data.
*/
@Override
public ExtractData getText(final InputStream in, final Map params) {
- if (in == null) {
- throw new CrawlerSystemException("Microsoft Visio input stream is null. Cannot extract text from null input.");
- }
- try {
- @SuppressWarnings("resource")
- final VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in);
+ validateInputStream(in);
+ try (VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in)) {
return new ExtractData(visioTextExtractor.getText());
} catch (final IOException e) {
- throw new ExtractException(e);
+ throw new ExtractException("Failed to extract text from Visio document.", e);
}
}
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java
index 738ae3ba..558f35dc 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java
@@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.extractor.impl;
import java.io.ByteArrayInputStream;
+import java.io.FilterInputStream;
+import java.io.IOException;
import java.io.InputStream;
+import java.util.Random;
import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
@@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception {
}
@Test
- public void test_getText_null() {
+ public void test_nullInput_throwsCrawlerSystemException() {
try {
msPublisherExtractor.getText(null, null);
fail();
@@ -50,8 +53,24 @@ public void test_getText_null() {
}
@Test
- public void test_getText_invalidData() {
- // Test with invalid (non-Publisher) data
+ public void test_corruptedInput_throwsExtractException() {
+ // Random bytes are not a valid Publisher document; the extractor
+ // must surface this as an ExtractException rather than leaking the
+ // underlying POI resources.
+ final byte[] randomBytes = new byte[4096];
+ new Random(42L).nextBytes(randomBytes);
+ final InputStream in = new ByteArrayInputStream(randomBytes);
+ try {
+ msPublisherExtractor.getText(in, null);
+ fail();
+ } catch (final ExtractException e) {
+ // Expected - random bytes are not a valid Publisher file
+ }
+ }
+
+ @Test
+ public void test_textInput_throwsExtractException() {
+ // Plain text bytes are not a valid Publisher document.
final InputStream in = new ByteArrayInputStream("This is not a valid Publisher file".getBytes());
try {
msPublisherExtractor.getText(in, null);
@@ -62,7 +81,7 @@ public void test_getText_invalidData() {
}
@Test
- public void test_getText_emptyStream() {
+ public void test_emptyInput_throwsExtractException() {
final InputStream in = new ByteArrayInputStream(new byte[0]);
try {
msPublisherExtractor.getText(in, null);
@@ -72,10 +91,47 @@ public void test_getText_emptyStream() {
}
}
+ @Test
+ public void test_corruptedInput_closesUnderlyingStream() throws IOException {
+ // Verify that even when extraction fails, the underlying input stream
+ // is closed by the try-with-resources block (no resource leak).
+ final byte[] randomBytes = new byte[4096];
+ new Random(7L).nextBytes(randomBytes);
+ final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes));
+ try {
+ msPublisherExtractor.getText(tracking, null);
+ fail();
+ } catch (final ExtractException e) {
+ // Expected
+ }
+ assertTrue(tracking.isClosed());
+ }
+
@Test
public void test_constructor() {
// Verify the extractor can be instantiated
final MsPublisherExtractor extractor = new MsPublisherExtractor();
assertNotNull(extractor);
}
+
+ /**
+ * Test helper that records whether {@link #close()} has been invoked.
+ */
+ private static final class CloseTrackingInputStream extends FilterInputStream {
+ private boolean closed;
+
+ CloseTrackingInputStream(final InputStream in) {
+ super(in);
+ }
+
+ @Override
+ public void close() throws IOException {
+ closed = true;
+ super.close();
+ }
+
+ boolean isClosed() {
+ return closed;
+ }
+ }
}
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java
index dbc6cf70..38993c99 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java
@@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.extractor.impl;
import java.io.ByteArrayInputStream;
+import java.io.FilterInputStream;
+import java.io.IOException;
import java.io.InputStream;
+import java.util.Random;
import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
@@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception {
}
@Test
- public void test_getText_null() {
+ public void test_nullInput_throwsCrawlerSystemException() {
try {
msVisioExtractor.getText(null, null);
fail();
@@ -50,8 +53,24 @@ public void test_getText_null() {
}
@Test
- public void test_getText_invalidData() {
- // Test with invalid (non-Visio) data
+ public void test_corruptedInput_throwsExtractException() {
+ // Random bytes are not a valid Visio document; the extractor must
+ // surface this as an ExtractException rather than leaking the
+ // underlying POI resources.
+ final byte[] randomBytes = new byte[4096];
+ new Random(42L).nextBytes(randomBytes);
+ final InputStream in = new ByteArrayInputStream(randomBytes);
+ try {
+ msVisioExtractor.getText(in, null);
+ fail();
+ } catch (final ExtractException e) {
+ // Expected - random bytes are not a valid Visio file
+ }
+ }
+
+ @Test
+ public void test_textInput_throwsExtractException() {
+ // Plain text bytes are not a valid Visio document.
final InputStream in = new ByteArrayInputStream("This is not a valid Visio file".getBytes());
try {
msVisioExtractor.getText(in, null);
@@ -62,7 +81,7 @@ public void test_getText_invalidData() {
}
@Test
- public void test_getText_emptyStream() {
+ public void test_emptyInput_throwsExtractException() {
final InputStream in = new ByteArrayInputStream(new byte[0]);
try {
msVisioExtractor.getText(in, null);
@@ -72,10 +91,47 @@ public void test_getText_emptyStream() {
}
}
+ @Test
+ public void test_corruptedInput_closesUnderlyingStream() throws IOException {
+ // Verify that even when extraction fails, the underlying input stream
+ // is closed by the try-with-resources block (no resource leak).
+ final byte[] randomBytes = new byte[4096];
+ new Random(7L).nextBytes(randomBytes);
+ final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes));
+ try {
+ msVisioExtractor.getText(tracking, null);
+ fail();
+ } catch (final ExtractException e) {
+ // Expected
+ }
+ assertTrue(tracking.isClosed());
+ }
+
@Test
public void test_constructor() {
// Verify the extractor can be instantiated
final MsVisioExtractor extractor = new MsVisioExtractor();
assertNotNull(extractor);
}
+
+ /**
+ * Test helper that records whether {@link #close()} has been invoked.
+ */
+ private static final class CloseTrackingInputStream extends FilterInputStream {
+ private boolean closed;
+
+ CloseTrackingInputStream(final InputStream in) {
+ super(in);
+ }
+
+ @Override
+ public void close() throws IOException {
+ closed = true;
+ super.close();
+ }
+
+ boolean isClosed() {
+ return closed;
+ }
+ }
}