diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java index 20f3dca0..b757d5da 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java @@ -21,7 +21,6 @@ import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.codelibs.fess.crawler.entity.ExtractData; -import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; /** @@ -41,21 +40,23 @@ public MsPublisherExtractor() { /** * Extracts text from the Publisher input stream. + *

+ * The {@link PublisherTextExtractor} is wrapped in a try-with-resources + * block so that the underlying {@code POIFSFileSystem} (and therefore the + * provided input stream) is always closed even when extraction fails + * partway through. + *

* @param in The input stream. * @param params The parameters. * @return The extracted data. */ @Override public ExtractData getText(final InputStream in, final Map params) { - if (in == null) { - throw new CrawlerSystemException("Microsoft Publisher input stream is null. Cannot extract text from null input."); - } - try { - @SuppressWarnings("resource") - final PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in); + validateInputStream(in); + try (PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in)) { return new ExtractData(publisherTextExtractor.getText()); } catch (final IOException e) { - throw new ExtractException(e); + throw new ExtractException("Failed to extract text from Publisher document.", e); } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java index b6c2c6b6..d29a72a5 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java @@ -21,7 +21,6 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.codelibs.fess.crawler.entity.ExtractData; -import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; /** @@ -41,21 +40,23 @@ public MsVisioExtractor() { /** * Extracts text from the Visio input stream. + *

+ * The {@link VisioTextExtractor} is wrapped in a try-with-resources block + * so that the underlying {@code POIFSFileSystem} (and therefore the + * provided input stream) is always closed even when extraction fails + * partway through. + *

* @param in The input stream. * @param params The parameters. * @return The extracted data. */ @Override public ExtractData getText(final InputStream in, final Map params) { - if (in == null) { - throw new CrawlerSystemException("Microsoft Visio input stream is null. Cannot extract text from null input."); - } - try { - @SuppressWarnings("resource") - final VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in); + validateInputStream(in); + try (VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in)) { return new ExtractData(visioTextExtractor.getText()); } catch (final IOException e) { - throw new ExtractException(e); + throw new ExtractException("Failed to extract text from Visio document.", e); } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java index 738ae3ba..558f35dc 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractorTest.java @@ -16,7 +16,10 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.ByteArrayInputStream; +import java.io.FilterInputStream; +import java.io.IOException; import java.io.InputStream; +import java.util.Random; import org.codelibs.fess.crawler.container.StandardCrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception { } @Test - public void test_getText_null() { + public void test_nullInput_throwsCrawlerSystemException() { try { msPublisherExtractor.getText(null, null); fail(); @@ -50,8 +53,24 @@ public void test_getText_null() { } @Test - public void test_getText_invalidData() { - // Test with invalid (non-Publisher) data + public void test_corruptedInput_throwsExtractException() { + // Random bytes are not a valid Publisher document; the extractor + // must surface this as an ExtractException rather than leaking the + // underlying POI resources. + final byte[] randomBytes = new byte[4096]; + new Random(42L).nextBytes(randomBytes); + final InputStream in = new ByteArrayInputStream(randomBytes); + try { + msPublisherExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + // Expected - random bytes are not a valid Publisher file + } + } + + @Test + public void test_textInput_throwsExtractException() { + // Plain text bytes are not a valid Publisher document. final InputStream in = new ByteArrayInputStream("This is not a valid Publisher file".getBytes()); try { msPublisherExtractor.getText(in, null); @@ -62,7 +81,7 @@ public void test_getText_invalidData() { } @Test - public void test_getText_emptyStream() { + public void test_emptyInput_throwsExtractException() { final InputStream in = new ByteArrayInputStream(new byte[0]); try { msPublisherExtractor.getText(in, null); @@ -72,10 +91,47 @@ public void test_getText_emptyStream() { } } + @Test + public void test_corruptedInput_closesUnderlyingStream() throws IOException { + // Verify that even when extraction fails, the underlying input stream + // is closed by the try-with-resources block (no resource leak). + final byte[] randomBytes = new byte[4096]; + new Random(7L).nextBytes(randomBytes); + final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes)); + try { + msPublisherExtractor.getText(tracking, null); + fail(); + } catch (final ExtractException e) { + // Expected + } + assertTrue(tracking.isClosed()); + } + @Test public void test_constructor() { // Verify the extractor can be instantiated final MsPublisherExtractor extractor = new MsPublisherExtractor(); assertNotNull(extractor); } + + /** + * Test helper that records whether {@link #close()} has been invoked. + */ + private static final class CloseTrackingInputStream extends FilterInputStream { + private boolean closed; + + CloseTrackingInputStream(final InputStream in) { + super(in); + } + + @Override + public void close() throws IOException { + closed = true; + super.close(); + } + + boolean isClosed() { + return closed; + } + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java index dbc6cf70..38993c99 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractorTest.java @@ -16,7 +16,10 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.ByteArrayInputStream; +import java.io.FilterInputStream; +import java.io.IOException; import java.io.InputStream; +import java.util.Random; import org.codelibs.fess.crawler.container.StandardCrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception { } @Test - public void test_getText_null() { + public void test_nullInput_throwsCrawlerSystemException() { try { msVisioExtractor.getText(null, null); fail(); @@ -50,8 +53,24 @@ public void test_getText_null() { } @Test - public void test_getText_invalidData() { - // Test with invalid (non-Visio) data + public void test_corruptedInput_throwsExtractException() { + // Random bytes are not a valid Visio document; the extractor must + // surface this as an ExtractException rather than leaking the + // underlying POI resources. + final byte[] randomBytes = new byte[4096]; + new Random(42L).nextBytes(randomBytes); + final InputStream in = new ByteArrayInputStream(randomBytes); + try { + msVisioExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + // Expected - random bytes are not a valid Visio file + } + } + + @Test + public void test_textInput_throwsExtractException() { + // Plain text bytes are not a valid Visio document. final InputStream in = new ByteArrayInputStream("This is not a valid Visio file".getBytes()); try { msVisioExtractor.getText(in, null); @@ -62,7 +81,7 @@ public void test_getText_invalidData() { } @Test - public void test_getText_emptyStream() { + public void test_emptyInput_throwsExtractException() { final InputStream in = new ByteArrayInputStream(new byte[0]); try { msVisioExtractor.getText(in, null); @@ -72,10 +91,47 @@ public void test_getText_emptyStream() { } } + @Test + public void test_corruptedInput_closesUnderlyingStream() throws IOException { + // Verify that even when extraction fails, the underlying input stream + // is closed by the try-with-resources block (no resource leak). + final byte[] randomBytes = new byte[4096]; + new Random(7L).nextBytes(randomBytes); + final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes)); + try { + msVisioExtractor.getText(tracking, null); + fail(); + } catch (final ExtractException e) { + // Expected + } + assertTrue(tracking.isClosed()); + } + @Test public void test_constructor() { // Verify the extractor can be instantiated final MsVisioExtractor extractor = new MsVisioExtractor(); assertNotNull(extractor); } + + /** + * Test helper that records whether {@link #close()} has been invoked. + */ + private static final class CloseTrackingInputStream extends FilterInputStream { + private boolean closed; + + CloseTrackingInputStream(final InputStream in) { + super(in); + } + + @Override + public void close() throws IOException { + closed = true; + super.close(); + } + + boolean isClosed() { + return closed; + } + } }