Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;

/**
Expand All @@ -41,21 +40,23 @@ public MsPublisherExtractor() {

/**
* Extracts text from the Publisher input stream.
* <p>
* The {@link PublisherTextExtractor} is wrapped in a try-with-resources
* block so that the underlying {@code POIFSFileSystem} (and therefore the
* provided input stream) is always closed even when extraction fails
* partway through.
* </p>
* @param in The input stream.
* @param params The parameters.
* @return The extracted data.
*/
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new CrawlerSystemException("Microsoft Publisher input stream is null. Cannot extract text from null input.");
}
try {
@SuppressWarnings("resource")
final PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in);
validateInputStream(in);
try (PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(in)) {
return new ExtractData(publisherTextExtractor.getText());
} catch (final IOException e) {
throw new ExtractException(e);
throw new ExtractException("Failed to extract text from Publisher document.", e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;

/**
Expand All @@ -41,21 +40,23 @@ public MsVisioExtractor() {

/**
* Extracts text from the Visio input stream.
* <p>
* The {@link VisioTextExtractor} is wrapped in a try-with-resources block
* so that the underlying {@code POIFSFileSystem} (and therefore the
* provided input stream) is always closed even when extraction fails
* partway through.
* </p>
* @param in The input stream.
* @param params The parameters.
* @return The extracted data.
*/
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new CrawlerSystemException("Microsoft Visio input stream is null. Cannot extract text from null input.");
}
try {
@SuppressWarnings("resource")
final VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in);
validateInputStream(in);
try (VisioTextExtractor visioTextExtractor = new VisioTextExtractor(in)) {
return new ExtractData(visioTextExtractor.getText());
} catch (final IOException e) {
throw new ExtractException(e);
throw new ExtractException("Failed to extract text from Visio document.", e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.extractor.impl;

import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Random;

import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
Expand All @@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception {
}

@Test
public void test_getText_null() {
public void test_nullInput_throwsCrawlerSystemException() {
try {
msPublisherExtractor.getText(null, null);
fail();
Expand All @@ -50,8 +53,24 @@ public void test_getText_null() {
}

@Test
public void test_getText_invalidData() {
// Test with invalid (non-Publisher) data
public void test_corruptedInput_throwsExtractException() {
// Random bytes are not a valid Publisher document; the extractor
// must surface this as an ExtractException rather than leaking the
// underlying POI resources.
final byte[] randomBytes = new byte[4096];
new Random(42L).nextBytes(randomBytes);
final InputStream in = new ByteArrayInputStream(randomBytes);
try {
msPublisherExtractor.getText(in, null);
fail();
} catch (final ExtractException e) {
// Expected - random bytes are not a valid Publisher file
}
}

@Test
public void test_textInput_throwsExtractException() {
// Plain text bytes are not a valid Publisher document.
final InputStream in = new ByteArrayInputStream("This is not a valid Publisher file".getBytes());
try {
msPublisherExtractor.getText(in, null);
Expand All @@ -62,7 +81,7 @@ public void test_getText_invalidData() {
}

@Test
public void test_getText_emptyStream() {
public void test_emptyInput_throwsExtractException() {
final InputStream in = new ByteArrayInputStream(new byte[0]);
try {
msPublisherExtractor.getText(in, null);
Expand All @@ -72,10 +91,47 @@ public void test_getText_emptyStream() {
}
}

@Test
public void test_corruptedInput_closesUnderlyingStream() throws IOException {
// Verify that even when extraction fails, the underlying input stream
// is closed by the try-with-resources block (no resource leak).
final byte[] randomBytes = new byte[4096];
new Random(7L).nextBytes(randomBytes);
final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes));
try {
msPublisherExtractor.getText(tracking, null);
fail();
} catch (final ExtractException e) {
// Expected
}
assertTrue(tracking.isClosed());
}

@Test
public void test_constructor() {
// Verify the extractor can be instantiated
final MsPublisherExtractor extractor = new MsPublisherExtractor();
assertNotNull(extractor);
}

/**
* Test helper that records whether {@link #close()} has been invoked.
*/
private static final class CloseTrackingInputStream extends FilterInputStream {
private boolean closed;

CloseTrackingInputStream(final InputStream in) {
super(in);
}

@Override
public void close() throws IOException {
closed = true;
super.close();
}

boolean isClosed() {
return closed;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.extractor.impl;

import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Random;

import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
Expand All @@ -40,7 +43,7 @@ protected void setUp(final TestInfo testInfo) throws Exception {
}

@Test
public void test_getText_null() {
public void test_nullInput_throwsCrawlerSystemException() {
try {
msVisioExtractor.getText(null, null);
fail();
Expand All @@ -50,8 +53,24 @@ public void test_getText_null() {
}

@Test
public void test_getText_invalidData() {
// Test with invalid (non-Visio) data
public void test_corruptedInput_throwsExtractException() {
// Random bytes are not a valid Visio document; the extractor must
// surface this as an ExtractException rather than leaking the
// underlying POI resources.
final byte[] randomBytes = new byte[4096];
new Random(42L).nextBytes(randomBytes);
final InputStream in = new ByteArrayInputStream(randomBytes);
try {
msVisioExtractor.getText(in, null);
fail();
} catch (final ExtractException e) {
// Expected - random bytes are not a valid Visio file
}
}

@Test
public void test_textInput_throwsExtractException() {
// Plain text bytes are not a valid Visio document.
final InputStream in = new ByteArrayInputStream("This is not a valid Visio file".getBytes());
try {
msVisioExtractor.getText(in, null);
Expand All @@ -62,7 +81,7 @@ public void test_getText_invalidData() {
}

@Test
public void test_getText_emptyStream() {
public void test_emptyInput_throwsExtractException() {
final InputStream in = new ByteArrayInputStream(new byte[0]);
try {
msVisioExtractor.getText(in, null);
Expand All @@ -72,10 +91,47 @@ public void test_getText_emptyStream() {
}
}

@Test
public void test_corruptedInput_closesUnderlyingStream() throws IOException {
// Verify that even when extraction fails, the underlying input stream
// is closed by the try-with-resources block (no resource leak).
final byte[] randomBytes = new byte[4096];
new Random(7L).nextBytes(randomBytes);
final CloseTrackingInputStream tracking = new CloseTrackingInputStream(new ByteArrayInputStream(randomBytes));
try {
msVisioExtractor.getText(tracking, null);
fail();
} catch (final ExtractException e) {
// Expected
}
assertTrue(tracking.isClosed());
}

@Test
public void test_constructor() {
// Verify the extractor can be instantiated
final MsVisioExtractor extractor = new MsVisioExtractor();
assertNotNull(extractor);
}

/**
* Test helper that records whether {@link #close()} has been invoked.
*/
private static final class CloseTrackingInputStream extends FilterInputStream {
private boolean closed;

CloseTrackingInputStream(final InputStream in) {
super(in);
}

@Override
public void close() throws IOException {
closed = true;
super.close();
}

boolean isClosed() {
return closed;
}
}
}
Loading