diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java index 14759e30..7a2eb122 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java @@ -18,10 +18,18 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import org.apache.commons.io.IOUtils; import org.codelibs.fess.crawler.container.CrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; @@ -48,6 +56,14 @@ */ public abstract class AbstractExtractor implements Extractor { + /** + * Parameter key used to track the recursion depth across nested archive + * extraction. Callers/recursive extractor invocations may set this to + * limit how deeply nested archives are unpacked. The value is parsed as + * an integer; missing or unparseable values are treated as depth 0. + */ + public static final String EXTRACTOR_DEPTH_KEY = "extractorDepth"; + /** The crawler container. */ @Resource protected CrawlerContainer crawlerContainer; @@ -55,6 +71,14 @@ public abstract class AbstractExtractor implements Extractor { /** The weight of this extractor. */ protected int weight = 1; + /** + * Maximum allowed depth for recursive archive extraction. When the depth + * value parsed from {@link #EXTRACTOR_DEPTH_KEY} reaches this threshold, + * {@link #checkDepth(Map, int)} aborts further recursion to defend + * against recursion-bomb archives. + */ + protected int maxArchiveDepth = 10; + /** * Constructs a new AbstractExtractor. */ @@ -62,6 +86,74 @@ public AbstractExtractor() { // NOP } + /** + * Sets the maximum allowed recursion depth for nested archive extraction. + * @param maxArchiveDepth the new maximum depth (non-negative) + */ + public void setMaxArchiveDepth(final int maxArchiveDepth) { + this.maxArchiveDepth = maxArchiveDepth; + } + + /** + * Returns the current recursion depth recorded in the extractor params. + * Missing, blank, or unparseable values are treated as {@code 0}. + * + * @param params the extractor parameters (may be {@code null}) + * @return the parsed depth, or {@code 0} if not set + */ + protected int getCurrentDepth(final Map params) { + if (params == null) { + return 0; + } + final String value = params.get(EXTRACTOR_DEPTH_KEY); + if (value == null || value.isBlank()) { + return 0; + } + try { + final int depth = Integer.parseInt(value.trim()); + return depth < 0 ? 0 : depth; + } catch (final NumberFormatException e) { + return 0; + } + } + + /** + * Returns a NEW parameter map (the original is not mutated) with the + * recursion depth incremented by one. Useful when an archive extractor + * recursively delegates to another extractor for a nested archive entry. 
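+ *
+ * <p>
+ * A minimal usage sketch (hypothetical delegating code; {@code delegate},
+ * {@code entryStream} and {@code entryName} are illustrative names, not part
+ * of this API):
+ * </p>
+ * <pre>{@code
+ * // inside an archive extractor's entry loop
+ * Map<String, String> nested = incrementDepth(params); // depth + 1; params itself is untouched
+ * nested.put(ExtractData.RESOURCE_NAME_KEY, entryName);
+ * buf.append(delegate.getText(entryStream, nested).getContent());
+ * }</pre>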
+ * + * @param params the current extractor parameters (may be {@code null}) + * @return a new map containing all original entries plus an incremented + * depth + */ + protected Map incrementDepth(final Map params) { + final Map next = new HashMap<>(); + if (params != null) { + next.putAll(params); + } + next.put(EXTRACTOR_DEPTH_KEY, Integer.toString(getCurrentDepth(params) + 1)); + return next; + } + + /** + * Validates that the recursion depth recorded in {@code params} does not + * meet or exceed {@code maxDepth}. Throws {@link MaxLengthExceededException} + * (a {@link org.codelibs.fess.crawler.exception.CrawlingAccessException + * CrawlingAccessException}) when the threshold is reached so that the + * surrounding crawler treats it as a data-driven access failure rather + * than a system error. + * + * @param params the extractor parameters (may be {@code null}) + * @param maxDepth the (exclusive) maximum allowed depth + * @throws MaxLengthExceededException when {@code currentDepth >= maxDepth} + */ + protected void checkDepth(final Map params, final int maxDepth) { + final int current = getCurrentDepth(params); + if (current >= maxDepth) { + throw new MaxLengthExceededException("Archive recursion depth exceeded: depth=" + current + " max=" + maxDepth); + } + } + @Override public int getWeight() { return weight; @@ -142,4 +234,77 @@ protected void validateInputStream(final InputStream in) { throw new CrawlerSystemException("The inputstream is null."); } } + + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. The check is performed on + * a normalised form of the path and is shared between the archive + * extractors (Zip / Tar / Lha) so the rejection rules stay in lock step. + * + *

+ * <p>
+ * An entry is rejected when it is null/empty, when it is rooted at
+ * {@code /} or {@code \}, when it begins with a Windows drive letter
+ * (e.g. {@code C:}), when its normalised form contains a {@code ..}
+ * segment, or when {@link Paths#get} treats it as malformed.
+ * </p>
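+ *
+ * <p>
+ * Illustrative examples (not exhaustive):
+ * </p>
+ * <pre>{@code
+ * isPathTraversal("../../etc/passwd"); // true  - ".." escapes the extraction root
+ * isPathTraversal("/etc/passwd");      // true  - absolute path
+ * isPathTraversal("C:\\boot.ini");     // true  - Windows drive letter
+ * isPathTraversal("docs/readme.txt");  // false - stays under the root
+ * }</pre>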
+ * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected static boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + // Absolute paths (Unix or Windows-style) are unsafe in the + // context of an archive extracted into a sandbox root. + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + try { + final Path normalised = Paths.get(name).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + + /** + * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning + * the actual number of bytes copied. Used by archive extractors to bound + * the amount of memory consumed when buffering an entry's uncompressed + * payload. + * + * @param in the source stream + * @param out the sink stream + * @param limit the maximum number of bytes to copy (inclusive). Values + * {@code <= 0} cause the method to return without reading. + * @return the number of bytes actually copied + * @throws IOException if reading from {@code in} or writing to {@code out} + * fails + */ + protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException { + if (limit <= 0) { + return 0; + } + final byte[] buffer = new byte[8192]; + long total = 0; + int read; + while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { + out.write(buffer, 0, read); + total += read; + } + return total; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 7a5ff29a..7e975e13 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -15,18 +15,18 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Enumeration; -import java.util.HashMap; import java.util.Map; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.core.io.CloseableUtil; -import org.codelibs.core.io.CopyUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -35,7 +35,6 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jp.gr.java_conf.dangan.util.lha.LhaFile; import jp.gr.java_conf.dangan.util.lha.LhaHeader; @@ -45,15 +44,59 @@ * This extractor can extract text content from files within LHA archives * by using appropriate extractors for each contained file type. * + *

+ * <p>
+ * Defends against decompression / many-entry / recursion bombs and Zip Slip
+ * style path traversal in entry names.
+ * </p>
+ * * @author shinsuke */ public class LhaExtractor extends AbstractExtractor { /** Logger for this class. */ private static final Logger logger = LogManager.getLogger(LhaExtractor.class); - /** Maximum content size for extraction. -1 means no limit. */ + /** + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. + */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Enforced against the actual bytes read from the entry stream (NOT the + * header-reported size, which is attacker-controlled). Defaults to + * 256 MiB. Set to {@code -1} to disable. Enforced independently of + * {@link #maxBytes}. Only applies to entries that have a registered + * {@link Extractor}; an unsupported entry is never buffered. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum bytes copied from the input stream to the local temporary file + * before {@link LhaFile} is opened. The LHA library requires a seekable + * file, so the entire archive must be staged on disk; this cap prevents a + * hostile producer from filling local storage. Defaults to 1 GiB. Set to + * {@code -1} to disable. + */ + protected long maxInputBytes = 1L << 30; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + /** * Creates a new LhaExtractor instance. */ @@ -76,6 +119,7 @@ public ExtractData getText(final InputStream in, final Map param if (in == null) { throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input."); } + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); @@ -86,38 +130,104 @@ public ExtractData getText(final InputStream in, final Map param try { tempFile = createTempFile("crawler-", ".lzh", null); try (FileOutputStream fos = new FileOutputStream(tempFile)) { - CopyUtil.copy(in, fos); + // Stage the (untrusted) archive bytes to disk under a hard + // cap so a hostile producer cannot exhaust local storage by + // streaming an arbitrarily large body. + final long inputReadLimit = maxInputBytes > 0 ? 
maxInputBytes + 1L : Long.MAX_VALUE; + final long staged = copyBounded(in, fos, inputReadLimit); + if (maxInputBytes > 0 && staged > maxInputBytes) { + throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes); + } } lhaFile = new LhaFile(tempFile); @SuppressWarnings("unchecked") final Enumeration entries = lhaFile.entries(); - long contentSize = 0; + long totalBytes = 0; + int entryCount = 0; while (entries.hasMoreElements()) { final LhaHeader head = entries.nextElement(); - contentSize += head.getOriginalSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = head.getPath(); + if (isPathTraversal(filename)) { + logger.warn("lha entry rejected: name={} reason=path-traversal", filename); + continue; + } + + // Decide MIME / extractor up front so an unsupported entry + // is skipped without opening its decompressor at all. This + // mirrors the legacy behaviour and keeps a large irrelevant + // entry from consuming the per-entry / total caps reserved + // for entries the crawler actually wants to extract. final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - InputStream is = null; - try { - is = lhaFile.getInputStream(head); - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent()); - buf.append('\n'); - } catch (final Exception e) { - if (logger.isDebugEnabled()) { - logger.debug("Exception in an internal extractor.", e); - } - } finally { - CloseableUtil.closeQuietly(is); - } + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + + // Read the entry payload through copyBounded so the cap is + // enforced against bytes actually decompressed, not the + // header-reported size (which is attacker-controlled). + final long actualBytes; + final byte[] entryBytes; + InputStream is = null; + try { + is = lhaFile.getInputStream(head); + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? 
maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(is, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read lha entry: name={}", filename, ioe); + } + continue; + } finally { + CloseableUtil.closeQuietly(is); + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "lha per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Exception in an internal extractor.", e); } } } @@ -147,4 +257,41 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. The + * cap is enforced against bytes actually decompressed (not the + * header-reported size). Set to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the cap on the number of input bytes staged to a temporary file + * before {@link LhaFile} is opened. Set to {@code -1} to disable. + * + * @param maxInputBytes the input-stage maximum + */ + public void setMaxInputBytes(final long maxInputBytes) { + this.maxInputBytes = maxInputBytes; + } + + /** + * Sets the maximum number of entries that may be iterated. 
+ * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index b5bb0238..602878e5 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -15,8 +15,10 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; import java.util.Map; import org.apache.commons.compress.archivers.ArchiveInputStream; @@ -30,12 +32,17 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jakarta.annotation.Resource; /** * Extracts text content from TAR archives. + * + *

+ * <p>
+ * Defends against decompression / many-entry / recursion bombs and Zip Slip
+ * style path traversal. Symbolic and hard link entries are skipped because
+ * they can reference files outside the archive sandbox.
+ * </p>
*/ public class TarExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(TarExtractor.class); @@ -47,10 +54,38 @@ public class TarExtractor extends AbstractExtractor { protected ArchiveStreamFactory archiveStreamFactory; /** - * Maximum content size. + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Guards against an oversized entry exhausting the JVM heap when + * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to + * disable. Enforced independently of {@link #maxBytes}. Only applies to + * entries that have a registered {@link Extractor}; an unsupported + * entry is never buffered, so this cap is irrelevant for it. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + /** * Creates a new TarExtractor instance. */ @@ -61,10 +96,11 @@ public TarExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); - return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory)); + return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory, params)); } /** @@ -73,38 +109,106 @@ public ExtractData getText(final InputStream in, final Map param * @param in The input stream. * @param mimeTypeHelper The mime type helper. * @param extractorFactory The extractor factory. + * @param params Extractor parameters used to track recursion depth. * @return A text. 
*/ - protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) { + protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory, + final Map params) { final StringBuilder buf = new StringBuilder(1000); int processedEntries = 0; int failedEntries = 0; try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream("tar", in)) { - TarArchiveEntry entry = null; - long contentSize = 0; + TarArchiveEntry entry; + long totalBytes = 0; + int entryCount = 0; while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + continue; + } + if (entry.isSymbolicLink() || entry.isLink()) { + if (logger.isDebugEnabled()) { + logger.debug("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName()); + } + continue; + } + if (isPathTraversal(filename)) { + logger.warn("tar entry rejected: name={} reason=path-traversal", filename); + continue; + } + + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); - } - } + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + + final long actualBytes; + final byte[] entryBytes; + try { + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? 
maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read tar entry: name={}", filename, ioe); + } + continue; + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "tar per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } @@ -115,7 +219,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime throw new ExtractException("Failed to extract content from TAR archive. No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from TAR archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from TAR archive. processed={} failed={}", processedEntries, failedEntries, e); } } @@ -129,4 +233,30 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the maximum number of entries that may be iterated. 
+ * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index a543b3a9..bfa83264 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -16,13 +16,15 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; import java.util.Map; -import org.apache.commons.compress.archivers.ArchiveInputStream; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.io.input.CountingInputStream; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.fess.crawler.entity.ExtractData; @@ -31,26 +33,81 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; - -import jakarta.annotation.Resource; /** * Extracts text content from ZIP archives. + * + *

+ * <p>
+ * The extractor defends against several content-driven attack vectors. The
+ * input stream itself is treated as untrusted, while the {@code params} map is
+ * assumed to be admin-configured / trusted. Protections include:
+ * </p>
+ * <ul>
+ * <li>Total uncompressed-size cap ({@link #setMaxBytes(long)})</li>
+ * <li>Maximum number of entries ({@link #setMaxEntries(int)})</li>
+ * <li>Per-entry compression-ratio threshold
+ * ({@link #setMaxCompressionRatio(long)}) to detect zip bombs</li>
+ * <li>Recursion-depth check (via {@link AbstractExtractor#checkDepth})</li>
+ * <li>Zip Slip path-traversal detection (entry names normalised and
+ * rejected when they escape the conceptual extraction root)</li>
+ * <li>Configurable filename encoding (e.g. {@code "CP932"} /
+ * {@code "MS932"} for Japanese filenames)</li>
+ * </ul>
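+ *
+ * <p>
+ * A minimal configuration sketch (illustrative values, not recommended
+ * defaults; in a running crawler these setters are typically wired through
+ * the container configuration rather than called directly):
+ * </p>
+ * <pre>{@code
+ * ZipExtractor extractor = new ZipExtractor();
+ * extractor.setMaxBytes(512L * 1024L * 1024L);        // total uncompressed cap
+ * extractor.setMaxBytesPerEntry(64L * 1024L * 1024L); // single-entry cap
+ * extractor.setMaxEntries(10_000);
+ * extractor.setMaxCompressionRatio(100L);
+ * extractor.setFilenameEncoding("MS932");             // Japanese Windows archives
+ * }</pre>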
*/ public class ZipExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(ZipExtractor.class); + /** Threshold below which compression-ratio checks are skipped (bytes). */ + private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB + + /** + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. + */ + protected long maxContentSize = -1; + /** - * The archive stream factory. + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering or + * draining, mirroring the pre-defence behaviour. */ - @Resource - protected ArchiveStreamFactory archiveStreamFactory; + protected long maxBytes = 1L << 31; /** - * The maximum content size. + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * This guards against a legitimate-looking but oversized entry (e.g. a + * 1.9 GiB file inside an otherwise small archive) exhausting the JVM + * heap when buffered into memory. Defaults to 256 MiB. Set to + * {@code -1} to disable. Enforced independently of {@link #maxBytes}. + * Only applies to entries that have a registered {@link Extractor}; an + * unsupported entry is never buffered, so this cap is irrelevant for it. */ - protected long maxContentSize = -1; + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + + /** + * Maximum allowed compression ratio (uncompressed / compressed). Entries + * exceeding this ratio AND larger than 1 MiB are rejected as suspected + * zip bombs. Set to {@code -1} to disable. + */ + protected long maxCompressionRatio = 100L; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + + /** + * Filename encoding used to decode entry names that lack the UTF-8 flag. + * Defaults to {@code "UTF-8"}; set to {@code "CP932"} or {@code "MS932"} + * for archives created on Japanese Windows systems. + */ + protected String filenameEncoding = "UTF-8"; /** * Creates a new ZipExtractor instance. @@ -62,6 +119,7 @@ public ZipExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); @@ -69,32 +127,150 @@ public ExtractData getText(final InputStream in, final Map param int processedEntries = 0; int failedEntries = 0; - try (final ArchiveInputStream ais = - archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) { - ZipArchiveEntry entry = null; - long contentSize = 0; - while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + final InputStream wrapped = in.markSupported() ? 
in : new BufferedInputStream(in); + // Early-validate the ZIP magic so a clearly non-zip blob is reported + // as ExtractException rather than silently returning empty text. + wrapped.mark(4); + try { + final byte[] sig = new byte[4]; + int read = 0; + while (read < 4) { + final int n = wrapped.read(sig, read, 4 - read); + if (n < 0) { + break; + } + read += n; + } + wrapped.reset(); + if (read != 4 || sig[0] != 'P' || sig[1] != 'K' || (sig[2] != 0x03 && sig[2] != 0x05 && sig[2] != 0x07)) { + throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); + } + } catch (final IOException ioe) { + throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", ioe); + } + // CountingInputStream lets us measure the compressed bytes consumed + // from the underlying stream per entry, which is the only reliable + // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is + // often -1 when entries use a data descriptor). + final CountingInputStream counter = new CountingInputStream(wrapped); + try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, true)) { + ZipArchiveEntry entry; + long totalBytes = 0; + long lastCompressedBytes = counter.getByteCount(); + int entryCount = 0; + while ((entry = ais.getNextEntry()) != null) { + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("zip entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + lastCompressedBytes = counter.getByteCount(); + continue; + } + if (isPathTraversal(filename)) { + logger.warn("zip entry rejected: name={} reason=path-traversal", filename); + // Keep the compressed-bytes anchor in step with the + // stream so the next supported entry's ratio is + // computed against ITS own compressed bytes, not also + // those of the rejected entry. + lastCompressedBytes = counter.getByteCount(); + continue; + } + + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = new HashMap<>(); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); - } - } + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + lastCompressedBytes = counter.getByteCount(); + continue; + } + + // Read entry into bounded buffer while counting actual bytes. 
+ final long actualBytes; + final byte[] entryBytes; + try { + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + totalReadLimit = Long.MAX_VALUE; + } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } + // Enforce a per-entry cap independently of the total + // cap so that a single oversized entry cannot exhaust + // the JVM heap. We read one byte beyond the cap so the + // explicit overflow check below can distinguish + // "exactly at the cap" from "exceeds the cap". + final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read zip entry: name={}", filename, ioe); + } + lastCompressedBytes = counter.getByteCount(); + continue; + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "zip per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize >= 0 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + // Compression-ratio check (only meaningful for non-tiny entries). + // Prefer the entry header's compressed size when present; + // otherwise fall back to the bytes actually consumed from the + // underlying stream during this entry's read. + long compressed = entry.getCompressedSize(); + if (compressed <= 0) { + final long now = counter.getByteCount(); + compressed = Math.max(0L, now - lastCompressedBytes); + lastCompressedBytes = now; + } else { + lastCompressedBytes = counter.getByteCount(); + } + if (maxCompressionRatio > 0 && compressed > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES + && actualBytes / compressed > maxCompressionRatio) { + throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio=" + + (actualBytes / compressed) + " max=" + maxCompressionRatio); + } + + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } @@ -105,7 +281,7 @@ public ExtractData getText(final InputStream in, final Map param throw new ExtractException("Failed to extract content from ZIP archive. 
No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from ZIP archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from ZIP archive. processed={} failed={}", processedEntries, failedEntries, e); } } @@ -119,4 +295,49 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } -} \ No newline at end of file + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + + /** + * Sets the maximum permitted uncompressed/compressed ratio per entry. + * @param maxCompressionRatio the threshold (use {@code -1} to disable) + */ + public void setMaxCompressionRatio(final long maxCompressionRatio) { + this.maxCompressionRatio = maxCompressionRatio; + } + + /** + * Sets the maximum number of entries that may be iterated. + * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } + + /** + * Sets the filename encoding used to decode entry names that lack the + * UTF-8 flag (e.g. {@code "CP932"} / {@code "MS932"} for Japanese + * archives). + * + * @param filenameEncoding the charset name + */ + public void setFilenameEncoding(final String filenameEncoding) { + this.filenameEncoding = filenameEncoding; + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java index 30e7db86..1a51483f 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java @@ -17,10 +17,12 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.util.HashMap; import java.util.Map; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.dbflute.utflute.core.PlainTestCase; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -64,6 +66,19 @@ public void resetTestState() { public void testValidateInputStream(final InputStream in) { validateInputStream(in); } + + // Expose depth helpers for testing. + public int testGetCurrentDepth(final Map params) { + return getCurrentDepth(params); + } + + public Map testIncrementDepth(final Map params) { + return incrementDepth(params); + } + + public void testCheckDepth(final Map params, final int maxDepth) { + checkDepth(params, maxDepth); + } } private TestExtractor extractor; @@ -243,4 +258,85 @@ public void test_validateInputStream_throwsCorrectExceptionType() { fail(); } } + + /** Recursion-depth helper: missing/null params return 0. 
*/ + @Test + public void test_getCurrentDepth_returnsZeroForMissing() { + assertEquals(0, extractor.testGetCurrentDepth(null)); + assertEquals(0, extractor.testGetCurrentDepth(new HashMap<>())); + final Map blank = new HashMap<>(); + blank.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, ""); + assertEquals(0, extractor.testGetCurrentDepth(blank)); + final Map garbage = new HashMap<>(); + garbage.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "not-a-number"); + assertEquals(0, extractor.testGetCurrentDepth(garbage)); + } + + /** Recursion-depth helper: depth value is parsed and clamped to >= 0. */ + @Test + public void test_getCurrentDepth_parsesValidValue() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + assertEquals(3, extractor.testGetCurrentDepth(params)); + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "-5"); + assertEquals(0, extractor.testGetCurrentDepth(params)); + } + + /** incrementDepth must return a NEW map and not mutate the input. */ + @Test + public void test_incrementDepth_returnsNewMap() { + final Map original = new HashMap<>(); + original.put("foo", "bar"); + final Map next = extractor.testIncrementDepth(original); + + assertFalse(original == next); + // original is unchanged + assertFalse(original.containsKey(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + assertEquals("bar", next.get("foo")); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + + final Map after = extractor.testIncrementDepth(next); + assertEquals("2", after.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + // first map still says "1" + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** incrementDepth on null produces depth=1. */ + @Test + public void test_incrementDepth_nullInput() { + final Map next = extractor.testIncrementDepth(null); + assertNotNull(next); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** checkDepth allows depths below the limit. */ + @Test + public void test_checkDepth_belowLimit_passes() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + extractor.testCheckDepth(params, 10); // no throw + extractor.testCheckDepth(null, 10); + } + + /** checkDepth rejects depths at or above the limit. */ + @Test + public void test_checkDepth_atOrAboveLimit_throws() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion depth")); + } + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "99"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + // pass + } + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java new file mode 100644 index 00000000..39aeefc4 --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -0,0 +1,599 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.extractor.impl; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.extractor.ExtractorFactory; +import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; +import org.dbflute.utflute.core.PlainTestCase; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; + +/** + * Security-oriented tests that validate the archive-bomb / recursion / Zip + * Slip / link-skipping defences added to the Zip / Tar / Lha extractors. + * + *

+ * <p>
+ * Synthetic archives are constructed in-memory with Apache Commons Compress
+ * so the tests are fully self-contained.
+ * </p>
+ */ +public class ArchiveExtractorSecurityTest extends PlainTestCase { + + private ZipExtractor zipExtractor; + private TarExtractor tarExtractor; + private LhaExtractor lhaExtractor; + + @Override + protected void setUp(final TestInfo testInfo) throws Exception { + super.setUp(testInfo); + final StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("archiveStreamFactory", ArchiveStreamFactory.class) + .singleton("compressorStreamFactory", CompressorStreamFactory.class) + .singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + .singleton("textExtractor", TextExtractor.class) + .singleton("zipExtractor", ZipExtractor.class) + .singleton("tarExtractor", TarExtractor.class) + .singleton("lhaExtractor", LhaExtractor.class) + . singleton("extractorFactory", ExtractorFactory.class, factory -> { + final TextExtractor textExtractor = container.getComponent("textExtractor"); + final ZipExtractor zip = container.getComponent("zipExtractor"); + final TarExtractor tar = container.getComponent("tarExtractor"); + final LhaExtractor lha = container.getComponent("lhaExtractor"); + factory.addExtractor("text/plain", textExtractor); + factory.addExtractor("application/zip", zip); + factory.addExtractor("application/x-tar", tar); + factory.addExtractor("application/x-lha", lha); + }); + + zipExtractor = container.getComponent("zipExtractor"); + tarExtractor = container.getComponent("tarExtractor"); + lhaExtractor = container.getComponent("lhaExtractor"); + } + + // --------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------- + + private byte[] buildZip(final EntrySpec... specs) throws IOException { + return buildZipWithCharset(StandardCharsets.UTF_8, specs); + } + + private byte[] buildZipWithCharset(final Charset charset, final EntrySpec... specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + zos.setEncoding(charset.name()); + // Disable the UTF-8 flag so the encoding parameter is honoured by + // ZipArchiveInputStream during read. + zos.setUseLanguageEncodingFlag(false); + zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER); + for (final EntrySpec spec : specs) { + final ZipArchiveEntry entry = new ZipArchiveEntry(spec.name); + zos.putArchiveEntry(entry); + if (spec.content != null) { + zos.write(spec.content); + } + zos.closeArchiveEntry(); + } + zos.finish(); + } + return baos.toByteArray(); + } + + private byte[] buildTar(final TarEntrySpec... specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) { + tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + for (final TarEntrySpec spec : specs) { + final TarArchiveEntry entry; + if (spec.linkType != 0) { + entry = new TarArchiveEntry(spec.name, spec.linkType); + if (spec.linkName != null) { + entry.setLinkName(spec.linkName); + } + } else { + entry = new TarArchiveEntry(spec.name); + entry.setSize(spec.content == null ? 
0 : spec.content.length); + } + tos.putArchiveEntry(entry); + if (spec.linkType == 0 && spec.content != null) { + tos.write(spec.content); + } + tos.closeArchiveEntry(); + } + tos.finish(); + } + return baos.toByteArray(); + } + + private static final class EntrySpec { + final String name; + final byte[] content; + + EntrySpec(final String name, final byte[] content) { + this.name = name; + this.content = content; + } + } + + private static final class TarEntrySpec { + final String name; + final byte[] content; + final byte linkType; + final String linkName; + + TarEntrySpec(final String name, final byte[] content) { + this(name, content, (byte) 0, null); + } + + TarEntrySpec(final String name, final byte[] content, final byte linkType, final String linkName) { + this.name = name; + this.content = content; + this.linkType = linkType; + this.linkName = linkName; + } + } + + // --------------------------------------------------------------------- + // Zip — byte-limit bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload), new EntrySpec("c.txt", payload)); + + zipExtractor.setMaxBytes(64 * 1024); // exactly one entry's worth -> 2nd should fail + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip uncompressed size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — many-entry bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_entryLimit() throws Exception { + final EntrySpec[] specs = new EntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new EntrySpec("e" + i + ".txt", new byte[0]); + } + final byte[] data = buildZip(specs); + + zipExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip entry count exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — Zip Slip path traversal + // --------------------------------------------------------------------- + + @Test + public void test_zipSlip_pathTraversal() throws Exception { + final byte[] data = buildZip(new EntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + // Bad entry must be skipped; good entry must still be processed. 
+ assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_zipSlip_absolutePath() throws Exception { + final byte[] data = buildZip(new EntrySpec("/etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Recursion-depth bomb + // --------------------------------------------------------------------- + + @Test + public void test_recursionDepth_exceeded() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); // == default max + + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + @Test + public void test_recursionDepth_belowLimit_succeeds() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, params).getContent(); + assertTrue(content.contains("hello")); + } + // Original params must be unchanged. + assertEquals("3", params.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + // --------------------------------------------------------------------- + // CP932 / non-UTF-8 filename encoding + // --------------------------------------------------------------------- + + @Test + public void test_cp932Filename() throws Exception { + final Charset cp932; + try { + cp932 = Charset.forName("MS932"); + } catch (final Exception e) { + // CP932/MS932 not available on this JVM; skip. + return; + } + + final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8))); + + // Default UTF-8 encoding may mojibake the filename, but once we set + // CP932 the filename should round-trip cleanly. We assert by + // inspecting the entry list directly via the public API: setting the + // proper encoding allows the .txt suffix to be detected and the + // entry's content extracted. 
+ zipExtractor.setFilenameEncoding("MS932"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("japan")); + } + } + + // --------------------------------------------------------------------- + // Tar — symlink / hardlink entries are skipped + // --------------------------------------------------------------------- + + @Test + public void test_tar_symlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // Symlink target text must NOT leak into the output. + assertFalse(content.contains("/etc/passwd")); + } + } + + @Test + public void test_tar_hardlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_LINK, "ok.txt")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // The hardlink should not have introduced a duplicate of the + // referenced entry's content. + assertEquals(content.indexOf("regular"), content.lastIndexOf("regular")); + } + } + + @Test + public void test_tar_pathTraversal() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Compression-ratio bomb — produce a highly-compressible big entry + // --------------------------------------------------------------------- + + @Test + public void test_compressionRatioExceeded() throws Exception { + // 2 MiB of zeroes compresses extremely well, well above the 100:1 + // default threshold. Build the entry with explicit method/size/crc so + // the local file header carries the compressed size (otherwise a + // streaming DEFLATED entry uses a data descriptor, leaving + // ZipArchiveEntry#getCompressedSize() as -1 and bypassing the ratio + // check). 
+ final byte[] payload = new byte[2 * 1024 * 1024]; + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the byte cap so the compression-ratio check is the one that + // fires. + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("compression ratio") || e.getMessage().contains("uncompressed size")); + } + } + + // --------------------------------------------------------------------- + // Tar byte/entry limits + // --------------------------------------------------------------------- + + @Test + public void test_tarBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("a.txt", payload), new TarEntrySpec("b.txt", payload)); + + tarExtractor.setMaxBytes(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar uncompressed size exceeded")); + } + } + + @Test + public void test_tarBomb_entryLimit() throws Exception { + final TarEntrySpec[] specs = new TarEntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new TarEntrySpec("e" + i + ".txt", new byte[0]); + } + final byte[] data = buildTar(specs); + + tarExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar entry count exceeded")); + } + } + + @Test + public void test_tar_recursionDepth_exceeded() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8))); + final Map<String, String> params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + // --------------------------------------------------------------------- + // Per-entry size cap — guards against a single oversized entry + // 
--------------------------------------------------------------------- + + @Test + public void test_perEntryCapEnforced() throws Exception { + // Build a zip whose single SUPPORTED entry exceeds the configured + // per-entry cap. The extractor must trip the cap before buffering + // the whole payload. We use a small cap (1 MiB) and a payload twice + // that size (2 MiB) so the test stays cheap on parallel / + // low-memory CI. The extension is .txt so the entry routes through + // the registered text/plain extractor — only supported entries are + // buffered (and therefore can hit the per-entry memory cap). + final int perEntryCap = 1024 * 1024; + final int entrySize = 2 * perEntryCap; + final byte[] payload = new byte[entrySize]; + + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("big.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the total-size and ratio checks so only the per-entry cap + // can trigger. + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(perEntryCap); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Unsupported entries must NOT consume the per-entry / total caps — + // they are skipped without buffering so that supported entries + // alongside them still extract successfully (regression for PR #161 + // review feedback). + // --------------------------------------------------------------------- + + @Test + public void test_zip_unsupportedEntryDoesNotConsumeCaps() throws Exception { + // A "big.bin" payload that, were it to be buffered, would exceed + // both the per-entry cap and the total cap. Because no extractor is + // registered for application/octet-stream, "big.bin" is skipped + // without buffering, so the supported "ok.txt" alongside it must + // still extract. 
+ final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.bin", big), new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + zipExtractor.setMaxBytes(64 * 1024); // smaller than big.bin + zipExtractor.setMaxBytesPerEntry(64 * 1024); // also smaller + zipExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_tar_unsupportedEntryDoesNotConsumeCaps() throws Exception { + final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("big.bin", big), new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + tarExtractor.setMaxBytes(64 * 1024); + tarExtractor.setMaxBytesPerEntry(64 * 1024); + tarExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // maxContentSize is folded into the read budget — a small legacy cap + // must trip BEFORE the buffer grows to the much larger per-entry cap + // (regression for PR #161 review feedback). + // --------------------------------------------------------------------- + + @Test + public void test_zip_maxContentSize_capsBufferBeforePerEntryCap() throws Exception { + // 4 MiB supported entry; per-entry cap default is large; legacy + // maxContentSize is small. Without the fix the buffer would grow + // up to maxBytesPerEntry+1 before throwing. With the fix the read + // budget is bounded by maxContentSize+1 so buffering stops early. + final int legacyCap = 64 * 1024; + final byte[] payload = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.txt", payload)); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(8L * 1024L * 1024L); // intentionally larger than payload + zipExtractor.setMaxContentSize(legacyCap); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("Extracted size is")); + } + } + + // --------------------------------------------------------------------- + // Lha recursion-depth check (uses isPathTraversal helper too) + // --------------------------------------------------------------------- + + @Test + public void test_lha_recursionDepth_exceeded() { + final Map<String, String> params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + // We pass a tiny non-archive stream; the depth check fires before + // the LHA library is invoked. + try (InputStream in = new ByteArrayInputStream("dummy".getBytes(StandardCharsets.UTF_8))) { + lhaExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } catch (final IOException e) { + fail(); + } + } + + @Test + public void test_lha_maxInputBytes_capsStaging() { + // Stage cap is enforced during the temp-file copy, before LhaFile + // is opened. Any blob larger than the cap must be rejected — we use + // arbitrary bytes since the failure precedes archive parsing. 
+ lhaExtractor.setMaxInputBytes(1024L); + final byte[] payload = new byte[4 * 1024]; + try (InputStream in = new ByteArrayInputStream(payload)) { + lhaExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("input size exceeded")); + } catch (final IOException e) { + fail(); + } + } +}
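
A few hedged sketches follow for reviewers; they relate to the traversal, depth, encoding, ratio, and budget tests above but are not the actual fess-crawler implementation, and any helper name that does not appear in the patch is invented for illustration. First, one plausible, normalisation-based shape for the shared traversal check referenced by the Lha banner (isPathTraversal): it matches what the Zip Slip tests expect ("../../etc/passwd" and "/etc/passwd" rejected, "ok.txt" accepted); the use of java.nio.file.Path here is an assumption, not the real helper's code.

    // Sketch only: mirrors the behaviour the tests assert, not the real helper.
    static boolean isPathTraversalSketch(final String entryName) {
        if (entryName == null || entryName.isEmpty()) {
            return true; // nothing safe to extract
        }
        try {
            final java.nio.file.Path p = java.nio.file.Paths.get(entryName).normalize();
            if (p.getRoot() != null) {
                return true; // absolute paths such as "/etc/passwd"
            }
            for (final java.nio.file.Path segment : p) {
                if ("..".equals(segment.toString())) {
                    return true; // still escapes the root after normalisation
                }
            }
            return false; // plain relative names such as "ok.txt"
        } catch (final java.nio.file.InvalidPathException e) {
            return true; // undecodable names are treated as hostile
        }
    }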
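Next, the call pattern the recursion-depth tests pin down when an archive extractor hands a nested entry to another extractor. This is only a sketch of how checkDepth and incrementDepth are meant to be combined; the real ZipExtractor wiring may differ, and the class and method names below are hypothetical.

    // Sketch only: shows the intended call order around the depth helpers.
    class NestedDelegationSketch extends AbstractExtractor {
        @Override
        public ExtractData getText(final InputStream in, final Map<String, String> params) {
            throw new UnsupportedOperationException("sketch only");
        }

        ExtractData extractNestedEntry(final Extractor delegate, final InputStream entryIn,
                final Map<String, String> params) {
            // Refuse to recurse once the recorded depth reaches the limit; the
            // resulting MaxLengthExceededException is what the
            // *_recursionDepth_exceeded tests assert on.
            checkDepth(params, maxArchiveDepth);
            // Delegate with depth + 1; a fresh map is passed down, so the
            // caller's params stay untouched, as asserted by
            // test_recursionDepth_belowLimit_succeeds.
            return delegate.getText(entryIn, incrementDepth(params));
        }
    }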
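For the CP932 test, buildZipWithCharset is presumably defined earlier in this test class; for a reviewer who wants to reproduce the fixture locally, a minimal equivalent can be written with ZipArchiveOutputStream.setEncoding. This is a plausible reconstruction under that assumption, not the helper from the patch.

    // Sketch only: writes entry names with the given charset instead of UTF-8.
    static byte[] buildZipWithCharsetSketch(final Charset charset, final String name, final byte[] content)
            throws IOException {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
            zos.setEncoding(charset.name()); // e.g. "windows-31j" for MS932
            final ZipArchiveEntry entry = new ZipArchiveEntry(name);
            zos.putArchiveEntry(entry);
            zos.write(content);
            zos.closeArchiveEntry();
            zos.finish();
        }
        return baos.toByteArray();
    }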
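The compression-ratio and per-entry-cap fixtures preset method, size, and CRC so that the compressed size is visible to the reader. If a given commons-compress version still falls back to a data descriptor for a DEFLATED entry written through putArchiveEntry to a non-seekable stream, an alternative that is guaranteed to place the preset values in the local file header is addRawArchiveEntry fed with raw (nowrap) deflate data. This is offered only as a fallback sketch under that assumption, not as a claim that the existing fixtures are wrong.

    // Sketch only: pre-compress with a raw deflater, then add the entry as-is.
    static byte[] buildPresetSizeZip(final String name, final byte[] payload) throws IOException {
        // ZIP stores raw deflate, so the Deflater must run in nowrap mode.
        final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION, true);
        def.setInput(payload);
        def.finish();
        final ByteArrayOutputStream raw = new ByteArrayOutputStream();
        final byte[] buf = new byte[8192];
        while (!def.finished()) {
            raw.write(buf, 0, def.deflate(buf));
        }
        def.end();
        final java.util.zip.CRC32 crc = new java.util.zip.CRC32();
        crc.update(payload);

        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
            final ZipArchiveEntry entry = new ZipArchiveEntry(name);
            entry.setMethod(ZipArchiveEntry.DEFLATED);
            entry.setSize(payload.length);
            entry.setCompressedSize(raw.size());
            entry.setCrc(crc.getValue());
            // addRawArchiveEntry copies the already-compressed bytes verbatim and
            // writes size/compressedSize/crc straight into the local file header.
            zos.addRawArchiveEntry(entry, new ByteArrayInputStream(raw.toByteArray()));
            zos.finish();
        }
        return baos.toByteArray();
    }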
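Finally, the read-budget regression test only pins the behaviour from the outside; the arithmetic it implies is "effective per-entry budget = smallest enabled cap, read at most budget + 1 bytes". A sketch of that fold, with hypothetical names, assuming non-positive values mean "disabled" as the -1 setters in the tests suggest.

    // Sketch only: fold maxContentSize into the per-entry read budget.
    static long effectiveEntryBudget(final long maxBytesPerEntry, final long maxContentSize) {
        long budget = Long.MAX_VALUE;
        if (maxBytesPerEntry > 0) {
            budget = Math.min(budget, maxBytesPerEntry);
        }
        if (maxContentSize > 0) {
            budget = Math.min(budget, maxContentSize);
        }
        // Reading budget + 1 bytes is enough to prove the entry exceeds the cap
        // without buffering the rest of it.
        return budget == Long.MAX_VALUE ? Long.MAX_VALUE : budget + 1;
    }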