@@ -18,10 +18,18 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.codelibs.fess.crawler.container.CrawlerContainer;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;
@@ -48,20 +56,104 @@
*/
public abstract class AbstractExtractor implements Extractor {

/**
* Parameter key used to track the recursion depth across nested archive
* extraction. Callers and recursive extractor invocations may set it to
* limit how deeply nested archives are unpacked. The value is parsed as an
* integer; missing, negative, or unparseable values are treated as depth 0.
*/
public static final String EXTRACTOR_DEPTH_KEY = "extractorDepth";

/** The crawler container. */
@Resource
protected CrawlerContainer crawlerContainer;

/** The weight of this extractor. */
protected int weight = 1;

/**
* Maximum allowed depth for recursive archive extraction. When the depth
* value parsed from {@link #EXTRACTOR_DEPTH_KEY} reaches this threshold,
* {@link #checkDepth(Map, int)} aborts further recursion to defend
* against recursion-bomb archives.
*/
protected int maxArchiveDepth = 10;

/**
* Constructs a new AbstractExtractor.
*/
public AbstractExtractor() {
// NOP
}

/**
* Sets the maximum allowed recursion depth for nested archive extraction.
* @param maxArchiveDepth the new maximum depth (non-negative)
*/
public void setMaxArchiveDepth(final int maxArchiveDepth) {
this.maxArchiveDepth = maxArchiveDepth;
}

/**
* Returns the current recursion depth recorded in the extractor params.
* Missing, blank, negative, or unparseable values are treated as {@code 0}.
*
* @param params the extractor parameters (may be {@code null})
* @return the parsed depth, or {@code 0} if not set
*/
protected int getCurrentDepth(final Map<String, String> params) {
if (params == null) {
return 0;
}
final String value = params.get(EXTRACTOR_DEPTH_KEY);
if (value == null || value.isBlank()) {
return 0;
}
try {
final int depth = Integer.parseInt(value.trim());
return depth < 0 ? 0 : depth;
} catch (final NumberFormatException e) {
return 0;
}
}

/**
* Returns a NEW parameter map (the original is not mutated) with the
* recursion depth incremented by one. Useful when an archive extractor
* recursively delegates to another extractor for a nested archive entry.
*
* @param params the current extractor parameters (may be {@code null})
* @return a new map containing all original entries plus an incremented
* depth
*/
protected Map<String, String> incrementDepth(final Map<String, String> params) {
final Map<String, String> next = new HashMap<>();
if (params != null) {
next.putAll(params);
}
next.put(EXTRACTOR_DEPTH_KEY, Integer.toString(getCurrentDepth(params) + 1));
return next;
}

/**
* Validates that the recursion depth recorded in {@code params} does not
* meet or exceed {@code maxDepth}. Throws {@link MaxLengthExceededException}
* (a {@link org.codelibs.fess.crawler.exception.CrawlingAccessException
* CrawlingAccessException}) when the threshold is reached so that the
* surrounding crawler treats it as a data-driven access failure rather
* than a system error.
*
* @param params the extractor parameters (may be {@code null})
* @param maxDepth the (exclusive) maximum allowed depth
* @throws MaxLengthExceededException when {@code currentDepth >= maxDepth}
*/
protected void checkDepth(final Map<String, String> params, final int maxDepth) {
final int current = getCurrentDepth(params);
if (current >= maxDepth) {
throw new MaxLengthExceededException("Archive recursion depth exceeded: depth=" + current + " max=" + maxDepth);
}
}
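
/*
 * Usage sketch (illustrative, not part of this change): a nested-archive
 * extractor could combine the three depth helpers like this before
 * recursing into an inner entry. It assumes the Extractor#getText(InputStream, Map)
 * contract of this interface; the delegate field "innerExtractor" and the
 * method name "extractNestedEntry" are hypothetical.
 *
 *     protected ExtractData extractNestedEntry(final InputStream entryIn, final Map<String, String> params) {
 *         checkDepth(params, maxArchiveDepth);                            // abort on recursion-bomb archives
 *         final Map<String, String> childParams = incrementDepth(params); // depth + 1 for the nested archive
 *         return innerExtractor.getText(entryIn, childParams);           // delegate with the bumped depth
 *     }
 */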

@Override
public int getWeight() {
return weight;
@@ -142,4 +234,77 @@ protected void validateInputStream(final InputStream in) {
throw new CrawlerSystemException("The inputstream is null.");
}
}

/**
* Returns true when the supplied entry name escapes the conceptual
* extraction root via path-traversal segments. The check is performed on
* a normalised form of the path and is shared between the archive
* extractors (Zip / Tar / Lha) so the rejection rules stay in lock step.
*
* <p>
* An entry is rejected when it is null/empty, when it is rooted at
* {@code /} or {@code \}, when its second character is a colon (a
* Windows drive-letter style prefix such as {@code C:}), when its
* normalised form contains a {@code ..}
* segment, or when {@link Paths#get} treats it as malformed.
* </p>
*
* @param name the entry name as reported by the archive
* @return {@code true} if the name should be rejected
*/
protected static boolean isPathTraversal(final String name) {
if (name == null || name.isEmpty()) {
return true;
}
// Absolute paths (Unix or Windows-style) are unsafe in the
// context of an archive extracted into a sandbox root.
if (name.startsWith("/") || name.startsWith("\\")) {
return true;
}
if (name.length() >= 2 && name.charAt(1) == ':') {
return true;
}
try {
final Path normalised = Paths.get(name).normalize();
final String normStr = normalised.toString().replace('\\', '/');
if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
return true;
}
for (final Path part : normalised) {
if ("..".equals(part.toString())) {
return true;
}
}
} catch (final InvalidPathException ipe) {
return true;
}
return false;
}
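
/*
 * Usage sketch (illustrative, assuming Apache Commons Compress's
 * ArchiveInputStream is used to read the archive; "archiveIn" is a
 * hypothetical local variable): unsafe entry names are skipped before any
 * bytes of the entry are processed.
 *
 *     ArchiveEntry entry;
 *     while ((entry = archiveIn.getNextEntry()) != null) {
 *         if (isPathTraversal(entry.getName())) {
 *             continue; // e.g. "../../etc/passwd", "/etc/shadow", "C:\\boot.ini"
 *         }
 *         // ... extract text from the safe entry ...
 *     }
 */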

/**
* Copies up to {@code limit} bytes from {@code in} to {@code out}, returning
* the actual number of bytes copied. Used by archive extractors to bound
* the amount of memory consumed when buffering an entry's uncompressed
* payload.
*
* @param in the source stream
* @param out the sink stream
* @param limit the maximum number of bytes to copy (inclusive). Values
* {@code <= 0} cause the method to return without reading.
* @return the number of bytes actually copied
* @throws IOException if reading from {@code in} or writing to {@code out}
* fails
*/
protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException {
if (limit <= 0) {
return 0;
}
final byte[] buffer = new byte[8192];
long total = 0;
int read;
while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
out.write(buffer, 0, read);
total += read;
}
return total;
}
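
/*
 * Usage sketch (illustrative; "maxEntrySize" and "entryIn" are hypothetical,
 * not members of this class): buffer at most maxEntrySize bytes of an entry
 * into memory and detect whether the payload was cut off at the cap.
 *
 *     final java.io.ByteArrayOutputStream buf = new java.io.ByteArrayOutputStream();
 *     final long copied = copyBounded(entryIn, buf, maxEntrySize);
 *     if (copied >= maxEntrySize && entryIn.read() != -1) {
 *         // more data remained beyond the cap; treat the entry as oversized/truncated
 *     }
 *     final byte[] payload = buf.toByteArray();
 */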
}