Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SimpleBufferStream = "777ac1f9-54b0-4bf8-805c-2214025038e7"
Tar_jll = "9b64493d-8859-5bf3-93d7-7c32dd38186f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"

[targets]
test = ["Random", "SimpleBufferStream", "Tar_jll", "Test"]
test = ["Random", "SimpleBufferStream", "Tar_jll", "Test", "CodecZlib"]
12 changes: 10 additions & 2 deletions src/Tar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ end
tarball :: Union{AbstractString, AbstractCmd, IO}
algorithm :: AbstractString
skip_empty :: Bool
copy_symlinks :: Bool

Compute a tree hash value for the file tree that the tarball contains. By
default, this uses git's tree hashing algorithm with the SHA1 secure hash
Expand Down Expand Up @@ -389,21 +390,26 @@ hash, the hash value that you get will match the hash value computed by
are hashing trees that may contain empty directories (i.e. do not come from a
git repo), however, it is recommended that you hash them using a tool (such as
this one) that does not ignore empty directories.

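If `copy_symlinks` is true, symlinks in the tarball whose targets are also in
the tarball are followed and the hashes of their targets are copied in their
place; symlinks whose targets cannot be resolved within the tarball are left
out of the hashed tree. This is useful for checking what the hash would be
when using `Tar.extract` with `copy_symlinks = true`.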
"""
function tree_hash(
predicate::Function,
tarball::ArgRead;
algorithm::AbstractString = "git-sha1",
skip_empty::Bool = false,
copy_symlinks::Bool = false,
)
check_tree_hash_tarball(tarball)
if algorithm == "git-sha1"
return arg_read(tarball) do tar
git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty)
git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty, copy_symlinks)
end
elseif algorithm == "git-sha256"
return arg_read(tarball) do tar
git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty)
git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty, copy_symlinks)
end
else
error("invalid tree hashing algorithm: $algorithm")
Expand All @@ -414,12 +420,14 @@ function tree_hash(
tarball::ArgRead;
algorithm::AbstractString = "git-sha1",
skip_empty::Bool = false,
copy_symlinks::Bool = false,
)
tree_hash(
true_predicate,
tarball,
algorithm = algorithm,
skip_empty = skip_empty,
copy_symlinks = copy_symlinks
)
end

Expand Down
67 changes: 62 additions & 5 deletions src/extract.jl
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,13 @@ function git_tree_hash(
predicate::Function,
tar::IO,
::Type{HashType},
skip_empty::Bool;
skip_empty::Bool,
copy_symlinks::Bool = false;
buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
) where HashType <: SHA.SHA_CTX
# build tree with leaves for files and symlinks
tree = Dict{String,Any}()
read_tarball(predicate, tar; buf=buf) do hdr, parts
paths = read_tarball(predicate, tar; buf=buf) do hdr, parts
isempty(parts) && return
name = pop!(parts)
node = tree
Expand All @@ -229,9 +230,14 @@ function git_tree_hash(
end
return
elseif hdr.type == :symlink
mode = "120000"
hash = git_object_hash("blob", HashType) do io
write(io, hdr.link)
if copy_symlinks
mode = "120000"
hash = hdr.link
else
mode = "120000"
hash = git_object_hash("blob", HashType) do io
write(io, hdr.link)
end
end
elseif hdr.type == :hardlink
mode = iszero(hdr.mode & 0o100) ? "100644" : "100755"
Expand All @@ -249,6 +255,57 @@ function git_tree_hash(
node[name] = (mode, hash)
end

if copy_symlinks
# resolve the internal targets of symlinks
for (path, what) in paths
what isa String || continue
target = link_target(paths, path, what)
paths[path] = something(target, :symlink)
end

for (path, what) in paths
what isa AbstractString || continue
paths[path] = follow_symlink_chain([path], what, paths)
end

# use paths to index into the tree
# Return the node stored at `path` in `tree`, descending one level of the
# nested dictionaries per component of `splitpath(path)`.
function get_tree_index(tree::Dict, path::AbstractString)
    return foldl(getindex, splitpath(path); init=tree)
end
# Store `value` at `path` in `tree`: every path component except the last is
# used to descend through the nested dictionaries, and the last component is
# the key assigned in the dictionary reached. Returns `value`.
function set_tree_index!(tree::Dict, value, path::AbstractString)
    # `local` keeps these names from binding to same-named variables of an
    # enclosing function when this helper is defined as a closure.
    local components = splitpath(path)
    local parent = foldl(getindex, components[1:end-1]; init=tree)
    parent[components[end]] = value
    return value
end
# Remove the entry at `path` from `tree`: all but the last path component are
# followed through the nested dictionaries, and the last component is deleted
# from the dictionary reached. Returns that dictionary (the result of
# `delete!`).
function prune_tree_index!(tree::Dict, path::AbstractString)
    # `local` keeps these names from binding to same-named variables of an
    # enclosing function when this helper is defined as a closure.
    local components = splitpath(path)
    local parent = foldl(getindex, components[1:end-1]; init=tree)
    return delete!(parent, components[end])
end

# copy hashes
for (path, what) in paths
if what isa AbstractString
what_hash = get_tree_index(tree, what)
set_tree_index!(tree, what_hash, path)
elseif what == :symlink
# external symlink
prune_tree_index!(tree, path)
end
end
end

# prune directories that don't contain any files
if skip_empty
prune_empty!(node::Tuple) = true
Expand Down
Binary file added test/data/iso_codes.v4.11.0.any.tar.gz
Binary file not shown.
14 changes: 14 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,25 @@ end
@testset "Tar.tree_hash" begin
    # Exercise `Tar.tree_hash` through every reader that `arg_readers`
    # supplies for the fixture `tarball` (both defined outside this testset).
    arg_readers(tarball) do tar
        @arg_test tar @test Tar.tree_hash(tar) == hash
        # Copying symlink targets is expected to change the hash; this presumes
        # the fixture tarball contains symlinks — TODO confirm against its setup.
        @arg_test tar @test Tar.tree_hash(tar; copy_symlinks=true) != hash
        # A predicate that rejects every entry must give the empty-tree hash,
        # with or without `copy_symlinks`, for each supported algorithm.
        @arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar)
        @arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar; copy_symlinks=true)
        @arg_test tar @test empty_tree_sha1 ==
            Tar.tree_hash(hdr->false, tar, algorithm="git-sha1")
        @arg_test tar @test empty_tree_sha1 ==
            Tar.tree_hash(hdr->false, tar, algorithm="git-sha1", copy_symlinks=true)
        @arg_test tar @test empty_tree_sha256 ==
            Tar.tree_hash(hdr->false, tar, algorithm="git-sha256")
        @arg_test tar @test empty_tree_sha256 ==
            Tar.tree_hash(hdr->false, tar, algorithm="git-sha256", copy_symlinks=true)
    end
    if NON_STDLIB_TESTS
        # Resolve the fixture relative to this file rather than the current
        # working directory, so the test also passes when run from elsewhere.
        iso_codes = joinpath(@__DIR__, "data", "iso_codes.v4.11.0.any.tar.gz")
        # The stream is consumed by hashing, so it is reopened for each check.
        open(GzipDecompressorStream, iso_codes) do io
            @test Tar.tree_hash(io) == "71f68a3d55d73f2e15a3969c241fae2349b1feb5"
        end
        open(GzipDecompressorStream, iso_codes) do io
            @test Tar.tree_hash(io; copy_symlinks=true) == "409d6ac4c02dae43ff4fe576b5c5820d0386fb3f"
        end
    end
end
@testset "Tar.list & check properties" begin
Expand Down
1 change: 1 addition & 0 deletions test/setup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const NON_STDLIB_TESTS = Main == @__MODULE__

if NON_STDLIB_TESTS
using SimpleBufferStream
using CodecZlib

using Tar_jll
if isdefined(Tar_jll, :tar)
Expand Down