Skip to content

Commit 2e04af6

Browse files
copy_symlinks: allow extract to simulate symlinks with copies (#63)
The logic to figure out which symlinks can be created as copies and in what order is fairly bonkers, but this seems to do the trick.
1 parent 50812f1 commit 2e04af6

File tree

6 files changed

+260
-23
lines changed

6 files changed

+260
-23
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ version = "1.5.0"
55

66
[deps]
77
ArgTools = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
8+
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
89
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
910

1011
[compat]
@@ -13,8 +14,9 @@ julia = "1.3"
1314

1415
[extras]
1516
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
17+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1618
Tar_jll = "9b64493d-8859-5bf3-93d7-7c32dd38186f"
1719
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1820

1921
[targets]
20-
test = ["Test", "Pkg", "Tar_jll"]
22+
test = ["Pkg", "Random", "Tar_jll", "Test"]

README.md

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,14 @@ recreated. The `skeleton` and `predicate` arguments cannot be used together.
147147

148148
### Tar.extract
149149

150-
extract([ predicate, ] tarball, [ dir ]; [ skeleton ]) -> dir
150+
extract([ predicate, ] tarball, [ dir ];
151+
[ skeleton, ] [ copy_symlinks ]) -> dir
151152

152-
* `predicate :: Header --> Bool`
153-
* `tarball :: Union{AbstractString, IO}`
154-
* `dir :: AbstractString`
155-
* `skeleton :: Union{AbstractString, IO}`
153+
* `predicate :: Header --> Bool`
154+
* `tarball :: Union{AbstractString, AbstractCmd, IO}`
155+
* `dir :: AbstractString`
156+
* `skeleton :: Union{AbstractString, AbstractCmd, IO}`
157+
* `copy_symlinks :: Bool`
156158

157159
Extract a tar archive ("tarball") located at the path `tarball` into the
158160
directory `dir`. If `tarball` is an IO object instead of a path, then the
@@ -172,6 +174,14 @@ is written to the file or IO handle given. This skeleton file can be used to
172174
recreate an identical tarball by passing the `skeleton` keyword to the `create`
173175
function. The `skeleton` and `predicate` arguments cannot be used together.
174176

177+
If `copy_symlinks` is `true` then instead of extracting symbolic links as such,
178+
they will be extracted as copies of what they link to if they are internal to
179+
the tarball and if it is possible to do so. Non-internal symlinks, such as a
180+
link to `/etc/passwd`, will not be copied. Symlinks which are in any way cyclic
181+
will also not be copied and will instead be skipped. By default, `extract` will
182+
detect whether symlinks can be created in `dir` or not and will automatically
183+
copy symlinks if they cannot be created.
184+
175185
### Tar.list
176186

177187
list(tarball; [ strict = true ]) -> Vector{Header}

src/Tar.jl

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
module Tar
22

33
using ArgTools
4+
using Logging
5+
46
const true_predicate = _ -> true
57

68
# 2 MiB to take advantage of THP if enabled
@@ -17,6 +19,23 @@ function Base.skip(io::Union{Base.Process, Base.ProcessChain}, n::Integer)
1719
end
1820
const skip_buffer = UInt8[]
1921

22+
# Probe whether symbolic links can be created inside `dir`.
#
# Tries to create a throw-away symlink named "link" in `dir` and returns
# `true` if that succeeds, or `false` if `symlink` raises a `Base.IOError`.
# Any other exception is rethrown. The probe link is always removed again
# before returning.
function can_symlink(dir::AbstractString)
    # guaranteed to be an empty directory
    link_path = joinpath(dir, "link")
    try
        # Silence any log message emitted while probing by swapping in a
        # NullLogger for the duration of the call only. This is scoped to the
        # probe and, unlike `Logging.disable_logging`, leaves the process-wide
        # logging threshold untouched once we return.
        return Logging.with_logger(Logging.NullLogger()) do
            symlink("target", link_path)
            true
        end
    catch err
        err isa Base.IOError || rethrow()
        return false
    finally
        # clean up the probe link whether or not it was created
        rm(link_path, force=true)
    end
end
38+
2039
include("header.jl")
2140
include("create.jl")
2241
include("extract.jl")
@@ -134,12 +153,14 @@ function list(
134153
end
135154

136155
"""
137-
extract([ predicate, ] tarball, [ dir ]; [ skeleton ]) -> dir
156+
extract([ predicate, ] tarball, [ dir ];
157+
[ skeleton, ] [ copy_symlinks ]) -> dir
138158
139-
predicate :: Header --> Bool
140-
tarball :: Union{AbstractString, AbstractCmd, IO}
141-
dir :: AbstractString
142-
skeleton :: Union{AbstractString, AbstractCmd, IO}
159+
predicate :: Header --> Bool
160+
tarball :: Union{AbstractString, AbstractCmd, IO}
161+
dir :: AbstractString
162+
skeleton :: Union{AbstractString, AbstractCmd, IO}
163+
copy_symlinks :: Bool
143164
144165
Extract a tar archive ("tarball") located at the path `tarball` into the
145166
directory `dir`. If `tarball` is an IO object instead of a path, then the
@@ -158,12 +179,21 @@ If the `skeleton` keyword is passed then a "skeleton" of the extracted tarball
158179
is written to the file or IO handle given. This skeleton file can be used to
159180
recreate an identical tarball by passing the `skeleton` keyword to the `create`
160181
function. The `skeleton` and `predicate` arguments cannot be used together.
182+
183+
If `copy_symlinks` is `true` then instead of extracting symbolic links as such,
184+
they will be extracted as copies of what they link to if they are internal to
185+
the tarball and if it is possible to do so. Non-internal symlinks, such as a
186+
link to `/etc/passwd`, will not be copied. Symlinks which are in any way cyclic
187+
will also not be copied and will instead be skipped. By default, `extract` will
188+
detect whether symlinks can be created in `dir` or not and will automatically
189+
copy symlinks if they cannot be created.
161190
"""
162191
function extract(
163192
predicate::Function,
164193
tarball::ArgRead,
165194
dir::Union{AbstractString, Nothing} = nothing;
166195
skeleton::Union{ArgWrite, Nothing} = nothing,
196+
copy_symlinks::Union{Bool, Nothing} = nothing,
167197
)
168198
predicate === true_predicate || skeleton === nothing ||
169199
error("extract: predicate and skeleton cannot be used together")
@@ -172,8 +202,15 @@ function extract(
172202
check_extract_dir(dir)
173203
arg_read(tarball) do tar
174204
arg_mkdir(dir) do dir
205+
if copy_symlinks === nothing
206+
copy_symlinks = !can_symlink(dir)
207+
end
175208
arg_write(skeleton) do skeleton
176-
extract_tarball(predicate, tar, dir, skeleton=skeleton)
209+
extract_tarball(
210+
predicate, tar, dir,
211+
skeleton = skeleton,
212+
copy_symlinks = copy_symlinks,
213+
)
177214
end
178215
end
179216
end
@@ -183,8 +220,13 @@ function extract(
183220
tarball::ArgRead,
184221
dir::Union{AbstractString, Nothing} = nothing;
185222
skeleton::Union{ArgWrite, Nothing} = nothing,
223+
copy_symlinks::Union{Bool, Nothing} = nothing,
186224
)
187-
extract(true_predicate, tarball, dir, skeleton=skeleton)
225+
extract(
226+
true_predicate, tarball, dir,
227+
skeleton = skeleton,
228+
copy_symlinks = copy_symlinks,
229+
)
188230
end
189231

190232
"""

src/extract.jl

Lines changed: 93 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ function extract_tarball(
4343
root::String;
4444
buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
4545
skeleton::IO = devnull,
46+
copy_symlinks::Bool = false,
4647
)
47-
read_tarball(predicate, tar; buf=buf, skeleton=skeleton) do hdr, parts
48+
paths = read_tarball(predicate, tar; buf=buf, skeleton=skeleton) do hdr, parts
4849
# get the file system version of the path
4950
sys_path = reduce(joinpath, init=root, parts)
5051
# delete anything that's there already
@@ -60,7 +61,7 @@ function extract_tarball(
6061
if hdr.type == :directory
6162
mkdir(sys_path)
6263
elseif hdr.type == :symlink
63-
symlink(hdr.link, sys_path)
64+
copy_symlinks || symlink(hdr.link, sys_path)
6465
elseif hdr.type == :file
6566
read_data(tar, sys_path, size=hdr.size, buf=buf)
6667
# set executable bit if necessary
@@ -74,6 +75,89 @@ function extract_tarball(
7475
error("unsupported tarball entry type: $(hdr.type)")
7576
end
7677
end
78+
copy_symlinks || return
79+
80+
# resolve the internal targets of symlinks
81+
for (path, what) in paths
82+
what isa AbstractString || continue
83+
target = link_target(paths, path, what)
84+
paths[path] = something(target, :symlink)
85+
end
86+
87+
# follow chains of symlinks
88+
follow(seen::Vector, what::Symbol) =
89+
what == :symlink ? what : seen[end]
90+
follow(seen::Vector, what::String) =
91+
what in seen ? :symlink : follow(push!(seen, what), paths[what])
92+
for (path, what) in paths
93+
what isa AbstractString || continue
94+
paths[path] = follow([path], what)
95+
end
96+
97+
# copies that need to be made
98+
copies = Pair{String,String}[]
99+
for (path, what) in paths
100+
what isa AbstractString || continue
101+
push!(copies, path => what)
102+
end
103+
sort!(copies, by=last)
104+
105+
while !isempty(copies)
106+
i = 1
107+
while i ≤ length(copies)
108+
path, what = copies[i]
109+
# check if source is complete yet
110+
if any(startswith(p, "$what/") for (p, w) in copies)
111+
# `what` is an incomplete directory
112+
# need to wait for source to be complete
113+
i += 1
114+
else
115+
# source complete, can copy now
116+
deleteat!(copies, i)
117+
src = reduce(joinpath, init=root, split(what, '/'))
118+
dst = reduce(joinpath, init=root, split(path, '/'))
119+
cp(src, dst)
120+
end
121+
end
122+
end
123+
end
124+
125+
# resolve symlink target or nothing if not valid
126+
function link_target(
127+
paths::Dict{String,Union{String,Symbol}},
128+
path::AbstractString,
129+
link::AbstractString,
130+
)
131+
first(link) == '/' && return
132+
path_parts = split(path, r"/+")
133+
link_parts = split(link, r"/+")
134+
pop!(path_parts)
135+
part = nothing # remember the last part
136+
while !isempty(link_parts)
137+
part = popfirst!(link_parts)
138+
part in ("", ".") && continue
139+
if part == ".."
140+
isempty(path_parts) && return
141+
pop!(path_parts)
142+
else
143+
push!(path_parts, part)
144+
prefix = join(path_parts, '/')
145+
prefix in keys(paths) || return
146+
isempty(link_parts) && break
147+
what = paths[prefix]
148+
if what isa AbstractString
149+
prefix = link_target(paths, prefix, what)
150+
path_parts = split(prefix, '/')
151+
end
152+
end
153+
end
154+
isempty(path_parts) && return
155+
target = join(path_parts, '/')
156+
# if link ends in `/` or `.` target must be a directory
157+
part in ("", ".") && paths[target] != :directory && return
158+
# can't copy a circular link to a prefix of itself
159+
(path == target || startswith(path, "$target/")) && return
160+
return target
77161
end
78162

79163
function git_tree_hash(
@@ -210,8 +294,9 @@ function read_tarball(
210294
skeleton::IO = devnull,
211295
)
212296
write_skeleton_header(skeleton, buf=buf)
297+
# symbols for path types except symlinks store the link
298+
paths = Dict{String,Union{Symbol,String}}()
213299
globals = Dict{String,String}()
214-
links = Set{String}()
215300
while !eof(tar)
216301
hdr = read_header(tar, globals, buf=buf, tee=skeleton)
217302
hdr === nothing && break
@@ -226,18 +311,15 @@ function read_tarball(
226311
for part in split(hdr.path, '/')
227312
(isempty(part) || part == ".") && continue
228313
# check_header doesn't allow ".." in path
229-
path in links && error("""
314+
get(paths, path, nothing) isa String && error("""
230315
Refusing to extract path with symlink prefix, possible attack
316+
* path to extract: $(repr(hdr.path))
231317
* symlink prefix: $(repr(path))
232-
* extracted path: $(repr(hdr.path))
233318
""")
319+
isempty(path) || (paths[path] = :directory)
234320
path = isempty(path) ? part : "$path/$part"
235321
end
236-
if hdr.type == :symlink
237-
push!(links, path)
238-
else
239-
delete!(links, path)
240-
end
322+
paths[path] = hdr.type == :symlink ? hdr.link : hdr.type
241323
before = applicable(position, tar) ? position(tar) : 0
242324
callback(hdr, split(path, '/', keepempty=false))
243325
applicable(position, tar) || continue
@@ -246,6 +328,7 @@ function read_tarball(
246328
advanced == expected ||
247329
error("callback read $advanced bytes instead of $expected")
248330
end
331+
return paths
249332
end
250333

251334
function read_header(

0 commit comments

Comments
 (0)