Skip to content

Commit 0928da7

Browse files
committed
Delay SubString->String conversion
This has a surprisingly large benefit for performance
1 parent 0d219c0 commit 0928da7

File tree

2 files changed

+19
-15
lines changed

2 files changed

+19
-15
lines changed

src/mmcif.jl

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -75,16 +75,20 @@ Call `MMCIFDict` with a filepath or stream to read the dictionary from that
7575
source.
7676
The keyword argument `gzip` (default `false`) specifies whether the input is gzipped and must be decompressed while reading.
7777
"""
78-
struct MMCIFDict <: AbstractDict{String, Vector{String}}
79-
dict::Dict{String, Vector{String}}
78+
struct MMCIFDict{K<:AbstractString} <: AbstractDict{K, Vector{K}}
79+
dict::Dict{K, Vector{K}}
8080
end
8181

82-
MMCIFDict() = MMCIFDict(Dict())
82+
MMCIFDict{K}() where K<:AbstractString = MMCIFDict{K}(Dict{K,Vector{K}}())
83+
MMCIFDict() = MMCIFDict{String}()
84+
85+
MMCIFDict(d::AbstractDict{K, Vector{K}}) where K<:AbstractString = MMCIFDict{K}(d)
86+
MMCIFDict(d::AbstractDict) = MMCIFDict{String}(Dict(d))
8387

8488
Base.getindex(mmcif_dict::MMCIFDict, field::AbstractString) = mmcif_dict.dict[field]
8589

8690
function Base.setindex!(mmcif_dict::MMCIFDict,
87-
val::AbstractVector{<:String},
91+
val::AbstractVector{<:AbstractString},
8892
field::AbstractString)
8993
mmcif_dict.dict[field] = val
9094
return mmcif_dict
@@ -147,7 +151,7 @@ splitline(s::AbstractString) = splitline!(String[], s) # mostly for testing
147151

148152
# Get tokens from a mmCIF file
149153
function tokenizecif(f::IO)
150-
tokens = String[]
154+
tokens = SubString{String}[]
151155
for line in eachline(f)
152156
if startswith(line, "#")
153157
continue
@@ -172,7 +176,7 @@ end
172176
# This will fail if there is only a single atom record in the file
173177
# and it is not in the loop format
174178
function tokenizecifstructure(f::IO)
175-
tokens = String[]
179+
tokens = SubString{String}[]
176180
reading = false
177181
in_keys = true
178182
category_groups = ["_atom_site.", "_struct_conf."]
@@ -218,14 +222,14 @@ end
218222

219223
# Read a mmCIF file into a MMCIFDict
220224
function MMCIFDict(f::IO; gzip::Bool=false)
221-
mmcif_dict = MMCIFDict()
222225
if gzip
223226
gz = GzipDecompressorStream(f)
224227
tokens = tokenizecif(gz)
225228
close(gz)
226229
else
227230
tokens = tokenizecif(f)
228231
end
232+
mmcif_dict = MMCIFDict{eltype(tokens)}()
229233
# Data label token is read first
230234
if length(tokens) == 0
231235
return mmcif_dict
@@ -236,16 +240,16 @@ function MMCIFDict(f::IO; gzip::Bool=false)
236240
end
237241

238242
# Add tokens to a mmCIF dictionary
239-
function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString})
243+
function populatedict!(mmcif_dict::MMCIFDict{K}, tokens::AbstractVector{<:AbstractString}) where K<:AbstractString
240244
key = ""
241-
keys = String[]
245+
keys = K[]
242246
loop_flag = false
243247
i = 0 # Value counter
244248
n = 0 # Key counter
245249
for token in tokens
246250
if token == "loop_" || token == "LOOP_"
247251
loop_flag = true
248-
keys = String[]
252+
keys = K[]
249253
i = 0
250254
n = 0
251255
continue
@@ -258,7 +262,7 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractS
258262
if i > 0
259263
loop_flag = false
260264
else
261-
mmcif_dict[token] = String[]
265+
mmcif_dict[token] = K[]
262266
push!(keys, token)
263267
n += 1
264268
continue
@@ -290,14 +294,14 @@ function Base.read(input::IO,
290294
run_dssp::Bool=false,
291295
run_stride::Bool=false,
292296
gzip::Bool=false)
293-
mmcif_dict = MMCIFDict()
294297
if gzip
295298
gz = GzipDecompressorStream(input)
296299
tokens = tokenizecifstructure(gz)
297300
close(gz)
298301
else
299302
tokens = tokenizecifstructure(input)
300303
end
304+
mmcif_dict = MMCIFDict{eltype(tokens)}()
301305
populatedict!(mmcif_dict, tokens)
302306
return MolecularStructure(
303307
mmcif_dict;
@@ -673,13 +677,13 @@ end
673677
Write multiple `MMCIFDict`s, given as a `Dict{String, <:MMCIFDict}`, to a filepath or stream.
674678
The keyword argument `gzip` (default `false`) specifies whether the output should be gzipped.
675679
"""
676-
function writemultimmcif(filepath::AbstractString, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
680+
function writemultimmcif(filepath::AbstractString, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
677681
open(filepath, "w") do f
678682
writemultimmcif(f, cifs; gzip=gzip)
679683
end
680684
end
681685

682-
function writemultimmcif(io::IO, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
686+
function writemultimmcif(io::IO, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
683687
if gzip
684688
io = GzipCompressorStream(io)
685689
end

test/runtests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,7 @@ end
17831783
mmcif_1ake = testfilepath("mmCIF", "1AKE.cif")
17841784
gzip_file(mmcif_1ake, temp_filename)
17851785
for dic in (MMCIFDict(mmcif_1ake), MMCIFDict(temp_filename; gzip=true))
1786-
@test isa(dic.dict, Dict{String, Vector{String}})
1786+
@test isa(dic.dict, Dict{K, Vector{K}} where K<:AbstractString)
17871787
@test dic["_pdbx_database_status.recvd_initial_deposition_date"] == ["1991-11-08"]
17881788
@test dic["_audit_author.name"] == ["Mueller, C.W.", "Schulz, G.E."]
17891789
@test length(dic["_atom_site.group_PDB"]) == 3816

0 commit comments

Comments
 (0)