Skip to content

Improve read perf, store coords as SVector #70

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
Expand Down Expand Up @@ -50,5 +51,6 @@ MetaGraphs = "0.7, 0.8"
PrecompileTools = "1"
RecipesBase = "1"
STRIDE_jll = "1"
StaticArraysCore = "1"
Statistics = "1.9"
julia = "1.9"
1 change: 1 addition & 0 deletions src/BioStructures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ using PrecompileTools
using RecipesBase

using LinearAlgebra
using StaticArraysCore
using Statistics

include("model.jl")
Expand Down
105 changes: 55 additions & 50 deletions src/mmcif.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,20 @@
source.
The keyword argument `gzip` (default `false`) determines if the input is gzipped.
"""
struct MMCIFDict <: AbstractDict{String, Vector{String}}
dict::Dict{String, Vector{String}}
struct MMCIFDict{K<:AbstractString} <: AbstractDict{K, Vector{K}}
dict::Dict{K, Vector{K}}
end

MMCIFDict() = MMCIFDict(Dict())
MMCIFDict{K}() where K<:AbstractString = MMCIFDict{K}(Dict{K,Vector{K}}())
MMCIFDict() = MMCIFDict{String}()

MMCIFDict(d::AbstractDict{K, Vector{K}}) where K<:AbstractString = MMCIFDict{K}(d)

Check warning on line 85 in src/mmcif.jl

View check run for this annotation

Codecov / codecov/patch

src/mmcif.jl#L85

Added line #L85 was not covered by tests
MMCIFDict(d::AbstractDict) = MMCIFDict{String}(Dict(d))

Base.getindex(mmcif_dict::MMCIFDict, field::AbstractString) = mmcif_dict.dict[field]

function Base.setindex!(mmcif_dict::MMCIFDict,
val::AbstractVector{<:String},
val::AbstractVector{<:AbstractString},
field::AbstractString)
mmcif_dict.dict[field] = val
return mmcif_dict
Expand All @@ -104,8 +108,7 @@

# Split a mmCIF line into tokens
# See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax
function splitline(s::AbstractString)
tokens = String[]
function splitline!(tokens, s::AbstractString)
in_token = false
# Quote character of the currently open quote, or ' ' if no quote open
quote_open_char = ' '
Expand All @@ -114,7 +117,7 @@
if c in whitespacechars
if in_token && quote_open_char == ' '
in_token = false
push!(tokens, s[start_i:(i - 1)])
push!(tokens, @view(s[start_i:(i - 1)]))
end
elseif c in quotechars
if quote_open_char == ' '
Expand All @@ -127,7 +130,7 @@
elseif c == quote_open_char && (i == length(s) || s[i + 1] in whitespacechars)
quote_open_char = ' '
in_token = false
push!(tokens, s[start_i:(i - 1)])
push!(tokens, @view(s[start_i:(i - 1)]))
end
elseif c == '#' && !in_token
return tokens
Expand All @@ -137,17 +140,18 @@
end
end
if in_token
push!(tokens, s[start_i:end])
push!(tokens, @view(s[start_i:end]))
end
if quote_open_char != ' '
throw(ArgumentError("Line ended with quote open: $s"))
end
return tokens
end
splitline(s::AbstractString) = splitline!(String[], s) # mostly for testing

# Get tokens from a mmCIF file
function tokenizecif(f::IO)
tokens = String[]
tokens = SubString{String}[]
for line in eachline(f)
if startswith(line, "#")
continue
Expand All @@ -162,7 +166,7 @@
end
push!(tokens, join(token_buffer, "\n"))
else
append!(tokens, splitline(line))
splitline!(tokens, line)
end
end
return tokens
Expand All @@ -172,7 +176,7 @@
# This will fail if there is only a single atom record in the file
# and it is not in the loop format
function tokenizecifstructure(f::IO)
tokens = String[]
tokens = SubString{String}[]
reading = false
in_keys = true
category_groups = ["_atom_site.", "_struct_conf."]
Expand Down Expand Up @@ -204,7 +208,7 @@
in_keys = true
else
in_keys = false
append!(tokens, splitline(line))
splitline!(tokens, line)
end
end
return tokens
Expand All @@ -218,14 +222,14 @@

# Read a mmCIF file into a MMCIFDict
function MMCIFDict(f::IO; gzip::Bool=false)
mmcif_dict = MMCIFDict()
if gzip
gz = GzipDecompressorStream(f)
tokens = tokenizecif(gz)
close(gz)
else
tokens = tokenizecif(f)
end
mmcif_dict = MMCIFDict{eltype(tokens)}()
# Data label token is read first
if length(tokens) == 0
return mmcif_dict
Expand All @@ -236,16 +240,16 @@
end

# Add tokens to a mmCIF dictionary
function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
function populatedict!(mmcif_dict::MMCIFDict{K}, tokens::AbstractVector{<:AbstractString}) where K<:AbstractString
key = ""
keys = String[]
keys = K[]
loop_flag = false
i = 0 # Value counter
n = 0 # Key counter
for token in tokens
if token == "loop_" || token == "LOOP_"
loop_flag = true
keys = String[]
keys = K[]
i = 0
n = 0
continue
Expand All @@ -258,22 +262,14 @@
if i > 0
loop_flag = false
else
mmcif_dict[token] = String[]
mmcif_dict[token] = K[]
push!(keys, token)
n += 1
continue
end
else
try
push!(mmcif_dict[keys[i % n + 1]], token)
catch ex
# A zero division error means we have not found any keys
if isa(ex, DivideError)
throw(ArgumentError("Loop keys not found, token: \"$token\""))
else
rethrow()
end
end
iszero(n) && throw(ArgumentError("Loop keys not found, token: \"$token\""))
push!(mmcif_dict[keys[i % n + 1]], token)
i += 1
continue
end
Expand All @@ -298,14 +294,14 @@
run_dssp::Bool=false,
run_stride::Bool=false,
gzip::Bool=false)
mmcif_dict = MMCIFDict()
if gzip
gz = GzipDecompressorStream(input)
tokens = tokenizecifstructure(gz)
close(gz)
else
tokens = tokenizecifstructure(input)
end
mmcif_dict = MMCIFDict{eltype(tokens)}()
populatedict!(mmcif_dict, tokens)
return MolecularStructure(
mmcif_dict;
Expand Down Expand Up @@ -384,25 +380,34 @@
end

# Constructor from mmCIF ATOM/HETATM line
AtomRecord(d::MMCIFDict, i::Integer) = AtomRecord(
d["_atom_site.group_PDB"][i] == "HETATM",
parse(Int, d["_atom_site.id"][i]),
d["_atom_site.auth_atom_id"][i],
d["_atom_site.label_alt_id"][i] in missingvals ? ' ' : d["_atom_site.label_alt_id"][i][1],
d["_atom_site.auth_comp_id"][i],
d["_atom_site.auth_asym_id"][i],
parse(Int, d["_atom_site.auth_seq_id"][i]),
d["_atom_site.pdbx_PDB_ins_code"][i] in missingvals ? ' ' : d["_atom_site.pdbx_PDB_ins_code"][i][1],
[
parse(Float64, d["_atom_site.Cartn_x"][i]),
parse(Float64, d["_atom_site.Cartn_y"][i]),
parse(Float64, d["_atom_site.Cartn_z"][i])
],
d["_atom_site.occupancy"][i] in missingvals ? 1.0 : parse(Float64, d["_atom_site.occupancy"][i]),
d["_atom_site.B_iso_or_equiv"][i] in missingvals ? 0.0 : parse(Float64, d["_atom_site.B_iso_or_equiv"][i]),
d["_atom_site.type_symbol"][i] in missingvals ? " " : d["_atom_site.type_symbol"][i],
d["_atom_site.pdbx_formal_charge"][i] in missingvals ? " " : d["_atom_site.pdbx_formal_charge"][i],
)
function AtomRecord(d::MMCIFDict, i::Integer)
alt_id = d["_atom_site.label_alt_id"][i]
ins_code = d["_atom_site.pdbx_PDB_ins_code"][i]
occupancy = d["_atom_site.occupancy"][i]
temp_factor = d["_atom_site.B_iso_or_equiv"][i]
typesym = d["_atom_site.type_symbol"][i]
charge = d["_atom_site.pdbx_formal_charge"][i]

return AtomRecord(
d["_atom_site.group_PDB"][i] == "HETATM",
parse(Int, d["_atom_site.id"][i]),
d["_atom_site.auth_atom_id"][i],
alt_id in missingvals ? ' ' : alt_id[1],
d["_atom_site.auth_comp_id"][i],
d["_atom_site.auth_asym_id"][i],
parse(Int, d["_atom_site.auth_seq_id"][i]),
ins_code in missingvals ? ' ' : ins_code[1],
SVector{3,Float64}((
parse(Float64, d["_atom_site.Cartn_x"][i]),
parse(Float64, d["_atom_site.Cartn_y"][i]),
parse(Float64, d["_atom_site.Cartn_z"][i]),
)),
occupancy in missingvals ? 1.0 : parse(Float64, occupancy),
temp_factor in missingvals ? 0.0 : parse(Float64, temp_factor),
typesym in missingvals ? " " : typesym,
charge in missingvals ? " " : charge,
)
end

# Format a mmCIF data value by enclosing with quotes or semicolon lines where
# appropriate. See
Expand Down Expand Up @@ -672,13 +677,13 @@
Write multiple `MMCIFDict`s as a `Dict{String, MMCIFDict}` to a filepath or stream.
The keyword argument `gzip` (default `false`) determines if the output is gzipped.
"""
function writemultimmcif(filepath::AbstractString, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
function writemultimmcif(filepath::AbstractString, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
open(filepath, "w") do f
writemultimmcif(f, cifs; gzip=gzip)
end
end

function writemultimmcif(io::IO, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
function writemultimmcif(io::IO, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
if gzip
io = GzipCompressorStream(io)
end
Expand Down
41 changes: 20 additions & 21 deletions src/model.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,20 +102,20 @@ end
Base.showerror(io::IO, e::PDBConsistencyError) = print(io, "PDBConsistencyError: ", e.message)

"An atom that is part of a macromolecule."
struct Atom <: AbstractAtom
serial::Int
name::String
alt_loc_id::Char
coords::Vector{Float64}
occupancy::Float64
temp_factor::Float64
element::String
charge::String
residue::StructuralElement
mutable struct Atom <: AbstractAtom
const serial::Int
const name::String
const alt_loc_id::Char
coords::SVector{3,Float64}
const occupancy::Float64
const temp_factor::Float64
const element::String
const charge::String
const residue::StructuralElement
end

function Atom(a::Atom, r::StructuralElement)
return Atom(a.serial, a.name, a.alt_loc_id, copy(a.coords), a.occupancy,
return Atom(a.serial, a.name, a.alt_loc_id, a.coords, a.occupancy,
a.temp_factor, a.element, a.charge, r)
end

Expand Down Expand Up @@ -234,7 +234,7 @@ struct AtomRecord
chain_id::String
res_number::Int
ins_code::Char
coords::Vector{Float64}
coords::SVector{3,Float64}
occupancy::Float64
temp_factor::Float64
element::String
Expand Down Expand Up @@ -483,7 +483,7 @@ Set the x coordinate in Å of an `AbstractAtom` to `val`.

For `DisorderedAtom`s only the default atom is updated.
"""
x!(at::Atom, x::Real) = (at.coords[1] = x; at)
x!(at::Atom, x::Real) = (at.coords = SVector{3,Float64}((x, at.coords[2], at.coords[3])))
x!(dis_at::DisorderedAtom, x::Real) = x!(defaultatom(dis_at), x)

"""
Expand All @@ -501,7 +501,7 @@ Set the y coordinate in Å of an `AbstractAtom` to `val`.

For `DisorderedAtom`s only the default atom is updated.
"""
y!(at::Atom, y::Real) = (at.coords[2] = y; at)
y!(at::Atom, y::Real) = (at.coords = SVector{3,Float64}((at.coords[1], y, at.coords[3])))
y!(dis_at::DisorderedAtom, y::Real) = y!(defaultatom(dis_at), y)

"""
Expand All @@ -519,31 +519,30 @@ Set the z coordinate in Å of an `AbstractAtom` to `val`.

For `DisorderedAtom`s only the default atom is updated.
"""
z!(at::Atom, z::Real) = (at.coords[3] = z; at)
z!(at::Atom, z::Real) = (at.coords = SVector{3,Float64}((at.coords[1], at.coords[2], z)))
z!(dis_at::DisorderedAtom, z::Real) = z!(defaultatom(dis_at), z)

"""
coords(at)

Get the coordinates in Å of an `AbstractAtom` as a `Vector{Float64}`.
Get the coordinates in Å of an `AbstractAtom` as a `SVector{3,Float64}`.
"""
coords(at::Atom) = at.coords
coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at))

"""
coords!(at, new_coords)

Set the coordinates in Å of an `AbstractAtom` to a `Vector` of 3 numbers.
Set the coordinates in Å of an `AbstractAtom` to `new_coords`, an iterable of 3 numbers.

For `DisorderedAtom`s only the default atom is updated.
"""
function coords!(at::Atom, new_coords)
if length(new_coords) != 3
throw(ArgumentError("3 coordinates must be given"))
end
x!(at, new_coords[1])
y!(at, new_coords[2])
z!(at, new_coords[3])
x, y, z = new_coords
at.coords = SVector{3,Float64}((x, y, z))
return at
end

Expand Down Expand Up @@ -784,7 +783,7 @@ function resid(hetatm::Bool, resnum::Int, inscode::Char)
end
else
if inscode == ' '
return "$resnum"
return string(resnum)
else
return "$resnum$inscode"
end
Expand Down
4 changes: 2 additions & 2 deletions src/pdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,11 @@ function AtomRecord(pdb_line::String, line_n::Integer=1)
parsechainid(pdb_line, line_n),
parseresnumber(pdb_line, line_n),
parseinscode(pdb_line, line_n),
[
SVector{3,Float64}((
parsecoordx(pdb_line, line_n),
parsecoordy(pdb_line, line_n),
parsecoordz(pdb_line, line_n)
],
)),
n >= 60 ? parseoccupancy(pdb_line) : 1.0,
n >= 66 ? parsetempfac(pdb_line) : 0.0,
n >= 78 ? parseelement(pdb_line) : " ",
Expand Down
Loading
Loading