From 13d05dc8f78063808b043278f67f24f4390af7b9 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 14:07:00 +0200 Subject: [PATCH 01/19] docs: add Doxygen/Breathe infrastructure for C++ API reference Wires up Doxygen XML generation + Breathe Sphinx extension so that C++ API documentation from header comments renders in the existing Sphinx/readthedocs site at docs/api.rst. - docs/Doxyfile: Doxygen config (XML-only output, internal headers excluded) - docs/conf.py: runs Doxygen as subprocess, adds breathe extension - docs/api.rst: stub C++ API reference page (expanded by subsequent PRs) - docs/index.rst: replace external cxx-api link with internal api.rst - docs/requirements.txt: add breathe >= 4.35 - .readthedocs.yaml: add apt_packages: [doxygen] - .gitignore: exclude docs/_doxygen/ (generated) Builds on prior work in project-gemmi/gemmi#402 (Paul Emsley / pemsley). Co-authored-by: C. Vonrhein / CV-GPhL --- .gitignore | 3 +++ .readthedocs.yaml | 2 ++ docs/Doxyfile | 48 +++++++++++++++++++++++++++++++++++++++++++ docs/api.rst | 31 ++++++++++++++++++++++++++++ docs/conf.py | 16 ++++++++++++++- docs/index.rst | 2 +- docs/requirements.txt | 1 + 7 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 docs/Doxyfile create mode 100644 docs/api.rst diff --git a/.gitignore b/.gitignore index 0aa5b7ab6..6f22aa373 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,6 @@ fortran/*.a # test programs fortran/fsym fortran/ftest + +# Doxygen-generated output (consumed by Breathe/Sphinx, not committed) +docs/_doxygen/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e86ae0e3c..909080dca 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,8 @@ build: os: ubuntu-24.04 tools: python: "3.12" + apt_packages: + - doxygen # Format htmlzip produces a single page. Instead, use zip as shown in: # https://github.com/readthedocs/readthedocs.org/issues/3242#issuecomment-1410321534 diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 000000000..d305f175a --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,48 @@ +PROJECT_NAME = "Gemmi" +PROJECT_BRIEF = "A library for macromolecular crystallography" +PROJECT_NUMBER = + +# Input +INPUT = ../include/gemmi +FILE_PATTERNS = *.hpp +STRIP_FROM_PATH = ../include +STRIP_FROM_INC_PATH = ../include +RECURSIVE = NO +# All public headers are flat in include/gemmi/; the only subdirectory is +# third_party/ which must not be documented. + +# Exclude internal/data-only headers +EXCLUDE_PATTERNS = ace_*.hpp \ + acedrg_tables.hpp \ + mc_tables.hpp \ + eig3.hpp \ + cc_adj.hpp \ + ccp4ener.hpp \ + mmcif_impl.hpp + +# Output: XML only (Breathe consumes this; Sphinx produces HTML) +GENERATE_XML = YES +GENERATE_HTML = NO +GENERATE_LATEX = NO +XML_OUTPUT = _doxygen/xml + +# Extraction +EXTRACT_ALL = YES +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = YES +EXTRACT_ANON_NSPACES = NO + +# Preprocessing (needed for GEMMI_DLL export macro and similar) +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +PREDEFINED = GEMMI_DLL= + +# Quiet build, warnings to file +QUIET = YES +WARN_IF_UNDOCUMENTED = NO +WARN_LOGFILE = _doxygen/doxygen-warnings.log + +# Source browsing off (Breathe handles cross-references) +SOURCE_BROWSER = NO +INLINE_SOURCES = NO diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 000000000..2386adfca --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,31 @@ +.. _api: + +C++ API Reference +################# + +This page documents the Gemmi C++ library API, generated from Doxygen comments +in ``include/gemmi/*.hpp``. It is updated with each pull request in the +API documentation series. + +For the Python API, see the `Python API reference `_. + +.. note:: + + Documentation coverage is being added incrementally. Headers not yet + listed here will appear in subsequent pull requests. + +Core Data Structures +-------------------- + +*(Full documentation added in PR 2.)* + +.. doxygenfile:: model.hpp + :project: gemmi + +Map and Grid Data +----------------- + +*(Stub — full documentation added in PR 6.)* + +.. doxygenfile:: grid.hpp + :project: gemmi diff --git a/docs/conf.py b/docs/conf.py index 2d8a8b95b..6bdf72386 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,11 +1,19 @@ # -*- coding: utf-8 -*- +import os +import subprocess + +# Run Doxygen before Sphinx processes doxygenfile:: directives. +# Must run from docs/ so Doxyfile and _doxygen/xml/ paths resolve correctly. +_docs_dir = os.path.dirname(os.path.abspath(__file__)) +subprocess.check_call(['doxygen', 'Doxyfile'], cwd=_docs_dir) + # -- General configuration ------------------------------------------------ # while we use Sphinx 8+, old version suffices to run doctests needs_sphinx = '5.3.0' -extensions = ['sphinx.ext.doctest', 'sphinx_inline_tabs'] +extensions = ['sphinx.ext.doctest', 'sphinx_inline_tabs', 'breathe'] templates_path = ['_templates'] @@ -125,3 +133,9 @@ def _compute_navigation_tree(context: Dict[str, Any]) -> str: import gemmi gemmi.set_leak_warnings(False) ''' + +# -- Breathe configuration (Doxygen XML → Sphinx) ------------------------- + +breathe_projects = {"gemmi": os.path.join(_docs_dir, "_doxygen", "xml")} +breathe_default_project = "gemmi" +breathe_default_members = ('members',) # show all public members in every doxygenfile:: directive diff --git a/docs/index.rst b/docs/index.rst index 4302a3789..c1f524ce6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -93,7 +93,7 @@ Contents ChangeLog Python API reference - C++ API reference + api Credits ======= diff --git a/docs/requirements.txt b/docs/requirements.txt index 223a532b1..804a29253 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ sphinx >= 8.1.3 furo >= 2024.8.6 sphinx-inline-tabs +breathe >= 4.35 From 908eea4922a9967e94834cd67201d35fad4e3253 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 19:55:10 +0200 Subject: [PATCH 02/19] =?UTF-8?q?fix(conf.py):=20guard=20doxygen=20call=20?= =?UTF-8?q?=E2=80=94=20skip=20gracefully=20when=20not=20installed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/conf.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 6bdf72386..55fab5e35 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,19 +1,25 @@ # -*- coding: utf-8 -*- import os +import shutil import subprocess -# Run Doxygen before Sphinx processes doxygenfile:: directives. -# Must run from docs/ so Doxyfile and _doxygen/xml/ paths resolve correctly. _docs_dir = os.path.dirname(os.path.abspath(__file__)) -subprocess.check_call(['doxygen', 'Doxyfile'], cwd=_docs_dir) +_doxygen_xml_dir = os.path.join(_docs_dir, "_doxygen", "xml") + +# Run Doxygen before Sphinx processes doxygenfile:: directives. +# Skipped gracefully when doxygen is not installed (e.g. CI doctest runner). +if shutil.which('doxygen'): + subprocess.check_call(['doxygen', 'Doxyfile'], cwd=_docs_dir) # -- General configuration ------------------------------------------------ # while we use Sphinx 8+, old version suffices to run doctests needs_sphinx = '5.3.0' -extensions = ['sphinx.ext.doctest', 'sphinx_inline_tabs', 'breathe'] +extensions = ['sphinx.ext.doctest', 'sphinx_inline_tabs'] +if os.path.isdir(_doxygen_xml_dir): + extensions.append('breathe') templates_path = ['_templates'] @@ -135,7 +141,9 @@ def _compute_navigation_tree(context: Dict[str, Any]) -> str: ''' # -- Breathe configuration (Doxygen XML → Sphinx) ------------------------- +# Only active when _doxygen/xml/ exists (i.e. doxygen was run). -breathe_projects = {"gemmi": os.path.join(_docs_dir, "_doxygen", "xml")} -breathe_default_project = "gemmi" -breathe_default_members = ('members',) # show all public members in every doxygenfile:: directive +if os.path.isdir(_doxygen_xml_dir): + breathe_projects = {"gemmi": _doxygen_xml_dir} + breathe_default_project = "gemmi" + breathe_default_members = ('members',) # show all public members in every doxygenfile:: directive From 3f20f61b301dfd31d399baffd69fb4800744ffab Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 20:02:20 +0200 Subject: [PATCH 03/19] ci: add docs-build job (Doxygen + Sphinx/Breathe HTML) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f399317c4..fdfc9c30c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -217,6 +217,22 @@ jobs: - name: run tests under valgrind run: PYTHONMALLOC=malloc valgrind python3 -m unittest discover -v -s tests/ + docs: + name: "Docs build (Doxygen + Sphinx/Breathe)" + runs-on: ubuntu-latest + if: "!contains(github.event.head_commit.message, '[skip ci]')" + steps: + - uses: actions/checkout@v4 + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y doxygen + python3 -m pip install sphinx sphinx-inline-tabs breathe furo + - name: Build HTML docs + run: | + cd docs + sphinx-build -M html . _build -n -E + almalinux: runs-on: ubuntu-latest name: "AlmaLinux 8" From f17559ec0e35b3b3e9ded8d3bb5e9309711ab2ca Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:51:26 +0200 Subject: [PATCH 04/19] docs(calculate.hpp): add full Doxygen API documentation --- include/gemmi/calculate.hpp | 116 +++++++++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 8 deletions(-) diff --git a/include/gemmi/calculate.hpp b/include/gemmi/calculate.hpp index 03a10765b..b1bb9a7a8 100644 --- a/include/gemmi/calculate.hpp +++ b/include/gemmi/calculate.hpp @@ -12,6 +12,10 @@ namespace gemmi { +/// @brief Check if an object or any of its descendants contains hydrogen atoms. +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to check +/// @return True if hydrogen is present, false otherwise template bool has_hydrogen(const T& obj) { for (const auto& child : obj.children()) if (has_hydrogen(child)) @@ -22,6 +26,11 @@ template<> inline bool has_hydrogen(const Atom& atom) { return atom.is_hydrogen(); } +/// @brief Count atom sites in an object, optionally filtered by Selection. +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to count atoms in +/// @param sel Optional Selection filter; nullptr means all atoms +/// @return Number of matching atom sites template size_t count_atom_sites(const T& obj, const Selection* sel=nullptr) { size_t sum = 0; if (!sel || sel->matches(obj)) @@ -33,6 +42,11 @@ template<> inline size_t count_atom_sites(const Atom& atom, const Selection* sel return (!sel || sel->matches(atom)) ? 1 : 0; } +/// @brief Sum occupancies in an object, optionally filtered by Selection. +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to sum occupancies in +/// @param sel Optional Selection filter; nullptr means all atoms +/// @return Sum of occupancies for matching atoms template double count_occupancies(const T& obj, const Selection* sel=nullptr) { double sum = 0; if (!sel || sel->matches(obj)) @@ -44,7 +58,10 @@ template<> inline double count_occupancies(const Atom& atom, const Selection* se return (!sel || sel->matches(atom)) ? atom.occ : 0; } - +/// @brief Calculate total mass in atomic mass units for an object. +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to calculate mass for +/// @return Total mass in atomic mass units (accounting for occupancy) template double calculate_mass(const T& obj) { double sum = 0; for (const auto& child : obj.children()) @@ -55,12 +72,21 @@ template<> inline double calculate_mass(const Atom& atom) { return atom.occ * atom.element.weight(); } +/// @brief Result of center-of-mass calculation. struct CenterOfMass { + /// Sum of mass-weighted positions Position weighted_sum; + /// Total mass double mass; + /// @brief Get center-of-mass position. + /// @return Mass-weighted centroid (weighted_sum / mass) Position get() const { return Position(weighted_sum / mass); } }; +/// @brief Calculate the center of mass (mass-weighted centroid). +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to calculate center of mass for +/// @return CenterOfMass with weighted_sum and total mass template CenterOfMass calculate_center_of_mass(const T& obj) { CenterOfMass total{{}, 0.}; for (const auto& child : obj.children()) { @@ -74,6 +100,10 @@ template<> inline CenterOfMass calculate_center_of_mass(const Atom& atom) { return CenterOfMass{Position(atom.pos * w_mass), w_mass}; } +/// @brief Calculate min and max isotropic B-factors in an object. +/// @tparam T Type of object (Model, Chain, Residue, Atom, etc.) +/// @param obj Object to scan +/// @return Pair of (min_B, max_B) template std::pair calculate_b_iso_range(const T& obj) { std::pair range{INFINITY, -INFINITY}; for (const auto& child : obj.children()) { @@ -87,7 +117,10 @@ template<> inline std::pair calculate_b_iso_range(const Atom& atom) return {atom.b_iso, atom.b_iso}; } -/// uses min/max eigenvalues of Baniso, or Biso if B-factor is isotropic +/// @brief Calculate min and max B-factors from anisotropic B-tensors. +/// Uses min/max eigenvalues of anisotropic B-tensor, or B_iso if B-factor is isotropic. +/// @param model Model to scan +/// @return Pair of (min_eigenvalue * u_to_b(), max_eigenvalue * u_to_b()) inline std::pair calculate_b_aniso_range(const Model& model) { std::pair range{INFINITY, -INFINITY}; for (const Chain& chain : model.chains) @@ -117,7 +150,11 @@ template<> inline void expand_box(const Atom& atom, Box& box) { box.extend(atom.pos); } -// we don't take NCS into account here (cf. NeighborSearch::set_bounding_cell()) +/// @brief Calculate axis-aligned bounding box in Cartesian space. +/// @note NCS is not taken into account here (cf. NeighborSearch::set_bounding_cell()) +/// @param st Structure to scan +/// @param margin Optional margin to add around the box +/// @return Axis-aligned bounding box inline Box calculate_box(const Structure& st, double margin=0.) { Box box; expand_box(st, box); @@ -126,6 +163,11 @@ inline Box calculate_box(const Structure& st, double margin=0.) { return box; } +/// @brief Calculate bounding box in fractional coordinates. +/// @param st Structure to scan (must have a unit cell) +/// @param margin Optional margin in Cartesian space (converted to fractional) +/// @return Bounding box in fractional coordinates +/// @throws Fails with message if Structure has no unit cell inline Box calculate_fractional_box(const Structure& st, double margin=0.) { if (!st.cell.is_crystal()) fail("calculate_fractional_box(): Structure has no unit cell for fractionalization"); @@ -141,21 +183,34 @@ inline Box calculate_fractional_box(const Structure& st, double marg } -// Calculate B_est from E. Merritt, Some B_eq are more equivalent than others, -// Acta Cryst. A67, 512 (2011) -// http://skuld.bmsc.washington.edu/parvati/ActaA_67_512.pdf +/// @brief Calculate B_equiv from anisotropic B-tensor (or B_iso if isotropic). +/// Based on E. Merritt, "Some B_eq are more equivalent than others", +/// Acta Cryst. A67, 512 (2011). +/// @param atom Atom with (possibly anisotropic) B-factor +/// @return B_equiv = sqrt((sum_eigenvalues) / (sum_inverse_eigenvalues)) * u_to_b() inline double calculate_b_est(const Atom& atom) { auto eig = atom.aniso.calculate_eigenvalues(); return u_to_b() * std::sqrt((eig[0] + eig[1] + eig[2]) / (1/eig[0] + 1/eig[1] + 1/eig[2])); } +/// @brief Calculate angle at p1 between vectors p1→p0 and p1→p2. +/// @param p0 First position +/// @param p1 Central position (vertex of the angle) +/// @param p2 Third position +/// @return Angle in radians inline double calculate_angle(const Position& p0, const Position& p1, const Position& p2) { return (p0 - p1).angle(p2 - p1); } -// discussion: https://stackoverflow.com/questions/20305272/ +/// @brief Calculate dihedral angle defined by four positions. +/// @param p0 First position +/// @param p1 Second position (on the bond axis) +/// @param p2 Third position (on the bond axis) +/// @param p3 Fourth position +/// @return Dihedral angle in radians, in the range [-π, π] +/// @note See discussion at https://stackoverflow.com/questions/20305272/ inline double calculate_dihedral(const Position& p0, const Position& p1, const Position& p2, const Position& p3) { Vec3 b0 = p1 - p0; @@ -168,7 +223,12 @@ inline double calculate_dihedral(const Position& p0, const Position& p1, return std::atan2(y, x); } -/// the return value is in the same range as that of atan2(), i.e. [-pi, pi] +/// @brief Calculate dihedral angle from four Atom pointers. +/// @param a First atom pointer +/// @param b Second atom pointer (on the bond axis) +/// @param c Third atom pointer (on the bond axis) +/// @param d Fourth atom pointer +/// @return Dihedral angle in radians (same range as atan2: [-π, π]), or NaN if any pointer is null inline double calculate_dihedral_from_atoms(const Atom* a, const Atom* b, const Atom* c, const Atom* d) { if (a && b && c && d) @@ -176,11 +236,23 @@ inline double calculate_dihedral_from_atoms(const Atom* a, const Atom* b, return NAN; } +/// @brief Calculate peptide bond ω dihedral angle. +/// ω is defined by atoms: Cα(i) - C(i) - N(i+1) - Cα(i+1) +/// @param res Current residue +/// @param next Next residue +/// @return Omega angle in radians, or NaN if atoms are missing inline double calculate_omega(const Residue& res, const Residue& next) { return calculate_dihedral_from_atoms(res.get_ca(), res.get_c(), next.get_n(), next.get_ca()); } +/// @brief Check if a peptide bond is in cis configuration. +/// @param ca1 Cα atom of the current residue +/// @param c C atom of the current residue +/// @param n N atom of the next residue +/// @param ca2 Cα atom of the next residue +/// @param tolerance_deg Tolerance in degrees (absolute |ω| < tolerance indicates cis) +/// @return True if |ω| < tolerance_deg, false otherwise inline bool is_peptide_bond_cis(const Atom* ca1, const Atom* c, const Atom* n, const Atom* ca2, double tolerance_deg) { @@ -188,11 +260,24 @@ inline bool is_peptide_bond_cis(const Atom* ca1, const Atom* c, return std::fabs(omega) < rad(tolerance_deg); } +/// @brief Calculate chiral volume (scalar triple product). +/// @param actr Central atom position (center of chirality) +/// @param a1 First substituent position +/// @param a2 Second substituent position +/// @param a3 Third substituent position +/// @return Scalar triple product: (a1-actr) · ((a2-actr) × (a3-actr)) inline double calculate_chiral_volume(const Position& actr, const Position& a1, const Position& a2, const Position& a3) { return (a1 - actr).dot((a2 - actr).cross(a3 - actr)); } +/// @brief Calculate Ramachandran dihedral angles φ (phi) and ψ (psi). +/// φ is defined by: C(i-1) - N(i) - Cα(i) - C(i) +/// ψ is defined by: N(i) - Cα(i) - C(i) - N(i+1) +/// @param prev Previous residue (nullptr if not available) +/// @param res Current residue +/// @param next Next residue (nullptr if not available) +/// @return Array [phi, psi] in radians; NaN values if flanking residues are missing or atoms not found inline std::array calculate_phi_psi(const Residue* prev, const Residue& res, const Residue* next) { @@ -209,15 +294,30 @@ inline std::array calculate_phi_psi(const Residue* prev, return phi_psi; } +/// @brief Find the least-squares plane through a set of atoms. +/// @param atoms Vector of atom pointers +/// @return Array [a, b, c, d] representing plane equation ax + by + cz = d +/// @note All atoms must have non-zero occupancy to be included GEMMI_DLL std::array find_best_plane(const std::vector& atoms); +/// @brief Calculate signed distance from a point to a plane. +/// @param pos Position of the point +/// @param coeff Plane coefficients [a, b, c, d] from ax + by + cz = d +/// @return Signed distance from pos to plane inline double get_distance_from_plane(const Position& pos, const std::array& coeff) { return coeff[0] * pos.x + coeff[1] * pos.y + coeff[2] * pos.z + coeff[3]; } +/// @brief Parse a crystallographic triplet string into an FTransform. +/// @param s CIF triplet string (e.g., "x,y,z" or "-x+1/2,-y,z") +/// @return FTransform representing the fractional transformation GEMMI_DLL FTransform parse_triplet_as_ftransform(const std::string& s); +/// @brief Calculate the anisotropic U tensor at a position from TLS group parameters. +/// @param tls TLS group parameters (T, L, S matrices) +/// @param pos Position where U is evaluated +/// @return 3×3 symmetric matrix U at the given position GEMMI_DLL SMat33 calculate_u_from_tls(const TlsGroup& tls, const Position& pos); } // namespace gemmi From 2e75b21699a30629b32ead1de570d67c198a6282 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:52:06 +0200 Subject: [PATCH 05/19] docs(align.hpp): add full Doxygen API documentation --- include/gemmi/align.hpp | 85 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/include/gemmi/align.hpp b/include/gemmi/align.hpp index ceb8327f7..ab4d857ab 100644 --- a/include/gemmi/align.hpp +++ b/include/gemmi/align.hpp @@ -12,9 +12,15 @@ namespace gemmi { -// Sequence alignment and label_seq_id assignment +/// @name Sequence Alignment and label_seq_id Assignment +/// @{ -// helper function for sequence alignment +/// @brief Compute per-position gap-open penalties for sequence alignment. +/// Accounts for gaps and discontinuities in the observed polymer sequence. +/// @param polymer Polymer span to analyze +/// @param polymer_type Type of polymer (peptide or nucleic acid) +/// @param scoring Optional AlignmentScoring parameters; uses default partial_model() if nullptr +/// @return Vector of gap-opening penalties for each position inline std::vector prepare_target_gapo(const ConstResidueSpan& polymer, PolymerType polymer_type, const AlignmentScoring* scoring=nullptr) { @@ -35,6 +41,12 @@ inline std::vector prepare_target_gapo(const ConstResidueSpan& polymer, return gaps; } +/// @brief Align a full sequence (SEQRES/Entity) to observed residues in a polymer. +/// @param full_seq Full sequence from SEQRES or Entity +/// @param polymer Polymer chain span (observed residues) +/// @param polymer_type Type of polymer (peptide or nucleic acid) +/// @param scoring Optional AlignmentScoring parameters; uses default partial_model() if nullptr +/// @return AlignmentResult with CIGAR string describing the alignment inline AlignmentResult align_sequence_to_polymer( const std::vector& full_seq, const ConstResidueSpan& polymer, @@ -68,7 +80,10 @@ inline AlignmentResult align_sequence_to_polymer( (std::uint8_t)encoding.size(), *scoring); } -// check for exact match between model sequence and full sequence (SEQRES) +/// @brief Check if model sequence exactly matches SEQRES numbering (fast path). +/// @param polymer Polymer chain span +/// @param ent Entity with full_sequence from SEQRES +/// @return True if label_seq values already match Entity::full_sequence exactly inline bool seqid_matches_seqres(const ConstResidueSpan& polymer, const Entity& ent) { if (ent.full_sequence.size() != polymer.size()) @@ -82,6 +97,8 @@ inline bool seqid_matches_seqres(const ConstResidueSpan& polymer, return true; } +/// @brief Clear full_sequence and database references from all entities. +/// @param st Structure to clear inline void clear_sequences(Structure& st) { for (Entity& ent : st.entities) { ent.full_sequence.clear(); @@ -90,11 +107,16 @@ inline void clear_sequences(Structure& st) { } } +/// @brief Match FASTA sequences to entities and set Entity::full_sequence. +/// @param st Structure to update +/// @param fasta_sequences Vector of FASTA sequence strings GEMMI_DLL void assign_best_sequences(Structure& st, const std::vector& fasta_sequences); -// Uses sequence alignment (model to SEQRES) to assign label_seq. -// force: assign label_seq even if full sequence is not known (assumes no gaps) +/// @brief Assign label_seq_id to residues using alignment to Entity full_sequence. +/// @param polymer Polymer chain span to assign label_seq to +/// @param ent Entity with full_sequence from SEQRES (nullptr allowed if force=true) +/// @param force If true, assign label_seq sequentially even without Entity information inline void assign_label_seq_to_polymer(ResidueSpan& polymer, const Entity* ent, bool force) { AlignmentResult result; @@ -142,6 +164,8 @@ inline void assign_label_seq_to_polymer(ResidueSpan& polymer, } } +/// @brief Reset all Residue::label_seq to unset state. +/// @param st Structure to clear inline void clear_label_seq_id(Structure& st) { for (Model& model : st.models) for (Chain& chain : model.chains) @@ -149,6 +173,9 @@ inline void clear_label_seq_id(Structure& st) { res.label_seq = Residue::OptionalNum(); } +/// @brief Assign label_seq_id to all polymer chains in a structure. +/// @param st Structure to process +/// @param force If true, assign label_seq sequentially even without Entity information inline void assign_label_seq_id(Structure& st, bool force) { for (Model& model : st.models) for (Chain& chain : model.chains) @@ -160,14 +187,29 @@ inline void assign_label_seq_id(Structure& st, bool force) { } -// superposition +/// @} +/// @name Structure Superposition +/// @{ +/// @brief Atom selection mode for superposition calculations. enum class SupSelect { - CaP, // only Ca (aminoacids) or P (nucleotides) atoms - MainChain, // only main chain atoms + /// Use only Cα atoms (for peptides) or P atoms (for nucleic acids) + CaP, + /// Use backbone atoms (N, CA, C, O for peptides; P, O5', C5', C4', C3', O3' for nucleic acids) + MainChain, + /// Use all atoms All }; +/// @brief Extract matching atom positions from two polymer spans for superposition. +/// @param pos1 Output vector of positions from fixed chain +/// @param pos2 Output vector of positions from movable chain +/// @param fixed Fixed polymer chain (reference structure) +/// @param movable Movable polymer chain (structure to be superposed) +/// @param ptype Polymer type (peptide or nucleic acid) +/// @param sel Atom selection mode (CaP, MainChain, or All) +/// @param altloc Alternate location code to select; '\0' means all conformers +/// @param ca_offsets Optional output vector storing indices of Cα/P atoms in pos1/pos2 inline void prepare_positions_for_superposition(std::vector& pos1, std::vector& pos2, ConstResidueSpan fixed, @@ -226,6 +268,13 @@ inline void prepare_positions_for_superposition(std::vector& pos1, } } +/// @brief Calculate RMSD between two polymer spans without transformation. +/// @param fixed Fixed polymer chain (reference structure) +/// @param movable Movable polymer chain (structure to compare) +/// @param ptype Polymer type (peptide or nucleic acid) +/// @param sel Atom selection mode (CaP, MainChain, or All) +/// @param altloc Alternate location code to select; '\0' means all conformers +/// @return SupResult with RMSD and atom count (rotation/translation matrices are empty) inline SupResult calculate_current_rmsd(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, @@ -242,6 +291,15 @@ inline SupResult calculate_current_rmsd(ConstResidueSpan fixed, return r; } +/// @brief Calculate optimal superposition using QCP (quaternion characteristic polynomial). +/// @param fixed Fixed polymer chain (reference structure) +/// @param movable Movable polymer chain (structure to be superposed) +/// @param ptype Polymer type (peptide or nucleic acid) +/// @param sel Atom selection mode (CaP, MainChain, or All) +/// @param trim_cycles Number of iterative trimming cycles (0 = no trimming) +/// @param trim_cutoff Outlier distance threshold in Å for trimming (default 2.0) +/// @param altloc Alternate location code to select; '\0' means all conformers +/// @return SupResult with rotation matrix, translation vector, and RMSD inline SupResult calculate_superposition(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, @@ -280,8 +338,13 @@ inline SupResult calculate_superposition(ConstResidueSpan fixed, return sr; } -// Returns superpositions for all residues in fixed.first_conformer(), -// performed by superposing backbone in radius=10.0 from residue's Ca. +/// @brief Compute local superpositions using a sliding window around each Cα/P. +/// Calculates superpositions by sliding a sphere of given radius around each Cα/P position. +/// @param fixed Fixed polymer chain (reference structure) +/// @param movable Movable polymer chain (structure to be superposed) +/// @param ptype Polymer type (peptide or nucleic acid) +/// @param radius Window radius in Ångströms around each Cα/P (default 10.0) +/// @return Vector of SupResult, one per residue in fixed.first_conformer() inline std::vector calculate_superpositions_in_moving_window( ConstResidueSpan fixed, ConstResidueSpan movable, @@ -313,5 +376,7 @@ inline std::vector calculate_superpositions_in_moving_window( return result; } +/// @} + } // namespace gemmi #endif From 192465bff856fe33ca3b65e8747e6f54b4b3017f Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:54:07 +0200 Subject: [PATCH 06/19] docs: fix spec gaps in calculate.hpp and align.hpp Co-Authored-By: Claude Sonnet 4.6 --- include/gemmi/align.hpp | 6 +++--- include/gemmi/calculate.hpp | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/gemmi/align.hpp b/include/gemmi/align.hpp index ab4d857ab..3c7501047 100644 --- a/include/gemmi/align.hpp +++ b/include/gemmi/align.hpp @@ -107,9 +107,9 @@ inline void clear_sequences(Structure& st) { } } -/// @brief Match FASTA sequences to entities and set Entity::full_sequence. -/// @param st Structure to update -/// @param fasta_sequences Vector of FASTA sequence strings +/// @brief Match FASTA sequences to entities and set Entity::full_sequence in-place. +/// @param st Structure whose entities are updated with matched sequences. +/// @param fasta_sequences Vector of FASTA-format sequence strings to match against entities. GEMMI_DLL void assign_best_sequences(Structure& st, const std::vector& fasta_sequences); diff --git a/include/gemmi/calculate.hpp b/include/gemmi/calculate.hpp index b1bb9a7a8..7569fbaec 100644 --- a/include/gemmi/calculate.hpp +++ b/include/gemmi/calculate.hpp @@ -142,6 +142,10 @@ inline std::pair calculate_b_aniso_range(const Model& model) { } +/// @brief Expand an axis-aligned bounding box to include all atoms in obj. +/// @tparam T Type supporting children() iteration (Model, Chain, Residue, or Atom). +/// @param obj Object whose atom positions are added to the box. +/// @param box Bounding box to expand in-place. template void expand_box(const T& obj, Box& box) { for (const auto& child : obj.children()) expand_box(child, box); From 234b0ca001cfc71310700572e2c42ca8a56d3922 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:55:40 +0200 Subject: [PATCH 07/19] docs(neighbor.hpp): add full Doxygen API documentation --- include/gemmi/neighbor.hpp | 128 +++++++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 4 deletions(-) diff --git a/include/gemmi/neighbor.hpp b/include/gemmi/neighbor.hpp index 40f5dadb6..32af8db7e 100644 --- a/include/gemmi/neighbor.hpp +++ b/include/gemmi/neighbor.hpp @@ -16,56 +16,98 @@ namespace gemmi { +/// @brief Cell-linked-list spatial index for fast atom neighbor searching. +/// Supports both macromolecular structures (Model) and small-molecule structures +/// (SmallStructure), with periodic boundary conditions and crystallographic symmetry. struct NeighborSearch { + /// @brief Atom reference stored in a grid cell. + /// Encodes the position and indices needed to resolve back to the original atom or site. struct Mark { + /// Cartesian position of the atom (may be a symmetry image). Position pos; + /// Alternate conformer identifier ('\\0' = no alternate). char altloc; + /// Chemical element of the atom. Element element; + /// Index into the list of crystallographic images (0 = identity). short image_idx; + /// Index of the chain in the model's chain vector. int chain_idx; + /// Index of the residue within the chain. int residue_idx; + /// Index of the atom within the residue. int atom_idx; Mark(const Position& p, char alt, El el, short im, int ch, int res, int atom) : pos(p), altloc(alt), element(el), image_idx(im), chain_idx(ch), residue_idx(res), atom_idx(atom) {} + /// @brief Resolve the indices into a CRA (chain/residue/atom) triple. + /// @param mdl Model to resolve indices into (non-const version). + /// @return CRA triple pointing to the resolved atom. CRA to_cra(Model& mdl) const { Chain& c = mdl.chains.at(chain_idx); Residue& r = c.residues.at(residue_idx); Atom& a = r.atoms.at(atom_idx); return {&c, &r, &a}; } + + /// @brief Resolve the indices into a const CRA triple. + /// @param mdl Model to resolve indices into (const version). + /// @return const CRA triple pointing to the resolved atom. const_CRA to_cra(const Model& mdl) const { const Chain& c = mdl.chains.at(chain_idx); const Residue& r = c.residues.at(residue_idx); const Atom& a = r.atoms.at(atom_idx); return {&c, &r, &a}; } + + /// @brief Resolve the atom_idx to a SmallStructure::Site. + /// @param small_st SmallStructure to resolve into (non-const version). + /// @return Reference to the resolved site. SmallStructure::Site& to_site(SmallStructure& small_st) const { return small_st.sites.at(atom_idx); } + + /// @brief Resolve the atom_idx to a const SmallStructure::Site. + /// @param small_st SmallStructure to resolve into (const version). + /// @return Const reference to the resolved site. const SmallStructure::Site& to_site(const SmallStructure& small_st) const { return small_st.sites.at(atom_idx); } }; + /// Spatial Grid of Mark vectors (one cell per grid voxel). Grid> grid; + /// The search radius passed to the constructor. double radius_specified = 0.; + /// Pointer to the indexed macromolecular model (nullptr if SmallStructure). Model* model = nullptr; + /// Pointer to the indexed small-molecule structure (nullptr if Model). SmallStructure* small_structure = nullptr; + /// If true, apply periodic boundary conditions when searching. bool use_pbc = true; + /// If true, include hydrogen atoms in the index. bool include_h = true; + /// @brief Default constructor creating an empty NeighborSearch. NeighborSearch() = default; - // Model is not const so it can be modified in for_each_contact() + + /// @brief Initialize grid for a macromolecular model. + /// @param model_ Reference to the Model to index (non-const, can be modified in for_each_contact()). + /// @param cell Unit cell defining the crystallographic symmetry and periodicity. + /// @param radius Search radius used to dimension the grid. NeighborSearch(Model& model_, const UnitCell& cell, double radius) { model = &model_; radius_specified = radius; set_bounding_cell(cell); set_grid_size(); } + + /// @brief Initialize for a small-molecule structure. + /// @param small_st Reference to the SmallStructure to index. + /// @param radius Search radius used to dimension the grid. NeighborSearch(SmallStructure& small_st, double radius) { small_structure = &small_st; radius_specified = radius; @@ -73,13 +115,38 @@ struct NeighborSearch { set_grid_size(); } + /// @brief Fill the grid with all atoms in the model or small structure. + /// @param include_h_ If true, include hydrogen atoms (default: true). + /// @return Reference to *this for method chaining. NeighborSearch& populate(bool include_h_=true); + + /// @brief Add all atoms from one chain to the grid. + /// @param chain The chain to add. + /// @param include_h_ If true, include hydrogen atoms (default: true). void add_chain(const Chain& chain, bool include_h_=true); + + /// @brief Add atoms from a chain with explicit chain index. + /// Used when chain_idx matters; internally called by populate(). + /// @param chain The chain to add. + /// @param n_ch Index of the chain in the model's chain vector. void add_chain_n(const Chain& chain, int n_ch); + + /// @brief Add a single atom to the grid. + /// @param atom The atom to add. + /// @param n_ch Index of the chain. + /// @param n_res Index of the residue within the chain. + /// @param n_atom Index of the atom within the residue. void add_atom(const Atom& atom, int n_ch, int n_res, int n_atom); + + /// @brief Add a SmallStructure site to the grid. + /// @param site The site to add. + /// @param n Index of the site in the sites vector. void add_site(const SmallStructure::Site& site, int n); - // assumes data in [0, 1), but uses index_n to account for numerical errors + /// @brief Return the reference to the grid cell containing a fractional coordinate. + /// Assumes data in [0, 1) but uses index_n to account for numerical errors. + /// @param fr Fractional coordinate in the unit cell. + /// @return Reference to the vector of Marks in the grid cell. std::vector& get_subcell(const Fractional& fr) { size_t idx = grid.index_n(int(fr.x * grid.nu), int(fr.y * grid.nv), @@ -89,9 +156,21 @@ struct NeighborSearch { return grid.data[idx]; } + /// @brief Iterate over all grid cells within k cells of a position. + /// @tparam Func Callable type with signature void(std::vector&, const Fractional&). + /// @param pos Cartesian position. + /// @param func Function to call for each cell, receiving the Marks and fractional coordinates. + /// @param k Grid multiplier (default: 1); larger k searches more cells. template void for_each_cell(const Position& pos, const Func& func, int k=1); + /// @brief Iterate over all Marks within radius of a position, respecting alternate conformers. + /// @tparam Func Callable type with signature void(Mark&, double) for (mark, dist_sq). + /// @param pos Cartesian position. + /// @param alt Alternate conformer identifier to match ('\\0' matches all). + /// @param radius Search radius in Angstroms. + /// @param func Function to call for each nearby mark. + /// @param k Grid multiplier (default: 1); larger k searches more cells. template void for_each(const Position& pos, char alt, double radius, const Func& func, int k=1) { if (radius <= 0) @@ -106,12 +185,21 @@ struct NeighborSearch { }, k); } + /// @brief Return the minimum grid multiplier k covering the given radius. + /// @param r Search radius in Angstroms. + /// @return Minimum k value such that k*radius_specified >= r. int sufficient_k(double r) const { // .00001 is added to account for possible numeric error in r return r <= radius_specified ? 1 : int(r / radius_specified + 1.00001); } - // with radius==0 it uses radius_specified + /// @brief Find all Marks within a distance range of a position. + /// If radius==0, uses radius_specified. + /// @param pos Cartesian position. + /// @param alt Alternate conformer identifier. + /// @param min_dist Minimum distance in Angstroms. + /// @param radius Maximum distance in Angstroms (0 = use radius_specified). + /// @return Vector of pointers to Marks in the distance range [min_dist, radius]. std::vector find_atoms(const Position& pos, char alt, double min_dist, double radius) { int k = sufficient_k(radius); @@ -125,15 +213,31 @@ struct NeighborSearch { return out; } + /// @brief Find neighbor Marks of a Model atom. + /// @param atom The atom to find neighbors for. + /// @param min_dist Minimum distance in Angstroms. + /// @param max_dist Maximum distance in Angstroms. + /// @return Vector of pointers to nearby Marks. std::vector find_neighbors(const Atom& atom, double min_dist, double max_dist) { return find_atoms(atom.pos, atom.altloc, min_dist, max_dist); } + + /// @brief Find neighbor Marks of a SmallStructure site. + /// @param site The site to find neighbors for. + /// @param min_dist Minimum distance in Angstroms. + /// @param max_dist Maximum distance in Angstroms. + /// @return Vector of pointers to nearby Marks. std::vector find_site_neighbors(const SmallStructure::Site& site, double min_dist, double max_dist) { Position pos = grid.unit_cell.orthogonalize(site.fract); return find_atoms(pos, '\0', min_dist, max_dist); } + /// @brief Find the nearest atom within k grid layers and a radius limit. + /// @param pos Cartesian position. + /// @param k Grid multiplier (number of layers to search). + /// @param radius Maximum search radius in Angstroms. + /// @return Pair of (nearest Mark pointer, distance squared). Pointer is nullptr if no atom found. std::pair find_nearest_atom_within_k(const Position& pos, int k, double radius) { Mark* mark = nullptr; @@ -151,7 +255,11 @@ struct NeighborSearch { return {mark, nearest_dist_sq}; } - // it would be good to return also NearestImage + /// @brief Find the single nearest atom within an optional radius limit. + /// Automatically adapts the grid search multiplier k to find the nearest atom. + /// @param pos Cartesian position. + /// @param radius Maximum search radius in Angstroms (default: INFINITY). + /// @return Pointer to the nearest Mark, or nullptr if no atom found. Mark* find_nearest_atom(const Position& pos, double radius=INFINITY) { double r_spec = radius_specified; if (radius == 0.f) @@ -176,13 +284,25 @@ struct NeighborSearch { return nullptr; } + /// @brief Distance squared between two positions, accounting for unit cell periodicity. + /// @param pos1 First Cartesian position. + /// @param pos2 Second Cartesian position. + /// @return Squared distance, accounting for periodic boundary conditions. double dist_sq(const Position& pos1, const Position& pos2) const { return grid.unit_cell.distance_sq(pos1, pos2); } + + /// @brief Euclidean distance between two positions with periodic boundary conditions. + /// @param pos1 First Cartesian position. + /// @param pos2 Second Cartesian position. + /// @return Euclidean distance in Angstroms. double dist(const Position& pos1, const Position& pos2) const { return std::sqrt(dist_sq(pos1, pos2)); } + /// @brief Retrieve the crystallographic image transformation for the given image index. + /// @param image_idx Crystallographic image index (0 = identity, 1+ = symmetry images). + /// @return The fractional transform for the image. FTransform get_image_transformation(int image_idx) const { // 0 is for identity, other indices are shifted by one. if (image_idx == 0) From 5bcbc445b6fd46494173e88e26ca1b5423dd4471 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:55:56 +0200 Subject: [PATCH 08/19] docs(contact.hpp): add full Doxygen API documentation --- include/gemmi/contact.hpp | 51 +++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/include/gemmi/contact.hpp b/include/gemmi/contact.hpp index ed439f462..b6b4350ed 100644 --- a/include/gemmi/contact.hpp +++ b/include/gemmi/contact.hpp @@ -11,40 +11,81 @@ namespace gemmi { +/// @brief Configures and performs distance-based contact search between atoms. +/// Built on top of NeighborSearch with support for configurable filtering, +/// per-element radii, and flexible results collection. struct ContactSearch { + /// @brief Filter options for atom pair inclusion in contact search results. enum class Ignore { - Nothing=0, SameResidue, AdjacentResidues, SameChain, SameAsu + Nothing=0, ///< Report all contacts. + SameResidue, ///< Skip contacts between atoms in the same residue. + AdjacentResidues, ///< Skip contacts between atoms in adjacent residues (i±1). + SameChain, ///< Skip contacts between atoms in the same chain. + SameAsu ///< Skip contacts between atoms in the same asymmetric unit (report only symmetry-related contacts). }; - // parameters used to configure the search + + /// Maximum contact distance in Angstroms. double search_radius; + /// Which atom pairs to skip (default: SameResidue). Ignore ignore = Ignore::SameResidue; - bool twice = false; // report both A-B and B-A + /// If true, report each contact pair twice (A→B and B→A); default false. + bool twice = false; + /// Skip atoms with occupancy below this threshold (0 = no filtering). float min_occupancy = 0.f; + /// Squared distance threshold for identifying atoms on special positions. double special_pos_cutoff_sq = 0.8 * 0.8; + /// Per-element contact radii (used when checking atom-type-based distance criteria). std::vector radii; + /// @brief Create a contact search with the given search radius. + /// @param radius Maximum contact distance in Angstroms. ContactSearch(double radius) noexcept : search_radius(radius) {} - // a helper function that sets per-atom radii basing on covalent_radius() + /// @brief Populate per-element radii based on covalent radii with scaling. + /// @param multiplier Scaling factor applied to covalent radius. + /// @param tolerance Constant tolerance added to the scaled radius. void setup_atomic_radii(double multiplier, double tolerance) { radii.resize((size_t)El::END); for (int i = 0; i != (int) El::END; ++i) radii[i] = float(multiplier * Element(i).covalent_r() + tolerance / 2); } + + /// @brief Get the contact radius for a given element. + /// @param el Chemical element. + /// @return Contact radius for the element (0.f if radii not set up). float get_radius(El el) const { return radii.empty() ? 0.f : radii[(int)el]; } + + /// @brief Override the contact radius for a specific element. + /// @param el Chemical element. + /// @param r New contact radius in Angstroms. void set_radius(El el, float r) { if (!radii.empty()) radii[(int)el] = r; } + /// @brief Iterate over all contacts, applying ignore logic and calling a function for each. + /// @tparam Func Callable type with signature void(const CRA&, const CRA&, int, double) + /// for (partner1, partner2, image_idx, dist_sq). + /// @param ns NeighborSearch object containing the indexed atoms. + /// @param func Function to call for each contact found. template void for_each_contact(NeighborSearch& ns, const Func& func); + /// @brief Result of a contact search between two atoms. struct Result { - CRA partner1, partner2; + /// CRA (chain/residue/atom) reference of the first partner. + CRA partner1; + /// CRA (chain/residue/atom) reference of the second partner. + CRA partner2; + /// Crystallographic image index for partner2 (0 = same ASU). int image_idx; + /// Squared distance between the partners in Angstroms squared. double dist_sq; }; + + /// @brief Collect and return all contacts as a vector of Result. + /// @param ns NeighborSearch object containing the indexed atoms. + /// @return Vector of Result structs representing all found contacts. std::vector find_contacts(NeighborSearch& ns) { std::vector out; for_each_contact(ns, [&out](const CRA& cra1, const CRA& cra2, From e7f8251b84ac182ccf908b3ad57154dcbf8da4ce Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:58:42 +0200 Subject: [PATCH 09/19] docs(assembly.hpp): add full Doxygen API documentation --- include/gemmi/assembly.hpp | 157 ++++++++++++++++++++++++++++++++++--- 1 file changed, 148 insertions(+), 9 deletions(-) diff --git a/include/gemmi/assembly.hpp b/include/gemmi/assembly.hpp index 5032fadfa..ad7358fdd 100644 --- a/include/gemmi/assembly.hpp +++ b/include/gemmi/assembly.hpp @@ -13,24 +13,60 @@ namespace gemmi { -enum class HowToNameCopiedChain { Short, AddNumber, Dup }; +/// Naming strategy for chains copied during assembly or NCS expansion. +/// +/// @brief Specifies how to name copied chains to ensure uniqueness. +enum class HowToNameCopiedChain { + /// Use 1–2 character chain names cycling through A–Z, a–z, 0–9 + Short, + /// Append a numeric suffix to the original chain name + AddNumber, + /// Keep the original chain name (may result in duplicates) + Dup +}; +/// Utility to generate unique chain names for assembly and NCS expansion. +/// +/// Manages a set of used chain names and provides methods to generate new, +/// unique names according to the specified naming strategy. struct ChainNameGenerator { + /// The naming strategy to apply HowToNameCopiedChain how; + /// List of already-assigned chain names to avoid duplicates std::vector used_names; + /// Initialize with a naming strategy. + /// + /// @param how_ The chain naming strategy to use ChainNameGenerator(HowToNameCopiedChain how_) : how(how_) {} + + /// Initialize with chain names pre-populated from a Model. + /// + /// @param model The model to extract existing chain names from + /// @param how_ The chain naming strategy to use ChainNameGenerator(const Model& model, HowToNameCopiedChain how_) : how(how_) { if (how != HowToNameCopiedChain::Dup) for (const Chain& chain : model.chains) used_names.push_back(chain.name); } + + /// Register a chain name if not already used. + /// + /// @param name The name to register + /// @return true if the name was successfully added, false if already in used_names bool try_add(const std::string& name) { if (in_vector(name, used_names)) return false; used_names.push_back(name); return true; } + + /// Generate a unique 1–2 character name starting from preferred. + /// + /// Cycles through A–Z, a–z, 0–9 to find an available name. + /// + /// @param preferred The preferred 1-character name to try first + /// @return A unique short name std::string make_short_name(const std::string& preferred) { static const char symbols[] = { 'A','B','C','D','E','F','G','H','I','J','K','L','M', @@ -59,6 +95,13 @@ struct ChainNameGenerator { fail("run out of 1- and 2-letter chain names"); } + /// Generate a name by appending a numeric postfix to a base name. + /// + /// Increments the numeric suffix until a unique name is found. + /// + /// @param base The base chain name + /// @param n The starting number to append + /// @return A unique name of the form "base" + number std::string make_name_with_numeric_postfix(const std::string& base, int n) { std::string name = base; name += std::to_string(n); @@ -69,6 +112,14 @@ struct ChainNameGenerator { return name; } + /// Generate a new name according to the configured naming strategy. + /// + /// Dispatches to make_short_name(), make_name_with_numeric_postfix(), + /// or returns the original name based on the how field. + /// + /// @param old The original chain name + /// @param n The numeric suffix (used only for AddNumber strategy) + /// @return A new unique chain name std::string make_new_name(const std::string& old, int n) { switch (how) { case HowToNameCopiedChain::Short: return make_short_name(old); @@ -79,6 +130,13 @@ struct ChainNameGenerator { } }; +/// Rename a chain to ensure its name is unique within the model. +/// +/// Uses the Short naming strategy to generate a 1–2 character name +/// that does not conflict with other chains in the model. +/// +/// @param model The model containing the chain +/// @param chain The chain to rename inline void ensure_unique_chain_name(const Model& model, Chain& chain) { ChainNameGenerator namegen(HowToNameCopiedChain::Short); for (const Chain& ch : model.chains) @@ -87,9 +145,28 @@ inline void ensure_unique_chain_name(const Model& model, Chain& chain) { chain.name = namegen.make_short_name(chain.name); } +/// Apply all generators in an assembly to a model to generate a biological unit. +/// +/// Creates a new Model containing copies of the input model transformed +/// according to each generator in the assembly. Chain names are made unique +/// according to the specified strategy. +/// +/// @param assembly The assembly record containing transformation operators +/// @param model The input model to transform +/// @param how The strategy for naming copied chains +/// @param logging Logger for warning messages +/// @return A new Model containing the assembled biological unit GEMMI_DLL Model make_assembly(const Assembly& assembly, const Model& model, HowToNameCopiedChain how, const Logger& logging); +/// Create a pseudo-assembly representing all unit-cell images. +/// +/// Generates an Assembly whose operators produce all crystallographic +/// images of the structure within the unit cell. Useful for packing displays +/// and structure visualization. +/// +/// @param cell The unit cell with precomputed image transformations +/// @return An Assembly object whose generators apply all unit-cell symmetry inline Assembly pseudo_assembly_for_unit_cell(const UnitCell& cell) { Assembly assembly("unit_cell"); std::vector operators(cell.images.size() + 1); @@ -102,36 +179,98 @@ inline Assembly pseudo_assembly_for_unit_cell(const UnitCell& cell) { return assembly; } -/// If called with assembly_name="unit_cell" changes structure to unit cell (P1). -/// \par keep_spacegroup preserves space group and unit cell - is it needed? +/// Modify a Structure in-place to contain only a named assembly. +/// +/// Replaces all models in the structure with the expanded assembly, +/// optionally preserving the space group. If assembly_name="unit_cell", +/// converts the structure to the unit cell (P1). +/// +/// @param st The structure to modify in-place +/// @param assembly_name The name of the assembly to apply +/// @param how The strategy for naming copied chains +/// @param logging Logger for warning messages +/// @param keep_spacegroup If true, preserve the original space group and unit cell +/// @param merge_dist Distance threshold for merging nearby atoms (default 0.2 Å) GEMMI_DLL void transform_to_assembly(Structure& st, const std::string& assembly_name, HowToNameCopiedChain how, const Logger& logging, bool keep_spacegroup=false, double merge_dist=0.2); +/// Expand a model by applying NCS operations. +/// +/// Generates copies of the input model by applying each NCS (non-crystallographic symmetry) +/// operation, returning a new Model with all copies combined. +/// +/// @param model The input model to expand +/// @param ncs Vector of NCS operations to apply +/// @param how The strategy for naming copied chains +/// @return A new Model containing the original and all NCS-transformed copies GEMMI_DLL Model expand_ncs_model(const Model& model, const std::vector& ncs, HowToNameCopiedChain how); -/// Searches and merges overlapping equivalent atoms from different chains. -/// To be used after expand_ncs() and make_assembly(). +/// Merge atoms within a distance threshold after NCS or assembly expansion. +/// +/// Searches for overlapping atoms from different chains that are equivalent +/// under the unit cell symmetry and merges them. Typically used after +/// expand_ncs() and make_assembly() to eliminate duplicate atoms. +/// +/// @param model The model to modify in-place +/// @param cell The unit cell (used for distance calculations) +/// @param max_dist Maximum distance for merging atoms (default 0.2 Å) +/// @param compare_serial If true, only merge atoms with matching serial numbers (default true) GEMMI_DLL void merge_atoms_in_expanded_model(Model& model, const UnitCell& cell, double max_dist=0.2, bool compare_serial=true); +/// Rename all chains to use the shortest possible unique names. +/// +/// Replaces chain names with 1–2 character names (A–Z, a–z, 0–9) while +/// maintaining uniqueness within the structure. +/// +/// @param st The structure to modify in-place GEMMI_DLL void shorten_chain_names(Structure& st); +/// Expand NCS operations in all models of a structure in-place. +/// +/// Applies all NCS (non-crystallographic symmetry) operations to every model +/// in the structure, generating copies and optionally merging nearby atoms. +/// +/// @param st The structure to expand in-place +/// @param how The strategy for naming copied chains +/// @param merge_dist Distance threshold for merging atoms (default 0.2 Å) GEMMI_DLL void expand_ncs(Structure& st, HowToNameCopiedChain how, double merge_dist=0.2); -/// HowToNameCopiedChain::Dup adds segment name to chain name +/// Split chains at residue segment boundaries into separate chains. +/// +/// Divides each chain into multiple chains at segment breaks, with new chain +/// names generated according to the strategy. If how=HowToNameCopiedChain::Dup, +/// the segment name is appended to the chain name. +/// +/// @param model The model to modify in-place +/// @param how The strategy for naming split chains GEMMI_DLL void split_chains_by_segments(Model& model, HowToNameCopiedChain how); -/// Finds nearby crystallographic symmetry images for viewer use. -/// The search uses only the first model and returns non-identity images. +/// Find all crystallographic symmetry images within a distance radius. +/// +/// Searches for non-identity symmetry images of the structure around a given +/// position. Uses only the first model and is intended for viewer applications. +/// +/// @param st The structure to query +/// @param pos The center position for the search +/// @param radius The search radius in Ångströms +/// @return Vector of NearestImage objects describing the nearby symmetry images GEMMI_DLL std::vector get_nearby_sym_ops(const Structure& st, const Position& pos, double radius); -/// Returns a copy of the structure transformed to the requested symmetry image. +/// Extract a symmetry image of the structure as a new Structure object. +/// +/// Creates and returns a copy of the input structure transformed to the +/// coordinates of the specified symmetry image. +/// +/// @param st The input structure +/// @param image The symmetry image specification +/// @return A new Structure object containing the transformed copy GEMMI_DLL Structure get_sym_image(const Structure& st, const NearestImage& image); } // namespace gemmi From 7a53ccf1ff55322e74c371a55374b09134ca68c0 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 22:59:41 +0200 Subject: [PATCH 10/19] docs(select.hpp): add full Doxygen API documentation --- include/gemmi/select.hpp | 221 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 218 insertions(+), 3 deletions(-) diff --git a/include/gemmi/select.hpp b/include/gemmi/select.hpp index 51be4112c..2749c64ff 100644 --- a/include/gemmi/select.hpp +++ b/include/gemmi/select.hpp @@ -18,18 +18,40 @@ namespace gemmi { // /mdl/chn/*(res).ic/at[el]:aloc // +/// Parsed CCP4-style atom/residue/chain selection expression. +/// +/// Represents a selection filter such as "//A/10-20/CA[C].B" for filtering +/// atoms, residues, chains, and models from a structure. Provides both +/// individual match predicates and FilterProxy views for convenient iteration +/// over matching elements. struct GEMMI_DLL Selection { + /// Set of allowed names with optional inversion. + /// + /// Represents comma-separated lists like "ALA,GLY" with optional + /// inversion to mean "everything except". struct List { + /// If true, matches everything (corresponding to "*") bool all = true; + /// If true, the match logic is inverted (names NOT in list) bool inverted = false; - std::string list; // comma-separated + /// Comma-separated list of names + std::string list; + /// Get the canonical string representation. + /// + /// @return "*" if all=true, else the list optionally prefixed with "!" std::string str() const { if (all) return "*"; return inverted ? "!" + list : list; } + /// Check if a name matches this list. + /// + /// Accounts for the all and inverted flags. + /// + /// @param name The name to check + /// @return true if the name matches according to all and inverted settings bool has(const std::string& name) const { if (all) return true; @@ -38,8 +60,20 @@ struct GEMMI_DLL Selection { } }; + /// Matches a single character flag against a pattern string. + /// + /// Supports optional inversion with a leading '!' character. struct FlagList { + /// Flag characters; may be prefixed with '!' to invert the match std::string pattern; + + /// Check if a flag appears in the pattern. + /// + /// If pattern begins with '!', returns true if the flag does NOT appear + /// in the rest of the pattern. + /// + /// @param flag The flag character to check + /// @return true if the flag matches the pattern (or does not match if inverted) bool has(char flag) const { if (pattern.empty()) return true; @@ -49,16 +83,35 @@ struct GEMMI_DLL Selection { } }; + /// A residue sequence position for range matching. + /// + /// Represents a single point in the residue sequence, with an optional + /// insertion code. Special values INT_MIN and INT_MAX represent unset bounds. struct SequenceId { + /// Integer sequence number (INT_MIN = unset lower bound, INT_MAX = unset upper bound) int seqnum; + /// Insertion code character ('*' = wildcard) char icode; + /// Check if this sequence ID is unset. + /// + /// @return true if seqnum is INT_MIN or INT_MAX bool empty() const { return seqnum == INT_MIN || seqnum == INT_MAX; } + /// Get the string representation of this sequence ID. + /// + /// @return String like "10" or "10A" depending on icode std::string str() const; + /// Compare this sequence ID to another SeqId. + /// + /// Compares first by sequence number, then by insertion code if needed. + /// The wildcard '*' for icode matches any insertion code. + /// + /// @param seqid The SeqId to compare to + /// @return negative if less than, 0 if equal, positive if greater than seqid int compare(const SeqId& seqid) const { if (seqnum != *seqid.num) return seqnum < *seqid.num ? -1 : 1; @@ -68,11 +121,25 @@ struct GEMMI_DLL Selection { } }; + /// A numeric filter on atom properties (occupancy or B-factor). + /// + /// Represents a constraint like "q>0.5" (occupancy greater than 0.5) + /// or "b<30" (B-factor less than 30). struct AtomInequality { + /// Property character: 'q' for occupancy, 'b' for B-factor char property; + /// Relation: -1 for less than, 0 for equals, +1 for greater than int relation; + /// The threshold value to compare against double value; + /// Check if an atom satisfies this inequality. + /// + /// Extracts the specified property from the atom and compares it + /// to the threshold using the specified relation. + /// + /// @param a The atom to check + /// @return true if the atom's property satisfies the inequality bool matches(const Atom& a) const { double atom_value = 0.; if (property == 'q') @@ -86,36 +153,89 @@ struct GEMMI_DLL Selection { return atom_value == value; } + /// Get the string representation of this inequality. + /// + /// @return String like "q>0.5" or "b<30" std::string str() const; }; - int mdl = 0; // 0 = all + /// Model number to select (0 = all models) + int mdl = 0; + /// List of allowed chain IDs List chain_ids; + /// Start of sequence ID range (inclusive) SequenceId from_seqid = {INT_MIN, '*'}; + /// End of sequence ID range (inclusive) SequenceId to_seqid = {INT_MAX, '*'}; + /// List of allowed residue names (3-letter codes) List residue_names; + /// List of allowed entity type strings List entity_types; - // array corresponding to enum EntityType + /// Flags array for entity type matching (corresponds to enum EntityType) std::array et_flags; + /// List of allowed atom names List atom_names; + /// Vector of allowed element numbers std::vector elements; + /// List of allowed alternate conformer IDs List altlocs; + /// Flag-based filter for residues FlagList residue_flags; + /// Flag-based filter for atoms FlagList atom_flags; + /// Vector of numeric property filters std::vector atom_inequalities; + /// Default constructor. Selection() = default; + + /// Parse a CCP4 selection string. + /// + /// Parses a selection string like "//A/10-20/CA[C].B" into the + /// selection criteria. + /// + /// @param cid The CCP4-style selection string Selection(const std::string& cid); + /// Get the canonical string representation of this selection. + /// + /// @return String representation of the parsed selection std::string str() const; + /// Check if a structure matches this selection. + /// + /// Structures always match (they are never filtered). + /// + /// @param s The structure to check + /// @return Always true bool matches(const Structure&) const { return true; } + + /// Check if a model matches this selection. + /// + /// Matches if the model number is 0 (select all) or matches mdl. + /// + /// @param model The model to check + /// @return true if the model is included in this selection bool matches(const Model& model) const { return mdl == 0 || mdl == model.num; } + + /// Check if a chain matches this selection. + /// + /// Matches if the chain ID is in the allowed list. + /// + /// @param chain The chain to check + /// @return true if the chain is included in this selection bool matches(const Chain& chain) const { return chain_ids.has(chain.name); } + + /// Check if a residue matches this selection. + /// + /// Matches residue name, sequence ID range, entity type, and flags. + /// + /// @param res The residue to check + /// @return true if the residue is included in this selection bool matches(const Residue& res) const { return (entity_types.all || et_flags[(int)res.entity_type]) && residue_names.has(res.name) && @@ -123,6 +243,13 @@ struct GEMMI_DLL Selection { to_seqid.compare(res.seqid) >= 0 && residue_flags.has(res.flag); } + + /// Check if an atom matches this selection. + /// + /// Matches atom name, element, altloc, flags, and numeric inequalities. + /// + /// @param a The atom to check + /// @return true if the atom is included in this selection bool matches(const Atom& a) const { return atom_names.has(a.name) && (elements.empty() || elements[a.element.ordinal()]) && @@ -131,25 +258,58 @@ struct GEMMI_DLL Selection { std::all_of(atom_inequalities.begin(), atom_inequalities.end(), [&](const AtomInequality& i) { return i.matches(a); }); } + + /// Check if a chain-residue-atom triplet matches this selection. + /// + /// Matches if all non-null components pass their respective match tests. + /// + /// @param cra The CRA structure to check + /// @return true if the CRA is included in this selection bool matches(const CRA& cra) const { return (cra.chain == nullptr || matches(*cra.chain)) && (cra.residue == nullptr || matches(*cra.residue)) && (cra.atom == nullptr || matches(*cra.atom)); } + /// Get a filtered view over models in a structure. + /// + /// @param st The structure to filter + /// @return A FilterProxy that iterates over matching models FilterProxy models(Structure& st) const { return {*this, st.models}; } + + /// Get a filtered view over chains in a model. + /// + /// @param model The model to filter + /// @return A FilterProxy that iterates over matching chains FilterProxy chains(Model& model) const { return {*this, model.chains}; } + + /// Get a filtered view over residues in a chain. + /// + /// @param chain The chain to filter + /// @return A FilterProxy that iterates over matching residues FilterProxy residues(Chain& chain) const { return {*this, chain.residues}; } + + /// Get a filtered view over atoms in a residue. + /// + /// @param residue The residue to filter + /// @return A FilterProxy that iterates over matching atoms FilterProxy atoms(Residue& residue) const { return {*this, residue.atoms}; } + /// Find the first matching atom in a model. + /// + /// Searches through the model's chains and residues to find the first + /// atom that matches this selection. + /// + /// @param model The model to search + /// @return A CRA with the first match, or {nullptr, nullptr, nullptr} if not found CRA first_in_model(Model& model) const { if (matches(model)) for (Chain& chain : model.chains) { @@ -165,6 +325,12 @@ struct GEMMI_DLL Selection { return {nullptr, nullptr, nullptr}; } + /// Find the first matching atom in a structure. + /// + /// Searches through all models to find the first matching atom. + /// + /// @param st The structure to search + /// @return A pair of {Model*, CRA} with the first match, or {nullptr, empty_CRA} if not found std::pair first(Structure& st) const { for (Model& model : st.models) { CRA cra = first_in_model(model); @@ -174,6 +340,14 @@ struct GEMMI_DLL Selection { return {nullptr, {nullptr, nullptr, nullptr}}; } + /// Recursively copy matching children from orig into target. + /// + /// Used internally by copy_selection() to recursively populate a target + /// structure with only the matching elements. + /// + /// @tparam T The type of element (Model, Chain, Residue, or Atom) + /// @param orig The original element to copy from + /// @param target The target element to copy into template void add_matching_children(const T& orig, T& target) const { for (const auto& orig_child : orig.children()) @@ -182,17 +356,36 @@ struct GEMMI_DLL Selection { add_matching_children(orig_child, target.children().back()); } } + + /// Specialization for Atom (leaf node). void add_matching_children(const Atom&, Atom&) const {} + /// Set the residue flag filter pattern. + /// + /// @param pattern Flag pattern string (may be prefixed with '!' to invert) + /// @return *this for method chaining Selection& set_residue_flags(const std::string& pattern) { residue_flags.pattern = pattern; return *this; } + + /// Set the atom flag filter pattern. + /// + /// @param pattern Flag pattern string (may be prefixed with '!' to invert) + /// @return *this for method chaining Selection& set_atom_flags(const std::string& pattern) { atom_flags.pattern = pattern; return *this; } + /// Create a copy of an element containing only matching children. + /// + /// Returns a copy of orig with all non-matching children filtered out. + /// Recursively applies the filter to nested children. + /// + /// @tparam T The type of element (Structure, Model, Chain, or Residue) + /// @param orig The original element to copy + /// @return A new element containing only matching children template T copy_selection(const T& orig) const { T copied = orig.empty_copy(); @@ -200,6 +393,14 @@ struct GEMMI_DLL Selection { return copied; } + /// Remove all matching atoms or residues in-place. + /// + /// Recursively removes all matching children from the element. + /// After removing all matching children from a child, also removes + /// any empty children. + /// + /// @tparam T The type of element (Model, Chain, or Residue) + /// @param t The element to modify in-place template void remove_selected(T& t) const { for (auto& child : t.children()) @@ -208,6 +409,11 @@ struct GEMMI_DLL Selection { vector_remove_if(t.children(), [&](typename T::child_type& c) { return c.children().empty(); }); } + + /// Specialization for Residue: remove matching atoms. + /// + /// Optimized version that clears all atoms if the selection matches + /// all atoms, otherwise removes atoms one by one. void remove_selected(Residue& res) const { if (atom_names.all && elements.empty() && altlocs.all && atom_flags.pattern.empty() && atom_inequalities.empty()) @@ -216,12 +422,21 @@ struct GEMMI_DLL Selection { vector_remove_if(res.atoms, [&](Atom& c) { return matches(c); }); } + /// Remove all non-matching atoms or residues in-place. + /// + /// Recursively removes all non-matching children from the element, + /// then recurses into the remaining children. + /// + /// @tparam T The type of element (Model, Chain, or Residue) + /// @param t The element to modify in-place template void remove_not_selected(T& t) const { vector_remove_if(t.children(), [&](typename T::child_type& c) { return !matches(c); }); for (auto& child : t.children()) remove_not_selected(child); } + + /// Specialization for Atom (leaf node, no-op). void remove_not_selected(Atom&) const {} }; From 1cc7c730f1ebf102064235dc102d666b7fa4dc2b Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:00:53 +0200 Subject: [PATCH 11/19] docs: fix missing @brief tag on ChainNameGenerator in assembly.hpp Co-Authored-By: Claude Sonnet 4.6 --- include/gemmi/assembly.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gemmi/assembly.hpp b/include/gemmi/assembly.hpp index ad7358fdd..594f65ea7 100644 --- a/include/gemmi/assembly.hpp +++ b/include/gemmi/assembly.hpp @@ -25,7 +25,7 @@ enum class HowToNameCopiedChain { Dup }; -/// Utility to generate unique chain names for assembly and NCS expansion. +/// @brief Utility to generate unique chain names for assembly and NCS expansion. /// /// Manages a set of used chain names and provides methods to generate new, /// unique names according to the specified naming strategy. From 2c7d894d75108b525641d1bd92f45b8fafb18c32 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:02:51 +0200 Subject: [PATCH 12/19] docs(modify.hpp): add full Doxygen API documentation --- include/gemmi/modify.hpp | 106 ++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/include/gemmi/modify.hpp b/include/gemmi/modify.hpp index 6d20f5607..97d5b69ac 100644 --- a/include/gemmi/modify.hpp +++ b/include/gemmi/modify.hpp @@ -14,6 +14,10 @@ namespace gemmi { /// Remove alternative conformations. +/// Recursively removes all alternative conformations, keeping only the first +/// (altloc='A' or blank). For Chain level, keeps one representative per residue seqid. +/// For Residue level, removes duplicate atoms by name. +/// @tparam T Model, Chain, or Residue template void remove_alternative_conformations(T& obj) { for (auto& child : obj.children()) remove_alternative_conformations(child); @@ -39,7 +43,9 @@ template<> inline void remove_alternative_conformations(Chain& chain) { } } -/// Remove hydrogens. +/// Remove hydrogens and deuterium atoms. +/// Recursively removes all H and D atoms from the structure. +/// @tparam T Model, Chain, or Residue template void remove_hydrogens(T& obj) { for (auto& child : obj.children()) remove_hydrogens(child); @@ -53,6 +59,10 @@ template<> inline void remove_hydrogens(Residue& res) { /// Set isotropic ADP to the range (b_min, b_max). Values smaller than /// b_min are changed to b_min, values larger than b_max to b_max. /// Anisotropic ADP is left unchanged. +/// @tparam T Model, Chain, Residue, or Atom +/// @param obj object to modify +/// @param b_min minimum B-factor value +/// @param b_max maximum B-factor value template void assign_b_iso(T& obj, float b_min, float b_max) { for (auto& child : obj.children()) assign_b_iso(child, b_min, b_max); @@ -61,7 +71,9 @@ template<> inline void assign_b_iso(Atom& atom, float b_min, float b_max) { atom.b_iso = clamp(atom.b_iso, b_min, b_max); } -/// Remove anisotropic ADP +/// Remove anisotropic displacement parameters. +/// Recursively zeroes all anisotropic ADP tensors (u11, u22, u33, u12, u13, u23). +/// @tparam T Model, Chain, Residue, or Atom template void remove_anisou(T& obj) { for (auto& child : obj.children()) remove_anisou(child); @@ -70,7 +82,9 @@ template<> inline void remove_anisou(Atom& atom) { atom.aniso = {0, 0, 0, 0, 0, 0}; } -/// Set absent ANISOU to value from B_iso +/// Set absent ANISOU records to isotropic values derived from B_iso. +/// For atoms without anisotropic ADP, creates isotropic tensor U = B_iso/(8π²). +/// @tparam T Model, Chain, Residue, or Atom template void ensure_anisou(T& obj) { for (auto& child : obj.children()) ensure_anisou(child); @@ -82,7 +96,12 @@ template<> inline void ensure_anisou(Atom& atom) { } } -/// apply Transform to both atom's position and ADP +/// Apply a Transform to atomic positions and anisotropic displacement parameters. +/// Recursively applies the given transformation to all atom coordinates and +/// congruence-transforms anisotropic ADPs. +/// @tparam T Model, Chain, Residue, or Atom +/// @param obj object to transform +/// @param tr transformation to apply template void transform_pos_and_adp(T& obj, const Transform& tr) { for (auto& child : obj.children()) transform_pos_and_adp(child, tr); @@ -93,7 +112,10 @@ template<> inline void transform_pos_and_adp(Atom& atom, const Transform& tr) { atom.aniso = atom.aniso.transformed_by(tr.mat); } -/// set atom site serial numbers to 1, 2, ..., optionally leaving gaps for TERs +/// Assign atom site serial numbers (1, 2, 3, ...) sequentially. +/// Optionally leaves gaps at chain ends for TER records if numbering polymer chains. +/// @param model model containing chains +/// @param numbered_ter if true, increment serial after last polymer residue in each chain inline void assign_serial_numbers(Model& model, bool numbered_ter=false) { int serial = 0; for (Chain& chain : model.chains) @@ -105,17 +127,22 @@ inline void assign_serial_numbers(Model& model, bool numbered_ter=false) { ++serial; } } + +/// Assign atom site serial numbers to all models in a structure. +/// @param st structure containing models +/// @param numbered_ter if true, increment serial after last polymer residue in each chain inline void assign_serial_numbers(Structure& st, bool numbered_ter=false) { for (Model& model : st.models) assign_serial_numbers(model, numbered_ter); } -/// Helper function for processing (usually: changing) names and numbers -/// in AtomAddress instances in metadata: -/// Connection, CisPep, StructSite, Helix, Sheet::Strand. -/// Other fields are not updated here, in particular: ModRes, Entity::DbRef, -/// Entity::full_sequence, TlsGroup::Selection. +/// Apply a function to all AtomAddress references in structure metadata. +/// Processes atom addresses in Connection, CisPep, StructSite, Helix, and Sheet records. +/// Does not update ModRes, Entity::DbRef, Entity::full_sequence, or TlsGroup::Selection. +/// @tparam Func callable taking an AtomAddress& parameter +/// @param st structure whose metadata will be processed +/// @param func function to apply to each AtomAddress template void process_addresses(Structure& st, Func func) { for (Connection& con : st.connections) { @@ -144,9 +171,13 @@ void process_addresses(Structure& st, Func func) { } } -/// Takes func(const std::string& chain_name, gemmi::SeqId& seqid). -/// It doesn't process Entity::DbRef::seq_begin/seq_end (b/c there is no -/// single corresponding chain name). +/// Apply a function to all SeqId references in structure metadata. +/// Processes sequence IDs in Connection/CisPep/StructSite/Helix/Sheet records, +/// ModRes entries, and TlsGroup selections. +/// Does not process Entity::DbRef::seq_begin/seq_end (no single chain name). +/// @tparam Func callable taking (const std::string& chain_name, SeqId& seqid) +/// @param st structure whose metadata will be processed +/// @param func function to apply to each (chain_name, seqid) pair template void process_sequence_ids(Structure& st, Func func) { process_addresses(st, [&](AtomAddress& aa) { func(aa.chain_name, aa.res_id.seqid); }); @@ -160,6 +191,11 @@ void process_sequence_ids(Structure& st, Func func) { } } +/// Rename a chain throughout the structure. +/// Updates all occurrences of old_name to new_name in models and metadata. +/// @param st structure to modify +/// @param old_name current chain name +/// @param new_name new chain name inline void rename_chain(Structure& st, const std::string& old_name, const std::string& new_name) { auto update = [&](std::string& name) { @@ -178,6 +214,12 @@ inline void rename_chain(Structure& st, const std::string& old_name, update(chain.name); } +/// Rename residues throughout the structure. +/// Updates all residues named old_name to new_name in models and metadata. +/// Also updates Entity sequences and StructSite member records. +/// @param st structure to modify +/// @param old_name current residue name +/// @param new_name new residue name inline void rename_residues(Structure& st, const std::string& old_name, const std::string& new_name) { auto update = [&](ResidueId& rid) { @@ -211,6 +253,12 @@ inline void rename_residues(Structure& st, const std::string& old_name, } +/// Rename atoms in residues of a specific type. +/// For residues named res_name, renames atoms according to the old→new map. +/// Updates both atomic coordinates and metadata references. +/// @param st structure to modify +/// @param res_name residue type to target +/// @param old_new map from old atom names to new atom names inline void rename_atom_names(Structure& st, const std::string& res_name, const std::map& old_new) { auto update = [&old_new](std::string& name) { @@ -232,6 +280,10 @@ inline void rename_atom_names(Structure& st, const std::string& res_name, } +/// Expand deuterium-fraction representation into explicit H/D alternate conformations. +/// Converts H atoms with non-zero d_fraction into H/D altloc pairs. +/// If d_fraction >= 1, converts to pure D; otherwise creates H altloc 'A' and D altloc 'D'. +/// @param res residue containing hydrogens to expand inline void replace_d_fraction_with_altlocs(Residue& res) { for (size_t i = res.atoms.size(); i-- != 0; ) { Atom& atom = res.atoms[i]; @@ -263,6 +315,11 @@ inline void replace_d_fraction_with_altlocs(Residue& res) { } } +/// Contract explicit D atoms into H atoms with fractional occupancy. +/// Merges H and D atoms at the same position into a single H atom, +/// storing the D occupancy as the fraction field. +/// @param res residue containing deuterium atoms to contract +/// @return true if any D atoms were found and processed, false otherwise inline bool replace_deuterium_with_fraction(Residue& res) { bool found = false; for (auto d = res.atoms.end(); d-- != res.atoms.begin(); ) @@ -294,11 +351,13 @@ inline bool replace_deuterium_with_fraction(Residue& res) { return found; } -/// Hydrogens modelled as H/D mixture (altlocs H and D with the same position -/// and ADP, but with refined fraction of D), it can be stored in mmCIF either -/// as two atoms (H and D) or, using CCP4/Refmac extension, as H atoms with -/// the ccp4_deuterium_fraction parameter. -/// This function switches fraction <-> altlocs +/// Toggle deuterium representation between fraction and explicit altlocs. +/// Hydrogens modelled as H/D mixtures can be stored either as: +/// - Two atoms with altlocs H and D (same position/ADP, different occupancies) +/// - Single H atom with ccp4_deuterium_fraction parameter (CCP4/Refmac extension) +/// This function converts between the two representations. +/// @param st structure to modify +/// @param store_fraction if true, use fraction representation; otherwise use altlocs inline void store_deuterium_as_fraction(Structure& st, bool store_fraction) { if (st.has_d_fraction == store_fraction) return; @@ -314,6 +373,10 @@ inline void store_deuterium_as_fraction(Structure& st, bool store_fraction) { } } +/// Set the deuterium fraction of all hydrogen atoms. +/// Assigns the fraction field of all H atoms in the structure to d_fract. +/// @param st structure to modify +/// @param d_fract deuterium fraction to assign (0.0 to 1.0) inline void set_deuterium_fraction_of_hydrogens(Structure& st, float d_fract) { st.has_d_fraction = true; for (Model& model : st.models) @@ -324,7 +387,12 @@ inline void set_deuterium_fraction_of_hydrogens(Structure& st, float d_fract) { atom.fraction = d_fract; } -/// Convert coordinates to the standard coordinate system for the unit cell. +/// Transform structure to the standard crystallographic frame. +/// Converts to standard coordinates where the a-axis is along x, +/// the b-axis is in the xy-plane, and c-axis points along +z. +/// Updates ORIGX matrices and NCS operators accordingly. +/// Only operates on crystal structures with explicit transformation matrices. +/// @param st structure to transform inline void standardize_crystal_frame(Structure& st) { if (!st.cell.explicit_matrices || !st.cell.is_crystal()) return; From 26f37f8455bf1050e9021f049c83c6ed5dbfd752 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:04:24 +0200 Subject: [PATCH 13/19] docs(polyheur.hpp): add full Doxygen API documentation --- include/gemmi/polyheur.hpp | 175 ++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 34 deletions(-) diff --git a/include/gemmi/polyheur.hpp b/include/gemmi/polyheur.hpp index 6186e79ff..8aa2a289b 100644 --- a/include/gemmi/polyheur.hpp +++ b/include/gemmi/polyheur.hpp @@ -12,12 +12,21 @@ namespace gemmi { -// A simplistic classification. It may change in the future. -// It returns PolymerType which corresponds to _entity_poly.type, -// but here we use only PeptideL, Rna, Dna, DnaRnaHybrid and Unknown. +/// Classify a residue span as peptide, RNA, DNA, or other polymer type. +/// A simplistic heuristic classification based on residue names. +/// Returns PolymerType (PeptideL, PeptideD, Rna, Dna, DnaRnaHybrid, or Unknown). +/// @param span sequence of residues to classify +/// @param ignore_entity_type if false, uses Entity type if known and consistent +/// @return PolymerType classification GEMMI_DLL PolymerType check_polymer_type(const ConstResidueSpan& span, bool ignore_entity_type=false); +/// Determine polymer type from entity or by classification. +/// Returns the entity's polymer_type if known and non-Unknown, +/// otherwise classifies the residue span. +/// @param ent entity with known polymer type (may be nullptr) +/// @param polymer residue span to classify if entity type unknown +/// @return PolymerType classification inline PolymerType get_or_check_polymer_type(const Entity* ent, const ConstResidueSpan& polymer) { if (ent && ent->polymer_type != PolymerType::Unknown) @@ -25,8 +34,15 @@ inline PolymerType get_or_check_polymer_type(const Entity* ent, return check_polymer_type(polymer); } +/// Atom name and chemical element pair. +/// Used to represent backbone atom components of a polymer type. struct AtomNameElement { std::string atom_name; El el; }; +/// Get backbone atom names and elements for a polymer type. +/// For peptides: N, CA, C, O. +/// For nucleic acids: P, O5', C5', C4', O4', C3', O3', C2', O2', C1'. +/// @param ptype polymer type (peptide or nucleic acid) +/// @return vector of {atom_name, element} pairs for the backbone inline std::vector get_mainchain_atoms(PolymerType ptype) { if (is_polynucleotide(ptype)) return {{"P", El::P}, {"O5'", El::O}, {"C5'", El::C}, @@ -35,23 +51,46 @@ inline std::vector get_mainchain_atoms(PolymerType ptype) { return {{"N", El::N}, {"CA", El::C}, {"C", El::C}, {"O", El::O}}; } -/// distance-based check for peptide bond +/// Check if two atoms are within peptide bond distance. +/// Tests C(i)–N(i+1) distance < 2.01 Ångströms. +/// @param a1 C atom (may be nullptr) +/// @param a2 N atom (may be nullptr) +/// @return true if both atoms exist and are within bond distance inline bool in_peptide_bond_distance(const Atom* a1, const Atom* a2) { return a1 && a2 && a1->pos.dist_sq(a2->pos) < sq(1.341 * 1.5); } +/// Check if two consecutive residues are bonded by a peptide bond. +/// Tests if the C atom of r1 and N atom of r2 are within peptide bond distance. +/// @param r1 first (C-terminal) residue +/// @param r2 second (N-terminal) residue +/// @return true if peptide bond exists inline bool have_peptide_bond(const Residue& r1, const Residue& r2) { return in_peptide_bond_distance(r1.get_c(), r2.get_n()); } -/// distance-based check for phosphodiester bond between nucleotide +/// Check if two atoms are within nucleotide bond distance. +/// Tests O3'(i)–P(i+1) distance < 2.4 Ångströms. +/// @param a1 O3' atom (may be nullptr) +/// @param a2 P atom (may be nullptr) +/// @return true if both atoms exist and are within bond distance inline bool in_nucleotide_bond_distance(const Atom* a1, const Atom* a2) { return a1 && a2 && a1->pos.dist_sq(a2->pos) < sq(1.6 * 1.5); } +/// Check if two consecutive residues are bonded by a phosphodiester bond. +/// Tests if the O3' atom of r1 and P atom of r2 are within nucleotide bond distance. +/// @param r1 first residue +/// @param r2 second (following) residue +/// @return true if phosphodiester bond exists inline bool have_nucleotide_bond(const Residue& r1, const Residue& r2) { return in_nucleotide_bond_distance(r1.get_o3prim(), r2.get_p()); } -/// check C-N or O3'-P distance +/// Strict connectivity check using actual bond atoms. +/// For peptides: tests C-N distance; for nucleotides: tests O3'-P distance. +/// @param r1 first residue +/// @param r2 second residue +/// @param ptype polymer type (peptide or nucleic acid) +/// @return true if residues are bonded inline bool are_connected(const Residue& r1, const Residue& r2, PolymerType ptype) { if (is_polypeptide(ptype)) return have_peptide_bond(r1, r2); @@ -60,7 +99,13 @@ inline bool are_connected(const Residue& r1, const Residue& r2, PolymerType ptyp return false; } -/// are_connected2() is less exact, but requires only CA (or P) atoms. +/// Loose connectivity check using only alpha-carbon or phosphorus atoms. +/// For peptides: tests Cα–Cα < 5 Å; for nucleotides: tests P–P < 7.5 Å. +/// Requires only Cα or P atoms, not full backbone atoms. +/// @param r1 first residue +/// @param r2 second residue +/// @param ptype polymer type (peptide or nucleic acid) +/// @return true if residues appear connected by distance inline bool are_connected2(const Residue& r1, const Residue& r2, PolymerType ptype) { auto this_or_first = [](const Atom* a, const Residue& r, El el) -> const Atom* { if (a || r.atoms.empty()) @@ -82,7 +127,13 @@ inline bool are_connected2(const Residue& r1, const Residue& r2, PolymerType pty return false; } -/// are_connected3() = are_connected() + fallback to are_connected2() +/// Connectivity check with fallback from strict to loose criteria. +/// First tries are_connected() with actual bond atoms, +/// then falls back to are_connected2() using Cα or P distances. +/// @param r1 first residue +/// @param r2 second residue +/// @param ptype polymer type (peptide or nucleic acid) +/// @return true if residues are connected by either criterion inline bool are_connected3(const Residue& r1, const Residue& r2, PolymerType ptype) { if (is_polypeptide(ptype)) { if (const Atom* a1 = r1.get_c()) @@ -102,41 +153,67 @@ inline bool are_connected3(const Residue& r1, const Residue& r2, PolymerType pty return false; } +/// Convert polymer residues to one-letter sequence codes. +/// Translates residue names to standard IUPAC single-letter codes. +/// Unknown residues become 'X'. +/// @param polymer sequence of residues to convert +/// @return one-letter sequence string GEMMI_DLL std::string make_one_letter_sequence(const ConstResidueSpan& polymer); -/// Assigns entity_type=Polymer|NonPolymer|Water for each Residue (only -/// for residues with entity_type==Unknown, unless overwrite=true). -/// Determining where the polymer ends and ligands start is sometimes -/// arbitrary -- there can be a non-standard residue at the end that can -/// be regarded as as either the last residue or a linked ligand. +/// Assign entity types to residues in a chain. +/// Assigns entity_type=Polymer|NonPolymer|Water to each residue. +/// Only updates residues with entity_type==Unknown unless overwrite=true. +/// Note: determining polymer/ligand boundary can be ambiguous for non-standard residues. +/// @param chain chain to annotate +/// @param overwrite if true, reassign types even for known residues GEMMI_DLL void add_entity_types(Chain& chain, bool overwrite); + +/// Assign entity types to all chains in a structure. +/// @param st structure to annotate +/// @param overwrite if true, reassign types even for known residues GEMMI_DLL void add_entity_types(Structure& st, bool overwrite); -/// Assigns entity_type=Unknown for all residues. +/// Reset all residue entity types to Unknown. +/// @param st structure to modify GEMMI_DLL void remove_entity_types(Structure& st); -/// Assigns Residue::entity_id based on Residue::subchain and Entity::subchains. +/// Assign entity IDs to residues based on subchain assignments. +/// Links Residue::entity_id to Entity records via Residue::subchain and Entity::subchains. +/// @param st structure to annotate +/// @param overwrite if true, reassign IDs even for residues that already have them GEMMI_DLL void add_entity_ids(Structure& st, bool overwrite); -/// The subchain field in the residue is where we store_atom_site.label_asym_id -/// from mmCIF files. As of 2018 wwPDB software splits author's chains -/// (auth_asym_id) into label_asym_id units: -/// * linear polymer, -/// * non-polymers (each residue has different separate label_asym_id), -/// * and waters. -/// Refmac/makecif is doing similar thing but using different naming and -/// somewhat different rules (it was written in 1990's before PDBx/mmCIF). -/// -/// Here we use naming and rules different from both wwPDB and makecif. -/// Note: call add_entity_types() first. +/// Assign subchain labels to residues in a chain. +/// Splits the chain into segments: linear polymers, non-polymers (each with unique ID), +/// and waters. Stores label_asym_id-like names in Residue::subchain. +/// wwPDB software splits auth_asym_id into label_asym_id units similarly. +/// This function uses compatible but distinct naming and rules. +/// @param chain chain to segment +/// @param nonpolymer_counter incremented for each non-polymer segment (input/output) +/// @note call add_entity_types() first to set entity types correctly GEMMI_DLL void assign_subchain_names(Chain& chain, int& nonpolymer_counter); +/// Assign subchain names to all chains in a structure. +/// Calls assign_subchain_names on each chain to segment into polymer/non-polymer units. +/// @param st structure to annotate +/// @param force if true, reassign subchains even if already assigned +/// @param fail_if_unknown if true, raise error if polymer type is unknown; if false, skip GEMMI_DLL void assign_subchains(Structure& st, bool force, bool fail_if_unknown=true); +/// Create missing Entity records for all subchains in the structure. +/// Ensures each Residue::subchain has a corresponding Entity. +/// @param st structure to update GEMMI_DLL void ensure_entities(Structure& st); +/// Merge Entity records with identical sequences into one. +/// Consolidates duplicate sequence entries and updates residue entity_ids. +/// @param st structure to deduplicate GEMMI_DLL void deduplicate_entities(Structure& st); +/// Set up entity metadata for a structure in a standard workflow. +/// Convenience function that calls add_entity_types + assign_subchains + +/// ensure_entities + deduplicate_entities in sequence. +/// @param st structure to set up inline void setup_entities(Structure& st) { add_entity_types(st, /*overwrite=*/false); assign_subchains(st, /*force=*/false); @@ -144,9 +221,16 @@ inline void setup_entities(Structure& st) { deduplicate_entities(st); } -/// Determine ATOM/HETATM record type, based on Residue::entity_type +/// Determine ATOM/HETATM record type based on residue entity type. +/// Returns 'A' (ATOM) for polymers or 'H' (HETATM) for non-polymers and waters. +/// @param res residue to classify +/// @return 'A' or 'H' GEMMI_DLL char recommended_het_flag(const Residue& res); -/// R = recommended_het_flag(), other valid values are A, H and '\0' + +/// Assign het_flag to all residues in an object. +/// @tparam T Model, Chain, or Residue +/// @param obj object to modify +/// @param flag het_flag value: 'A' (ATOM), 'H' (HETATM), 'R' (recommended), or ' ' (none) template void assign_het_flags(T& obj, char flag='R') { for (auto& child : obj.children()) assign_het_flags(child, flag); @@ -158,7 +242,10 @@ template<> inline void assign_het_flags(Residue& res, char flag) { res.het_flag = flag == 'R' ? recommended_het_flag(res) : flag; } -// Remove waters. It may leave empty chains. +/// Remove water residues from a structure or chain. +/// May leave empty chains if all residues are waters. +/// @tparam T Model, Chain, or Residue +/// @param obj object to modify template void remove_waters(T& obj) { for (auto& child : obj.children()) remove_waters(child); @@ -168,7 +255,11 @@ template<> inline void remove_waters(Chain& ch) { [](const Residue& res) { return res.is_water(); }); } -// Remove ligands and waters. It may leave empty chains. +/// Remove all non-polymer residues (ligands and waters). +/// Requires entity_type to be assigned first (see add_entity_types). +/// May leave empty chains if all residues are removed. +/// @tparam T Model, Chain, or Residue +/// @param obj object to modify template void remove_ligands_and_waters(T& obj) { for (auto& child : obj.children()) remove_ligands_and_waters(child); @@ -181,22 +272,38 @@ template<> inline void remove_ligands_and_waters(Chain& ch) { }); } -// Trim to alanine. Returns true if trimmed, false if it's (likely) not AA. +/// Reduce an amino acid residue to an alanine backbone. +/// Removes all side-chain atoms, keeping only N, CA, C, O and CB. +/// @param res residue to trim +/// @return true if trimmed successfully, false if residue is not a standard amino acid GEMMI_DLL bool trim_to_alanine(Residue& res); +/// Trim all amino acid residues in a chain to alanine. +/// @param chain chain to modify inline void trim_to_alanine(Chain& chain) { for (Residue& res : chain.residues) trim_to_alanine(res); } -// Functions for switching between long (>3 chars) residue names (CCD codes) -// and shortened ones that are compatible with the PDB format. +/// Convert long CCD codes to 3-letter abbreviations for PDB format compatibility. +/// Refmac does similar shortening. Enables Refmac refinement with long residue names. +/// @param st structure to modify GEMMI_DLL void shorten_ccd_codes(Structure& st); + +/// Restore full CCD codes from their 3-letter abbreviations. +/// Inverse of shorten_ccd_codes(); uses stored abbreviation mappings. +/// @param st structure to modify GEMMI_DLL void restore_full_ccd_codes(Structure& st); -/// Modifies Entity::full_sequence. Uses only the first chain for each Entity. +/// Annotate microheterogeneity positions in entity sequences. +/// Identifies residue positions with multiple alternate conformations and +/// marks them in Entity::full_sequence. Uses only the first chain for each entity. +/// @param st structure to annotate +/// @param overwrite if true, overwrite existing microheterogeneity annotations GEMMI_DLL void add_microhetero_to_sequences(Structure& st, bool overwrite=false); +/// Assign sequential integer IDs to all TLS groups in the structure. +/// @param st structure to annotate GEMMI_DLL void add_tls_group_ids(Structure& st); } // namespace gemmi From c4fab63f751ac3acd680ebfb9b152d36ace8538d Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:06:53 +0200 Subject: [PATCH 14/19] docs(sfcalc.hpp): add full Doxygen API documentation --- include/gemmi/sfcalc.hpp | 82 +++++++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/include/gemmi/sfcalc.hpp b/include/gemmi/sfcalc.hpp index bab426ca5..22bfd7156 100644 --- a/include/gemmi/sfcalc.hpp +++ b/include/gemmi/sfcalc.hpp @@ -17,26 +17,42 @@ namespace gemmi { -// calculate part of the structure factor: exp(2 pi i r * s) +/// @brief Compute exp(2πi r·s) for one atom at fractional position for a reflection. +/// @param fpos Fractional coordinates of the atom. +/// @param hkl Miller indices of the reflection. +/// @return Complex phase factor. inline std::complex calculate_sf_part(const Fractional& fpos, const Miller& hkl) { double arg = 2 * pi() * (hkl[0]*fpos.x + hkl[1]*fpos.y + hkl[2]*fpos.z); return std::complex{std::cos(arg), std::sin(arg)}; } +/// @brief Calculates structure factors by direct summation over all atoms. +/// @tparam Table Scattering factor table type (e.g., IT92, WK95, ElectronTable). +/// Must have a nested Coef type with coef_type member. template class StructureFactorCalculator { public: using coef_type = typename Table::Coef::coef_type; + /// @brief Initialize with the unit cell. + /// @param cell Unit cell; a reference is stored for use in subsequent calculations. StructureFactorCalculator(const UnitCell& cell) : cell_(cell) {} + /// @brief Cache (sin θ/λ)² for hkl and clear the scattering factor cache. + /// Must be called before computing structure factors for a new reflection. + /// @param hkl Miller indices of the reflection. void set_stol2_and_scattering_factors(const Miller& hkl) { stol2_ = (coef_type) cell_.calculate_stol_sq(hkl); scattering_factors_.clear(); scattering_factors_.resize(addends.size(), 0.); } + /// @brief Return scattering factor for element at current (sin θ/λ)². + /// Lazily computes and caches the result. + /// @param element The chemical element. + /// @param charge Formal charge (signed char; 0 = neutral atom). + /// @return Scattering factor value. double get_scattering_factor(Element element, signed char charge) { double& sfactor = scattering_factors_[element.ordinal()]; if (sfactor == 0.) { @@ -47,27 +63,48 @@ class StructureFactorCalculator { return sfactor; } - // Calculation of Debye-Waller factor with isotropic ADPs + /// @brief Isotropic Debye-Waller factor exp(-B·stol²) for a small-molecule site. + /// @param site Small-molecule site with u_iso field. + /// @return Debye-Waller factor value. double dwf_iso(const SmallStructure::Site& site) const { return std::exp(-u_to_b() * stol2_ * site.u_iso); } + + /// @brief Isotropic Debye-Waller factor exp(-B·stol²) for a macromolecular atom. + /// @param atom Macromolecular atom with b_iso field. + /// @return Debye-Waller factor value. double dwf_iso(const Atom& atom) const { return std::exp(-stol2_ * atom.b_iso); } - // Calculation of Debye-Waller factor exp(-2 pi^2 s.U.s) - // cf. B. Rupp's book, p. 641 or RWGK & Adams 2002, J. Appl. Cryst. 35, 477 - // Small molecule and macromolecular anisotropic U's are defined differently, - // so we have two functions. + /// @brief Anisotropic Debye-Waller factor exp(-2π²·s·U·s) for a small-molecule site. + /// Uses site.aniso, where the anisotropic U tensor is in crystallographic coordinates. + /// @param site Small-molecule site with aniso field. + /// @param hkl Miller indices of the reflection. + /// @return Debye-Waller factor value. double dwf_aniso(const SmallStructure::Site& site, const Vec3& hkl) const { Vec3 arh(cell_.ar * hkl.x, cell_.br * hkl.y, cell_.cr * hkl.z); return std::exp(-2 * pi() * pi() * site.aniso.r_u_r(arh)); } + + /// @brief Anisotropic Debye-Waller factor exp(-2π²·s·U·s) for a macromolecular atom. + /// Uses atom.aniso in orthogonal coordinates. + /// @param atom Macromolecular atom with aniso field. + /// @param hkl Miller indices of the reflection. + /// @return Debye-Waller factor value. double dwf_aniso(const Atom& atom, const Vec3& hkl) const { return std::exp(-2 * pi() * pi() * atom.aniso.transformed_by<>(cell_.frac.mat).r_u_r(hkl)); } + /// @brief Contribution of one atom to structure factor, given its scattering factor. + /// Accounts for Debye-Waller factor, occupancy, and phase. + /// @tparam Site SmallStructure::Site or Atom. + /// @param fract Fractional coordinates of the atom. + /// @param site Site/atom record containing occupancy and aniso data. + /// @param hkl Miller indices of the reflection. + /// @param sf Precomputed scattering factor for this atom. + /// @return Complex structure factor contribution. template std::complex calculate_sf_from_atom_sf(const Fractional& fract, const Site& site, @@ -88,6 +125,13 @@ class StructureFactorCalculator { return oc_sf * sum; } + /// @brief Contribution of one atom to structure factor, with automatic scattering factor lookup. + /// Accounts for Debye-Waller factor, occupancy, and phase. + /// @tparam Site SmallStructure::Site or Atom. + /// @param fract Fractional coordinates of the atom. + /// @param site Site/atom record containing element and charge information. + /// @param hkl Miller indices of the reflection. + /// @return Complex structure factor contribution. template std::complex calculate_sf_from_atom(const Fractional& fract, const Site& site, @@ -96,6 +140,10 @@ class StructureFactorCalculator { return calculate_sf_from_atom_sf(fract, site, hkl, atom_sf); } + /// @brief Sum contributions from all atoms in a Model for the given reflection. + /// @param model The macromolecular model. + /// @param hkl Miller indices of the reflection. + /// @return Total structure factor. std::complex calculate_sf_from_model(const Model& model, const Miller& hkl) { std::complex sf = 0.; set_stol2_and_scattering_factors(hkl); @@ -106,7 +154,12 @@ class StructureFactorCalculator { return sf; } - // Z part of Mott-Bethe formula (when need to use different model) + /// @brief Compute Z (atomic number sum) component for Mott-Bethe conversion. + /// Used when a different model is needed for the Z calculation. + /// @param model The macromolecular model. + /// @param hkl Miller indices of the reflection. + /// @param only_h If true, restrict calculation to hydrogen atoms. + /// @return Z component of structure factor. std::complex calculate_mb_z(const Model& model, const Miller& hkl, bool only_h) { std::complex sf = 0.; stol2_ = (coef_type) cell_.calculate_stol_sq(hkl); @@ -119,12 +172,19 @@ class StructureFactorCalculator { return sf; } + /// @brief Return the Mott-Bethe prefactor at current (sin θ/λ)². + /// Used to convert X-ray structure factors to electron structure factors. + /// @return Mott-Bethe factor: -mott_bethe_const() / (4·stol²). double mott_bethe_factor() const { return -mott_bethe_const() / 4 / stol2_; } - // The occupancy is assumed to take into account symmetry, - // i.e. to be fractional if the atom is on special position. + /// @brief Sum contributions from all sites in a SmallStructure for the given reflection. + /// The occupancy is assumed to account for symmetry (i.e., fractional if the atom + /// is on a special position). + /// @param small_st The small-molecule structure. + /// @param hkl Miller indices of the reflection. + /// @return Total structure factor. std::complex calculate_sf_from_small_structure(const SmallStructure& small_st, const Miller& hkl) { std::complex sf = 0.; @@ -139,7 +199,9 @@ class StructureFactorCalculator { coef_type stol2_; std::vector scattering_factors_; public: - Addends addends; // usually f' for X-rays + /// Addends object for anomalous scattering corrections (usually f' for X-rays). + /// Public and mutable by the caller. + Addends addends; }; } // namespace gemmi From 3eac55d4e91cc2be291b6b5f9711d7899c42fac9 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:07:01 +0200 Subject: [PATCH 15/19] docs(ecalc.hpp): add full Doxygen API documentation --- include/gemmi/ecalc.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/gemmi/ecalc.hpp b/include/gemmi/ecalc.hpp index 9da667d98..812c55d9c 100644 --- a/include/gemmi/ecalc.hpp +++ b/include/gemmi/ecalc.hpp @@ -10,6 +10,16 @@ namespace gemmi { +/// @brief Compute per-reflection amplitude normalization factors for E-scale conversion. +/// Uses the Karle approach: E = F / sqrt(Σ f²), with resolution-bin-based averaging and smoothing. +/// @tparam DataProxy Type satisfying the data proxy interface: must provide size(), stride(), +/// spacegroup(), unit_cell(), get_hkl(n), and get_num(n) methods. +/// @param data Data proxy (e.g., MtzDataProxy or ReflDataProxy). +/// @param fcol_idx Column index of F amplitudes in the proxy. +/// @param binner Binner object defining resolution shells. +/// @return Vector of multipliers (one per reflection); NaN for missing values. +/// Algorithm: collects F² in bins, applies [0.75, 1, 0.75] smoothing kernel, +/// returns 1/sqrt() per reflection for normalization. template std::vector calculate_amplitude_normalizers(const DataProxy& data, int fcol_idx, const Binner& binner) { From 9c193f5b472b62bd7af806777dc186bc2fe0a547 Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:08:12 +0200 Subject: [PATCH 16/19] docs(scaling.hpp): add full Doxygen API documentation --- include/gemmi/scaling.hpp | 134 ++++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 34 deletions(-) diff --git a/include/gemmi/scaling.hpp b/include/gemmi/scaling.hpp index 630774c0b..84f2b74d5 100644 --- a/include/gemmi/scaling.hpp +++ b/include/gemmi/scaling.hpp @@ -13,17 +13,26 @@ namespace gemmi { +/// @brief Type alias for a symmetric 3×3 tensor represented as 6 coefficients. +/// Stores (u11, u22, u33, u12, u13, u23). using Vec6 = std::array; +/// @brief Dot product of Vec6 with a symmetric 3×3 matrix. +/// Used for tensor contractions in ADP refinement. +/// @param a Vec6 vector (6-element array). +/// @param s Symmetric 3×3 matrix. +/// @return Dot product result. inline double vec6_dot(const Vec6& a, const SMat33& s) { return a[0] * s.u11 + a[1] * s.u22 + a[2] * s.u33 + a[3] * s.u12 + a[4] * s.u13 + a[5] * s.u23; } -/// Symmetry constraints of ADP. -/// The number of rows is the number of independent coefficients in U. -/// For example, for tetragonal crystal returns two normalized Vec6 vectors -/// in directions [1 1 0 0 0 0] and [0 0 1 0 0 0]. +/// @brief Return the symmetry-adapted constraint vectors for the ADP tensor in a space group. +/// The number and content of returned constraint vectors depend on the crystal system. +/// For example, cubic returns one vector [1 1 1 0 0 0] (isotropic); tetragonal returns +/// two vectors in directions [1 1 0 0 0 0] and [0 0 1 0 0 0] (diagonal u11=u22, u33 free). +/// @param sg Pointer to the space group; if null, triclinic symmetry is assumed (6 free components). +/// @return Vector of normalized Vec6 constraint vectors for independent ADP components. inline std::vector adp_symmetry_constraints(const SpaceGroup* sg) { auto constraints = [](const std::initializer_list& l) { constexpr double K2 = 0.70710678118654752440; // sqrt(1/2) @@ -68,46 +77,58 @@ inline std::vector adp_symmetry_constraints(const SpaceGroup* sg) { unreachable(); } +/// @brief Anisotropic scaling of calculated structure factors to observed data. +/// Optionally includes bulk solvent correction: Fc + k_sol·exp(-b_sol·stol²)·Fmask. +/// Parameter refinement uses Levenberg-Marquardt (or NLopt if WITH_NLOPT is defined). +/// @tparam Real Floating-point type (float or double). template struct Scaling { + /// @brief One reflection used in the least-squares fit. struct Point { - Miller hkl; - double stol2; - std::complex fcmol, fmask; - Real fobs, sigma; + Miller hkl; ///< Miller indices. + double stol2; ///< (sin θ/λ)² for this reflection. + std::complex fcmol, fmask; ///< Calculated molecular and mask structure factors. + Real fobs, sigma; ///< Observed amplitude and its standard uncertainty. Miller get_x() const { return hkl; } double get_y() const { return fobs; } double get_weight() const { return 1.0 /* / sigma*/; } }; - UnitCell cell; - // model parameters - double k_overall = 1.; - // b_star = F B_cart F^T, where F - fractionalization matrix - SMat33 b_star{0, 0, 0, 0, 0, 0}; - std::vector constraint_matrix; - bool use_solvent = false; - bool fix_k_sol = false; - bool fix_b_sol = false; - // initialize with average values (Fokine & Urzhumtsev, 2002) - double k_sol = 0.35; - double b_sol = 46.0; - std::vector points; - + UnitCell cell; ///< Unit cell parameters. + double k_overall = 1.; ///< Overall scale factor. + SMat33 b_star{0, 0, 0, 0, 0, 0}; ///< Anisotropic B* tensor in reciprocal space. + std::vector constraint_matrix; ///< Symmetry constraints on b_star. + bool use_solvent = false; ///< If true, include bulk solvent correction. + bool fix_k_sol = false; ///< If true, do not refine k_sol. + bool fix_b_sol = false; ///< If true, do not refine b_sol. + double k_sol = 0.35; ///< Bulk solvent scale factor. + double b_sol = 46.0; ///< Bulk solvent B-factor. + std::vector points; ///< Reflection data for fitting. + + /// @brief Initialize with unit cell and space group. + /// Sets up constraint_matrix from crystal system symmetry. + /// @param cell_ Unit cell parameters. + /// @param sg Pointer to space group (for symmetry constraints). Scaling(const UnitCell& cell_, const SpaceGroup* sg) : cell(cell_), constraint_matrix(adp_symmetry_constraints(sg)) {} - // B_{overall} is stored as B* not B_{cartesian}. - // Use getter and setter to convert from/to B_{cartesian}. + /// @brief Set b_star from a real-space B matrix. + /// Converts from Cartesian coordinates to reciprocal-space B*. + /// @param b_overall B matrix in real (Cartesian) space. void set_b_overall(const SMat33& b_overall) { b_star = b_overall.transformed_by(cell.frac.mat); } + + /// @brief Get b_star converted back to real (Cartesian) space. + /// @return B matrix in Cartesian coordinates. SMat33 get_b_overall() const { return b_star.transformed_by(cell.orth.mat); } - // Scale data, optionally adding bulk solvent correction. + /// @brief Apply scaling to all reflections in-place, optionally adding bulk solvent correction. + /// @param asu_data ASU data to scale (modified in-place). + /// @param mask_data Mask (solvent) data; required if use_solvent is true, otherwise may be null. void scale_data(AsuData>& asu_data, const AsuData>* mask_data) const { if (use_solvent && !(mask_data && mask_data->size() == asu_data.size())) @@ -126,6 +147,12 @@ struct Scaling { } } + /// @brief Compute scaled value for one reflection. + /// Applies overall scale factor and optionally bulk solvent correction. + /// @param hkl Miller indices. + /// @param f_value Calculated molecular structure factor. + /// @param mask_value Mask (solvent) structure factor. + /// @return Scaled complex structure factor. std::complex scale_value(const Miller& hkl, std::complex f_value, std::complex mask_value) { if (use_solvent) { @@ -135,6 +162,9 @@ struct Scaling { return f_value * (Real) get_overall_scale_factor(hkl); } + /// @brief Return current parameters as a flat vector. + /// Includes k_overall, b_star components (via constraint_matrix), and optionally k_sol and b_sol. + /// @return Vector of parameters in order: k_overall, [k_sol], [b_sol], b_star_components. std::vector get_parameters() const { std::vector ret; ret.push_back(k_overall); @@ -149,7 +179,9 @@ struct Scaling { return ret; } - /// set k_overall, k_sol, b_sol, b_star + /// @brief Set parameters from pointer or vector. + /// Updates k_overall, b_star components (via constraint_matrix), and optionally k_sol and b_sol. + /// @param p Pointer to parameter array (order: k_overall, [k_sol], [b_sol], b_star_components). void set_parameters(const double* p) { k_overall = p[0]; int n = 0; @@ -171,11 +203,17 @@ struct Scaling { } } + /// @brief Set parameters from vector. + /// @param p Vector of parameters. void set_parameters(const std::vector& p) { set_parameters(p.data()); } - // pre: all AsuData args are sorted + /// @brief Populate points from matching reflections in calc and obs datasets. + /// Precondition: all AsuData arguments must be sorted by Miller indices. + /// @param calc Calculated structure factors. + /// @param obs Observed amplitudes and sigmas. + /// @param mask_data Mask structure factors; required if use_solvent is true. void prepare_points(const AsuData>& calc, const AsuData>& obs, const AsuData>* mask_data) { @@ -210,21 +248,32 @@ struct Scaling { } + /// @brief Compute k_sol * exp(-b_sol * stol²) for bulk solvent correction. + /// @param stol2 (sin θ/λ)² value. + /// @return Solvent scale factor. double get_solvent_scale(double stol2) const { return k_sol * std::exp(-b_sol * stol2); } + /// @brief Compute k_overall * exp(-b_star:hkl) for overall scale factor. + /// @param hkl Miller indices. + /// @return Overall scale factor including anisotropic B*. double get_overall_scale_factor(const Miller& hkl) const { return k_overall * std::exp(-0.25 * b_star.r_u_r(hkl)); } + /// @brief Compute total calculated structure factor for point p. + /// Includes molecular part and optionally bulk solvent correction. + /// @param p Reflection point. + /// @return Total Fcalc = Fcmol + k_sol·exp(-b_sol·stol²)·Fmask (or just Fcmol if no solvent). std::complex get_fcalc(const Point& p) const { if (!use_solvent) return p.fcmol; return p.fcmol + (Real)get_solvent_scale(p.stol2) * p.fmask; } - // quick linear fit (ignoring sigma) to get initial k_overall and isotropic B + /// @brief Quick linear least-squares fit of isotropic B only (ignoring sigma). + /// Used for initialization before full refinement. void fit_isotropic_b_approximately() { double sx = 0, sy = 0, sxx = 0, sxy = 0; int n = 0; @@ -249,7 +298,8 @@ struct Scaling { set_b_overall({b_iso, b_iso, b_iso, 0, 0, 0}); } - // least-squares fitting of k_overall only + /// @brief Compute optimal k_overall by linear least squares. + /// @return Optimal scale factor. double lsq_k_overall() const { double sxx = 0, sxy = 0; for (const Point& p : points) { @@ -263,10 +313,9 @@ struct Scaling { return sxx != 0. ? sxy / sxx : 1.; } - // For testing only, don't use it. - // Estimates anisotropic b_star using other parameters (incl. isotropic B), - // following P. Afonine et al, doi:10.1107/S0907444913000462 sec. 2.1. - // The symmetry constraints are not implemented - don't use it! + /// @brief Fit anisotropic B* by linear approximation (for testing/initialization only). + /// DO NOT USE for production: symmetry constraints are not implemented. + /// Based on P. Afonine et al., doi:10.1107/S0907444913000462, section 2.1. void fit_b_star_approximately() { double b_iso = 1/3. * get_b_overall().trace(); //size_t nc = constraint_matrix.size(); @@ -297,11 +346,15 @@ struct Scaling { //printf("fitted B = {%g %g %g %g %g %g}\n", e[0], e[1], e[2], e[3], e[4], e[5]); } + /// @brief Full Levenberg-Marquardt optimization of all parameters. + /// @return Final R factor. double fit_parameters() { LevMar levmar; return levmar.fit(*this); } + /// @brief Compute R-factor: Σ|Fobs - Fcalc| / Σ|Fobs|. + /// @return R-factor value. double calculate_r_factor() const { double abs_diff_sum = 0; double denom = 0; @@ -312,11 +365,17 @@ struct Scaling { return abs_diff_sum / denom; } - // interface for fitting + /// @brief Compute Fcalc for point p (interface for fitting). + /// @param p Reflection point. + /// @return Scaled calculated structure factor amplitude. double compute_value(const Point& p) const { return std::abs(get_fcalc(p)) * (Real) get_overall_scale_factor(p.hkl); } + /// @brief Compute Fcalc and its derivatives with respect to all parameters. + /// @param p Reflection point. + /// @param dy_da Output vector to fill with derivatives [dy/dk_overall, dy/dk_sol, dy/db_sol, dy/db_star_components]. + /// @return Calculated structure factor amplitude. double compute_value_and_derivatives(const Point& p, std::vector& dy_da) const { Vec3 h(p.hkl); double kaniso = std::exp(-0.25 * b_star.r_u_r(h)); @@ -353,6 +412,13 @@ struct Scaling { } }; +/// @brief Fit scaling parameters using NLopt with the named optimizer. +/// Only available when compiled with WITH_NLOPT. +/// @tparam Real Floating-point type (float or double). +/// @param scaling The Scaling object to optimize. +/// @param optimizer Name of the optimizer algorithm (NLopt algorithm string). +/// @return Final residual value (WSSR). +/// @note This function is for testing and evaluation purposes. // only for testing and evaluation - scaling with NLOpt #if WITH_NLOPT namespace impl { From fc6cd20d918bc9128a8c477955d485664de3247b Mon Sep 17 00:00:00 2001 From: "C. Vonrhein" Date: Wed, 22 Apr 2026 23:11:28 +0200 Subject: [PATCH 17/19] docs(dssp.hpp): add full Doxygen API documentation Add comprehensive Doxygen documentation comments to all public items in the DSSP secondary structure assignment module: - All enum types and values (SecondaryStructure, TurnType, HelixPosition, BridgeType, HydrogenMode, HBondDefinition) - All public structs and their data members (Bridge, HBond, SecondaryStructureInfo, DsspOptions, DsspCalculator) - All public methods with @param, @return, and @brief tags - The calculate_dssp convenience function - Used @tparam and other tags as appropriate for clarity Co-Authored-By: Claude Opus 4.7 (1M context) --- include/gemmi/dssp.hpp | 190 ++++++++++++++++++++++++++--------------- 1 file changed, 122 insertions(+), 68 deletions(-) diff --git a/include/gemmi/dssp.hpp b/include/gemmi/dssp.hpp index ddcaef76e..7af6ea651 100644 --- a/include/gemmi/dssp.hpp +++ b/include/gemmi/dssp.hpp @@ -12,89 +12,101 @@ namespace gemmi { -// Secondary structure types as defined in DSSP +/// @brief Secondary structure type codes as defined in DSSP. enum class SecondaryStructure : char { - Loop = '~', // Loop/coil - Break = '=', // Break - Bend = 'S', // Bend - Turn = 'T', // Turn - Helix_PP = 'P', // Polyproline helix - Helix_5 = 'I', // Pi helix (5-turn) - Helix_3 = 'G', // 3-10 helix - Strand = 'E', // Extended strand - Bridge = 'B', // Beta bridge - Helix_4 = 'H' // Alpha helix (4-turn) + Loop = '~', /// Coil/loop (unassigned secondary structure) + Break = '=', /// Chain break + Bend = 'S', /// Bend (local geometry criterion) + Turn = 'T', /// Hydrogen-bonded turn + Helix_PP = 'P', /// Polyproline II helix + Helix_5 = 'I', /// π-helix (5-turn) + Helix_3 = 'G', /// 3₁₀-helix (3-turn) + Strand = 'E', /// Extended β-strand + Bridge = 'B', /// Isolated β-bridge + Helix_4 = 'H' /// α-helix (4-turn) }; -// Turn types +/// @brief Hydrogen bond turn types, indexed by residue separation (i to i+n). enum class TurnType { - Turn_3 = 3, - Turn_4 = 4, - Turn_5 = 5, - Turn_PP = 6 + Turn_3 = 3, /// 3-residue turn (i to i+3 hydrogen bond) + Turn_4 = 4, /// 4-residue turn (i to i+4 hydrogen bond) + Turn_5 = 5, /// 5-residue turn (i to i+5 hydrogen bond) + Turn_PP = 6 /// Polyproline II type turn (i to i+6) }; -// Helix positions +/// @brief Position of a residue within a helix. enum class HelixPosition { - None = 0, - Start, - Middle, - End, - StartAndEnd + None = 0, /// Not part of a helix + Start, /// First residue of a helix + Middle, /// Interior residue of a helix + End, /// Last residue of a helix + StartAndEnd /// Single-residue helix (acts as both start and end) }; -// Bridge types +/// @brief Type of β-bridge partnership between residues. enum class BridgeType { - None = 0, - Parallel, - AntiParallel + None = 0, /// No bridge + Parallel, /// Parallel β-bridge + AntiParallel /// Antiparallel β-bridge }; +/// @brief A simple β-bridge record between two residues. struct Bridge { - size_t partner1; - size_t partner2; - BridgeType type; + size_t partner1; /// Chain index of the first bridge residue + size_t partner2; /// Chain index of the second bridge residue + BridgeType type; /// Type of bridge (Parallel or AntiParallel) }; -// Hydrogen bond modes +/// @brief Source of hydrogen atoms for hydrogen bond detection. enum class HydrogenMode { - Existing = 0, // Use existing hydrogen atoms from structure - Calculate // Calculate hydrogen positions (original DSSP method) + Existing = 0, /// Use hydrogen atoms already present in the structure + Calculate /// Compute idealized hydrogen positions (original DSSP method, for structures lacking explicit H) }; -// Hydrogen bond definition +/// @brief Criterion for hydrogen bond acceptance. enum class HBondDefinition { - Energy = 0, // Energy-based (original DSSP) - Geometry // Geometry-based (distance + angle) + Energy = 0, /// Use Kabsch-Sander energy criterion (E < cutoff in kcal/mol) + Geometry /// Use geometric criteria (distance and angle thresholds) }; +/// @brief Record of one hydrogen bond detected by DSSP. struct GEMMI_DLL HBond { - Topo::ResInfo *donor = nullptr, *acceptor = nullptr; - char alt1 = '\0'; - char alt2 = '\0'; - double energy = 0; - bool is_valid = false; + Topo::ResInfo *donor = nullptr; /// Pointer to the donor residue (NH group) + Topo::ResInfo *acceptor = nullptr; /// Pointer to the acceptor residue (C=O group) + char alt1 = '\0'; /// Alternate conformer of the donor atom + char alt2 = '\0'; /// Alternate conformer of the acceptor atom + double energy = 0; /// Hydrogen bond energy in kcal/mol (for Energy mode); NaN if not applicable + bool is_valid = false; /// True if this bond meets the cutoff criterion }; -// Per-residue secondary structure information +/// @brief Per-residue secondary structure annotation and structural features. struct GEMMI_DLL SecondaryStructureInfo { - SecondaryStructure ss_type = SecondaryStructure::Loop; - std::vector parallel_bridges; - std::vector antiparallel_bridges; - std::vector break_partners; + SecondaryStructure ss_type = SecondaryStructure::Loop; /// Assigned secondary structure code + std::vector parallel_bridges; /// Chain indices of residues forming parallel β-bridges with this residue + std::vector antiparallel_bridges; /// Chain indices of residues forming antiparallel β-bridges with this residue + std::vector break_partners; /// Pointers to SecondaryStructureInfo of chain-break partners std::array helix_positions = {HelixPosition::None, HelixPosition::None, - HelixPosition::None, HelixPosition::None}; - bool has_break = false; - bool nturn_acceptor = false; + HelixPosition::None, HelixPosition::None}; /// Helix position for each turn type (indexed by TurnType-3) + bool has_break = false; /// True if there is a chain break before this residue + bool nturn_acceptor = false; /// True if this residue is the acceptor of an n-turn hydrogen bond + /// @brief Set the helix position for a given turn type. + /// @param turn TurnType value (Turn_3, Turn_4, Turn_5, or Turn_PP) + /// @param pos HelixPosition value void set_helix_position(TurnType turn, HelixPosition pos) { helix_positions[static_cast(turn) - 3] = pos; } + /// @brief Retrieve the helix position for a given turn type. + /// @param turn TurnType value + /// @return HelixPosition at the specified turn HelixPosition get_helix_position(TurnType turn) const { return helix_positions[static_cast(turn) - 3]; } + /// @brief Record a β-bridge partnership with another residue. + /// @param partner_idx Chain index of the bridge partner + /// @param type BridgeType (Parallel or AntiParallel) void add_bridge(size_t partner_idx, BridgeType type) { if (type == BridgeType::Parallel) { parallel_bridges.push_back(partner_idx); @@ -103,43 +115,65 @@ struct GEMMI_DLL SecondaryStructureInfo { } } + /// @brief Check if this residue has any bridges of the given type. + /// @param type BridgeType to check + /// @return True if this residue has at least one bridge of the given type bool has_bridges(BridgeType type) const { return type == BridgeType::Parallel ? !parallel_bridges.empty() : !antiparallel_bridges.empty(); } }; -// DSSP options/parameters +/// @brief Configuration parameters for the DSSP secondary structure algorithm. struct GEMMI_DLL DsspOptions { - HydrogenMode hydrogen_mode = HydrogenMode::Calculate; - HBondDefinition hbond_definition = HBondDefinition::Energy; - double cutoff = 0.9; // nm - bool pi_helix_preference = true; - bool search_polyproline = true; - bool shortened_pp_stretch = false; - double hbond_energy_cutoff = -0.5; // kcal/mol - double min_ca_distance = 9.0; // Angstrom - double bend_angle_min = 70.0; // degrees - double max_peptide_bond_distance = 2.5; // Angstrom + HydrogenMode hydrogen_mode = HydrogenMode::Calculate; /// How to source hydrogen atoms (Existing or Calculate) + HBondDefinition hbond_definition = HBondDefinition::Energy; /// Criterion for hydrogen bond acceptance (Energy or Geometry) + double cutoff = 0.9; /// Distance cutoff in nm for hydrogen bond search (default 0.9) + bool pi_helix_preference = true; /// If true, prefer π-helix over α-helix at ambiguous positions + bool search_polyproline = true; /// If true, detect polyproline II helices + bool shortened_pp_stretch = false; /// If true, allow polyproline stretches shorter than canonical + double hbond_energy_cutoff = -0.5; /// Energy threshold in kcal/mol for hydrogen bond acceptance (default -0.5) + double min_ca_distance = 9.0; /// Minimum Cα–Cα distance in Angstrom to consider residues potentially bonded (default 9.0) + double bend_angle_min = 70.0; /// Minimum bend angle in degrees to assign Bend secondary structure (default 70.0) + double max_peptide_bond_distance = 2.5; /// Maximum C–N distance in Angstrom considered a valid peptide bond (default 2.5) }; -// Main DSSP calculator class +/// @brief Main DSSP engine that performs secondary structure assignment for a protein chain. +/// Detects and annotates secondary structure elements (helices, strands, turns, etc.) +/// based on hydrogen bond patterns and geometry. struct GEMMI_DLL DsspCalculator { + /// @brief Initialize the calculator with options. + /// @param opts DsspOptions configuration (default: standard DSSP parameters) explicit DsspCalculator(const DsspOptions& opts = DsspOptions{}) : options(opts) {} - // Calculate secondary structure for a chain + /// @brief Run the full DSSP secondary structure assignment for a chain. + /// Requires a populated NeighborSearch for spatial lookups. + /// @param ns NeighborSearch object with the protein atoms indexed + /// @param chain_info Topo::ChainInfo containing residue topology + /// @return Secondary structure string, one character per residue std::string calculate_secondary_structure(NeighborSearch& ns, Topo::ChainInfo& chain_info); - // Get detailed secondary structure information + /// @brief Access the per-residue secondary structure information. + /// @return Const reference to the vector of SecondaryStructureInfo const std::vector& get_detailed_info() const { return ss_info; } - DsspOptions options; - std::vector ss_info; - std::vector bridges_; + DsspOptions options; /// Configuration options for this calculation + std::vector ss_info; /// Per-residue secondary structure and structural features (populated after calculate_secondary_structure) + std::vector bridges_; /// List of all detected β-bridges - // Calculate hydrogen bonds + /// @brief Detect and record all hydrogen bonds in the chain. + /// @param ns NeighborSearch object with atoms indexed + /// @param chain_info Topo::ChainInfo containing residue topology void calculate_hydrogen_bonds(NeighborSearch& ns, Topo::ChainInfo& chain_info); + + /// @brief Compute and store Kabsch-Sander hydrogen bond energy between donor and acceptor. + /// @param donor Topo::ResInfo pointer for the NH (donor) residue + /// @param acceptor Topo::ResInfo pointer for the C=O (acceptor) residue void calculate_hbond_energy(Topo::ResInfo* donor, Topo::ResInfo* acceptor); + + /// @brief Evaluate geometric hydrogen bond criterion (distance and angle) between donor and acceptor. + /// @param donor Topo::ResInfo pointer for the NH (donor) residue + /// @param acceptor Topo::ResInfo pointer for the C=O (acceptor) residue void calculate_hbond_geometry(Topo::ResInfo* donor, Topo::ResInfo* acceptor); /* @@ -150,16 +184,36 @@ struct GEMMI_DLL DsspCalculator { void find_polyproline_helices(Topo::ChainInfo& chain_info); */ - // Utility functions + /// @brief Check if a valid hydrogen bond exists from donor to acceptor. + /// @param donor Topo::ResInfo pointer for the donor + /// @param acceptor Topo::ResInfo pointer for the acceptor + /// @return True if a hydrogen bond exists between the residues bool has_hbond_between(Topo::ResInfo* donor, Topo::ResInfo* acceptor) const; + + /// @brief Check for chain breaks between two residue indices. + /// @param chain_info Topo::ChainInfo containing residue topology + /// @param res1_idx Chain index of the first residue + /// @param res2_idx Chain index of the second residue + /// @return True if no chain breaks exist between res1_idx and res2_idx bool no_chain_breaks_between(Topo::ChainInfo& chain_info, size_t res1_idx, size_t res2_idx) const; + + /// @brief Determine the type of β-bridge between two residues. + /// @param chain_info Topo::ChainInfo containing residue topology + /// @param res1_idx Chain index of the first residue + /// @param res2_idx Chain index of the second residue + /// @return BridgeType (Parallel, AntiParallel, or None) BridgeType calculate_bridge_type(Topo::ChainInfo& chain_info, size_t res1_idx, size_t res2_idx) const; // Generate final secondary structure string //std::string generate_ss_string() const; }; -// Convenience function for simple use cases +/// @brief Convenience function to calculate secondary structure with default or custom options. +/// Creates a DsspCalculator with the given options and runs the full secondary structure assignment. +/// @param ns NeighborSearch object with atoms indexed for spatial lookups +/// @param cinfo Topo::ChainInfo containing the chain topology +/// @param opts DsspOptions configuration (default: standard DSSP parameters) +/// @return Secondary structure string, one character per residue GEMMI_DLL std::string calculate_dssp(NeighborSearch& ns, Topo::ChainInfo& cinfo, const DsspOptions& opts = DsspOptions{}); From 558543305335a151f679de158de13b16ecb27982 Mon Sep 17 00:00:00 2001 From: Clemens Vonrhein Date: Wed, 22 Apr 2026 23:18:45 +0200 Subject: [PATCH 18/19] docs(api.rst): add Calculations and Analysis + Structure Factor sections (PR 7) Co-Authored-By: Claude Sonnet 4.6 --- docs/api.rst | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/docs/api.rst b/docs/api.rst index 2386adfca..ee89d7e99 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -29,3 +29,52 @@ Map and Grid Data .. doxygenfile:: grid.hpp :project: gemmi + +Calculations and Analysis +------------------------- + +Geometric calculations, sequence alignment, structure superposition, +neighbour search, contact detection, biological assembly, atom selection, +structure modification, polymer heuristics, and secondary structure assignment. + +.. doxygenfile:: calculate.hpp + :project: gemmi + +.. doxygenfile:: align.hpp + :project: gemmi + +.. doxygenfile:: neighbor.hpp + :project: gemmi + +.. doxygenfile:: contact.hpp + :project: gemmi + +.. doxygenfile:: assembly.hpp + :project: gemmi + +.. doxygenfile:: select.hpp + :project: gemmi + +.. doxygenfile:: modify.hpp + :project: gemmi + +.. doxygenfile:: polyheur.hpp + :project: gemmi + +.. doxygenfile:: dssp.hpp + :project: gemmi + +Structure Factor Calculations +----------------------------- + +Direct structure factor summation, amplitude normalisation (F→E), +and anisotropic scaling with optional bulk-solvent correction. + +.. doxygenfile:: sfcalc.hpp + :project: gemmi + +.. doxygenfile:: ecalc.hpp + :project: gemmi + +.. doxygenfile:: scaling.hpp + :project: gemmi From d69b16cf8a2f6bf9ebcd83f4683759991d3f1a49 Mon Sep 17 00:00:00 2001 From: Clemens Vonrhein Date: Thu, 23 Apr 2026 15:09:09 +0200 Subject: [PATCH 19/19] docs: add @par References citations to calculations headers sfcalc.hpp: Bourhis et al. (2015) for structure factor computation context. scaling.hpp: Afonine et al. (2012) for anisotropic scaling; Fokine/Urzhumtsev (2002) for initial bulk-solvent parameters. calculate.hpp: Merritt (2011) for B_est formula. dssp.hpp: Kabsch/Sander (1983) for DSSP algorithm. Co-authored-by: C. Vonrhein / CV-GPhL --- include/gemmi/calculate.hpp | 5 +++-- include/gemmi/dssp.hpp | 8 ++++++-- include/gemmi/scaling.hpp | 10 +++++++++- include/gemmi/sfcalc.hpp | 8 ++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/gemmi/calculate.hpp b/include/gemmi/calculate.hpp index 7569fbaec..7e2a3883d 100644 --- a/include/gemmi/calculate.hpp +++ b/include/gemmi/calculate.hpp @@ -188,10 +188,11 @@ inline Box calculate_fractional_box(const Structure& st, double marg /// @brief Calculate B_equiv from anisotropic B-tensor (or B_iso if isotropic). -/// Based on E. Merritt, "Some B_eq are more equivalent than others", -/// Acta Cryst. A67, 512 (2011). /// @param atom Atom with (possibly anisotropic) B-factor /// @return B_equiv = sqrt((sum_eigenvalues) / (sum_inverse_eigenvalues)) * u_to_b() +/// @par References +/// Merritt, E.A. (2011). Some B_eq are more equivalent than others. +/// Acta Cryst. A67, 512–516. https://doi.org/10.1107/S0108767311034350 inline double calculate_b_est(const Atom& atom) { auto eig = atom.aniso.calculate_eigenvalues(); return u_to_b() * std::sqrt((eig[0] + eig[1] + eig[2]) / diff --git a/include/gemmi/dssp.hpp b/include/gemmi/dssp.hpp index 7af6ea651..b714ccb35 100644 --- a/include/gemmi/dssp.hpp +++ b/include/gemmi/dssp.hpp @@ -139,8 +139,12 @@ struct GEMMI_DLL DsspOptions { }; /// @brief Main DSSP engine that performs secondary structure assignment for a protein chain. -/// Detects and annotates secondary structure elements (helices, strands, turns, etc.) -/// based on hydrogen bond patterns and geometry. +/// @details Detects and annotates secondary structure elements (helices, strands, turns, etc.) +/// based on hydrogen bond patterns and geometry, following the original DSSP algorithm. +/// @par References +/// Kabsch, W. & Sander, C. (1983). Dictionary of protein secondary structure: +/// pattern recognition of hydrogen-bonded and geometrical features. +/// Biopolymers 22, 2577–2637. https://doi.org/10.1002/bip.360221211 struct GEMMI_DLL DsspCalculator { /// @brief Initialize the calculator with options. /// @param opts DsspOptions configuration (default: standard DSSP parameters) diff --git a/include/gemmi/scaling.hpp b/include/gemmi/scaling.hpp index 84f2b74d5..ede7e45ee 100644 --- a/include/gemmi/scaling.hpp +++ b/include/gemmi/scaling.hpp @@ -78,9 +78,17 @@ inline std::vector adp_symmetry_constraints(const SpaceGroup* sg) { } /// @brief Anisotropic scaling of calculated structure factors to observed data. -/// Optionally includes bulk solvent correction: Fc + k_sol·exp(-b_sol·stol²)·Fmask. +/// @details Optionally includes bulk solvent correction: Fc + k_sol·exp(-b_sol·stol²)·Fmask. /// Parameter refinement uses Levenberg-Marquardt (or NLopt if WITH_NLOPT is defined). /// @tparam Real Floating-point type (float or double). +/// @par References +/// Afonine, P.V. et al. (2012). Towards automated crystallographic structure +/// refinement with phenix.refine. Acta Cryst. D68, 352–367. +/// https://doi.org/10.1107/S0907444913000462 +/// +/// Fokine, A. & Urzhumtsev, A. (2002). Flat bulk-solvent model: obtaining +/// optimal parameters. Acta Cryst. A58, 384–392. +/// https://doi.org/10.1107/S0108767302005669 template struct Scaling { /// @brief One reflection used in the least-squares fit. diff --git a/include/gemmi/sfcalc.hpp b/include/gemmi/sfcalc.hpp index 22bfd7156..d3d79d9f9 100644 --- a/include/gemmi/sfcalc.hpp +++ b/include/gemmi/sfcalc.hpp @@ -28,8 +28,16 @@ inline std::complex calculate_sf_part(const Fractional& fpos, } /// @brief Calculates structure factors by direct summation over all atoms. +/// @details Simple direct summation; for optimised FFT-based calculations see +/// dencalc.hpp + fourier.hpp. /// @tparam Table Scattering factor table type (e.g., IT92, WK95, ElectronTable). /// Must have a nested Coef type with coef_type member. +/// @par References +/// Bourhis, L.J., Dolomanov, O.V., Gildea, R.J., Howard, J.A.K. & +/// Puschmann, H. (2015). The anatomy of a comprehensive constrained, +/// restrained refinement program for the modern computing environment — +/// Olex2 dissected. Acta Cryst. A70, 300–311. +/// https://doi.org/10.1107/S2053273314022207 template class StructureFactorCalculator { public: