diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 000000000..19b248015 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,63 @@ +.. _api: + +C++ API Reference +################# + +This page documents the Gemmi C++ library API, generated from Doxygen comments +in ``include/gemmi/*.hpp``. It is updated with each pull request in the +API documentation series. + +For the Python API, see the `Python API reference `_. + +.. note:: + + Documentation coverage is being added incrementally. Headers not yet + listed here will appear in subsequent pull requests. + +Core Data Structures +-------------------- + +*(Full documentation added in PR 2.)* + +.. doxygenfile:: model.hpp + :project: gemmi + +Reflection Data +--------------- + +*(Full documentation added in PR 5.)* + +.. doxygenfile:: mtz.hpp + :project: gemmi + +.. doxygenfile:: refln.hpp + :project: gemmi + +.. doxygenfile:: cif2mtz.hpp + :project: gemmi + +.. doxygenfile:: mtz2cif.hpp + :project: gemmi + +.. doxygenfile:: xds_ascii.hpp + :project: gemmi + +.. doxygenfile:: xds2mtz.hpp + :project: gemmi + +.. doxygenfile:: intensit.hpp + :project: gemmi + +.. doxygenfile:: binner.hpp + :project: gemmi + +.. doxygenfile:: asudata.hpp + :project: gemmi + +Map and Grid Data +----------------- + +*(Stub — full documentation added in PR 6.)* + +.. doxygenfile:: grid.hpp + :project: gemmi diff --git a/include/gemmi/asudata.hpp b/include/gemmi/asudata.hpp index ccb44ee09..aacf67561 100644 --- a/include/gemmi/asudata.hpp +++ b/include/gemmi/asudata.hpp @@ -1,6 +1,7 @@ // Copyright 2020 Global Phasing Ltd. // -// AsuData for storing reflection data. +/// @file +/// @brief AsuData template for storing per-HKL reflection data in asymmetric unit. #ifndef GEMMI_ASUDATA_HPP_ #define GEMMI_ASUDATA_HPP_ @@ -14,13 +15,18 @@ namespace gemmi { +/// @brief Correlation calculation for complex-valued reflection data. +/// Accumulates running statistics for complex numbers (e.g., structure factors). struct ComplexCorrelation { - int n = 0; - double sum_xx = 0.; - double sum_yy = 0.; - std::complex sum_xy = 0.; - std::complex mean_x = 0.; - std::complex mean_y = 0.; + int n = 0; ///< Number of points accumulated + double sum_xx = 0.; ///< Sum of |x|^2 + double sum_yy = 0.; ///< Sum of |y|^2 + std::complex sum_xy = 0.; ///< Sum of (x - mean_x) * conj(y - mean_y) + std::complex mean_x = 0.; ///< Running mean of x + std::complex mean_y = 0.; ///< Running mean of y + /// @brief Add a complex-valued point pair to the correlation. + /// @param x First complex value + /// @param y Second complex value void add_point(std::complex x, std::complex y) { ++n; double inv_n = 1.0 / n; @@ -33,15 +39,27 @@ struct ComplexCorrelation { mean_x += dx * inv_n; mean_y += dy * inv_n; } + /// @brief Add a complex-valued point pair (float version). + /// @param x First complex value + /// @param y Second complex value void add_point(std::complex x, std::complex y) { add_point(std::complex(x), std::complex(y)); } + /// @brief Compute correlation coefficient from accumulated statistics. std::complex coefficient() const { return sum_xy / std::sqrt(sum_xx * sum_yy); } + /// @brief Compute ratio of mean magnitudes. double mean_ratio() const { return std::abs(mean_y) / std::abs(mean_x); } }; -/// \pre a and b are sorted +/// @brief Apply a function to matching reflections from two sorted lists. +/// Iterates through reflections with matching HKLs in both lists and calls func. +/// @pre Vectors a and b are sorted by HKL. +/// @tparam Func Callable type taking (const T&, const T&). +/// @tparam T Reflection data type (must have hkl member and operator<). +/// @param a First reflection list. +/// @param b Second reflection list. +/// @param func Function to call for each matching pair. template void for_matching_reflections(const std::vector& a, const std::vector& b, @@ -61,7 +79,12 @@ void for_matching_reflections(const std::vector& a, } } -/// \pre a and b are sorted +/// @brief Calculate correlation of intensity values between two sorted lists. +/// @pre Vectors a and b are sorted by HKL. +/// @tparam T Reflection data type (must have hkl and value members). +/// @param a First reflection list. +/// @param b Second reflection list. +/// @return Correlation object computed from matching HKLs. template Correlation calculate_hkl_value_correlation(const std::vector& a, const std::vector& b) { @@ -72,7 +95,12 @@ Correlation calculate_hkl_value_correlation(const std::vector& a, return cor; } -/// \pre a and b are sorted +/// @brief Calculate correlation of complex-valued reflection data between two sorted lists. +/// @pre Vectors a and b are sorted by HKL. +/// @tparam T Reflection data type (must have hkl and value members). +/// @param a First reflection list. +/// @param b Second reflection list. +/// @return ComplexCorrelation object computed from matching HKLs. template ComplexCorrelation calculate_hkl_complex_correlation(const std::vector& a, const std::vector& b) { @@ -83,7 +111,12 @@ ComplexCorrelation calculate_hkl_complex_correlation(const std::vector& a, return cor; } -/// \pre a and b are sorted +/// @brief Count matching reflections with identical values in two sorted lists. +/// @pre Vectors a and b are sorted by HKL. +/// @tparam T Reflection data type (must have hkl and value members). +/// @param a First reflection list. +/// @param b Second reflection list. +/// @return Number of matching HKLs with equal values. template int count_equal_values(const std::vector& a, const std::vector& b) { int count = 0; @@ -94,31 +127,44 @@ int count_equal_values(const std::vector& a, const std::vector& b) { return count; } +/// @brief Miller index paired with a generic value. +/// Used as the basic element in AsuData containers. +/// @tparam T Value type (float, double, complex, etc.). template struct HklValue { - alignas(8) Miller hkl; - T value; + alignas(8) Miller hkl; ///< Miller indices (h, k, l) + T value; ///< Associated value (intensity, structure factor, etc.) + /// @brief Compare with Miller index. bool operator<(const Miller& m) const { return hkl < m; } + /// @brief Compare with another HklValue by their HKL indices. bool operator<(const HklValue& o) const { return operator<(o.hkl); } }; +/// @brief Value paired with its uncertainty (sigma/standard deviation). +/// @tparam T Numeric type (float, double, etc.). template struct ValueSigma { using value_type = T; - T value, sigma; + T value; ///< The measured or calculated value + T sigma; ///< Standard deviation or uncertainty + /// @brief Check equality of both value and sigma. bool operator==(const ValueSigma& o) const { return value == o.value && sigma == o.sigma; } }; +/// @brief Implementation functions for moving reflections to asymmetric unit. namespace impl { +/// @brief Generic move_to_asu for real-valued data (no phase adjustment). template void move_to_asu(const GroupOps&, const Miller& hkl, int, HklValue& hkl_value) { hkl_value.hkl = hkl; } +/// @brief Specialized move_to_asu for complex-valued data (applies phase shift). +/// Applies phase shift from symmetry operator and conjugation for isym % 2 == 0. template void move_to_asu(const GroupOps& gops, const Miller& hkl, int isym, HklValue>& v) { @@ -135,12 +181,17 @@ void move_to_asu(const GroupOps& gops, const Miller& hkl, int isym, } } // namespace impl +/// @brief Generic container for reflection data in asymmetric unit. +/// Stores values (e.g., structure factors, phases, intensities) indexed by Miller indices. +/// Keeps data sorted by HKL and can enforce ASU constraints. +/// @tparam T Value type (float, double, complex, ValueSigma<>, etc.). template struct AsuData { - std::vector> v; - UnitCell unit_cell_; - const SpaceGroup* spacegroup_ = nullptr; - // function defining FPhiProxy interface + std::vector> v; ///< Reflection data (HKL + value pairs) + UnitCell unit_cell_; ///< Unit cell parameters + const SpaceGroup* spacegroup_ = nullptr; ///< Space group (not owned by this object) + + // FPhiProxy interface compatibility methods size_t stride() const { return 1; } size_t size() const { return v.size(); } Miller get_hkl(size_t n) const { return v[n].hkl; } @@ -148,11 +199,17 @@ struct AsuData { double get_phi(size_t n) const { return std::arg(v[n].value); } const UnitCell& unit_cell() const { return unit_cell_; } const SpaceGroup* spacegroup() const { return spacegroup_; } + + /// @brief Sort reflections by HKL indices if not already sorted. void ensure_sorted() { if (!std::is_sorted(v.begin(), v.end())) std::sort(v.begin(), v.end()); } + /// @brief Transform all reflections to the asymmetric unit. + /// Moves reflections outside ASU to their equivalent inside, applying + /// symmetry operators and phase shifts as needed for complex values. + /// @param tnt_asu If true, use TNT-style ASU; otherwise use standard ASU. void ensure_asu(bool tnt_asu=false) { if (!spacegroup_) fail("AsuData::ensure_asu(): space group not set"); @@ -167,7 +224,13 @@ struct AsuData { } } - // load values from one column + /// @brief Load values from a single data column. + /// Reads HKLs and values from proxy, filters NaN, converts to ASU, and sorts. + /// @tparam DataProxy Type with column_index(), unit_cell(), spacegroup(), + /// size(), stride(), get_hkl(), get_num() interface. + /// @param proxy Data proxy object (MTZ, mmCIF, etc.). + /// @param label Column name/label to load. + /// @param as_is If true, skip ASU conversion and sorting (raw load). template void load_values(const DataProxy& proxy, const std::string& label, bool as_is=false) { @@ -185,7 +248,15 @@ struct AsuData { } } - // load values from two or more columns + /// @brief Load values from multiple columns (for complex, vector, or sigma pairs). + /// Reads N columns per reflection and combines them into a single value. + /// Filters out reflections with any NaN in the N columns. + /// @tparam N Number of columns to load (2 for complex/F+sigma, etc.). + /// @tparam DataProxy Type with column_index(), unit_cell(), spacegroup(), + /// size(), stride(), get_hkl(), get_num() interface. + /// @param proxy Data proxy object (MTZ, mmCIF, etc.). + /// @param labels Array of N column names/labels to load. + /// @param as_is If true, skip ASU conversion and sorting (raw load). template void load_values(const DataProxy& proxy, const std::array& labels, bool as_is=false) { @@ -212,14 +283,17 @@ struct AsuData { } private: - // for T being a number, std::array and std::complex, respectively: + /// @brief Helper to convert numeric array(s) to T. + /// Overloaded for: scalar, array<1>, complex (F+phase), ValueSigma (F+sigma). static void set_value_from_array(T& val, const std::array& nums) { val = nums[0]; } static void set_value_from_array(T& val, const T& nums) { val = nums; } + /// @brief Convert (F, phi) pair to complex structure factor. template static void set_value_from_array(std::complex& val, const std::array& nums) { R theta = (R)rad(nums[1]); val = {nums[0] * std::cos(theta), nums[0] * std::sin(theta)}; } + /// @brief Convert (value, sigma) pair to ValueSigma. template static void set_value_from_array(ValueSigma& val, const std::array& nums) { val.value = nums[0]; @@ -227,6 +301,14 @@ struct AsuData { } }; +/// @brief Create AsuData by loading from N columns in a data source. +/// @tparam T Value type to load into. +/// @tparam N Number of columns to combine per reflection. +/// @tparam Data Data source type (MTZ, mmCIF, etc.). +/// @param data Source data object. +/// @param labels Array of N column labels to load. +/// @param as_is If true, skip ASU conversion and sorting. +/// @return New AsuData object populated with data. template AsuData make_asu_data(const Data& data, const std::array& labels, bool as_is=false) { @@ -234,6 +316,14 @@ AsuData make_asu_data(const Data& data, const std::array& labe asu_data.template load_values(data_proxy(data), labels, as_is); return asu_data; } + +/// @brief Create AsuData by loading from a single column in a data source. +/// @tparam T Value type to load into. +/// @tparam Data Data source type (MTZ, mmCIF, etc.). +/// @param data Source data object. +/// @param label Column label to load. +/// @param as_is If true, skip ASU conversion and sorting. +/// @return New AsuData object populated with data. template AsuData make_asu_data(const Data& data, const std::string& label, bool as_is) { AsuData asu_data; @@ -241,7 +331,11 @@ AsuData make_asu_data(const Data& data, const std::string& label, bool as_is) return asu_data; } -/// retains only points with positive SIGF and F/SIGF > cutoff +/// @brief Filter AsuData to retain only reflections with high signal-to-noise. +/// Removes reflections where sigma <= 0 or value/sigma < cutoff. +/// @tparam T Numeric type (float, double). +/// @param asu_data AsuData container with ValueSigma values (in/out). +/// @param cutoff Minimum value/sigma ratio to retain. template void discard_by_sigma_ratio(AsuData>& asu_data, double cutoff) { vector_remove_if(asu_data.v, [cutoff](const HklValue>& p) { diff --git a/include/gemmi/binner.hpp b/include/gemmi/binner.hpp index 8e7325638..82864bf2e 100644 --- a/include/gemmi/binner.hpp +++ b/include/gemmi/binner.hpp @@ -1,6 +1,7 @@ // Copyright 2022 Global Phasing Ltd. // -// Binning - resolution shells for reflections. +/// @file +/// @brief Binner class for organizing reflections into resolution shells. #ifndef GEMMI_BINNER_HPP_ #define GEMMI_BINNER_HPP_ @@ -11,14 +12,25 @@ namespace gemmi { +/// @brief Divide reflections into resolution shells (bins) for statistics. +/// +/// Organizes reflection data into bins by resolution. Supports multiple +/// binning schemes (equal count, linear/quadratic/cubic in d or 1/d^2). +/// Provides fast bin lookup for both sorted and unsorted data. struct Binner { + /// @brief Strategy for dividing reflections into resolution shells. enum class Method { - EqualCount, - Dstar, - Dstar2, - Dstar3, + EqualCount, ///< Bins with approximately equal number of reflections + Dstar, ///< Linear spacing in 1/d (resolution) + Dstar2, ///< Linear spacing in 1/d^2 (squared reciprocal spacing) + Dstar3, ///< Cubic spacing (linear in (1/d^2)^(1/3)) }; + /// @brief Set up bins using pre-calculated 1/d^2 values. + /// @param nbins Number of resolution bins to create. + /// @param method Binning scheme (EqualCount, Dstar, Dstar2, Dstar3). + /// @param inv_d2 Vector of 1/d^2 values for all reflections (moved from caller). + /// @param cell_ Unit cell for resolution calculations (nullptr uses already-set cell). void setup_from_1_d2(int nbins, Method method, std::vector&& inv_d2, const UnitCell* cell_) { if (nbins < 1) @@ -84,6 +96,15 @@ struct Binner { limits.resize(nbins); } + /// @brief Set up bins from reflection data using DataProxy interface. + /// Automatically calculates 1/d^2 values from HKLs and unit cell. + /// @tparam DataProxy Type with spacegroup(), unit_cell(), size(), stride(), + /// get_hkl(), get_num() interface. + /// @param nbins Number of resolution bins to create. + /// @param method Binning scheme (EqualCount, Dstar, Dstar2, Dstar3). + /// @param proxy Data proxy object (typically Intensities or MTZ). + /// @param cell_ Unit cell for resolution calculations (nullptr uses proxy.unit_cell()). + /// @param col_idx Column index for filtering NaN values (0 = skip filtering). template void setup(int nbins, Method method, const DataProxy& proxy, const UnitCell* cell_=nullptr, size_t col_idx=0) { @@ -98,12 +119,15 @@ struct Binner { setup_from_1_d2(nbins, method, std::move(inv_d2), nullptr); } + /// @brief Check that bins have been set up; throw if not. void ensure_limits_are_set() const { if (limits.empty()) fail("Binner not set up"); } - // Generic. Method-specific versions could be faster. + /// @brief Find bin index for a given 1/d^2 value (binary search, generic). + /// @param inv_d2 Squared reciprocal resolution (1/d^2). + /// @return Bin index (0 to size()-1). int get_bin_from_1_d2(double inv_d2) { ensure_limits_are_set(); auto it = std::lower_bound(limits.begin(), limits.end(), inv_d2); @@ -111,13 +135,20 @@ struct Binner { return int(it - limits.begin()); } + /// @brief Find bin index for a reflection by Miller indices. + /// @param hkl Miller indices (h,k,l). + /// @return Bin index (0 to size()-1). int get_bin(const Miller& hkl) { double inv_d2 = cell.calculate_1_d2(hkl); return get_bin_from_1_d2(inv_d2); } - // We assume that bins are seeked mostly for sorted reflections, - // so it's usually either the same bin as previously, or the next one. + /// @brief Find bin index using a hint (fast path for sorted reflections). + /// Updates hint to the found bin index for the next call. + /// Assumes sorted reflections: each bin is same or next to previous. + /// @param inv_d2 Squared reciprocal resolution (1/d^2). + /// @param hint In/out: previous bin index (input), current bin index (output). + /// @return Bin index (0 to size()-1). int get_bin_from_1_d2_hinted(double inv_d2, int& hint) const { if (inv_d2 <= limits[hint]) { while (hint != 0 && limits[hint-1] > inv_d2) @@ -130,11 +161,20 @@ struct Binner { return hint; } + /// @brief Find bin index for a reflection using a hint (fast path). + /// @param hkl Miller indices (h,k,l). + /// @param hint In/out: previous bin index (input), current bin index (output). + /// @return Bin index (0 to size()-1). int get_bin_hinted(const Miller& hkl, int& hint) const { double inv_d2 = cell.calculate_1_d2(hkl); return get_bin_from_1_d2_hinted(inv_d2, hint); } + /// @brief Get bin indices for all reflections in a DataProxy. + /// Uses hinting for fast processing of sorted data. + /// @tparam DataProxy Type with size(), stride(), get_hkl() interface. + /// @param proxy Data proxy object. + /// @return Vector of bin indices (length = proxy.size() / proxy.stride()). template std::vector get_bins(const DataProxy& proxy) const { ensure_limits_are_set(); @@ -145,6 +185,11 @@ struct Binner { return nums; } + /// @brief Get bin indices for an array of 1/d^2 values. + /// Uses hinting for fast processing of sorted data. + /// @param inv_d2 Pointer to array of squared reciprocal resolutions. + /// @param size Number of values in array. + /// @return Vector of bin indices (length = size). std::vector get_bins_from_1_d2(const double* inv_d2, size_t size) const { ensure_limits_are_set(); int hint = 0; @@ -154,24 +199,34 @@ struct Binner { return nums; } + /// @brief Get bin indices for a vector of 1/d^2 values. + /// @param inv_d2 Vector of squared reciprocal resolutions. + /// @return Vector of bin indices (length = inv_d2.size()). std::vector get_bins_from_1_d2(const std::vector& inv_d2) const { return get_bins_from_1_d2(inv_d2.data(), inv_d2.size()); } + /// @brief Get minimum resolution (highest 1/d^2) of a bin. + /// @param n Bin index (0 to size()-1). + /// @return Minimum resolution d in Angstroms (highest angle, tightest spacing). double dmin_of_bin(int n) const { return 1. / std::sqrt(n == (int) size() - 1 ? max_1_d2 : limits.at(n)); } + /// @brief Get maximum resolution (lowest 1/d^2) of a bin. + /// @param n Bin index (0 to size()-1). + /// @return Maximum resolution d in Angstroms (lowest angle, loosest spacing). double dmax_of_bin(int n) const { return 1. / std::sqrt(n == 0 ? min_1_d2 : limits.at(n-1)); } + /// @brief Number of bins (resolution shells). size_t size() const { return limits.size(); } - UnitCell cell; - double min_1_d2; - double max_1_d2; - std::vector limits; // upper limit of each bin - std::vector mids; // the middle of each bin + UnitCell cell; ///< Unit cell for calculating 1/d^2 from HKL + double min_1_d2; ///< Minimum 1/d^2 in data (lowest resolution) + double max_1_d2; ///< Maximum 1/d^2 in data (highest resolution) + std::vector limits; ///< Upper limit (1/d^2) of each bin + std::vector mids; ///< Midpoint (1/d^2) of each bin }; } // namespace gemmi diff --git a/include/gemmi/cif2mtz.hpp b/include/gemmi/cif2mtz.hpp index 4300bb09e..6a6de2d09 100644 --- a/include/gemmi/cif2mtz.hpp +++ b/include/gemmi/cif2mtz.hpp @@ -1,3 +1,10 @@ +/// @file +/// @brief Convert structure factor data from mmCIF to MTZ format +/// +/// Provides CifToMtz for converting reflection data from PDB/mmCIF format +/// to CCP4 MTZ binary format. Handles both merged and unmerged data, +/// including anomalous and old-style anomalous structures. + // Copyright 2021 Global Phasing Ltd. // // A class for converting SF-mmCIF to MTZ (merged or unmerged). @@ -20,7 +27,15 @@ namespace gemmi { -// "Old-style" anomalous or unmerged data is expected to have only these tags. +/// @brief Check if reflection block uses old-style anomalous data format. +/// +/// "Old-style" anomalous or unmerged data is expected to have only these tags: +/// index_h/k/l, wavelength_id, crystal_id, scale_group_code, status, +/// and either (intensity_meas/sigma) or (F_meas_au/sigma). +/// +/// @param rb ReflnBlock to check +/// @param data_type Expected data type (Unmerged or Anomalous) +/// @return true if all tags in the reflection loop match the old-style subset inline bool possible_old_style(const ReflnBlock& rb, DataType data_type) { if (rb.refln_loop == nullptr) return false; @@ -43,13 +58,21 @@ inline bool possible_old_style(const ReflnBlock& rb, DataType data_type) { } +/// @brief Convert old-style anomalous data to modern PDBx format. +/// /// Before _refln.pdbx_F_plus/minus was introduced, anomalous data was -/// stored as two F_meas_au reflections, say (1,1,3) and (-1,-1,-3). -/// This function transcribes it to how the anomalous data is stored -/// in PDBx/mmCIF nowadays: -/// _refln.F_meas_au -> pdbx_F_plus / pdbx_F_minus, -/// _refln.F_meas_sigma_au -> pdbx_F_plus_sigma / pdbx_F_minus_sigma. -/// _refln.intensity_{meas,sigma} -> _refln.pdbx_F_plus{,_sigma} / ... +/// stored as two F_meas_au reflections, e.g. (1,1,3) and (-1,-1,-3). +/// This function transcribes it to the modern PDBx/mmCIF storage: +/// - _refln.F_meas_au → pdbx_F_plus / pdbx_F_minus +/// - _refln.F_meas_sigma_au → pdbx_F_plus_sigma / pdbx_F_minus_sigma +/// - _refln.intensity_meas/sigma → pdbx_I_plus/I_minus and sigmas +/// +/// Reflections are moved to the ASU, and when both +/- forms exist for +/// the same HKL, they are merged (missing values set to '.'). +/// +/// @param loop CIF loop containing old-style anomalous data +/// @param sg Space group for ASU determination (null → P1) +/// @return New loop with reflections in standard pdbx format inline cif::Loop transcript_old_anomalous_to_standard(const cif::Loop& loop, const SpaceGroup* sg) { std::vector positions; @@ -139,7 +162,29 @@ inline cif::Loop transcript_old_anomalous_to_standard(const cif::Loop& loop, } +/// @brief Converter from CIF reflection data to MTZ format. +/// +/// Handles conversion of structure factor data from mmCIF format +/// (PDB/wwPDB standard) to MTZ format (CCP4 binary format). +/// Supports both merged and unmerged reflection data, with configurable +/// column mappings from CIF _refln tags to MTZ column labels and types. +/// +/// Uses a specification (default or custom) to map CIF tags to MTZ columns, +/// handling code-to-number translation for categorical data (e.g., FreeR flags). struct CifToMtz { + /// @brief Get the default column mapping specification. + /// + /// Returns static arrays of mapping lines. Each line has format: + /// `cif_tag mtz_label col_type dataset_id [code_mapping]` + /// where code_mapping (optional) is a comma-separated list of `code=value` pairs. + /// + /// For merged data: includes FreeR_flag, intensities, structure factors, + /// anomalous pairs, calculated phases, and weight/FOM columns. + /// + /// For unmerged data: includes intensity_meas, detector coordinates, and rotation angle. + /// + /// @param for_merged true for merged data, false for unmerged + /// @return Pointer to null-terminated array of specification strings // Alternative mmCIF tags for the same MTZ label should be consecutive static const char** default_spec(bool for_merged) { static const char* merged[] = { @@ -192,13 +237,23 @@ struct CifToMtz { return for_merged ? merged : unmerged; } + /// @brief Specification entry mapping a CIF tag to an MTZ column. struct Entry { - std::string refln_tag; - std::string col_label; - char col_type; - int dataset_id; + std::string refln_tag; ///< CIF _refln.* tag name (without prefix) + std::string col_label; ///< MTZ column label + char col_type; ///< MTZ column type code (H,K,L,F,Q,J,P,etc.) + int dataset_id; ///< Dataset id for merged data (0 or 1) + /// Code-to-number mappings for categorical data (e.g., FreeR code → number) std::vector> code_to_number; + /// @brief Parse a specification line into an Entry. + /// + /// Expected format: + /// - `cif_tag mtz_label col_type dataset_id` + /// - `cif_tag mtz_label col_type dataset_id code1=val1,code2=val2` + /// + /// @param line Specification line to parse + /// @throws gemmi::fail if format is invalid Entry(const std::string& line) { std::vector tokens; tokens.reserve(4); @@ -234,6 +289,13 @@ struct CifToMtz { } } + /// @brief Translate a categorical value to its numeric equivalent. + /// + /// Uses code_to_number mappings to convert coded values (e.g., 'o', 'f') + /// to numeric equivalents (e.g., 1.0, 0.0 for FreeR flags). + /// + /// @param v The coded value to translate + /// @return Translated number, or NAN if no mapping found float translate_code_to_number(const std::string& v) const { if (v.size() == 1) { for (const auto& c2n : code_to_number) @@ -249,12 +311,31 @@ struct CifToMtz { } }; - bool force_unmerged = false; - std::string title; + bool force_unmerged = false; ///< If true, treat all data as unmerged + std::string title; ///< Title to set in output MTZ file + /// Historical entries to include in MTZ history; defaults to version line std::vector history = { "From gemmi-cif2mtz " GEMMI_VERSION }; - double wavelength = NAN; + double wavelength = NAN; ///< Override wavelength (NAN = auto-detect) + /// Custom column specification lines; if empty, uses default_spec() std::vector spec_lines; + /// @brief Convert a single mmCIF reflection block to MTZ format. + /// + /// Maps CIF _refln columns to MTZ columns using the specification (custom or default). + /// Handles merged and unmerged data differently: + /// + /// **Merged data:** Creates a single dataset with wavelength from ReflnBlock. + /// + /// **Unmerged data:** Extracts diffrn_id and pdbx_image_id to create BATCH column, + /// creates a dataset per crystal, and uses UnmergedHklMover to put HKLs into ASU. + /// + /// Missing values ('.' in CIF) become NAN in MTZ. The M/ISYM and BATCH columns + /// are automatically added for unmerged data. + /// + /// @param rb Reflection block with parsed _refln loop + /// @param logger Logger for informational and error messages + /// @return MTZ structure with data, columns, and metadata + /// @throws gemmi::fail if required tags (index_h/k/l, data columns) are missing Mtz convert_block_to_mtz(const ReflnBlock& rb, Logger& logger) const { Mtz mtz; mtz.title = title.empty() ? "Converted from mmCIF block " + rb.block.name : title; @@ -505,6 +586,23 @@ struct CifToMtz { return mtz; } + /// @brief Auto-detect data type and convert with optional transformation. + /// + /// Performs intelligent detection and conversion: + /// + /// **Mode 'f' (fix old-style anomalous):** If the block contains old-style + /// anomalous data, transforms it to modern pdbx_F_plus/minus format before conversion. + /// + /// **Mode 'a' (auto-detect anomalous):** After conversion, analyzes the unique + /// HKLs under symmetry to detect if data is actually anomalous or unmerged despite + /// initial classification. Logs warnings and attempts recovery for old-style data. + /// + /// **Other modes:** Performs conversion as-is without transformation. + /// + /// @param rb Reflection block (modified in-place if transformation occurs) + /// @param logger Logger for notes and errors + /// @param mode Conversion mode: 'f'=fix old anomalous, 'a'=auto-detect, other=no transform + /// @return Converted MTZ structure Mtz auto_convert_block_to_mtz(ReflnBlock& rb, Logger& logger, char mode) const { if (mode == 'f' && possible_old_style(rb, DataType::Anomalous)) *rb.refln_loop = transcript_old_anomalous_to_standard(*rb.refln_loop, rb.spacegroup); @@ -537,6 +635,15 @@ struct CifToMtz { } private: + /// @brief Convert _refln.status code to FreeR_flag value. + /// + /// Maps status codes to numeric equivalents: + /// - 'o' or quoted 'o' → 1.0 (observed/working set) + /// - 'f' or quoted 'f' → 0.0 (free set) + /// - other → NAN + /// + /// @param str Status code string from CIF + /// @return Numeric flag value (1.0, 0.0, or NAN) static float status_to_freeflag(const std::string& str) { char c = str[0]; if (c == '\'' || c == '"') diff --git a/include/gemmi/intensit.hpp b/include/gemmi/intensit.hpp index d47ad6bb8..ec77b28c3 100644 --- a/include/gemmi/intensit.hpp +++ b/include/gemmi/intensit.hpp @@ -1,5 +1,8 @@ // Copyright 2020 Global Phasing Ltd. // +/// @file +/// @brief Intensities class for reading and merging intensity data from various formats. +// // Class Intensities that reads multi-record data from MTZ, mmCIF or XDS_ASCII // and merges it into mean or anomalous intensities. // It can also read merged data. @@ -23,28 +26,40 @@ struct ReflnBlock; namespace cif { struct Block; } using std::int8_t; -// If used to request a particular data type: -// MergedMA = Mean if available, otherwise Anomalous, -// MergedAM = Anomalous if available, otherwise Mean. -// UAM = Unmerged if available, otherwise MergedAM -enum class DataType { Unknown, Unmerged, Mean, Anomalous, - MergedMA, MergedAM, UAM }; +/// Data type of intensity data: unmerged, mean intensity, or anomalous intensities. +/// +/// When requesting a particular data type, the MergedMA/MergedAM/UAM variants +/// allow fallback to secondary options (e.g., MergedMA = Mean if available, else Anomalous). +enum class DataType { + Unknown, ///< Unknown or unspecified intensity type + Unmerged, ///< Unmerged (multi-record) intensity data + Mean, ///< Mean intensity + Anomalous, ///< Anomalous intensities (I+/I-) + MergedMA, ///< Mean if available, otherwise Anomalous (fallback type) + MergedAM, ///< Anomalous if available, otherwise Mean (fallback type) + UAM ///< Unmerged if available, otherwise MergedAM (fallback type) +}; +/// @brief Statistics calculated for a resolution shell (bin) of merged intensities. +/// +/// Accumulates numerators and denominators for R-merge, R-meas, and R-pim. +/// Can be summed across shells to compute overall statistics. struct GEMMI_DLL MergingStats { - int all_refl = 0; // all reflections, sometimes called observations - int unique_refl = 0; - int stats_refl = 0; // unique reflections with 2+ observations (used for statistics) - double r_merge_num = 0; // numerator for R-merge - double r_meas_num = 0; // numerator for R-meas - double r_pim_num = 0; // numerator for R-pim - double r_denom = 0; // denominator for R-* - // sums for CC1/2 - double sum_ibar = 0; - double sum_ibar2 = 0; - double sum_sig2_eps = 0; - - /// This class is additive. Adding two MergingStats gives the same result - /// as calculating statistics in these two resolution shells from the start. + int all_refl = 0; ///< Total number of observations (all reflections) + int unique_refl = 0; ///< Number of unique reflections + int stats_refl = 0; ///< Unique reflections with 2+ observations (for statistics) + double r_merge_num = 0; ///< Numerator for R-merge calculation + double r_meas_num = 0; ///< Numerator for R-meas (redundancy-weighted) calculation + double r_pim_num = 0; ///< Numerator for R-pim (precision-indicating merge) calculation + double r_denom = 0; ///< Denominator for R-merge/meas/pim calculations + double sum_ibar = 0; ///< Sum of mean intensities for CC1/2 + double sum_ibar2 = 0; ///< Sum of squared mean intensities for CC1/2 + double sum_sig2_eps = 0; ///< Sum of variance terms for CC1/2 + + /// Accumulate statistics from another shell. + /// @param o Statistics from another resolution shell. + /// Adding two MergingStats gives the same result as calculating statistics + /// for the combined shells from the start. void add_other(const MergingStats& o) { all_refl += o.all_refl; unique_refl += o.unique_refl; @@ -58,64 +73,89 @@ struct GEMMI_DLL MergingStats { sum_sig2_eps += o.sum_sig2_eps; } + /// Compute R-merge for this shell. double r_merge() const { return r_merge_num / r_denom; } + /// Compute redundancy-weighted R-meas for this shell. double r_meas() const { return r_meas_num / r_denom; } + /// Compute precision-indicating R-pim for this shell. double r_pim() const { return r_pim_num / r_denom; } + /// Compute CC1/2 (correlation coefficient of half-datasets). double cc_half() const; // calculated using sigma-tau method - /// split-half reliability using the Spearman-Brown prediction formula :-) + /// Compute CC* using Spearman-Brown prediction formula from CC1/2. double cc_full() const { double cc = cc_half(); return 2 * cc / (1 + cc); } + /// Estimate of overall correlation coefficient from CC1/2. double cc_star() const { return std::sqrt(cc_full()); } }; -/// Returns STARANISO version or empty string. +/// @brief Extract STARANISO anisotropy B-tensor from MTZ file. +/// @param mtz MTZ file object to read from. +/// @param output Anisotropic B-tensor (3x3 symmetric matrix in Voigt notation). +/// @return STARANISO version string if found, empty string otherwise. GEMMI_DLL std::string read_staraniso_b_from_mtz(const Mtz& mtz, SMat33& output); +/// @brief Container for intensity data from reflection measurements. +/// +/// Stores multi-record (unmerged) or merged intensities with metadata such as +/// unit cell, space group, and wavelength. Supports merging operations and +/// import from MTZ, mmCIF, and XDS_ASCII formats. struct GEMMI_DLL Intensities { + /// @brief A single reflection record with intensity, sigma, and metadata. struct Refl { - Miller hkl; - int8_t isign; // 1 for I(+), -1 for I(-), 0 for mean or unmerged - int8_t isym; // for unmerged data: encodes symmetry op like M/ISYM in MTZ - short nobs; - double value; - double sigma; - + Miller hkl; ///< Miller indices (h, k, l) + int8_t isign; ///< Intensity component: 1=I(+), -1=I(-), 0=mean/unmerged + int8_t isym; ///< Symmetry operator encoding (ISYM in MTZ for unmerged data) + short nobs; ///< Number of observations (used during merging) + double value; ///< Intensity value + double sigma; ///< Standard deviation of intensity + + /// Compare reflections by (h,k,l, isign) for sorting. bool operator<(const Refl& o) const { return std::tie(hkl[0], hkl[1], hkl[2], isign) < std::tie(o.hkl[0], o.hkl[1], o.hkl[2], o.isign); } - // for merged data + /// @brief Get intensity label ("<I>", "I(+)", or "I(-)"). const char* intensity_label() const { if (isign == 0) return ""; return isign > 0 ? "I(+)" : "I(-)"; } + /// @brief Format HKL and intensity label as a string. std::string hkl_label() const { return cat(intensity_label(), " (", hkl[0], ' ', hkl[1], ' ', hkl[2], ')'); } }; + /// @brief Anisotropic scaling tensor for STARANISO B-factor correction. struct AnisoScaling { - SMat33 b = {0., 0., 0., 0., 0., 0.}; + SMat33 b = {0., 0., 0., 0., 0., 0.}; ///< Symmetric B-tensor in Voigt notation + /// Check if anisotropic tensor is set (non-zero). bool ok() const { return !b.all_zero(); } + /// @brief Compute scaling factor for a reflection at given HKL. + /// @param hkl Miller indices + /// @param cell Unit cell to convert HKL to reciprocal-space vector + /// @return Exponential scaling factor exp(0.5 * B * s * s) double scale(const Miller& hkl, const UnitCell& cell) const { Vec3 s = cell.frac.mat.left_multiply(Vec3(hkl[0], hkl[1], hkl[2])); return std::exp(0.5 * b.r_u_r(s)); } }; - std::vector data; - const SpaceGroup* spacegroup = nullptr; - UnitCell unit_cell; - double unit_cell_rmsd[6] = {0., 0., 0., 0., 0., 0.}; - double wavelength; - DataType type = DataType::Unknown; - std::vector isym_ops; - AnisoScaling staraniso_b; - + std::vector data; ///< Reflection records + const SpaceGroup* spacegroup = nullptr; ///< Space group (not owned by this object) + UnitCell unit_cell; ///< Crystal unit cell parameters + double unit_cell_rmsd[6] = {0., 0., 0., 0., 0., 0.}; ///< RMSDs of unit cell parameters + double wavelength; ///< Diffraction wavelength in Angstroms + DataType type = DataType::Unknown; ///< Type of intensity data stored + std::vector isym_ops; ///< Symmetry operators (for unmerged data) + AnisoScaling staraniso_b; ///< STARANISO anisotropy correction tensor + + /// @brief Get string representation of intensity data type. + /// @param data_type Type to convert. + /// @return Human-readable label ("I", "<I>", "I+/I-", "n/a", etc.). static const char* type_str(DataType data_type) { switch (data_type) { case DataType::Unmerged: return "I"; @@ -129,16 +169,28 @@ struct GEMMI_DLL Intensities { unreachable(); } + /// @brief Get string representation of this object's intensity data type. const char* type_str() const { return Intensities::type_str(type); } + /// @brief Get space group name (Hermann-Mauguin symbol) or "none". std::string spacegroup_str() const { return spacegroup ? spacegroup->xhm() : "none"; } - // returns (d_max, d_min) + /// @brief Get minimum and maximum resolution of reflections. + /// @return {d_max, d_min} as array of two doubles. std::array resolution_range() const; - // pre: both are sorted + /// @brief Calculate correlation of intensity values between two sorted lists. + /// @param other Another Intensities object to correlate with. + /// @return Correlation object with matching reflections analyzed. Correlation calculate_correlation(const Intensities& other) const; + /// @brief Add a single reflection if its data is valid. + /// Skips reflections with NaN or non-positive sigma (rejected by XDS, etc.). + /// @param hkl Miller indices + /// @param isign Intensity sign (1 for I+, -1 for I-, 0 for mean) + /// @param isym Symmetry operator encoding + /// @param value Intensity value + /// @param sigma Standard deviation of intensity void add_if_valid(const Miller& hkl, int8_t isign, int8_t isym, double value, double sigma) { // XDS marks rejected reflections with negative sigma. // Sigma 0.0 rarely happens (e.g. 5tkn), but is also problematic. @@ -146,6 +198,7 @@ struct GEMMI_DLL Intensities { data.push_back({hkl, isign, isym, /*nobs=*/0, value, sigma}); } + /// @brief Remove reflections that are forbidden by space group symmetry. void remove_systematic_absences() { if (!spacegroup) return; @@ -153,53 +206,99 @@ struct GEMMI_DLL Intensities { vector_remove_if(data, [&](Refl& x) { return gops.is_systematically_absent(x.hkl); }); } + /// @brief Sort reflections by (h, k, l, isign) in ascending order. void sort() { std::sort(data.begin(), data.end()); } + /// @brief Merge reflections in-place to a specified data type (mean or anomalous). + /// @param new_type Target data type for merged intensities. void merge_in_place(DataType new_type); + /// @brief Create a merged copy without modifying this object. + /// @param new_type Target data type for merged intensities. + /// @return New Intensities object with merged data. Intensities merged(DataType new_type) { Intensities m(*this); m.merge_in_place(new_type); return m; } - /// use_weights can be 'Y' (yes, like Aimless), 'U' (unweighted), 'X' (yes, like XDS) + /// @brief Calculate R-merge and related statistics for each resolution shell. + /// @param binner Resolution shell binning (or nullptr for all data in one shell). + /// @param use_weights Weighting scheme: 'Y'=Aimless style, 'U'=unweighted, 'X'=XDS style. + /// @return Vector of MergingStats, one per shell. std::vector calculate_merging_stats(const Binner* binner, char use_weights='Y') const; - // call with DataType::Anomalous before calculate_merging_stats() to get I+/I- stats + /// @brief Prepare data for merging and classify anomalous/mean component. + /// Call with DataType::Anomalous before calculate_merging_stats() to get I+/I- stats. + /// @param new_type Target data type. + /// @return Classified data type after preparation. DataType prepare_for_merging(DataType new_type); + /// @brief Convert unmerged ISYM indices to ASU indices using stored isym_ops. void switch_to_asu_indices(); + /// @brief Load unmerged intensities from MTZ file. + /// @param mtz MTZ object to read from. void import_unmerged_intensities_from_mtz(const Mtz& mtz); + /// @brief Load mean intensities from MTZ file. + /// @param mtz MTZ object to read from. void import_mean_intensities_from_mtz(const Mtz& mtz); - // with check_complete=true, throw if anomalous data is null where it shouldn't be + /// @brief Load anomalous intensities (I+/I-) from MTZ file. + /// @param mtz MTZ object to read from. + /// @param check_complete If true, throw if anomalous data is null where expected. void import_anomalous_intensities_from_mtz(const Mtz& mtz, bool check_complete=false); + /// @brief Load intensities from MTZ file, auto-detecting data type. + /// @param mtz MTZ object to read from. + /// @param data_type Requested data type; DataType::Unknown auto-detects. void import_mtz(const Mtz& mtz, DataType data_type=DataType::Unknown); + /// @brief Load unmerged intensities from mmCIF reflection block. + /// @param rb Reflection block to read from. void import_unmerged_intensities_from_mmcif(const ReflnBlock& rb); + /// @brief Load mean intensities from mmCIF reflection block. + /// @param rb Reflection block to read from. void import_mean_intensities_from_mmcif(const ReflnBlock& rb); + /// @brief Load anomalous intensities (I+/I-) from mmCIF reflection block. + /// @param rb Reflection block to read from. + /// @param check_complete If true, throw if anomalous data is null where expected. void import_anomalous_intensities_from_mmcif(const ReflnBlock& rb, bool check_complete=false); + /// @brief Load structure factor squared (F^2) from mmCIF reflection block. + /// @param rb Reflection block to read from. void import_f_squared_from_mmcif(const ReflnBlock& rb); + /// @brief Load intensities from mmCIF reflection block, auto-detecting data type. + /// @param rb Reflection block to read from. + /// @param data_type Requested data type; DataType::Unknown auto-detects. void import_refln_block(const ReflnBlock& rb, DataType data_type=DataType::Unknown); + /// @brief Load intensities from XDS_ASCII file. + /// @param xds XDS_ASCII object to read from. void import_xds(const XdsAscii& xds); - // returns STARANISO version or empty string + /// @brief Extract and store STARANISO B-tensor from MTZ file. + /// @param mtz MTZ object to read from. + /// @return STARANISO version string if found, empty string otherwise. std::string take_staraniso_b_from_mtz(const Mtz& mtz); + /// @brief Extract and store STARANISO B-tensor from mmCIF block. + /// @param block CIF block to read from. + /// @return True if tensor was found and loaded, false otherwise. bool take_staraniso_b_from_mmcif(const cif::Block& block); + /// @brief Create a merged MTZ file from these intensities. + /// @param with_nobs If true, include NOBS (observation count) column. + /// @return Mtz object with merged data. Mtz prepare_merged_mtz(bool with_nobs); }; -// Minimal compatibility with MtzDataProxy and ReflnDataProxy. +/// @brief Adapter providing DataProxy interface to Intensities data. +/// Enables use of Intensities with generic algorithms expecting standard data proxy interface. struct IntensitiesDataProxy { - const Intensities& intensities_; + const Intensities& intensities_; ///< Reference to underlying Intensities object + size_t stride() const { return 1; } size_t size() const { return intensities_.data.size(); } const SpaceGroup* spacegroup() const { return intensities_.spacegroup; } @@ -208,6 +307,15 @@ struct IntensitiesDataProxy { double get_num(size_t n) const { return intensities_.data[n].value; } }; +/// @brief Infer intensity data type from reflection data under symmetry. +/// +/// Examines unique reflections and detects whether data is unmerged (multiple +/// copies of same HKL), mean intensity (single copy), or anomalous (both I+/I-). +/// +/// @tparam DataProxy Type with spacegroup(), unit_cell(), size(), stride(), +/// get_hkl() interface. +/// @param proxy Data proxy object to analyze. +/// @return Pair of (inferred DataType, number of unique HKLs in ASU). template std::pair check_data_type_under_symmetry(const DataProxy& proxy) { const SpaceGroup* sg = proxy.spacegroup(); diff --git a/include/gemmi/mtz.hpp b/include/gemmi/mtz.hpp index c7cf44316..6e5b8b987 100644 --- a/include/gemmi/mtz.hpp +++ b/include/gemmi/mtz.hpp @@ -1,3 +1,6 @@ +/// @file +/// @brief MTZ reflection file format (X-ray crystallography). + // Copyright 2019 Global Phasing Ltd. // // MTZ reflection file format. @@ -24,15 +27,19 @@ namespace gemmi { -// Unmerged MTZ files always store in-asu hkl indices and symmetry operation -// encoded in the M/ISYM column. Here is a helper for writing such files. +/// Helper for writing unmerged MTZ files with correct M/ISYM column values. +/// Converts Miller indices to ASU-equivalent and encodes the symmetry operation. struct UnmergedHklMover { + /// Initialize with spacegroup information. + /// @param spacegroup The space group (may be null). UnmergedHklMover(const SpaceGroup* spacegroup) : asu_(spacegroup) { if (spacegroup) group_ops_ = spacegroup->operations(); } - // Modifies hkl and returns ISYM value for M/ISYM + /// Move HKL indices to ASU and return the encoded ISYM value. + /// @param hkl [in,out] Miller indices; modified to ASU-equivalent values. + /// @return ISYM value for the M/ISYM column (encodes symmetry operation). int move_to_asu(std::array& hkl) { std::pair hkl_isym = asu_.to_asu(hkl, group_ops_); hkl = hkl_isym.first; @@ -44,63 +51,125 @@ struct UnmergedHklMover { GroupOps group_ops_; }; +/// MTZ file metadata: crystallographic parameters, symmetry, and file structure. struct MtzMetadata { - std::string source_path; // input file path, if known + /// Input file path (if known). + std::string source_path; + /// True if the file's byte order matches the system (not swapped). bool same_byte_order = true; + /// For unmerged MTZ: true if HKL indices have been switched to original (non-ASU) values. bool indices_switched_to_original = false; + /// Offset (in 32-bit words) to the start of the header block. std::int64_t header_offset = 0; + /// Version stamp from VERS header line (e.g., "MTZ:V1.1"). std::string version_stamp; + /// Title from TITLE header line. std::string title; + /// Number of reflections in the data array. int nreflections = 0; + /// Sort order: columns used to sort reflections (0 = not used). std::array sort_order = {}; + /// Minimum 1/d² value in the file (d = 1/sqrt(1/d²)). double min_1_d2 = NAN; + /// Maximum 1/d² value in the file. double max_1_d2 = NAN; + /// VALM value: typically unused (for future use in CCP4). float valm = NAN; + /// Number of symmetry operations (redundant with symops.size()). int nsymop = 0; + /// Global unit cell parameters. UnitCell cell; + /// CCP4 space group number. int spacegroup_number = 0; + /// Space group name (Hermann-Mauguin, e.g., "P 21 21 21"). std::string spacegroup_name; + /// Symmetry operations read from SYMM header lines. std::vector symops; + /// Pointer to the SpaceGroup object (from symmetry database). const SpaceGroup* spacegroup = nullptr; + /// Historical processing steps (MTZHIST records). std::vector history; + /// Text appended after MTZENDOFHEADERS (non-standard). std::string appended_text; - // used to report non-critical problems when reading a file (also used in mtz2cif) + /// Logger for non-critical problems during file reading. Logger logger; }; +/// Representation of an MTZ reflection file. +/// Contains reflection data, column definitions, batch headers (for unmerged files), +/// and crystallographic metadata (cell, space group, symmetry operations). struct GEMMI_DLL Mtz : public MtzMetadata { + /// A dataset in the MTZ file hierarchy: project → crystal → dataset → columns. struct Dataset { + /// Unique dataset ID (positive integer, typically starting at 1). int id; + /// Project name (e.g., dataset owner or beamline). std::string project_name; + /// Crystal name (e.g., sample identifier). std::string crystal_name; + /// Dataset name (e.g., experiment or scan identifier). std::string dataset_name; + /// Unit cell parameters for this dataset (overrides global cell if set). UnitCell cell; - double wavelength; // 0 means not set + /// X-ray wavelength in Angstroms (0 = not set). + double wavelength; }; + /// A column in the reflection data array. + /// Stores one field per reflection (e.g., amplitude, phase, flag). struct Column { + /// Dataset ID this column belongs to. int dataset_id; + /// Column type code: 'H'=Miller index (H, K, or L), 'F'=amplitude, + /// 'Q'=standard deviation, 'J'=intensity, 'M/ISYM'=symmetry flag (unmerged), + /// 'D'=anomalous difference, 'P'=phase (degrees), 'W'=weight, 'A'=phase prob., + /// 'B'=batch number, 'Y'=M/ISYM, 'I'=integer, 'R'=R-factor, 'G'=F(+)/F(-), + /// 'K'=I(+)/I(-), 'L'=string. char type; + /// Column label (e.g., "FP", "SIGFP", "FWT", "PHWT"). std::string label; + /// Minimum value in this column (NAN if not computed). float min_value = NAN; + /// Maximum value in this column (NAN if not computed). float max_value = NAN; - std::string source; // from COLSRC + /// Source of the data (from COLSRC header; e.g., derivation formula). + std::string source; + /// Pointer to parent Mtz object (for data access). Mtz* parent; + /// Index of this column in parent->columns (used to access data array). std::size_t idx; + /// Get the Dataset this column belongs to. Dataset& dataset() { return parent->dataset(dataset_id); } + /// Get the Dataset this column belongs to (const). const Dataset& dataset() const { return parent->dataset(dataset_id); } + /// True if parent Mtz has data loaded. bool has_data() const { return parent->has_data(); } + /// Number of values in this column (0 if no data loaded). int size() const { return has_data() ? parent->nreflections : 0; } + /// Stride between consecutive values in the data array (= number of columns). size_t stride() const { return parent->columns.size(); } + /// Access column value for reflection n. + /// @param n Reflection index (0 to nreflections-1). float& operator[](std::size_t n) { return parent->data[idx + n * stride()]; } + /// Access column value for reflection n (const). float operator[](std::size_t n) const { return parent->data[idx + n * stride()]; } + /// Access column value for reflection n with bounds checking. + /// @param n Reflection index. + /// @return Reference to the data value. + /// @throws std::out_of_range if n is out of bounds. float& at(std::size_t n) { return parent->data.at(idx + n * stride()); } + /// Access column value for reflection n with bounds checking (const). float at(std::size_t n) const { return parent->data.at(idx + n * stride()); } + /// True if this column type represents an integer value. + /// Returns true for types H, B, Y, I (indices, batch, ISYM, integers). bool is_integer() const { return type == 'H' || type == 'B' || type == 'Y' || type == 'I'; } + /// Find the next column in the same dataset with a specific type. + /// @param next_type The column type to search for. + /// @return Pointer to the next matching column, or nullptr. const Column* get_next_column_if_type(char next_type) const { if (idx + 1 < parent->columns.size()) { const Column& next_col = parent->columns[idx + 1]; @@ -110,22 +179,31 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return nullptr; } + /// Iterator over this column's values. using iterator = StrideIter; + /// Begin iterator over all values in this column. iterator begin() { assert(parent); assert(&parent->columns[idx] == this); return iterator({parent->data.data(), idx, stride()}); } + /// End iterator for this column. iterator end() { return iterator({parent->data.data() + parent->data.size(), idx, stride()}); } + /// Const iterator over this column's values. using const_iterator = StrideIter; + /// Begin const iterator. const_iterator begin() const { return const_cast(this)->begin(); } + /// End const iterator. const_iterator end() const { return const_cast(this)->end(); } }; + /// Batch header for unmerged MTZ files (one per diffraction image/sweep). + /// Contains crystallographic and experimental metadata in fixed positions. struct Batch { + /// Initialize a batch with default values (matching CCP4 COMBAT/Pointless). Batch() { ints.resize(29, 0); floats.resize(156, 0.); @@ -136,16 +214,26 @@ struct GEMMI_DLL Mtz : public MtzMetadata { // COMBAT sets BSCALE=1, but Pointless sets it to 0. //floats[43] = 1.f; // batch scale } + /// Batch number (usually 1-based). int number = 0; + /// Title or description of the batch. std::string title; + /// Integer values: ints[20] = dataset_id, ints[0,1,2] = sizes (fixed). std::vector ints; + /// Float values: floats[0-5] = cell, floats[6-14] = U matrix, floats[36-37] = phi range, + /// floats[86] = wavelength. std::vector floats; + /// Axis names (e.g., "OMEGA", "KAPPA", "PHI"). std::vector axes; + /// Extract unit cell parameters from batch header. + /// @return Unit cell (a, b, c, alpha, beta, gamma). UnitCell get_cell() const { return UnitCell(floats[0], floats[1], floats[2], floats[3], floats[4], floats[5]); } + /// Set unit cell parameters in batch header. + /// @param uc The unit cell to store. void set_cell(const UnitCell& uc) { floats[0] = (float) uc.a; floats[1] = (float) uc.b; @@ -155,12 +243,26 @@ struct GEMMI_DLL Mtz : public MtzMetadata { floats[5] = (float) uc.gamma; } + /// Get the dataset ID from batch header. + /// @return Dataset ID (from ints[20]). int dataset_id() const { return ints[20]; } + /// Set the dataset ID in batch header. + /// @param id Dataset ID to store in ints[20]. void set_dataset_id(int id) { ints[20] = id; } + /// Get the X-ray wavelength. + /// @return Wavelength in Angstroms (from floats[86]). float wavelength() const { return floats[86]; } + /// Set the X-ray wavelength. + /// @param lambda Wavelength in Angstroms. void set_wavelength(float lambda) { floats[86] = lambda; } + /// Get the phi rotation start angle. + /// @return Start angle in degrees (from floats[36]). float phi_start() const { return floats[36]; } + /// Get the phi rotation end angle. + /// @return End angle in degrees (from floats[37]). float phi_end() const { return floats[37]; } + /// Get the crystal orientation matrix U (3×3). + /// @return U matrix from floats[6-14]. Mat33 matrix_U() const { return Mat33(floats[6], floats[9], floats[12], floats[7], floats[10], floats[13], @@ -168,16 +270,25 @@ struct GEMMI_DLL Mtz : public MtzMetadata { } }; + /// All datasets in the file. std::vector datasets; + /// All columns in the file (ordered by position in the data array). std::vector columns; + /// Batch headers (empty for merged MTZ files). std::vector batches; + /// Reflection data: laid out as [col0_refl0, col1_refl0, ..., col0_refl1, col1_refl1, ...]. + /// Size = columns.size() * nreflections. Access via Column's operator[]. std::vector data; + /// Create an empty MTZ object. + /// @param with_base If true, initialize with a default HKL_base dataset and H, K, L columns. explicit Mtz(bool with_base=false) { if (with_base) add_base(); } + /// Move constructor. Mtz(Mtz&& o) noexcept { *this = std::move(o); } + /// Move assignment. Updates parent pointers in all columns. Mtz& operator=(Mtz&& o) noexcept { MtzMetadata::operator=(std::move(o)); datasets = std::move(o.datasets); @@ -189,7 +300,7 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return *this; } - // explicit to be aware where we make copies + /// Copy constructor. Explicit to prevent accidental copies. Updates parent pointers. explicit Mtz(const Mtz& o) : MtzMetadata(o) { datasets = o.datasets; columns = o.columns; @@ -199,19 +310,29 @@ struct GEMMI_DLL Mtz : public MtzMetadata { col.parent = this; } + /// Copy assignment is deleted (explicit copy constructor forces intentionality). Mtz& operator=(Mtz const&) = delete; + /// Initialize with default HKL_base dataset and H, K, L columns. void add_base() { datasets.push_back({0, "HKL_base", "HKL_base", "HKL_base", cell, 0.}); for (int i = 0; i != 3; ++i) add_column(std::string(1, "HKL"[i]), 'H', 0, i, false); } - // Functions to use after MTZ headers (and data) is read. + /// @name Crystallographic properties (after reading headers/data) + /// @{ + /// Get the highest resolution in Angstroms. + /// @return d_min = 1/sqrt(max_1_d2); resolution is high when d is small. double resolution_high() const { return std::sqrt(1.0 / max_1_d2); } + /// Get the lowest resolution in Angstroms. + /// @return d_max = 1/sqrt(min_1_d2); resolution is low when d is large. double resolution_low() const { return std::sqrt(1.0 / min_1_d2); } + /// Get the unit cell for a specific dataset or the global cell. + /// @param dataset Dataset ID (default -1 = global cell, but searches datasets first). + /// @return Reference to the unit cell. UnitCell& get_cell(int dataset=-1) { for (Dataset& ds : datasets) if (ds.id == dataset && ds.cell.is_crystal() && ds.cell.a > 0) @@ -219,10 +340,13 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return cell; } + /// Get the unit cell (const). const UnitCell& get_cell(int dataset=-1) const { return const_cast(this)->get_cell(dataset); } + /// Set the global and all per-dataset unit cells to the same value. + /// @param new_cell The new unit cell parameters. void set_cell_for_all(const UnitCell& new_cell) { cell = new_cell; cell.set_cell_images_from_spacegroup(spacegroup); // probably not needed @@ -230,20 +354,37 @@ struct GEMMI_DLL Mtz : public MtzMetadata { ds.cell = cell; } + /// Calculate average unit cell from all batch headers, optionally with RMSD. + /// @param rmsd [out] Pointer to array of 6 doubles (a, b, c, alpha, beta, gamma RMSD), + /// or nullptr to skip. + /// @return Average cell from all batches, or global cell if batches are invalid. UnitCellParameters get_average_cell_from_batch_headers(double* rmsd) const; + /// Set the space group and update related fields. + /// @param new_sg Pointer to SpaceGroup (may be null). void set_spacegroup(const SpaceGroup* new_sg) { spacegroup = new_sg; spacegroup_number = new_sg ? spacegroup->ccp4 : 0; spacegroup_name = new_sg ? spacegroup->hm : ""; } + /// @} + /// @name Dataset access + /// @{ + + /// Get the last (most recently added) dataset. + /// @return Reference to the last dataset. + /// @throws std::runtime_error if no datasets exist. Dataset& last_dataset() { if (datasets.empty()) fail("MTZ dataset not found (missing DATASET header line?)."); return datasets.back(); } + /// Get dataset by ID. + /// @param id Dataset ID to look up. + /// @return Reference to the dataset. + /// @throws std::runtime_error if ID not found. Dataset& dataset(int id) { if ((size_t)id < datasets.size() && datasets[id].id == id) return datasets[id]; @@ -252,20 +393,32 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return d; fail("MTZ file has no dataset with ID " + std::to_string(id)); } + /// Get dataset by ID (const). const Dataset& dataset(int id) const { return const_cast(this)->dataset(id); } + /// Find a dataset by name. + /// @param name Dataset name to search for. + /// @return Pointer to the dataset, or nullptr if not found. Dataset* dataset_with_name(const std::string& name) { for (Dataset& d : datasets) if (d.dataset_name == name) return &d; return nullptr; } + /// Find a dataset by name (const). const Dataset* dataset_with_name(const std::string& label) const { return const_cast(this)->dataset_with_name(label); } + /// @} + /// @name Column access and queries + /// @{ + + /// Count columns with a specific label (may be > 1 if duplicates exist). + /// @param label Column label to count. + /// @return Number of columns with this label. int count(const std::string& label) const { int n = 0; for (const Column& col : columns) @@ -274,6 +427,9 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return n; } + /// Count columns of a specific type. + /// @param type Column type code (e.g., 'F', 'P', 'Q'). + /// @return Number of columns with this type. int count_type(char type) const { int n = 0; for (const Column& col : columns) @@ -282,6 +438,11 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return n; } + /// Find the first column with a given label, optionally filtered by dataset and type. + /// @param label Column label. + /// @param ds [optional] Restrict search to this dataset (nullptr = any). + /// @param type [optional] Restrict search to this type ('*' = any). + /// @return Pointer to the column, or nullptr if not found. Column* column_with_label(const std::string& label, const Dataset* ds=nullptr, char type='*') { for (Column& col : columns) if (col.label == label && (!ds || ds->id == col.dataset_id) @@ -289,17 +450,26 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return &col; return nullptr; } + /// Find the first column with a given label (const). const Column* column_with_label(const std::string& label, const Dataset* ds=nullptr, char type='*') const { return const_cast(this)->column_with_label(label, ds, type); } + /// Get a column by label, raising an error if not found. + /// @param label Column label. + /// @param ds [optional] Restrict search to this dataset. + /// @return Reference to the column. + /// @throws std::runtime_error if column not found. const Column& get_column_with_label(const std::string& label, const Dataset* ds=nullptr) const { if (const Column* col = column_with_label(label, ds)) return *col; fail("Column label not found: " + label); } + /// Get all columns of a specific type. + /// @param type Column type code. + /// @return Vector of pointers to matching columns. std::vector columns_with_type(char type) const { std::vector cols; for (const Column& col : columns) @@ -308,6 +478,9 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return cols; } + /// Get positions (indices) of all columns with a specific type. + /// @param col_type Column type code. + /// @return Vector of column indices. std::vector positions_of_columns_with_type(char col_type) const { std::vector cols; for (int i = 0; i < (int) columns.size(); ++i) @@ -316,9 +489,11 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return cols; } - // F(+)/(-) pairs should have type G (and L for sigma), - // I(+)/(-) -- K (M for sigma), but E(+)/(-) has no special column type, - // so here we use column labels not types. + /// Find anomalous (±) column pairs by label pattern matching. + /// Looks for labels with "(+)" and matches corresponding "(-)" columns. + /// Note: F(+)/(-) pairs use type G, I(+)/(-) use type K, but E(+)/(-) + /// have no dedicated type, so label matching is used. + /// @return Vector of (index_plus, index_minus) pairs. std::vector> positions_of_plus_minus_columns() const { std::vector> r; for (int i = 0; i < (int) columns.size(); ++i) { @@ -339,7 +514,11 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return r; } - /// the order of labels matters + /// Find the first column matching any label in a prioritized list. + /// @param labels List of labels to try in order. + /// @param type [optional] Column type to match ('*' = any). + /// @return Pointer to the first matching column, or nullptr. + /// @note Order of labels matters; returns the first match. const Column* column_with_one_of_labels(std::initializer_list labels, char type='*') const { for (const char* label : labels) @@ -348,7 +527,11 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return nullptr; } - /// the order of labels doesn't matter + /// Find a column matching a type and any of several labels. + /// @param type Column type to match. + /// @param labels List of labels to search for. + /// @return Pointer to the first matching column, or nullptr. + /// @note Order of labels does not matter. Column* column_with_type_and_any_of_labels(char type, std::initializer_list labels) { for (Column& col : columns) if (col.type == type) { @@ -359,75 +542,118 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return nullptr; } + /// Find the R-free flag column (common labels: FREE, RFREE, R_FREE_FLAGS, etc.). + /// @return Pointer to R-free column (type 'I'), or nullptr. Column* rfree_column() { // cf. MtzToCif::default_spec in mtz2cif.hpp return column_with_type_and_any_of_labels('I', {"FREE", "RFREE", "FREER", "FreeR_flag", "R-free-flags", "FreeRflag", "R_FREE_FLAGS"}); } + /// Find the R-free flag column (const). const Column* rfree_column() const { return const_cast(this)->rfree_column(); } + /// Find the mean intensity column (common labels: IMEAN, I, IOBS, I-obs). + /// @return Pointer to intensity column (type 'J'), or nullptr. Column* imean_column() { return column_with_type_and_any_of_labels('J', {"IMEAN", "I", "IOBS", "I-obs"}); } + /// Find the mean intensity column (const). const Column* imean_column() const { return const_cast(this)->imean_column(); } + /// Find the I(+) anomalous intensity column (common labels: I(+), IOBS(+), Iplus). + /// @return Pointer to I(+) column (type 'K'), or nullptr. Column* iplus_column() { return column_with_type_and_any_of_labels('K', {"I(+)", "IOBS(+)", "I-obs(+)", "Iplus"}); } + /// Find the I(+) column (const). const Column* iplus_column() const { return const_cast(this)->iplus_column(); } + /// Find the I(-) anomalous intensity column. + /// @return Pointer to I(-) column (type 'K'), or nullptr. Column* iminus_column() { return column_with_type_and_any_of_labels('K', {"I(-)", "IOBS(-)", "I-obs(-)", "Iminus"}); } + /// Find the I(-) column (const). const Column* iminus_column() const { return const_cast(this)->iminus_column(); } + /// @} + /// @name Data status + /// @{ + + /// Check if reflection data has been loaded. + /// @return True if data.size() == columns.size() * nreflections. bool has_data() const { return data.size() == columns.size() * nreflections; } + /// Check if this is a merged MTZ file (no batch headers). + /// @return True if batches.empty(). bool is_merged() const { return batches.empty(); } - /// Calculates min/max for all combinations of reflections and unit cells, - /// where unit cells are a global CELL and per-dataset DCELL. + /// Calculate min/max 1/d² from all reflections and unit cells. + /// Considers both global cell and per-dataset DCELLs. + /// @return [min_1_d2, max_1_d2]. std::array calculate_min_max_1_d2() const; + /// Recalculate and update min_1_d2 and max_1_d2 from reflection data. void update_reso() { std::array reso = calculate_min_max_1_d2(); min_1_d2 = reso[0]; max_1_d2 = reso[1]; } - // Functions for reading MTZ headers and data. + /// @} + /// @name File I/O + /// @{ + /// Toggle the assumed byte order and swap header_offset accordingly. void toggle_endianness() { same_byte_order = !same_byte_order; swap_eight_bytes(&header_offset); } + /// Read and verify the first 80 bytes (MTZ magic and machine stamp). + /// @param stream Input stream positioned at file start. void read_first_bytes(AnyStream& stream); - /// read headers until END + /// Read header records from VERS until END. + /// @param stream Input stream positioned at the header block. + /// @param save_headers [optional] Pointer to string vector to save header lines. void read_main_headers(AnyStream& stream, std::vector* save_headers); - /// read the part between END and MTZENDOFHEADERS + /// Read history (MTZHIST) and batch (MTZBATS) records after the END header. + /// @param stream Input stream positioned after END. void read_history_and_batch_headers(AnyStream& stream); + /// Set up spacegroup pointer from spacegroup_number or spacegroup_name. void setup_spacegroup(); + /// Read raw reflection data from stream (float32 binary). + /// @param stream Input stream. + /// @param do_read If false, skip reading (compute space only). void read_raw_data(AnyStream& stream, bool do_read=true); + /// Read all header records (convenience wrapper). + /// @param stream Input stream. void read_all_headers(AnyStream& stream); + /// Read MTZ from a stream, including headers and optionally data. + /// Expects stream positioned at file start; reads in order: raw data, main headers, batch headers. + /// @param stream Input stream. + /// @param with_data If true, read reflection data; if false, skip it. void read_stream(AnyStream& stream, bool with_data); + /// Read MTZ from a file path. + /// @param path File path. + /// @throws std::system_error or std::runtime_error on failure. void read_file(const std::string& path) { try { source_path = path; @@ -440,45 +666,90 @@ struct GEMMI_DLL Mtz : public MtzMetadata { } } + /// Read MTZ from an input object (e.g., MaybeGzipped for .mtz or .mtz.gz). + /// @tparam Input Type with path() and create_stream() methods. + /// @param input Input object. + /// @param with_data If true, read reflection data. template void read_input(Input&& input, bool with_data) { source_path = input.path(); read_stream(*input.create_stream(), with_data); } - /// the same as read_input(MaybeGzipped(path), with_data) + /// Read MTZ from a file, handling .gz compression automatically. + /// @param path File path (.mtz or .mtz.gz). + /// @param with_data If true, read reflection data (default true). void read_file_gz(const std::string& path, bool with_data=true); + /// @} + /// @name Data manipulation (reflection rows) + /// @{ + + /// Get sorted row indices based on the first N columns (HKL by default). + /// @param use_first Number of columns to use for sorting (default 3 = h, k, l). + /// @return Vector of indices [0..nreflections-1] sorted by the first N columns. std::vector sorted_row_indices(int use_first=3) const; + + /// Sort reflections in-place using the first N columns. + /// @param use_first Number of columns to use for sorting (default 3). + /// @return True if any sorting was done; false if already sorted. bool sort(int use_first=3); + /// Extract Miller indices from a reflection at a given offset in the data array. + /// @param offset Offset to the first element of the reflection (H, K, L at offsets 0, 1, 2). + /// @return Miller indices. Miller get_hkl(size_t offset) const { return {{(int)data[offset], (int)data[offset+1], (int)data[offset+2]}}; } + + /// Set Miller indices at a given offset. + /// @param offset Offset to the H element. + /// @param hkl Miller indices to store. void set_hkl(size_t offset, const Miller& hkl) { for (int i = 0; i != 3; ++i) data[offset + i] = static_cast(hkl[i]); } - /// Returns offset of the first hkl or (size_t)-1. Can be slow. + /// Find the data offset of the first reflection with specific Miller indices. + /// @param hkl Miller indices to search for. + /// @param start Starting offset (optional, default 0). + /// @return Offset to the reflection, or (size_t)-1 if not found. + /// @note This is a linear search; can be slow for large files. size_t find_offset_of_hkl(const Miller& hkl, size_t start=0) const; - /// (for merged MTZ only) change HKL to ASU equivalent, adjust phases, etc + /// Move all reflections to ASU and adjust phases/anomalous data accordingly. + /// For merged MTZ only. Transforms F(+), F(-), phases, and Hendrickson-Lattman coefficients. + /// @param tnt_asu If true, use TNT ASU setting; if false, use default ASU. void ensure_asu(bool tnt_asu=false); - /// Reindex data, usually followed by ensure_asu(). Outputs messages through logger. + /// Reindex reflections using a new basis and update space group accordingly. + /// Applies symmetry operation to HKL, removes fractional indices, adjusts cell and space group. + /// Outputs messages to logger. + /// @param op Reindexing operation (must have no translation and determinant > 0). void reindex(const Op& op); - /// Change symmetry to P1 and expand reflections. Does not sort. - /// Similar to command EXPAND in SFTOOLS. + /// Expand reflections to P1 using all symmetry operations. + /// Duplicate reflections under symmetry, adjust phases if present. + /// Similar to SFTOOLS EXPAND command. + /// @note Does not re-sort; sort afterwards if needed. void expand_to_p1(); - /// (for unmerged MTZ only) change HKL according to M/ISYM + /// For unmerged MTZ: convert HKL from ASU to original (observer) indices. + /// Reads M/ISYM column and applies inverse symmetry operations. + /// @return True if M/ISYM column was found and data was modified. bool switch_to_original_hkl(); - /// (for unmerged MTZ only) change HKL to ASU equivalent and set ISYM + /// For unmerged MTZ: convert HKL to ASU and set M/ISYM column accordingly. + /// @return True if M/ISYM column was found and data was modified. bool switch_to_asu_hkl(); + /// @} + /// @name Data construction + /// @{ + + /// Create a new dataset with auto-assigned ID and add to the file. + /// @param name Name to use for project, crystal, and dataset. + /// @return Reference to the newly added dataset. Dataset& add_dataset(const std::string& name) { int id = 0; for (const Dataset& d : datasets) @@ -488,20 +759,39 @@ struct GEMMI_DLL Mtz : public MtzMetadata { return datasets.back(); } + /// Add a column to the file and optionally expand the data array. + /// @param label Column label. + /// @param type Column type code. + /// @param dataset_id Dataset ID for this column (-1 = last dataset). + /// @param pos Position in column list (-1 = append). + /// @param expand_data If true, insert empty rows (NAN) in the data array. + /// @return Reference to the newly added column. Column& add_column(const std::string& label, char type, int dataset_id, int pos, bool expand_data); - // extra_col are columns right after src_col that are also copied. + /// Replace a column with data from another column (including trailing columns). + /// @param dest_idx Destination column index. + /// @param src_col Source column to copy from. + /// @param trailing_cols [optional] Labels of columns immediately after src_col to also copy. + /// @return Reference to the destination column. Column& replace_column(size_t dest_idx, const Column& src_col, const std::vector& trailing_cols={}); - // If dest_idx < 0 - columns are appended at the end - // append new column(s), otherwise overwrite existing ones. + /// Copy a column to a destination, or append if dest_idx < 0. + /// @param dest_idx Destination index (-1 = append). + /// @param src_col Source column. + /// @param trailing_cols [optional] Labels of subsequent columns to also copy. + /// @return Reference to the destination column. Column& copy_column(int dest_idx, const Column& src_col, const std::vector& trailing_cols={}); + /// Remove a column from the file and data array. + /// @param idx Column index to remove. void remove_column(size_t idx); + /// Remove reflection rows matching a condition. + /// @tparam Func Callable that takes pointer to row data and returns true to remove. + /// @param condition Predicate function. template void remove_rows_if(Func condition) { if (!has_data()) @@ -518,6 +808,9 @@ struct GEMMI_DLL Mtz : public MtzMetadata { nreflections = int(data.size() / width); } + /// Insert new empty columns in the data array. + /// @param added Number of columns to insert. + /// @param pos_ Position to insert at (-1 = at the end). void expand_data_rows(size_t added, int pos_=-1) { size_t old_row_size = columns.size() - added; if (data.size() != old_row_size * nreflections) @@ -528,6 +821,10 @@ struct GEMMI_DLL Mtz : public MtzMetadata { vector_insert_columns(data, old_row_size, (size_t)nreflections, added, pos, NAN); } + /// Replace the reflection data array with new data. + /// @param new_data Pointer to float array. + /// @param n Total number of floats (must be divisible by columns.size()). + /// @throws std::runtime_error if n is not a multiple of columns.size(). void set_data(const float* new_data, size_t n) { size_t ncols = columns.size(); if (n % ncols != 0) @@ -536,24 +833,54 @@ struct GEMMI_DLL Mtz : public MtzMetadata { data.assign(new_data, new_data + n); } - // Function for writing MTZ file + /// @} + /// @name File output + /// @{ + + /// Write MTZ to a C FILE stream. + /// @param stream Open FILE* stream (should be in binary write mode). void write_to_cstream(std::FILE* stream) const; + + /// Write MTZ to a string (binary data). + /// @param str [out] String to append the binary MTZ data to. void write_to_string(std::string& str) const; + + /// Write MTZ to a file. + /// @param path File path. void write_to_file(const std::string& path) const; + + /// Get the size of the binary MTZ output in bytes. + /// @return Size needed for the complete MTZ file. size_t size_to_write() const; + + /// Write MTZ to a buffer. + /// @param buf Pointer to output buffer. + /// @param maxlen Maximum bytes to write. + /// @return Number of bytes written. size_t write_to_buffer(char* buf, size_t maxlen) const; private: + /// Generic write implementation (template to support FILE*, string, buffer). + /// @tparam Write Function type (size_t write(const void*, size_t, size_t)). template void write_to_stream(Write write) const; }; +/// @} +/// Convenience function: read MTZ from a file path. +/// @param path File path. +/// @return Loaded Mtz object. inline Mtz read_mtz_file(const std::string& path) { Mtz mtz; mtz.read_file(path); return mtz; } +/// Convenience function: read MTZ from an input object (handles gzip). +/// @tparam Input Type with path() and create_stream() methods. +/// @param input Input object (e.g., MaybeGzipped). +/// @param with_data If true, read reflection data; if false, headers only. +/// @return Loaded Mtz object. template Mtz read_mtz(Input&& input, bool with_data) { Mtz mtz; @@ -561,17 +888,33 @@ Mtz read_mtz(Input&& input, bool with_data) { return mtz; } -// Abstraction of data source, cf. ReflnDataProxy. +/// Abstraction layer for accessing MTZ data uniformly. +/// Provides stride, data access, and cell/symmetry information. +/// Similar to ReflnDataProxy for reflection data in other formats. struct MtzDataProxy { + /// Reference to the MTZ object. const Mtz& mtz_; + /// Stride (number of columns) between consecutive reflections. size_t stride() const { return mtz_.columns.size(); } + /// Total number of floats in the data array. size_t size() const { return mtz_.data.size(); } + /// Element type (always float). using num_type = float; + /// Access a data element by index. + /// @param n Index into the flat data array. float get_num(size_t n) const { return mtz_.data[n]; } + /// Get the unit cell. const UnitCell& unit_cell() const { return mtz_.cell; } + /// Get the space group. const SpaceGroup* spacegroup() const { return mtz_.spacegroup; } + /// Get Miller indices from a reflection. + /// @param offset Offset to the H element. Miller get_hkl(size_t offset) const { return mtz_.get_hkl(offset); } + /// Find the column index for a given label. + /// @param label Column label. + /// @return Column index (idx). + /// @throws std::runtime_error if label not found. size_t column_index(const std::string& label) const { if (const Mtz::Column* col = mtz_.column_with_label(label)) return col->idx; @@ -579,13 +922,21 @@ struct MtzDataProxy { } }; -// Like above, but here the data is stored outside of the Mtz class +/// MtzDataProxy variant for external data (not stored in Mtz). +/// Wraps MTZ metadata with a separate data array pointer. struct MtzExternalDataProxy : MtzDataProxy { + /// Pointer to external data array. const float* data_; + /// Initialize with MTZ metadata and external data. + /// @param mtz MTZ object (for structure info only). + /// @param data Pointer to external float array (size = columns.size() * nreflections). MtzExternalDataProxy(const Mtz& mtz, const float* data) : MtzDataProxy{mtz}, data_(data) {} + /// Total size of the external data array. size_t size() const { return mtz_.columns.size() * mtz_.nreflections; } + /// Access element from external data. float get_num(size_t n) const { return data_[n]; } + /// Get Miller indices from external data. Miller get_hkl(size_t offset) const { return {{(int)data_[offset + 0], (int)data_[offset + 1], @@ -593,6 +944,9 @@ struct MtzExternalDataProxy : MtzDataProxy { } }; +/// Create a proxy for accessing MTZ data. +/// @param mtz MTZ object. +/// @return MtzDataProxy wrapping the MTZ. inline MtzDataProxy data_proxy(const Mtz& mtz) { return {mtz}; } } // namespace gemmi diff --git a/include/gemmi/mtz2cif.hpp b/include/gemmi/mtz2cif.hpp index 387ceedc8..79811d865 100644 --- a/include/gemmi/mtz2cif.hpp +++ b/include/gemmi/mtz2cif.hpp @@ -19,27 +19,60 @@ namespace gemmi { +/// @file +/// @brief Converter for MTZ reflection data to SF-mmCIF format. + +/// @brief Converts MTZ files (merged or unmerged) to SF-mmCIF reflection tables. +/// +/// This class provides configuration options for column selection, naming, +/// filtering, and metadata handling when converting MTZ format to mmCIF. class GEMMI_DLL MtzToCif { public: - // options that can be set directly - std::vector spec_lines; // conversion specification (cf. default_spec) - const char* block_name = nullptr; // NAME in data_NAME - std::string entry_id = "xxxx"; // _entry.id - bool with_comments = true; // write comments - bool with_history = true; // write MTZ history in comments - bool skip_empty = false; // skip reflections with no values - bool skip_negative_sigi = false; // skip refl. with sigma(I) < 0 in unmerged - bool enable_UB = false; // write _diffrn_orient_matrix.UB - bool write_staraniso_tensor = true; // write _reflns.pdbx_aniso_B_tensor_* + /// Column conversion specification lines (see default_spec for format). + std::vector spec_lines; + /// CIF data block name (NAME in data_NAME). + const char* block_name = nullptr; + /// Entry identifier (_entry.id tag). + std::string entry_id = "xxxx"; + /// Whether to write comments describing the conversion. + bool with_comments = true; + /// Whether to write MTZ history records in comments. + bool with_history = true; + /// Skip reflections where all selected numeric columns are missing. + bool skip_empty = false; + /// Skip unmerged reflections with sigma(I) < 0. + bool skip_negative_sigi = false; + /// Write _diffrn_orient_matrix.UB orientation matrix. + bool enable_UB = false; + /// Write _reflns.pdbx_aniso_B_tensor_* Starraniso B-tensor (if available). + bool write_staraniso_tensor = true; + /// Write PDB-specific special marker for validation. bool write_special_marker_for_pdb = false; - int less_anomalous = 0; // skip (+)/(-) columns even if in spec - std::string skip_empty_cols; // columns used to determine "emptiness" - double wavelength = NAN; // user-specified wavelength - int trim = 0; // output only reflections -N<=h,k,l<=N - int free_flag_value = -1; // -1 = auto: 0 or (if we have >50% of 0's) 1 - std::string staraniso_version; // for _software.version in "special_marker" - std::string gemmi_run_from; // added to gemmi as _software.description + /// If non-zero, skip anomalous (+/-) column pairs. + int less_anomalous = 0; + /// Columns used to determine if reflection is "empty" (when skip_empty=true). + std::string skip_empty_cols; + /// User-specified wavelength (NAN means use MTZ value). + double wavelength = NAN; + /// Trim reflections: output only those with -N<=h,k,l<=N (0 = no trim). + int trim = 0; + /// Free flag value: -1=auto, 0 or 1=explicit. + int free_flag_value = -1; + /// Starraniso version string for metadata. + std::string staraniso_version; + /// Description string appended to gemmi software entry. + std::string gemmi_run_from; + /// @brief Get default column specification for merged or unmerged data. + /// + /// The returned spec_lines describe MTZ-to-mmCIF column mapping. + /// Format: [?|&|$|H][COLUMN_NAME] [TYPE] [mmCIF_TAG] [FORMAT] + /// - ? = optional column (try alternatives separated by |) + /// - & = required, uses previous column's result + /// - $ = internal (dataset_id, counter) + /// - H = required by IUCR standard + /// @param for_merged If true, return spec for merged data; else unmerged. + /// @return Null-terminated array of spec strings. static const char** default_spec(bool for_merged) { static const char* merged[] = { "H H index_h", @@ -90,24 +123,56 @@ class GEMMI_DLL MtzToCif { return for_merged ? merged : unmerged; } + /// @brief Write MTZ reflection data to CIF format. + /// @param mtz First MTZ dataset (required). + /// @param mtz2 Optional second MTZ dataset for anomalous comparison. + /// @param staraniso_b Optional Starraniso B-tensor to include. + /// @param os Output stream for CIF file. void write_cif(const Mtz& mtz, const Mtz* mtz2, SMat33* staraniso_b, std::ostream& os); + /// @brief Write XDS reflection data to CIF format. + /// @param xds XDS_ASCII data to convert. + /// @param os Output stream for CIF file. void write_cif_from_xds(const XdsAscii& xds, std::ostream& os) const; }; +/// @brief Write Starraniso B-tensor to mmCIF format. +/// @param b 3x3 symmetric B-tensor matrix. +/// @param entry_id Entry identifier for tags. +/// @param buf Temporary buffer for formatting. +/// @param os Output stream. GEMMI_DLL void write_staraniso_b_in_mmcif(const SMat33& b, const std::string& entry_id, char* buf, std::ostream& os); -/// remove '_dataset_name' that can be appended to column names in ccp4i +/// @brief Remove '_dataset_name' appendix from MTZ column labels. +/// +/// This suffix is sometimes added by CCP4i and needs removal for proper conversion. +/// @param mtz MTZ file to modify. +/// @param logger For reporting changes. GEMMI_DLL void remove_appendix_from_column_names(Mtz& mtz, const Logger& logger); +/// @brief Validate merged MTZ has required columns for PDB deposition. +/// @param mtz MTZ to check. +/// @param logger For reporting results. +/// @return True if all required columns present. GEMMI_DLL bool validate_merged_mtz_deposition_columns(const Mtz& mtz, const Logger& logger); -// note: both mi and ui get modified +/// @brief Validate merged intensity data for consistency and quality. +/// +/// Compares merged and unmerged intensity columns for anomalous differences and completeness. +/// Modifies both Intensities objects. +/// @param mi Merged intensities (modified). +/// @param ui Unmerged intensities (modified). +/// @param relaxed_check If true, apply looser validation criteria. +/// @param logger For reporting issues. +/// @return True if validation passes. GEMMI_DLL bool validate_merged_intensities(Intensities& mi, Intensities& ui, bool relaxed_check, const Logger& logger); +/// @brief Extract software information from MTZ history records. +/// @param history Vector of history strings from MTZ file. +/// @return Vector of SoftwareItem objects describing processing steps. GEMMI_DLL std::vector get_software_from_mtz_history(const std::vector& history); diff --git a/include/gemmi/refln.hpp b/include/gemmi/refln.hpp index d74cc9939..c24df4c18 100644 --- a/include/gemmi/refln.hpp +++ b/include/gemmi/refln.hpp @@ -15,19 +15,38 @@ namespace gemmi { +/// @file +/// @brief Structure and accessors for reflection data from SF-mmCIF files. + +/// @brief Wrapper for reflection data block from mmCIF file. +/// +/// Provides column access and HKL index management for reflection data +/// stored in CIF loops (either merged _refln or unmerged _diffrn_refln categories). struct ReflnBlock { + /// CIF data block containing reflection data. cif::Block block; + /// Entry identifier from _entry.id tag. std::string entry_id; + /// Unit cell parameters. UnitCell cell; + /// Pointer to space group; nullptr if unknown. const SpaceGroup* spacegroup = nullptr; + /// X-ray wavelength in Angstroms (0 if multiple wavelengths). double wavelength; + /// Number of wavelengths in the data block. int wavelength_count; + /// Pointer to _refln loop (merged data) or nullptr if absent. cif::Loop* refln_loop = nullptr; + /// Pointer to _diffrn_refln loop (unmerged data) or nullptr if absent. cif::Loop* diffrn_refln_loop = nullptr; + /// Points to active loop (refln_loop or diffrn_refln_loop). cif::Loop* default_loop = nullptr; + /// Default constructor (empty block). ReflnBlock() = default; + /// Move constructor. ReflnBlock(ReflnBlock&& rblock_) = default; + /// Construct from CIF block; extracts cell, spacegroup, wavelength, and reflection loops. ReflnBlock(cif::Block&& block_) : block(std::move(block_)) { entry_id = cif::as_string(block.find_value("_entry.id")); impl::set_cell_from_mmcif(block, cell); @@ -43,7 +62,9 @@ struct ReflnBlock { diffrn_refln_loop = block.find_loop("_diffrn_refln.index_h").get_loop(); default_loop = refln_loop ? refln_loop : diffrn_refln_loop; } + /// Move assignment. ReflnBlock& operator=(ReflnBlock&&) = default; + /// Copy assignment (deep copy of block and pointers). ReflnBlock& operator=(const ReflnBlock& o) { if (this == &o) return *this; @@ -61,19 +82,28 @@ struct ReflnBlock { return *this; } + /// @brief Check if block contains valid reflection data. + /// @return True if default_loop is set and not null. bool ok() const { return default_loop != nullptr; } + /// @brief Throw exception if block is not valid. void check_ok() const { if (!ok()) fail("Invalid ReflnBlock"); } - // position after "_refln." or "_diffrn_refln." + /// @brief Get offset to tag name (after "_refln." or "_diffrn_refln."). + /// @return 7 for merged, 14 for unmerged. size_t tag_offset() const { return default_loop == refln_loop ? 7 : 14; } + /// @brief Switch between merged and unmerged reflection loops. + /// @param unmerged If true, use _diffrn_refln loop; otherwise use _refln. void use_unmerged(bool unmerged) { default_loop = unmerged ? diffrn_refln_loop : refln_loop; } + /// @brief Check if active loop is merged reflection data. bool is_merged() const { return ok() && default_loop == refln_loop; } - // deprecated + /// @brief Check if active loop is unmerged reflection data (deprecated). bool is_unmerged() const { return ok() && default_loop == diffrn_refln_loop; } + /// @brief Get list of column labels (without category prefix). + /// @return Vector of tag names from the active loop. std::vector column_labels() const { check_ok(); std::vector labels(default_loop->tags.size()); @@ -82,6 +112,9 @@ struct ReflnBlock { return labels; } + /// @brief Find column index by tag name (without category prefix). + /// @param tag Column name (e.g., "index_h", "F_meas_sigma_au"). + /// @return Column index, or -1 if not found. int find_column_index(const std::string& tag) const { if (!ok()) return -1; @@ -92,6 +125,10 @@ struct ReflnBlock { return -1; } + /// @brief Get column index by tag name, throwing exception if not found. + /// @param tag Column name. + /// @return Column index. + /// @throws gemmi::fail if column does not exist. size_t get_column_index(const std::string& tag) const { int idx = find_column_index(tag); if (idx == -1) { @@ -103,6 +140,11 @@ struct ReflnBlock { return idx; } + /// @brief Extract column values as typed vector. + /// @tparam T Value type (e.g., int, double). + /// @param tag Column name. + /// @param null Default value for missing data. + /// @return Vector of converted values. template std::vector make_vector(const std::string& tag, T null) const { size_t n = get_column_index(tag); @@ -112,12 +154,16 @@ struct ReflnBlock { return v; } + /// @brief Get column indices for h, k, l indices. + /// @return Array of 3 column indices for index_h, index_k, index_l. std::array get_hkl_column_indices() const { return {{get_column_index("index_h"), get_column_index("index_k"), get_column_index("index_l")}}; } + /// @brief Extract Miller indices from all reflections. + /// @return Vector of Miller indices. std::vector make_miller_vector() const { auto hkl_idx = get_hkl_column_indices(); std::vector v(default_loop->length()); @@ -127,6 +173,9 @@ struct ReflnBlock { return v; } + /// @brief Calculate 1/d^2 for all reflections using unit cell parameters. + /// @return Vector of 1/d^2 values. + /// @throws gemmi::fail if unit cell is not set. std::vector make_1_d2_vector() const { if (!cell.is_crystal() || cell.a <= 0) fail("Unit cell is not known"); @@ -141,6 +190,8 @@ struct ReflnBlock { return r; } + /// @brief Calculate d-spacing for all reflections using unit cell parameters. + /// @return Vector of d-spacing values in Angstroms. std::vector make_d_vector() const { std::vector vec = make_1_d2_vector(); for (double& d : vec) @@ -149,7 +200,12 @@ struct ReflnBlock { } }; -// moves blocks from the argument to the return value +/// @brief Convert CIF blocks to ReflnBlocks, propagating cell and spacegroup info. +/// +/// Fills in missing cell or spacegroup data by copying from the first block +/// that contains it. Moves blocks from input to output. +/// @param blocks Input CIF blocks (consumed). +/// @return Vector of ReflnBlocks. inline std::vector as_refln_blocks(std::vector&& blocks) { std::vector rvec; @@ -175,8 +231,15 @@ std::vector as_refln_blocks(std::vector&& blocks) { return rvec; } -// Get the first (merged) block with required labels. -// Optionally, block name can be specified. +/// @brief Find the first merged reflection block containing specified columns. +/// +/// Searches blocks for one with _refln loop and required column labels. +/// Propagates spacegroup from first block if needed. +/// @param blocks Input CIF blocks (consumed). +/// @param labels Required column names (empty string means optional). +/// @param block_name Optional: if provided, only this named block is considered. +/// @return The matching ReflnBlock. +/// @throws gemmi::fail if no matching block or required columns not found. inline ReflnBlock get_refln_block(std::vector&& blocks, const std::vector& labels, const char* block_name=nullptr) { @@ -209,6 +272,9 @@ inline ReflnBlock get_refln_block(std::vector&& blocks, fail("Tags not found in SF-mmCIF file: _refln.", join_str(labels, ", _refln.")); } +/// @brief Convert CIF block from non-mmCIF format to ReflnBlock. +/// @param block Input CIF block (e.g., from hkl file). +/// @return ReflnBlock with data from _refln_index_h loop. inline ReflnBlock hkl_cif_as_refln_block(cif::Block& block) { ReflnBlock rb; rb.block.swap(block); @@ -224,29 +290,47 @@ inline ReflnBlock hkl_cif_as_refln_block(cif::Block& block) { return rb; } -// Abstraction of data source, cf. MtzDataProxy. +/// @brief Generic data source abstraction over a ReflnBlock for row iteration. +/// +/// Provides uniform interface for accessing reflection columns (similar to MtzDataProxy). struct ReflnDataProxy { + /// Reference to underlying ReflnBlock. const ReflnBlock& rb_; + /// Cached indices for h, k, l columns. std::array hkl_cols_; + /// Initialize proxy from a ReflnBlock. explicit ReflnDataProxy(const ReflnBlock& rb) : rb_(rb), hkl_cols_(rb_.get_hkl_column_indices()) {} + /// Number of columns (values per reflection). size_t stride() const { return loop().tags.size(); } + /// Total number of values (stride * reflection count). size_t size() const { return loop().values.size(); } + /// Numeric value type. using num_type = double; + /// Get numeric value at flattened loop index. double get_num(size_t n) const { return cif::as_number(loop().values[n]); } + /// Get unit cell. const UnitCell& unit_cell() const { return rb_.cell; } + /// Get spacegroup. const SpaceGroup* spacegroup() const { return rb_.spacegroup; } + /// Get Miller indices at given offset in loop. Miller get_hkl(size_t offset) const { return {{get_int(offset + hkl_cols_[0]), get_int(offset + hkl_cols_[1]), get_int(offset + hkl_cols_[2])}}; } + /// Get column index by label. size_t column_index(const std::string& label) const { return rb_.get_column_index(label); } private: + /// Get active loop (with bounds check). const cif::Loop& loop() const { rb_.check_ok(); return *rb_.default_loop; } + /// Get integer value at flattened loop index. int get_int(size_t n) const { return cif::as_int(loop().values[n]); } }; +/// @brief Create a data proxy over a ReflnBlock. +/// @param rb ReflnBlock to wrap. +/// @return ReflnDataProxy for generic access. inline ReflnDataProxy data_proxy(const ReflnBlock& rb) { return ReflnDataProxy(rb); } } // namespace gemmi diff --git a/include/gemmi/xds2mtz.hpp b/include/gemmi/xds2mtz.hpp index af142a4e3..826c6268b 100644 --- a/include/gemmi/xds2mtz.hpp +++ b/include/gemmi/xds2mtz.hpp @@ -12,6 +12,17 @@ namespace gemmi { +/// @file +/// @brief Converter for XDS reflection data to MTZ format. + +/// @brief Convert XDS reflection data to MTZ format. +/// +/// For unmerged data, creates unmerged MTZ with batch headers matching Pointless output. +/// For merged data, uses Intensities class to prepare merged MTZ. +/// Sets up all standard MTZ columns: H, K, L, M/ISYM, BATCH, I, SIGI, XDET, YDET, ROT, +/// plus optional FRACTIONCALC, LP, CORR, and MAXC columns depending on XDS read_columns. +/// @param xds XDS data to convert (modified). +/// @return Populated MTZ object sorted by reflection index. inline Mtz xds_to_mtz(XdsAscii& xds) { if (xds.is_merged()) { Intensities intensities; diff --git a/include/gemmi/xds_ascii.hpp b/include/gemmi/xds_ascii.hpp index 38b7a93f9..1f4ed8ceb 100644 --- a/include/gemmi/xds_ascii.hpp +++ b/include/gemmi/xds_ascii.hpp @@ -11,76 +11,138 @@ namespace gemmi { -// from Pointless docs: likely in-house source, in which case -// the unpolarised value is left unchanged (recognised wavelengths -// are CuKalpha 1.5418 +- 0.0019, Mo 0.7107 +- 0.0002, Cr 2.29 +- 0.01) +/// @file +/// @brief Reader for XDS_ASCII.HKL and INTEGRATE.HKL reflection files. + +/// @brief Check if wavelength likely comes from in-house X-ray source. +/// +/// Based on Pointless documentation; recognizes Cu, Mo, and Cr K-alpha +/// wavelengths with their typical uncertainties. +/// @param wavelength Wavelength in Angstroms. +/// @return True if wavelength matches Cu, Mo, or Cr. inline bool likely_in_house_source(double wavelength) { return std::fabs(wavelength - 1.5418) < 0.0019 || std::fabs(wavelength - 0.7107) < 0.0002 || std::fabs(wavelength - 2.29) < 0.01; } +/// @brief Metadata for XDS reflection data (base class). struct XdsAsciiMetadata { + /// @brief Properties of one integration set (dataset/sweep). struct Iset { + /// Integration set ID. int id; + /// Input file name for this set. std::string input_file; + /// Wavelength in Angstroms (0 if not specified). double wavelength = 0.; + /// Unit cell constants [a, b, c, alpha, beta, gamma]. std::array cell_constants = {0., 0., 0., 0., 0., 0.}; - //statistics set by gather_iset_statistics() + /// Minimum frame number (set by gather_iset_statistics). int frame_number_min = -1; + /// Maximum frame number (set by gather_iset_statistics). int frame_number_max = -1; + /// Total number of distinct frames in set. int frame_count = -1; + /// Number of reflections in this set. int reflection_count = -1; + /// @brief Construct Iset with given ID. Iset(int id_) : id(id_) {} }; + /// Source file path. std::string source_path; - int read_columns = 0; // doesn't include ITEM_ISET from XSCALE + /// Number of columns read from DATA section (0-13, excludes ITEM_ISET from XSCALE). + int read_columns = 0; + /// Space group number from XDS_ASCII header. int spacegroup_number = 0; + /// X-ray wavelength in Angstroms. double wavelength = 0.; + /// Unit cell constants [a, b, c, alpha, beta, gamma]. std::array cell_constants = {0., 0., 0., 0., 0., 0.}; + /// Unit cell matrix: rows are a*, b*, c* axes (reciprocal lattice). Mat33 cell_axes{0.}; + /// Incident beam direction (usually normalized). Vec3 incident_beam_dir; + /// Oscillation range per frame in degrees. double oscillation_range = 0.; + /// Rotation axis direction. Vec3 rotation_axis; + /// Starting angle for rotation in degrees. double starting_angle = 0.; + /// Mosaicity/reflecting range standard deviation in degrees. double reflecting_range_esd = 0.; + /// Friedel's law assumption: '\0' unknown, 'T'rue, 'F'alse. char friedels_law = '\0'; + /// First frame number. int starting_frame = 1; - int nx = 0; // detector size - number of pixels + /// Detector width in pixels. + int nx = 0; + /// Detector height in pixels. int ny = 0; - double qx = 0.; // pixel size in mm + /// Pixel size in x-direction (mm). + double qx = 0.; + /// Pixel size in y-direction (mm). double qy = 0.; + /// Detector origin in x-direction (mm). double orgx = 0.; + /// Detector origin in y-direction (mm). double orgy = 0.; + /// Distance from sample to detector (mm). double detector_distance = 0.; + /// Program name that generated the file (e.g., "XDS"). std::string generated_by; + /// Version string of generating program. std::string version_str; + /// Integration sets (for multi-sweep data). std::vector isets; }; +/// @brief Container for XDS reflection data (XDS_ASCII.HKL or INTEGRATE.HKL). +/// +/// Stores integration metadata and per-reflection measurements from XDS output. +/// Supports both merged and unmerged data depending on read_columns value. struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { + /// @brief One reflection record from XDS data. struct Refl { + /// Miller indices (h, k, l). Miller hkl; + /// Integration set ID (dataset number). int iset = 1; + /// Integrated intensity. double iobs; + /// Standard deviation of iobs. double sigma; + /// Detector position x (mm). double xd; + /// Detector position y (mm). double yd; + /// Frame number (zd) relative to starting frame (can be negative). double zd; + /// Reciprocal lattice point (RLP) value; related to partiality. double rlp; + /// Peak intensity percentage (0-10000 for 0-100%). double peak; - double corr; // is it always integer? + /// Correction factor (Lorentz-polarization etc; usually 100-150). + double corr; + /// Maximum pixel value in reflection. double maxc; - // ZD can be negative for a few reflections + /// @brief Get frame number (rounded up from zd). + /// @return Frame number (zd is negative-friendly). int frame() const { return (int) std::floor(zd + 1); } }; + /// All reflection records. std::vector data; + /// @brief Default constructor. XdsAscii() = default; + /// @brief Construct with existing metadata. XdsAscii(const XdsAsciiMetadata& m) : XdsAsciiMetadata(m) {} + /// @brief Get existing or create new integration set. + /// @param id Integration set ID. + /// @return Reference to Iset with given ID. Iset& find_or_add_iset(int id) { for (Iset& i : isets) if (i.id == id) @@ -88,24 +150,40 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { isets.emplace_back(id); return isets.back(); } + /// @brief Read XDS file from stream. + /// @param reader Input stream handler. + /// @param source File path (for error messages). void read_stream(AnyStream& reader, const std::string& source); + /// @brief Read XDS file from input object (file or stdin). + /// @tparam T Input object with create_stream() and path() methods. + /// @param input Input object. template void read_input(T&& input) { read_stream(*input.create_stream(), input.path()); } + /// @brief Check if data is merged (few columns in XDS file). + /// @return True if read_columns < 8 (no per-reflection BATCH info). bool is_merged() const { return read_columns < 8; } - // set a few Iset properties in isets + /// @brief Calculate frame number statistics for each integration set. + /// + /// Sets frame_number_min, frame_number_max, frame_count, and + /// reflection_count for each Iset. void gather_iset_statistics(); + /// @brief Calculate rotation angle for a reflection. + /// @param refl Reflection record with zd frame number. + /// @return Rotation angle in degrees. double rot_angle(const Refl& refl) const { double z = refl.zd - starting_frame + 1; return starting_angle + oscillation_range * z; } - // it's already normalized, but just in case normalize it again + /// @brief Get normalized rotation axis. + /// @return Normalized rotation_axis vector. + /// @throws gemmi::fail if rotation_axis is zero. Vec3 get_rotation_axis() const { double length = rotation_axis.length(); if (length == 0) @@ -113,7 +191,9 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { return rotation_axis / length; } - // I'm not sure if always |incident_beam_dir| == 1/wavelength + /// @brief Get normalized incident beam direction (S0). + /// @return Normalized incident_beam_dir vector. + /// @throws gemmi::fail if incident_beam_dir is zero. Vec3 get_s0_direction() const { double length = incident_beam_dir.length(); if (length == 0) @@ -121,6 +201,8 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { return incident_beam_dir / length; } + /// @brief Check if reciprocal lattice vectors (cell_axes) are set. + /// @return True if all 3 reciprocal axes are non-zero. bool has_cell_axes() const { for (int i = 0; i < 3; ++i) if (cell_axes[i][0] == 0 && cell_axes[i][1] == 0 && cell_axes[i][2] == 0) @@ -128,12 +210,15 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { return true; } - /// Return transition matrix from "Cambridge" frame to XDS frame. - /// x_xds = M x_cam + /// @brief Calculate transformation matrix from Cambridge frame to XDS frame. + /// + /// Cambridge frame: z along rotation axis, x along incident beam. + /// @return 3x3 matrix such that x_xds = M * x_cambridge. + /// @throws gemmi::fail if geometry data missing. Mat33 calculate_conversion_from_cambridge() const { // Cambridge z direction is along the principal rotation axis Vec3 z = get_rotation_axis(); - // Cambridge z direction is along beam + // Cambridge x direction is along beam Vec3 x = get_s0_direction(); Vec3 y = z.cross(x).normalized(); // beam and rotation axis may not be orthogonal @@ -141,6 +226,9 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { return Mat33::from_columns(x, y, z); } + /// @brief Calculate crystal orientation matrix U. + /// @return 3x3 orientation matrix. + /// @throws gemmi::fail if cell_axes not set. Mat33 get_orientation() const { if (!has_cell_axes()) fail("unknown unit cell axes"); @@ -154,21 +242,31 @@ struct GEMMI_DLL XdsAscii : XdsAsciiMetadata { return Mat33::from_columns(ar, br, cr); } - /// \par p is degree of polarization from range (0,1), as used in XDS. + /// @brief Apply polarization correction to all intensities and sigmas. + /// + /// Based on incident beam direction, rotation axis, and polarization plane. + /// Assumes XDS file has unpolarized beam correction already applied. + /// @param p Degree of polarization in [0, 1] (0.5 for unpolarized). + /// @param normal Normal vector to polarization plane. void apply_polarization_correction(double p, Vec3 normal); - /// \par overload is maximally allowed pixel value in a peak (MAXC). + /// @brief Remove reflections with peak pixel value exceeding threshold. + /// @param overload Maximum allowed MAXC pixel value. void eliminate_overloads(double overload) { vector_remove_if(data, [&](Refl& r) { return r.maxc > overload; }); } - /// \par batchmin lowest allowed batch number. + /// @brief Remove reflections with frame number below threshold. + /// @param batchmin Minimum frame number to keep. void eliminate_batchmin(int batchmin) { double minz = batchmin - 1; vector_remove_if(data, [&](Refl& r) { return r.zd < minz; }); } }; +/// @brief Read XDS_ASCII file from path. +/// @param path File path. +/// @return Populated XdsAscii object. inline XdsAscii read_xds_ascii_file(const std::string& path) { XdsAscii ret; FileStream stream(path.c_str(), "rb"); @@ -176,7 +274,9 @@ inline XdsAscii read_xds_ascii_file(const std::string& path) { return ret; } -/// read possibly gzipped file +/// @brief Read XDS_ASCII file, handling gzip compression. +/// @param path File path (may be .gz). +/// @return Populated XdsAscii object. GEMMI_DLL XdsAscii read_xds_ascii(const std::string& path); } // namespace gemmi