diff --git a/docs/api.rst b/docs/api.rst index 6697dfc5..95beeae6 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -54,6 +54,35 @@ Map and Grid Data .. doxygenfile:: model.hpp :project: gemmi +CIF Data Reading and Writing +----------------------------- + +*(Full documentation added in PR 3.)* + +.. doxygenfile:: cifdoc.hpp + :project: gemmi + +.. doxygenfile:: cif.hpp + :project: gemmi + +.. doxygenfile:: read_cif.hpp + :project: gemmi + +.. doxygenfile:: to_cif.hpp + :project: gemmi + +.. doxygenfile:: to_json.hpp + :project: gemmi + +.. doxygenfile:: json.hpp + :project: gemmi + +.. doxygenfile:: numb.hpp + :project: gemmi + +.. doxygenfile:: ddl.hpp + :project: gemmi + Structure I/O ------------- diff --git a/include/gemmi/cif.hpp b/include/gemmi/cif.hpp index 8ac46fc8..df2c694e 100644 --- a/include/gemmi/cif.hpp +++ b/include/gemmi/cif.hpp @@ -1,3 +1,14 @@ +/// @file +/// @brief PEGTL-based CIF parser with pluggable action handlers and Document construction. +/// +/// This header provides the complete CIF parsing infrastructure: +/// - PEG grammar rules for CIF 1.1 syntax (namespace `rules`) +/// - Customizable action handlers (templates specializing `Action`) +/// - Built-in actions that construct an in-memory Document +/// - Entry points: read_file(), read_memory(), read_cstream(), read_istream(), read() +/// +/// For high-level parsing of standard formats (mmCIF, plain CIF), prefer read_cif.hpp. + // Copyright 2017 Global Phasing Ltd. // // CIF parser (based on PEGTL) with pluggable actions, @@ -264,10 +275,22 @@ template<> struct Action { }; +/// @brief Parse CIF content from an input, populating a Document. +/// @tparam Input PEGTL input type (e.g., pegtl::file_input, pegtl::memory_input). +/// @param d Document to populate with parsed blocks and items. +/// @param in PEGTL input object. +/// @throws pegtl::parse_error on syntax errors. template void parse_input(Document& d, Input&& in) { pegtl::parse(in, d); } +/// @brief Read a complete CIF file and return a Document. 
+/// @tparam Input PEGTL input type. +/// @param in PEGTL input object with a source() method. +/// @param check_level Validation strictness: 0=no checks, 1=missing values & duplicates, 2=also empty loops. +/// @return Fully parsed Document. +/// @throws pegtl::parse_error on syntax errors. +/// @throws std::runtime_error on validation failures (check_level > 0). template Document read_input(Input&& in, int check_level=1) { Document doc; doc.source = in.source(); @@ -286,6 +309,12 @@ template Document read_input(Input&& in, int check_level=1) { return doc; } +/// @brief Parse a single CIF data block and add it to a Document. +/// @tparam Input PEGTL input type. +/// @param d Document to append to. +/// @param in PEGTL input. +/// @return Byte offset after parsing the block. +/// @throws pegtl::parse_error on syntax errors. template size_t parse_one_block(Document& d, Input&& in) { pegtl::parse(in, d); @@ -302,21 +331,48 @@ size_t parse_one_block(Document& d, Input&& in) { tao::pegtl::file_input<> in(path) #endif +/// @brief Read a CIF file from disk. +/// @param filename Path to the CIF file. +/// @param check_level Validation level (0-2). +/// @return Parsed Document. +/// @throws std::runtime_error if file cannot be opened. +/// @throws pegtl::parse_error on syntax errors. inline Document read_file(const std::string& filename, int check_level=1) { GEMMI_CIF_FILE_INPUT(in, filename); return read_input(in, check_level); } +/// @brief Read CIF from memory. +/// @param data Pointer to CIF content (need not be null-terminated). +/// @param size Number of bytes to parse. +/// @param name Label for error messages (e.g., "buffer"). +/// @param check_level Validation level (0-2). +/// @return Parsed Document. +/// @throws pegtl::parse_error on syntax errors. 
inline Document read_memory(const char* data, size_t size, const char* name, int check_level=1) { pegtl::memory_input<> in(data, size, name); return read_input(in, check_level); } +/// @brief Read CIF from a C FILE stream. +/// @param f Open FILE pointer (e.g., stdin, or result of fopen()). +/// @param bufsize Buffering size for reading (e.g., 16*1024). +/// @param name Label for error messages. +/// @param check_level Validation level (0-2). +/// @return Parsed Document. +/// @throws pegtl::parse_error on syntax errors. inline Document read_cstream(std::FILE *f, size_t bufsize, const char* name, int check_level=1) { pegtl::cstream_input<> in(f, bufsize, name); return read_input(in, check_level); } +/// @brief Read CIF from a C++ std::istream. +/// @param is Input stream (e.g., std::ifstream, std::cin). +/// @param bufsize Buffering size (e.g., 16*1024). +/// @param name Label for error messages. +/// @param check_level Validation level (0-2). +/// @return Parsed Document. +/// @throws pegtl::parse_error on syntax errors. inline Document read_istream(std::istream &is, size_t bufsize, const char* name, int check_level=1) { pegtl::istream_input<> in(is, bufsize, name); @@ -332,6 +388,11 @@ template<> struct CheckAction { } }; +/// @brief Try parsing CIF without validation or error throwing. +/// @tparam Input PEGTL input type. +/// @param in PEGTL input. +/// @param msg Optional pointer to store error message (if parsing fails). +/// @return true if parse succeeded, false otherwise. template bool try_parse(Input&& in, std::string* msg) { try { return pegtl::parse(in); @@ -342,8 +403,13 @@ template bool try_parse(Input&& in, std::string* msg) { } } -// A function for transparent reading of normal and compressed files. -// T should have the same traits as BasicInput and MaybeGzipped. +/// @brief Read CIF from a file or stream, handling compression transparently. +/// @tparam T Type with methods: uncompress_into_buffer(), is_stdin(), is_compressed(), path(). 
+/// (Traits matching BasicInput and MaybeGzipped wrappers in Gemmi.) +/// @param input Input wrapper, e.g. BasicInput or MaybeGzipped. +/// @param check_level Validation level (0-2). +/// @return Parsed Document. +/// @throws pegtl::parse_error on syntax errors. template Document read(T&& input, int check_level=1) { if (CharArray mem = input.uncompress_into_buffer()) @@ -353,6 +419,11 @@ Document read(T&& input, int check_level=1) { return read_file(input.path(), check_level); } +/// @brief Check CIF syntax without building a Document. +/// @tparam T Type with uncompress_into_buffer() and path() methods. +/// @param input Input wrapper. +/// @param msg Optional pointer to store error message. +/// @return true if syntax is valid, false otherwise. template bool check_syntax(T&& input, std::string* msg) { if (CharArray mem = input.uncompress_into_buffer()) { @@ -363,6 +434,13 @@ bool check_syntax(T&& input, std::string* msg) { return try_parse(in, msg); } +/// @brief Read one CIF block from a file or stream into an existing Document. +/// @tparam T Type with is_compressed(), is_stdin(), uncompress_into_buffer(size_t), path() methods. +/// @param d Document to append block to. +/// @param input Input wrapper. +/// @param limit Max bytes to read from compressed file (0 = no limit). +/// @return Byte offset after parsing the block. +/// @throws pegtl::parse_error on syntax errors. template size_t read_one_block(Document& d, T&& input, size_t limit) { if (input.is_compressed()) { diff --git a/include/gemmi/cifdoc.hpp b/include/gemmi/cifdoc.hpp index 1adcdd99..db846313 100644 --- a/include/gemmi/cifdoc.hpp +++ b/include/gemmi/cifdoc.hpp @@ -1,3 +1,11 @@ +/// @file +/// @brief In-memory representation of a CIF (Crystallographic Information File) document. +/// +/// This header defines the core data structures for parsing and manipulating CIF files. 
+/// It provides a document model that can represent both traditional CIF and mmCIF (macromolecular CIF) +/// formats, as well as alternative serializations like CIF-JSON or mmJSON. +/// The model consists of blocks, items (tag-value pairs or loops), and supports frame nesting. + // Copyright 2017 Global Phasing Ltd. // // struct Document that represents the CIF file (but can also be @@ -33,11 +41,17 @@ namespace cif { using std::size_t; using gemmi::fail; +/// @brief Discriminator for CIF items: single tag-value pairs, loops, or frames. enum class ItemType : unsigned char { + /// A single tag-value pair (e.g., `_cell.length_a 10.5`) Pair, + /// A loop with tags (column headers) and values in row-major storage Loop, + /// A save frame (nested block); used in CIF to define templates or additional metadata Frame, + /// A comment item (prefix-preserved in output, not validated for syntax) Comment, + /// Placeholder for a logically removed item; storage not reclaimed Erased, }; @@ -124,48 +138,90 @@ struct LoopArg {}; struct FrameArg { std::string str; }; struct CommentArg { std::string str; }; +/// @brief A tabular loop structure: tags (column names) and flat row-major values. +/// +/// In CIF syntax, a loop is a compact representation of a table with named columns: +/// ``` +/// loop_ +/// _category.tag1 _category.tag2 _category.tag3 +/// value1a value2a value3a +/// value1b value2b value3b +/// ``` +/// Internally, the tag names are stored in `tags` and all values are stored sequentially +/// in `values` using row-major layout: for N columns and M rows, `values.size() == N*M`, +/// and element at (row r, column c) is at index `r * N + c`. struct Loop { + /// Column header names (tags), typically with a common prefix (e.g., `_atom_site.`) std::vector tags; + /// All values in row-major order: consecutive `tags.size()` elements form one row. + /// Invariant: `values.size() % tags.size() == 0`. 
std::vector values; - // search and access + /// @brief Find a tag by case-insensitive match. + /// @param lctag Tag name converted to lowercase. + /// @return Column index (0-based) if found; -1 if not found. int find_tag_lc(const std::string& lctag) const { auto f = std::find_if(tags.begin(), tags.end(), [&lctag](const std::string& t) { return gemmi::iequal(t, lctag); }); return f == tags.end() ? -1 : f - tags.begin(); } + /// @brief Find a tag by case-insensitive match. + /// @param tag Tag name (converted to lowercase internally). + /// @return Column index (0-based) if found; -1 if not found. int find_tag(const std::string& tag) const { return find_tag_lc(gemmi::to_lower(tag)); } + /// @brief Check if a tag exists (case-insensitive). bool has_tag(const std::string& tag) const { return find_tag(tag) != -1; } + /// @brief Number of columns in this loop. size_t width() const { return tags.size(); } + /// @brief Number of rows in this loop. size_t length() const { return values.size() / tags.size(); } + /// @brief Direct access to a value by row and column index (row-major layout). + /// @param row Row index (0-based). + /// @param col Column index (0-based). + /// @return Reference to the value at (row, col). std::string& val(size_t row, size_t col) { return values[row * tags.size() + col]; } + /// @brief Const overload of val(). const std::string& val(size_t row, size_t col) const { return const_cast(this)->val(row, col); } + /// @brief Clear all tags and values from this loop. void clear() { tags.clear(); values.clear(); } + /// @brief Insert values into the loop, optionally at a specific row position. + /// @tparam T Container type with begin()/end() iterators (e.g., std::vector, std::initializer_list). + /// @param new_values Container of strings to insert. + /// @param pos Row position to insert at (-1 appends at end). 
template void add_values(T new_values, int pos=-1) { auto it = values.end(); if (pos >= 0 && pos * width() < values.size()) it = values.begin() + pos * tags.size(); values.insert(it, new_values.begin(), new_values.end()); } + /// @brief Overload for initializer_list. void add_values(std::initializer_list new_values, int pos=-1) { add_values>(new_values, pos); } + /// @brief Add a complete row to the loop (must match column count). + /// @tparam T Container with begin()/end() iterators. + /// @param new_values Container of strings; size must equal `width()`. + /// @param pos Row position to insert at (-1 appends at end). + /// @throws std::runtime_error if new_values.size() != tags.size(). template void add_row(T new_values, int pos=-1) { if (new_values.size() != tags.size()) fail("add_row(): wrong row length."); add_values(new_values, pos); } + /// @brief Overload for initializer_list. void add_row(std::initializer_list new_values, int pos=-1) { add_row>(new_values, pos); } - // comments are added relying on how cif writing works + /// @brief Add a comment prefix to the first value of a row, then add the row. + /// @param ss Initializer list with comment string at index 0, then `width()` value strings. + /// @throws std::runtime_error if ss.size() != tags.size() + 1. void add_comment_and_row(std::initializer_list ss) { if (ss.size() != tags.size() + 1) fail("add_comment_and_row(): wrong row length."); @@ -173,13 +229,17 @@ struct Loop { vec[0] = cat('#', *ss.begin(), '\n', vec[0]); add_row(vec); } + /// @brief Remove the last row from the loop. + /// @throws std::runtime_error if the loop is already empty. void pop_row() { if (values.size() < tags.size()) fail("pop_row() called on empty Loop"); values.resize(values.size() - tags.size()); } - // the arguments must be valid row indices + /// @brief Move a row to a different position within the loop. + /// @param old_pos Current row index (0-based); must be < length(). 
+ /// @param new_pos Target row index (0-based); must be < length(). void move_row(int old_pos, int new_pos) { size_t w = width(); auto src = values.begin() + old_pos * w; @@ -190,7 +250,10 @@ struct Loop { std::rotate(dst, src, src+w); } - // column_names are not checked for duplicates nor for category name + /// @brief Add new columns with an initial fill value. + /// @param column_names Vector of new tag names (must start with '_'). + /// @param value String value to fill for all existing rows. + /// @param pos Column position to insert at (-1 appends at end). void add_columns(const std::vector& column_names, const std::string& value, int pos=-1) { for (const std::string& name : column_names) @@ -202,6 +265,9 @@ struct Loop { vector_insert_columns(values, old_width, len, column_names.size(), upos, value); } + /// @brief Remove a column by tag name. + /// @param column_name Tag to remove (case-insensitive search). + /// @throws std::runtime_error if tag not found. void remove_column(const std::string& column_name) { int n = find_tag(column_name); if (n == -1) @@ -209,14 +275,19 @@ struct Loop { remove_column_at(n); } - /// \pre: n < tags.size() + /// @brief Remove a column by index. + /// @param n Column index; must be < tags.size(). void remove_column_at(size_t n) { tags.erase(tags.begin() + n); vector_remove_column(values, tags.size(), n); } + /// @brief Replace all values with columns from a vector of column vectors. + /// @param columns Vector of columns; size must equal width(), each column must equal length(). void set_all_values(std::vector> columns); + /// @brief Extract the common prefix from all tags in this loop. + /// @return Longest prefix that all tags share (case-insensitive). std::string common_prefix() const { if (tags.empty()) return {}; @@ -235,30 +306,55 @@ struct Loop { struct Item; struct Block; -// Accessor to a specific loop column, or to a single value from a Pair. +/// @brief A view into a single column of a Loop, or a single Pair value. 
+/// +/// Provides array-like access to a sequence of values from either a loop column or a pair value. +/// Acts as both a reference (can be modified through operator[]) and an iterable container. class Column { public: + /// @brief Construct an empty/null column. Column() : item_(nullptr) {} + /// @brief Construct a column view for a specific item and column index. + /// @param item Pointer to an Item (must be Loop or Pair type). + /// @param col Column index; for Loop, this is the column position; for Pair, should be 0. Column(Item* item, size_t col) : item_(item), col_(col) {} + /// @brief Iterator type for strided traversal of column values. using iterator = StrideIter; + /// @brief Begin iterator; provides access to the first value in the column. iterator begin(); + /// @brief End iterator; one-past-the-last value. iterator end(); + /// @brief Const iterator type. using const_iterator = StrideIter; + /// @brief Const begin iterator. const_iterator begin() const { return const_cast(this)->begin(); } + /// @brief Const end iterator. const_iterator end() const { return const_cast(this)->end(); } + /// @brief Get the underlying Loop, if this column comes from a Loop item; nullptr otherwise. Loop* get_loop() const; + /// @brief Get the tag (column header) string for this column. + /// @return Pointer to the tag string (valid as long as the Item is alive). std::string* get_tag(); + /// @brief Const overload of get_tag(). const std::string* get_tag() const { return const_cast(this)->get_tag(); } + /// @brief Number of values in this column. + /// @return Loop length if from a Loop; 1 if from a Pair; 0 if null. int length() const { if (const Loop* loop = get_loop()) return loop->length(); return item_ ? 1 : 0; } + /// @brief Check if this column is valid (not null). explicit operator bool() const { return item_ != nullptr ; } + /// @brief Access a value by index (0-based; for Pair, only index 0 is valid). 
std::string& operator[](int n); + /// @brief Safe access with bounds checking and negative indexing support. + /// @param n Index (negative indices count from end). + /// @return Reference to the value. + /// @throws std::out_of_range if index is out of bounds. std::string& at(int n) { if (n < 0) n += length(); @@ -267,67 +363,97 @@ class Column { " in Column with length " + std::to_string(length())); return operator[](n); } + /// @brief Const overload of at(). const std::string& at(int n) const { return const_cast(this)->at(n); } + /// @brief Get a CIF-unquoted string value (removes quotes/semicolons). std::string str(int n) const { return as_string(at(n)); } + /// @brief Get const pointer to the underlying Item. const Item* item() const { return item_; } + /// @brief Get mutable pointer to the underlying Item. Item* item() { return item_; } + /// @brief Get the column index within the Loop (or 0 for Pair). size_t col() const { return col_; } + /// @brief Erase this column from its item (removes from Loop or erases Pair). void erase(); private: - Item* item_; - size_t col_; // for loop this is a column index in item_->loop + Item* item_; ///< Pointer to the Item (Loop or Pair). + size_t col_; ///< Column index in the Loop, or 0 for Pair. }; -// Some values can be given either in loop or as tag-value pairs. -// The latter case is equivalent to a loop with a single row. -// We optimized for loops, and in case of tag-values we copy the values -// into the `values` vector. +/// @brief A unified view of data as either a loop (multiple rows) or pairs (single row). +/// +/// Some CIF data can be represented either way: +/// - As a loop with multiple rows (efficient for large tables) +/// - As separate tag-value pairs (equivalent to a loop with one row) +/// +/// This struct abstracts both representations to provide uniform access through Row objects. +/// It internally tracks column mappings and optimizes for the loop case. 
struct Table { + /// @brief Pointer to the Loop Item, or nullptr if data is in pairs. Item* loop_item; + /// @brief Reference to the Block containing the items. Block& bloc; + /// @brief Column position mappings: for each query column, the position in loop/pairs. + /// Negative position (-1) means the column is optional and absent. std::vector positions; + /// @brief Length of the common tag prefix (e.g., `_atom_site.` length). size_t prefix_length; + /// @brief A single row of the table, providing key-value access to columns. struct Row { + /// @brief Reference to the parent Table. Table& tab; + /// @brief Row index (-1 represents the tag row itself). int row_index; + /// @brief Unsafe access: position must be valid (>=0). std::string& value_at_unsafe(int pos); + /// @brief Safe access by position; throws if position is -1 (optional column absent). std::string& value_at(int pos) { if (pos == -1) throw std::out_of_range("Cannot access missing optional tag."); return value_at_unsafe(pos); } + /// @brief Const overload of value_at(). const std::string& value_at(int pos) const { return const_cast(this)->value_at(pos); } + /// @brief Access by column index in the table query (with bounds checking). std::string& at(int n) { return value_at(tab.positions.at(n < 0 ? n + size() : n)); } + /// @brief Const overload of at(). const std::string& at(int n) const { return const_cast(this)->at(n); } + /// @brief Unchecked access by column index. std::string& operator[](size_t n); + /// @brief Const overload. const std::string& operator[](size_t n) const { return const_cast(this)->operator[](n); } + /// @brief Pointer-based access to optional columns (nullptr if absent). std::string* ptr_at(int n) { int pos = tab.positions.at(n < 0 ? n + size() : n); return pos >= 0 ? &value_at(pos) : nullptr; } + /// @brief Const overload of ptr_at(). const std::string* ptr_at(int n) const { return const_cast(this)->ptr_at(n); } + /// @brief Check if a column is present. 
bool has(size_t n) const { return tab.positions.at(n) >= 0; } + /// @brief Check if a column is present and has a non-null value. bool has2(size_t n) const { return has(n) && !cif::is_null(operator[](n)); } + /// @brief Return the first non-null value among two columns, or a null placeholder. const std::string& one_of(size_t n1, size_t n2) const { static const std::string nul(1, '.'); if (has2(n1)) @@ -337,48 +463,72 @@ struct Table { return nul; } + /// @brief Number of columns in the table query. size_t size() const { return tab.width(); } + /// @brief Get a CIF-unquoted string value. std::string str(int n) const { return as_string(at(n)); } + /// @brief Iterator type for traversing columns in this row. using iterator = IndirectIter; + /// @brief Const iterator type. using const_iterator = IndirectIter; + /// @brief Begin iterator. iterator begin() { return iterator({this, tab.positions.begin()}); } + /// @brief End iterator. iterator end() { return iterator({this, tab.positions.end()}); } + /// @brief Const begin iterator. const_iterator begin() const { return const_iterator({this, tab.positions.begin()}); } + /// @brief Const end iterator. const_iterator end() const { return const_iterator({this, tab.positions.end()}); } }; + /// @brief Get the underlying Loop, if this table is loop-based. Loop* get_loop(); + /// @brief Check if this table is valid (has at least one column). bool ok() const { return !positions.empty(); } + /// @brief Number of columns in the table query. size_t width() const { return positions.size(); } + /// @brief Number of rows in this table. size_t length() const; + /// @brief Alias for length(). size_t size() const { return length(); } + /// @brief Check if column n is present (not -1). bool has_column(int n) const { return ok() && positions.at(n) >= 0; } + /// @brief Access the tag row (row_index == -1). Row tags() { return Row{*this, -1}; } + /// @brief Access a data row by index. 
Row operator[](int n) { return Row{*this, n}; } + /// @brief Validate and normalize a row index (supports negative indexing). + /// @param n Index to check (modified in-place). + /// @throws std::out_of_range if index is invalid. void at_check(int& n) const { if (n < 0) n += length(); if (n < 0 || static_cast(n) >= length()) throw std::out_of_range("No row with index " + std::to_string(n)); } + /// @brief Safe row access with bounds checking. Row at(int n) { at_check(n); return (*this)[n]; } + /// @brief Get the single row of a one-row table. + /// @return The first (and only) row. + /// @throws std::runtime_error if table has != 1 row. Row one() { if (length() != 1) fail("Expected one value, found " + std::to_string(length())); return (*this)[0]; } + /// @brief Get the common category prefix for this table (e.g., `_atom_site`). std::string get_prefix() const { for (int pos : positions) if (pos >= 0) @@ -387,15 +537,29 @@ struct Table { fail("The table has no columns."); } + /// @brief Find the first row where the first column matches a value. + /// @param s String value to search for (compared with as_string unquoting). + /// @return The matching row. + /// @throws std::runtime_error if no row matches. Row find_row(const std::string& s); + /// @brief Append a row with values matching the table columns. + /// @tparam T Container type with begin()/end(). + /// @param new_values Container of strings; size must equal width(). template void append_row(const T& new_values); + /// @brief Overload for initializer_list. void append_row(std::initializer_list new_values) { append_row>(new_values); } + /// @brief Remove a single row. void remove_row(int row_index) { remove_rows(row_index, row_index+1); } + /// @brief Remove rows [start, end). void remove_rows(int start, int end); + /// @brief Create a Column view for a position. Column column_at_pos(int pos); + /// @brief Get a Column view by table column index. + /// @param n Column index in the query. 
+ /// @throws std::runtime_error if the column is absent (position -1). Column column(int n) { int pos = positions.at(n); if (pos == -1) @@ -403,6 +567,7 @@ struct Table { return column_at_pos(pos); } + /// @brief Move a row to a different position. void move_row(int old_pos, int new_pos) { at_check(old_pos); at_check(new_pos); @@ -410,7 +575,10 @@ struct Table { loop->move_row(old_pos, new_pos); } - // prefix is optional + /// @brief Find a column by tag name; the tag's prefix is optional. + /// @param tag Column name to search for (case-insensitive). + /// @return Position of the matching column. + /// @throws std::runtime_error if tag not found. int find_column_position(const std::string& tag) const { std::string lctag = gemmi::to_lower(tag); Row tag_row = const_cast(this)->tags(); @@ -423,16 +591,18 @@ struct Table { fail("Column name not found: " + tag); } + /// @brief Get a Column view by tag name. Column find_column(const std::string& tag) { return column_at_pos(find_column_position(tag)); } + /// @brief Erase this table (remove all its items from the block). void erase(); - /// if it's pairs, convert it to loop + /// @brief Ensure data is in loop form (convert from pairs if needed). void ensure_loop(); - // It is not a proper input iterator, but just enough for using range-for. + /// @brief Minimal row iterator: not a proper input iterator, just enough for range-for. struct iterator { Table& parent; int index; @@ -442,37 +612,85 @@ struct Table { Row operator*() { return parent[index]; } const std::string& get(int n) const { return parent[index].at(n); } }; + /// @brief Begin iterator for rows. iterator begin() { return iterator{*this, 0}; } + /// @brief End iterator for rows. iterator end() { return iterator{*this, (int)length()}; } }; +/// @brief A CIF data block, containing tags (pairs), loops, and nested frames. +/// +/// In CIF syntax, a block starts with `data_blockname` and contains items: +/// - Tag-value pairs: `_tag value` +/// - Loops: `loop_ _tag1 _tag2 ... 
value1a value2a value1b value2b ...` +/// - Frames: `save_framename ... save_` +/// +/// Blocks are case-insensitive for tag lookup (but case is preserved in output). struct Block { + /// @brief Block name (e.g., "structure" in `data_structure`). std::string name; + /// @brief Items in this block (pairs, loops, frames, comments). std::vector items; + /// @brief Construct a named block. explicit Block(const std::string& name_); + /// @brief Construct an unnamed block. Block(); + /// @brief Swap contents with another block. void swap(Block& o) noexcept { name.swap(o.name); items.swap(o.items); } // access functions + /// @brief Find an Item that is a tag-value pair by tag name. + /// @param tag Tag to search for (case-insensitive). + /// @return Pointer to the Item, or nullptr if not found or not a Pair. const Item* find_pair_item(const std::string& tag) const; + /// @brief Find a tag-value pair (Pair). + /// @param tag Tag to search for (case-insensitive). + /// @return Pointer to the Pair, or nullptr if not found. const Pair* find_pair(const std::string& tag) const; + /// @brief Find a loop containing a tag and get a Column view. + /// @param tag Tag to search for (case-insensitive). + /// @return Column view if found and item is a Loop; empty Column otherwise. Column find_loop(const std::string& tag); + /// @brief Find an Item that is a loop containing a tag. + /// @param tag Tag to search for (case-insensitive). + /// @return Pointer to the Item, or nullptr if not found. const Item* find_loop_item(const std::string& tag) const; + /// @brief Find a single value (from Pair or first row of Loop with single column). + /// @param tag Tag to search for (case-insensitive). + /// @return Pointer to the value string, or nullptr if not found. const std::string* find_value(const std::string& tag) const; + /// @brief Find all values with a tag (Column from Loop or Pair). + /// @param tag Tag to search for (case-insensitive). + /// @return Column view (empty if not found). 
Column find_values(const std::string& tag); + /// @brief Check if a tag exists in this block. bool has_tag(const std::string& tag) const { return const_cast(this)->find_values(tag).item() != nullptr; } + /// @brief Check if a tag exists and has at least one non-null value. bool has_any_value(const std::string& tag) const { Column c = const_cast(this)->find_values(tag); return c.item() != nullptr && !std::all_of(c.begin(), c.end(), is_null); } + /// @brief Find a table of values with specified tags (required tags). + /// @param prefix Common tag prefix (e.g., `_atom_site`). + /// @param tags Tags to search for (no '?' prefix; all required). + /// @return Table view (ok() == false if not all tags found). Table find(const std::string& prefix, const std::vector& tags); + /// @brief Overload without prefix. Table find(const std::vector& tags) { return find({}, tags); } + /// @brief Find a table with optional tags (all columns attempted). + /// @param prefix Common tag prefix. + /// @param tags Tags to search for; position -1 if not found. + /// @return Table view. Table find_any(const std::string& prefix, const std::vector& tags); + /// @brief Find a table, creating it if not found. + /// @param prefix Common tag prefix. + /// @param tags Tags; all are created as a new loop if not found. + /// @return Table view (ok() == true). Table find_or_add(const std::string& prefix, std::vector tags) { Table t = find(prefix, tags); if (!t.ok()) { @@ -483,26 +701,54 @@ struct Block { } return t; } + /// @brief Find a nested frame (save block) by name. + /// @param name Frame name (case-insensitive). + /// @return Pointer to the frame Block, or nullptr if not found. Block* find_frame(std::string name); + /// @brief Convert a Loop Item to a Table view. Table item_as_table(Item& item); + /// @brief Get the index of an item containing a tag. + /// @param tag Tag to search for (case-insensitive). + /// @return Index in the items vector. + /// @throws std::runtime_error if tag not found. 
size_t get_index(const std::string& tag) const; // modifying functions + /// @brief Set or update a tag-value pair. + /// @param tag Tag name (case-insensitive for lookup, but case is updated if tag is added). + /// @param value Value to set. void set_pair(const std::string& tag, const std::string& value); + /// @brief Initialize or get a loop for specified tags. + /// @param prefix Common tag prefix. + /// @param tags Column names (prefix added automatically). + /// @return Reference to the Loop (newly created if needed). Loop& init_loop(const std::string& prefix, std::vector tags) { Table tab = find_any(prefix, tags); return setup_loop(std::move(tab), prefix, std::move(tags)); } + /// @brief Move an item to a different position. + /// @param old_pos Current position (supports negative indexing). + /// @param new_pos Target position (supports negative indexing). void move_item(int old_pos, int new_pos); // mmCIF specific functions + /// @brief Get all category prefixes in mmCIF format (ending with '.'). std::vector get_mmcif_category_names() const; + /// @brief Find a category (all tags starting with prefix). + /// @param cat Category prefix (e.g., `_atom_site`; '.' is added if missing). + /// @return Table view with all matching tags. Table find_mmcif_category(std::string cat); + /// @brief Check if an mmCIF category exists. + /// @param cat Category prefix. bool has_mmcif_category(std::string cat) const; + /// @brief Initialize an mmCIF category loop. + /// @param cat Category prefix. + /// @param tags Column names (category prefix added automatically). + /// @return Reference to the Loop. Loop& init_mmcif_loop(std::string cat, std::vector tags) { ensure_mmcif_category(cat); // modifies cat return setup_loop(find_mmcif_category(cat), cat, std::move(tags)); @@ -516,52 +762,78 @@ struct Block { }; +/// @brief A single item in a CIF block: a pair, loop, frame, comment, or erased marker. 
+/// +/// Uses a discriminated union (tagged with ItemType) to store different data types. +/// For a Pair, stores tag and value. For a Loop, stores tags and values vectors. +/// For a Frame, stores a nested Block. struct Item { + /// @brief The type of item (discriminator for the union). ItemType type; + /// @brief Source line number where this item was parsed (or -1 if not from parsing). int line_number = -1; + /// @brief Union storing the actual data (only one is valid based on type). union { + /// @brief For Pair items: [tag, value]. Pair pair; + /// @brief For Loop items: tags and values. Loop loop; + /// @brief For Frame items: nested save frame Block. Block frame; }; + /// @brief Construct an erased (empty) item. Item() : type(ItemType::Erased) {} + /// @brief Construct a Loop item. explicit Item(LoopArg) : type{ItemType::Loop}, loop{} {} + /// @brief Construct a Pair with a tag (value empty). explicit Item(std::string&& t) : type{ItemType::Pair}, pair{{std::move(t), std::string()}} {} + /// @brief Construct a Pair with tag and value. Item(const std::string& t, const std::string& v) : type{ItemType::Pair}, pair{{t, v}} {} + /// @brief Construct a Frame from a FrameArg. explicit Item(FrameArg&& frame_arg) : type{ItemType::Frame}, frame(frame_arg.str) {} + /// @brief Construct a Comment from a CommentArg. explicit Item(CommentArg&& comment) : type{ItemType::Comment}, pair{{std::string(), std::move(comment.str)}} {} + /// @brief Move constructor. Item(Item&& o) noexcept : type(o.type), line_number(o.line_number) { move_value(std::move(o)); } + /// @brief Copy constructor. Item(const Item& o) : type(o.type), line_number(o.line_number) { copy_value(o); } + /// @brief Assignment operator (move-based). Item& operator=(Item o) { set_value(std::move(o)); return *this; } + /// @brief Destructor (calls destruct on the active union member). ~Item() { destruct(); } + /// @brief Destroy the currently held value and mark this item as erased. 
+ /// Calls destruct() (as the destructor does), then sets type to Erased. void erase() { destruct(); type = ItemType::Erased; } - // case-insensitive, the prefix should be lower-case + /// @brief Check if this item's tag(s) start with a prefix (case-insensitive). + /// @param prefix Prefix to match (should be lowercase). + /// @return True if the first tag starts with prefix. bool has_prefix(const std::string& prefix) const { return (type == ItemType::Pair && gemmi::istarts_with(pair[0], prefix)) || (type == ItemType::Loop && !loop.tags.empty() && gemmi::istarts_with(loop.tags[0], prefix)); } + /// @brief Replace this item's value with another item (may change type). void set_value(Item&& o) { if (type == o.type) { switch (type) { @@ -1058,13 +1330,26 @@ inline bool Block::has_mmcif_category(std::string cat) const { return false; } +/// @brief A parsed CIF file: a collection of blocks with optional metadata. +/// +/// Represents the complete document structure after parsing a CIF file. +/// Contains one or more data blocks, each with tag-value pairs, loops, and frames. struct Document { + /// @brief Source filename or identifier (for error messages). std::string source; + /// @brief All blocks in the document (data blocks). std::vector blocks; - // implementation detail: items of the currently parsed block or frame + /// @brief Implementation detail: pointer to items of current block during parsing. + /// (Used internally by the parser; not for public use.) std::vector* items_ = nullptr; + /// @brief Add a new block to the document. + /// @param name Block name (must be unique). + /// @param pos Position to insert (-1 appends at end). + /// @return Reference to the new Block. + /// @throws std::runtime_error if name already exists. + /// @throws std::out_of_range if pos is invalid. 
Block& add_new_block(const std::string& name, int pos=-1) { if (find_block(name)) fail("Block with such name already exists: " + name); @@ -1073,28 +1358,36 @@ struct Document { return *blocks.emplace(pos < 0 ? blocks.end() : blocks.begin() + pos, name); } + /// @brief Clear all blocks and source info. void clear() noexcept { source.clear(); blocks.clear(); items_ = nullptr; } - // returns blocks[0] if the document has exactly one block (like mmCIF) + /// @brief Get the single block from a one-block document (typical for mmCIF). + /// @return Reference to blocks[0]. + /// @throws std::runtime_error if there is more than one block; std::out_of_range (from blocks.at(0)) if there are none. Block& sole_block() { if (blocks.size() > 1) fail("single data block expected, got " + std::to_string(blocks.size())); return blocks.at(0); } + /// @brief Const overload of sole_block(). const Block& sole_block() const { return const_cast(this)->sole_block(); } + /// @brief Find a block by name (case-sensitive). + /// @param name Block name. + /// @return Pointer to the Block, or nullptr if not found. Block* find_block(const std::string& name) { for (Block& b : blocks) if (b.name == name) return &b; return nullptr; } + /// @brief Const overload of find_block(). const Block* find_block(const std::string& name) const { return const_cast(this)->find_block(name); } diff --git a/include/gemmi/ddl.hpp b/include/gemmi/ddl.hpp index 47c70a21..18c2e089 100644 --- a/include/gemmi/ddl.hpp +++ b/include/gemmi/ddl.hpp @@ -1,6 +1,7 @@ // Copyright Global Phasing Ltd. -// -// Using DDL1/DDL2 dictionaries to validate CIF/mmCIF files. + +/// @file +/// @brief DDL1/DDL2 dictionary-based validation of CIF and mmCIF files. #ifndef GEMMI_DDL_HPP_ #define GEMMI_DDL_HPP_ @@ -13,49 +14,139 @@ namespace gemmi { namespace cif { -/// Represents DDL1 or DDL2 dictionary (ontology). +/// Represents a CIF dictionary (DDL1 or DDL2 ontology) for validation. 
+/// +/// A DDL (Data Definition Language) dictionary defines the structure, constraints, +/// and validation rules for CIF data. This class can load and use either: +/// - **DDL1** dictionaries (IUCr core, chemical structures) +/// - **DDL2** dictionaries (macromolecular CIF / mmCIF, used by PDB) +/// +/// After loading a dictionary with read_ddl(), you can validate CIF documents +/// against it to check for missing mandatory items, type violations, enumeration +/// violations, unique key violations, and other data integrity issues. struct GEMMI_DLL Ddl { - /// member functions use logger's callback and threshold for output + /// Logger for validation messages and warnings. + /// Member functions use this logger's callback and threshold settings for output. Logger logger; - // configuration - some of these flag must be set before read_ddl() + + // Configuration flags - set these before calling read_ddl() + + /// Report unknown tags (tags not defined in the dictionary). + /// Useful for catching typos in tag names. bool print_unknown_tags = true; - // these flags below are relevant to DDL2 only + + // The following flags apply to DDL2 dictionaries only + + /// Enable validation using regular expression patterns (DDL2 _item_type.code). bool use_regex = true; + + /// Use context-dependent validation rules (DDL2). + /// If true, validates items in specific category contexts. bool use_context = false; + + /// Use parent-child item relationships (DDL2 _item_linked). + /// If true, enforces dependencies between items. bool use_parents = false; + + /// Validate mandatory items (DDL2 _item.mandatory_code). + /// If true, reports missing items marked as mandatory. bool use_mandatory = true; + + /// Validate unique keys in loops (DDL2 _item_linked.key_id). + /// If true, checks that unique key values don't repeat. 
bool use_unique_keys = true; - // instead of _item_type.code, _pdbx_item_enumeration.value, and _item_range - // use _pdbx-prefixed equivalents (_pdbx_item_type.code, etc). + + /// Use PDBx deposition-specific validation checks. + /// If true, uses _pdbx-prefixed dictionary items instead of standard ones + /// (_pdbx_item_type.code instead of _item_type.code, etc.). + /// This mode is typically used during structure deposition to PDB. bool use_deposition_checks = false; - // variables set when reading DLL; normally, no need to change them - int major_version = 0; // currently 1 and 2 are supported - std::string dict_name; // _dictionary_name or _dictionary.title - std::string dict_version; // _dictionary_version or _dictionary.version + // Fields set when reading a dictionary; normally, no need to change them + + /// Major version of the DDL used by the loaded dictionary. + /// Currently 1 and 2 are supported; stays 0 until a dictionary is read. + int major_version = 0; + + /// Name of the dictionary (e.g., "cif_core.dic" or "mmcif_pdbx_v50"). + /// Read from _dictionary_name (DDL1) or _dictionary.title (DDL2). + std::string dict_name; + + /// Version string of the dictionary (e.g., "2.0.11"). + /// Read from _dictionary_version or _dictionary.version. + std::string dict_version; Ddl() = default; - // MSVC with dllexport attempts to export all non-deleted member functions, - // failing with Error C2280 (because of ddl_docs_) if we don't delete these: + + // Copying is deleted: MSVC with dllexport attempts to export all non-deleted member functions, + // failing with Error C2280 (because of ddl_docs_). This also suppresses the implicit move operations. Ddl(Ddl const&) = delete; Ddl& operator=(Ddl const&) = delete; - /// it moves doc to ddl_docs_ to control lifetime and prevent modifications + /// Load a DDL dictionary into this validator. + /// + /// Parses a DDL1 or DDL2 dictionary document and indexes it for validation. 
+ /// The document is moved into internal storage to manage its lifetime. + /// The dictionary version (DDL1 or DDL2) is auto-detected. 
+ /// + /// Configuration flags (e.g., use_mandatory, use_regex) should be set + /// before calling this function. + /// + /// @param doc CIF document containing a DDL dictionary (will be moved) void read_ddl(cif::Document&& doc); + /// Validate all blocks in a CIF document against this dictionary. + /// + /// Checks all blocks in the document and reports validation errors + /// via the configured logger. + /// + /// @param doc The CIF document to validate + /// @return true if validation passes, false if errors are found + /// + /// @see validate_block() to validate individual blocks bool validate_cif(const cif::Document& doc) const; + + /// Validate a single CIF block against this dictionary. + /// + /// Performs all enabled validation checks on the block: + /// - Mandatory items (if use_mandatory=true) + /// - Item types and enumeration values + /// - Regular expression patterns (if use_regex=true) + /// - Unique keys (if use_unique_keys=true) + /// - Parent-child relationships (if use_parents=true) + /// - Unknown tags (if print_unknown_tags=true) + /// + /// @param b The CIF block to validate + /// @param source Source identifier for error messages (e.g., block name or filename) + /// @return true if validation passes, false if errors are found bool validate_block(const cif::Block& b, const std::string& source) const; + /// Check audit conformance fields in a CIF document. + /// + /// Verifies that the document's audit records match dictionary expectations + /// (e.g., _audit_conform_dict_name, _audit_conform_dict_version). + /// Reports mismatches via the logger. + /// + /// @param doc The CIF document to check void check_audit_conform(const cif::Document& doc) const; + /// Access the regex patterns loaded from the dictionary. + /// + /// Returns a map of tag names to compiled regular expressions + /// that constrain the format of values for those tags (DDL2 validation). 
+ /// + /// @return Map of regex patterns indexed by tag name const std::map& regexes() const { return regexes_; } private: - // items from DDL2 _pdbx_item_linked_group[_list] + /// Internal representation of DDL2 parent-child item relationships. + /// + /// Links parent and child tags that must be coordinated in the data. + /// Used for enforcing referential integrity (use_parents=true). struct ParentLink { - std::string group; - std::vector child_tags; - std::vector parent_tags; + std::string group; ///< Name of the linked group + std::vector child_tags; ///< Child item tags + std::vector parent_tags;///< Parent item tags }; std::vector> ddl_docs_; diff --git a/include/gemmi/json.hpp b/include/gemmi/json.hpp index 4e20606f..191201d5 100644 --- a/include/gemmi/json.hpp +++ b/include/gemmi/json.hpp @@ -1,6 +1,7 @@ // Copyright 2017 Global Phasing Ltd. -// -// Reading CIF-JSON (COMCIFS) and mmJSON (PDBj) formats into cif::Document. + +/// @file +/// @brief Reading JSON formats (mmJSON and CIF-JSON) into CIF documents. #ifndef GEMMI_JSON_HPP_ #define GEMMI_JSON_HPP_ @@ -12,15 +13,39 @@ namespace gemmi { namespace cif { -// reads mmJSON file mutating the input buffer as a side effect +/// Parse mmJSON format from a buffer (with in-place mutation). +/// +/// mmJSON is the macromolecular JSON format used by PDBj for structure data. +/// This function parses JSON in-place, modifying the input buffer as a side effect +/// for efficiency. If you need to preserve the original buffer, make a copy first. +/// +/// @param buffer Pointer to buffer containing mmJSON data (will be modified) +/// @param size Number of bytes in the buffer +/// @param name Optional source name for error messages (default: "mmJSON") +/// @return Parsed CIF document GEMMI_DLL Document read_mmjson_insitu(char* buffer, std::size_t size, const std::string& name="mmJSON"); +/// Read and parse an mmJSON file from disk. +/// +/// Convenience function that loads the file into memory and parses it. 
+/// The entire file is read into a buffer for parsing. +/// +/// @param path Path to the mmJSON file (for possibly gzipped files see read_mmjson_gz() in read_cif.hpp) +/// @return Parsed CIF document inline Document read_mmjson_file(const std::string& path) { CharArray buffer = read_file_into_buffer(path); return read_mmjson_insitu(buffer.data(), buffer.size(), path); } +/// Read and parse mmJSON from an input source (file or stream). +/// +/// Template function supporting both file paths and stream inputs. +/// Reads data from the input source into a buffer, then parses. +/// +/// @tparam T An input type with is_stdin() and path() methods +/// @param input The input source to read from +/// @return Parsed CIF document template Document read_mmjson(T&& input) { std::string name = input.is_stdin() ? "stdin" : input.path(); diff --git a/include/gemmi/numb.hpp b/include/gemmi/numb.hpp index 54f09d5b..61c43f28 100644 --- a/include/gemmi/numb.hpp +++ b/include/gemmi/numb.hpp @@ -1,10 +1,7 @@ // Copyright 2017 Global Phasing Ltd. -// -// Utilities for parsing CIF numbers (the CIF spec calls them 'numb'). -// -// Numb - the numeric type in CIF - is a number with optional -// standard uncertainty (s.u.) in brackets: 1.23(8). -// Mmcif file do not use s.u. though - they define own numeric categories. + +/// @file +/// @brief Parsing CIF numeric values (numb) with optional standard uncertainty. #ifndef GEMMI_NUMB_HPP_ #define GEMMI_NUMB_HPP_ @@ -16,6 +13,28 @@ namespace gemmi { namespace cif { +/// Parse a CIF numeric value (numb), optionally including standard uncertainty. +/// +/// In CIF format, numeric values (numb) can include optional standard uncertainty +/// (s.u.) in parentheses, e.g., "1.23(8)" represents 1.23 with s.u. of 0.08. +/// The s.u. information is parsed and skipped; only the numeric value is returned. +/// +/// Note: mmCIF files typically do not use s.u. 
notation for numeric values; +/// they define their own numeric data categories instead. 
+/// +/// @param s String containing the numeric value to parse +/// @param nan Default return value if parsing fails (default: NaN) +/// @return Parsed numeric value (with s.u. removed), or nan if invalid +/// +/// @note The function accepts leading '+' signs and rejects NaN, Inf, and -Inf +/// as they are not allowed in standard CIF format. +/// +/// @par Example +/// @code +/// double a = as_number("1.234"); // returns 1.234 +/// double b = as_number("1.234(5)"); // returns 1.234 (s.u. ignored) +/// double c = as_number("invalid"); // returns NAN +/// @endcode inline double as_number(const std::string& s, double nan=NAN) { const char* start = s.data(); const char* end = s.data() + s.size(); @@ -40,15 +59,32 @@ inline double as_number(const std::string& s, double nan=NAN) { return result.ptr == end ? d : nan; } +/// Check if a string represents a valid CIF numeric value (numb). +/// +/// @param s String to check +/// @return true if the string is a valid CIF number, false otherwise inline bool is_numb(const std::string& s) { return !std::isnan(as_number(s)); } -// for use in templates (see also as_any() functions in cifdoc.hpp) +// Overloads for use in templates and generic type conversion code +// (see also as_any() functions in cifdoc.hpp) + +/// Parse CIF numeric value as a float (as_any() overload selected by the type of null). +/// +/// @param s String containing the numeric value +/// @param null Fallback value if parsing fails +/// @return Parsed float value, or null on failure inline float as_any(const std::string& s, float null) { return (float) as_number(s, null); } + +/// Parse CIF numeric value as a double (as_any() overload selected by the type of null). 
+/// +/// @param s String containing the numeric value +/// @param null Fallback value if parsing fails +/// @return Parsed double value, or null on failure inline double as_any(const std::string& s, double null) { return as_number(s, null); } diff --git a/include/gemmi/read_cif.hpp b/include/gemmi/read_cif.hpp index 8a488884..b5fc022f 100644 --- a/include/gemmi/read_cif.hpp +++ b/include/gemmi/read_cif.hpp @@ -1,6 +1,7 @@ // Copyright 2021 Global Phasing Ltd. // -// Functions for reading possibly gzipped CIF files. +/// @file +/// @brief Reading possibly gzip-compressed CIF and JSON files. #ifndef GEMMI_READ_CIF_HPP_ #define GEMMI_READ_CIF_HPP_ @@ -10,21 +11,76 @@ namespace gemmi { +/// Read a CIF file, optionally gzip-compressed, from disk. +/// +/// @param path Path to the CIF file (may end with .gz for gzip compression) +/// @param check_level Syntax checking level (0=none, 1=moderate, 2=strict) +/// @return Parsed CIF document GEMMI_DLL cif::Document read_cif_gz(const std::string& path, int check_level=1); + +/// Check CIF syntax without fully parsing the file. +/// +/// Performs a quick syntax validation pass on a CIF file (optionally gzipped). +/// +/// @param path Path to the CIF file (may end with .gz for gzip compression) +/// @param msg If non-null, receives an error message if validation fails +/// @return true if file syntax is valid, false otherwise GEMMI_DLL bool check_cif_syntax_gz(const std::string& path, std::string* msg); + +/// Read an mmJSON file (optionally gzip-compressed) from disk. +/// +/// mmJSON is the JSON format used by PDBj for macromolecular CIF data. +/// +/// @param path Path to the mmJSON file (may end with .gz for gzip compression) +/// @return Parsed CIF document GEMMI_DLL cif::Document read_mmjson_gz(const std::string& path); + +/// Read a file into a buffer, optionally decompressing if gzip-compressed. 
+/// +/// @param path Path to the file (may end with .gz for gzip compression) +/// @return Buffer containing decompressed file contents GEMMI_DLL CharArray read_into_buffer_gz(const std::string& path); + +/// Parse a CIF document from a memory buffer. +/// +/// @param data Pointer to buffer containing CIF data +/// @param size Number of bytes to read +/// @param name Name for the source (used in error messages) +/// @param check_level Syntax checking level (0=none, 1=moderate, 2=strict) +/// @return Parsed CIF document GEMMI_DLL cif::Document read_cif_from_memory(const char* data, size_t size, const char* name, int check_level=1); + +/// Read only the first block from a CIF file, optionally gzip-compressed. +/// +/// Useful for reading CIF files where only the first block is needed, +/// potentially saving memory and parsing time. +/// +/// @param path Path to the CIF file (may end with .gz for gzip compression) +/// @param limit Maximum number of bytes to read from the file +/// @return CIF document containing only the first block GEMMI_DLL cif::Document read_first_block_gz(const std::string& path, size_t limit); -// cif::read_string() was moved here from cif.hpp to speed up compilation namespace cif { +/// Read CIF data from a string. +/// +/// This function was moved here from cif.hpp to speed up compilation. +/// +/// @param data CIF-formatted string +/// @param check_level Syntax checking level (0=none, 1=moderate, 2=strict) +/// @return Parsed CIF document inline Document read_string(const std::string& data, int check_level=1) { return read_cif_from_memory(data.data(), data.size(), "string", check_level); } } // namespace cif +/// Auto-detect and read either CIF or mmJSON format from a file. +/// +/// Determines format by file extension (.json, .js for JSON; otherwise CIF). +/// Handles gzip-compressed files transparently. 
+/// +/// @param path Path to the file (may end with .gz for gzip compression) +/// @return Parsed CIF document inline cif::Document read_cif_or_mmjson_gz(const std::string& path) { if (giends_with(path, "json") || giends_with(path, "js")) return read_mmjson_gz(path); diff --git a/include/gemmi/to_cif.hpp b/include/gemmi/to_cif.hpp index 09ef32ad..6e3fba5f 100644 --- a/include/gemmi/to_cif.hpp +++ b/include/gemmi/to_cif.hpp @@ -1,6 +1,7 @@ // Copyright 2017 Global Phasing Ltd. -// Writing cif::Document or its parts to std::ostream. +/// @file +/// @brief Writing CIF documents to output streams with configurable formatting. #ifndef GEMMI_TO_CIF_HPP_ #define GEMMI_TO_CIF_HPP_ @@ -11,31 +12,44 @@ namespace gemmi { namespace cif { -/// deprecated, use cif::WriteOptions instead +/// Deprecated output formatting style. Use cif::WriteOptions instead. +/// +/// This enum is provided for backward compatibility. Each style +/// corresponds to a particular WriteOptions configuration. enum class Style { - Simple, - NoBlankLines, - PreferPairs, // write single-row loops as pairs - Pdbx, // PreferPairs + put '#' (empty comments) between categories - Indent35, // start values in pairs from 35th column - Aligned, // columns in tables are left-aligned + Simple, ///< Standard CIF format (default) + NoBlankLines, ///< Compact: no blank lines between categories + PreferPairs, ///< Write single-row loops as pairs + Pdbx, ///< PreferPairs + put '#' (empty comments) between categories + Indent35, ///< Start values in pairs from 35th column + Aligned, ///< Align columns in loops to fixed width }; +/// Options for writing CIF output. +/// +/// Controls formatting, alignment, and output style of CIF documents. struct WriteOptions { - /// write single-row loops as pairs + /// Write single-row loops as tag-value pairs instead of loop constructs. 
bool prefer_pairs = false; - /// no blank lines between categories, only between blocks + /// Omit blank lines between categories (keep only between blocks). bool compact = false; - /// put '#' (empty comments) before/after categories + /// Insert '#' (empty comment lines) before and after categories. + /// The output remains valid CIF: '#' starts a comment. bool misuse_hash = false; - /// width reserved for tags in pairs (e.g. 34 = value starts at 35th column) + /// Width reserved for tags in pairs (0=no alignment). + /// If set, values start at column (align_pairs + 1). + /// Example: align_pairs=34 starts values at the 35th column. std::uint16_t align_pairs = 0; - /// if non-zero, determines max width of each column in a loop and aligns - /// all values to this width; the width is capped with the given value + /// Cap on column width when aligning loops (0=no alignment). + /// If non-zero, determines max width of each column in a loop and aligns + /// all values to this width; the width is capped with the given value. std::uint16_t align_loops = 0; WriteOptions() {} - // implicit conversion from deprecated Style (for backward compatibility) + + /// Implicit conversion from deprecated Style enum (for backward compatibility). + /// + /// @param style Legacy Style enum value to convert WriteOptions(Style style) { switch (style) { case Style::Simple: @@ -59,6 +73,10 @@ struct WriteOptions { break; } } + + /// Return a human-readable string representation of active options. + /// + /// @return Comma-separated list of enabled options (e.g., "prefer_pairs,compact") std::string str() const { std::string s; if (prefer_pairs) @@ -77,16 +95,29 @@ struct WriteOptions { } }; -/// std::ostream with buffering. C++ streams are so slow that even primitive -/// buffering makes it significantly more efficient. +/// Buffered output stream wrapper for efficient CIF writing. 
+/// +/// Wraps std::ostream with a 4KB buffer to significantly improve I/O performance +/// when writing CIF documents. 
The buffer is automatically flushed on destruction +/// and when it fills. class BufOstream { public: + /// Construct a buffered output stream. + /// @param os_ The underlying std::ostream to write to explicit BufOstream(std::ostream& os_) : os(os_), ptr(buf) {} + + /// Destructor flushes remaining buffered data. ~BufOstream() { flush(); } + + /// Flush all buffered data to the underlying stream. void flush() { os.write(buf, ptr - buf); ptr = buf; } + + /// Write data to the buffer, flushing if necessary. + /// @param s Pointer to data to write + /// @param len Number of bytes to write void write(const char* s, size_t len) { constexpr int margin = sizeof(buf) - 512; if (ptr - buf + len > margin) { @@ -99,13 +130,24 @@ class BufOstream { std::memcpy(ptr, s, len); ptr += len; } + + /// Write a string to the buffer. + /// @param s The string to write void operator<<(const std::string& s) { write(s.c_str(), s.size()); } - // below we don't check the buffer boundary, these functions add <512 bytes + + // Note: The following functions assume writes are small (<512 bytes). + // No buffer boundary check is performed for performance. + + /// Write a single character to the buffer. + /// @param c The character to write void put(char c) { *ptr++ = c; } + + /// Write n space characters to the buffer (for padding/alignment). + /// @param n Number of spaces to write void pad(size_t n) { std::memset(ptr, ' ', n); ptr += n; @@ -117,10 +159,11 @@ class BufOstream { char* ptr; }; -// CIF files are read in binary mode. It makes difference only for text fields. -// If the text field with \r\n would be written as is in text mode on Windows -// \r would get duplicated. As a workaround, here we convert \r\n to \n. -// Hopefully \r that gets removed here is never meaningful. +// Note: CIF files are read in binary mode. Text fields with \r\n line endings +// are normalized to \n when writing to avoid duplication in Windows text mode. +/// Write a text field, normalizing \\r\\n to \\n. 
+/// @param os Buffered output stream +/// @param value The text field value to write inline void write_text_field(BufOstream& os, const std::string& value) { for (size_t pos = 0, end = 0; end != std::string::npos; pos = end + 1) { end = value.find("\r\n", pos); @@ -238,6 +281,13 @@ inline bool should_be_separated_(const Item& a, const Item& b) { return adot != bdot || a.pair[0].compare(0, adot, b.pair[0], 0, adot) != 0; } +/// Write a single CIF block to an output stream. +/// +/// Writes a CIF data block with the specified formatting options. +/// +/// @param os_ Output stream to write to +/// @param block The CIF block to write +/// @param options Formatting options (see WriteOptions documentation) inline void write_cif_block_to_stream(std::ostream& os_, const Block& block, WriteOptions options=WriteOptions()) { BufOstream os(os_); @@ -262,6 +312,14 @@ inline void write_cif_block_to_stream(std::ostream& os_, const Block& block, os.write("#\n", 2); } +/// Write a CIF document to an output stream. +/// +/// Writes a complete CIF document with all its blocks, using the specified +/// formatting options. Blocks are separated by blank lines for readability. +/// +/// @param os Output stream to write to +/// @param doc The CIF document to write +/// @param options Formatting options (see WriteOptions documentation) inline void write_cif_to_stream(std::ostream& os, const Document& doc, WriteOptions options=WriteOptions()) { bool first = true; diff --git a/include/gemmi/to_json.hpp b/include/gemmi/to_json.hpp index 3915cdd0..00bbddb9 100644 --- a/include/gemmi/to_json.hpp +++ b/include/gemmi/to_json.hpp @@ -1,6 +1,7 @@ // Copyright 2017 Global Phasing Ltd. -// Writing cif::Document or its parts as JSON (mmJSON, CIF-JSON, etc). +/// @file +/// @brief Writing CIF documents as JSON (mmJSON and CIF-JSON formats). #ifndef GEMMI_TO_JSON_HPP_ #define GEMMI_TO_JSON_HPP_ @@ -10,16 +11,54 @@ namespace gemmi { namespace cif { +/// Options for writing CIF data as JSON. 
+/// +/// Supports multiple JSON-based serialization formats for CIF data: +/// - **CIF-JSON (COMCIFS)**: Standard JSON representation of CIF documents, +/// supporting numbered values with uncertainties. +/// - **mmJSON (PDBj)**: Specialized JSON format optimized for macromolecular +/// CIF (mmCIF) data, with DDL2 category grouping and bare tags. +/// +/// Choose between preset configurations (comcifs() or mmjson()) or +/// configure individual options for custom output. struct JsonWriteOptions { - bool as_comcifs = false; // conform to the COMCIFS CIF-JSON draft - bool group_ddl2_categories = false; // for mmJSON - bool with_data_keyword = false; // for mmJSON - bool bare_tags = false; // "tag" instead of "_tag" - bool values_as_arrays = false; // "_tag": ["value"] - bool lowercase_names = true; // write case-insensitive names as lower case - int quote_numbers = 1; // 0=never (no s.u.), 1=mix, 2=always - std::string cif_dot = "null"; // how to convert '.' from CIF + /// Conform to the COMCIFS CIF-JSON draft specification. + /// If true, enables values_as_arrays, sets quote_numbers=2, and cif_dot="false". + bool as_comcifs = false; + /// Group items by DDL2 categories (for mmJSON compatibility). + /// Relevant mainly for mmJSON format. + bool group_ddl2_categories = false; + + /// Include the mmJSON "data_" keyword wrapper. + /// Used in mmJSON format output. + bool with_data_keyword = false; + + /// Use bare tag names (e.g., "tag" instead of "_tag"). + /// Used in mmJSON and other compact formats. + bool bare_tags = false; + + /// Represent all values as JSON arrays (e.g., "_tag": ["value"]). + /// Used in COMCIFS CIF-JSON and mmJSON; disabled if false. + bool values_as_arrays = false; + + /// Write case-insensitive tag names in lowercase. + /// CIF tag names are case-insensitive; this normalizes them. + bool lowercase_names = true; + + /// Control quoting of numeric values with uncertainty (s.u.). + /// - 0: Never quote numbers; s.u. 
information is lost (used for mmJSON) + /// - 1: Quote numbers only when they include s.u. (default, mixed mode) + /// - 2: Always quote numbers as strings (used for COMCIFS) + int quote_numbers = 1; + + /// How to represent the CIF '.' (not-applicable) value in JSON. + /// Common choices: "null" (JSON null), "false" (boolean false, used in COMCIFS). + std::string cif_dot = "null"; + + /// Preset options for COMCIFS CIF-JSON format. + /// + /// @return JsonWriteOptions configured for standard CIF-JSON output static JsonWriteOptions comcifs() { JsonWriteOptions opt; opt.as_comcifs = true; @@ -29,6 +68,12 @@ struct JsonWriteOptions { return opt; } + /// Preset options for mmJSON format (PDBj macromolecular JSON). + /// + /// mmJSON is used by PDBj for macromolecular structures. + /// It groups data by DDL2 categories and uses bare tag names. + /// + /// @return JsonWriteOptions configured for mmJSON output static JsonWriteOptions mmjson() { JsonWriteOptions opt; opt.group_ddl2_categories = true; @@ -41,9 +86,26 @@ struct JsonWriteOptions { } }; +/// Write a CIF document as JSON to an output stream. +/// +/// Serializes a CIF document in JSON format according to the specified options. +/// See JsonWriteOptions for details on supported formats and customization. +/// +/// @param os Output stream to write to +/// @param doc The CIF document to write +/// @param options Formatting and format selection options GEMMI_DLL void write_json_to_stream(std::ostream& os, const Document& doc, const JsonWriteOptions& options); +/// Write a CIF document as mmJSON (PDBj macromolecular JSON) to an output stream. +/// +/// Convenience function equivalent to: +/// @code +/// write_json_to_stream(os, doc, JsonWriteOptions::mmjson()); +/// @endcode +/// +/// @param os Output stream to write to +/// @param doc The CIF document to write inline void write_mmjson_to_stream(std::ostream& os, const Document& doc) { write_json_to_stream(os, doc, JsonWriteOptions::mmjson()); }