Skip to content

Commit 00ce283

Browse files
committed
fix(schema_hash): build normalized schema string using only column name (lowercase), type (SQLite affinity), pk flag
Build the normalized schema string using only: column name (lowercase), type (SQLite affinity), and pk flag. Format: tablename:colname:affinity:pk,... (ordered by table name, then column id). This makes the hash resilient to formatting, quoting, and case differences, and portable across databases.
1 parent 58ce9a4 commit 00ce283

File tree

5 files changed

+182
-50
lines changed

5 files changed

+182
-50
lines changed

src/cloudsync.c

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,12 @@
4949
#define CLOUDSYNC_INIT_NTABLES 64
5050
#define CLOUDSYNC_MIN_DB_VERSION 0
5151

52-
#define CLOUDSYNC_PAYLOAD_SKIP_SCHEMA_HASH_CHECK 1
5352
#define CLOUDSYNC_PAYLOAD_MINBUF_SIZE (512*1024)
5453
#define CLOUDSYNC_PAYLOAD_SIGNATURE 0x434C5359 /* 'C','L','S','Y' */
5554
#define CLOUDSYNC_PAYLOAD_VERSION_ORIGNAL 1
5655
#define CLOUDSYNC_PAYLOAD_VERSION_1 CLOUDSYNC_PAYLOAD_VERSION_ORIGNAL
5756
#define CLOUDSYNC_PAYLOAD_VERSION_2 2
57+
#define CLOUDSYNC_PAYLOAD_VERSION_LATEST CLOUDSYNC_PAYLOAD_VERSION_2
5858
#define CLOUDSYNC_PAYLOAD_MIN_VERSION_WITH_CHECKSUM CLOUDSYNC_PAYLOAD_VERSION_2
5959

6060
#ifndef MAX
@@ -63,10 +63,6 @@
6363

6464
#define DEBUG_DBERROR(_rc, _fn, _data) do {if (_rc != DBRES_OK) printf("Error in %s: %s\n", _fn, database_errmsg(_data));} while (0)
6565

66-
#if CLOUDSYNC_PAYLOAD_SKIP_SCHEMA_HASH_CHECK
67-
bool schema_hash_disabled = true;
68-
#endif
69-
7066
typedef enum {
7167
CLOUDSYNC_PK_INDEX_TBL = 0,
7268
CLOUDSYNC_PK_INDEX_PK = 1,
@@ -2263,15 +2259,17 @@ int cloudsync_payload_apply (cloudsync_context *data, const char *payload, int b
22632259
header.nrows = ntohl(header.nrows);
22642260
header.schema_hash = ntohll(header.schema_hash);
22652261

2266-
#if !CLOUDSYNC_PAYLOAD_SKIP_SCHEMA_HASH_CHECK
2267-
if (!data || header.schema_hash != data->schema_hash) {
2268-
if (!database_check_schema_hash(data, header.schema_hash)) {
2269-
char buffer[1024];
2270-
snprintf(buffer, sizeof(buffer), "Cannot apply the received payload because the schema hash is unknown %llu.", header.schema_hash);
2271-
return cloudsync_set_error(data, buffer, DBRES_MISUSE);
2262+
// compare schema_hash only if not disabled and if the received payload was created with the current header version
2263+
// to avoid schema hash mismatch when processed by a peer with a different extension version during software updates.
2264+
if (dbutils_settings_get_int64_value(data, CLOUDSYNC_KEY_SKIP_SCHEMA_HASH_CHECK) == 0 && header.version == CLOUDSYNC_PAYLOAD_VERSION_LATEST ) {
2265+
if (header.schema_hash != data->schema_hash) {
2266+
if (!database_check_schema_hash(data, header.schema_hash)) {
2267+
char buffer[1024];
2268+
snprintf(buffer, sizeof(buffer), "Cannot apply the received payload because the schema hash is unknown %llu.", header.schema_hash);
2269+
return cloudsync_set_error(data, buffer, DBRES_MISUSE);
2270+
}
22722271
}
22732272
}
2274-
#endif
22752273

22762274
// sanity check header
22772275
if ((header.signature != CLOUDSYNC_PAYLOAD_SIGNATURE) || (header.ncols == 0)) {

src/dbutils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#define CLOUDSYNC_KEY_SCHEMA "schema"
2626
#define CLOUDSYNC_KEY_DEBUG "debug"
2727
#define CLOUDSYNC_KEY_ALGO "algo"
28+
#define CLOUDSYNC_KEY_SKIP_SCHEMA_HASH_CHECK "skip_schema_hash_check"
2829

2930
// settings
3031
int dbutils_settings_init (cloudsync_context *data);

src/postgresql/database_postgresql.c

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1612,21 +1612,9 @@ int64_t database_schema_version (cloudsync_context *data) {
16121612
}
16131613

16141614
uint64_t database_schema_hash (cloudsync_context *data) {
1615-
char *schema = NULL;
1616-
database_select_text(data,
1617-
"SELECT string_agg(LOWER(table_name || column_name || data_type), '' ORDER BY table_name, column_name) "
1618-
"FROM information_schema.columns WHERE table_schema = COALESCE(cloudsync_schema(), current_schema())",
1619-
&schema);
1620-
1621-
if (!schema) {
1622-
elog(INFO, "database_schema_hash: schema is NULL");
1623-
return 0;
1624-
}
1625-
1626-
size_t schema_len = strlen(schema);
1627-
uint64_t hash = fnv1a_hash(schema, schema_len);
1628-
cloudsync_memory_free(schema);
1629-
return hash;
1615+
int64_t value = 0;
1616+
int rc = database_select_int(data, "SELECT hash FROM cloudsync_schema_versions ORDER BY seq DESC LIMIT 1;", &value);
1617+
return (rc == DBRES_OK) ? (uint64_t)value : 0;
16301618
}
16311619

16321620
bool database_check_schema_hash (cloudsync_context *data, uint64_t hash) {
@@ -1639,16 +1627,65 @@ bool database_check_schema_hash (cloudsync_context *data, uint64_t hash) {
16391627
}
16401628

16411629
int database_update_schema_hash (cloudsync_context *data, uint64_t *hash) {
1630+
// Build normalized schema string using only: column name (lowercase), type (SQLite affinity), pk flag
1631+
// Format: tablename:colname:affinity:pk,... (ordered by table name, then column ordinal position)
1632+
// This makes the hash resilient to formatting, quoting, case differences and portable across databases
1633+
//
1634+
// PostgreSQL type to SQLite affinity mapping:
1635+
// - integer, smallint, bigint, boolean → 'integer'
1636+
// - bytea → 'blob'
1637+
// - real, double precision → 'real'
1638+
// - numeric, decimal → 'numeric'
1639+
// - Everything else → 'text' (default)
1640+
// This includes: text, varchar, char, uuid, timestamp, timestamptz, date, time,
1641+
// interval, json, jsonb, inet, cidr, macaddr, geometric types, xml, enums,
1642+
// and any custom/extension types. Using 'text' as default ensures compatibility
1643+
// since most types serialize to text representation and SQLite stores unknown
1644+
// types as TEXT affinity.
1645+
16421646
char *schema = NULL;
16431647
int rc = database_select_text(data,
1644-
"SELECT string_agg(LOWER(table_name || column_name || data_type), '' ORDER BY table_name, column_name) "
1645-
"FROM information_schema.columns WHERE table_schema = COALESCE(cloudsync_schema(), current_schema())",
1648+
"SELECT string_agg("
1649+
" LOWER(c.table_name) || ':' || LOWER(c.column_name) || ':' || "
1650+
" CASE "
1651+
// Integer types (including boolean as 0/1)
1652+
" WHEN c.data_type IN ('integer', 'smallint', 'bigint', 'boolean') THEN 'integer' "
1653+
// Blob type
1654+
" WHEN c.data_type = 'bytea' THEN 'blob' "
1655+
// Real/float types
1656+
" WHEN c.data_type IN ('real', 'double precision') THEN 'real' "
1657+
// Numeric types (explicit precision/scale)
1658+
" WHEN c.data_type IN ('numeric', 'decimal') THEN 'numeric' "
1659+
// Default to text for everything else:
1660+
// - String types: text, character varying, character, name, uuid
1661+
// - Date/time: timestamp, date, time, interval, etc.
1662+
// - JSON: json, jsonb
1663+
// - Network: inet, cidr, macaddr
1664+
// - Geometric: point, line, box, etc.
1665+
// - Custom/extension types
1666+
" ELSE 'text' "
1667+
" END || ':' || "
1668+
" CASE WHEN kcu.column_name IS NOT NULL THEN '1' ELSE '0' END, "
1669+
" ',' ORDER BY c.table_name, c.ordinal_position"
1670+
") "
1671+
"FROM information_schema.columns c "
1672+
"JOIN cloudsync_table_settings cts ON LOWER(c.table_name) = LOWER(cts.tbl_name) "
1673+
"LEFT JOIN information_schema.table_constraints tc "
1674+
" ON tc.table_name = c.table_name "
1675+
" AND tc.table_schema = c.table_schema "
1676+
" AND tc.constraint_type = 'PRIMARY KEY' "
1677+
"LEFT JOIN information_schema.key_column_usage kcu "
1678+
" ON kcu.table_name = c.table_name "
1679+
" AND kcu.column_name = c.column_name "
1680+
" AND kcu.table_schema = c.table_schema "
1681+
" AND kcu.constraint_name = tc.constraint_name "
1682+
"WHERE c.table_schema = COALESCE(cloudsync_schema(), current_schema())",
16461683
&schema);
16471684

16481685
if (rc != DBRES_OK || !schema) return cloudsync_set_error(data, "database_update_schema_hash error 1", DBRES_ERROR);
16491686

16501687
size_t schema_len = strlen(schema);
1651-
DEBUG_ALWAYS("database_update_schema_hash len %zu", schema_len);
1688+
DEBUG_MERGE("database_update_schema_hash len %zu schema %s", schema_len, schema);
16521689
uint64_t h = fnv1a_hash(schema, schema_len);
16531690
cloudsync_memory_free(schema);
16541691
if (hash && *hash == h) return cloudsync_set_error(data, "database_update_schema_hash constraint", DBRES_CONSTRAINT);
@@ -1664,7 +1701,7 @@ int database_update_schema_hash (cloudsync_context *data, uint64_t *hash) {
16641701
if (rc == DBRES_OK) {
16651702
if (hash) *hash = h;
16661703
return rc;
1667-
}
1704+
}
16681705

16691706
return cloudsync_set_error(data, "database_update_schema_hash error 2", DBRES_ERROR);
16701707
}

src/sqlite/database_sqlite.c

Lines changed: 112 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -803,25 +803,124 @@ bool database_check_schema_hash (cloudsync_context *data, uint64_t hash) {
803803
}
804804

805805
int database_update_schema_hash (cloudsync_context *data, uint64_t *hash) {
806-
char *schemasql = "SELECT group_concat(LOWER(sql)) FROM sqlite_master "
807-
"WHERE type = 'table' AND name IN (SELECT tbl_name FROM cloudsync_table_settings ORDER BY tbl_name) "
808-
"ORDER BY name;";
809-
806+
// Build normalized schema string using only: column name (lowercase), type (SQLite affinity), pk flag
807+
// Format: tablename:colname:affinity:pk,... (ordered by table name, then column id)
808+
// This makes the hash resilient to formatting, quoting, case differences and portable across databases
809+
//
810+
// Type mapping (simplified from SQLite affinity rules for cross-database compatibility):
811+
// - Types containing 'INT' → 'integer'
812+
// - Types containing 'CHAR', 'CLOB', 'TEXT' → 'text'
813+
// - Types containing 'BLOB' or empty → 'blob'
814+
// - Types containing 'REAL', 'FLOA', 'DOUB' → 'real'
815+
// - Types exactly 'NUMERIC' or 'DECIMAL' → 'numeric'
816+
// - Everything else → 'text' (default)
817+
//
818+
// NOTE: This deviates from SQLite's actual affinity rules where unknown types get NUMERIC affinity.
819+
// We use 'text' as default to improve cross-database compatibility with PostgreSQL, where types
820+
// like TIMESTAMPTZ, UUID, JSON, etc. are commonly used and map to 'text' in the PostgreSQL
821+
// implementation. This ensures schemas with PostgreSQL-specific type names in SQLite DDL
822+
// will hash consistently across both databases.
823+
sqlite3 *db = (sqlite3 *)cloudsync_db(data);
824+
825+
char **tables = NULL;
826+
int ntables, tcols;
827+
int rc = sqlite3_get_table(db, "SELECT DISTINCT tbl_name FROM cloudsync_table_settings ORDER BY tbl_name;",
828+
&tables, &ntables, &tcols, NULL);
829+
if (rc != SQLITE_OK || ntables == 0) {
830+
if (tables) sqlite3_free_table(tables);
831+
return SQLITE_ERROR;
832+
}
833+
810834
char *schema = NULL;
811-
int rc = database_select_text(data, schemasql, &schema);
812-
if (rc != DBRES_OK) return rc;
813-
if (!schema) return DBRES_ERROR;
814-
815-
uint64_t h = fnv1a_hash(schema, strlen(schema));
835+
size_t schema_len = 0;
836+
size_t schema_cap = 0;
837+
838+
for (int t = 1; t <= ntables; t++) {
839+
const char *tbl_name = tables[t];
840+
841+
// Query pragma_table_info for this table with normalized type
842+
char *col_sql = cloudsync_memory_mprintf(
843+
"SELECT LOWER(name), "
844+
"CASE "
845+
" WHEN UPPER(type) LIKE '%%INT%%' THEN 'integer' "
846+
" WHEN UPPER(type) LIKE '%%CHAR%%' OR UPPER(type) LIKE '%%CLOB%%' OR UPPER(type) LIKE '%%TEXT%%' THEN 'text' "
847+
" WHEN UPPER(type) LIKE '%%BLOB%%' OR type = '' THEN 'blob' "
848+
" WHEN UPPER(type) LIKE '%%REAL%%' OR UPPER(type) LIKE '%%FLOA%%' OR UPPER(type) LIKE '%%DOUB%%' THEN 'real' "
849+
" WHEN UPPER(type) IN ('NUMERIC', 'DECIMAL') THEN 'numeric' "
850+
" ELSE 'text' "
851+
"END, "
852+
"CASE WHEN pk > 0 THEN '1' ELSE '0' END "
853+
"FROM pragma_table_info('%q') ORDER BY cid;", tbl_name);
854+
855+
if (!col_sql) {
856+
if (schema) cloudsync_memory_free(schema);
857+
sqlite3_free_table(tables);
858+
return SQLITE_NOMEM;
859+
}
860+
861+
char **cols = NULL;
862+
int nrows, ncols;
863+
rc = sqlite3_get_table(db, col_sql, &cols, &nrows, &ncols, NULL);
864+
cloudsync_memory_free(col_sql);
865+
866+
if (rc != SQLITE_OK || ncols != 3) {
867+
if (cols) sqlite3_free_table(cols);
868+
if (schema) cloudsync_memory_free(schema);
869+
sqlite3_free_table(tables);
870+
return SQLITE_ERROR;
871+
}
872+
873+
// Append each column: tablename:colname:affinity:pk
874+
for (int r = 1; r <= nrows; r++) {
875+
const char *col_name = cols[r * 3];
876+
const char *col_type = cols[r * 3 + 1];
877+
const char *col_pk = cols[r * 3 + 2];
878+
879+
// Calculate required size: tbl_name:col_name:col_type:col_pk,
880+
size_t entry_len = strlen(tbl_name) + 1 + strlen(col_name) + 1 + strlen(col_type) + 1 + strlen(col_pk) + 1;
881+
882+
if (schema_len + entry_len + 1 > schema_cap) {
883+
schema_cap = (schema_cap == 0) ? 1024 : schema_cap * 2;
884+
if (schema_cap < schema_len + entry_len + 1) schema_cap = schema_len + entry_len + 1;
885+
char *new_schema = cloudsync_memory_realloc(schema, schema_cap);
886+
if (!new_schema) {
887+
if (schema) cloudsync_memory_free(schema);
888+
sqlite3_free_table(cols);
889+
sqlite3_free_table(tables);
890+
return SQLITE_NOMEM;
891+
}
892+
schema = new_schema;
893+
}
894+
895+
int written = snprintf(schema + schema_len, schema_cap - schema_len, "%s:%s:%s:%s,",
896+
tbl_name, col_name, col_type, col_pk);
897+
schema_len += written;
898+
}
899+
900+
sqlite3_free_table(cols);
901+
}
902+
903+
sqlite3_free_table(tables);
904+
905+
if (!schema || schema_len == 0) return SQLITE_ERROR;
906+
907+
// Remove trailing comma
908+
if (schema_len > 0 && schema[schema_len - 1] == ',') {
909+
schema[schema_len - 1] = '\0';
910+
schema_len--;
911+
}
912+
913+
DEBUG_MERGE("database_update_schema_hash len %zu schema %s", schema_len, schema);
914+
sqlite3_uint64 h = fnv1a_hash(schema, schema_len);
816915
cloudsync_memory_free(schema);
817916
if (hash && *hash == h) return SQLITE_CONSTRAINT;
818-
917+
819918
char sql[1024];
820919
snprintf(sql, sizeof(sql), "INSERT INTO cloudsync_schema_versions (hash, seq) "
821-
"VALUES (%" PRIu64 ", COALESCE((SELECT MAX(seq) FROM cloudsync_schema_versions), 0) + 1) "
920+
"VALUES (%lld, COALESCE((SELECT MAX(seq) FROM cloudsync_schema_versions), 0) + 1) "
822921
"ON CONFLICT(hash) DO UPDATE SET "
823-
"seq = (SELECT COALESCE(MAX(seq), 0) + 1 FROM cloudsync_schema_versions);", h);
824-
rc = database_exec(data, sql);
922+
" seq = (SELECT COALESCE(MAX(seq), 0) + 1 FROM cloudsync_schema_versions);", (sqlite3_int64)h);
923+
rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
825924
if (rc == SQLITE_OK && hash) *hash = h;
826925
return rc;
827926
}

test/unit.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
extern char *OUT_OF_MEMORY_BUFFER;
3131
extern bool force_vtab_filter_abort;
3232
extern bool force_uncompressed_blob;
33-
extern bool schema_hash_disabled;
3433

3534
void dbvm_reset (dbvm_t *stmt);
3635
int dbvm_count (dbvm_t *stmt, const char *value, size_t len, int type);
@@ -4511,11 +4510,9 @@ bool do_test_merge_alter_schema_1 (int nclients, bool print_result, bool cleanup
45114510
do_insert(db[0], TEST_PRIKEYS, NINSERT, print_result);
45124511

45134512
// merge changes from db0 to db1, it should fail because db0 has a newer schema hash
4514-
if (!schema_hash_disabled) {
4515-
// perform the test ONLY if schema hash is enabled
4516-
if (do_merge_using_payload(db[0], db[1], only_locals, false) == true) {
4517-
return false;
4518-
}
4513+
// perform the test ONLY if schema hash is enabled
4514+
if (do_merge_using_payload(db[0], db[1], only_locals, false) == true) {
4515+
return false;
45194516
}
45204517

45214518
// augment TEST_NOCOLS also on db1

0 commit comments

Comments (0)