From 49f3dbba733633fa3bba30535e0c6f3676534528 Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 12:16:15 -0700 Subject: [PATCH 01/11] fix(dashboard): address post-approval review feedback on #40528 (#40685) Co-authored-by: Claude Opus 4.8 --- .../components/PropertiesModal/index.tsx | 4 +++- .../components/PropertiesModal/utils.test.ts | 22 +++++++++++++++++-- .../components/PropertiesModal/utils.ts | 4 +++- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/superset-frontend/src/dashboard/components/PropertiesModal/index.tsx b/superset-frontend/src/dashboard/components/PropertiesModal/index.tsx index 90b89aa1c096..fe1fee734df3 100644 --- a/superset-frontend/src/dashboard/components/PropertiesModal/index.tsx +++ b/superset-frontend/src/dashboard/components/PropertiesModal/index.tsx @@ -254,7 +254,9 @@ const PropertiesModal = ({ selectedOwners: OwnerOption[], options: OwnerOption[], ) => { - setOwners(parseSelectedOwners(selectedOwners, options, owners)); + // Use the functional updater so the parse always reads the latest owners + // state rather than the value captured in this render's closure. + setOwners(prev => parseSelectedOwners(selectedOwners, options, prev)); }; const handleOnChangeRoles = (roles: { value: number; label: string }[]) => { diff --git a/superset-frontend/src/dashboard/components/PropertiesModal/utils.test.ts b/superset-frontend/src/dashboard/components/PropertiesModal/utils.test.ts index 9f16dc07ab12..40e2fba262a5 100644 --- a/superset-frontend/src/dashboard/components/PropertiesModal/utils.test.ts +++ b/superset-frontend/src/dashboard/components/PropertiesModal/utils.test.ts @@ -59,14 +59,32 @@ test('builds a new owner from the option text label when not already in state', ]); }); +test('leaves email undefined when option is cached but carries no email', () => { + // The option exists in the cache (so `opt` is defined) but has no + // OWNER_EMAIL_PROP, exercising the `?? undefined` branch directly. + const options = [ + { + value: 6, + label: 'Dave Doe', + [OWNER_TEXT_LABEL_PROP]: 'Dave Doe', + // intentionally no OWNER_EMAIL_PROP + }, + ]; + + expect( + parseSelectedOwners([{ value: 6, label: 'Dave Doe' }], options, []), + ).toEqual([{ id: 6, full_name: 'Dave Doe', email: undefined }]); +}); + test('falls back to a string label when the option has no text label', () => { + // No option in the cache => email stays undefined (not an empty string). expect( parseSelectedOwners([{ value: 4, label: 'Plain Name' }], [], []), - ).toEqual([{ id: 4, full_name: 'Plain Name', email: '' }]); + ).toEqual([{ id: 4, full_name: 'Plain Name', email: undefined }]); }); test('yields an empty name for a non-string label with no text label', () => { expect(parseSelectedOwners([{ value: 5, label: 1 }], [], [])).toEqual([ - { id: 5, full_name: '', email: '' }, + { id: 5, full_name: '', email: undefined }, ]); }); diff --git a/superset-frontend/src/dashboard/components/PropertiesModal/utils.ts b/superset-frontend/src/dashboard/components/PropertiesModal/utils.ts index d94f3c4568e7..706b04d48862 100644 --- a/superset-frontend/src/dashboard/components/PropertiesModal/utils.ts +++ b/superset-frontend/src/dashboard/components/PropertiesModal/utils.ts @@ -73,7 +73,9 @@ export function parseSelectedOwners( // `label` is a React element unless the option came from a plain-text // source, so only use it as a name when it is actually a string. (typeof o.label === 'string' ? o.label : ''), - email: opt?.[OWNER_EMAIL_PROP] || '', + // Leave email undefined when the option carries no email, rather than + // fabricating an empty string — keeps the optional `email?` type honest. + email: opt?.[OWNER_EMAIL_PROP] ?? undefined, }; }); } From 8508af3201c4eafadc77f4368ceefb53ef90b60f Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 12:36:32 -0700 Subject: [PATCH 02/11] chore(key_value): prune expired entries from the key-value store (#40663) Co-authored-by: Claude Code Co-authored-by: Ville Brofeldt --- SECURITY.md | 2 +- superset/config.py | 7 + superset/daos/key_value.py | 32 +++++ superset/key_value/commands/__init__.py | 16 +++ superset/key_value/commands/prune.py | 143 +++++++++++++++++++ superset/tasks/scheduler.py | 16 +++ tests/unit_tests/key_value/prune_test.py | 168 +++++++++++++++++++++++ 7 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 superset/key_value/commands/__init__.py create mode 100644 superset/key_value/commands/prune.py create mode 100644 tests/unit_tests/key_value/prune_test.py diff --git a/SECURITY.md b/SECURITY.md index e8ecfe56aec2..a038c9a8d882 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -109,7 +109,7 @@ If yes, it is in scope. If no, it is out of scope. The lists below apply that te - Any action an Admin role can perform through documented configuration, API, or UI. The Admin role is a trusted operational principal by policy. Per MITRE CNA Operational Rules 4.1, a qualifying vulnerability must violate a security policy; behavior within a documented trust boundary does not. - Deployment or operator decisions: the values of secrets and tokens, whether internal networks are reachable from the server, which database connectors or cache backends are enabled, which feature flags are set, where notifications are delivered, and which third-party plugins are loaded. - Compromise, modification, or malicious control of trusted backend infrastructure. Apache Superset assumes the integrity of its metastore, cache backends (for example Redis or Memcached), message brokers, secret stores, and other operator-managed infrastructure. Findings that require an attacker to read from, write to, or otherwise tamper with these systems, including injecting malicious state, serialized objects, cache entries, task metadata, configuration, or database records, are post-compromise scenarios and do not constitute vulnerabilities in Apache Superset itself. A finding remains in scope only if an unprivileged user can cause such modification through a vulnerability in Apache Superset. -- Code paths whose intended purpose is example data, demos, fixtures, local development, or documentation, rather than the production runtime. +- The continued presence of expired key-value or metastore-cache entries that have not yet been deleted from the metadata database. Such entries are excluded from reads once expired, are purged opportunistically on write, and are removed in bulk by the scheduled `prune_key_value` maintenance task; their lingering until purged is an eventual-cleanup property, not a security boundary, and does not constitute a vulnerability. - How a downstream application (spreadsheet program, email client, browser handling user-downloaded files) interprets output Apache Superset produced for it. - Findings without a reproducible proof of concept against a supported release. The burden of demonstrating exploitability rests with the reporter; findings closed for lack of a proof of concept may be refiled if one is later produced. - Brute force, rate limiting, denial of service, or resource exhaustion that does not bypass a documented control. diff --git a/superset/config.py b/superset/config.py index cd64e54bcc88..e2f48711a775 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1422,6 +1422,13 @@ class CeleryConfig: # pylint: disable=too-few-public-methods # "schedule": crontab(minute=0, hour=0), # "kwargs": {"retention_period_days": 90, "max_rows_per_run": 10000}, # }, + # Uncomment to enable pruning of expired entries from the key-value store + # (for example, rows left behind by the metastore cache backend) + # "prune_key_value": { + # "task": "prune_key_value", + # "schedule": crontab(minute=0, hour=0), + # "kwargs": {"max_rows_per_run": 10000}, + # }, # Uncomment to enable Slack channel cache warm-up # "slack.cache_channels": { # "task": "slack.cache_channels", diff --git a/superset/daos/key_value.py b/superset/daos/key_value.py index f15293abcab8..e3e63d02cf2c 100644 --- a/superset/daos/key_value.py +++ b/superset/daos/key_value.py @@ -69,6 +69,14 @@ def delete_entry(resource: KeyValueResource, key: Key) -> bool: @staticmethod def delete_expired_entries(resource: KeyValueResource) -> None: + """ + Bulk-delete every expired entry for the given resource. + + Callers that pass an explicit ``key`` to ``create_entry`` along with an + ``expires_on`` must invoke this once before their ``create_entry`` call(s); + see :meth:`create_entry` for the rationale. Expired entries are also removed + in bulk across all resources by the scheduled ``prune_key_value`` task. + """ ( db.session.query(KeyValueEntry) .filter( @@ -88,6 +96,20 @@ def create_entry( key: Key | None = None, expires_on: datetime | None = None, ) -> KeyValueEntry: + """ + Create a new entry in the key-value store. + + .. note:: + This method intentionally does **not** purge expired entries. Callers + that pass an explicit ``key`` along with an ``expires_on`` must call + :meth:`delete_expired_entries` for the same ``resource`` once before + creating their entries. Purging is deliberately hoisted out of this + method so that a transaction creating many entries pays the cleanup cost + only once up front rather than on every insert. An expired entry still + occupying the same ``key`` would otherwise cause this insert to fail the + unique constraint. (Entries created without an explicit ``key`` get an + auto-generated id and cannot collide, so they need no prior purge.) + """ try: encoded_value = codec.encode(value) except Exception as ex: @@ -118,6 +140,12 @@ def upsert_entry( key: Key, expires_on: datetime | None = None, ) -> KeyValueEntry: + """ + Update an existing entry or create it if it does not exist. + + Because this overwrites any existing entry for the key (expired or not), it + does not require a prior :meth:`delete_expired_entries` call. + """ if entry := KeyValueDAO.get_entry(resource, key): entry.value = codec.encode(value) entry.expires_on = expires_on @@ -135,6 +163,10 @@ def update_entry( key: Key, expires_on: datetime | None = None, ) -> KeyValueEntry: + """ + Update an existing entry, raising ``KeyValueUpdateFailedError`` if no entry + exists for the given key. + """ if entry := KeyValueDAO.get_entry(resource, key): entry.value = codec.encode(value) entry.expires_on = expires_on diff --git a/superset/key_value/commands/__init__.py b/superset/key_value/commands/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/superset/key_value/commands/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/superset/key_value/commands/prune.py b/superset/key_value/commands/prune.py new file mode 100644 index 000000000000..6bcf93bd6cb7 --- /dev/null +++ b/superset/key_value/commands/prune.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import logging +import time +from datetime import datetime + +import sqlalchemy as sa + +from superset import db +from superset.commands.base import BaseCommand +from superset.key_value.models import KeyValueEntry + +logger = logging.getLogger(__name__) + + +# pylint: disable=consider-using-transaction +class KeyValuePruneCommand(BaseCommand): + """ + Command to prune the key-value store by deleting entries whose expiry has + already passed. + + The metastore-backed key-value store keeps an `expires_on` timestamp for + entries written with a timeout (for example, the metastore cache backend). + Unlike cache backends that evict on read, the metastore does not remove rows + on its own, so expired entries remain in the table until something deletes + them. This command performs that housekeeping by deleting every entry whose + `expires_on` is in the past, freeing up space in the table. + + Attributes: + max_rows_per_run (int | None): The maximum number of rows to delete in a + single run. If provided and greater than + zero, rows are selected deterministically + from the oldest first by id up to this + limit in this execution. + """ + + def __init__(self, max_rows_per_run: int | None = None): + """ + :param max_rows_per_run: The maximum number of rows to delete in a single run. + If provided and greater than zero, rows are selected deterministically from + the oldest first by id up to this limit in this execution. + """ + self.max_rows_per_run = max_rows_per_run + + def run(self) -> None: + """ + Executes the prune command + """ + batch_size = 999 # SQLite has a IN clause limit of 999 + total_deleted = 0 + start_time = time.time() + + # Capture a single cutoff timestamp and reuse it for both the selection + # and the delete. Reusing the same value (rather than calling + # datetime.now() again at delete time) keeps the delete predicate + # consistent with the selection and re-checks expiry on delete, so an + # entry refreshed (expires_on moved into the future) between selection + # and deletion is not removed. + cutoff = datetime.now() + + # Select all IDs whose expiry has already passed. Entries without an + # expiry (expires_on IS NULL) never expire and are left untouched. The + # `<=` comparison matches KeyValueEntry.is_expired() and + # KeyValueDAO.delete_expired_entries() so all expiry checks agree. + select_stmt = sa.select(KeyValueEntry.id).where( + KeyValueEntry.expires_on.isnot(None), + KeyValueEntry.expires_on <= cutoff, + ) + + # Optionally limited by max_rows_per_run + # order by oldest first for deterministic deletion + if self.max_rows_per_run is not None and self.max_rows_per_run > 0: + select_stmt = select_stmt.order_by(KeyValueEntry.id.asc()).limit( + self.max_rows_per_run + ) + + ids_to_delete = db.session.execute(select_stmt).scalars().all() + + total_rows = len(ids_to_delete) + + logger.info("Total rows to be deleted: %s", f"{total_rows:,}") + + next_logging_threshold = 1 + + # Iterate over the IDs in batches + for i in range(0, total_rows, batch_size): + batch_ids = ids_to_delete[i : i + batch_size] + + # Delete the selected batch using IN clause. The expiry predicate is + # re-applied here (against the same cutoff captured before selection) + # so an entry refreshed between selection and deletion is left intact. + result = db.session.execute( + sa.delete(KeyValueEntry).where( + KeyValueEntry.id.in_(batch_ids), + KeyValueEntry.expires_on.isnot(None), + KeyValueEntry.expires_on <= cutoff, + ) + ) + + # Update the total number of deleted records + total_deleted += result.rowcount + + # Explicitly commit the transaction so that, if an error occurs, the + # records deleted so far are committed + db.session.commit() + + # Log the number of deleted records every 1% increase in progress + percentage_complete = (total_deleted / total_rows) * 100 + if percentage_complete >= next_logging_threshold: + logger.info( + "Deleted %s expired rows from the key-value store (%d%% complete)", + f"{total_deleted:,}", + percentage_complete, + ) + next_logging_threshold += 1 + + elapsed_time = time.time() - start_time + minutes, seconds = divmod(elapsed_time, 60) + formatted_time = f"{int(minutes):02}:{int(seconds):02}" + logger.info( + "Pruning complete: %s expired rows deleted in %s", + f"{total_deleted:,}", + formatted_time, + ) + + def validate(self) -> None: + pass diff --git a/superset/tasks/scheduler.py b/superset/tasks/scheduler.py index 5b38048adf17..a37c049a06ac 100644 --- a/superset/tasks/scheduler.py +++ b/superset/tasks/scheduler.py @@ -38,6 +38,7 @@ from superset.daos.report import ReportScheduleDAO from superset.daos.tasks import TaskDAO from superset.extensions import celery_app +from superset.key_value.commands.prune import KeyValuePruneCommand from superset.stats_logger import BaseStatsLogger from superset.tasks.ambient_context import use_context from superset.tasks.constants import ABORT_STATES, TERMINAL_STATES @@ -236,6 +237,21 @@ def prune_tasks( logger.exception("An error occurred while pruning async tasks: %s", ex) +@celery_app.task(name="prune_key_value", bind=True) +def prune_key_value( + self: Task, + max_rows_per_run: int | None = None, + **kwargs: Any, +) -> None: + stats_logger: BaseStatsLogger = current_app.config["STATS_LOGGER"] + stats_logger.incr("prune_key_value") + + try: + KeyValuePruneCommand(max_rows_per_run).run() + except CommandException as ex: + logger.exception("An error occurred while pruning the key-value store: %s", ex) + + @celery_app.task(name="tasks.execute", bind=True) def execute_task( # noqa: C901 self: Any, # Celery task instance diff --git a/tests/unit_tests/key_value/prune_test.py b/tests/unit_tests/key_value/prune_test.py new file mode 100644 index 000000000000..f69b56bf0514 --- /dev/null +++ b/tests/unit_tests/key_value/prune_test.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-outside-toplevel, unused-argument +from __future__ import annotations + +from collections.abc import Generator +from datetime import datetime, timedelta +from typing import TYPE_CHECKING +from unittest.mock import patch + +import pytest +from flask.ctx import AppContext + +from superset.extensions import db +from superset.key_value.types import JsonKeyValueCodec, KeyValueResource + +if TYPE_CHECKING: + from superset.key_value.models import KeyValueEntry + +RESOURCE = KeyValueResource.METASTORE_CACHE +CODEC = JsonKeyValueCodec() +VALUE = {"foo": "bar"} + + +@pytest.fixture +def clean_key_value_store(app_context: AppContext) -> Generator[None, None, None]: + # The prune command commits, so a plain session rollback cannot undo its + # effects. Explicitly empty the table around each test for isolation. + from superset.key_value.models import KeyValueEntry + + db.session.query(KeyValueEntry).delete() + db.session.commit() # pylint: disable=consider-using-transaction + yield + db.session.query(KeyValueEntry).delete() + db.session.commit() # pylint: disable=consider-using-transaction + + +def _add_entry(expires_on: datetime | None) -> KeyValueEntry: + from superset.key_value.models import KeyValueEntry + + entry = KeyValueEntry( + resource=RESOURCE, + value=CODEC.encode(VALUE), + expires_on=expires_on, + ) + db.session.add(entry) + db.session.flush() + return entry + + +def test_prune_deletes_expired_entries( + clean_key_value_store: None, +) -> None: + from superset.key_value.commands.prune import KeyValuePruneCommand + from superset.key_value.models import KeyValueEntry + + expired_a = _add_entry(datetime.now() - timedelta(days=1)) + expired_b = _add_entry(datetime.now() - timedelta(seconds=5)) + expired_ids = {expired_a.id, expired_b.id} + + KeyValuePruneCommand().run() + + remaining_ids = {row.id for row in db.session.query(KeyValueEntry.id).all()} + assert expired_ids.isdisjoint(remaining_ids) + assert db.session.query(KeyValueEntry).count() == 0 + + +def test_prune_retains_non_expired_and_no_expiry_entries( + clean_key_value_store: None, +) -> None: + from superset.key_value.commands.prune import KeyValuePruneCommand + from superset.key_value.models import KeyValueEntry + + future = _add_entry(datetime.now() + timedelta(days=1)) + no_expiry = _add_entry(None) + expired = _add_entry(datetime.now() - timedelta(days=1)) + + KeyValuePruneCommand().run() + + remaining_ids = {row.id for row in db.session.query(KeyValueEntry.id).all()} + assert future.id in remaining_ids + assert no_expiry.id in remaining_ids + assert expired.id not in remaining_ids + assert db.session.query(KeyValueEntry).count() == 2 + + +def test_prune_empty_store_is_noop( + clean_key_value_store: None, +) -> None: + from superset.key_value.commands.prune import KeyValuePruneCommand + from superset.key_value.models import KeyValueEntry + + assert db.session.query(KeyValueEntry).count() == 0 + + # Should not raise and should leave the store empty + KeyValuePruneCommand().run() + + assert db.session.query(KeyValueEntry).count() == 0 + + +def test_prune_skips_entry_refreshed_after_selection( + clean_key_value_store: None, +) -> None: + from superset.key_value.commands.prune import KeyValuePruneCommand + from superset.key_value.models import KeyValueEntry + + # Simulate the TOCTOU window: an entry is expired when prune selects it but + # is refreshed (expires_on moved into the future) before the delete runs. + # The delete re-checks expiry against the cutoff captured at selection time, + # so the refreshed entry must survive. We inject the refresh right after the + # command captures its cutoff by patching datetime.now used in the command. + expired = _add_entry(datetime.now() - timedelta(days=1)) + db.session.commit() # pylint: disable=consider-using-transaction + + import superset.key_value.commands.prune as prune_module + + real_now = datetime.now + state = {"refreshed": False} + + class _PatchedDatetime: + @staticmethod + def now() -> datetime: + # The command calls now() once to capture the cutoff. After that + # call, refresh the entry so the subsequent delete sees a future + # expires_on but still deletes against the original cutoff. + current = real_now() + if not state["refreshed"]: + state["refreshed"] = True + db.session.query(KeyValueEntry).filter( + KeyValueEntry.id == expired.id + ).update({KeyValueEntry.expires_on: real_now() + timedelta(days=1)}) + db.session.commit() # pylint: disable=consider-using-transaction + return current + + with patch.object(prune_module, "datetime", _PatchedDatetime): + KeyValuePruneCommand().run() + + remaining_ids = {row.id for row in db.session.query(KeyValueEntry.id).all()} + assert expired.id in remaining_ids + + +def test_prune_respects_max_rows_per_run( + clean_key_value_store: None, +) -> None: + from superset.key_value.commands.prune import KeyValuePruneCommand + from superset.key_value.models import KeyValueEntry + + for _ in range(3): + _add_entry(datetime.now() - timedelta(days=1)) + + KeyValuePruneCommand(max_rows_per_run=2).run() + + # Only the two oldest expired rows are removed in a single capped run + assert db.session.query(KeyValueEntry).count() == 1 From a9df2c7e5eac11e224785bba91cb9c0745c4ef09 Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 13:39:23 -0700 Subject: [PATCH 03/11] fix(mcp): address post-approval review feedback on auth logging PR #40646 (#40684) Co-authored-by: Claude Opus 4.8 --- superset/mcp_service/chart/validation/pipeline.py | 6 ++++-- superset/mcp_service/jwt_verifier.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/superset/mcp_service/chart/validation/pipeline.py b/superset/mcp_service/chart/validation/pipeline.py index 1f4e10472af7..2f83ef705df3 100644 --- a/superset/mcp_service/chart/validation/pipeline.py +++ b/superset/mcp_service/chart/validation/pipeline.py @@ -149,8 +149,10 @@ def validate_request_with_warnings( ChartErrorBuilder, ) - # SECURITY FIX: Sanitize validation error to prevent information disclosure - sanitized_reason = _sanitize_validation_error(e) + # SECURITY FIX: Sanitize validation error to prevent information disclosure. + # logger.exception above already logged the original error; pass + # log_original=False so the sanitizer does not log it a second time at INFO. + sanitized_reason = _sanitize_validation_error(e, log_original=False) error = ChartErrorBuilder.build_error( error_type="validation_system_error", template_key="validation_error", diff --git a/superset/mcp_service/jwt_verifier.py b/superset/mcp_service/jwt_verifier.py index 3402145bd25d..0ad46058965e 100644 --- a/superset/mcp_service/jwt_verifier.py +++ b/superset/mcp_service/jwt_verifier.py @@ -325,7 +325,7 @@ def _auth_error_handler(conn: HTTPConnection, exc: AuthenticationError) -> Respo logger.warning( "JWT authentication failed: %s (source_ip=%s, path=%s, user_agent=%s)", - exc, + _sanitize_for_log(exc), client_host, request_path, user_agent, @@ -564,7 +564,7 @@ async def load_access_token(self, token: str) -> AccessToken | None: # noqa: C9 "JWT authentication succeeded: client_id='%s', scopes=%s, " "auth_method='bearer_jwt'", _sanitize_for_log(client_id), - _sanitize_for_log(sorted(scopes)), + _sanitize_for_log(" ".join(sorted(scopes)) or "(none)"), ) return AccessToken( token=token, From 3e589436fa63ab2e4bb02256711af9d45bee9a0b Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 13:40:10 -0700 Subject: [PATCH 04/11] fix(reports): sanitize error text in email notification template (#40641) Co-authored-by: Claude Code --- superset/reports/notifications/email.py | 8 ++++- .../reports/notifications/email_tests.py | 35 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/superset/reports/notifications/email.py b/superset/reports/notifications/email.py index d0f51c9c4fbc..4c172afd791d 100644 --- a/superset/reports/notifications/email.py +++ b/superset/reports/notifications/email.py @@ -100,13 +100,19 @@ def _get_smtp_domain() -> str: def _error_template(self, text: str) -> str: call_to_action = self._get_call_to_action() + # The error text is derived from exception messages that can embed + # data-controlled content (e.g. crafted table/column names in a DB + # error). Strip all HTML before interpolating it into the email body, + # matching the sanitization applied to the normal content path. + # pylint: disable=no-member + safe_text = nh3.clean(text, tags=set(), attributes={}) return __( """

Your report/alert was unable to be generated because of the following error: %(text)s

Please check your dashboard/chart for errors.

%(call_to_action)s

""", # noqa: E501 - text=text, + text=safe_text, url=self._content.url, call_to_action=call_to_action, ) diff --git a/tests/unit_tests/reports/notifications/email_tests.py b/tests/unit_tests/reports/notifications/email_tests.py index e2e0eeb29907..54fb2d4b30ec 100644 --- a/tests/unit_tests/reports/notifications/email_tests.py +++ b/tests/unit_tests/reports/notifications/email_tests.py @@ -64,6 +64,41 @@ def test_render_description_with_html() -> None: assert '<a href="http://www.example.com">333</a>' in email_body +def test_error_template_sanitizes_html() -> None: + # `superset.models.helpers`, a dependency of following imports, + # requires app context + from superset.reports.models import ReportRecipients, ReportRecipientType + from superset.reports.notifications.base import NotificationContent + from superset.reports.notifications.email import EmailNotification + + content = NotificationContent( + name="test alert", + text='DB error near ""', + header_data={ + "notification_format": "PNG", + "notification_type": "Alert", + "owners": [1], + "notification_source": None, + "chart_id": None, + "dashboard_id": None, + "slack_channels": None, + "execution_id": "test-execution-id", + }, + ) + email_body = ( + EmailNotification( + recipient=ReportRecipients(type=ReportRecipientType.EMAIL), content=content + ) + ._get_content() + .body + ) + + # Injected markup in the error text must be stripped, not rendered. + assert "" not in email_body + assert "onerror=alert(1)" not in email_body + + @with_feature_flags(DATE_FORMAT_IN_EMAIL_SUBJECT=True) def test_email_subject_with_datetime() -> None: # `superset.models.helpers`, a dependency of following imports, From 6eaee211aadac7ddb5fbc1d53720e6171b380c27 Mon Sep 17 00:00:00 2001 From: Shaitan <105581038+sha174n@users.noreply.github.com> Date: Tue, 2 Jun 2026 21:50:27 +0100 Subject: [PATCH 05/11] fix(sqllab): require dataset match for raw query access (#40409) Co-authored-by: Claude Opus 4.7 --- superset/extensions/metadb.py | 11 +- superset/models/sql_lab.py | 10 +- superset/security/manager.py | 128 +++++++++-- superset/sql/parse.py | 22 ++ superset/sqllab/validators.py | 6 +- tests/integration_tests/security_tests.py | 250 ++++++++++++++++++++++ tests/integration_tests/sqllab_tests.py | 34 ++- tests/unit_tests/sql/parse_tests.py | 24 +++ 8 files changed, 440 insertions(+), 45 deletions(-) diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 375b4b87ce02..e30b7aab7013 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -305,9 +305,16 @@ def _set_columns(self) -> None: raise ProgrammingError(f"Database not found: {self.database}") self._allow_dml = database.allow_dml - # verify permissions + # verify permissions. MetaDB returns row data through a regular + # query path, so the strict scoping used by SQL Lab applies here + # too: the referenced table must resolve to a Superset dataset the + # user has datasource_access on (or owns). table = Table(self.table, self.schema, self.catalog) - security_manager.raise_for_access(database=database, table=table) + security_manager.raise_for_access( + database=database, + table=table, + force_dataset_match=True, + ) # store this callable for later whenever we need an engine self.engine_context = partial( diff --git a/superset/models/sql_lab.py b/superset/models/sql_lab.py index a8454ebc88d9..dda24cc89429 100644 --- a/superset/models/sql_lab.py +++ b/superset/models/sql_lab.py @@ -306,10 +306,18 @@ def raise_for_access(self) -> None: """ Raise an exception if the user cannot access the resource. + Re-validation of a SQL Lab query uses the same strict scoping as the + initial execute path (``force_dataset_match=True``) so that fetching + results, exporting CSV, and streaming-exporting all enforce the same + per-table dataset-match requirement. ``raise_for_access`` parses + ``executed_sql`` (the Jinja-rendered query that actually ran) when + set, keeping the table set aligned with execution even though the + original ``template_params`` are not persisted on the query record. + :raises SupersetSecurityException: If the user cannot access the resource """ - security_manager.raise_for_access(query=self) + security_manager.raise_for_access(query=self, force_dataset_match=True) @property def db_engine_spec( diff --git a/superset/security/manager.py b/superset/security/manager.py index a62daa528d31..737b172cebd6 100644 --- a/superset/security/manager.py +++ b/superset/security/manager.py @@ -21,6 +21,7 @@ import re import time from collections import defaultdict +from types import SimpleNamespace from typing import Any, Callable, cast, NamedTuple, Optional, TYPE_CHECKING from flask import current_app, Flask, g, Request @@ -2883,6 +2884,7 @@ def raise_for_access( # noqa: C901 catalog: Optional[str] = None, schema: Optional[str] = None, template_params: Optional[dict[str, Any]] = None, + force_dataset_match: bool = False, ) -> None: """ Raise an exception if the user cannot access the resource. @@ -2897,6 +2899,15 @@ def raise_for_access( # noqa: C901 :param catalog: Optional catalog name :param schema: Optional schema name :param template_params: Optional template parameters for Jinja templating + :param force_dataset_match: When True, the historical + ``catalog_access`` / ``schema_access`` fallthroughs in the + database+table / query branch are bypassed and every referenced + table must resolve to a registered Superset dataset the user + has ``datasource_access`` on (or owns). Call sites that execute + or return raw row data (SQL Lab, MetaDB) set this to True. + The default (False) preserves the historical semantics for + chart-data, dataset CRUD, ``/table_metadata/``, and + ``/select_star/``. :raises SupersetSecurityException: If the user cannot access the resource """ # pylint: disable=import-outside-toplevel @@ -2941,39 +2952,118 @@ def raise_for_access( # noqa: C901 # inspector to read it. from superset.models.sql_lab import Query + # Prefer the rendered ``executed_sql`` when it is set so + # re-validation (results fetch, CSV / streaming export) + # authorises against the SQL that actually ran, not a + # re-render of the Jinja source with the wrong (or + # missing) ``template_params``. At execute time + # ``executed_sql`` is unset and we fall back to + # ``query.sql`` + ``template_params``. The same rendered + # SQL must also flow into engine-spec schema resolution + # (e.g. Postgres ``search_path`` detection) so it does + # not choke on unrendered ``{{ ... }}`` Jinja. + typed_query = cast(Query, query) + executed_sql = getattr(typed_query, "executed_sql", None) + sql_for_parse = ( + executed_sql + if isinstance(executed_sql, str) and executed_sql + else typed_query.sql + ) + use_executed_sql = sql_for_parse is executed_sql + parse_template_params = None if use_executed_sql else template_params + parse_query: Any = ( + SimpleNamespace( + sql=sql_for_parse, + schema=typed_query.schema, + catalog=typed_query.catalog, + ) + if use_executed_sql + else typed_query + ) + default_schema = database.get_default_schema_for_query( - cast(Query, query), - template_params, + cast(Query, parse_query), + parse_template_params, ) + parse_result = process_jinja_sql( + sql_for_parse, database, parse_template_params + ) + # Under strict scoping, refuse any statement the parser + # could not fully model: sqlglot ``exp.Command`` nodes + # (e.g. dynamic SQL inside a stored-procedure call) and + # non-sqlglot engines such as Kusto KQL whose statement + # classes do not produce a sqlglot AST. The per-table + # dataset-match check below would be blind to those + # references, so fail closed. + if ( + force_dataset_match + and parse_result.script.has_unparseable_statement + ): + raise SupersetSecurityException( + SupersetError( + error_type=SupersetErrorType.QUERY_SECURITY_ACCESS_ERROR, + message=_( + "SQL Lab cannot authorise a statement that " + "could not be fully parsed. Qualify tables " + "explicitly and avoid dynamic SQL inside " + "stored-procedure or vendor-specific calls." + ), + level=ErrorLevel.ERROR, + ) + ) tables = { table_.qualify( catalog=query.catalog or default_catalog, schema=default_schema, ) - for table_ in process_jinja_sql( - query.sql, database, template_params - ).tables + for table_ in parse_result.tables } elif table: - # Make sure table has the default catalog, if not specified. - tables = {table.qualify(catalog=default_catalog)} + # Make sure table has the default catalog, and (when an + # under-qualified Table was passed) the database's default + # schema. Callers like MetaDB legitimately pass 2-part URIs + # ``db.table`` that mean "the database's default schema"; + # resolving them against the engine spec lets those still + # match a registered dataset. If the engine cannot supply a + # default the strict-deny rule below fires and we fail closed. + table_catalog = table.catalog or default_catalog + schema_default = ( + None if table.schema else database.get_default_schema(table_catalog) + ) + tables = {table.qualify(catalog=table_catalog, schema=schema_default)} denied = set() + # When the caller asks for strict scoping (SQL Lab raw queries, + # MetaDB) the historical catalog_access/schema_access fallthroughs + # are skipped and every referenced table must resolve to a + # registered Superset dataset the user can access. Other callers + # keep the existing semantics. for table_ in tables: - catalog_perm = self.get_catalog_perm( - database.database_name, - table_.catalog, - ) - if catalog_perm and self.can_access("catalog_access", catalog_perm): - continue + if not force_dataset_match: + catalog_perm = self.get_catalog_perm( + database.database_name, + table_.catalog, + ) + if catalog_perm and self.can_access("catalog_access", catalog_perm): + continue - schema_perm = self.get_schema_perm( - database.database_name, - table_.catalog, - table_.schema, - ) - if schema_perm and self.can_access("schema_access", schema_perm): + schema_perm = self.get_schema_perm( + database.database_name, + table_.catalog, + table_.schema, + ) + if schema_perm and self.can_access("schema_access", schema_perm): + continue + + # Under strict scoping, refuse tables whose schema we could + # not pin down. query_datasources_by_name drops the schema + # filter when schema is None and would otherwise return + # SqlaTables in any schema, which the database engine may + # then resolve to a different schema via search_path. The + # dataset-match check would be against the wrong row. + if force_dataset_match and not table_.schema: + denied.add(table_) continue datasources = SqlaTable.query_datasources_by_name( diff --git a/superset/sql/parse.py b/superset/sql/parse.py index 4c2be71897e8..14705bae9775 100644 --- a/superset/sql/parse.py +++ b/superset/sql/parse.py @@ -1369,6 +1369,28 @@ def format(self, comments: bool = True) -> str: """ return ";\n".join(statement.format(comments) for statement in self.statements) + @property + def has_unparseable_statement(self) -> bool: + """ + True if any statement in the script cannot be fully modeled as an + AST whose table references Superset can enumerate. This covers two + cases that must both fail closed under strict scoping: + + * SQLGlot ``exp.Command`` nodes: statements sqlglot recognises but + cannot fully parse (e.g. dynamic SQL inside a stored-procedure + call); ``extract_tables_from_statement`` cannot see the tables. + * Non-sqlglot engines (e.g. Kusto KQL): the statement class does + not produce a sqlglot AST at all and its + ``_extract_tables_from_statement`` returns an empty set, so the + per-table check would have nothing to enforce against. + """ + for statement in self.statements: + if not isinstance(statement, SQLStatement): + return True + if isinstance(statement._parsed, exp.Command): # noqa: SLF001 + return True + return False + def get_settings(self) -> dict[str, str | bool]: """ Return the settings for the SQL script. diff --git a/superset/sqllab/validators.py b/superset/sqllab/validators.py index 3627b40ba5b7..3c4dec0d1352 100644 --- a/superset/sqllab/validators.py +++ b/superset/sqllab/validators.py @@ -30,4 +30,8 @@ class CanAccessQueryValidatorImpl(CanAccessQueryValidator): def validate( self, query: Query, template_params: Optional[dict[str, Any]] = None ) -> None: - security_manager.raise_for_access(query=query, template_params=template_params) + security_manager.raise_for_access( + query=query, + template_params=template_params, + force_dataset_match=True, + ) diff --git a/tests/integration_tests/security_tests.py b/tests/integration_tests/security_tests.py index 686970dc2b86..9390f046104f 100644 --- a/tests/integration_tests/security_tests.py +++ b/tests/integration_tests/security_tests.py @@ -1778,6 +1778,256 @@ def test_raise_for_access_sql(self, mock_can_access, mock_is_owner): database=get_example_database(), schema="bar", sql="SELECT * FROM foo" ) + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_allows_dataset( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + With force_dataset_match=True (SQL Lab path), a query that references + a table backed by a Superset dataset the user has datasource_access + on still succeeds. + """ + query = Mock( + database=get_example_database(), + schema="bar", + sql="SELECT * FROM foo", + catalog=None, + ) + mock_query_datasources.return_value = [ + Mock(perm="[examples].[bar].[foo](id:1)") + ] + # Only datasource_access succeeds; database_access / schema_access + # / catalog_access return False. + mock_can_access.side_effect = lambda perm, _vm: perm == "datasource_access" + mock_is_owner.return_value = False + security_manager.raise_for_access(query=query, force_dataset_match=True) + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_denies_schema_only( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Regression test: with force_dataset_match=True, schema_access alone + is not sufficient. The user must have datasource_access (or own) a + registered Superset dataset for the referenced table. + """ + query = Mock( + database=get_example_database(), + schema="bar", + sql="SELECT * FROM table_with_no_dataset", + catalog=None, + ) + # No SqlaTable registered for the referenced table. + mock_query_datasources.return_value = [] + # User holds schema_access only (would have been sufficient before + # the fix); everything else returns False. + mock_can_access.side_effect = lambda perm, _vm: perm == "schema_access" + mock_is_owner.return_value = False + + with self.assertRaises(SupersetSecurityException): # noqa: PT027 + security_manager.raise_for_access(query=query, force_dataset_match=True) + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_denies_unresolved_schema( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Regression test: with force_dataset_match=True, a query that + references a table without a resolvable schema (parser couldn't + infer one and the database has no default_schema) is denied + outright. Otherwise SqlaTable.query_datasources_by_name would drop + the schema filter and match a dataset in an unrelated schema while + the engine's search_path resolved the actual query against a + different schema. + """ + database = get_example_database() + # default_schema_for_query returns None; the parser yields a Table + # with no schema either. + with patch.object( + database.db_engine_spec, + "get_default_schema_for_query", + return_value=None, + ): + query = Mock( + database=database, + schema=None, + sql="SELECT * FROM foo", + catalog=None, + ) + # If the deny didn't fire first, query_datasources_by_name + # would return something matching across schemas. + mock_query_datasources.return_value = [ + Mock(perm="[examples].[other_schema].[foo](id:1)") + ] + mock_can_access.return_value = False + mock_is_owner.return_value = False + + with self.assertRaises(SupersetSecurityException): # noqa: PT027 + security_manager.raise_for_access(query=query, force_dataset_match=True) + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_denies_unparseable( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Regression test: with force_dataset_match=True, a query whose AST + contains an unparseable statement (sqlglot exp.Command, e.g. a + stored-procedure call whose internal SQL is dynamic) is denied, + because the per-table check below cannot see tables referenced + inside such statements. + """ + query = Mock( + database=get_example_database(), + schema="public", + # CALL on Postgres parses as exp.Command with no parseable + # literal, yielding zero extracted tables. + sql="CALL get_secret_data();", + catalog=None, + ) + mock_query_datasources.return_value = [] + mock_can_access.return_value = False + mock_is_owner.return_value = False + + with self.assertRaises(SupersetSecurityException): # noqa: PT027 + security_manager.raise_for_access(query=query, force_dataset_match=True) + + @patch("superset.security.manager.process_jinja_sql") + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_prefers_executed_sql( + self, + mock_can_access, + mock_is_owner, + mock_query_datasources, + mock_process_jinja, + ): + """ + Re-validation (results fetch, CSV export, streaming export) does not + carry template_params, so the security manager parses + ``executed_sql`` (the rendered SQL that actually ran) when it is set + to keep the table set aligned with the execute-time check. + """ + from superset.sql.parse import JinjaSQLResult, SQLScript + + query = Mock( + database=get_example_database(), + schema="bar", + sql="SELECT * FROM {{ table_name }}", + executed_sql="SELECT * FROM bar.foo", + catalog=None, + ) + mock_process_jinja.return_value = JinjaSQLResult( + script=SQLScript("SELECT * FROM bar.foo", "postgresql"), + tables=set(), + ) + mock_query_datasources.return_value = [] + mock_can_access.return_value = False + mock_is_owner.return_value = False + + security_manager.raise_for_access(query=query, force_dataset_match=True) + + sql_passed = mock_process_jinja.call_args.args[0] + assert sql_passed == "SELECT * FROM bar.foo" + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_denies_kql( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Regression test: with force_dataset_match=True, Kusto KQL (and any + other engine sqlglot doesn't model) is denied because table + extraction is unsupported. Otherwise raise_for_access would silently + return on an empty table set. + """ + kql_db = Mock() + kql_db.database_name = "kql_db" + kql_db.db_engine_spec.engine = "kustokql" + kql_db.get_default_catalog.return_value = None + kql_db.get_default_schema_for_query.return_value = None + query = Mock( + database=kql_db, + schema=None, + sql="StormEvents | take 1", + catalog=None, + ) + mock_query_datasources.return_value = [] + mock_can_access.return_value = False + mock_is_owner.return_value = False + + with self.assertRaises(SupersetSecurityException): # noqa: PT027 + security_manager.raise_for_access(query=query, force_dataset_match=True) + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_default_keeps_schema_access( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Without force_dataset_match (default), schema_access still grants + access to non-dataset tables. Preserves backwards compatibility for + chart-data on saved queries, dataset CRUD from SQL, explore on saved + queries, and other non-SQL-Lab callers. + """ + query = Mock( + database=get_example_database(), + schema="bar", + sql="SELECT * FROM table_with_no_dataset", + catalog=None, + ) + mock_query_datasources.return_value = [] + mock_can_access.side_effect = lambda perm, _vm: perm == "schema_access" + mock_is_owner.return_value = False + + security_manager.raise_for_access(query=query) + + @patch("superset.connectors.sqla.models.SqlaTable.query_datasources_by_name") + @patch("superset.security.SupersetSecurityManager.is_owner") + @patch("superset.security.SupersetSecurityManager.can_access") + def test_raise_for_access_force_dataset_match_table_uses_default_schema( + self, mock_can_access, mock_is_owner, mock_query_datasources + ): + """ + Regression test: under force_dataset_match=True, a caller that passes + an under-qualified Table (e.g. MetaDB's 2-part URI ``db.table``) is + resolved against the database's default schema before the strict + deny fires, so a user with datasource_access on the default-schema + dataset is not refused. + """ + database = get_example_database() + with patch.object( + database.db_engine_spec, + "get_default_schema", + return_value="public", + ): + table = Table("foo", None, None) + mock_query_datasources.return_value = [ + Mock(perm="[examples].[public].[foo](id:1)") + ] + mock_can_access.side_effect = lambda perm, _vm: perm == "datasource_access" + mock_is_owner.return_value = False + security_manager.raise_for_access( + database=database, + table=table, + force_dataset_match=True, + ) + # query_datasources_by_name should have been called with the + # resolved default schema, not None. + mock_query_datasources.assert_called_once() + _args, kwargs = mock_query_datasources.call_args + assert kwargs.get("schema") == "public" + @patch("superset.security.SupersetSecurityManager.is_owner") @patch("superset.security.SupersetSecurityManager.can_access") @patch("superset.security.SupersetSecurityManager.can_access_schema") diff --git a/tests/integration_tests/sqllab_tests.py b/tests/integration_tests/sqllab_tests.py index d43f0d8a4c50..950879027c7c 100644 --- a/tests/integration_tests/sqllab_tests.py +++ b/tests/integration_tests/sqllab_tests.py @@ -297,30 +297,20 @@ def test_sql_json_schema_access(self): f"CREATE TABLE IF NOT EXISTS {CTAS_SCHEMA_NAME}.test_table AS SELECT 1 as c1, 2 as c2" # noqa: E501 ) - data = self.run_sql( - f"SELECT * FROM {CTAS_SCHEMA_NAME}.test_table", # noqa: S608 - "3", - username="SchemaUser", # noqa: S608 - ) - assert 1 == len(data["data"]) - - data = self.run_sql( - f"SELECT * FROM {CTAS_SCHEMA_NAME}.test_table", # noqa: S608 - "4", - username="SchemaUser", - schema=CTAS_SCHEMA_NAME, - ) - assert 1 == len(data["data"]) - - # postgres needs a schema as a part of the table name. - if db_backend == "mysql": - data = self.run_sql( - "SELECT * FROM test_table", - "5", + # SQL Lab raw query access requires datasource_access on a registered + # Superset dataset. schema_access alone is no longer sufficient, so + # the SchemaUser is denied here even though they hold schema_access + # on CTAS_SCHEMA_NAME (the table is created on the fly and is not a + # registered dataset). + for client_id, schema in (("3", None), ("4", CTAS_SCHEMA_NAME)): + resp = self.run_sql( + f"SELECT * FROM {CTAS_SCHEMA_NAME}.test_table", # noqa: S608 + client_id, username="SchemaUser", - schema=CTAS_SCHEMA_NAME, + schema=schema, ) - assert 1 == len(data["data"]) + assert "data" not in resp + assert "errors" in resp db.session.query(Query).delete() with get_example_database().get_sqla_engine() as engine: diff --git a/tests/unit_tests/sql/parse_tests.py b/tests/unit_tests/sql/parse_tests.py index 0b83dd71dbd2..27c293229705 100644 --- a/tests/unit_tests/sql/parse_tests.py +++ b/tests/unit_tests/sql/parse_tests.py @@ -1166,6 +1166,30 @@ def test_has_mutation(engine: str, sql: str, expected: bool) -> None: assert SQLScript(sql, engine).has_mutation() == expected +@pytest.mark.parametrize( + "engine, sql, expected", + [ + # Plain SELECT parses to a proper AST node. + ("postgresql", "SELECT * FROM foo", False), + # CALL parses to ``exp.Command`` on Postgres. + ("postgresql", "CALL my_proc(1);", True), + # A script that mixes a parseable statement with an unparseable one + # is still flagged so strict scoping can refuse the whole script. + ("postgresql", "SELECT 1; CALL my_proc();", True), + # Non-sqlglot engines (e.g. Kusto KQL) do not produce a parseable + # AST and cannot have their tables enumerated, so they must be + # flagged as unparseable to fail closed under strict scoping. + ("kustokql", "print 1", True), + ], +) +def test_has_unparseable_statement(engine: str, sql: str, expected: bool) -> None: + """ + Test the `has_unparseable_statement` property used by strict scoping to + refuse statements that sqlglot couldn't fully model. + """ + assert SQLScript(sql, engine).has_unparseable_statement is expected + + @pytest.mark.parametrize( "sql", [ From 17d1a45bc94f628fb3de70d2f7a7b751e180d173 Mon Sep 17 00:00:00 2001 From: Oleg Ovcharuk Date: Wed, 3 Jun 2026 00:13:41 +0300 Subject: [PATCH 06/11] feat(ydb): switch to native YDB sqlglot dialect (#40170) --- UPDATING.md | 4 ++++ pyproject.toml | 2 +- superset/db_engine_specs/ydb.py | 2 +- superset/sql/parse.py | 4 +++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/UPDATING.md b/UPDATING.md index ea73f6adf630..f5ee4f80c342 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -24,6 +24,10 @@ assists people when migrating to a new version. ## Next +### YDB now uses a native sqlglot dialect + +YDB SQL parsing now relies on the dedicated [`ydb-sqlglot-plugin`](https://pypi.org/project/ydb-sqlglot-plugin/) dialect, which registers itself with sqlglot automatically. YDB users must install this plugin (e.g., via `pip install "apache-superset[ydb]"`) to avoid a `ValueError` when Superset parses YDB queries. + ### Dataset import validates catalog against the target connection Importing a dataset now validates the `catalog` field against the target database connection. When the connection has multi-catalog disabled (`allow_multi_catalog` off) and the dataset's catalog is not the connection's default catalog, the import fails instead of silently persisting the non-default catalog. This matches the validation already enforced on the dataset update path and prevents imported datasets from querying an unintended database. diff --git a/pyproject.toml b/pyproject.toml index 38dbe8508e7f..2412e04f14c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -208,7 +208,7 @@ netezza = ["nzalchemy>=11.0.2"] starrocks = ["starrocks>=1.0.0"] doris = ["pydoris>=1.0.0, <2.0.0"] oceanbase = ["oceanbase_py>=0.0.1"] -ydb = ["ydb-sqlalchemy>=0.1.2"] +ydb = ["ydb-sqlalchemy>=0.1.2", "ydb-sqlglot-plugin>=0.2.5"] development = [ # no bounds for apache-superset-extensions-cli until a stable version "apache-superset-extensions-cli", diff --git a/superset/db_engine_specs/ydb.py b/superset/db_engine_specs/ydb.py index 5c3b051d970a..2d56e7abdf01 100755 --- a/superset/db_engine_specs/ydb.py +++ b/superset/db_engine_specs/ydb.py @@ -63,7 +63,7 @@ class YDBEngineSpec(BaseEngineSpec): DatabaseCategory.TRADITIONAL_RDBMS, DatabaseCategory.OPEN_SOURCE, ], - "pypi_packages": ["ydb-sqlalchemy"], + "pypi_packages": ["ydb-sqlalchemy", "ydb-sqlglot-plugin"], "connection_string": "ydb://{host}:{port}/{database_name}", "default_port": 2135, "engine_parameters": [ diff --git a/superset/sql/parse.py b/superset/sql/parse.py index 14705bae9775..154a8ae0b198 100644 --- a/superset/sql/parse.py +++ b/superset/sql/parse.py @@ -114,7 +114,9 @@ "teradatasql": Dialects.TERADATA, "trino": Dialects.TRINO, "vertica": Vertica, - "yql": Dialects.CLICKHOUSE, + # "ydb" is a plugin dialect (ydb-sqlglot-plugin) auto-discovered via entry_points, + # hence a string name rather than a class reference like the built-in dialects. + "yql": "ydb", } From 8c62f533d7c7b658501c5824e207445156bd3f2e Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 14:24:00 -0700 Subject: [PATCH 07/11] fix(core): restrict allowed CSS properties in sanitized HTML (#40627) Co-authored-by: Claude Code --- .../superset-ui-core/src/utils/html.test.tsx | 29 ++++++++++++ .../superset-ui-core/src/utils/html.tsx | 46 ++++++++++++++++++- .../test/utils/tooltip.test.ts | 21 +++++---- 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/superset-frontend/packages/superset-ui-core/src/utils/html.test.tsx b/superset-frontend/packages/superset-ui-core/src/utils/html.test.tsx index ea24e0c8c50e..54823c1e5ce6 100644 --- a/superset-frontend/packages/superset-ui-core/src/utils/html.test.tsx +++ b/superset-frontend/packages/superset-ui-core/src/utils/html.test.tsx @@ -33,6 +33,35 @@ describe('sanitizeHtml', () => { const sanitizedString = sanitizeHtml(htmlString); expect(sanitizedString).not.toContain('script'); }); + + test('should preserve allowed presentational CSS properties', () => { + const htmlString = + '
x
'; + const sanitizedString = sanitizeHtml(htmlString); + expect(sanitizedString).toContain('color:red'); + expect(sanitizedString).toContain('background-color:blue'); + expect(sanitizedString).toContain('font-size:12px'); + expect(sanitizedString).toContain('text-align:center'); + }); + + test('should strip layout and positioning CSS properties', () => { + const htmlString = + '
x
'; + const sanitizedString = sanitizeHtml(htmlString); + expect(sanitizedString).toContain('color:red'); + expect(sanitizedString).not.toContain('position'); + expect(sanitizedString).not.toContain('z-index'); + expect(sanitizedString).not.toContain('width'); + expect(sanitizedString).not.toContain('height'); + }); + + test('should strip unsafe CSS property values', () => { + const htmlString = + '
x
'; + const sanitizedString = sanitizeHtml(htmlString); + expect(sanitizedString).not.toContain('javascript'); + expect(sanitizedString).not.toContain('url('); + }); }); describe('isProbablyHTML', () => { diff --git a/superset-frontend/packages/superset-ui-core/src/utils/html.tsx b/superset-frontend/packages/superset-ui-core/src/utils/html.tsx index 940fc232b6ac..8e5e8e482971 100644 --- a/superset-frontend/packages/superset-ui-core/src/utils/html.tsx +++ b/superset-frontend/packages/superset-ui-core/src/utils/html.tsx @@ -19,6 +19,50 @@ import { FilterXSS, getDefaultWhiteList } from 'xss'; import { DataRecordValue } from '../types'; +// Restrict inline `style` attributes to a small set of presentational CSS +// properties. Overlay/positioning properties (e.g. position, z-index, top, +// left, transform) and sizing properties that could cover the page (e.g. +// width, height) are intentionally excluded so that sanitized markup cannot +// escape its container to overlay or obscure the surrounding page. The +// allowlisted spacing/border properties (margin, padding, border) can still +// affect layout within the container, which is acceptable. The `xss` library +// also validates property values against this allowlist, stripping unsupported +// constructs such as url()/expression(). +const allowedCssProperties = { + color: true, + 'background-color': true, + 'text-align': true, + 'text-decoration': true, + 'font-family': true, + 'font-size': true, + 'font-style': true, + 'font-weight': true, + 'line-height': true, + 'letter-spacing': true, + 'white-space': true, + padding: true, + 'padding-top': true, + 'padding-right': true, + 'padding-bottom': true, + 'padding-left': true, + margin: true, + 'margin-top': true, + 'margin-right': true, + 'margin-bottom': true, + 'margin-left': true, + border: true, + 'border-color': true, + 'border-style': true, + 'border-width': true, + 'border-radius': true, + 'vertical-align': true, + // Needed by ECharts tooltips for row transparency and text truncation. + opacity: true, + 'max-width': true, + overflow: true, + 'text-overflow': true, +}; + const xssFilter = new FilterXSS({ whiteList: { ...getDefaultWhiteList(), @@ -45,7 +89,7 @@ const xssFilter = new FilterXSS({ tfoot: ['align', 'valign', 'style'], }, stripIgnoreTag: true, - css: false, + css: { whiteList: allowedCssProperties }, }); export function sanitizeHtml(htmlString: string) { diff --git a/superset-frontend/packages/superset-ui-core/test/utils/tooltip.test.ts b/superset-frontend/packages/superset-ui-core/test/utils/tooltip.test.ts index 1cb708bccdb2..55ea272167a0 100644 --- a/superset-frontend/packages/superset-ui-core/test/utils/tooltip.test.ts +++ b/superset-frontend/packages/superset-ui-core/test/utils/tooltip.test.ts @@ -161,10 +161,13 @@ test('should preserve table styling after sanitization (fixes ECharts tooltip fo `; + // The `xss` CSS filter normalizes declarations, dropping the space after + // each colon (e.g. `opacity: 0.8;` becomes `opacity:0.8;`) while preserving + // the property/value pairs themselves. const sanitized = sanitizeHtml(tableWithStyles); - expect(sanitized).toContain('style="opacity: 0.8;"'); - expect(sanitized).toContain('style="text-align: left; padding-left: 0px;"'); - expect(sanitized).toContain('style="text-align: right; padding-left: 16px;"'); + expect(sanitized).toContain('style="opacity:0.8;"'); + expect(sanitized).toContain('style="text-align:left; padding-left:0px;"'); + expect(sanitized).toContain('style="text-align:right; padding-left:16px;"'); const data = [ ['Metric', 'Value'], @@ -172,10 +175,10 @@ test('should preserve table styling after sanitization (fixes ECharts tooltip fo ]; const html = tooltipHtml(data, 'Test Tooltip'); - expect(html).toContain('style="opacity: 0.8;"'); - expect(html).toContain('text-align: left'); - expect(html).toContain('text-align: right'); - expect(html).toContain('padding-left: 0px'); - expect(html).toContain('padding-left: 16px'); - expect(html).toContain('max-width: 300px'); + expect(html).toContain('style="opacity:0.8;"'); + expect(html).toContain('text-align:left'); + expect(html).toContain('text-align:right'); + expect(html).toContain('padding-left:0px'); + expect(html).toContain('padding-left:16px'); + expect(html).toContain('max-width:300px'); }); From 6abee0289b89379c892f2e9fcfe04ae743caae24 Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 15:09:14 -0700 Subject: [PATCH 08/11] fix(reports): guard SUCCESS-state report execution against duplicate sends and stuck WORKING state (#40657) Co-authored-by: Claude Code --- superset/commands/report/execute.py | 51 +++++++++++++++---- .../commands/report/execute_test.py | 36 +++++++++++-- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/superset/commands/report/execute.py b/superset/commands/report/execute.py index 4f20783c4d45..fa5fb9422bdc 100644 --- a/superset/commands/report/execute.py +++ b/superset/commands/report/execute.py @@ -1039,17 +1039,50 @@ def next(self) -> None: ) return except Exception as ex: - self.send_error( - f"Error occurred for {self._report_schedule.type}:" - f" {self._report_schedule.name}", - str(ex), - ) - self.update_report_schedule_and_log( - ReportState.ERROR, - error_message=REPORT_SCHEDULE_ERROR_NOTIFICATION_MARKER, - ) + # Ensure the schedule always transitions out of WORKING to + # ERROR, even if sending the error notification itself fails — + # otherwise the schedule is stuck in WORKING until the working + # timeout. Mirrors ReportNotTriggeredErrorState.next(). + # Only record the marker when the notification was actually + # delivered; otherwise record the send failure so the grace- + # period check doesn't incorrectly suppress future notifications. + error_message = REPORT_SCHEDULE_ERROR_NOTIFICATION_MARKER + try: + self.send_error( + f"Error occurred for {self._report_schedule.type}:" + f" {self._report_schedule.name}", + str(ex), + ) + except Exception as send_ex: # noqa: BLE001 # pylint: disable=broad-except + error_message = str(send_ex) or str(ex) + logger.warning( + "Failed to send error notification for report schedule " + "(execution %s)", + self._execution_id, + exc_info=True, + ) + finally: + try: + self.update_report_schedule_and_log( + ReportState.ERROR, + error_message=error_message, + ) + except ReportScheduleUnexpectedError: + logger.warning( + "Failed to log ERROR state for report schedule " + "(execution %s) due to database issue", + self._execution_id, + exc_info=True, + ) raise + # For REPORT types the ALERT branch above is skipped, so WORKING has not + # been set yet. Set it before the (potentially slow) send() so a + # concurrent scheduler tick is blocked by ReportWorkingState, preventing + # duplicate notifications. ALERT types already set WORKING above. + if self._report_schedule.type != ReportScheduleType.ALERT: + self.update_report_schedule_and_log(ReportState.WORKING) + try: self.send() # Include filter warnings in the log if any were collected diff --git a/tests/unit_tests/commands/report/execute_test.py b/tests/unit_tests/commands/report/execute_test.py index bf8c38f1e314..a86c5dfffc35 100644 --- a/tests/unit_tests/commands/report/execute_test.py +++ b/tests/unit_tests/commands/report/execute_test.py @@ -1541,15 +1541,43 @@ def test_success_state_report_sends_and_logs_success( schedule_type=ReportScheduleType.REPORT, ) mock_send = mocker.patch.object(state, "send") - mocker.patch.object(state, "update_report_schedule_and_log") + mock_update = mocker.patch.object(state, "update_report_schedule_and_log") state.next() mock_send.assert_called_once() - state.update_report_schedule_and_log.assert_called_once_with( # type: ignore[attr-defined] - ReportState.SUCCESS, - error_message=None, + # WORKING is set before send() (concurrency guard against duplicate sends), + # then SUCCESS after. + assert mock_update.call_args_list == [ + mocker.call(ReportState.WORKING), + mocker.call(ReportState.SUCCESS, error_message=None), + ] + + +def test_success_state_error_logged_when_send_error_raises( + mocker: MockerFixture, +) -> None: + """If send_error() itself raises, the schedule must still transition to + ERROR (not stay stuck in WORKING).""" + state = _make_state_instance( + mocker, ReportSuccessState, schedule_type=ReportScheduleType.ALERT ) + mocker.patch.object(state, "is_in_grace_period", return_value=False) + mock_update = mocker.patch.object(state, "update_report_schedule_and_log") + mocker.patch.object( + state, "send_error", side_effect=RuntimeError("notification boom") + ) + mocker.patch( + "superset.commands.report.execute.AlertCommand" + ).return_value.run.side_effect = RuntimeError("alert boom") + + # The original alert error propagates... + with pytest.raises(RuntimeError, match="alert boom"): + state.next() + + # ...but ERROR was still logged despite send_error() failing. + states = [call.args[0] for call in mock_update.call_args_list] + assert ReportState.ERROR in states def test_get_url_for_csv_uses_post_processed_type( From 9af6746dbe19af16848517f05cdf92f047b9c254 Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Tue, 2 Jun 2026 16:15:11 -0700 Subject: [PATCH 09/11] fix(models): HTML-escape data-controlled values in dashboard_link and Slice.icons (#40639) Co-authored-by: Claude Code --- superset/models/dashboard.py | 5 ++- superset/models/slice.py | 8 +++- tests/unit_tests/models/dashboard_test.py | 52 +++++++++++++++++++++++ tests/unit_tests/models/slice_test.py | 25 ++++++++++- 4 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 tests/unit_tests/models/dashboard_test.py diff --git a/superset/models/dashboard.py b/superset/models/dashboard.py index f38d801719ea..32991820776b 100644 --- a/superset/models/dashboard.py +++ b/superset/models/dashboard.py @@ -221,7 +221,10 @@ def status(self) -> utils.DashboardStatus: @renders("dashboard_title") def dashboard_link(self) -> Markup: title = escape(self.dashboard_title or "") - return Markup(f'{title}') + # self.url embeds the user-controlled slug; escape it before it is + # marked safe via Markup (mirrors SqlaTable.link). + url = escape(self.url) + return Markup(f'{title}') @property def digest(self) -> str | None: diff --git a/superset/models/slice.py b/superset/models/slice.py index 04c698ce95da..e10b373d945c 100644 --- a/superset/models/slice.py +++ b/superset/models/slice.py @@ -326,11 +326,15 @@ def slice_link(self) -> Markup: @property def icons(self) -> str: + # Escape the data-controlled datasource name and edit URL before they + # are interpolated into HTML attributes. + url = escape(self.datasource_edit_url) + datasource = escape(self.datasource) return f""" + title="{datasource}"> """ diff --git a/tests/unit_tests/models/dashboard_test.py b/tests/unit_tests/models/dashboard_test.py new file mode 100644 index 000000000000..08997366d03b --- /dev/null +++ b/tests/unit_tests/models/dashboard_test.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from superset.models.dashboard import Dashboard + + +def test_dashboard_link_escapes_slug() -> None: + """dashboard_link must HTML-escape the user-controlled slug in the href. + + The slug can carry markup via the import path (which does not run the REST + API's slug sanitization), so the rendered FAB list-view link must escape it. + """ + dash = Dashboard() + dash.id = 1 + dash.dashboard_title = "My Dashboard" + dash.slug = '">' + + link = str(dash.dashboard_link()) + + # The injected script tag / attribute breakout must be escaped away. + assert "', + x: 1, + y: 2, + size: 3, + }, + entity: 'entity', + xField: 'x', + yField: 'y', + sizeField: 'size', + xFormatter: (v: number) => String(v), + yFormatter: (v: number) => String(v), + sizeFormatter: (v: number) => String(v), + }); + expect(html).not.toMatch(/', + color: 'red', + value: 1, + }, + ], + }, + (v: number) => String(v), + [(v: number) => String(v)], + ); + expect(html).not.toMatch(/payload', + }; + const html = tip.html()(datum); + expect(html).not.toMatch(/