diff --git a/src/databricks/labs/ucx/workspace_access/groups.py b/src/databricks/labs/ucx/workspace_access/groups.py index 814043e1c6..3c81639e83 100644 --- a/src/databricks/labs/ucx/workspace_access/groups.py +++ b/src/databricks/labs/ucx/workspace_access/groups.py @@ -913,6 +913,8 @@ def _get_strategy( class AccountGroupLookup: + PAGE_SIZE = 10000 + def __init__(self, ws: WorkspaceClient): self._ws = ws @@ -964,13 +966,28 @@ def _list_account_groups(self, scim_attributes: str) -> list[iam.Group]: # TODO: we should avoid using this method, as it's not documented # get account-level groups even if they're not (yet) assigned to a workspace logger.info(f"Listing account groups with {scim_attributes}...") - account_groups = [] - raw = self._ws.api_client.do("GET", "/api/2.0/account/scim/v2/Groups", query={"attributes": scim_attributes}) - for resource in raw.get("Resources", []): # type: ignore[union-attr] - group = iam.Group.from_dict(resource) - if group.display_name in SYSTEM_GROUPS: - continue - account_groups.append(group) + account_groups: list[iam.Group] = [] + # Paginate through the undocumented workspace-level account SCIM endpoint, + # mirroring the pagination logic in the SDK's AccountGroupsAPI.list(). + seen: set[str] = set() + query: dict[str, str | int] = {"attributes": scim_attributes, "startIndex": 1, "count": self.PAGE_SIZE} + while True: + raw = self._ws.api_client.do("GET", "/api/2.0/account/scim/v2/Groups", query=query) + resources = raw.get("Resources", []) # type: ignore[union-attr] + if not resources: + break + for resource in resources: + group = iam.Group.from_dict(resource) + if group.id and group.id in seen: + continue + if group.id: + seen.add(group.id) + if group.display_name in SYSTEM_GROUPS: + continue + account_groups.append(group) + if len(resources) < int(query["count"]): + break + query["startIndex"] = int(query["startIndex"]) + len(resources) logger.info(f"Found {len(account_groups)} account groups") sorted_groups: list[iam.Group] = sorted( account_groups, key=lambda _: _.display_name if _.display_name else "" diff --git a/tests/unit/workspace_access/test_groups.py b/tests/unit/workspace_access/test_groups.py index 1adc823fc2..df213fc0c6 100644 --- a/tests/unit/workspace_access/test_groups.py +++ b/tests/unit/workspace_access/test_groups.py @@ -14,6 +14,7 @@ from databricks.sdk.service.iam import ComplexValue, Group, ResourceMeta from databricks.labs.ucx.workspace_access.groups import ( + AccountGroupLookup, ConfigureGroups, GroupManager, MigratedGroup, @@ -568,6 +569,40 @@ def reflect_account_side_effect(method, *_, **__): ) +def test_list_account_groups_paginates_and_deduplicates(): + backend = MockBackend(rows={"SELECT": [("1", "de", "de", "test-group-de", "", "", "", "")]}) + wsclient = create_autospec(WorkspaceClient) + + page1 = [ + Group(id="1", display_name="alpha").as_dict(), + Group(id="2", display_name="beta").as_dict(), + ] + page2 = [ + Group(id="2", display_name="beta").as_dict(), + Group(id="3", display_name="gamma").as_dict(), + Group(id="4", display_name="users").as_dict(), + ] + + responses = [{"Resources": page1}, {"Resources": page2}, {"Resources": []}] + + def do_side_effect(method, *_args, **_kwargs): + if method == "GET": + return responses.pop(0) + return None + + wsclient.api_client.do.side_effect = do_side_effect + + group1 = Group(id="1", display_name="test-dfd-alpha", meta=ResourceMeta(resource_type="WorkspaceGroup")) + wsclient.groups.list.return_value = [group1] + wsclient.groups.get.return_value = group1 + + with patch.object(AccountGroupLookup, "PAGE_SIZE", 2): + GroupManager(backend, wsclient, inventory_database="inv").reflect_account_groups_on_workspace() + + get_calls = [c for c in wsclient.api_client.do.call_args_list if c[0][0] == "GET"] + assert len(get_calls) == 3 + + def test_delete_original_workspace_groups_should_delete_reflected_acc_groups_in_workspace(fake_sleep: Mock) -> None: account_id = "11" ws_id = "1"