Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions sky/dashboard/src/components/infra.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ export function InfrastructureSection({
isSSH = false, // To differentiate between SSH and Kubernetes
actionButton = null, // Optional action button for the header
contextWorkspaceMap = {}, // Mapping of contexts to workspaces
contextErrors = {}, // Mapping of contexts to error messages
}) {
// Add defensive check for contexts
const safeContexts = contexts || [];
Expand Down Expand Up @@ -289,15 +290,19 @@ export function InfrastructureSection({
) : (
<span
className={
nodes.length === 0 ? 'text-gray-400' : ''
nodes.length === 0 && contextErrors[context]
? 'text-gray-400'
: ''
}
title={
nodes.length === 0
? 'Context may be unavailable or timed out'
nodes.length === 0 && contextErrors[context]
? contextErrors[context]
: ''
}
>
{nodes.length === 0 ? '0*' : nodes.length}
{nodes.length === 0 && contextErrors[context]
? '0*'
: nodes.length}
</span>
)}
</td>
Expand Down Expand Up @@ -1615,6 +1620,7 @@ export function GPUs() {
const [enabledClouds, setEnabledClouds] = useState(0);
const [contextStats, setContextStats] = useState({});
const [contextWorkspaceMap, setContextWorkspaceMap] = useState({});
const [contextErrors, setContextErrors] = useState({});

// Workspace-aware infrastructure state
const [workspaceInfrastructure, setWorkspaceInfrastructure] = useState({});
Expand Down Expand Up @@ -1668,6 +1674,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
Expand Down Expand Up @@ -1714,6 +1721,7 @@ export function GPUs() {
perNodeGPUs: fetchedPerNodeGPUs,
contextStats: fetchedContextStats,
contextWorkspaceMap: fetchedContextWorkspaceMap,
contextErrors: fetchedContextErrors,
} = infraData;

setWorkspaceInfrastructure(fetchedWorkspaceInfrastructure || {});
Expand All @@ -1723,6 +1731,7 @@ export function GPUs() {
setPerNodeGPUs(fetchedPerNodeGPUs || []);
setContextStats(fetchedContextStats || {});
setContextWorkspaceMap(fetchedContextWorkspaceMap || {});
setContextErrors(fetchedContextErrors || {});

// Extract available workspaces from the workspace infrastructure data
const workspaceNames = Object.keys(
Expand All @@ -1740,6 +1749,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
Expand All @@ -1753,6 +1763,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
Expand Down Expand Up @@ -2306,6 +2317,7 @@ export function GPUs() {
isJobsDataLoading={sshAndKubeJobsDataLoading}
isSSH={true}
contextWorkspaceMap={contextWorkspaceMap}
contextErrors={contextErrors}
actionButton={
// TODO: Add back when SSH Node Pool add operation is more robust
// <button
Expand Down Expand Up @@ -2337,6 +2349,7 @@ export function GPUs() {
isJobsDataLoading={sshAndKubeJobsDataLoading}
isSSH={false}
contextWorkspaceMap={contextWorkspaceMap}
contextErrors={contextErrors}
/>
);
};
Expand Down
14 changes: 11 additions & 3 deletions sky/dashboard/src/components/workspaces.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ export async function getWorkspaceManagedJobs(workspaceName) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
Expand All @@ -135,17 +136,24 @@ export async function getWorkspaceManagedJobs(workspaceName) {
// Handle specific error types
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { jobs: [] };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
console.error('Error parsing response JSON:', parseError);
errorMessage = String(parseError);
}
}
if (!fetchedData.ok) {
const msg = `API request to get managed jobs result failed with status ${fetchedData.status} for workspace ${workspaceName}`;
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}, error: ${errorMessage} for workspace ${workspaceName}`;
throw new Error(msg);
}
const data = await fetchedData.json();
Expand Down
4 changes: 3 additions & 1 deletion sky/dashboard/src/data/connectors/client.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'use client';

import { getErrorMessageFromResponse } from '@/data/utils';
import { ENDPOINT } from './constants';

export const apiClient = {
Expand Down Expand Up @@ -44,7 +45,8 @@ export const apiClient = {

// Handle all error status codes (4xx, 5xx, etc.)
if (!fetchedData.ok) {
const msg = `API request to get ${path} result failed with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `API request to get ${path} result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}

Expand Down
61 changes: 22 additions & 39 deletions sky/dashboard/src/data/connectors/infra.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { CLOUDS_LIST, COMMON_GPUS } from '@/data/connectors/constants';

// Importing from the same directory
import { apiClient } from '@/data/connectors/client';
import { getErrorMessageFromResponse } from '@/data/utils';

export async function getCloudInfrastructure(forceRefresh = false) {
const dashboardCache = (await import('@/lib/cache')).default;
Expand Down Expand Up @@ -54,7 +55,7 @@ export async function getCloudInfrastructure(forceRefresh = false) {
`/api/get?request_id=${checkId}`
);
if (!checkResult.ok) {
const msg = `Failed to get sky check result with status ${checkResult.status}`;
const msg = `Failed to get sky check result with status ${checkResult.status}, error: ${checkResult.statusText}`;
throw new Error(msg);
}
const checkData = await checkResult.json();
Expand Down Expand Up @@ -206,6 +207,7 @@ export async function getWorkspaceInfrastructure() {
perNodeGPUs: [],
contextStats: {},
contextWorkspaceMap: {},
contextErrors: {},
};
}

Expand Down Expand Up @@ -324,6 +326,7 @@ export async function getWorkspaceInfrastructure() {
allGPUs: [],
perContextGPUs: [],
perNodeGPUs: [],
contextErrors: {},
};
try {
gpuData = await getKubernetesGPUsFromContexts(validContexts);
Expand All @@ -339,6 +342,7 @@ export async function getWorkspaceInfrastructure() {
perNodeGPUs: gpuData.perNodeGPUs || [],
contextStats: contextStats,
contextWorkspaceMap: contextWorkspaceMap,
contextErrors: gpuData.contextErrors || {},
};

console.log('[DEBUG] Final result:', finalResult);
Expand All @@ -361,12 +365,14 @@ async function getKubernetesGPUsFromContexts(contextNames) {
allGPUs: [],
perContextGPUs: [],
perNodeGPUs: [],
contextErrors: {},
};
}

const allGPUsSummary = {};
const perContextGPUsData = {};
const perNodeGPUs_dict = {};
const contextErrors = {};

// Get all of the node info for all contexts in parallel and put them
// in a dictionary keyed by context name.
Expand All @@ -381,11 +387,16 @@ async function getKubernetesGPUsFromContexts(contextNames) {
contextToNodeInfo[contextNames[i]] = result.value;
} else {
// Log the error but continue with other contexts
const errorMessage =
result.reason?.message ||
(typeof result.reason === 'string' && result.reason) ||
'Context may be unavailable or timed out';
console.warn(
`Failed to get node info for context ${contextNames[i]}:`,
result.reason
);
contextToNodeInfo[contextNames[i]] = {};
contextErrors[contextNames[i]] = errorMessage;
}
}

Expand Down Expand Up @@ -520,6 +531,7 @@ async function getKubernetesGPUsFromContexts(contextNames) {
a.node_name.localeCompare(b.node_name) ||
a.gpu_name.localeCompare(b.gpu_name)
),
contextErrors: contextErrors,
};
} catch (error) {
console.error('[infra.jsx] Error in getKubernetesGPUsFromContexts:', error);
Expand All @@ -533,7 +545,7 @@ async function getKubernetesPerNodeGPUs(context) {
context: context,
});
if (!response.ok) {
const msg = `Failed to get kubernetes node info with status ${response.status}`;
const msg = `Failed to get kubernetes node info for context ${context} with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
Expand All @@ -544,24 +556,9 @@ async function getKubernetesPerNodeGPUs(context) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
if (data.detail && data.detail.error) {
try {
const error = JSON.parse(data.detail.error);
const msg = `Context ${context} unavailable: ${error.message}`;
throw new Error(msg);
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
}
}
if (!fetchedData.ok) {
const msg = `Failed to get kubernetes node info result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get kubernetes node info result for context ${context} with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
const data = await fetchedData.json();
Expand Down Expand Up @@ -672,7 +669,7 @@ export async function getCloudGPUs() {
gpus_only: true,
});
if (!response.ok) {
const msg = `Failed to get cloud GPUs with status ${response.status}`;
const msg = `Failed to get cloud GPUs with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
Expand All @@ -683,24 +680,9 @@ export async function getCloudGPUs() {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
if (data.detail && data.detail.error) {
try {
const error = JSON.parse(data.detail.error);
const msg = `Error fetching cloud GPUs: ${error.message}`;
throw new Error(msg);
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
}
}
if (!fetchedData.ok) {
const msg = `Failed to get cloud GPUs result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get cloud GPUs result with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
const data = await fetchedData.json();
Expand Down Expand Up @@ -764,7 +746,7 @@ export async function getDetailedGpuInfo(filter) {
all_regions: true,
});
if (!response.ok) {
const msg = `Failed to get detailed GPU info with status ${response.status}`;
const msg = `Failed to get detailed GPU info with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
Expand All @@ -776,7 +758,8 @@ export async function getDetailedGpuInfo(filter) {
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (!fetchedData.ok) {
const msg = `Failed to get detailed GPU info result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get detailed GPU info result with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}

Expand Down
29 changes: 22 additions & 7 deletions sky/dashboard/src/data/connectors/jobs.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ export async function getManagedJobs(options = {}) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
Expand All @@ -93,18 +94,25 @@ export async function getManagedJobs(options = {}) {
// Handle specific error types
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { jobs: [], total: 0, controllerStopped: true };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
console.error('Error parsing response JSON:', parseError);
errorMessage = String(parseError);
}
}
// Handle all error status codes (4xx, 5xx, etc.)
if (!fetchedData.ok) {
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}`;
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
// print out the response for debugging
Expand Down Expand Up @@ -323,7 +331,7 @@ export async function getPoolStatus() {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);

let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
Expand All @@ -332,18 +340,25 @@ export async function getPoolStatus() {
const error = JSON.parse(data.detail.error);
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { pools: [], controllerStopped: true };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Failed to parse error JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (dataError) {
console.error('Failed to parse response JSON:', dataError);
console.error('Error parsing response JSON:', dataError);
errorMessage = String(dataError);
}
}

if (!fetchedData.ok) {
const msg = `API request to get pool status result failed with status ${fetchedData.status}`;
const msg = `API request to get pool status result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}

Expand Down
Loading
Loading