@@ -374,24 +374,135 @@ <h2 class="text-3xl font-bold text-gray-900 mb-4">3. Experiments</h2>
374374 < div class ="w-24 h-1 bg-primary-600 mx-auto "> </ div >
375375 </ div >
376376
<!--
  GAIA Benchmark Chart
  Static horizontal bar chart of GAIA scores. Each row is a fixed-width label,
  a gray track, and a filled bar whose inline width sets the bar length; the
  score itself is printed as text inside the fill, so the numbers stay
  readable without relying on the bar geometry.
  Gradient fills mark the Skywork rows and gray fills mark the other models,
  matching the legend below.
  NOTE(review): the widths look like roughly score / 85 * 100. Confirm
  against the data source before adding or editing rows.
-->
<div class="mb-16">
  <h3 class="text-2xl font-semibold text-gray-900 mb-8 text-center">GAIA Benchmark Test Results</h3>
  <div class="bg-white p-8 rounded-xl shadow-sm">
    <!-- Legend: the colour swatches are decorative; the adjacent text carries the meaning. -->
    <div class="flex items-center justify-between mb-6">
      <div class="flex items-center space-x-4">
        <div class="flex items-center space-x-2">
          <div class="w-4 h-4 bg-gradient-to-r from-blue-400 to-cyan-400 rounded" aria-hidden="true"></div>
          <span class="text-sm text-gray-600">Skywork Deep Research Agent (Ours)</span>
        </div>
        <div class="flex items-center space-x-2">
          <div class="w-4 h-4 bg-gray-400 rounded" aria-hidden="true"></div>
          <span class="text-sm text-gray-600">Other Models</span>
        </div>
      </div>
    </div>

    <!--
      One list item per model. A real list lets assistive technology announce
      the number of entries and step through them.
      NOTE(review): assumes the Tailwind base reset removes default list
      padding and markers. Verify the rendered layout is unchanged.
    -->
    <ul class="space-y-4">
      <!-- Skywork DRA -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Skywork DRA</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gradient-to-r from-blue-400 to-cyan-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 97.6%">
            <span class="text-white font-semibold text-sm">83.06</span>
          </div>
        </div>
      </li>

      <!-- Skywork DRA (w/o MCP) -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Skywork DRA (w/o MCP)</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gradient-to-r from-blue-400 to-cyan-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 93.0%">
            <span class="text-white font-semibold text-sm">79.07</span>
          </div>
        </div>
      </li>

      <!-- Aworld -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Aworld</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 96.1%">
            <span class="text-white font-semibold text-sm">81.73</span>
          </div>
        </div>
      </li>

      <!-- Su Zero Ultra -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Su Zero Ultra</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 94.6%">
            <span class="text-white font-semibold text-sm">80.40</span>
          </div>
        </div>
      </li>

      <!-- h2oGPTe Agent -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">h2oGPTe Agent</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 93.8%">
            <span class="text-white font-semibold text-sm">79.73</span>
          </div>
        </div>
      </li>

      <!-- desearch -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">desearch</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 91.8%">
            <span class="text-white font-semibold text-sm">78.07</span>
          </div>
        </div>
      </li>

      <!-- Alita -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Alita</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 88.7%">
            <span class="text-white font-semibold text-sm">75.42</span>
          </div>
        </div>
      </li>

      <!-- Langfun Agent -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">Langfun Agent</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 85.9%">
            <span class="text-white font-semibold text-sm">73.09</span>
          </div>
        </div>
      </li>

      <!-- JoyAgent-Genie -->
      <li class="flex items-center">
        <div class="w-48 text-sm text-gray-600">JoyAgent-Genie</div>
        <div class="flex-1 bg-gray-200 rounded-full h-8 relative">
          <div class="bg-gray-400 h-8 rounded-full flex items-center justify-end pr-3" style="width: 76.6%">
            <span class="text-white font-semibold text-sm">65.12</span>
          </div>
        </div>
      </li>
    </ul>
  </div>
</div>
487+
<!--
  Benchmark Summary
  Three headline score cards (SimpleQA, GAIA validation, HLE) laid out in a
  three-column grid from the lg breakpoint upward. Each card is a heading,
  a one-sentence description of the benchmark, and the reported score.
  NOTE(review): the scores are bare numbers with no unit or metric name.
  Confirm whether a suffix such as "%" or "accuracy" should be shown.
-->
<div class="grid lg:grid-cols-3 gap-8 mb-16">
  <div class="bg-gradient-to-br from-green-50 to-emerald-50 p-6 rounded-xl">
    <h3 class="text-xl font-semibold text-gray-900 mb-3">SimpleQA Benchmark</h3>
    <p class="text-gray-600 mb-4">Evaluation on simple question-answering tasks to assess basic reasoning capabilities.</p>
    <div class="text-2xl font-bold text-green-600">95.3</div>
  </div>

  <div class="bg-gradient-to-br from-blue-50 to-cyan-50 p-6 rounded-xl">
    <h3 class="text-xl font-semibold text-gray-900 mb-3">GAIA Benchmark Validation</h3>
    <p class="text-gray-600 mb-4">Comprehensive evaluation on real-world tasks requiring web search and reasoning.</p>
    <div class="text-2xl font-bold text-blue-600">82.42</div>
  </div>

  <div class="bg-gradient-to-br from-purple-50 to-violet-50 p-6 rounded-xl">
    <h3 class="text-xl font-semibold text-gray-900 mb-3">HLE Benchmark</h3>
    <!-- HLE is the published abbreviation of "Humanity's Last Exam". -->
    <p class="text-gray-600 mb-4">Humanity's Last Exam (HLE) benchmark for complex reasoning and planning tasks.</p>
    <div class="text-2xl font-bold text-purple-600">25.9</div>
  </div>
</div>
397508
0 commit comments