|
27 | 27 | from aleph.vm.controllers.firecracker.program import FileTooLargeError |
28 | 28 | from aleph.vm.hypervisors.firecracker.microvm import MicroVMFailedInitError |
29 | 29 | from aleph.vm.models import VmExecution |
| 30 | +from aleph.vm.network.firewall import ( |
| 31 | + initialize_nftables, |
| 32 | + recreate_network_for_vms, |
| 33 | + remove_all_aleph_chains, |
| 34 | +) |
30 | 35 | from aleph.vm.orchestrator import payment, status |
31 | 36 | from aleph.vm.orchestrator.chain import STREAM_CHAINS |
32 | 37 | from aleph.vm.orchestrator.custom_logs import set_vm_for_logging |
@@ -429,6 +434,7 @@ def authenticate_api_request(request: web.Request) -> bool: |
429 | 434 |
|
430 | 435 |
|
# Module-level lock slot, unset at import time.
# NOTE(review): presumably created lazily by the allocation handler, the same
# way network_recreation_lock is below - confirm against update_allocations.
431 | 436 | allocation_lock = None |
# Lock that serializes recreate_network requests. It stays None until the first
# request, where recreate_network replaces it with an asyncio.Lock().
| 437 | +network_recreation_lock = None |
432 | 438 |
|
433 | 439 |
|
434 | 440 | async def update_allocations(request: web.Request): |
@@ -547,6 +553,130 @@ async def update_allocations(request: web.Request): |
547 | 553 | ) |
548 | 554 |
|
549 | 555 |
|
async def recreate_network(request: web.Request):
    """Recreate the nftables setup for the CRN and for every running VM.

    The endpoint rebuilds the firewall state from scratch:

    1. Collect the running VMs of the pool that have a tap interface.
    2. Remove every aleph-related chain (VM-specific and supervisor chains).
    3. Re-initialize the base nftables setup.
    4. Recreate the VM-specific chains and rules for the collected VMs.
    5. Restore the port forwarding rules of the instances whose network was
       recreated.

    It is meant for situations where the rules have become duplicated or
    inconsistent, or where chains exist on the host that are no longer
    tracked by the software.

    Concurrent calls are serialized with a module-level lock. The operation
    is NOT transactional: if step 2, 3 or 4 raises, the handler returns
    immediately and the firewall is left in whatever intermediate state was
    reached (for example, chains removed but not recreated).

    Args:
        request: The incoming request. It must pass
            ``authenticate_api_request`` and its application must expose the
            VM pool under the ``"vm_pool"`` key.

    Returns:
        - ``HTTPUnauthorized`` (401) when authentication fails.
        - A JSON response with status 500 and the keys ``success`` (False)
          and ``error`` when chain removal, nftables initialization or VM
          network recreation raises.
        - Otherwise a JSON response with status 200 when no VM failed, or
          207 when at least one VM failed, containing:
            - success: whether all VMs were successfully recreated
            - removed_chains_count: number of chains that were removed
            - removed_chains: names of the chains that were removed
            - recreated_count: number of VMs successfully recreated
            - failed_count: number of VMs that failed to recreate
            - recreated_vms: hashes of the VMs that were recreated
            - failed_vms: the failed VMs as reported by
              ``recreate_network_for_vms``

        Chains that could not be removed and port forwarding errors are only
        logged; they do not affect ``success`` or the status code.
    """
    if not authenticate_api_request(request):
        return web.HTTPUnauthorized(text="Authentication token received is invalid")

    # The lock is created lazily, on the first request. There is no await
    # between the check and the assignment, so two requests cannot both
    # create one.
    global network_recreation_lock
    if network_recreation_lock is None:
        network_recreation_lock = asyncio.Lock()

    pool: VmPool = request.app["vm_pool"]

    async with network_recreation_lock:
        logger.info("Starting network recreation process")

        # Step 1: Collect all running VMs and their network configuration.
        # This is done before touching the firewall so the set of VMs to
        # restore is fixed for the rest of the operation.
        running_vms = []
        for vm_hash, execution in pool.executions.items():
            if execution.is_running and execution.vm and execution.vm.tap_interface:
                running_vms.append(
                    {
                        "vm_hash": vm_hash,
                        "vm_id": execution.vm.vm_id,
                        "tap_interface": execution.vm.tap_interface,
                        "execution": execution,
                    }
                )
                logger.debug("Found running VM %s with vm_id=%s", vm_hash, execution.vm.vm_id)

        logger.info("Found %s running VMs to recreate network rules for", len(running_vms))

        # Step 2: Remove all aleph-related chains (VM-specific and supervisor chains)
        try:
            removed_chains, failed_removals = remove_all_aleph_chains()
        except Exception as e:
            # logger.exception keeps the traceback, which the message alone loses.
            logger.exception("Error removing aleph chains: %s", e)
            return web.json_response(
                {"success": False, "error": f"Failed to remove existing chains: {str(e)}"},
                status=500,
            )

        # Individual removal failures are not fatal: they are reported in the
        # logs and the recreation carries on.
        if failed_removals:
            logger.warning("Failed to remove %s chains", len(failed_removals))
            for chain_name, error in failed_removals:
                logger.warning(" - %s: %s", chain_name, error)

        # Step 3: Re-initialize the base network setup
        logger.info("Re-initializing nftables")
        try:
            initialize_nftables()
        except Exception as e:
            logger.exception("Error initializing nftables: %s", e)
            return web.json_response(
                {"success": False, "error": f"Failed to initialize network: {str(e)}"},
                status=500,
            )

        # Step 4: Recreate VM-specific chains and rules
        try:
            recreated_vms, failed_vms = recreate_network_for_vms(running_vms)
        except Exception as e:
            logger.exception("Error recreating VM networks: %s", e)
            return web.json_response(
                {"success": False, "error": f"Failed to recreate VM networks: {str(e)}"},
                status=500,
            )

        # Step 5: Recreate port forwarding rules for instances
        logger.info("Recreating port forwarding rules for instances")
        for vm_info in running_vms:
            execution = vm_info["execution"]
            # Only instances whose network was recreated get their port redirects back.
            if not (execution.is_instance and str(vm_info["vm_hash"]) in recreated_vms):
                continue
            try:
                await execution.fetch_port_redirect_config_and_setup()
            except Exception as e:
                # Don't add to failed_vms as the VM network itself was created successfully
                logger.exception("Error recreating port redirects for VM %s: %s", vm_info["vm_hash"], e)
            else:
                logger.debug("Recreated port redirects for instance %s", vm_info["vm_hash"])

        logger.info(
            "Network recreation complete. Removed chains: %s, Recreated VMs: %s, Failed: %s",
            len(removed_chains),
            len(recreated_vms),
            len(failed_vms),
        )

        all_recreated = len(failed_vms) == 0
        return web.json_response(
            {
                "success": all_recreated,
                "removed_chains_count": len(removed_chains),
                "removed_chains": removed_chains,
                "recreated_count": len(recreated_vms),
                "failed_count": len(failed_vms),
                "recreated_vms": recreated_vms,
                "failed_vms": failed_vms,
            },
            # 207 (Multi-Status) signals a partial success: some VMs failed.
            status=200 if all_recreated else 207,
        )
| 678 | + |
| 679 | + |
550 | 680 | @cors_allow_all |
551 | 681 | async def notify_allocation(request: web.Request): |
552 | 682 | """Notify instance allocation, only used for Pay as you Go feature""" |
|
0 commit comments