diff --git a/doc/src/vpr/command_line_usage.rst b/doc/src/vpr/command_line_usage.rst
index b3d482e048..fe1b46f83b 100644
--- a/doc/src/vpr/command_line_usage.rst
+++ b/doc/src/vpr/command_line_usage.rst
@@ -907,6 +907,16 @@ If any of init_t, exit_t or alpha_t is specified, the user schedule, with a fixe
 
     **Default:** ``auto_bb``
 
+.. option:: --place_frequency {once | always}
+
+    Specifies how often placement is performed during the minimum channel width search.
+
+    ``once``: Placement is run only once at the beginning of the channel width search. This reduces runtime but may not benefit from congestion-aware optimizations.
+
+    ``always``: Placement is rerun for each channel width trial. This might improve routability at the cost of increased runtime.
+
+    **Default:** ``once``
+
 .. option:: --place_chan_width <int>
 
     Tells VPR how many tracks a channel of relative width 1 is expected to need to complete routing of this circuit.
@@ -1869,6 +1879,7 @@ The following options are only valid when the router is in timing-driven mode (t
     **Default:** ``0.5``
 
 .. option:: --router_initial_acc_cost_chan_congestion_weight <float>
+
     Weight applied to the excess channel utilization (above threshold) when computing the initial accumulated cost (acc_cost)of routing resources.
 
     Higher values make the router more sensitive to early congestion.
@@ -1907,7 +1918,7 @@ The following options are only valid when the router is in timing-driven mode (t
 
 .. option:: --router_first_iter_timing_report <file>
 
-    Name of the timing report file to generate after the first routing iteration completes (not generated if unspecfied).
+    Name of the timing report file to generate after the first routing iteration completes (not generated if unspecified).
 
 .. option:: --router_debug_net <int>
 
diff --git a/libs/libvtrutil/src/vtr_prefix_sum.h b/libs/libvtrutil/src/vtr_prefix_sum.h
index a9472fcd07..489dd487fe 100644
--- a/libs/libvtrutil/src/vtr_prefix_sum.h
+++ b/libs/libvtrutil/src/vtr_prefix_sum.h
@@ -92,7 +92,7 @@ class PrefixSum1D {
     /**
      * @brief Construct the 1D prefix sum from a vector.
      */
-    PrefixSum1D(std::vector<T> vals, T zero = T())
+    PrefixSum1D(const std::vector<T>& vals, T zero = T())
         : PrefixSum1D(
               vals.size(),
               [&](size_t x) noexcept {
diff --git a/vpr/src/analytical_place/ap_flow_enums.h b/vpr/src/analytical_place/ap_flow_enums.h
index da47927d5d..707a842ae8 100644
--- a/vpr/src/analytical_place/ap_flow_enums.h
+++ b/vpr/src/analytical_place/ap_flow_enums.h
@@ -27,7 +27,7 @@ enum class e_ap_analytical_solver {
  */
 enum class e_ap_partial_legalizer {
     BiPartitioning, ///< Partial Legalizer which forms minimum windows around dense regions and uses bipartitioning to spread blocks over windows.
-    FlowBased       ///> Partial Legalizer which flows blocks from overfilled bins to underfilled bins.
+    FlowBased       ///< Partial Legalizer which flows blocks from overfilled bins to underfilled bins.
 };
 
 /**
diff --git a/vpr/src/base/ShowSetup.cpp b/vpr/src/base/ShowSetup.cpp
index 2e8d36e5f2..858bd3198c 100644
--- a/vpr/src/base/ShowSetup.cpp
+++ b/vpr/src/base/ShowSetup.cpp
@@ -495,20 +495,16 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
 static void ShowPlacerOpts(const t_placer_opts& PlacerOpts) {
     VTR_LOG("PlacerOpts.place_freq: ");
     switch (PlacerOpts.place_freq) {
-        case PLACE_ONCE:
-            VTR_LOG("PLACE_ONCE\n");
+        case e_place_freq::ONCE:
+            VTR_LOG("ONCE\n");
             break;
-        case PLACE_ALWAYS:
-            VTR_LOG("PLACE_ALWAYS\n");
-            break;
-        case PLACE_NEVER:
-            VTR_LOG("PLACE_NEVER\n");
+        case e_place_freq::ALWAYS:
+            VTR_LOG("ALWAYS\n");
             break;
         default:
             VTR_LOG_ERROR("Unknown Place Freq\n");
     }
-    if ((PLACE_ONCE == PlacerOpts.place_freq)
-        || (PLACE_ALWAYS == PlacerOpts.place_freq)) {
+    if (PlacerOpts.place_freq == e_place_freq::ONCE || PlacerOpts.place_freq == e_place_freq::ALWAYS) {
         VTR_LOG("PlacerOpts.place_algorithm: ");
         switch (PlacerOpts.place_algorithm.get()) {
             case e_place_algorithm::BOUNDING_BOX_PLACE:
diff --git a/vpr/src/base/place_and_route.cpp b/vpr/src/base/place_and_route.cpp
index 138d411539..6381eb39dc 100644
--- a/vpr/src/base/place_and_route.cpp
+++ b/vpr/src/base/place_and_route.cpp
@@ -167,7 +167,7 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list,
             break;
         }
 
-        if (placer_opts.place_freq == PLACE_ALWAYS) {
+        if (placer_opts.place_freq == e_place_freq::ALWAYS) {
             placer_opts.place_chan_width = current;
             try_place(placement_net_list,
                       placer_opts,
@@ -312,7 +312,7 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list,
             fflush(stdout);
             if (current < 1)
                 break;
-            if (placer_opts.place_freq == PLACE_ALWAYS) {
+            if (placer_opts.place_freq == e_place_freq::ALWAYS) {
                 placer_opts.place_chan_width = current;
                 try_place(placement_net_list, placer_opts, router_opts, analysis_opts, noc_opts,
                           arch->Chans, det_routing_arch, segment_inf,
@@ -341,7 +341,7 @@ int binary_search_place_and_route(const Netlist<>& placement_net_list,
                              route_ctx.clb_opins_used_locally,
                              saved_clb_opins_used_locally);
 
-                if (placer_opts.place_freq == PLACE_ALWAYS) {
+                if (placer_opts.place_freq == e_place_freq::ALWAYS) {
                     auto& cluster_ctx = g_vpr_ctx.clustering();
                     // Cluster-based net_list is used for placement
                     std::string placement_id = print_place(filename_opts.NetFile.c_str(), cluster_ctx.clb_nlist.netlist_id().c_str(),
@@ -417,7 +417,7 @@ t_chan_width setup_chan_width(const t_router_opts& router_opts,
     if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) {
         auto& device_ctx = g_vpr_ctx.device();
 
-        auto type = find_most_common_tile_type(device_ctx.grid);
+        t_physical_tile_type_ptr type = find_most_common_tile_type(device_ctx.grid);
 
         width_fac = 4 * type->num_pins;
         // this is 2x the value that binary search starts
diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp
index 57c449b6d2..e13c7768b1 100644
--- a/vpr/src/base/read_options.cpp
+++ b/vpr/src/base/read_options.cpp
@@ -93,6 +93,7 @@ struct ParseArchFormat {
         return {"vtr", "fpga-interchange"};
     }
 };
+
 struct ParseCircuitFormat {
     ConvertedValue<e_circuit_format> from_str(const std::string& str) {
         ConvertedValue<e_circuit_format> conv_value;
@@ -618,6 +619,37 @@ struct ParsePlaceBoundingBox {
     }
 };
 
+struct ParsePlacementFreq {
+    ConvertedValue<e_place_freq> from_str(const std::string& str) {
+        ConvertedValue<e_place_freq> conv_value;
+        if (str == "once") {
+            conv_value.set_value(e_place_freq::ONCE);
+        } else if (str == "always") {
+            conv_value.set_value(e_place_freq::ALWAYS);
+        } else {
+            std::stringstream msg;
+            msg << "Invalid conversion from '" << str << "' to e_place_freq (expected one of: " << argparse::join(default_choices(), ", ") << ")";
+            conv_value.set_error(msg.str());
+        }
+        return conv_value;
+    }
+
+    ConvertedValue<std::string> to_str(e_place_freq val) {
+        ConvertedValue<std::string> conv_value;
+        if (val == e_place_freq::ONCE) {
+            conv_value.set_value("once");
+        } else {
+            VTR_ASSERT(val == e_place_freq::ALWAYS);
+            conv_value.set_value("always");
+        }
+        return conv_value;
+    }
+
+    std::vector<std::string> default_choices() {
+        return {"once", "always"};
+    }
+};
+
 struct ParsePlaceAgentAlgorithm {
     ConvertedValue<e_agent_algorithm> from_str(const std::string& str) {
         ConvertedValue<e_agent_algorithm> conv_value;
@@ -2177,7 +2209,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
 
     auto& place_grp = parser.add_argument_group("placement options");
 
-    place_grp.add_argument(args.Seed, "--seed")
+    place_grp.add_argument(args.seed, "--seed")
         .help("Placement random number generator seed")
         .default_value("1")
         .show_in(argparse::ShowIn::HELP_ONLY);
@@ -2195,7 +2227,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .default_value("astar")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument(args.PlaceInnerNum, "--inner_num")
+    place_grp.add_argument(args.place_inner_num, "--inner_num")
         .help("Controls number of moves per temperature: inner_num * num_blocks ^ (4/3)")
         .default_value("0.5")
         .show_in(argparse::ShowIn::HELP_ONLY);
@@ -2226,17 +2258,17 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .default_value("1.0")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument(args.PlaceInitT, "--init_t")
+    place_grp.add_argument(args.place_init_t, "--init_t")
         .help("Initial temperature for manual annealing schedule")
         .default_value("100.0")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument(args.PlaceExitT, "--exit_t")
+    place_grp.add_argument(args.place_exit_t, "--exit_t")
         .help("Temperature at which annealing which terminate for manual annealing schedule")
         .default_value("0.01")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument(args.PlaceAlphaT, "--alpha_t")
+    place_grp.add_argument(args.place_alpha_t, "--alpha_t")
         .help(
             "Temperature scaling factor for manual annealing schedule."
             " Old temperature is multiplied by alpha_t")
@@ -2259,7 +2291,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .default_value("")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument<e_place_algorithm, ParsePlaceAlgorithm>(args.PlaceAlgorithm, "--place_algorithm")
+    place_grp.add_argument<e_place_algorithm, ParsePlaceAlgorithm>(args.place_algorithm, "--place_algorithm")
         .help(
             "Controls which placement algorithm is used. Valid options:\n"
             " * bounding_box: Focuses purely on minimizing the bounding box wirelength of the circuit. Turns off timing analysis if specified.\n"
@@ -2269,7 +2301,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .choices({"bounding_box", "criticality_timing", "slack_timing"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument<e_place_algorithm, ParsePlaceAlgorithm>(args.PlaceQuenchAlgorithm, "--place_quench_algorithm")
+    place_grp.add_argument<e_place_algorithm, ParsePlaceAlgorithm>(args.place_quench_algorithm, "--place_quench_algorithm")
         .help(
             "Controls which placement algorithm is used during placement quench.\n"
             "If specified, it overrides the option --place_algorithm during placement quench.\n"
@@ -2281,7 +2313,7 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
         .choices({"bounding_box", "criticality_timing", "slack_timing"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_grp.add_argument(args.PlaceChanWidth, "--place_chan_width")
+    place_grp.add_argument(args.place_chan_width, "--place_chan_width")
         .help(
             "Sets the assumed channel width during placement. "
             "If --place_chan_width is unspecified, but --route_chan_width is specified the "
@@ -2334,15 +2366,21 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
             "Specifies the type of bounding box to be used in 3D architectures.\n"
             "\n"
             "MODE options:\n"
-            "  auto_bb     : Automatically determine the appropriate bounding box based on the connections between layers.\n"
-            "  cube_bb            : Use 3D bounding boxes.\n"
-            "  per_layer_bb     : Use per-layer bounding boxes.\n"
+            "  auto_bb      : Automatically determine the appropriate bounding box based on the connections between layers.\n"
+            "  cube_bb      : Use 3D bounding boxes.\n"
+            "  per_layer_bb : Use per-layer bounding boxes.\n"
             "\n"
             "Choose one of the available modes to define the behavior of bounding boxes in your 3D architecture. The default mode is 'automatic'.")
         .default_value("auto_bb")
         .choices({"auto_bb", "cube_bb", "per_layer_bb"})
         .show_in(argparse::ShowIn::HELP_ONLY);
 
+    place_grp.add_argument<e_place_freq, ParsePlacementFreq>(args.place_placement_freq, "--place_frequency")
+        .help("Run placement every time or only once during channel width search.")
+        .default_value("once")
+        .choices({"once", "always"})
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
     place_grp.add_argument<bool, ParseOnOff>(args.RL_agent_placement, "--RL_agent_placement")
         .help(
             "Uses a Reinforcement Learning (RL) agent in choosing the appropriate move type in placement."
@@ -2483,14 +2521,30 @@ argparse::ArgumentParser create_arg_parser(const std::string& prog_name, t_optio
 
     auto& place_timing_grp = parser.add_argument_group("timing-driven placement options");
 
-    place_timing_grp.add_argument(args.PlaceTimingTradeoff, "--timing_tradeoff")
+    place_timing_grp.add_argument(args.place_timing_tradeoff, "--timing_tradeoff")
         .help(
             "Trade-off control between delay and wirelength during placement."
             " 0.0 focuses completely on wirelength, 1.0 completely on timing")
         .default_value("0.5")
         .show_in(argparse::ShowIn::HELP_ONLY);
 
-    place_timing_grp.add_argument(args.RecomputeCritIter, "--recompute_crit_iter")
+    place_timing_grp.add_argument(args.place_congestion_factor, "--congestion_factor")
+        .help("Weighting factor for congestion cost during placement. "
+              "Higher values prioritize congestion avoidance over bounding box and timing costs. "
+              "When set to zero, congestion modeling and optimization is disabled in the placement stage.")
+        .default_value("0.0")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    place_timing_grp.add_argument(args.place_congestion_rlim_trigger_ratio, "--congestion_rlim_trigger_ratio")
+        .help("Enables congestion modeling when the ratio of the current range limit to the initial range limit falls below this threshold, "
+              "provided the congestion weighting factor is non-zero.")
+        .default_value("1.0")
+        .show_in(argparse::ShowIn::HELP_ONLY);
+
+    place_timing_grp.add_argument(args.place_congestion_chan_util_threshold, "--congestion_chan_util_threshold")
+        .help("Penalizes nets in placement whose average routing channel utilization within their bounding boxes exceeds this threshold.");
+
+    place_timing_grp.add_argument(args.recompute_crit_iter, "--recompute_crit_iter")
         .help("Controls how many temperature updates occur between timing analysis during placement")
         .default_value("1")
         .show_in(argparse::ShowIn::HELP_ONLY);
@@ -3449,11 +3503,11 @@ void set_conditional_defaults(t_options& args) {
      */
 
     //Which placement algorithm to use?
-    if (args.PlaceAlgorithm.provenance() != Provenance::SPECIFIED) {
+    if (args.place_algorithm.provenance() != Provenance::SPECIFIED) {
         if (args.timing_analysis) {
-            args.PlaceAlgorithm.set(e_place_algorithm::CRITICALITY_TIMING_PLACE, Provenance::INFERRED);
+            args.place_algorithm.set(e_place_algorithm::CRITICALITY_TIMING_PLACE, Provenance::INFERRED);
         } else {
-            args.PlaceAlgorithm.set(e_place_algorithm::BOUNDING_BOX_PLACE, Provenance::INFERRED);
+            args.place_algorithm.set(e_place_algorithm::BOUNDING_BOX_PLACE, Provenance::INFERRED);
         }
     }
 
@@ -3467,7 +3521,7 @@ void set_conditional_defaults(t_options& args) {
     // Check for correct options combinations
     // If you are running WLdriven placement, the RL reward function should be
     // either basic or nonPenalizing basic
-    if (args.RL_agent_placement && (args.PlaceAlgorithm == e_place_algorithm::BOUNDING_BOX_PLACE || !args.timing_analysis)) {
+    if (args.RL_agent_placement && (args.place_algorithm == e_place_algorithm::BOUNDING_BOX_PLACE || !args.timing_analysis)) {
         if (args.place_reward_fun.value() != "basic" && args.place_reward_fun.value() != "nonPenalizing_basic") {
             VTR_LOG_WARN(
                 "To use RLPlace for WLdriven placements, the reward function should be basic or nonPenalizing_basic.\n"
@@ -3478,18 +3532,18 @@ void set_conditional_defaults(t_options& args) {
     }
 
     //Which placement algorithm to use during placement quench?
-    if (args.PlaceQuenchAlgorithm.provenance() != Provenance::SPECIFIED) {
-        args.PlaceQuenchAlgorithm.set(args.PlaceAlgorithm, Provenance::INFERRED);
+    if (args.place_quench_algorithm.provenance() != Provenance::SPECIFIED) {
+        args.place_quench_algorithm.set(args.place_algorithm, Provenance::INFERRED);
     }
 
     //Place chan width follows Route chan width if unspecified
-    if (args.PlaceChanWidth.provenance() != Provenance::SPECIFIED && args.RouteChanWidth.provenance() == Provenance::SPECIFIED) {
-        args.PlaceChanWidth.set(args.RouteChanWidth.value(), Provenance::INFERRED);
+    if (args.place_chan_width.provenance() != Provenance::SPECIFIED && args.RouteChanWidth.provenance() == Provenance::SPECIFIED) {
+        args.place_chan_width.set(args.RouteChanWidth.value(), Provenance::INFERRED);
     }
 
     //Do we calculate timing info during placement?
-    if (args.ShowPlaceTiming.provenance() != Provenance::SPECIFIED) {
-        args.ShowPlaceTiming.set(args.timing_analysis, Provenance::INFERRED);
+    if (args.show_place_timing.provenance() != Provenance::SPECIFIED) {
+        args.show_place_timing.set(args.timing_analysis, Provenance::INFERRED);
     }
 
     //Slave quench recompute divider of inner loop recompute divider unless specified
@@ -3498,9 +3552,9 @@ void set_conditional_defaults(t_options& args) {
     }
 
     //Which schedule?
-    if (args.PlaceInitT.provenance() == Provenance::SPECIFIED // Any of these flags select a manual schedule
-        || args.PlaceExitT.provenance() == Provenance::SPECIFIED
-        || args.PlaceAlphaT.provenance() == Provenance::SPECIFIED) {
+    if (args.place_init_t.provenance() == Provenance::SPECIFIED // Any of these flags select a manual schedule
+        || args.place_exit_t.provenance() == Provenance::SPECIFIED
+        || args.place_alpha_t.provenance() == Provenance::SPECIFIED) {
         args.anneal_sched_type.set(e_sched_type::USER_SCHED, Provenance::INFERRED);
     } else {
         args.anneal_sched_type.set(e_sched_type::AUTO_SCHED, Provenance::INFERRED); // Otherwise use the automatic schedule
diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h
index f846867af7..2c309253b4 100644
--- a/vpr/src/base/read_options.h
+++ b/vpr/src/base/read_options.h
@@ -9,7 +9,7 @@
 #include "argparse.hpp"
 
 struct t_options {
-    /* File names */
+    // File names
     argparse::ArgValue<std::string> ArchFile;
     argparse::ArgValue<std::string> CircuitName;
     argparse::ArgValue<std::string> NetFile;
@@ -49,7 +49,7 @@ struct t_options {
 
     argparse::ArgValue<std::string> write_block_usage;
 
-    /* Stage Options */
+    // Stage Options
     argparse::ArgValue<bool> do_packing;
     argparse::ArgValue<bool> do_legalize;
     argparse::ArgValue<bool> do_placement;
@@ -58,13 +58,13 @@ struct t_options {
     argparse::ArgValue<bool> do_analysis;
     argparse::ArgValue<bool> do_power;
 
-    /* Graphics Options */
+    // Graphics Options
     argparse::ArgValue<bool> show_graphics; ///<Enable argparse::ArgValue<int>eractive graphics?
     argparse::ArgValue<int> GraphPause;
     argparse::ArgValue<bool> save_graphics;
     argparse::ArgValue<std::string> graphics_commands;
 
-    /* General options */
+    // General options
     argparse::ArgValue<bool> show_help;
     argparse::ArgValue<bool> show_version;
     argparse::ArgValue<bool> show_arch_resources;
@@ -86,11 +86,11 @@ struct t_options {
     argparse::ArgValue<bool> allow_dangling_combinational_nodes;
     argparse::ArgValue<bool> terminate_if_timing_fails;
 
-    /* Server options */
+    // Server options
     argparse::ArgValue<bool> is_server_mode_enabled;
     argparse::ArgValue<int> server_port_num;
 
-    /* Atom netlist options */
+    // Atom netlist options
     argparse::ArgValue<bool> absorb_buffer_luts;
     argparse::ArgValue<e_const_gen_inference> const_gen_inference;
     argparse::ArgValue<bool> sweep_dangling_primary_ios;
@@ -99,7 +99,7 @@ struct t_options {
     argparse::ArgValue<bool> sweep_constant_primary_outputs;
     argparse::ArgValue<int> netlist_verbosity;
 
-    /* Analytical Placement options */
+    // Analytical Placement options
     argparse::ArgValue<e_ap_analytical_solver> ap_analytical_solver;
     argparse::ArgValue<e_ap_partial_legalizer> ap_partial_legalizer;
     argparse::ArgValue<e_ap_full_legalizer> ap_full_legalizer;
@@ -111,7 +111,7 @@ struct t_options {
     argparse::ArgValue<int> ap_high_fanout_threshold;
     argparse::ArgValue<bool> ap_generate_mass_report;
 
-    /* Clustering options */
+    // Clustering options
     argparse::ArgValue<bool> connection_driven_clustering;
     argparse::ArgValue<e_unrelated_clustering> allow_unrelated_clustering;
     argparse::ArgValue<float> timing_gain_weight;
@@ -126,19 +126,20 @@ struct t_options {
     argparse::ArgValue<int> pack_feasible_block_array_size;
     argparse::ArgValue<std::vector<std::string>> pack_high_fanout_threshold;
     argparse::ArgValue<int> pack_verbosity;
-    /* Placement options */
-    argparse::ArgValue<int> Seed;
-    argparse::ArgValue<bool> ShowPlaceTiming;
-    argparse::ArgValue<float> PlaceInnerNum;
+
+    // Placement options
+    argparse::ArgValue<int> seed;
+    argparse::ArgValue<bool> show_place_timing;
+    argparse::ArgValue<float> place_inner_num;
     argparse::ArgValue<float> place_auto_init_t_scale;
-    argparse::ArgValue<float> PlaceInitT;
-    argparse::ArgValue<float> PlaceExitT;
-    argparse::ArgValue<float> PlaceAlphaT;
+    argparse::ArgValue<float> place_init_t;
+    argparse::ArgValue<float> place_exit_t;
+    argparse::ArgValue<float> place_alpha_t;
     argparse::ArgValue<e_sched_type> anneal_sched_type;
-    argparse::ArgValue<e_place_algorithm> PlaceAlgorithm;
-    argparse::ArgValue<e_place_algorithm> PlaceQuenchAlgorithm;
+    argparse::ArgValue<e_place_algorithm> place_algorithm;
+    argparse::ArgValue<e_place_algorithm> place_quench_algorithm;
     argparse::ArgValue<e_pad_loc_type> pad_loc_type;
-    argparse::ArgValue<int> PlaceChanWidth;
+    argparse::ArgValue<int> place_chan_width;
     argparse::ArgValue<float> place_rlim_escape_fraction;
     argparse::ArgValue<std::string> place_move_stats_file;
     argparse::ArgValue<int> placement_saves_per_temperature;
@@ -147,6 +148,7 @@ struct t_options {
     argparse::ArgValue<std::vector<float>> place_static_move_prob;
     argparse::ArgValue<int> place_high_fanout_net;
     argparse::ArgValue<e_place_bounding_box_mode> place_bounding_box_mode;
+    argparse::ArgValue<e_place_freq> place_placement_freq;
 
     argparse::ArgValue<bool> RL_agent_placement;
     argparse::ArgValue<bool> place_agent_multistate;
@@ -167,7 +169,7 @@ struct t_options {
     argparse::ArgValue<int> placer_debug_block;
     argparse::ArgValue<int> placer_debug_net;
 
-    /*NoC Options*/
+    // NoC Options
     argparse::ArgValue<bool> noc;
     argparse::ArgValue<std::string> noc_flows_file;
     argparse::ArgValue<std::string> noc_routing_algorithm;
@@ -185,9 +187,12 @@ struct t_options {
     argparse::ArgValue<bool> noc_sat_routing_log_search_progress;
     argparse::ArgValue<std::string> noc_placement_file_name;
 
-    /* Timing-driven placement options only */
-    argparse::ArgValue<float> PlaceTimingTradeoff;
-    argparse::ArgValue<int> RecomputeCritIter;
+    // Timing-driven placement options only
+    argparse::ArgValue<float> place_congestion_factor;
+    argparse::ArgValue<float> place_congestion_rlim_trigger_ratio;
+    argparse::ArgValue<float> place_congestion_chan_util_threshold;
+    argparse::ArgValue<float> place_timing_tradeoff;
+    argparse::ArgValue<int> recompute_crit_iter;
     argparse::ArgValue<int> inner_loop_recompute_divider;
     argparse::ArgValue<int> quench_recompute_divider;
     argparse::ArgValue<float> place_exp_first;
@@ -202,7 +207,7 @@ struct t_options {
     argparse::ArgValue<e_reducer> place_delay_model_reducer;
     argparse::ArgValue<std::string> allowed_tiles_for_delay_model;
 
-    /* Router Options */
+    // Router Options
     argparse::ArgValue<bool> check_rr_graph;
     argparse::ArgValue<int> max_router_iterations;
     argparse::ArgValue<float> first_iter_pres_fac;
@@ -232,7 +237,7 @@ struct t_options {
     argparse::ArgValue<int> route_verbosity;
     argparse::ArgValue<int> custom_3d_sb_fanin_fanout;
 
-    /* Timing-driven router options only */
+    // Timing-driven router options only
     argparse::ArgValue<float> astar_fac;
     argparse::ArgValue<float> astar_offset;
     argparse::ArgValue<float> router_profiler_astar_fac;
@@ -267,7 +272,7 @@ struct t_options {
     argparse::ArgValue<e_router_initial_timing> router_initial_timing;
     argparse::ArgValue<e_heap_type> router_heap;
 
-    /* Analysis options */
+    // Analysis options
     argparse::ArgValue<bool> full_stats;
     argparse::ArgValue<bool> Generate_Post_Synthesis_Netlist;
     argparse::ArgValue<bool> Generate_Post_Implementation_Merged_Netlist;
diff --git a/vpr/src/base/setup_vpr.cpp b/vpr/src/base/setup_vpr.cpp
index 4f2cc967bc..cb0bd5b2b7 100644
--- a/vpr/src/base/setup_vpr.cpp
+++ b/vpr/src/base/setup_vpr.cpp
@@ -27,23 +27,61 @@
 #include "setup_vib_utils.h"
 
 static void setup_netlist_opts(const t_options& Options, t_netlist_opts& NetlistOpts);
+
+/**
+ * @brief Sets up the t_ap_opts structure based on users inputs and
+ *        on the architecture specified.
+ *
+ * Error checking, such as checking for conflicting params is assumed
+ * to be done beforehand
+ */
 static void setup_ap_opts(const t_options& options,
                           t_ap_opts& apOpts);
+
+/**
+ * @brief Sets up the t_packer_opts structure based on users inputs and
+ *        on the architecture specified.
+ *
+ * Error checking, such as checking for conflicting params is assumed
+ * to be done beforehand
+ */
 static void setup_packer_opts(const t_options& Options,
                               t_packer_opts* PackerOpts);
+
+/**
+ * @brief Sets up the s_placer_opts structure based on users input.
+ *
+ * Error checking, such as checking for conflicting params
+ * is assumed to be done beforehand
+ */
 static void setup_placer_opts(const t_options& Options,
                               t_placer_opts* PlacerOpts);
 static void setup_anneal_sched(const t_options& Options,
                                t_annealing_sched* AnnealSched);
 static void setup_router_opts(const t_options& Options, t_router_opts* RouterOpts);
+
+/**
+ * Go through all the NoC options supplied by the user and store them internally.
+ */
 static void setup_noc_opts(const t_options& Options,
                            t_noc_opts* NocOpts);
+
 static void setup_server_opts(const t_options& Options,
                               t_server_opts* ServerOpts);
 
+/**
+ * @brief Sets up routing structures.
+ *
+ * Since checks are already done, this just copies values across
+ */
 static void setup_routing_arch(const t_arch& Arch, t_det_routing_arch& RoutingArch);
 
 static void setup_timing(const t_options& Options, const bool TimingEnabled, t_timing_inf* Timing);
+
+/**
+ * @brief This loads up VPR's arch_switch_inf data by combining the switches
+ *        from the arch file with the special switches that VPR needs.
+ */
 static void setup_switches(const t_arch& Arch,
                            t_det_routing_arch& RoutingArch,
                            const std::vector<t_arch_switch_inf>& arch_switches);
@@ -72,22 +110,12 @@ static void add_intra_tile_switches();
 
 /**
  * Identify the pins that can directly reach class_inf
- * @param physical_tile
- * @param logical_block
- * @param class_inf
- * @param physical_class_num
  */
 static void do_reachability_analysis(t_physical_tile_type* physical_tile,
                                      t_logical_block_type* logical_block,
                                      t_class* class_inf,
                                      int physical_class_num);
 
-/**
- * @brief Sets VPR parameters and defaults.
- *
- * Does not do any error checking as this should have been done by
- * the various input checkers
- */
 void SetupVPR(const t_options* options,
               const bool timingenabled,
               const bool readArchFile,
@@ -306,7 +334,7 @@ void SetupVPR(const t_options* options,
 
     ShowSetup(*vpr_setup);
 
-    /* init global variables */
+    // init global variables
     vtr::out_file_prefix = options->out_file_prefix;
 
     {
@@ -348,7 +376,7 @@ void SetupVPR(const t_options* options,
 }
 
 static void setup_timing(const t_options& Options, const bool TimingEnabled, t_timing_inf* Timing) {
-    /* Don't do anything if they don't want timing */
+    // Don't do anything if they don't want timing
     if (!TimingEnabled) {
         Timing->timing_analysis_enabled = false;
         return;
@@ -358,10 +386,6 @@ static void setup_timing(const t_options& Options, const bool TimingEnabled, t_t
     Timing->SDCFile = Options.SDCFile;
 }
 
-/**
- * @brief This loads up VPR's arch_switch_inf data by combining the switches
- *        from the arch file with the special switches that VPR needs.
- */
 static void setup_switches(const t_arch& Arch,
                            t_det_routing_arch& RoutingArch,
                            const std::vector<t_arch_switch_inf>& arch_switches) {
@@ -372,10 +396,10 @@ static void setup_switches(const t_arch& Arch,
 
     find_ipin_cblock_switch_index(Arch, RoutingArch.wire_to_arch_ipin_switch, RoutingArch.wire_to_arch_ipin_switch_between_dice);
 
-    /* Depends on device_ctx.num_arch_switches */
+    // Depends on device_ctx.num_arch_switches
     RoutingArch.delayless_switch = num_arch_switches++;
 
-    /* Alloc the list now that we know the final num_arch_switches value */
+    // Alloc the list now that we know the final num_arch_switches value
     device_ctx.arch_switch_inf.resize(num_arch_switches);
     for (int iswitch = 0; iswitch < switches_to_copy; iswitch++) {
         device_ctx.arch_switch_inf[iswitch] = arch_switches[iswitch];
@@ -384,7 +408,7 @@ static void setup_switches(const t_arch& Arch,
         device_ctx.all_sw_inf[iswitch] = arch_switches[iswitch];
     }
 
-    /* Delayless switch for connecting sinks and sources with their pins. */
+    // Delayless switch for connecting sinks and sources with their pins.
     device_ctx.arch_switch_inf[RoutingArch.delayless_switch].set_type(SwitchType::MUX);
     device_ctx.arch_switch_inf[RoutingArch.delayless_switch].name = std::string(VPR_DELAYLESS_SWITCH_NAME);
     device_ctx.arch_switch_inf[RoutingArch.delayless_switch].R = 0.;
@@ -404,21 +428,15 @@ static void setup_switches(const t_arch& Arch,
 
     device_ctx.delayless_switch_idx = RoutingArch.delayless_switch;
 
-    //Warn about non-zero Cout values for the ipin switch, since these values have no effect.
-    //VPR do not model the R/C's of block internal routing connection.
-    //
-    //Note that we don't warn about the R value as it may be used to size the buffer (if buf_size_type is AUTO)
+    // Warn about non-zero Cout values for the ipin switch, since these values have no effect.
+    // VPR do not model the R/C's of block internal routing connection
+    // Note that we don't warn about the R value as it may be used to size the buffer (if buf_size_type is AUTO)
     if (device_ctx.arch_switch_inf[RoutingArch.wire_to_arch_ipin_switch].Cout != 0.) {
         VTR_LOG_WARN("Non-zero switch output capacitance (%g) has no effect when switch '%s' is used for connection block inputs\n",
                      device_ctx.arch_switch_inf[RoutingArch.wire_to_arch_ipin_switch].Cout, Arch.ipin_cblock_switch_name[0].c_str());
     }
 }
 
-/**
- * @brief Sets up routing structures.
- *
- * Since checks are already done, this just copies values across
- */
 static void setup_routing_arch(const t_arch& Arch,
                                t_det_routing_arch& RoutingArch) {
     RoutingArch.switch_block_type = Arch.sb_type;
@@ -432,10 +450,10 @@ static void setup_routing_arch(const t_arch& Arch,
         RoutingArch.directionality = Arch.Segments[0].directionality;
     }
 
-    /* copy over the switch block information */
+    // copy over the switch block information
     RoutingArch.switchblocks = Arch.switchblocks;
 
-    /* Copy the tileable routing setting */
+    // Copy the tileable routing setting
     RoutingArch.tileable = Arch.tileable;
     RoutingArch.perimeter_cb = Arch.perimeter_cb;
     RoutingArch.shrink_boundary = Arch.shrink_boundary;
@@ -542,17 +560,17 @@ static void setup_router_opts(const t_options& Options, t_router_opts* RouterOpt
 
 static void setup_anneal_sched(const t_options& Options,
                                t_annealing_sched* AnnealSched) {
-    AnnealSched->alpha_t = Options.PlaceAlphaT;
+    AnnealSched->alpha_t = Options.place_alpha_t;
     if (AnnealSched->alpha_t >= 1 || AnnealSched->alpha_t <= 0) {
         VPR_FATAL_ERROR(VPR_ERROR_OTHER, "alpha_t must be between 0 and 1 exclusive.\n");
     }
 
-    AnnealSched->exit_t = Options.PlaceExitT;
+    AnnealSched->exit_t = Options.place_exit_t;
     if (AnnealSched->exit_t <= 0) {
         VPR_FATAL_ERROR(VPR_ERROR_OTHER, "exit_t must be greater than 0.\n");
     }
 
-    AnnealSched->init_t = Options.PlaceInitT;
+    AnnealSched->init_t = Options.place_init_t;
     if (AnnealSched->init_t <= 0) {
         VPR_FATAL_ERROR(VPR_ERROR_OTHER, "init_t must be greater than 0.\n");
     }
@@ -561,7 +579,7 @@ static void setup_anneal_sched(const t_options& Options,
         VPR_FATAL_ERROR(VPR_ERROR_OTHER, "init_t must be greater or equal to than exit_t.\n");
     }
 
-    AnnealSched->inner_num = Options.PlaceInnerNum;
+    AnnealSched->inner_num = Options.place_inner_num;
     if (AnnealSched->inner_num <= 0) {
         VPR_FATAL_ERROR(VPR_ERROR_OTHER, "inner_num must be greater than 0.\n");
     }
@@ -569,15 +587,8 @@ static void setup_anneal_sched(const t_options& Options,
     AnnealSched->type = Options.anneal_sched_type;
 }
 
-/**
- * @brief Sets up the t_ap_opts structure based on users inputs and
- *        on the architecture specified.
- *
- * Error checking, such as checking for conflicting params is assumed
- * to be done beforehand
- */
-void setup_ap_opts(const t_options& options,
-                   t_ap_opts& apOpts) {
+static void setup_ap_opts(const t_options& options,
+                          t_ap_opts& apOpts) {
     apOpts.analytical_solver_type = options.ap_analytical_solver.value();
     apOpts.partial_legalizer_type = options.ap_partial_legalizer.value();
     apOpts.full_legalizer_type = options.ap_full_legalizer.value();
@@ -591,13 +602,6 @@ void setup_ap_opts(const t_options& options,
     apOpts.generate_mass_report = options.ap_generate_mass_report.value();
 }
 
-/**
- * @brief Sets up the t_packer_opts structure based on users inputs and
- *        on the architecture specified.
- *
- * Error checking, such as checking for conflicting params is assumed
- * to be done beforehand
- */
 void setup_packer_opts(const t_options& Options,
                        t_packer_opts* PackerOpts) {
     PackerOpts->output_file = Options.NetFile;
@@ -639,12 +643,6 @@ static void setup_netlist_opts(const t_options& Options, t_netlist_opts& Netlist
     NetlistOpts.netlist_verbosity = Options.netlist_verbosity;
 }
 
-/**
- * @brief Sets up the s_placer_opts structure based on users input.
- *
- * Error checking, such as checking for conflicting params
- * is assumed to be done beforehand
- */
 static void setup_placer_opts(const t_options& Options, t_placer_opts* PlacerOpts) {
     if (Options.do_placement) {
         PlacerOpts->doPlacement = e_stage_action::DO;
@@ -657,8 +655,8 @@ static void setup_placer_opts(const t_options& Options, t_placer_opts* PlacerOpt
 
     PlacerOpts->td_place_exp_last = Options.place_exp_last;
 
-    PlacerOpts->place_algorithm = Options.PlaceAlgorithm;
-    PlacerOpts->place_quench_algorithm = Options.PlaceQuenchAlgorithm;
+    PlacerOpts->place_algorithm = Options.place_algorithm;
+    PlacerOpts->place_quench_algorithm = Options.place_quench_algorithm;
 
     PlacerOpts->constraints_file = Options.constraints_file;
 
@@ -668,13 +666,16 @@ static void setup_placer_opts(const t_options& Options, t_placer_opts* PlacerOpt
 
     PlacerOpts->pad_loc_type = Options.pad_loc_type;
 
-    PlacerOpts->place_chan_width = Options.PlaceChanWidth;
+    PlacerOpts->place_chan_width = Options.place_chan_width;
 
-    PlacerOpts->recompute_crit_iter = Options.RecomputeCritIter;
+    PlacerOpts->recompute_crit_iter = Options.recompute_crit_iter;
 
-    PlacerOpts->timing_tradeoff = Options.PlaceTimingTradeoff;
+    PlacerOpts->timing_tradeoff = Options.place_timing_tradeoff;
+    PlacerOpts->congestion_factor = Options.place_congestion_factor;
+    PlacerOpts->congestion_rlim_trigger_ratio = Options.place_congestion_rlim_trigger_ratio;
+    PlacerOpts->congestion_chan_util_threshold = Options.place_congestion_chan_util_threshold;
 
-    /* Depends on PlacerOpts->place_algorithm */
+    // Depends on PlacerOpts->place_algorithm
     PlacerOpts->delay_offset = Options.place_delay_offset;
     PlacerOpts->delay_ramp_delta_threshold = Options.place_delay_ramp_delta_threshold;
     PlacerOpts->delay_ramp_slope = Options.place_delay_ramp_slope;
@@ -683,7 +684,7 @@ static void setup_placer_opts(const t_options& Options, t_placer_opts* PlacerOpt
     PlacerOpts->delay_model_type = Options.place_delay_model;
     PlacerOpts->delay_model_reducer = Options.place_delay_model_reducer;
 
-    PlacerOpts->place_freq = PLACE_ONCE; /* DEFAULT */
+    PlacerOpts->place_freq = Options.place_placement_freq;
 
     PlacerOpts->post_place_timing_report_file = Options.post_place_timing_report_file;
 
@@ -721,7 +722,7 @@ static void setup_placer_opts(const t_options& Options, t_placer_opts* PlacerOpt
     PlacerOpts->floorplan_num_vertical_partitions = Options.floorplan_num_vertical_partitions;
     PlacerOpts->place_quench_only = Options.place_quench_only;
 
-    PlacerOpts->seed = Options.Seed;
+    PlacerOpts->seed = Options.seed;
 
     PlacerOpts->placer_debug_block = Options.placer_debug_block;
     PlacerOpts->placer_debug_net = Options.placer_debug_net;
@@ -775,9 +776,6 @@ static void setup_power_opts(const t_options& Options, t_power_opts* power_opts,
     }
 }
 
-/*
- * Go through all the NoC options supplied by the user and store them internally.
- */
 static void setup_noc_opts(const t_options& Options, t_noc_opts* NocOpts) {
     // assign the noc specific options from the command line
     NocOpts->noc = Options.noc;
diff --git a/vpr/src/base/setup_vpr.h b/vpr/src/base/setup_vpr.h
index f72bb231bd..7364b8bb05 100644
--- a/vpr/src/base/setup_vpr.h
+++ b/vpr/src/base/setup_vpr.h
@@ -5,6 +5,12 @@
 #include "physical_types.h"
 #include "vpr_types.h"
 
+/**
+ * @brief Sets VPR parameters and defaults.
+ *
+ * Does not do any error checking as this should have been done by
+ * the various input checkers
+ */
 void SetupVPR(const t_options* Options,
               const bool TimingEnabled,
               const bool readArchFile,
diff --git a/vpr/src/base/stats.cpp b/vpr/src/base/stats.cpp
index 6718d67678..e1757f791f 100644
--- a/vpr/src/base/stats.cpp
+++ b/vpr/src/base/stats.cpp
@@ -33,23 +33,6 @@ static void load_channel_occupancies(const Netlist<>& net_list,
                                      vtr::Matrix<int>& chanx_occ,
                                      vtr::Matrix<int>& chany_occ);
 
-/**
- * @brief Writes channel occupancy data to a file.
- *
- * Each row contains:
- *   - (x, y) coordinate
- *   - Occupancy count
- *   - Occupancy percentage (occupancy / capacity)
- *   - Channel capacity
- *
- * @param filename      Output file path.
- * @param occupancy     Matrix of occupancy counts.
- * @param capacity_list List of channel capacities (per y for chanx, per x for chany).
- */
-static void write_channel_occupancy_table(const std::string_view filename,
-                                          const vtr::Matrix<int>& occupancy,
-                                          const std::vector<int>& capacity_list);
-
 /**
  * @brief Figures out maximum, minimum and average number of bends
  *        and net length in the routing.
@@ -175,7 +158,7 @@ void length_and_bends_stats(const Netlist<>& net_list, bool is_flat) {
     int num_clb_opins_reserved = 0;
     int num_absorbed_nets = 0;
 
-    for (auto net_id : net_list.nets()) {
+    for (ParentNetId net_id : net_list.nets()) {
         if (!net_list.net_is_ignored(net_id) && net_list.net_sinks(net_id).size() != 0) { /* Globals don't count. */
             int bends, length, segments;
             bool is_absorbed;
@@ -283,9 +266,9 @@ static void get_channel_occupancy_stats(const Netlist<>& net_list, bool /***/) {
     VTR_LOG("\n");
 }
 
-static void write_channel_occupancy_table(const std::string_view filename,
-                                          const vtr::Matrix<int>& occupancy,
-                                          const std::vector<int>& capacity_list) {
+void write_channel_occupancy_table(const std::string_view filename,
+                                   const vtr::Matrix<int>& occupancy,
+                                   const std::vector<int>& capacity_list) {
     constexpr int w_coord = 6;
     constexpr int w_value = 12;
     constexpr int w_percent = 12;
diff --git a/vpr/src/base/stats.h b/vpr/src/base/stats.h
index 08b9e4d1d1..1d4f118721 100644
--- a/vpr/src/base/stats.h
+++ b/vpr/src/base/stats.h
@@ -2,6 +2,8 @@
 
 #include <map>
 #include <vector>
+#include <string_view>
+
 #include "netlist.h"
 #include "rr_graph_type.h"
 
@@ -26,7 +28,7 @@ void routing_stats(const Netlist<>& net_list,
 /**
  * @brief Calculates the routing channel width at each grid location.
  *
- * Iterates through all RR nodes and counts how many wires pass through each (x, y) location
+ * Iterates through all RR nodes and counts how many wires pass through each (layer, x, y) location
  * for both horizontal (CHANX) and vertical (CHANY) channels.
  *
  * @return A pair of 3D matrices:
@@ -61,3 +63,22 @@ void print_resource_usage();
  * @param target_device_utilization The target device utilization set by the user
  */
 void print_device_utilization(const float target_device_utilization);
+
+/**
+ * @brief Writes channel occupancy data to a file.
+ *
+ * Each row contains:
+ *   - (x, y) coordinate
+ *   - Occupancy count
+ *   - Occupancy percentage (occupancy / capacity)
+ *   - Channel capacity
+ *
+ *   TODO: extend to 3D
+ *
+ * @param filename      Output file path.
+ * @param occupancy     Matrix of occupancy counts.
+ * @param capacity_list List of channel capacities (per y for chanx, per x for chany).
+ */
+void write_channel_occupancy_table(const std::string_view filename,
+                                   const vtr::Matrix<int>& occupancy,
+                                   const std::vector<int>& capacity_list);
diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp
index dcd0d2394c..17c6df6832 100644
--- a/vpr/src/base/vpr_api.cpp
+++ b/vpr/src/base/vpr_api.cpp
@@ -726,7 +726,7 @@ void vpr_load_packing(const t_vpr_setup& vpr_setup, const t_arch& arch) {
     auto& cluster_ctx = g_vpr_ctx.mutable_clustering();
     const AtomContext& atom_ctx = g_vpr_ctx.atom();
 
-    /* Ensure we have a clean start with void net remapping information */
+    // Ensure we have a clean start with void net remapping information
     cluster_ctx.post_routing_clb_pin_nets.clear();
     cluster_ctx.pre_routing_net_pin_mapping.clear();
 
@@ -735,7 +735,7 @@ void vpr_load_packing(const t_vpr_setup& vpr_setup, const t_arch& arch) {
                                          vpr_setup.FileNameOpts.verify_file_digests,
                                          vpr_setup.PackerOpts.pack_verbosity);
 
-    /* Load the mapping between clusters and their atoms */
+    // Load the mapping between clusters and their atoms
     init_clb_atoms_lookup(cluster_ctx.atoms_lookup, atom_ctx, cluster_ctx.clb_nlist);
 
     process_constant_nets(g_vpr_ctx.mutable_atom().mutable_netlist(),
@@ -749,14 +749,14 @@ void vpr_load_packing(const t_vpr_setup& vpr_setup, const t_arch& arch) {
         report_packing_pin_usage(ofs, g_vpr_ctx);
     }
 
-    // Ater the clustered netlist has been loaded, update the floorplanning
+    // After the clustered netlist has been loaded, update the floorplanning
     // constraints with the new information.
     g_vpr_ctx.mutable_floorplanning().update_floorplanning_context_post_pack();
 
-    /* Sanity check the resulting netlist */
+    // Sanity check the resulting netlist
     check_netlist(vpr_setup.PackerOpts.pack_verbosity);
 
-    // Independently verify the clusterings to ensure the clustering can be
+    // Independently verify the clustering to ensure the clustering can be
     // used for the rest of the VPR flow.
     unsigned num_errors = verify_clustering(g_vpr_ctx);
     if (num_errors == 0) {
@@ -768,7 +768,7 @@ void vpr_load_packing(const t_vpr_setup& vpr_setup, const t_arch& arch) {
                   num_errors);
     }
 
-    /* Output the netlist stats to console and optionally to file. */
+    // Output the netlist stats to console and optionally to file.
     writeClusteredNetlistStats(vpr_setup.FileNameOpts.write_block_usage);
 
     // print the total number of used physical blocks for each
diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h
index 41da9a6d08..7547348cad 100644
--- a/vpr/src/base/vpr_types.h
+++ b/vpr/src/base/vpr_types.h
@@ -375,23 +375,23 @@ constexpr int NUM_PL_MOVE_TYPES = 7;
 constexpr int NUM_PL_NONTIMING_MOVE_TYPES = 3;
 
 /* Timing data structures end */
+
+// Annealing schedule
 enum class e_sched_type {
     AUTO_SCHED,
     USER_SCHED
 };
-/* Annealing schedule */
 
+// What's on screen?
 enum pic_type {
     NO_PICTURE,
     PLACEMENT,
     ROUTING
 };
-/* What's on screen? */
 
-enum pfreq {
-    PLACE_NEVER,
-    PLACE_ONCE,
-    PLACE_ALWAYS
+enum class e_place_freq {
+    ONCE,
+    ALWAYS
 };
 
 ///@brief  Power data for t_netlist structure
@@ -968,85 +968,82 @@ enum class e_move_type;
 
 /**
  * @brief Various options for the placer.
- *
- *   @param place_algorithm
- *              Controls which placement algorithm is used.
- *   @param place_quench_algorithm
- *              Controls which placement algorithm is used
- *              during placement quench.
- *   @param timing_tradeoff
- *              When in CRITICALITY_TIMING_PLACE mode, what is the
- *              tradeoff between timing and wiring costs.
- *   @param place_chan_width
- *              The channel width assumed if only one placement is performed.
- *   @param pad_loc_type
- *              Are pins FREE or fixed randomly.
- *   @param constraints_file
- *              File that specifies locations of locked down (constrained)
- *              blocks for placement. Empty string means no constraints file.
- *   @param write_initial_place_file
- *              Write the initial placement into this file. Empty string means
- *              the initial placement is not written.
- *   @param pad_loc_file
- *              File to read pad locations from if pad_loc_type is USER.
- *   @param place_freq
- *              Should the placement be skipped, done once, or done
- *              for each channel width in the binary search. (Default: ONCE)
- *   @param recompute_crit_iter
- *              How many temperature stages pass before we recompute
- *              criticalities based on the current placement and its
- *              estimated point-to-point delays.
- *   @param inner_loop_crit_divider
- *              (move_lim/inner_loop_crit_divider) determines how
- *              many inner_loop iterations pass before a recompute
- *              of criticalities is done.
- *   @param td_place_exp_first
- *              Exponent that is used in the CRITICALITY_TIMING_PLACE
- *              mode to specify the initial value of `crit_exponent`.
- *              After we map the slacks to criticalities, this value
- *              is used to `sharpen` the criticalities, making connections
- *              with worse slacks more critical.
- *   @param td_place_exp_last
- *              Value that the crit_exponent will be at the end.
- *   @param doPlacement
- *              True if placement is supposed to be done in the CAD flow.
- *              False if otherwise.
- *   @param place_constraint_expand
- *              Integer value that specifies how far to expand the floorplan
- *              region when printing out floorplan constraints based on
- *              current placement.
- *   @param place_constraint_subtile
- *              True if subtiles should be specified when printing floorplan
- *              constraints. False if not.
- *   @param place_auto_init_t_scale
- *              When the annealer is using the automatic schedule, this option
- *              scales the initial temperature selected.
  */
 struct t_placer_opts {
+    /// Controls which placement algorithm is used.
     t_place_algorithm place_algorithm;
+
+    /// Controls which placement algorithm is used during placement quench.
     t_place_algorithm place_quench_algorithm;
-    t_annealing_sched anneal_sched; ///<Placement option annealing schedule
+
+    /// Placement option annealing schedule
+    t_annealing_sched anneal_sched;
+
+    /// When in CRITICALITY_TIMING_PLACE mode, what is the tradeoff between timing and wiring costs.
     float timing_tradeoff;
+
+    /// Weight for how much congestion affects placement cost.
+    /// Higher means congestion is more important.
+    float congestion_factor;
+    /// Start using congestion cost when (current rlim / initial rlim) drops below this value.
+    float congestion_rlim_trigger_ratio;
+    /// Nets with average channel usage (withing their bounding box) above this threshold
+    /// are predicted to face some congestion in the routing stage.
+    float congestion_chan_util_threshold;
+
+    /// The channel width assumed if only one placement is performed.
     int place_chan_width;
+
+    /// Are pins FREE or fixed randomly.
     enum e_pad_loc_type pad_loc_type;
+
+    /// File that specifies locations of locked down (constrained) blocks for placement. Empty string means no constraints file.
     std::string constraints_file;
+
+    /// Write the initial placement into this file. Empty string means the initial placement is not written.
     std::string write_initial_place_file;
+
     std::string read_initial_place_file;
-    enum pfreq place_freq;
+
+    /// Should the placement be skipped, done once, or done for each channel width in the binary search. (Default: ONCE)
+    e_place_freq place_freq;
+
+    /// How many temperature stages pass before we recompute criticalities
+    /// based on the current placement and its estimated point-to-point delays.
     int recompute_crit_iter;
+
+    /// (move_lim/inner_loop_crit_divider) determines how many inner_loop iterations pass before a recompute of criticalities is done.
     int inner_loop_recompute_divider;
+
     int quench_recompute_divider;
+
+    /**
+     * Exponent that is used in the CRITICALITY_TIMING_PLACE mode to specify the initial value of `crit_exponent`.
+     * After we map the slacks to criticalities, this value is used to `sharpen` the criticalities, making
+     * connections with worse slacks more critical.
+     */
     float td_place_exp_first;
+
     int seed;
+
+    /// Value that the crit_exponent will be at the end.
     float td_place_exp_last;
+
+    /// True if placement is supposed to be done in the CAD flow. False if otherwise.
     e_stage_action doPlacement;
+
     float rlim_escape_fraction;
+
     std::string move_stats_file;
+
     int placement_saves_per_temperature;
+
     e_place_effort_scaling effort_scaling;
+
     e_timing_update_type timing_update_type;
 
     PlaceDelayModelType delay_model_type;
+
     e_reducer delay_model_reducer;
 
     float delay_offset;
@@ -1061,23 +1058,39 @@ struct t_placer_opts {
 
     std::string write_placement_delay_lookup;
     std::string read_placement_delay_lookup;
+
     vtr::vector<e_move_type, float> place_static_move_prob;
+
     bool RL_agent_placement;
     bool place_agent_multistate;
     bool place_checkpointing;
+
     int place_high_fanout_net;
+
     e_place_bounding_box_mode place_bounding_box_mode;
+
     e_agent_algorithm place_agent_algorithm;
+
     float place_agent_epsilon;
     float place_agent_gamma;
     float place_dm_rlim;
+
     e_agent_space place_agent_space;
+
     std::string place_reward_fun;
+
     float place_crit_limit;
+
+    /// Integer value that specifies how far to expand the floorplan region when
+    /// printing out floorplan constraints based on current placement.
     int place_constraint_expand;
+
+    /// True if subtiles should be specified when printing floorplan constraints. False if not.
     bool place_constraint_subtile;
+
     int floorplan_num_horizontal_partitions;
     int floorplan_num_vertical_partitions;
+
     bool place_quench_only;
 
     int placer_debug_block;
@@ -1093,6 +1106,7 @@ struct t_placer_opts {
 
     e_place_delta_delay_algorithm place_delta_delay_matrix_calculation_method;
 
+    /// When the annealer is using the automatic schedule, this option scales the initial temperature selected.
     float place_auto_init_t_scale;
 };
 
@@ -1102,63 +1116,42 @@ struct t_placer_opts {
 
 /**
  * @brief Various options for the Analytical Placer.
- *
- *   @param doAnalyticalPlacement
- *              True if analytical placement is supposed to be done in the CAD
- *              flow. False if otherwise.
- *   @param analytical_solver_type
- *              The type of analytical solver the Global Placer in the AP flow
- *              will use.
- *   @param partial_legalizer_type
- *              The type of partial legalizer the Global Placer in the AP flow
- *              will use.
- *   @param full_legalizer_type
- *              The type of full legalizer the AP flow will use.
- *   @param detailed_placer_type
- *              The type of detailed placter the AP flow will use.
- *   @param ap_timing_tradeoff
- *              A trade-off parameter used to decide how focused the AP flow
- *              should be on optimizing timing over wirelength.
- *   @param ap_high_fanout_threshold;
- *              The threshold to ignore nets with higher fanout than that
- *              value while constructing the solver.
- *   @param ap_partial_legalizer_target_density
- *              Vector of strings passed by the user to configure the target
- *              density of different physical tiles on the device.
- *   @param appack_max_dist_th
- *              Array of string passed by the user to configure the max candidate
- *              distance thresholds.
- *   @param num_threads
- *              The number of threads the AP flow can use.
- *   @param log_verbosity
- *              The verbosity level of log messages in the AP flow, with higher
- *              values leading to more verbose messages.
- *   @param generate_mass_report
- *              Whether to generate a mass report during global placement or not.
  */
 struct t_ap_opts {
+    /// True if analytical placement is supposed to be done in the CAD flow. False if otherwise.
     e_stage_action doAP;
 
+    /// The type of analytical solver the Global Placer in the AP flow will use.
     e_ap_analytical_solver analytical_solver_type;
 
+    /// The type of partial legalizer the Global Placer in the AP flow will use.
     e_ap_partial_legalizer partial_legalizer_type;
 
+    /// The type of full legalizer the AP flow will use.
     e_ap_full_legalizer full_legalizer_type;
 
+    /// The type of detailed placer the AP flow will use.
     e_ap_detailed_placer detailed_placer_type;
 
+    /// A trade-off parameter used to decide how focused the AP flow should be on optimizing timing over wirelength.
     float ap_timing_tradeoff;
 
+    /// The threshold to ignore nets with higher fanout than that value while constructing the solver.
     int ap_high_fanout_threshold;
 
+    /// Vector of strings passed by the user to configure the target density of different physical tiles on the device.
     std::vector<std::string> ap_partial_legalizer_target_density;
 
+    /// Array of string passed by the user to configure the max candidate distance thresholds.
     std::vector<std::string> appack_max_dist_th;
 
+    /// The number of threads the AP flow can use.
     unsigned num_threads;
 
+    /// The verbosity level of log messages in the AP flow, with higher values leading to more verbose messages.
     int log_verbosity;
 
+    /// Whether to generate a mass report during global placement or not.
     bool generate_mass_report;
 };
 
@@ -1166,56 +1159,6 @@ struct t_ap_opts {
  * Router data types
  *******************************************************************/
 
-/* All the parameters controlling the router's operation are in this        *
- * structure.                                                               *
- * first_iter_pres_fac:  Present sharing penalty factor used for the        *
- *                 very first (congestion mapping) Pathfinder iteration.    *
- * initial_pres_fac:  Initial present sharing penalty factor for            *
- *                    Pathfinder; used to set pres_fac on 2nd iteration.    *
- * pres_fac_mult:  Amount by which pres_fac is multiplied each              *
- *                 routing iteration.                                       *
- * acc_fac:  Historical congestion cost multiplier.  Used unchanged         *
- *           for all iterations.                                            *
- * bend_cost:  Cost of a bend (usually non-zero only for global routing).   *
- * max_router_iterations:  Maximum number of iterations before giving       *
- *                up.                                                       *
- * min_incremental_reroute_fanout: Minimum fanout a net needs to have       *
- *              for incremental reroute to be applied to it through route   *
- *              tree pruning. Larger circuits should get larger thresholds  *
- * bb_factor:  Linear distance a route can go outside the net bounding      *
- *             box.                                                         *
- * route_type:  GLOBAL or DETAILED.                                         *
- * fixed_channel_width:  Only attempt to route the design once, with the    *
- *                       channel width given.  If this variable is          *
- *                       == NO_FIXED_CHANNEL_WIDTH, do a binary search      *
- *                       on channel width.                                  *
- * router_algorithm:  TIMING_DRIVEN or PARALLEL.  Selects the desired       *
- * routing algorithm.                                                       *
- * base_cost_type: Specifies how to compute the base cost of each type of   *
- *                 rr_node.  DELAY_NORMALIZED -> base_cost = "demand"       *
- *                 x average delay to route past 1 CLB.  DEMAND_ONLY ->     *
- *                 expected demand of this node (old breadth-first costs).  *
- *                                                                          *
- * The following parameters are used only by the timing-driven router.      *
- *                                                                          *
- * astar_fac:  Factor (alpha) used to weight expected future costs to       *
- *             target in the timing_driven router.  astar_fac = 0 leads to  *
- *             an essentially breadth-first search, astar_fac = 1 is near   *
- *             the usual astar algorithm and astar_fac > 1 are more         *
- *             aggressive.                                                  *
- * astar_offset: Offset that is subtracted from the lookahead (expected     *
- *               future costs) in the timing-driven router.                 *
- * max_criticality: The maximum criticality factor (from 0 to 1) any sink   *
- *                  will ever have (i.e. clip criticality to this number).  *
- * criticality_exp: Set criticality to (path_length(sink) / longest_path) ^ *
- *                  criticality_exp (then clip to max_criticality).         *
- * doRouting: true if routing is supposed to be done, false otherwise       *
- * routing_failure_predictor: sets the configuration to be used by the      *
- * routing failure predictor, how aggressive the threshold used to judge    *
- * and abort routings deemed unroutable                                     *
- * write_rr_graph_name: stores the file name of the output rr graph         *
- * read_rr_graph_name:  stores the file name of the rr graph to be read by vpr */
-
 enum e_router_algorithm {
     NESTED,
     PARALLEL,
@@ -1275,25 +1218,54 @@ enum class e_incr_reroute_delay_ripup {
 
 constexpr int NO_FIXED_CHANNEL_WIDTH = -1;
 
+/**
+ * @brief Parameters controlling the router's operation.
+ */
 struct t_router_opts {
     bool read_rr_edge_metadata = false;
     bool do_check_rr_graph = true;
+
+    /// Present sharing penalty factor used for the very first (congestion mapping) Pathfinder iteration.
     float first_iter_pres_fac;
+    /// Initial present sharing penalty factor for Pathfinder; used to set pres_fac on 2nd iteration.
     float initial_pres_fac;
+    /// Amount by which pres_fac is multiplied each routing iteration.
     float pres_fac_mult;
     float max_pres_fac;
+
+    /// Historical congestion cost multiplier. Used unchanged for all iterations.
     float acc_fac;
+    /// Cost of a bend (usually non-zero only for global routing).
     float bend_cost;
+    /// Maximum number of iterations before giving up.
     int max_router_iterations;
+    /// Minimum fanout a net needs to have for incremental reroute to be applied to it through route tree pruning.
+    /// Larger circuits should get larger thresholds
     int min_incremental_reroute_fanout;
     e_incr_reroute_delay_ripup incr_reroute_delay_ripup;
+    /// Linear distance a route can go outside the net bounding box.
     int bb_factor;
+    /// GLOBAL or DETAILED.
     enum e_route_type route_type;
+    /// Only attempt to route the design once, with the channel width given.
+    /// If this variable is == NO_FIXED_CHANNEL_WIDTH, do a binary search on channel width.
     int fixed_channel_width;
-    int min_channel_width_hint; ///<Hint to binary search of what the minimum channel width is
+    /// Hint to binary search of what the minimum channel width is
+    int min_channel_width_hint;
+    /// TIMING_DRIVEN or PARALLEL.  Selects the desired routing algorithm.
     enum e_router_algorithm router_algorithm;
+
+    /// Specifies how to compute the base cost of each type of rr_node.
+    /// DELAY_NORMALIZED -> base_cost = "demand" x average delay to route past 1 CLB.
+    /// DEMAND_ONLY -> expected demand of this node (old breadth-first costs).
     enum e_base_cost_type base_cost_type;
+
+    /// Factor (alpha) used to weight expected future costs to target in the timing_driven router.
+    /// astar_fac = 0 leads to an essentially breadth-first search,
+    /// astar_fac = 1 is near the usual astar algorithm and astar_fac > 1 are more aggressive.
     float astar_fac;
+
+    /// Offset that is subtracted from the lookahead (expected future costs) in the timing-driven router.
     float astar_offset;
     float router_profiler_astar_fac;
     bool enable_parallel_connection_router;
@@ -1302,7 +1274,9 @@ struct t_router_opts {
     int multi_queue_num_threads;
     int multi_queue_num_queues;
     bool multi_queue_direct_draining;
+    /// The maximum criticality factor (from 0 to 1) any sink will ever have (i.e. clip criticality to this number).
     float max_criticality;
+    /// Set criticality to (path_length(sink) / longest_path) ^ criticality_exp (then clip to max_criticality).
     float criticality_exp;
     float init_wirelength_abort_threshold;
     bool verify_binary_search;
@@ -1310,7 +1284,10 @@ struct t_router_opts {
     bool congestion_analysis;
     bool fanout_analysis;
     bool switch_usage_analysis;
+    /// true if routing is supposed to be done, false otherwise
     e_stage_action doRouting;
+    /// the configuration to be used by the routing failure predictor,
+    /// how aggressive the threshold used to judge and abort routings deemed unroutable
     enum e_routing_failure_predictor routing_failure_predictor;
     enum e_routing_budgets_algorithm routing_budgets_algorithm;
     bool save_routing_per_iteration;
diff --git a/vpr/src/pack/verify_clustering.cpp b/vpr/src/pack/verify_clustering.cpp
index ec08e10a40..93f925ef68 100644
--- a/vpr/src/pack/verify_clustering.cpp
+++ b/vpr/src/pack/verify_clustering.cpp
@@ -406,7 +406,7 @@ unsigned verify_clustering(const ClusteredNetlist& clb_nlist,
         // Return here since this error can cause serious issues below.
         return num_errors;
     }
-    // Check conssitency between which clusters the atom's think thet are in and
+    // Check consistency between which clusters the atom's think thet are in and
     // which atoms the clusters think they have.
     num_errors += check_clustering_atom_consistency(clb_nlist,
                                                     atom_nlist,
diff --git a/vpr/src/place/annealer.cpp b/vpr/src/place/annealer.cpp
index 3422850312..ece401daa8 100644
--- a/vpr/src/place/annealer.cpp
+++ b/vpr/src/place/annealer.cpp
@@ -118,6 +118,7 @@ t_annealing_state::t_annealing_state(float first_t,
 }
 
 bool t_annealing_state::outer_loop_update(float success_rate,
+                                          bool congestion_modeling_enabled,
                                           const t_placer_costs& costs,
                                           const t_placer_opts& placer_opts) {
 #ifndef NO_GRAPHICS
@@ -139,8 +140,11 @@ bool t_annealing_state::outer_loop_update(float success_rate,
     }
 
     // Automatically determine exit temperature.
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    const ClusteringContext& cluster_ctx = g_vpr_ctx.clustering();
     float t_exit = 0.005 * costs.cost / cluster_ctx.clb_nlist.nets().size();
+    if (congestion_modeling_enabled) {
+        t_exit *= (1. + placer_opts.congestion_factor);
+    }
 
     VTR_ASSERT_SAFE(placer_opts.anneal_sched.type == e_sched_type::AUTO_SCHED);
     // Automatically adjust alpha according to success rate.
@@ -228,7 +232,8 @@ PlacementAnnealer::PlacementAnnealer(const t_placer_opts& placer_opts,
     , move_stats_file_(nullptr, vtr::fclose)
     , outer_crit_iter_count_(1)
     , blocks_affected_(placer_state.block_locs().size())
-    , quench_started_(false) {
+    , quench_started_(false)
+    , congestion_modeling_started_(false) {
     const auto& device_ctx = g_vpr_ctx.device();
 
     float first_crit_exponent;
@@ -373,14 +378,13 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
 
     MoveOutcomeStats move_outcome_stats;
 
-    /* I'm using negative values of proposed_net_cost as a flag,
-     * so DO NOT use cost functions that can go negative. */
-    double delta_c = 0;        //Change in cost due to this swap.
-    double bb_delta_c = 0;     //Change in the bounding box (wiring) cost.
-    double timing_delta_c = 0; //Change in the timing cost (delay * criticality).
+    double delta_c = 0.;            // Change in cost due to this swap.
+    double bb_delta_c = 0.;         // Change in the bounding box (wiring) cost.
+    double timing_delta_c = 0.;     // Change in the timing cost (delay * criticality).
+    double congestion_delta_c = 0.; // Change in the congestion cost
 
-    /* Allow some fraction of moves to not be restricted by rlim,
-     * in the hopes of better escaping local minima. */
+    // Allow some fraction of moves to not be restricted by rlim,
+    // in the hopes of better escaping local minima.
     float rlim;
     if (placer_opts_.rlim_escape_fraction > 0. && rng_.frand() < placer_opts_.rlim_escape_fraction) {
         rlim = std::numeric_limits<float>::infinity();
@@ -396,19 +400,19 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
         router_block_move = check_for_router_swap(noc_opts_.noc_swap_percentage, rng_);
     }
 
-    //When manual move toggle button is active, the manual move window asks the user for input.
+    // When manual move toggle button is active, the manual move window asks the user for input.
     if (manual_move_enabled) {
 #ifndef NO_GRAPHICS
         create_move_outcome = manual_move_display_and_propose(manual_move_generator_, blocks_affected_,
                                                               proposed_action.move_type, rlim,
                                                               placer_opts_, criticalities_);
-#endif //NO_GRAPHICS
+#endif // NO_GRAPHICS
     } else if (router_block_move) {
         // generate a move where two random router blocks are swapped
         create_move_outcome = propose_router_swap(blocks_affected_, rlim, blk_loc_registry, place_macros_, rng_);
         proposed_action.move_type = e_move_type::UNIFORM;
     } else {
-        //Generate a new move (perturbation) used to explore the space of possible placements
+        // Generate a new move (perturbation) used to explore the space of possible placements
         create_move_outcome = move_generator.propose_move(blocks_affected_, proposed_action, rlim, placer_opts_, criticalities_);
     }
 
@@ -455,7 +459,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
          * delays and timing costs and store them in proposed_* data structures.
          */
         net_cost_handler_.find_affected_nets_and_update_costs(delay_model_, criticalities_, blocks_affected_,
-                                                              bb_delta_c, timing_delta_c);
+                                                              bb_delta_c, timing_delta_c, congestion_delta_c);
 
         if (place_algorithm == e_place_algorithm::CRITICALITY_TIMING_PLACE) {
             /* Take delta_c as a combination of timing and wiring cost. In
@@ -472,7 +476,8 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
                            timing_delta_c,
                            costs_.timing_cost_norm);
             delta_c = (1 - placer_opts_.timing_tradeoff) * bb_delta_c * costs_.bb_cost_norm
-                      + placer_opts_.timing_tradeoff * timing_delta_c * costs_.timing_cost_norm;
+                      + placer_opts_.timing_tradeoff * timing_delta_c * costs_.timing_cost_norm
+                      + placer_opts_.congestion_factor * congestion_delta_c * costs_.congestion_cost_norm;
         } else if (place_algorithm == e_place_algorithm::SLACK_TIMING_PLACE) {
             /* For setup slack analysis, we first do a timing analysis to get the newest
              * slack values resulted from the proposed block moves. If the move turns out
@@ -539,6 +544,7 @@ e_move_result PlacementAnnealer::try_swap_(MoveGenerator& move_generator,
         if (move_outcome == e_move_result::ACCEPTED) {
             costs_.cost += delta_c;
             costs_.bb_cost += bb_delta_c;
+            costs_.congestion_cost += congestion_delta_c;
 
             if (place_algorithm == e_place_algorithm::CRITICALITY_TIMING_PLACE) {
                 costs_.timing_cost += timing_delta_c;
@@ -673,6 +679,26 @@ void PlacementAnnealer::outer_loop_update_timing_info() {
         outer_crit_iter_count_++;
     }
 
+    // Congestion modeling is enabled when the ratio of the current range limit to the initial range limit
+    // drops below a user-specified threshold, and the congestion cost weighting factor is non-zero.
+    // Once enabled, congestion modeling continues even if the range limit increases and the ratio
+    // rises above the threshold.
+    //
+    // This logic is motivated by the observation that enabling congestion modeling too early in the
+    // anneal increases computational overhead and introduces noise into the placement cost function,
+    // as early placements are typically highly congested and unstable. So, we delay congestion modeling
+    // until the placement is more settled and wirelength has been reasonably optimized.
+    if ((annealing_state_.rlim / MoveGenerator::first_rlim < placer_opts_.congestion_rlim_trigger_ratio
+         && placer_opts_.congestion_factor != 0.)
+        || congestion_modeling_started_) {
+        costs_.congestion_cost = net_cost_handler_.estimate_routing_chan_util();
+
+        if (!congestion_modeling_started_) {
+            VTR_LOG("Congestion modeling started.\n");
+            congestion_modeling_started_ = true;
+        }
+    }
+
     // Update the cost normalization factors
     costs_.update_norm_factors();
 
@@ -783,7 +809,7 @@ const t_annealing_state& PlacementAnnealer::get_annealing_state() const {
 }
 
 bool PlacementAnnealer::outer_loop_update_state() {
-    return annealing_state_.outer_loop_update(placer_stats_.success_rate, costs_, placer_opts_);
+    return annealing_state_.outer_loop_update(placer_stats_.success_rate, congestion_modeling_started_, costs_, placer_opts_);
 }
 
 void PlacementAnnealer::start_quench() {
diff --git a/vpr/src/place/annealer.h b/vpr/src/place/annealer.h
index 0d098db728..e406581956 100644
--- a/vpr/src/place/annealer.h
+++ b/vpr/src/place/annealer.h
@@ -105,6 +105,7 @@ class t_annealing_state {
      * @return True->continues the annealing. False->exits the annealing.
      */
     bool outer_loop_update(float success_rate,
+                           bool congestion_modeling_enabled,
                            const t_placer_costs& costs,
                            const t_placer_opts& placer_opts);
 
@@ -328,6 +329,8 @@ class PlacementAnnealer {
     int tot_iter_;
     /// Indicates whether the annealer has entered into the quench stage
     bool quench_started_;
+    /// Indicates whether routing congestion modeling has been started
+    bool congestion_modeling_started_;
 
     void LOG_MOVE_STATS_HEADER();
     void LOG_MOVE_STATS_PROPOSED();
diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp
index 9f6a4316eb..0b8336e08f 100644
--- a/vpr/src/place/initial_placement.cpp
+++ b/vpr/src/place/initial_placement.cpp
@@ -372,11 +372,11 @@ static bool is_loc_legal(const t_pl_loc& loc,
     return legal;
 }
 
-bool find_subtile_in_location(t_pl_loc& centroid,
-                              t_logical_block_type_ptr block_type,
-                              const BlkLocRegistry& blk_loc_registry,
-                              const PartitionRegion& pr,
-                              vtr::RngContainer& rng) {
+static bool find_subtile_in_location(t_pl_loc& centroid,
+                                     t_logical_block_type_ptr block_type,
+                                     const BlkLocRegistry& blk_loc_registry,
+                                     const PartitionRegion& pr,
+                                     vtr::RngContainer& rng) {
     //check if the location is on chip and legal, if yes try to update subtile
     if (is_loc_on_chip({centroid.x, centroid.y, centroid.layer}) && is_loc_legal(centroid, pr, block_type)) {
         //find the compatible subtiles
diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp
index 6282a3e872..77247f1510 100644
--- a/vpr/src/place/net_cost_handler.cpp
+++ b/vpr/src/place/net_cost_handler.cpp
@@ -2,11 +2,11 @@
  * @file net_cost_handler.cpp
  * @brief This file contains the implementation of functions used to update placement cost when a new move is proposed/committed.
  *
- * VPR placement cost consists of three terms which represent wirelength, timing, and NoC cost.
+ * VPR placement cost consists of multiple terms which represent wirelength, timing, congestion, and NoC cost.
  *
- * To get an estimation of the wirelength of each net, the Half Perimeter Wire Length (HPWL) approach is used. In this approach,
+ * To get an estimation of the wirelength of each net, the Half Perimeter Wire Length (HPWL) metric is used. In this approach,
  * half of the perimeter of the bounding box which contains all terminals of the net is multiplied by a correction factor,
- * and the resulting number is considered as an estimation of the bounding box.
+ * and the resulting number is considered as an estimation of the wirelength needed to route this net.
  *
  * Currently, we have two types of bounding boxes: 3D bounding box (or Cube BB) and per-layer bounding box.
  * If the FPGA grid is a 2D structure, a Cube bounding box is used, which will always have the z direction equal to 1. For 3D architectures,
@@ -20,6 +20,16 @@
  * To get a delay estimation of a connection (from a source to a sink), first, dx and dy between these two points should be calculated,
  * and these two numbers are the indices to access this 2D array. By default, the placement delay model is created by iterating over the router lookahead
  * to get the minimum cost for each dx and dy.
+ *
+ * For congestion modeling, we periodically estimate routing channel usage by distributing the estimated
+ * wirelength (WL) of each net across all routing channels within its bounding box. The wirelength is divided
+ * between CHANX and CHANY in proportion to the bounding box's width and height, respectively. However, all
+ * routing channels of the same type (CHANX or CHANY) within the box receive an equal share of that net's WL.
+ *
+ * We compute a congestion cost for each net by averaging the estimated utilization over all CHANX and CHANY
+ * channels in its bounding box. These average utilizations are then compared to a user-specified threshold.
+ * If a net’s average utilization exceeds the threshold, the excess is penalized by adding a cost proportional
+ * to the amount of the exceedance.
  */
 #include "net_cost_handler.h"
 
@@ -88,10 +98,13 @@ static void add_block_to_bb(const t_physical_tile_loc& new_pin_loc,
 NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
                                PlacerState& placer_state,
                                bool cube_bb)
-    : cube_bb_(cube_bb)
+    : congestion_modeling_started_(false)
+    , cube_bb_(cube_bb)
     , placer_state_(placer_state)
     , placer_opts_(placer_opts) {
-    const int num_layers = g_vpr_ctx.device().grid.get_num_layers();
+    const auto& device_ctx = g_vpr_ctx.device();
+
+    const size_t num_layers = device_ctx.grid.get_num_layers();
     const size_t num_nets = g_vpr_ctx.clustering().clb_nlist.nets().size();
 
     is_multi_layer_ = num_layers > 1;
@@ -100,9 +113,11 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
     if (cube_bb_) {
         ts_bb_edge_new_.resize(num_nets, t_bb());
         ts_bb_coord_new_.resize(num_nets, t_bb());
+
         bb_coords_.resize(num_nets, t_bb());
+
         bb_num_on_edges_.resize(num_nets, t_bb());
-        comp_bb_cost_functor_ = std::bind(&NetCostHandler::comp_cube_bb_cost_, this, std::placeholders::_1);
+        comp_bb_cong_cost_functor_ = std::bind(&NetCostHandler::comp_cube_bb_cong_cost_, this, std::placeholders::_1);
         update_bb_functor_ = std::bind(&NetCostHandler::update_bb_, this, std::placeholders::_1, std::placeholders::_2,
                                        std::placeholders::_3, std::placeholders::_4);
         get_net_bb_cost_functor_ = std::bind(&NetCostHandler::get_net_cube_bb_cost_, this, std::placeholders::_1, /*use_ts=*/true);
@@ -112,14 +127,14 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
         layer_ts_bb_coord_new_.resize(num_nets, std::vector<t_2D_bb>(num_layers, t_2D_bb()));
         layer_bb_num_on_edges_.resize(num_nets, std::vector<t_2D_bb>(num_layers, t_2D_bb()));
         layer_bb_coords_.resize(num_nets, std::vector<t_2D_bb>(num_layers, t_2D_bb()));
-        comp_bb_cost_functor_ = std::bind(&NetCostHandler::comp_per_layer_bb_cost_, this, std::placeholders::_1);
+        comp_bb_cong_cost_functor_ = std::bind(&NetCostHandler::comp_per_layer_bb_cost_, this, std::placeholders::_1);
         update_bb_functor_ = std::bind(&NetCostHandler::update_layer_bb_, this, std::placeholders::_1, std::placeholders::_2,
                                        std::placeholders::_3, std::placeholders::_4);
         get_net_bb_cost_functor_ = std::bind(&NetCostHandler::get_net_per_layer_bb_cost_, this, std::placeholders::_1, /*use_ts=*/true);
         get_non_updatable_bb_functor_ = std::bind(&NetCostHandler::get_non_updatable_per_layer_bb_, this, std::placeholders::_1, /*use_ts=*/true);
     }
 
-    /* This initializes the whole matrix to OPEN which is an invalid value*/
+    // This initializes the whole matrix to OPEN which is an invalid value
     ts_layer_sink_pin_count_.resize({num_nets, size_t(num_layers)}, OPEN);
     num_sink_pin_layer_.resize({num_nets, size_t(num_layers)}, OPEN);
 
@@ -129,12 +144,21 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
     net_cost_.resize(num_nets, -1.);
     proposed_net_cost_.resize(num_nets, -1.);
 
-    /* Used to store costs for moves not yet made and to indicate when a net's
-     * cost has been recomputed. proposed_net_cost[inet] < 0 means net's cost hasn't
-     * been recomputed. */
+    // Used to store costs for moves not yet made and to indicate when a net's
+    // cost has been recomputed. proposed_net_cost[inet] < 0 means net's cost hasn't
+    // been recomputed.
     bb_update_status_.resize(num_nets, NetUpdateState::NOT_UPDATED_YET);
 
     alloc_and_load_chan_w_factors_for_place_cost_();
+
+    // Congestion-related data members are not allocated until congestion modeling is enabled
+    // by calling estimate_routing_chan_util().
+    VTR_ASSERT(!congestion_modeling_started_);
+    VTR_ASSERT(chan_util_.x.empty() && chan_util_.y.empty());
+    VTR_ASSERT(acc_chan_util_.x.empty() && acc_chan_util_.y.empty());
+    VTR_ASSERT(ts_avg_chan_util_new_.empty());
+    VTR_ASSERT(avg_chan_util_.empty());
+    VTR_ASSERT(net_cong_cost_.empty() && proposed_net_cong_cost_.empty());
 }
 
 void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
@@ -152,24 +176,26 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
      * This returns the total number of tracks between channels 'low' and 'high',
      * including tracks in these channels.
      */
-    acc_chanx_width_ = vtr::PrefixSum1D<int>(grid_height, [&](size_t y) noexcept {
+    acc_chan_width_.x = vtr::PrefixSum1D<int>(grid_height, [&](size_t y) noexcept {
         int chan_x_width = device_ctx.chan_width.x_list[y];
 
-        /* If the number of tracks in a channel is zero, two consecutive elements take the same
-         * value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
-         * potential issue, we assume that the channel width is at least 1.
-         */
-        if (chan_x_width == 0)
+        // If the number of tracks in a channel is zero, two consecutive elements take the same
+        // value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
+        // potential issue, we assume that the channel width is at least 1.
+        if (chan_x_width == 0) {
             return 1;
+        }
 
         return chan_x_width;
     });
-    acc_chany_width_ = vtr::PrefixSum1D<int>(grid_width, [&](size_t x) noexcept {
+
+    acc_chan_width_.y = vtr::PrefixSum1D<int>(grid_width, [&](size_t x) noexcept {
         int chan_y_width = device_ctx.chan_width.y_list[x];
 
         // to avoid a division by zero
-        if (chan_y_width == 0)
+        if (chan_y_width == 0) {
             return 1;
+        }
 
         return chan_y_width;
     });
@@ -237,20 +263,20 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
                                                          });
 }
 
-std::pair<double, double> NetCostHandler::comp_bb_cost(e_cost_methods method) {
-    return comp_bb_cost_functor_(method);
+std::tuple<double, double, double> NetCostHandler::comp_bb_cong_cost(e_cost_methods method) {
+    return comp_bb_cong_cost_functor_(method);
 }
 
-std::pair<double, double> NetCostHandler::comp_cube_bb_cost_(e_cost_methods method) {
+std::tuple<double, double, double> NetCostHandler::comp_cube_bb_cong_cost_(e_cost_methods method) {
     const auto& cluster_ctx = g_vpr_ctx.clustering();
 
-    double cost = 0;
-    double expected_wirelength = 0.0;
+    double bb_cost = 0.;
+    double expected_wirelength = 0.;
 
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */
-        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {   /* Do only if not ignored. */
-            /* Small nets don't use incremental updating on their bounding boxes, *
-             * so they can use a fast bounding box calculator.                    */
+    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+            // Small nets don't use incremental updating on their bounding boxes,
+            // so they can use a fast bounding box calculator.
             if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET && method == e_cost_methods::NORMAL) {
                 get_bb_from_scratch_(net_id, /*use_ts=*/false);
             } else {
@@ -258,26 +284,40 @@ std::pair<double, double> NetCostHandler::comp_cube_bb_cost_(e_cost_methods meth
             }
 
             net_cost_[net_id] = get_net_cube_bb_cost_(net_id, /*use_ts=*/false);
-            cost += net_cost_[net_id];
+            bb_cost += net_cost_[net_id];
             if (method == e_cost_methods::CHECK) {
                 expected_wirelength += get_net_wirelength_estimate_(net_id);
             }
         }
     }
 
-    return {cost, expected_wirelength};
+    double cong_cost = 0.;
+    // Compute congestion cost using recomputed bounding boxes and channel utilization map
+    if (congestion_modeling_started_) {
+        for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+            if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+                net_cong_cost_[net_id] = get_net_cube_cong_cost_(net_id, /*use_ts=*/false);
+                cong_cost += net_cong_cost_[net_id];
+            }
+        }
+    }
+
+    return {bb_cost, expected_wirelength, cong_cost};
 }
 
-std::pair<double, double> NetCostHandler::comp_per_layer_bb_cost_(e_cost_methods method) {
+std::tuple<double, double, double> NetCostHandler::comp_per_layer_bb_cost_(e_cost_methods method) {
     const auto& cluster_ctx = g_vpr_ctx.clustering();
 
-    double cost = 0;
-    double expected_wirelength = 0.0;
+    double cost = 0.;
+    double expected_wirelength = 0.;
+    // TODO: compute congestion cost
+    // Congestion modeling is not supported for per-layer mode, so 0 is returned.
+    constexpr double cong_cost = 0.;
 
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */
-        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {   /* Do only if not ignored. */
-            /* Small nets don't use incremental updating on their bounding boxes, *
-             * so they can use a fast bounding box calculator.                    */
+    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+            // Small nets don't use incremental updating on their bounding boxes,
+            //so they can use a fast bounding box calculator.
             if (cluster_ctx.clb_nlist.net_sinks(net_id).size() >= SMALL_NET && method == e_cost_methods::NORMAL) {
                 get_layer_bb_from_scratch_(net_id,
                                            layer_bb_num_on_edges_[net_id],
@@ -295,7 +335,7 @@ std::pair<double, double> NetCostHandler::comp_per_layer_bb_cost_(e_cost_methods
         }
     }
 
-    return {cost, expected_wirelength};
+    return {cost, expected_wirelength, cong_cost};
 }
 
 void NetCostHandler::update_net_bb_(const ClusterNetId net,
@@ -420,7 +460,6 @@ void NetCostHandler::update_td_delta_costs_(const PlaceDelayModel* delay_model,
     }
 }
 
-///@brief Record effected nets.
 void NetCostHandler::record_affected_net_(const ClusterNetId net) {
     /* Record effected nets. */
     if (proposed_net_cost_[net] < 0.) {
@@ -521,6 +560,14 @@ void NetCostHandler::get_non_updatable_cube_bb_(ClusterNetId net_id, bool use_ts
 
         num_sink_pin_layer[pin_loc.layer_num]++;
     }
+
+    // Update average CHANX and CHANY usage for this net within its bounding box if congestion modeling is enabled
+    if (congestion_modeling_started_) {
+        auto& [x_chan_util, y_chan_util] = use_ts ? ts_avg_chan_util_new_[net_id] : avg_chan_util_[net_id];
+        const int total_channels = (bb_coord_new.xmax - bb_coord_new.xmin + 1) * (bb_coord_new.ymax - bb_coord_new.ymin + 1);
+        x_chan_util = acc_chan_util_.x.get_sum(bb_coord_new.xmin, bb_coord_new.ymin, bb_coord_new.xmax, bb_coord_new.ymax) / total_channels;
+        y_chan_util = acc_chan_util_.y.get_sum(bb_coord_new.xmin, bb_coord_new.ymin, bb_coord_new.xmax, bb_coord_new.ymax) / total_channels;
+    }
 }
 
 void NetCostHandler::get_non_updatable_per_layer_bb_(ClusterNetId net_id, bool use_ts) {
@@ -567,8 +614,6 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
                                 t_physical_tile_loc pin_new_loc,
                                 bool src_pin) {
     //TODO: account for multiple physical pin instances per logical pin
-    const t_bb *curr_bb_edge, *curr_bb_coord;
-
     const auto& device_ctx = g_vpr_ctx.device();
 
     const int num_layers = device_ctx.grid.get_num_layers();
@@ -588,6 +633,7 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
 
     vtr::NdMatrixProxy<int, 1> curr_num_sink_pin_layer = (bb_update_status_[net_id] == NetUpdateState::NOT_UPDATED_YET) ? num_sink_pin_layer_[size_t(net_id)] : num_sink_pin_layer_new;
 
+    const t_bb *curr_bb_edge, *curr_bb_coord;
     if (bb_update_status_[net_id] == NetUpdateState::NOT_UPDATED_YET) {
         /* The net had NOT been updated before, could use the old values */
         curr_bb_edge = &bb_num_on_edges_[net_id];
@@ -828,6 +874,14 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
     if (bb_update_status_[net_id] == NetUpdateState::NOT_UPDATED_YET) {
         bb_update_status_[net_id] = NetUpdateState::UPDATED_ONCE;
     }
+
+    // Update average CHANX and CHANY usage for this net within its bounding box if congestion modeling is enabled
+    if (congestion_modeling_started_) {
+        auto& [x_chan_util, y_chan_util] = ts_avg_chan_util_new_[net_id];
+        const int total_channels = (bb_coord_new.xmax - bb_coord_new.xmin + 1) * (bb_coord_new.ymax - bb_coord_new.ymin + 1);
+        x_chan_util = acc_chan_util_.x.get_sum(bb_coord_new.xmin, bb_coord_new.ymin, bb_coord_new.xmax, bb_coord_new.ymax) / total_channels;
+        y_chan_util = acc_chan_util_.y.get_sum(bb_coord_new.xmin, bb_coord_new.ymin, bb_coord_new.xmax, bb_coord_new.ymax) / total_channels;
+    }
 }
 
 void NetCostHandler::update_layer_bb_(ClusterNetId net_id,
@@ -1262,6 +1316,14 @@ void NetCostHandler::get_bb_from_scratch_(ClusterNetId net_id, bool use_ts) {
     num_on_edges.ymax = ymax_edge;
     num_on_edges.layer_min = layer_min_edge;
     num_on_edges.layer_max = layer_max_edge;
+
+    // Update average CHANX and CHANY usage for this net within its bounding box if congestion modeling is enabled
+    if (congestion_modeling_started_) {
+        auto& [x_chan_util, y_chan_util] = use_ts ? ts_avg_chan_util_new_[net_id] : avg_chan_util_[net_id];
+        const int total_channels = (coords.xmax - coords.xmin + 1) * (coords.ymax - coords.ymin + 1);
+        x_chan_util = acc_chan_util_.x.get_sum(coords.xmin, coords.ymin, coords.xmax, coords.ymax) / total_channels;
+        y_chan_util = acc_chan_util_.y.get_sum(coords.xmin, coords.ymin, coords.xmax, coords.ymax) / total_channels;
+    }
 }
 
 void NetCostHandler::get_layer_bb_from_scratch_(ClusterNetId net_id,
@@ -1324,7 +1386,7 @@ void NetCostHandler::get_layer_bb_from_scratch_(ClusterNetId net_id,
 
 double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
     // Finds the cost due to one net by looking at its coordinate bounding box.
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
 
     const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : bb_coords_[net_id];
 
@@ -1356,6 +1418,18 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
     return ncost;
 }
 
+double NetCostHandler::get_net_cube_cong_cost_(ClusterNetId net_id, bool use_ts) {
+    VTR_ASSERT_SAFE(congestion_modeling_started_);
+    const auto [x_chan_util, y_chan_util] = use_ts ? ts_avg_chan_util_new_[net_id] : avg_chan_util_[net_id];
+
+    const float threshold = placer_opts_.congestion_chan_util_threshold;
+
+    float x_chan_cong = (x_chan_util < threshold) ? 0.0f : x_chan_util - threshold;
+    float y_chan_cong = (y_chan_util < threshold) ? 0.0f : y_chan_util - threshold;
+
+    return x_chan_cong + y_chan_cong;
+}
+
 double NetCostHandler::get_net_per_layer_bb_cost_(ClusterNetId net_id, bool use_ts) {
     // Per-layer bounding box of the net
     const std::vector<t_2D_bb>& bb = use_ts ? layer_ts_bb_coord_new_[net_id] : layer_bb_coords_[net_id];
@@ -1466,25 +1540,29 @@ float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
     return z_cost_factor;
 }
 
-double NetCostHandler::recompute_bb_cost_() {
-    double cost = 0;
+std::pair<double, double> NetCostHandler::recompute_bb_cong_cost_() {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
 
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    double bb_cost = 0.;
+    double cong_cost = 0.;
 
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) { /* for each net ... */
-        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {   /* Do only if not ignored. */
-            /* Bounding boxes don't have to be recomputed; they're correct. */
-            cost += net_cost_[net_id];
+    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+            // Bounding boxes don't have to be recomputed; they're correct.
+            bb_cost += net_cost_[net_id];
+
+            if (congestion_modeling_started_) {
+                cong_cost += net_cong_cost_[net_id];
+            }
         }
     }
 
-    return cost;
+    return {bb_cost, cong_cost};
 }
 
 double wirelength_crossing_count(size_t fanout) {
-    /* Get the expected "crossing count" of a net, based on its number *
-     * of pins.  Extrapolate for very large nets.                      */
-
+    // Get the expected "crossing count" of a net, based on its number of pins.
+    // Extrapolate for very large nets.
     if (fanout > MAX_FANOUT_CROSSING_COUNT) {
         return 2.7933 + 0.02616 * (fanout - MAX_FANOUT_CROSSING_COUNT);
     } else {
@@ -1492,13 +1570,17 @@ double wirelength_crossing_count(size_t fanout) {
     }
 }
 
-void NetCostHandler::set_bb_delta_cost_(double& bb_delta_c) {
+void NetCostHandler::set_bb_delta_cost_(double& bb_delta_c, double& congestion_delta_c) {
     for (const ClusterNetId ts_net : ts_nets_to_update_) {
         ClusterNetId net_id = ts_net;
 
         proposed_net_cost_[net_id] = get_net_bb_cost_functor_(net_id);
-
         bb_delta_c += proposed_net_cost_[net_id] - net_cost_[net_id];
+
+        if (congestion_modeling_started_) {
+            proposed_net_cong_cost_[net_id] = get_net_cube_cong_cost_(net_id, /*use_ts=*/true);
+            congestion_delta_c += proposed_net_cong_cost_[net_id] - net_cong_cost_[net_id];
+        }
     }
 }
 
@@ -1506,19 +1588,21 @@ void NetCostHandler::find_affected_nets_and_update_costs(const PlaceDelayModel*
                                                          const PlacerCriticalities* criticalities,
                                                          t_pl_blocks_to_be_moved& blocks_affected,
                                                          double& bb_delta_c,
-                                                         double& timing_delta_c) {
-    VTR_ASSERT_SAFE(bb_delta_c == 0.);
-    VTR_ASSERT_SAFE(timing_delta_c == 0.);
-    auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
+                                                         double& timing_delta_c,
+                                                         double& congestion_delta_c) {
+    VTR_ASSERT_DEBUG(bb_delta_c == 0.);
+    VTR_ASSERT_DEBUG(timing_delta_c == 0.);
+    VTR_ASSERT_DEBUG(congestion_delta_c == 0.);
+    const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
 
     ts_nets_to_update_.resize(0);
 
-    /* Go through all the blocks moved. */
+    // Go through all the blocks moved.
     for (const t_pl_moved_block& moving_block : blocks_affected.moved_blocks) {
         auto& affected_pins = blocks_affected.affected_pins;
         ClusterBlockId blk_id = moving_block.block_num;
 
-        /* Go through all the pins in the moved block. */
+        // Go through all the pins in the moved block.
         for (ClusterPinId blk_pin : clb_nlist.block_pins(blk_id)) {
             bool is_src_moving = false;
             if (clb_nlist.pin_type(blk_pin) == PinType::SINK) {
@@ -1535,14 +1619,14 @@ void NetCostHandler::find_affected_nets_and_update_costs(const PlaceDelayModel*
         }
     }
 
-    /* Now update the bounding box costs (since the net bounding     *
-     * boxes are up-to-date). The cost is only updated once per net. */
-    set_bb_delta_cost_(bb_delta_c);
+    // Now update the bounding box costs (since the net bounding
+    // boxes are up-to-date). The cost is only updated once per net.
+    set_bb_delta_cost_(bb_delta_c, congestion_delta_c);
 }
 
 void NetCostHandler::update_move_nets() {
-    /* update net cost functions and reset flags. */
-    auto& cluster_ctx = g_vpr_ctx.clustering();
+    // update net cost functions and reset flags.
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
 
     for (const ClusterNetId ts_net : ts_nets_to_update_) {
         ClusterNetId net_id = ts_net;
@@ -1558,18 +1642,27 @@ void NetCostHandler::update_move_nets() {
         }
 
         net_cost_[net_id] = proposed_net_cost_[net_id];
-
-        /* negative proposed_net_cost value is acting as a flag to mean not computed yet. */
+        // negative proposed_net_cost value is acting as a flag to mean not computed yet.
         proposed_net_cost_[net_id] = -1;
+
+        if (congestion_modeling_started_) {
+            net_cong_cost_[net_id] = proposed_net_cong_cost_[net_id];
+            proposed_net_cong_cost_[net_id] = -1;
+        }
+
         bb_update_status_[net_id] = NetUpdateState::NOT_UPDATED_YET;
     }
 }
 
 void NetCostHandler::reset_move_nets() {
-    /* Reset the net cost function flags first. */
-    for (const ClusterNetId ts_net : ts_nets_to_update_) {
-        ClusterNetId net_id = ts_net;
+    // Reset the net cost function flags first.
+    for (const ClusterNetId net_id : ts_nets_to_update_) {
         proposed_net_cost_[net_id] = -1;
+
+        if (congestion_modeling_started_) {
+            proposed_net_cong_cost_[net_id] = -1;
+        }
+
         bb_update_status_[net_id] = NetUpdateState::NOT_UPDATED_YET;
     }
 }
@@ -1588,10 +1681,17 @@ void NetCostHandler::recompute_costs_from_scratch(const PlaceDelayModel* delay_m
         }
     };
 
-    double new_bb_cost = recompute_bb_cost_();
+    auto [new_bb_cost, new_cong_cost] = recompute_bb_cong_cost_();
     check_and_print_cost(new_bb_cost, costs.bb_cost, "bb_cost");
     costs.bb_cost = new_bb_cost;
 
+    if (congestion_modeling_started_) {
+        check_and_print_cost(new_cong_cost, costs.congestion_cost, "cong_cost");
+        costs.congestion_cost = new_cong_cost;
+    } else {
+        costs.congestion_cost = 0.;
+    }
+
     if (placer_opts_.place_algorithm.is_timing_driven()) {
         double new_timing_cost = 0.;
         comp_td_costs(delay_model, *criticalities, placer_state_, &new_timing_cost);
@@ -1607,8 +1707,8 @@ double NetCostHandler::get_total_wirelength_estimate() const {
     const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
 
     double estimated_wirelength = 0.0;
-    for (ClusterNetId net_id : clb_nlist.nets()) { /* for each net ... */
-        if (!clb_nlist.net_is_ignored(net_id)) {   /* Do only if not ignored. */
+    for (ClusterNetId net_id : clb_nlist.nets()) {
+        if (!clb_nlist.net_is_ignored(net_id)) {
             if (cube_bb_) {
                 estimated_wirelength += get_net_wirelength_estimate_(net_id);
             } else {
@@ -1620,9 +1720,182 @@ double NetCostHandler::get_total_wirelength_estimate() const {
     return estimated_wirelength;
 }
 
+double NetCostHandler::estimate_routing_chan_util(bool compute_congestion_cost /*=true*/) {
+    const auto& cluster_ctx = g_vpr_ctx.clustering();
+    const DeviceContext& device_ctx = g_vpr_ctx.device();
+
+    const size_t grid_width = device_ctx.grid.width();
+    const size_t grid_height = device_ctx.grid.height();
+    const size_t num_layers = device_ctx.grid.get_num_layers();
+    const size_t num_nets = g_vpr_ctx.clustering().clb_nlist.nets().size();
+
+    // Congestion-related data members are allocated the first time this method is called
+    // to enable congestion modeling. This lazy allocation helps save memory when congestion
+    // modeling is not used.
+    if (!congestion_modeling_started_) {
+        congestion_modeling_started_ = true;
+
+        chan_util_.x = vtr::NdMatrix<double, 3>({{num_layers, grid_width, grid_height}}, 0);
+        chan_util_.y = vtr::NdMatrix<double, 3>({{num_layers, grid_width, grid_height}}, 0);
+
+        acc_chan_util_.x = vtr::PrefixSum2D<double>(grid_width,
+                                                    grid_height,
+                                                    [&](size_t x, size_t y) {
+                                                        return chan_util_.x[0][x][y];
+                                                    });
+
+        acc_chan_util_.y = vtr::PrefixSum2D<double>(grid_width,
+                                                    grid_height,
+                                                    [&](size_t x, size_t y) {
+                                                        return chan_util_.y[0][x][y];
+                                                    });
+
+        ts_avg_chan_util_new_.resize(num_nets, {0., 0.});
+        avg_chan_util_.resize(num_nets, {0., 0.});
+        net_cong_cost_.resize(num_nets, -1.);
+        proposed_net_cong_cost_.resize(num_nets, -1.);
+    }
+
+    chan_util_.x.fill(0.);
+    chan_util_.y.fill(0.);
+
+    // For each net, this function estimates routing channel utilization by distributing
+    // the net's expected wirelength across its bounding box. The expected wirelength
+    // for each dimension (x, y) is computed proportionally based on the bounding box size
+    // in each direction. The wirelength in each dimension is then **evenly spread** across
+    // all grid locations within the bounding box, and the demand is accumulated in
+    // the channel utilization matrices.
+
+    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+
+            if (cube_bb_) {
+                const t_bb& bb = bb_coords_[net_id];
+                double expected_wirelength = get_net_wirelength_estimate_(net_id);
+
+                int distance_x = bb.xmax - bb.xmin + 1;
+                int distance_y = bb.ymax - bb.ymin + 1;
+                int distance_z = bb.layer_max - bb.layer_min + 1;
+
+                double expected_x_wl = (double)distance_x / (distance_x + distance_y) * expected_wirelength;
+                double expected_y_wl = expected_wirelength - expected_x_wl;
+
+                int total_channel_segments = distance_x * distance_y * distance_z;
+                double expected_per_x_segment_wl = expected_x_wl / total_channel_segments;
+                double expected_per_y_segment_wl = expected_y_wl / total_channel_segments;
+
+                for (int layer = bb.layer_min; layer <= bb.layer_max; layer++) {
+                    for (int x = bb.xmin; x <= bb.xmax; x++) {
+                        for (int y = bb.ymin; y <= bb.ymax; y++) {
+                            chan_util_.x[layer][x][y] += expected_per_x_segment_wl;
+                            chan_util_.y[layer][x][y] += expected_per_y_segment_wl;
+                        }
+                    }
+                }
+            } else {
+                const std::vector<t_2D_bb>& bb = layer_bb_coords_[net_id];
+                const vtr::NdMatrixProxy<int, 1> net_layer_pin_sink_count = num_sink_pin_layer_[size_t(net_id)];
+
+                for (size_t layer = 0; layer < bb.size(); layer++) {
+                    if (net_layer_pin_sink_count[layer] == 0) {
+                        continue;
+                    }
+
+                    double crossing = wirelength_crossing_count(net_layer_pin_sink_count[layer] + 1);
+                    double expected_wirelength = ((bb[layer].xmax - bb[layer].xmin + 1) + (bb[layer].ymax - bb[layer].ymin + 1)) * crossing;
+
+                    int distance_x = bb[layer].xmax - bb[layer].xmin + 1;
+                    int distance_y = bb[layer].ymax - bb[layer].ymin + 1;
+
+                    double expected_x_wl = (double)distance_x / (distance_x + distance_y) * expected_wirelength;
+                    double expected_y_wl = expected_wirelength - expected_x_wl;
+
+                    int total_channel_segments = distance_x * distance_y;
+                    double expected_per_x_segment_wl = expected_x_wl / total_channel_segments;
+                    double expected_per_y_segment_wl = expected_y_wl / total_channel_segments;
+
+                    for (int x = bb[layer].xmin; x <= bb[layer].xmax; x++) {
+                        for (int y = bb[layer].ymin; y <= bb[layer].ymax; y++) {
+                            chan_util_.x[layer][x][y] += expected_per_x_segment_wl;
+                            chan_util_.y[layer][x][y] += expected_per_y_segment_wl;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Channel width is computed only once and reused in later calls.
+    if (chan_width_.x.empty()) {
+        VTR_ASSERT(chan_width_.y.empty());
+        std::tie(chan_width_.x, chan_width_.y) = calculate_channel_width();
+    }
+
+    VTR_ASSERT(chan_util_.x.size() == chan_util_.y.size());
+    VTR_ASSERT(chan_util_.x.size() == chan_width_.x.size());
+    VTR_ASSERT(chan_util_.y.size() == chan_width_.y.size());
+
+    // Normalize channel utilizations by dividing by the corresponding channel widths.
+    // If a channel does not exist (i.e., its width is zero), we set its utilization to 1
+    // to avoid division by zero.
+    for (size_t layer = 0; layer < num_layers; ++layer) {
+        for (size_t x = 0; x < grid_width; ++x) {
+            for (size_t y = 0; y < grid_height; ++y) {
+                if (chan_width_.x[layer][x][y] > 0) {
+                    chan_util_.x[layer][x][y] /= chan_width_.x[layer][x][y];
+                } else {
+                    VTR_ASSERT_SAFE(chan_width_.x[layer][x][y] == 0);
+                    chan_util_.x[layer][x][y] = 1.;
+                }
+
+                if (chan_width_.y[layer][x][y] > 0) {
+                    chan_util_.y[layer][x][y] /= chan_width_.y[layer][x][y];
+                } else {
+                    VTR_ASSERT_SAFE(chan_width_.y[layer][x][y] == 0);
+                    chan_util_.y[layer][x][y] = 1.;
+                }
+            }
+        }
+    }
+
+    // For now, congestion modeling in the placement stage is limited to a single die
+    // TODO: extend it to multiple dice
+    acc_chan_util_.x = vtr::PrefixSum2D<double>(grid_width,
+                                                grid_height,
+                                                [&](size_t x, size_t y) {
+                                                    return chan_util_.x[0][x][y];
+                                                });
+
+    acc_chan_util_.y = vtr::PrefixSum2D<double>(grid_width,
+                                                grid_height,
+                                                [&](size_t x, size_t y) {
+                                                    return chan_util_.y[0][x][y];
+                                                });
+
+    double cong_cost = 0.;
+    // Compute congestion cost using computed bounding boxes and channel utilization map
+    if (compute_congestion_cost) {
+        for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
+            if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
+                net_cong_cost_[net_id] = get_net_cube_cong_cost_(net_id, /*use_ts=*/false);
+                cong_cost += net_cong_cost_[net_id];
+            }
+        }
+    }
+
+    return cong_cost;
+}
+
+const ChannelData<vtr::NdMatrix<double, 3>>& NetCostHandler::get_chan_util() const {
+    return chan_util_;
+}
+
 void NetCostHandler::set_ts_bb_coord_(const ClusterNetId net_id) {
     if (cube_bb_) {
         bb_coords_[net_id] = ts_bb_coord_new_[net_id];
+        if (congestion_modeling_started_) {
+            avg_chan_util_[net_id] = ts_avg_chan_util_new_[net_id];
+        }
     } else {
         layer_bb_coords_[net_id] = layer_ts_bb_coord_new_[net_id];
     }
@@ -1746,115 +2019,3 @@ std::pair<t_bb, t_bb> NetCostHandler::union_2d_bb_incr(ClusterNetId net_id) cons
 
     return std::make_pair(merged_num_edge, merged_bb);
 }
-
-std::pair<vtr::NdMatrix<double, 3>, vtr::NdMatrix<double, 3>> NetCostHandler::estimate_routing_chan_util() const {
-    const auto& cluster_ctx = g_vpr_ctx.clustering();
-    const auto& device_ctx = g_vpr_ctx.device();
-
-    auto chanx_util = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
-                                                 device_ctx.grid.width(),
-                                                 device_ctx.grid.height()}},
-                                               0);
-
-    auto chany_util = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
-                                                 device_ctx.grid.width(),
-                                                 device_ctx.grid.height()}},
-                                               0);
-
-    // For each net, this function estimates routing channel utilization by distributing
-    // the net's expected wirelength across its bounding box. The expected wirelength
-    // for each dimension (x, y) is computed proportionally based on the bounding box size
-    // in each direction. The wirelength in each dimension is then **evenly spread** across
-    // all grid locations within the bounding box, and the demand is accumulated in
-    // the channel utilization matrices.
-
-    for (ClusterNetId net_id : cluster_ctx.clb_nlist.nets()) {
-        if (!cluster_ctx.clb_nlist.net_is_ignored(net_id)) {
-
-            if (cube_bb_) {
-                const t_bb& bb = bb_coords_[net_id];
-                double expected_wirelength = get_net_wirelength_estimate_(net_id);
-
-                int distance_x = bb.xmax - bb.xmin + 1;
-                int distance_y = bb.ymax - bb.ymin + 1;
-                int distance_z = bb.layer_max - bb.layer_min + 1;
-
-                double expected_x_wl = (double)distance_x / (distance_x + distance_y) * expected_wirelength;
-                double expected_y_wl = expected_wirelength - expected_x_wl;
-
-                int total_channel_segments = distance_x * distance_y * distance_z;
-                double expected_per_x_segment_wl = expected_x_wl / total_channel_segments;
-                double expected_per_y_segment_wl = expected_y_wl / total_channel_segments;
-
-                for (int layer = bb.layer_min; layer <= bb.layer_max; layer++) {
-                    for (int x = bb.xmin; x <= bb.xmax; x++) {
-                        for (int y = bb.ymin; y <= bb.ymax; y++) {
-                            chanx_util[layer][x][y] += expected_per_x_segment_wl;
-                            chany_util[layer][x][y] += expected_per_y_segment_wl;
-                        }
-                    }
-                }
-            } else {
-                const std::vector<t_2D_bb>& bb = layer_bb_coords_[net_id];
-                const vtr::NdMatrixProxy<int, 1> net_layer_pin_sink_count = num_sink_pin_layer_[size_t(net_id)];
-
-                for (size_t layer = 0; layer < bb.size(); layer++) {
-                    if (net_layer_pin_sink_count[layer] == 0) {
-                        continue;
-                    }
-
-                    double crossing = wirelength_crossing_count(net_layer_pin_sink_count[layer] + 1);
-                    double expected_wirelength = ((bb[layer].xmax - bb[layer].xmin + 1) + (bb[layer].ymax - bb[layer].ymin + 1)) * crossing;
-
-                    int distance_x = bb[layer].xmax - bb[layer].xmin + 1;
-                    int distance_y = bb[layer].ymax - bb[layer].ymin + 1;
-
-                    double expected_x_wl = (double)distance_x / (distance_x + distance_y) * expected_wirelength;
-                    double expected_y_wl = expected_wirelength - expected_x_wl;
-
-                    int total_channel_segments = distance_x * distance_y;
-                    double expected_per_x_segment_wl = expected_x_wl / total_channel_segments;
-                    double expected_per_y_segment_wl = expected_y_wl / total_channel_segments;
-
-                    for (int x = bb[layer].xmin; x <= bb[layer].xmax; x++) {
-                        for (int y = bb[layer].ymin; y <= bb[layer].ymax; y++) {
-                            chanx_util[layer][x][y] += expected_per_x_segment_wl;
-                            chany_util[layer][x][y] += expected_per_y_segment_wl;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    const auto [chanx_width, chany_width] = calculate_channel_width();
-
-    VTR_ASSERT(chanx_util.size() == chany_util.size());
-    VTR_ASSERT(chanx_util.ndims() == chany_util.ndims());
-    VTR_ASSERT(chanx_util.size() == chanx_width.size());
-    VTR_ASSERT(chanx_util.ndims() == chanx_width.ndims());
-    VTR_ASSERT(chany_util.size() == chany_width.size());
-    VTR_ASSERT(chany_util.ndims() == chany_width.ndims());
-
-    for (size_t layer = 0; layer < chanx_util.dim_size(0); ++layer) {
-        for (size_t x = 0; x < chanx_util.dim_size(1); ++x) {
-            for (size_t y = 0; y < chanx_util.dim_size(2); ++y) {
-                if (chanx_width[layer][x][y] > 0) {
-                    chanx_util[layer][x][y] /= chanx_width[layer][x][y];
-                } else {
-                    VTR_ASSERT_SAFE(chanx_width[layer][x][y] == 0);
-                    chanx_util[layer][x][y] = 1.;
-                }
-
-                if (chany_width[layer][x][y] > 0) {
-                    chany_util[layer][x][y] /= chany_width[layer][x][y];
-                } else {
-                    VTR_ASSERT_SAFE(chany_width[layer][x][y] == 0);
-                    chany_util[layer][x][y] = 1.;
-                }
-            }
-        }
-    }
-
-    return {chanx_util, chany_util};
-}
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index b36fe9fc6e..8f9d4511e1 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -35,6 +35,13 @@ enum class e_cost_methods {
     CHECK
 };
 
+template<typename T>
+struct ChannelData {
+    T x;
+    T y;
+    // TODO: add Z dimension
+};
+
 class NetCostHandler {
   public:
     NetCostHandler() = delete;
@@ -55,20 +62,20 @@ class NetCostHandler {
     NetCostHandler(const t_placer_opts& placer_opts, PlacerState& placer_state, bool cube_bb);
 
     /**
-     * @brief Finds the bb cost from scratch.
-     * Done only when the placement has been radically changed
-     * (i.e. after initial placement). Otherwise find the cost
+     * @brief Finds the bb cost and congestion cost from scratch.
+     * @details Done only when the placement has been radically changed
+     * (i.e. after initial placement). Otherwise, find the cost
      * change incrementally. If method check is NORMAL, we find
      * bounding boxes that are updatable for the larger nets.
      * If method is CHECK, all bounding boxes are found via the
      * non_updateable_bb routine, to provide a cost which can be
      * used to check the correctness of the other routine.
      * @param method The method used to calculate placement cost.
-     * @return (bounding box cost of the placement, estimated wirelength)
+     * @return (bounding box cost of the placement, estimated wirelength, congestion cost)
      *
      * @note The returned estimated wirelength is valid only when method == CHECK
      */
-    std::pair<double, double> comp_bb_cost(e_cost_methods method);
+    std::tuple<double, double, double> comp_bb_cong_cost(e_cost_methods method);
 
     /**
      * @brief Find all the nets and pins affected by this swap and update costs.
@@ -97,7 +104,8 @@ class NetCostHandler {
                                              const PlacerCriticalities* criticalities,
                                              t_pl_blocks_to_be_moved& blocks_affected,
                                              double& bb_delta_c,
-                                             double& timing_delta_c);
+                                             double& timing_delta_c,
+                                             double& congestion_delta_c);
 
     /**
      * @brief Reset the net cost function flags (proposed_net_cost and bb_updated_before)
@@ -107,7 +115,6 @@ class NetCostHandler {
     /**
      * @brief Update net cost data structures (in placer context and net_cost in .cpp file)
      * and reset flags (proposed_net_cost and bb_updated_before).
-     * @param num_nets_affected The number of nets affected by the move.
      * It is used to determine the index up to which elements in ts_nets_to_update are valid.
      */
     void update_move_nets();
@@ -117,7 +124,6 @@ class NetCostHandler {
      * and update "costs" accordingly. It is important to note that in this function bounding box
      * and connection delays are not calculated from scratch. However, it iterates over all nets
      * and connections and updates their costs by a complete summation, rather than incrementally.
-     * @param noc_opts Contains NoC cost weighting factors.
      * @param delay_model Placement delay model. Used to compute timing cost.
      * @param criticalities Contains the clustered netlist connection criticalities.
      * Used to computed timing cost .
@@ -133,33 +139,44 @@ class NetCostHandler {
     double get_total_wirelength_estimate() const;
 
     /**
-     * @brief Estimates routing channel utilization.
+     * @brief Estimates routing channel utilization and computes the congestion cost
+     * for each net.
+     * @param compute_congestion_cost Indicates whether computing congestion cost is needed.
      *
      * For each net, distributes estimated wirelength across its bounding box
      * and accumulates demand for different routing channels. Normalizes by channel widths
      * (e.g. a value of 0.5 means 50% of the wiring in a channel is expected to be used).
      *
-     * @return Pair of matrices with relative CHANX and CHANY utilization.
-     *         The dimension order for each matrix is [layer][x][y].
+     * @note This method assumes that net bounding boxes are already computed.
+     *
+     * @return Total congestion cost if requested.
      */
-    std::pair<vtr::NdMatrix<double, 3>, vtr::NdMatrix<double, 3>> estimate_routing_chan_util() const;
+    double estimate_routing_chan_util(bool compute_congestion_cost = true);
+
+    /**
+     * @brief Returns the estimated routing channel usage for each location in the grid.
+     *        The channel usage estimates are computed in estimate_routing_chan_util().
+     */
+    const ChannelData<vtr::NdMatrix<double, 3>>& get_chan_util() const;
 
   private:
-    ///@brief Specifies whether the bounding box is computed using cube method or per-layer method.
+    /// Indicates whether congestion cost modeling is enabled.
+    bool congestion_modeling_started_;
+    /// Specifies whether the bounding box is computed using cube method or per-layer method.
     bool cube_bb_;
-    ///@brief Determines whether the FPGA has multiple dies (layers)
+    /// Determines whether the FPGA has multiple dies (layers)
     bool is_multi_layer_;
-    ///@brief A reference to the placer's state to be updated by this object.
+    /// A reference to the placer's state to be updated by this object.
     PlacerState& placer_state_;
-    ///@brief Contains some parameter that determine how the placement cost is computed.
+    /// Contains some parameter that determine how the placement cost is computed.
     const t_placer_opts& placer_opts_;
-    ///@brief Points to the proper method for computing the bounding box cost from scratch.
-    std::function<std::pair<double, double>(e_cost_methods method)> comp_bb_cost_functor_;
-    ///@brief Points to the proper method for updating the bounding box of a net.
+    /// Points to the proper method for computing the bounding box cost, estimated wirelength and congestion cost from scratch.
+    std::function<std::tuple<double, double, double>(e_cost_methods method)> comp_bb_cong_cost_functor_;
+    /// Points to the proper method for updating the bounding box of a net.
     std::function<void(ClusterNetId net_id, t_physical_tile_loc pin_old_loc, t_physical_tile_loc pin_new_loc, bool is_driver)> update_bb_functor_;
-    ///@brief Points to the proper method for getting the bounding box cost of a net
+    /// Points to the proper method for getting the bounding box cost of a net
     std::function<double(ClusterNetId)> get_net_bb_cost_functor_;
-    ///@brief Points to the proper method for getting the non-updatable bounding box of a net
+    /// Points to the proper method for getting the non-updatable bounding box of a net
     std::function<void(const ClusterNetId net)> get_non_updatable_bb_functor_;
 
     /**
@@ -195,19 +212,28 @@ class NetCostHandler {
     /* [0...num_affected_nets] -> net_id of the affected nets */
     std::vector<ClusterNetId> ts_nets_to_update_;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each of a net's bounding box (to allow efficient updates)
+    vtr::vector<ClusterNetId, std::pair<float, float>> ts_avg_chan_util_new_;
+
+    /// Store the number of blocks on each of a net's bounding box (to allow efficient updates)
+    /// [0..cluster_ctx.clb_nlist.nets().size()-1]
     vtr::vector<ClusterNetId, t_bb> bb_num_on_edges_;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates of a net's bounding box
+    /// Store the bounding box coordinates of a net's bounding box
+    /// [0..cluster_ctx.clb_nlist.nets().size()-1]
     vtr::vector<ClusterNetId, t_bb> bb_coords_;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each of a net's bounding box (to allow efficient updates)
+    vtr::vector<ClusterNetId, std::pair<float, float>> avg_chan_util_;
+
+    /// Store the number of blocks on each of a net's bounding box (to allow efficient updates)
+    /// [0..cluster_ctx.clb_nlist.nets().size()-1]
     vtr::vector<ClusterNetId, std::vector<t_2D_bb>> layer_bb_num_on_edges_;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the bounding box coordinates of a net's bounding box
+    /// Store the bounding box coordinates of a net's bounding box
+    /// [0..cluster_ctx.clb_nlist.nets().size()-1]
     vtr::vector<ClusterNetId, std::vector<t_2D_bb>> layer_bb_coords_;
 
-    // [0..cluster_ctx.clb_nlist.nets().size()-1]. Store the number of blocks on each layer ()
+    /// Store the number of blocks on each layer ()
+    /// [0..cluster_ctx.clb_nlist.nets().size()-1]
     vtr::Matrix<int> num_sink_pin_layer_;
 
     /**
@@ -230,6 +256,19 @@ class NetCostHandler {
      */
     vtr::vector<ClusterNetId, double> net_cost_;
     vtr::vector<ClusterNetId, double> proposed_net_cost_;
+
+    /**
+     * @brief The congestion cost for each net is based on the extent to which its
+     * average routing channel utilization exceeds a predefined threshold.
+     * This is computed by measuring the average utilization within the net's
+     * bounding box and subtracting the congestion threshold.
+     * Only the excess portion contributes to the net's congestion cost.
+     * The valid range is [0...cluster_ctx.clb_nlist.nets().size()-1] when
+     * congestion modeling is enabled. Otherwise, this vector would be empty.
+     */
+    vtr::vector<ClusterNetId, double> net_cong_cost_;
+    vtr::vector<ClusterNetId, double> proposed_net_cong_cost_;
+
     vtr::vector<ClusterNetId, NetUpdateState> bb_update_status_;
 
     /**
@@ -241,14 +280,30 @@ class NetCostHandler {
      * number of tracks in that direction; for other cost functions they
      * will never be used.
      */
-    vtr::PrefixSum1D<int> acc_chanx_width_; // [0..device_ctx.grid.width()-1]
-    vtr::PrefixSum1D<int> acc_chany_width_; // [0..device_ctx.grid.height()-1]
+    ChannelData<vtr::PrefixSum1D<int>> acc_chan_width_;
+
+    /**
+     * @brief Estimated routing usage per channel segment,
+     *        indexed by [layer][x][y]. Values represent normalized wire demand
+     *        contribution from all nets distributed over their bounding boxes.
+     */
+    ChannelData<vtr::NdMatrix<double, 3>> chan_util_;
+
+    /**
+     * @brief Accumulated (prefix sum) channel utilization in each direction (x/y),
+     *        on the base layer. Enables fast computation of average utilization
+     *        over a net’s bounding box during congestion cost estimation.
+     */
+    ChannelData<vtr::PrefixSum2D<double>> acc_chan_util_;
+
+    /// Available channel width per grid location, indexed by [layer][x][y].
+    ChannelData<vtr::NdMatrix<int, 3>> chan_width_;
 
     /**
      * @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in 
      * the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same at all (x,y) locations, so we
      * can't compute the full chanz_place_cost_fac for all possible (xlow,ylow)(xhigh,yhigh) without a 4D array, which would
-     * be too big: O(n^2) in circuit size. Instead we compute a prefix sum that stores the number of inter-die connections per layer from
+     * be too big: O(n^2) in circuit size. Instead, we compute a prefix sum that stores the number of inter-die connections per layer from
      * (x=0,y=0) to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to (xhigh,yhigh) 
      * region in O(1) (by adding and subtracting 4 entries)
      */
@@ -287,9 +342,10 @@ class NetCostHandler {
     /**
      * @brief Calculates and returns the total bb (wirelength) cost change that would result from moving the blocks
      * indicated in the blocks_affected data structure.
-     * @param bb_delta_c Cost difference after and before moving the block
+     * @param bb_delta_c Bounding box cost difference after and before moving the block.
+     * @param congestion_delta_c Congestion cost difference after and before moving the block.
      */
-    void set_bb_delta_cost_(double& bb_delta_c);
+    void set_bb_delta_cost_(double& bb_delta_c, double& congestion_delta_c);
 
     /**
      * @brief Allocates and loads the chanx_place_cost_fac and chany_place_cost_fac arrays with the inverse of
@@ -304,7 +360,7 @@ class NetCostHandler {
 
     /**
      * @brief Allocates and loads acc_tile_num_inter_die_conn_ which contains the accumulative number of inter-die
-     * conntections.
+     * connections.
      *
      * @details This is only useful for multi-die FPGAs.
      */
@@ -377,10 +433,10 @@ class NetCostHandler {
      * @details This routine finds the bounding box of each net from scratch when the bounding box is of type per-layer (i.e. from
      * only the block location information). It updates the coordinate, number of pins on each edge information, and the
      * number of sinks on each layer. It should only be called when the bounding box information is not valid.
-     * @param net_id ID of the net which the moving pin belongs to
-     * @param coords Bounding box coordinates of the net. It is calculated in this function
-     * @param num_on_edges Net's number of blocks on the edges of the bounding box. It is calculated in this function.
-     * @param num_sink_pin_layer Net's number of sinks on each layer, calculated in this function.
+     * @param net_id                ID of the net which the moving pin belongs to
+     * @param coords                Bounding box coordinates of the net. It is calculated in this function
+     * @param num_on_edges          Net's number of blocks on the edges of the bounding box. It is calculated in this function.
+     * @param layer_pin_sink_count  Net's number of sinks on each layer, calculated in this function.
      */
     void get_layer_bb_from_scratch_(ClusterNetId net_id,
                                     std::vector<t_2D_bb>& num_on_edges,
@@ -415,11 +471,9 @@ class NetCostHandler {
      * Currently assumes channels on both sides of the CLBs forming the   edges of the bounding box can be used.
      * Essentially, I am assuming the pins always lie on the outside of the bounding box. The x and y coordinates
      * are the pin's x and y coordinates. IO blocks are considered to be one cell in for simplicity.
-     * @param bb_edge_new Number of blocks on the edges of the bounding box
-     * @param bb_coord_new Coordinates of the bounding box
-     * @param num_sink_pin_layer_new Number of sinks of the given net on each layer
-     * @param pin_old_loc The old location of the moving pin
-     * @param pin_new_loc The new location of the moving pin
+     * @param net_id        Net whose bounding box is to be updated.
+     * @param pin_old_loc   The old location of the moving pin
+     * @param pin_new_loc   The new location of the moving pin
      * @param is_output_pin Is the moving pin of the type output
      */
     void update_layer_bb_(ClusterNetId net_id,
@@ -495,22 +549,23 @@ class NetCostHandler {
     /**
      * @brief Computes the bounding box from scratch using 2D bounding boxes (per-layer mode)
      * @param method The method used to calculate placement cost. Specifies whether the cost is
-     * computed from scratch or incrementally.
-     * @return (bounding box cost of the placement, estimated wirelength)
-     *
+     *        computed from scratch or incrementally.
+     * @return (bounding box cost of the placement, estimated wirelength, congestion cost)
+     * @note Congestion modeling is not supported for per-layer mode, so 0 is returned.
      * @note The returned estimated wirelength is valid only when method == CHECK
      */
-    std::pair<double, double> comp_per_layer_bb_cost_(e_cost_methods method);
+    std::tuple<double, double, double> comp_per_layer_bb_cost_(e_cost_methods method);
 
     /**
      * @brief Computes the bounding box from scratch using 3D bounding boxes (cube mode)
+     *        and calculates BB cost, estimated wirelength, and congestion cost (if enabled).
      * @param method The method used to calculate placement cost. Specifies whether the cost is
-     * computed from scratch or incrementally.
-     * @return (bounding box cost of the placement, estimated wirelength)
+     *               computed from scratch or incrementally.
+     * @return (bounding box cost of the placement, estimated wirelength, congestion cost)
      *
      * @note The returned estimated wirelength is valid only when method == CHECK
      */
-    std::pair<double, double> comp_cube_bb_cost_(e_cost_methods method);
+    std::tuple<double, double, double> comp_cube_bb_cong_cost_(e_cost_methods method);
 
     /**
      * @brief if "net" is not already stored as an affected net, add it in ts_nets_to_update.
@@ -520,20 +575,31 @@ class NetCostHandler {
 
     /**
      * @brief To mitigate round-off errors, every once in a while, the costs of nets are summed up from scratch.
-     * This functions is called to do that for bb cost. It doesn't calculate the BBs from scratch, it would only add the costs again.
-     * @return Total bb (wirelength) cost for the placement
+     *        This function is called to do that for bb and congestion cost.
+     *        It doesn't calculate the BBs or channel usage estimate from scratch,
+     *        it would only add the costs again.
+     * @return (total bb cost, total congestion cost)
      */
-    double recompute_bb_cost_();
+    std::pair<double, double> recompute_bb_cong_cost_();
 
     /**
      * @brief Given the 3D BB, calculate the wire-length cost of the net
-     * @param net_id ID of the net which cost is requested.
+     * @param net_id ID of the net whose cost is requested.
      * @param use_ts Specifies if the bounding box is retrieved from ts data structures
-     * or move context.
+     *               or permanent data structures.
      * @return Wirelength cost of the net
      */
     double get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts);
 
+    /**
+     * @brief Calculate the congestion cost of net using its 3D bounding box.
+     * @param net_id ID of the net whose cost is requested.
+     * @param use_ts Specifies if the bounding box is retrieved from ts data structures
+     *               or move context.
+     * @return Congestion cost of the net
+     */
+    double get_net_cube_cong_cost_(ClusterNetId net_id, bool use_ts);
+
     /**
      * @brief Given the per-layer BB, calculate the wire-length cost of the net on each layer
      * and return the sum of the costs
@@ -556,10 +622,10 @@ class NetCostHandler {
      */
     template<typename BBT>
     std::pair<double, double> get_chanxy_cost_fac_(const BBT& bb) {
-        const int total_chanx_width = acc_chanx_width_.get_sum(bb.ymin, bb.ymax);
+        const int total_chanx_width = acc_chan_width_.x.get_sum(bb.ymin, bb.ymax);
         const double inverse_average_chanx_width = (bb.ymax - bb.ymin + 1.0) / total_chanx_width;
 
-        const int total_chany_width = acc_chany_width_.get_sum(bb.xmin, bb.xmax);
+        const int total_chany_width = acc_chan_width_.y.get_sum(bb.xmin, bb.xmax);
         const double inverse_average_chany_width = (bb.xmax - bb.xmin + 1.0) / total_chany_width;
 
         return {inverse_average_chanx_width, inverse_average_chany_width};
@@ -580,16 +646,14 @@ class NetCostHandler {
     /**
      * @brief Given the 3D BB, calculate the wire-length estimate of the net
      * @param net_id ID of the net which wirelength estimate is requested
-     * @param bb Bounding box of the net
      * @return Wirelength estimate of the net
      */
     double get_net_wirelength_estimate_(ClusterNetId net_id) const;
 
     /**
      * @brief Given the per-layer BB, calculate the wire-length estimate of the net on each layer
-     * and return the sum of the lengths
-     * @param bb Per-layer BB of the net
-     * @param net_layer_pin_sink_count Number of sink pins on each layer for the net
+     *        and return the sum of the lengths
+     * @param net_id Net whose weirelength is to be estimated.
      * @return Wirelength estimate of the net
      */
     double get_net_wirelength_from_layer_bb_(ClusterNetId net_id) const;
diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp
index c5d46b5af3..9e57362338 100644
--- a/vpr/src/place/place.cpp
+++ b/vpr/src/place/place.cpp
@@ -38,10 +38,9 @@ void try_place(const Netlist<>& net_list,
                const FlatPlacementInfo& flat_placement_info,
                bool is_flat) {
 
-    /* Currently, the functions that require is_flat as their parameter and are called during placement should
-     * receive is_flat as false. For example, if the RR graph of router lookahead is built here, it should be as
-     * if is_flat is false, even if is_flat is set to true from the command line.
-     */
+    // Currently, the functions that require is_flat as their parameter and are called during placement should
+    // receive is_flat as false. For example, if the RR graph of router lookahead is built here, it should be as
+    // if is_flat is false, even if is_flat is set to true from the command line
     VTR_ASSERT(!is_flat);
     const auto& device_ctx = g_vpr_ctx.device();
     const auto& cluster_ctx = g_vpr_ctx.clustering();
@@ -52,6 +51,10 @@ void try_place(const Netlist<>& net_list,
     // Initialize the variables in the placement context.
     mutable_placement.init_placement_context(placer_opts, directs);
 
+    if (mutable_floorplanning.cluster_constraints.empty()) {
+        mutable_floorplanning.update_floorplanning_context_post_pack();
+    }
+
     // Update the floorplanning constraints with the macro information from the
     // placement context.
     mutable_floorplanning.update_floorplanning_context_pre_place(*mutable_placement.place_macros);
@@ -60,23 +63,22 @@ void try_place(const Netlist<>& net_list,
     VTR_LOG("Bounding box mode is %s\n", (mutable_placement.cube_bb ? "Cube" : "Per-layer"));
     VTR_LOG("\n");
 
-    /* To make sure the importance of NoC-related cost terms compared to
-     * BB and timing cost is determine only through NoC placement weighting factor,
-     * we normalize NoC-related cost weighting factors so that they add up to 1.
-     * With this normalization, NoC-related cost weighting factors only determine
-     * the relative importance of NoC cost terms with respect to each other, while
-     * the importance of total NoC cost to conventional placement cost is determined
-     * by NoC placement weighting factor.
-     * FIXME: This should not be modifying the NoC Opts here, this normalization
-     *        should occur when these Opts are loaded in.
-     */
+    // To make sure the importance of NoC-related cost terms compared to
+    // BB and timing cost is determine only through NoC placement weighting factor,
+    // we normalize NoC-related cost weighting factors so that they add up to 1.
+    // With this normalization, NoC-related cost weighting factors only determine
+    // the relative importance of NoC cost terms with respect to each other, while
+    // the importance of total NoC cost to conventional placement cost is determined
+    // by NoC placement weighting factor.
+    // FIXME: This should not be modifying the NoC Opts here, this normalization
+    //        should occur when these Opts are loaded in.
     if (noc_opts.noc) {
         normalize_noc_cost_weighting_factor(const_cast<t_noc_opts&>(noc_opts));
     }
 
-    /* Placement delay model is independent of the placement and can be shared across
-     * multiple placers if we are performing parallel annealing.
-     * So, it is created and initialized once. */
+    // Placement delay model is independent of the placement and can be shared across
+    // multiple placers if we are performing parallel annealing.
+    // So, it is created and initialized once. */
     std::shared_ptr<PlaceDelayModel> place_delay_model;
 
     if (placer_opts.place_algorithm.is_timing_driven()) {
@@ -95,15 +97,13 @@ void try_place(const Netlist<>& net_list,
         }
     }
 
-    /* Make the global instance of BlkLocRegistry inaccessible through the getter methods of the
-     * placement context. This is done to make sure that the placement stage only accesses its
-     * own local instances of BlkLocRegistry.
-     */
+    // Make the global instance of BlkLocRegistry inaccessible through the getter methods of the
+    // placement context. This is done to make sure that the placement stage only accesses its
+    // own local instances of BlkLocRegistry.
     mutable_placement.lock_loc_vars();
 
-    /* Start measuring placement time. The measured execution time will be printed
-     * when this object goes out of scope at the end of this function.
-     */
+    // Start measuring placement time. The measured execution time will be printed
+    // when this object goes out of scope at the end of this function.
     vtr::ScopedStartFinishTimer placement_timer("Placement");
 
     // Enables fast look-up pb graph pins from block pin indices
@@ -117,10 +117,9 @@ void try_place(const Netlist<>& net_list,
 
     placer.place();
 
-    /* The placer object has its own copy of block locations and doesn't update
-     * the global context directly. We need to copy its internal data structures
-     * to the global placement context before it goes out of scope.
-     */
+    // The placer object has its own copy of block locations and doesn't update
+    // the global context directly. We need to copy its internal data structures
+    // to the global placement context before it goes out of scope.
     placer.update_global_state();
 
     // Clean the variables in the placement context. This will deallocate memory
@@ -154,7 +153,7 @@ static void update_screen_debug();
 
 //Performs a major (i.e. interactive) placement screen update.
 //This function with no arguments is useful for calling from a debugger to
-//look at the intermediate implemetnation state.
+//look at the intermediate implementation state.
 static void update_screen_debug() {
     update_screen(ScreenUpdatePriority::MAJOR, "DEBUG", PLACEMENT, nullptr);
 }
diff --git a/vpr/src/place/place_constraints.cpp b/vpr/src/place/place_constraints.cpp
index ef867ce5b1..fdad4813cb 100644
--- a/vpr/src/place/place_constraints.cpp
+++ b/vpr/src/place/place_constraints.cpp
@@ -209,7 +209,7 @@ void load_cluster_constraints() {
 
     floorplanning_ctx.cluster_constraints.resize(cluster_ctx.clb_nlist.blocks().size());
 
-    for (auto cluster_id : cluster_ctx.clb_nlist.blocks()) {
+    for (ClusterBlockId cluster_id : cluster_ctx.clb_nlist.blocks()) {
         const std::unordered_set<AtomBlockId>& atoms = cluster_ctx.atoms_lookup[cluster_id];
         PartitionRegion empty_pr;
         floorplanning_ctx.cluster_constraints[cluster_id] = empty_pr;
diff --git a/vpr/src/place/place_util.cpp b/vpr/src/place/place_util.cpp
index 7fdf3383a0..e206037bb3 100644
--- a/vpr/src/place/place_util.cpp
+++ b/vpr/src/place/place_util.cpp
@@ -11,13 +11,22 @@
 #include "noc_place_utils.h"
 
 void t_placer_costs::update_norm_factors() {
+    const ClusteredNetlist& clustered_nlist = g_vpr_ctx.clustering().clb_nlist;
+
+    bb_cost_norm = 1 / bb_cost;
+
+    if (congestion_cost > 0.) {
+        congestion_cost_norm = 1 / congestion_cost;
+    } else {
+        congestion_cost_norm = 1. / (double)clustered_nlist.nets().size();
+    }
+
     if (place_algorithm.is_timing_driven()) {
-        bb_cost_norm = 1 / bb_cost;
-        //Prevent the norm factor from going to infinity
+        // Prevent the norm factor from going to infinity
         timing_cost_norm = std::min(1 / timing_cost, MAX_INV_TIMING_COST);
     } else {
-        VTR_ASSERT_SAFE(place_algorithm == e_place_algorithm::BOUNDING_BOX_PLACE);
-        bb_cost_norm = 1 / bb_cost; //Updating the normalization factor in bounding box mode since the cost in this mode is determined after normalizing the wirelength cost
+        // Timing normalization factor is not used
+        timing_cost_norm = std::numeric_limits<double>::quiet_NaN();
     }
 
     if (noc_enabled) {
@@ -36,6 +45,8 @@ double t_placer_costs::get_total_cost(const t_placer_opts& placer_opts, const t_
         total_cost = (1 - placer_opts.timing_tradeoff) * (bb_cost * bb_cost_norm) + (placer_opts.timing_tradeoff) * (timing_cost * timing_cost_norm);
     }
 
+    total_cost += placer_opts.congestion_factor * congestion_cost * congestion_cost_norm;
+
     if (noc_opts.noc) {
         // in noc mode we include noc aggregate bandwidth, noc latency, and noc congestion
         total_cost += calculate_noc_cost(noc_cost_terms, noc_cost_norm_factors, noc_opts);
@@ -65,7 +76,7 @@ int get_place_inner_loop_num_move(const t_placer_opts& placer_opts, const t_anne
         move_lim = int(annealing_sched.inner_num * pow(device_size, 2. / 3.) * pow(num_blocks, 2. / 3.));
     }
 
-    /* Avoid having a non-positive move_lim */
+    // Avoid having a non-positive move_lim
     move_lim = std::max(move_lim, 1);
 
     return move_lim;
@@ -76,6 +87,7 @@ void t_placer_statistics::reset() {
     av_cost = 0.;
     av_bb_cost = 0.;
     av_timing_cost = 0.;
+    av_cong_cost = 0.;
     sum_of_squares = 0.;
     success_sum = 0;
     success_rate = 0.;
@@ -88,6 +100,7 @@ void t_placer_statistics::single_swap_update(const t_placer_costs& costs) {
     av_cost += costs.cost;
     av_bb_cost += costs.bb_cost;
     av_timing_cost += costs.timing_cost;
+    av_cong_cost += costs.congestion_cost;
     sum_of_squares += (costs.cost) * (costs.cost);
 }
 
@@ -97,10 +110,12 @@ void t_placer_statistics::calc_iteration_stats(const t_placer_costs& costs, int
         av_cost = costs.cost;
         av_bb_cost = costs.bb_cost;
         av_timing_cost = costs.timing_cost;
+        av_cong_cost = costs.congestion_cost;
     } else {
         av_cost /= success_sum;
         av_bb_cost /= success_sum;
         av_timing_cost /= success_sum;
+        av_cong_cost /= success_sum;
     }
     success_rate = success_sum / float(move_lim);
     std_dev = get_std_dev(success_sum, sum_of_squares, av_cost);
diff --git a/vpr/src/place/place_util.h b/vpr/src/place/place_util.h
index f9925cbcc7..0caa10b8d5 100644
--- a/vpr/src/place/place_util.h
+++ b/vpr/src/place/place_util.h
@@ -63,37 +63,33 @@ struct NocCostTerms {
  * values of the previous iteration. However, the divisions are expensive,
  * so we store their multiplicative inverses when they are updated in
  * the outer loop routines to speed up the normalization process.
- *
- *   @param cost The weighted average of the wiring cost and the timing cost.
- *   @param bb_cost The bounding box cost, aka the wiring cost.
- *   @param timing_cost The timing cost, which is connection delay * criticality.
- *
- *   @param bb_cost_norm The normalization factor for the wiring cost.
- *   @param timing_cost_norm The normalization factor for the timing cost, which
- *              is upper-bounded by the value of MAX_INV_TIMING_COST.
- *
- *   @param noc_cost_terms NoC-related cost terms
- *   @param noc_cost_norm_factors Normalization factors for NoC-related cost terms.
- *
- *   @param MAX_INV_TIMING_COST Stops inverse timing cost from going to infinity
- *              with very lax timing constraints, which avoids multiplying by a
- *              gigantic timing_cost_norm when auto-normalizing. The exact value
- *              of this cost has relatively little impact, but should be large
- *              enough to not affect the timing costs computation for normal
- *              constraints.
- *
- *   @param place_algorithm Determines how the member values are updated upon
- *              each temperature change during the placer annealing process.
  */
 class t_placer_costs {
   public: //members
+    /// The weighted average of the wiring cost, the timing cost, and the congestion cost (if enabled)
     double cost = 0.;
+
+    /// The bounding box cost, aka the wiring cost.
     double bb_cost = 0.;
+
+    /// The timing cost, which is connection delay * criticality.
     double timing_cost = 0.;
+
+    /// The congestion cost, which estimates how much routing channels are over-utilized.
+    double congestion_cost = 0.;
+
+    /// The normalization factor for the wiring cost.
     double bb_cost_norm = 0.;
+
+    /// The normalization factor for the timing cost, which is upper-bounded by the value of MAX_INV_TIMING_COST.
     double timing_cost_norm = 0.;
 
+    /// The normalization factor for the congestion cost.
+    double congestion_cost_norm = 0.;
+
+    /// NoC-related cost terms.
     NocCostTerms noc_cost_terms;
+    /// Normalization factors for NoC-related cost terms.
     NocCostTerms noc_cost_norm_factors;
 
   public: //Constructor
@@ -131,7 +127,18 @@ class t_placer_costs {
     t_placer_costs& operator+=(const NocCostTerms& noc_delta_cost);
 
   private:
+    /**
+     * @brief Stops inverse timing cost from going to infinity
+     *         with very lax timing constraints, which avoids multiplying by a
+     *         gigantic timing_cost_norm when auto-normalizing. The exact value
+     *         of this cost has relatively little impact, but should be large
+     *         enough to not affect the timing costs computation for normal
+     *         constraints.
+     */
     static constexpr double MAX_INV_TIMING_COST = 1.e12;
+
+    /// Determines how the member values are updated upon
+    /// each temperature change during the placer annealing process.
     t_place_algorithm place_algorithm;
     bool noc_enabled;
 };
@@ -148,38 +155,30 @@ class t_placer_costs {
  * In terms of calculating statistics for total cost, we mean that we
  * operate upon the set of placer cost values gathered after every
  * accepted block move.
- *
- *   @param av_cost
- *              Average total cost. Cost formulation depends on
- *              the place algorithm currently being used.
- *   @param av_bb_cost
- *              Average bounding box (wiring) cost.
- *   @param av_timing_cost
- *              Average timing cost (delay * criticality).
- *   @param sum_of_squares
- *              Sum of squares of the total cost.
- *   @param success_num
- *              Number of accepted block swaps for the current iteration.
- *   @param success_rate
- *              num_accepted / total_trials for the current iteration.
- *   @param std_dev
- *              Standard deviation of the total cost.
- *
  */
 class t_placer_statistics {
   public:
+    /// Average total cost. Cost formulation depends on the place algorithm currently being used.
     double av_cost;
+    /// Average bounding box (wiring) cost.
     double av_bb_cost;
+    /// Average timing cost (delay * criticality).
     double av_timing_cost;
+    /// Average congestion cost.
+    double av_cong_cost;
+    /// Sum of squares of the total cost.
     double sum_of_squares;
+    /// Number of accepted block swaps for the current iteration.
     int success_sum;
+    /// num_accepted / total_trials for the current iteration.
     float success_rate;
+    /// Standard deviation of the total cost.
     double std_dev;
 
-  public: //Constructor
+  public: // Constructor
     t_placer_statistics() { reset(); }
 
-  public: //Mutator
+  public: // Mutator
     ///@brief Clear all data fields.
     void reset();
 
diff --git a/vpr/src/place/placer.cpp b/vpr/src/place/placer.cpp
index c2980ce984..f264576c62 100644
--- a/vpr/src/place/placer.cpp
+++ b/vpr/src/place/placer.cpp
@@ -117,7 +117,7 @@ Placer::Placer(const Netlist<>& net_list,
     }
 
     // Gets initial cost and loads bounding boxes.
-    costs_.bb_cost = net_cost_handler_.comp_bb_cost(e_cost_methods::NORMAL).first;
+    std::tie(costs_.bb_cost, std::ignore, costs_.congestion_cost) = net_cost_handler_.comp_bb_cong_cost(e_cost_methods::NORMAL);
 
     if (placer_opts.place_algorithm.is_timing_driven()) {
         alloc_and_init_timing_objects_(net_list, analysis_opts);
@@ -254,9 +254,8 @@ void Placer::check_place_() {
 
 int Placer::check_placement_costs_() {
     int error = 0;
-    double timing_cost_check;
 
-    const auto [bb_cost_check, expected_wirelength] = net_cost_handler_.comp_bb_cost(e_cost_methods::CHECK);
+    const auto [bb_cost_check, expected_wirelength, _] = net_cost_handler_.comp_bb_cong_cost(e_cost_methods::CHECK);
 
     if (fabs(bb_cost_check - costs_.bb_cost) > costs_.bb_cost * PL_INCREMENTAL_COST_TOLERANCE) {
         VTR_LOG_ERROR(
@@ -266,6 +265,7 @@ int Placer::check_placement_costs_() {
     }
 
     if (placer_opts_.place_algorithm.is_timing_driven()) {
+        double timing_cost_check;
         comp_td_costs(place_delay_model_.get(), *placer_criticalities_, placer_state_, &timing_cost_check);
         if (fabs(timing_cost_check - costs_.timing_cost) > costs_.timing_cost * PL_INCREMENTAL_COST_TOLERANCE) {
             VTR_LOG_ERROR(
diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp
index 6ae41b5bb5..b5785cc5fc 100644
--- a/vpr/src/route/route_common.cpp
+++ b/vpr/src/route/route_common.cpp
@@ -80,15 +80,12 @@ static bool classes_in_same_block(ParentBlockId blk_id, int first_class_ptc_num,
  * @param route_opts Contains channel utilization threshold and weighting factor
  *                   used to increase initial 'acc_cost' for nodes going through
  *                   congested channels.
- * @param chanx_util Post-placement estimate of CHANX routing utilization per (layer, x, y) location.
- * @param chany_util Post-placement estimate of CHANY routing utilization per (layer, x, y) location.
+ * @param chan_util Post-placement estimate of routing channel utilization per (layer, x, y) location.
  * @return Initial `acc_cost` for the given RR node.
  */
-
 static float comp_initial_acc_cost(RRNodeId node_id,
                                    const t_router_opts& route_opts,
-                                   const vtr::NdMatrix<double, 3>& chanx_util,
-                                   const vtr::NdMatrix<double, 3>& chany_util);
+                                   const ChannelData<vtr::NdMatrix<double, 3>>& chan_util);
 
 /************************** Subroutine definitions ***************************/
 
@@ -436,8 +433,7 @@ void alloc_and_load_rr_node_route_structs(const t_router_opts& router_opts) {
 
 static float comp_initial_acc_cost(RRNodeId node_id,
                                    const t_router_opts& route_opts,
-                                   const vtr::NdMatrix<double, 3>& chanx_util,
-                                   const vtr::NdMatrix<double, 3>& chany_util) {
+                                   const ChannelData<vtr::NdMatrix<double, 3>>& chan_util) {
     const auto& rr_graph = g_vpr_ctx.device().rr_graph;
 
     // The default acc_cost is 1 for all rr_nodes. For routing wires, if they pass through a channel
@@ -457,7 +453,7 @@ static float comp_initial_acc_cost(RRNodeId node_id,
             int y = rr_graph.node_ylow(node_id);
             int layer = rr_graph.node_layer(node_id);
             for (int x = rr_graph.node_xlow(node_id); x <= rr_graph.node_xhigh(node_id); x++) {
-                max_util = std::max(max_util, chanx_util[layer][x][y]);
+                max_util = std::max(max_util, chan_util.x[layer][x][y]);
             }
 
         } else {
@@ -465,7 +461,7 @@ static float comp_initial_acc_cost(RRNodeId node_id,
             int x = rr_graph.node_xlow(node_id);
             int layer = rr_graph.node_layer(node_id);
             for (int y = rr_graph.node_ylow(node_id); y <= rr_graph.node_yhigh(node_id); y++) {
-                max_util = std::max(max_util, chany_util[layer][x][y]);
+                max_util = std::max(max_util, chan_util.y[layer][x][y]);
             }
         }
 
@@ -481,18 +477,18 @@ void reset_rr_node_route_structs(const t_router_opts& route_opts) {
     auto& route_ctx = g_vpr_ctx.mutable_routing();
     const auto& device_ctx = g_vpr_ctx.device();
     const auto& blk_loc_registry = g_vpr_ctx.placement().blk_loc_registry();
-    const bool cube_bb = g_vpr_ctx.placement().cube_bb;
 
     VTR_ASSERT(route_ctx.rr_node_route_inf.size() == size_t(device_ctx.rr_graph.num_nodes()));
 
-    RoutingChanUtilEstimator routing_chan_util_estimator(blk_loc_registry, cube_bb);
-    const auto [chanx_util, chany_util] = routing_chan_util_estimator.estimate_routing_chan_util();
+    // RoutingChanUtilEstimator assumes cube bounding box
+    RoutingChanUtilEstimator routing_chan_util_estimator(blk_loc_registry);
+    const ChannelData<vtr::NdMatrix<double, 3>> chan_util = routing_chan_util_estimator.estimate_routing_chan_util();
 
     for (const RRNodeId rr_id : device_ctx.rr_graph.nodes()) {
         t_rr_node_route_inf& node_inf = route_ctx.rr_node_route_inf[rr_id];
 
         node_inf.prev_edge = RREdgeId::INVALID();
-        node_inf.acc_cost = comp_initial_acc_cost(rr_id, route_opts, chanx_util, chany_util);
+        node_inf.acc_cost = comp_initial_acc_cost(rr_id, route_opts, chan_util);
         node_inf.path_cost = std::numeric_limits<float>::infinity();
         node_inf.backward_path_cost = std::numeric_limits<float>::infinity();
         node_inf.set_occ(0);
diff --git a/vpr/src/route/route_utilization.cpp b/vpr/src/route/route_utilization.cpp
index c01d0a62c8..b8cb5b4c5d 100644
--- a/vpr/src/route/route_utilization.cpp
+++ b/vpr/src/route/route_utilization.cpp
@@ -5,15 +5,16 @@
 #include "vpr_utils.h"
 #include "route_common.h"
 
-RoutingChanUtilEstimator::RoutingChanUtilEstimator(const BlkLocRegistry& blk_loc_registry, bool cube_bb) {
+RoutingChanUtilEstimator::RoutingChanUtilEstimator(const BlkLocRegistry& blk_loc_registry) {
     placer_state_ = std::make_unique<PlacerState>(/*placement_is_timing_driven=*/false);
     placer_state_->mutable_blk_loc_registry() = blk_loc_registry;
 
     placer_opts_.place_algorithm = e_place_algorithm::BOUNDING_BOX_PLACE;
-    net_cost_handler_ = std::make_unique<NetCostHandler>(placer_opts_, *placer_state_, cube_bb);
+    /// RoutingChanUtilEstimator uses cube bounding box
+    net_cost_handler_ = std::make_unique<NetCostHandler>(placer_opts_, *placer_state_, /*cube_bb=*/true);
 }
 
-std::pair<vtr::NdMatrix<double, 3>, vtr::NdMatrix<double, 3>> RoutingChanUtilEstimator::estimate_routing_chan_util() {
+ChannelData<vtr::NdMatrix<double, 3>> RoutingChanUtilEstimator::estimate_routing_chan_util() {
     const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
     const auto& block_locs = placer_state_->block_locs();
 
@@ -21,24 +22,28 @@ std::pair<vtr::NdMatrix<double, 3>, vtr::NdMatrix<double, 3>> RoutingChanUtilEst
 
     if (placement_is_done) {
         // Compute net bounding boxes
-        net_cost_handler_->comp_bb_cost(e_cost_methods::NORMAL);
+        net_cost_handler_->comp_bb_cong_cost(e_cost_methods::NORMAL);
 
-        // Estimate routing channel utilization using
-        return net_cost_handler_->estimate_routing_chan_util();
+        // Estimate routing channel usage
+        net_cost_handler_->estimate_routing_chan_util(/*compute_congestion_cost=*/false);
+
+        return net_cost_handler_->get_chan_util();
     } else {
         const auto& device_ctx = g_vpr_ctx.device();
 
-        auto chanx_util = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
-                                                     device_ctx.grid.width(),
-                                                     device_ctx.grid.height()}},
-                                                   0);
+        ChannelData<vtr::NdMatrix<double, 3>> chan_util;
+
+        chan_util.x = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
+                                                 device_ctx.grid.width(),
+                                                 device_ctx.grid.height()}},
+                                               0);
 
-        auto chany_util = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
-                                                     device_ctx.grid.width(),
-                                                     device_ctx.grid.height()}},
-                                                   0);
+        chan_util.y = vtr::NdMatrix<double, 3>({{(size_t)device_ctx.grid.get_num_layers(),
+                                                 device_ctx.grid.width(),
+                                                 device_ctx.grid.height()}},
+                                               0);
 
-        return {chanx_util, chany_util};
+        return chan_util;
     }
 }
 
diff --git a/vpr/src/route/route_utilization.h b/vpr/src/route/route_utilization.h
index 8e71e73375..5a89247eaf 100644
--- a/vpr/src/route/route_utilization.h
+++ b/vpr/src/route/route_utilization.h
@@ -8,15 +8,15 @@
 
 /**
  * @class RoutingChanUtilEstimator
- * @brief This class computes the net bounding boxes and estimates the routing channel utilization
+ * @brief This class computes the net bounding boxes (cube mode) and estimates the routing channel utilization
  * for each CHANX/CHANY channel by smearing the estimated wirelength for each net across all channels
  * within its bounding box.
  */
 class RoutingChanUtilEstimator {
   public:
-    RoutingChanUtilEstimator(const BlkLocRegistry& blk_loc_registry, bool cube_bb);
+    RoutingChanUtilEstimator(const BlkLocRegistry& blk_loc_registry);
 
-    std::pair<vtr::NdMatrix<double, 3>, vtr::NdMatrix<double, 3>> estimate_routing_chan_util();
+    ChannelData<vtr::NdMatrix<double, 3>> estimate_routing_chan_util();
 
   private:
     std::unique_ptr<PlacerState> placer_state_;