diff --git a/.github/actions/setup-rendering-deps/action.yml b/.github/actions/setup-rendering-deps/action.yml index 0c92f0291..53116d996 100644 --- a/.github/actions/setup-rendering-deps/action.yml +++ b/.github/actions/setup-rendering-deps/action.yml @@ -18,15 +18,12 @@ inputs: runs: using: 'composite' steps: - - name: Cache APT packages - id: cache-apt + - name: Cache fonts + id: cache-fonts uses: actions/cache@v3 with: - path: | - /var/cache/apt/archives - /usr/share/fonts/truetype/humor-sans - key: apt-packages-${{ runner.os }}-fonts-backend-graphviz-v2 - restore-keys: apt-packages-${{ runner.os }}- + path: /usr/share/fonts/truetype/humor-sans + key: fonts-${{ runner.os }}-humor-sans-v1 - name: Install XKCD fonts if: ${{ inputs.skip-fonts != 'true' }} diff --git a/.github/workflows/notebook-pr.yaml b/.github/workflows/notebook-pr.yaml index 7adbc8411..f73d36852 100644 --- a/.github/workflows/notebook-pr.yaml +++ b/.github/workflows/notebook-pr.yaml @@ -148,33 +148,23 @@ jobs: run: | nb="${{ matrix.notebook }}" dir=$(dirname "$nb") + nb_name=$(basename "$nb" .ipynb) echo "dir=$dir" >> $GITHUB_OUTPUT - # Create a safe artifact name from the notebook path - safe_name=$(echo "$nb" | tr '/' '_' | tr ' ' '_') - echo "artifact_name=$safe_name" >> $GITHUB_OUTPUT + echo "nb_name=$nb_name" >> $GITHUB_OUTPUT + # Use ___ as delimiter (won't appear in paths) so we can restore later + safe_dir=$(echo "$dir" | sed 's|/|___|g') + echo "artifact_name=${safe_dir}___${nb_name}" >> $GITHUB_OUTPUT - - name: Upload processed notebook + - name: Upload processed tutorial directory uses: actions/upload-artifact@v4 with: - name: notebook-${{ steps.get-dir.outputs.artifact_name }} - path: ${{ matrix.notebook }} - retention-days: 1 - - - name: Upload static files - uses: actions/upload-artifact@v4 - if: always() - with: - name: static-${{ steps.get-dir.outputs.artifact_name }} - path: ${{ steps.get-dir.outputs.dir }}/static/ - if-no-files-found: ignore - retention-days: 1 - - - name: 
Upload solutions - uses: actions/upload-artifact@v4 - if: always() - with: - name: solutions-${{ steps.get-dir.outputs.artifact_name }} - path: ${{ steps.get-dir.outputs.dir }}/solutions/ + name: tutorial-${{ steps.get-dir.outputs.artifact_name }} + path: | + ${{ matrix.notebook }} + ${{ steps.get-dir.outputs.dir }}/static/${{ steps.get-dir.outputs.nb_name }}* + ${{ steps.get-dir.outputs.dir }}/solutions/${{ steps.get-dir.outputs.nb_name }}* + ${{ steps.get-dir.outputs.dir }}/student/${{ steps.get-dir.outputs.nb_name }}* + ${{ steps.get-dir.outputs.dir }}/instructor/${{ steps.get-dir.outputs.nb_name }}* if-no-files-found: ignore retention-days: 1 @@ -212,53 +202,26 @@ jobs: run: | echo "Restoring processed files from artifacts..." - # Process notebook artifacts - for dir in artifacts/notebook-*; do + # Artifact name format: tutorial-tutorials___W1D5_Microcircuits___W1D5_Tutorial2 + # The path is encoded with ___ as delimiter. Last segment is notebook name, rest is directory. + # upload-artifact strips common path prefixes, so we need to reconstruct the target directory. 
+ for dir in artifacts/tutorial-*; do if [ -d "$dir" ]; then - echo "Processing $dir" - cp -v "$dir"/*.ipynb tutorials/ 2>/dev/null || true - # Find the actual notebook and copy to correct location - find "$dir" -name "*.ipynb" -exec sh -c ' - for f; do - # Extract original path from artifact structure - rel_path=$(basename "$f") - # Find where this notebook should go based on its name - original=$(find tutorials -name "$rel_path" -type f 2>/dev/null | head -1) - if [ -n "$original" ]; then - cp -v "$f" "$original" - fi - done - ' sh {} + + # Extract artifact name and parse it to get target directory + artifact_name=$(basename "$dir" | sed 's/^tutorial-//') + # Convert ___ to / to get full path, then extract directory + full_path=$(echo "$artifact_name" | sed 's|___|/|g') + tutorial_dir=$(dirname "$full_path") + echo "Restoring from artifact $(basename "$dir") to: $tutorial_dir" + + # Copy all files to the correct tutorial directory + # The artifact contains files with stripped prefixes (e.g., W1D5_Tutorial2.ipynb, student/, solutions/) + mkdir -p "$tutorial_dir" + cp -rv "$dir"/* "$tutorial_dir/" 2>/dev/null || true fi done - # Process static artifacts - for dir in artifacts/static-*; do - if [ -d "$dir" ]; then - echo "Processing static: $dir" - # Extract tutorial path from artifact name - artifact_name=$(basename "$dir" | sed 's/^static-//') - # Convert back: tutorials_W1D1_xxx.ipynb -> tutorials/W1D1_xxx - tutorial_dir=$(echo "$artifact_name" | sed 's/_/\//g' | sed 's/\.ipynb$//' | xargs dirname) - if [ -d "$tutorial_dir" ]; then - mkdir -p "$tutorial_dir/static" - cp -rv "$dir"/* "$tutorial_dir/static/" 2>/dev/null || true - fi - fi - done - - # Process solutions artifacts - for dir in artifacts/solutions-*; do - if [ -d "$dir" ]; then - echo "Processing solutions: $dir" - artifact_name=$(basename "$dir" | sed 's/^solutions-//') - tutorial_dir=$(echo "$artifact_name" | sed 's/_/\//g' | sed 's/\.ipynb$//' | xargs dirname) - if [ -d "$tutorial_dir" ]; then - 
mkdir -p "$tutorial_dir/solutions" - cp -rv "$dir"/* "$tutorial_dir/solutions/" 2>/dev/null || true - fi - fi - done + echo "Restore complete." - name: Verify exercises env: @@ -285,6 +248,9 @@ jobs: python ci/find_unreferenced_content.py > to_remove.txt if [ -s to_remove.txt ]; then git rm --pathspec-from-file=to_remove.txt; fi + - name: Clean up artifacts directory + run: rm -rf artifacts/ + - name: Commit post-processed files run: | git config --local user.email "action@github.com" @@ -292,6 +258,8 @@ jobs: git add '**/*.ipynb' git add '**/static/*.png' git add '**/solutions/*.py' + git add '**/student/*.ipynb' + git add '**/instructor/*.ipynb' git add '**/README.md' git diff-index --quiet HEAD || git commit -m "Process tutorial notebooks" diff --git a/requirements.txt b/requirements.txt index c4704a8b9..f984a7ab0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,4 @@ git+https://github.com/neuromatch/GNS-Modeling#egg=gns git+https://github.com/neuromatch/pyBPL#egg=pybpl git+https://github.com/neuromatch/MotorNet#egg=motornet git+https://github.com/ctn-waterloo/sspspace@neuromatch#egg=sspspace -git+https://github.com/mitchellostrow/DSA#egg=DSA \ No newline at end of file +git+https://github.com/mitchellostrow/DSA#egg=dsa-metric diff --git a/tutorials/W1D1_Generalization/W1D1_Tutorial1.ipynb b/tutorials/W1D1_Generalization/W1D1_Tutorial1.ipynb index 19bf45cc1..a98a86bfd 100644 --- a/tutorials/W1D1_Generalization/W1D1_Tutorial1.ipynb +++ b/tutorials/W1D1_Generalization/W1D1_Tutorial1.ipynb @@ -1563,7 +1563,7 @@ "# Section 3: Dissecting TrOCR\n", "\n", "TrOCR is a model that performs printed optical character recognition and handwriting transcription using the transformer model. But what's inside of it?\n", - "It's important to note here that the original transformer model consisted of an encoder step, following by a decoder step. Taken together, this was the initial Transformer model of Vaswani et al. 
However, subsequent research into transformers led researchers to find applications of the encoding step specifically (encoding models like BERT) and also specific applications of the decoder step (autoregressive models like GPT). This meant that the terminology then changed to be *encoder transformers* and *decoder/causal/autoregressive transformers*. TrOCR is an example of the original transformer setup (both an encoder step and decoder step joined together). The image below outlines this setup. This also matches the transformer architecture given in the video above." + "It's important to note here that the original transformer model consisted of an encoder step, followed by a decoder step. Taken together, this was the initial Transformer model of Vaswani et al. However, subsequent research into transformers led researchers to find applications of the encoding step specifically (encoding models like BERT) and also specific applications of the decoder step (autoregressive models like GPT). This meant that the terminology then changed to be *encoder transformers* and *decoder/causal/autoregressive transformers*. TrOCR is an example of the original transformer setup (both an encoder step and decoder step joined together). The image below outlines this setup. This also matches the transformer architecture given in the video above." 
] }, { diff --git a/tutorials/W1D1_Generalization/W1D1_Tutorial3.ipynb b/tutorials/W1D1_Generalization/W1D1_Tutorial3.ipynb index 9847fdf6f..f0c6984df 100644 --- a/tutorials/W1D1_Generalization/W1D1_Tutorial3.ipynb +++ b/tutorials/W1D1_Generalization/W1D1_Tutorial3.ipynb @@ -187,7 +187,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config InlineBackend.figure_format = 'retina' # perform high-definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] }, @@ -620,7 +620,7 @@ "\n", "Let's put ourselves in the mindset of a cognitive scientist studying handwriting. We're interested in how people learn to recognize new characters. Indeed, humans display low **sample complexity** when learning new visual concepts: they seem to grasp new concepts with very few presentations, generalizing effortlessly. In AI, learning from $k$ labeled examples is known as $k$-shot learning; one-shot and few-shot learning refer to learning from one or a few labeled examples.\n", "\n", - "A good dataset to investigate one-shot learning is the Omniglot dataset. Omniglot has sometimes been described as *MNIST, transposed*. Instead of **thousands** of examples from **10** digit classes, Omniglot consists of **20** instances from **1623** character classes. These character classes are sourced from 50 alphabets, both natural (e.g. Cherokee or Greek) and constructed (e.g. the alien alphabet from the TV show Futurama). \n", + "A good dataset to investigate one-shot learning is the Omniglot dataset. Omniglot has sometimes been described as *MNIST, transposed*. 
Instead of **thousands** of examples from **10** digit classes (many examples, few classes), Omniglot consists of **20** instances from **1623** character classes (few examples, many classes). These character classes are sourced from 50 alphabets, both natural (e.g. Cherokee or Greek) and constructed (e.g. the alien alphabet from the TV show Futurama). \n", "\n", "![Sample characters from the Omniglot dataset](https://github.com/brendenlake/omniglot/raw/master/omniglot_grid.jpg)\n", "\n", @@ -992,7 +992,7 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -1006,7 +1006,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.22" + "version": "3.13.7" } }, "nbformat": 4, diff --git a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial1.ipynb b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial1.ipynb index a5c1ed927..70c392fa1 100644 --- a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial1.ipynb +++ b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial1.ipynb @@ -39,7 +39,7 @@ "\n", "# Tutorial Objectives\n", "\n", - "*Estimated timing of tutorial: 90 minutes*\n", + "*Estimated time of tutorial: 90 minutes*\n", "\n", "In this tutorial, we'll explore how task specification affects generalization in networks. We will use the same base architecture (a convolutional neural network / CNN) to perform multiple different tasks. We will explore the number of training points and number of epochs needed to train these networks up to a specific accuracy value. 
Additionally, we will explore how well representations learned for a given task generalize, and whether these representations can be used to solve the other tasks.\n", "\n", diff --git a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial2.ipynb b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial2.ipynb index ad7a68180..500b4da85 100644 --- a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial2.ipynb +++ b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial2.ipynb @@ -796,7 +796,7 @@ "\n", "Here $\\tau$ is a temperature parameter that controls the sharpness of the distribution. You can think of it as a cross-entropy loss with a single pseudo-class corresponding to similar labels and the negative pairs corresponding to different labels. \n", "\n", - "### Decoupled constrastive learning\n", + "### Decoupled contrastive learning\n", "\n", "InfoNCE typically requires substantial batch sizes—commonly 128 or larger—to perform optimally. The need for large batch sizes stems from the necessity for diverse negative samples in the batch to effectively learn the contrasts. However, large batch sizes can be impractical in resource-constrained settings or when data availability is limited.\n", "\n", @@ -1658,7 +1658,7 @@ "\n", "Through practical exercises with the MNIST dataset, we've seen how contrastive learning can be implemented. The session highlighted the intuitive appeal of contrastive learning: learning by comparison, which is a natural way for both humans and machines to understand the world.\n", "\n", - "Let's bring this back to the overall goal of the overall theme of the course, **generalization**. So far, we've looked at how tasks are defined by cost functions, specifications of different losses. Some tasks require models to learn representatinos that are not task-specific but can be very task general in large parts, and specialised (task-specific) in other parts. For example, most of a CNN architecture might specialise in learning features present in the real world: shapes, colors, lines. 
Contrastive learning is a way we can efficently make use of the large amounts of unlabeled data in the world. It's also a task that, over large datasets, results in models learning very rich, general representations. \n", + "Let's bring this back to the overall goal of the overall theme of the course, **generalization**. So far, we've looked at how tasks are defined by cost functions, specifications of different losses. Some tasks require models to learn representations that are not task-specific but can be very task general in large parts, and specialised (task-specific) in other parts. For example, most of a CNN architecture might specialise in learning features present in the real world: shapes, colors, lines. Contrastive learning is a way we can efficiently make use of the large amounts of unlabeled data in the world. It's also a task that, over large datasets, results in models learning very rich, general representations. \n", "\n", "Take a moment to think about all the ways that learning via contrastive learning might give a network (human or artificial) a rich set of representations. We'll soon get to studying how to measure representations and across multiple systems and their geometry. But first, let's address another method to learn tasks that's a little bit different to what we've seen before: Reinforcement Learning." ] diff --git a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial3.ipynb b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial3.ipynb index 98511191d..991ce8d1e 100644 --- a/tutorials/W1D2_ComparingTasks/W1D2_Tutorial3.ipynb +++ b/tutorials/W1D2_ComparingTasks/W1D2_Tutorial3.ipynb @@ -515,7 +515,7 @@ "\n", "$$R(T) = \\sum_{t = 1}^T (p^* - \\mathbb{E}(p_{a_t})),$$\n", "\n", - "where $p^*$ is the probability of the reward for the best arm, i.e., max($p_L$, $p_r$). $\\mathbb{E}(p_{a_t})$ corresponds to the expected probability of reward for the action that was chosen at the previous time $t$." 
+ "where $p^*$ is the probability of the reward for the best arm, i.e., max($p_L$, $p_R$). $\mathbb{E}(p_{a_t})$ corresponds to the expected probability of reward for the action that was chosen at the previous time $t$." ] }, { diff --git a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial1.ipynb b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial1.ipynb index 10d865eed..51a87fa5d 100644 --- a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial1.ipynb +++ b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial1.ipynb @@ -1120,7 +1120,7 @@ }, "outputs": [], "source": [ - "# @title Train adversirally robust model\n", + "# @title Train adversarially robust model\n", "\n", "# model_robust = Net().to(args.device)\n", "# optimizer = optim.Adadelta(model_robust.parameters(), lr=args.lr)\n", @@ -1532,7 +1532,7 @@ "\n", "The color-coded matrix shows the dot product similarity between activations for the training images (rows) and test images (columns)--yellow means that the two images are highly similar based on the dot product of the activations for those images, and dark blue means the two images are highly dissimilar. The goal of this exercise is to explore how the predicted leaning direction of the test stimuli is determined by your leaning ratings of the training stimuli, and the similarity of the training and test stimuli.\n", "\n", - "1) Using the matrix, find training and test images that are highly similar and play around with the rating of the training image. How much does the predicted learning rating for the test image change? Try this for a few different pairs.\n", + "1) Using the matrix, find training and test images that are highly similar and play around with the rating of the training image. How much does the predicted leaning rating for the test image change? Try this for a few different pairs.\n", "\n", "2) Now find a highly dissimilar pair and play with the rating. 
How much does the predicted leaning of the test image change? Try this for a few different pairs.\n", "\n", diff --git a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial2.ipynb b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial2.ipynb index d788fd7be..fbd860a10 100644 --- a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial2.ipynb +++ b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial2.ipynb @@ -1698,7 +1698,7 @@ "- Characterized the computation that happens across different layers of a network as a path, with each step changing the geometry of the representation to go from input pixels to target labels\n", "- Examined the representational geometry paths for different model architectures and different inputs and learned how to interpret them\n", "\n", - "We used this method to examine how models trained on adversarial stimulu (vs control) differentially treat inputs that are both normal and adversarial. We saw that the category / class level similarity structure, which was different for the standard model on adversarial stimuli, resulting in lower accuracies, actually has a divergent path during the conversion from input data to output labels. This is another link into the idea of **similarity** as a lens that helps us understand **generalization**." + "We used this method to examine how models trained on adversarial stimuli (vs control) differentially treat inputs that are both normal and adversarial. We saw that the category / class level similarity structure, which was different for the standard model on adversarial stimuli, resulting in lower accuracies, actually has a divergent path during the conversion from input data to output labels. This is another link into the idea of **similarity** as a lens that helps us understand **generalization**." 
] } ], diff --git a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial3.ipynb b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial3.ipynb index d91cbb8f0..d991cd1fc 100644 --- a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial3.ipynb +++ b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial3.ipynb @@ -1270,7 +1270,7 @@ "execution": {} }, "source": [ - "## Loading fMRI patterns from the NSD datset" + "## Loading fMRI patterns from the Natural Scenes Dataset" ] }, { @@ -1279,7 +1279,7 @@ "execution": {} }, "source": [ - "Let's now load the fMRI patterns from the NSD dataset for these 90 images.\n", + "Let's now load the fMRI patterns from NSD for these 90 images.\n", "We have pre-extracted the patterns, so we just need to load Numpy arrays from the `.npy` files." ] }, @@ -2369,7 +2369,7 @@ "source": [ "# The Big Picture\n", "\n", - "Generalization can arise across multiple dimensions, whether that be generalization of an experiment to a new set of subjects or whether a new set of stimuli generalize to the same subjects. More likely, we would want to know how much an experiment would generalize to novel subjects across novel stimuli. We can test these ideas statistically by using the bootstreap method in statistics. This notebook highlights some issues with naive approaches to statistics when assessing generalization in this way. We explored the 2-factor bootstrap method and used a toolbox that explicitly takes care of the calculation so that we don't overestimate the variance involved. It's important to be aware of factors of generalization and how multiple overlapping factors might interact." + "Generalization can arise across multiple dimensions, whether that be generalization of an experiment to a new set of subjects or whether a new set of stimuli generalize to the same subjects. 
More likely, we would want to know how much an experiment would generalize to novel subjects across novel stimuli. We can test these ideas statistically by using the bootstrap method in statistics. This notebook highlights some issues with naive approaches to statistics when assessing generalization in this way. We explored the 2-factor bootstrap method and used a toolbox that explicitly takes care of the calculation so that we don't overestimate the variance involved. It's important to be aware of factors of generalization and how multiple overlapping factors might interact." ] } ], diff --git a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial4.ipynb b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial4.ipynb index 064782b50..8ab31206c 100644 --- a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial4.ipynb +++ b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial4.ipynb @@ -1296,7 +1296,7 @@ "source": [ "To obtain an unbiased estimate, we can split the data into independent sets and cross-validate the difference between patterns across the two sets (Allefeld and Haynes, 2014; Nili et al. 
2014).\n", "\n", - "The cross-validated squared Euclidean distance–the so-called *crossclidian*–between two activity patterns $\\mathbf{b_i}$ and $\\mathbf{b_j}$ can be computed as: \n", + "The cross-validated squared Euclidean distance–the so-called *crossclidean*–between two activity patterns $\\mathbf{b_i}$ and $\\mathbf{b_j}$ can be computed as: \n", "\n", "$$d^2_{\\text{Euclidean, cross-validated}}=(\\mathbf{b_i} - \\mathbf{b_j})_\\text{train}(\\mathbf{b_i} - \\mathbf{b_j})_\\text{test}^T$$\n", "\n", diff --git a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial5.ipynb b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial5.ipynb index cf185d05d..aa4f573de 100644 --- a/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial5.ipynb +++ b/tutorials/W1D3_ComparingArtificialAndBiologicalNetworks/W1D3_Tutorial5.ipynb @@ -2185,9 +2185,9 @@ "\n", "# Intro\n", "\n", - "Welcome to Tutorial 5 of Day 3 (W1D3) of the NeuroAI course. In this tutorial we are going to look at an exciting method that measures similarity from a slightly different perspective, a temporal one. The prior methods we have looked at were centeed around geometry and spatial representations, where we looked at metrics such as the Euclidean and Mahalanobis distance metrics. However, one thing we often want to study in neuroscience and in AI separately - is the temporal domain. Even more so in our own field of NeuroAI, we often deal with time series of neuronal / biological recordings. One thing you should already have a broad level of awareness of is that end structures can end up looking the same even though the paths taken to arrive at those end structures were very different.\n", + "Welcome to Tutorial 5 of Day 3 (W1D3) of the NeuroAI course. The prior methods we have looked at so far were centered around geometry and spatial representations, where we looked at metrics such as the Euclidean and Mahalanobis distance metrics. 
In this tutorial we are going to look at an exciting method that measures similarity from a slightly different perspective - a temporal one. This is very important in our field of NeuroAI as we often deal with time series of neuronal / biological recordings. One thing you should already be aware of is the fact that geometric plots of temporal data can end up looking the same even though the paths taken to arrive at those end structures are very different.\n", "\n", - "In NeuroAI, we're often confronted with systems that seem to have some sort of overlap and we want to study whether this implies there is a shared computation pairs up with the shared task (we looked at this in detail yesterday in our *Comparing Tasks* day). Today, we will begin by watching a short intro video by Mitchell Ostrow, who will describe his method to compare representations over temporal sequences (the method is called Dynamic Similarity Analysis). Then we are going to introduce three simple dynamical systems and we will explore them from the perspective of Dynamic Similarity Analysis and also describe the conceptual relationship to Representational Similarity Analysis. You will have a short coding exercise on the topic of temporal similarity analysis on three different types of trajectories. \n", + "In NeuroAI, we're often confronted with systems that seem to have some sort of overlap and we want to study whether this implies there is a shared computation that pairs up with the shared task (we looked at this in detail yesterday in the day on *Comparing Tasks*). Today, we will watch a short intro video by Mitchell Ostrow, who will describe a method he devised to compare representations of temporal sequences (the method is called *Dynamic Similarity Analysis*). Then we are going to introduce three simple dynamical systems and we will explore them from the perspective of Dynamic Similarity Analysis and also describe the conceptual relationship to Representational Similarity Analysis. 
You will have a short coding exercise on the topic of temporal similarity analysis on three different types of trajectories. \n", "\n", "At the end of the tutorial, we will finally look at a further aspect of temporal sequences using RNNs. This is an adaptation of the ideas introduced in Tutorial 2 but now based around recurrent representations from RNNs. We hope you enjoy this tutorial today and that it gets you thinking not just what similarity values mean, but which ones are appropriate (here, from a spatial or temporal perspective). We aim to continually expand the tools necessary in the NeuroAI researcher's toolkit. Complementary tools, when applicable, can often tell a far richer story than just using a single method." ] @@ -2322,6 +2322,31 @@ "trajectory_walk = trajectory" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d4091b9", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the trajectory\n", + "fig, axes = plt.subplots(1,3, figsize=(10, 4))\n", + "\n", + "axes[0].plot(trajectory_circle[:, 0], trajectory_circle[:, 1])\n", + "axes[0].set_title(\"Circle\")\n", + "axes[0].set_xlabel(\"X\")\n", + "axes[0].set_ylabel(\"Y\")\n", + "axes[1].plot(trajectory_oval[:, 0], trajectory_oval[:, 1])\n", + "axes[1].set_title(\"Oval\")\n", + "axes[1].set_xlabel(\"X\")\n", + "axes[1].set_ylabel(\"Y\")\n", + "axes[2].plot(trajectory_walk[:, 0], trajectory_walk[:, 1])\n", + "axes[2].set_title(\"2D Random Walk\")\n", + "axes[2].set_xlabel(\"X\")\n", + "axes[2].set_ylabel(\"Y\")\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "id": "113a0dee", @@ -2329,9 +2354,9 @@ "execution": {} }, "source": [ - "Can you see how the spatial / geometric similarity of `R-Walk` and `Oval` are more similar, but the oscillations during the temporal sequence are shared between `Circle` and `Oval`? 
Let's run Dynamic Similarity Analysis on these temporal sequences and see what scores are returned.\n", + "Can you see how the spatial / geometric similarity of `Random Walk` and `Oval` are quite similar? Can you also observe that the `Circle` and `Oval` trajectories contain similar oscillatory patterns? What we mean by similarity can be viewed through different lenses: a spatial/geometric shape-based one, and another temporal one that considers how the sequences **pattern together**. Let's run Dynamic Similarity Analysis on these temporal sequences and explore the similarity values to further get a grasp on this difference.\n", "\n", - "We calcularted `trajectory_oval` and `trajectory_circle` above, so let's plug these into the `DSA` function imported earlier (in the helper function cell) and see what the similarity score is." + "We calculated `trajectory_oval` and `trajectory_circle` above, so let's plug these into the `DSA` function imported earlier (in the helper function cell) and see what the similarity score is." ] }, { diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb index ac7e0ff08..84746351a 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial1.ipynb @@ -3228,7 +3228,7 @@ "source": [ "# The Big Picture\n", "\n", - "The main message we would like you to take away from this tutorial is that sparsity has numerous advantageous properties for representing computation in the brain and in AI models. We know the brain likely uses this principle extensively. 
The mechanisms to measure and induce sparsity invcomputational models of neuroscience and AI are important and we hope you have gained an idea of how you might think about applications of sparsity in the future.\n", + "The main message we would like you to take away from this tutorial is that sparsity has numerous advantageous properties for representing computation in the brain and in AI models. We know the brain likely uses this principle extensively. The mechanisms to measure and induce sparsity in computational models of neuroscience and AI are important and we hope you have gained an idea of how you might think about applications of sparsity in the future.\n", "\n", "In the next tutorial, we will look at another essential operation to be realized in brains & machines - normalization. If you have time at the end of this day's tutorials, we have also included some further bonus material that covers the interesting application of spatial sparsity. If you're running low on time, please concentrate on the other tutorials and come back to the bonus material at a more convenient time." 
] diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb index c925796b0..195569c39 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial2.ipynb @@ -2028,7 +2028,7 @@ "\n", "#################################################\n", "## TODO: Implement the normalization example equation ##\n", - "# Fill remove the following line of code one you have completed the exercise:\n", + "# Fill out and remove the following line of code once you have completed the exercise:\n", "raise NotImplementedError(\"Student exercise: choose your parameters values.\")\n", "#################################################\n", "\n", diff --git a/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb b/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb index 7cdd041c6..5ac3b86cf 100644 --- a/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb +++ b/tutorials/W1D5_Microcircuits/W1D5_Tutorial3.ipynb @@ -42,7 +42,7 @@ "\n", "By the end of this tutorial, we aim to:\n", "\n", - "1. Learn how the brain and AI systems implemention attention\n", + "1. Learn how the brain and AI systems implement attention\n", "\n", "2. 
Understand how multiplicative interactions allow flexible gating of information\n", "\n", @@ -212,7 +212,7 @@ " plt.subplot(1, 2, 1)\n", " plt.plot(t_loss, label=\"Training loss\", color=\"red\")\n", " if v_loss is not None:\n", - " # plt.plot(v_loss, label=\"Valididation loss\", color=\"blue\")\n", + " # plt.plot(v_loss, label=\"Validation loss\", color=\"blue\")\n", " plt.scatter(len(t_loss)-1, v_loss, label=\"Validation loss\", color=\"blue\", marker=\"*\")\n", " # plt.text(len(t_loss)-1, v_loss, f\"{v_loss:.3f}\", va=\"bottom\", ha=\"right\")\n", " plt.yscale(\"log\")\n", @@ -2076,7 +2076,7 @@ "B_t_mlp = 1000 # batch size for training (number of training samples)\n", "n_epochs = 100 # number of epochs\n", "s_sparse = sparse_dense[0] # dense\n", - "hidden_layers = [512] # the number of hidden units in }each layer [H1, H2, ...]\n", + "hidden_layers = [512] # the number of hidden units in each layer [H1, H2, ...]\n", "kind = \"MLP\"\n", "\n", "mlp_model = BinaryMLP(context_length, hidden_layers, 1) # MLP model\n", @@ -2150,7 +2150,7 @@ "embed_dim = 2 # embedding dimension\n", "kind = \"SAT\"\n", "\n", - "sat_model_s = SelfAttention(context_length, embed_dim) # selt-attention transformer\n", + "sat_model_s = SelfAttention(context_length, embed_dim) # self-attention transformer\n", "data_gen = s_Sparse_AND(context_length, s_sparse)\n", "results_sat_s = make_train(sat_model_s, data_gen, B_t_sat, B_valid, n_epochs, DEVICE, kind, verbose=True, etta=1e-2)" ] @@ -2180,7 +2180,7 @@ "embed_dim = 2 # embedding dimension\n", "kind = \"SAT\"\n", "\n", - "sat_model_d = SelfAttention(context_length, embed_dim) # selt-attention transformer\n", + "sat_model_d = SelfAttention(context_length, embed_dim) # self-attention transformer\n", "data_gen = s_Sparse_AND(context_length, s_sparse)\n", "results_sat_d = make_train(sat_model_d, data_gen, B_t_sat, B_valid, n_epochs, DEVICE, kind, verbose=True, etta=1e-2)" ] diff --git a/tutorials/W2D1_Macrocircuits/W2D1_Tutorial1.ipynb 
b/tutorials/W2D1_Macrocircuits/W2D1_Tutorial1.ipynb index 05d3b1286..af39907cc 100644 --- a/tutorials/W2D1_Macrocircuits/W2D1_Tutorial1.ipynb +++ b/tutorials/W2D1_Macrocircuits/W2D1_Tutorial1.ipynb @@ -154,7 +154,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config InlineBackend.figure_format = 'retina' # perform high definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] }, @@ -203,7 +203,7 @@ " Inputs:\n", " - Es_train (np.ndarray): loss values.\n", " - X_test (np.ndarray): test input data.\n", - " - y_test (np.ndarray): test outpu data.\n", + " - y_test (np.ndarray): test output data.\n", " \"\"\"\n", " with plt.xkcd():\n", " fig, axes = plt.subplots(1,2,figsize=(10,5))\n", @@ -363,7 +363,7 @@ "\n", "# Section 1: Introduction\n", "\n", - "In this section we will write some Python functions to help build some neural networks that will allow us to effectively examine the expressiity of shallow versus deep networks. We will specifically look at this issue through the lens of the universal approximation theorem and ask ourselves what deeper neural networks give us in terms of the ability of those models to capture a wide range of functions. As you will recall from today's introduction video, the idea of each layer being able to fold activations via an activation function increases the ability to model nonlinear functions much more effectively. After going through this tutorial, this idea will hopefully be much clearer.\n", + "In this section we will write some Python functions to help build some neural networks that will allow us to effectively examine the expressivity of shallow versus deep networks. 
We will specifically look at this issue through the lens of the universal approximation theorem and ask ourselves what deeper neural networks give us in terms of the ability of those models to capture a wide range of functions. As you will recall from today's introduction video, the idea of each layer being able to fold activations via an activation function increases the ability to model nonlinear functions much more effectively. After going through this tutorial, this idea will hopefully be much clearer.\n", "\n", "By **shallow network**, we mean one with a very small number of layers (e.g. one). A shallow networks can be **wide** if it has many, many neurons in this layer, or it can be smaller, having only a limited number of neurons. In contrast, by **deep networks**, we refer to the number of layers in the network. It's important to keep in mind that the term **wide** in the terminology we will use specifically refers to *the number of neurons in a layer, not the number of layers in a network*. If we take a single layer in a shallow or a deep network, we can describe it as being **wide** if it has a very large number of neurons. 
" ] @@ -548,7 +548,7 @@ "\n", "def make_MLP(n_in, W, D, nonlin = 'tanh'):\n", " \"\"\"\n", - " Create `nn.Sequnetial()` fully-connected model in pytorch with the given parameters.\n", + " Create `nn.Sequential()` fully-connected model in pytorch with the given parameters.\n", "\n", " Inputs:\n", " - n_in (int): input dimension.\n", @@ -609,7 +609,7 @@ " Inputs:\n", " - n_in (int): input dimension.\n", " - W (int): width of the network.\n", - " - D (int): depth if the network.\n", + " - D (int): depth of the network.\n", "\n", " Outputs:\n", " - num_params (int): number of parameters in the network.\n", @@ -644,7 +644,7 @@ " Inputs:\n", " - n_in (int): input dimension.\n", " - W (int): width of the network.\n", - " - D (int): depth if the network.\n", + " - D (int): depth of the network.\n", "\n", " Outputs:\n", " - num_params (int): number of parameters in the network.\n", diff --git a/tutorials/W2D3_Microlearning/W2D3_Tutorial1.ipynb b/tutorials/W2D3_Microlearning/W2D3_Tutorial1.ipynb index c23d1e5bc..ef850bd98 100644 --- a/tutorials/W2D3_Microlearning/W2D3_Tutorial1.ipynb +++ b/tutorials/W2D3_Microlearning/W2D3_Tutorial1.ipynb @@ -631,7 +631,7 @@ "---\n", "# Section 1: Weight Perturbation\n", "\n", - "In this section, we will start exploring more bioligcally plausible learning algorithms that are known to exhibit increased variance, specifically the *weight perturbation* algorithm." + "In this section, we will start exploring more biologically plausible learning algorithms that are known to exhibit increased variance, specifically the *weight perturbation* algorithm." ] }, { @@ -1796,7 +1796,7 @@ "\n", "*Estimated timing to here from start of tutorial: 1 hour 20 minutes*\n", "\n", - "This section presents the last method for this day, which is a method that leans towards exhibiting more biased solutions than ones that exhibit higher-variance solutions. Specifically, the metho we are going to look at today is known as the Kolen-Pollack method. 
While in the previous section we looked at Feedback Alignment, in that case, we hinted at the fact that this works well for simple tasks. However, feedback alignment, as will be shown below, does not do very well in tasks of the level of complexity we are typically interested in. The Kolen-Pollack method attempts to fix some of the problems of Feedback Alignment in order to be better at more complex and interesting tasks." + "This section presents the last method for this day, which is a method that leans towards exhibiting more biased solutions than ones that exhibit higher-variance solutions. Specifically, the method we are going to look at today is known as the Kolen-Pollack method. While in the previous section we looked at Feedback Alignment, in that case, we hinted at the fact that this works well for simple tasks. However, feedback alignment, as will be shown below, does not do very well in tasks of the level of complexity we are typically interested in. The Kolen-Pollack method attempts to fix some of the problems of Feedback Alignment in order to be better at more complex and interesting tasks." ] }, { @@ -2213,7 +2213,7 @@ "\n", "While the summary above recaps the main takeaway points of today's tutorial, let's also stop and think a bit bigger. Let's think back to the opening section of today's tutorial about biological plausibility and what it means for both the future of neuroscience and AI. As we have seen today, learning in brains is restricted by the directional nature of information transfer and biologically plausible learning algorithms are those algorithms that better mirror these properties. This is where we run into a dilemma: on the one hand in standard AI training, backpropagation is so successful because we get exactly the correct set of error signals to make updates to our weights. This has worked well in AI, but this method in the current set up might lead to a wall that we cannot break and extend into significant further advances in AI. 
This is also an issue working in neuroscience, where AI models are often used as *in silico* representations to model biological processes or as candidate representational spaces to model different stimuli. \n", "\n", - "The main idea we want you to take away from today is to be aware of alternate approaches that better mirror computational constraints from a system (the brain) that we know in many ways is better than frontier / state of the art AI models. The exact techniques are only candidates, but there is a wide belief that the NeuroAI community might be in an excellent position to study and propose learning algortihms that are not only biologically plausible, but also show promise as future widely-adopted learning algorithms in large-scale deep learning networks.\n", + "The main idea we want you to take away from today is to be aware of alternate approaches that better mirror computational constraints from a system (the brain) that we know in many ways is better than frontier / state of the art AI models. 
The exact techniques are only candidates, but there is a wide belief that the NeuroAI community might be in an excellent position to study and propose learning algorithms that are not only biologically plausible, but also show promise as future widely-adopted learning algorithms in large-scale deep learning networks.\n", "\n", "We hope this idea sticks around in your mind and that you have found today's tutorial insightful.\n", "\n", diff --git a/tutorials/W2D4_Macrolearning/W2D4_Tutorial1.ipynb b/tutorials/W2D4_Macrolearning/W2D4_Tutorial1.ipynb index c38f39d37..9859acb2a 100644 --- a/tutorials/W2D4_Macrolearning/W2D4_Tutorial1.ipynb +++ b/tutorials/W2D4_Macrolearning/W2D4_Tutorial1.ipynb @@ -481,7 +481,7 @@ "\n", "###################################################################\n", "## Fill out the following then remove\n", - "raise NotImplementedError(\"Student exercise: need to normalized days and to fit model with it\")\n", + "raise NotImplementedError(\"Student exercise: need to normalize days and to fit model with it\")\n", "###################################################################\n", "\n", "#apply normalization for days\n", @@ -490,7 +490,7 @@ "summer_days_test_norm = (summer_days_test - ...) 
/ ...\n", "\n", "#define MLP\n", - "model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=10000, random_state = 42, solver = \"lbfgs\") # LBFGS is better to use when there is small amount of data\n", + "model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=10000, random_state = 42, solver = \"lbfgs\") # LBFGS is better to use when there is a small amount of data\n", "\n", "#train MLP\n", "model.fit(..., ...)\n", @@ -522,7 +522,7 @@ "summer_days_test_norm = (summer_days_test - summer_days_mean) / summer_days_std\n", "\n", "#define MLP\n", - "model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=10000, random_state = 42, solver = \"lbfgs\") # LBFGS is better to use when there is small amount of data\n", + "model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=10000, random_state = 42, solver = \"lbfgs\") # LBFGS is better to use when there is a small amount of data\n", "\n", "#train MLP\n", "model.fit(summer_days_train_norm, summer_prices_train)\n", @@ -950,7 +950,7 @@ "\n", "Distribution shifts are a huge issue in modern ML systems. Awareness of the fundamental idea behind how these shifts can happen is increasingly important, the more that these systems take on roles that impact systems that we interact with in our daily lives. During COVID-19, product replenishment systems failed spectacularly because there was an underlying shift (panic buying of certain items) that the model did not expect and this caused a huge problem for systems that relied on statistical predictions in the company pipeline.\n", "\n", - "In NeuroAI, the distribution shifts can happen in numerous places. For example, training a model on sets of neurons that belong to different brain areas or perhaps the same distribution of neurons that differ due to a confounding third factor, that renders the training and test distribution of features to be different. 
Awareness of potential distribution shifts is incredibly important and should be something systems are continuosly monitoring. NeuroAI currently lags behind in its adoption of evaluations that monitor these kinds of issues. Our goal is to bring this attention more to the forefront so that in your careers as NeuroAI practioners, you are aware of the necessary factors that can affect the models you build.\n", + "In NeuroAI, the distribution shifts can happen in numerous places. For example, training a model on sets of neurons that belong to different brain areas or perhaps the same distribution of neurons that differ due to a confounding third factor, that renders the training and test distribution of features to be different. Awareness of potential distribution shifts is incredibly important and should be something systems are continuously monitoring. NeuroAI currently lags behind in its adoption of evaluations that monitor these kinds of issues. Our goal is to bring this attention more to the forefront so that in your careers as NeuroAI practitioners, you are aware of the necessary factors that can affect the models you build.\n", "\n", "In the next tutorials, we are going to address the question of generalization—what are the techniques and methods to deal with poor generalization performance due to distribution shifts." ] diff --git a/tutorials/W2D4_Macrolearning/W2D4_Tutorial2.ipynb b/tutorials/W2D4_Macrolearning/W2D4_Tutorial2.ipynb index 78d590e83..14074bf52 100644 --- a/tutorials/W2D4_Macrolearning/W2D4_Tutorial2.ipynb +++ b/tutorials/W2D4_Macrolearning/W2D4_Tutorial2.ipynb @@ -43,7 +43,7 @@ "\n", "*Estimated timing of tutorial: 25 minutes*\n", "\n", - "In this tutorial, we wil discover how further training on new data or tasks causes forgetting of past tasks. This is like the idea of learning a new idea by replacing an old one. This is a huge issue with the current AI models and deep neural networks and a very active area of research in the ML community. 
We are going to explore the problem in more detail and investigate some further issues connected to this idea, for example, how different learning schedules impact performance." + "In this tutorial, we will discover how further training on new data or tasks causes forgetting of past tasks. This is like the idea of learning a new idea by replacing an old one. This is a huge issue with the current AI models and deep neural networks and a very active area of research in the ML community. We are going to explore the problem in more detail and investigate some further issues connected to this idea, for example, how different learning schedules impact performance." ] }, { @@ -194,7 +194,7 @@ " - autumn_r_squared (list): List containing the R-squared values for the autumn season at each epoch.\n", " \"\"\"\n", "\n", - " print(f\"Summmer final R-squared value is: {summer_r_squared[-1]:.02f}\")\n", + " print(f\"Summer final R-squared value is: {summer_r_squared[-1]:.02f}\")\n", " print(f\"Autumn final R-squared value is: {autumn_r_squared[-1]:.02f}\")\n", "\n", "\n", @@ -475,7 +475,7 @@ "execution": {} }, "source": [ - "Notice how disruptive the change is for R-squared values — even one iteration is enough to drastically alter the performance. The model has learned to perform perfectly on the autumn data, while it completely messes up predictions for the summer days. Indeed, the model forgot the relationships for the old data and lost its predictive power while training on the new dataset. In the next section of the tutorial, we are going to explore a different approach—what if, instead of training sequentially, we train the model on both datasets together?" + "Notice how disruptive the change is for R-squared values — even one iteration is enough to drastically alter the performance. The model has learned to perform perfectly on the autumn data, while it completely disrupts predictions for the summer days. 
Indeed, the model forgot the relationships for the old data and lost its predictive power while training on the new dataset. In the next section of the tutorial, we are going to explore a different approach—what if, instead of training sequentially, we train the model on both datasets together?" ] }, { @@ -874,7 +874,7 @@ "source": [ "# The Big Picture\n", "\n", - "What causes catastrophic forgetting? If we train tasks continually (one after the other) and not in a joint fashion, it implies the overwriting of old knowledge (information) with new knowledge (information). That doesn't seem to happen in biological systems, so is there a lesson we can take from psychology and neuroscience to better handle catastrophic forgetting? Information seems to be distributed across all weights for this ovwerwriting to happen. Large LLM models have developed one solution, the so-called Mixture-of-Experts Model. This is where a routing mechanism decides what sections of a neural network become active for each tasks. Is that a viable solution? These models are gigantic and extremely computationally expensive. We think there is scope to be more brain-like without requiring such vast computational giants.\n", + "What causes catastrophic forgetting? If we train tasks continually (one after the other) and not in a joint fashion, it implies the overwriting of old knowledge (information) with new knowledge (information). That doesn't seem to happen in biological systems, so is there a lesson we can take from psychology and neuroscience to better handle catastrophic forgetting? Information seems to be distributed across all weights for this overwriting to happen. Large LLM models have developed one solution, the so-called Mixture-of-Experts Model. This is where a routing mechanism decides what sections of a neural network become active for each task. Is that a viable solution? These models are gigantic and extremely computationally expensive. 
We think there is scope to be more brain-like without requiring such vast computational giants.\n", "\n", "If you're interested in learning more, the topics we covered today are also often referred to as the **Stability-Plasticity Dilemma**. A search for that term will certainly bring you to many recent advances in exploring this idea. \n", "\n", diff --git a/tutorials/W2D4_Macrolearning/W2D4_Tutorial3.ipynb b/tutorials/W2D4_Macrolearning/W2D4_Tutorial3.ipynb index 490c9aaa0..19f09e21f 100644 --- a/tutorials/W2D4_Macrolearning/W2D4_Tutorial3.ipynb +++ b/tutorials/W2D4_Macrolearning/W2D4_Tutorial3.ipynb @@ -160,7 +160,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config InlineBackend.figure_format = 'retina' # perform high definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] }, @@ -444,7 +444,7 @@ "\n", " def __len__(self):\n", " \"\"\"Calculate the length of the dataset. It is obligatory for PyTorch to know in advance how many samples to expect (before training),\n", - " thus we enforced to icnlude number of epochs and tasks per epoch in `FruitSupplyDataset` parameters.\"\"\"\n", + " thus we enforced to include number of epochs and tasks per epoch in `FruitSupplyDataset` parameters.\"\"\"\n", "\n", " return self.num_epochs * self.num_tasks\n", "\n", @@ -758,7 +758,7 @@ "\n", " def __len__(self):\n", " \"\"\"Calculate the length of the dataset. 
It is obligatory for PyTorch to know in advance how many samples to expect (before training),\n", - " thus we enforced to icnlude number of epochs and tasks per epoch in `FruitSupplyDataset` parameters.\"\"\"\n", + " thus we enforced to include number of epochs and tasks per epoch in `FruitSupplyDataset` parameters.\"\"\"\n", "\n", " return self.num_epochs * self.num_tasks\n", "\n", diff --git a/tutorials/W2D4_Macrolearning/W2D4_Tutorial4.ipynb b/tutorials/W2D4_Macrolearning/W2D4_Tutorial4.ipynb index c36c131f1..829a3a559 100644 --- a/tutorials/W2D4_Macrolearning/W2D4_Tutorial4.ipynb +++ b/tutorials/W2D4_Macrolearning/W2D4_Tutorial4.ipynb @@ -159,7 +159,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config InlineBackend.figure_format = 'retina' # perform high definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] }, @@ -724,7 +724,7 @@ " return self.state\n", "\n", " def step(self, action):\n", - " \"\"\"Evaluate agent's perfromance, return reward and next observation.\"\"\"\n", + " \"\"\"Evaluate agent's performance, return reward and next observation.\"\"\"\n", " if self.state[action] == self.rewarded_digit:\n", " feedback = self.reward\n", " else:\n", @@ -1170,7 +1170,7 @@ " - num_tasks (int, default = 10): number of tasks to evaluate agent on.\n", " - num_gradient_steps (int, default = 25): number of gradient steps to perform.\n", " - num_trials (int, default = 6): number of times the agent is exposed to the environment per gradient step to be trained .\n", - " - num_evaluation_trials (int, default = 20): number of times the agent is exposed to the environment to evaluate it (no training happend during this phase).\n", + " - num_evaluation_trials (int, default = 20): number of times the agent is 
exposed to the environment to evaluate it (no training happened during this phase).\n", "\n", " Outputs:\n", " - score (int): total score.\n", diff --git a/tutorials/W2D4_Macrolearning/W2D4_Tutorial5.ipynb b/tutorials/W2D4_Macrolearning/W2D4_Tutorial5.ipynb index 2d927f020..6f0578be4 100644 --- a/tutorials/W2D4_Macrolearning/W2D4_Tutorial5.ipynb +++ b/tutorials/W2D4_Macrolearning/W2D4_Tutorial5.ipynb @@ -161,7 +161,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config InlineBackend.figure_format = 'retina' # perform high definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] }, @@ -662,7 +662,7 @@ " return self.state\n", "\n", " def step(self, action):\n", - " \"\"\"Evaluate agent's perfromance, return reward, max reward (for tracking agent's performance) and next observation.\"\"\"\n", + " \"\"\"Evaluate agent's performance, return reward, max reward (for tracking agent's performance) and next observation.\"\"\"\n", " feedback = color_names_rewards[self.color_state[action]]\n", " max_feedback = np.max([color_names_rewards[self.color_state[action]], color_names_rewards[self.color_state[1 - action]]])\n", " self.update_state()\n", @@ -862,7 +862,7 @@ " log_probs.append(log_prob)\n", " entropy_term += entropy\n", "\n", - " #calculataing loss\n", + " #calculating loss\n", " Qval = 0\n", " Qvals = torch.zeros(len(rewards))\n", " for t in reversed(range(len(rewards))):\n", @@ -890,7 +890,7 @@ " - env (ChangingEnv): environment.\n", " - agent (ActorCritic): particular instance of Actor Critic agent to train.\n", " - mode (int, default = 1): mode of the environment.\n", - " - num_evaluation_trials (int, default = 20): number of times the agent is exposed to the environment to evaluate it (no training happend 
during this phase).\n", + " - num_evaluation_trials (int, default = 20): number of times the agent is exposed to the environment to evaluate it (no training happened during this phase).\n", "\n", " Outputs:\n", " - scores (list): rewards over all trials of evaluation.\n", @@ -1185,7 +1185,7 @@ "outputs": [], "source": [ "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_a2c_agent_in_changing_enviornment\")" + "content_review(f\"{feedback_prefix}_a2c_agent_in_changing_environment\")" ] }, { diff --git a/tutorials/W2D5_Mysteries/W2D5_Intro.ipynb b/tutorials/W2D5_Mysteries/W2D5_Intro.ipynb index e7bea670c..263e39f50 100644 --- a/tutorials/W2D5_Mysteries/W2D5_Intro.ipynb +++ b/tutorials/W2D5_Mysteries/W2D5_Intro.ipynb @@ -28,7 +28,7 @@ "\n", "**By Neuromatch Academy**\n", "\n", - "__Content creators:__ Megan Peters, Joseph LeDoux, Matthias Michel, Daniel Dennett" + "__Content creators:__ Megan Peters, Joseph LeDoux, Matthias Michel, Claire Sergent, Daniel Dennett" ] }, { diff --git a/tutorials/W2D5_Mysteries/W2D5_Tutorial1.ipynb b/tutorials/W2D5_Mysteries/W2D5_Tutorial1.ipynb index af88d951e..98f509406 100644 --- a/tutorials/W2D5_Mysteries/W2D5_Tutorial1.ipynb +++ b/tutorials/W2D5_Mysteries/W2D5_Tutorial1.ipynb @@ -23,7 +23,7 @@ "\n", "**By Neuromatch Academy**\n", "\n", - "__Content creators:__ Steve Fleming, Guillaume Dumas, Samuele Bolotta, Juan David Vargas, Hakwan Lau, Anil Seth, Megan Peters\n", + "__Content creators:__ Steve Fleming, Guillaume Dumas, Samuele Bolotta, Juan David Vargas, Hakwan Lau, Anil Seth, Claire Sergent, Megan Peters\n", "\n", "__Content reviewers:__ Samuele Bolotta, Lily Chamakura, RyeongKyung Yoon, Yizhou Chen, Ruiyi Zhang, Patrick Mineault, Alex Murphy\n", "\n", @@ -209,7 +209,7 @@ "logging.getLogger('matplotlib.font_manager').disabled = True\n", "\n", "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'retina' # perfrom high definition rendering for images and plots\n", + "%config 
InlineBackend.figure_format = 'retina' # perform high definition rendering for images and plots\n", "plt.style.use(\"https://raw.githubusercontent.com/NeuromatchAcademy/course-content/main/nma.mplstyle\")" ] },