From 3d0f4ac990f1ac3619b3aa9f98ca42399e08794f Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Mon, 20 Jan 2025 11:31:53 +0100 Subject: [PATCH 1/2] [RLlib] Add metrics to buffers. (#49822) --- rllib/algorithms/algorithm.py | 10 + rllib/algorithms/dqn/dqn.py | 7 + rllib/utils/metrics/__init__.py | 34 ++ .../replay_buffers/episode_replay_buffer.py | 388 +++++++++++++++++- .../prioritized_episode_buffer.py | 112 ++++- 5 files changed, 542 insertions(+), 9 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index beba22d4f719e..ad9797e3812cd 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -3962,6 +3962,16 @@ def _create_local_replay_buffer_if_necessary( ): return + # Add parameters, if necessary. + if config["replay_buffer_config"]["type"] in [ + "EpisodeReplayBuffer", + "PrioritizedEpisodeReplayBuffer", + ]: + # TODO (simon): If all episode buffers have metrics, check for sublassing. + config["replay_buffer_config"][ + "metrics_num_episodes_for_smoothing" + ] = self.config.metrics_num_episodes_for_smoothing + return from_config(ReplayBuffer, config["replay_buffer_config"]) @OldAPIStack diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 24e9afe26ddea..b328b664e0d32 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -51,6 +51,7 @@ NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_TARGET_UPDATES, REPLAY_BUFFER_ADD_DATA_TIMER, + REPLAY_BUFFER_RESULTS, REPLAY_BUFFER_SAMPLE_TIMER, REPLAY_BUFFER_UPDATE_PRIOS_TIMER, SAMPLE_TIMER, @@ -689,6 +690,12 @@ def _training_step_new_api_stack(self): sample_episodes=True, ) + # Get the replay buffer metrics. + replay_buffer_results = self.local_replay_buffer.get_metrics() + self.metrics.merge_and_log_n_dicts( + [replay_buffer_results], key=REPLAY_BUFFER_RESULTS + ) + # Perform an update on the buffer-sampled train batch. with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): learner_results = self.learner_group.update_from_episodes( diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 82d2ad63862bd..50dfe780b2509 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -36,6 +36,40 @@ ENV_TO_MODULE_SUM_EPISODES_LENGTH_IN = "env_to_module_sum_episodes_length_in" ENV_TO_MODULE_SUM_EPISODES_LENGTH_OUT = "env_to_module_sum_episodes_length_out" +# Counters for adding and evicting in replay buffers. 
+ACTUAL_N_STEP = "actual_n_step" +AGENT_ACTUAL_N_STEP = "agent_actual_n_step" +AGENT_STEP_UTILIZATION = "agent_step_utilization" +ENV_STEP_UTILIZATION = "env_step_utilization" +NUM_AGENT_EPISODES_STORED = "num_agent_episodes" +NUM_AGENT_EPISODES_ADDED = "num_agent_episodes_added" +NUM_AGENT_EPISODES_ADDED_LIFETIME = "num_agent_episodes_added_lifetime" +NUM_AGENT_EPISODES_EVICTED = "num_agent_episodes_evicted" +NUM_AGENT_EPISODES_EVICTED_LIFETIME = "num_agent_episodes_evicted_lifetime" +NUM_AGENT_EPISODES_PER_SAMPLE = "num_agent_episodes_per_sample" +NUM_AGENT_RESAMPLES = "num_agent_resamples" +NUM_AGENT_STEPS_ADDED = "num_agent_steps_added" +NUM_AGENT_STEPS_ADDED_LIFETIME = "num_agent_steps_added_lifetime" +NUM_AGENT_STEPS_EVICTED = "num_agent_steps_evicted" +NUM_AGENT_STEPS_EVICTED_LIFETIME = "num_agent_steps_evicted_lifetime" +NUM_AGENT_STEPS_PER_SAMPLE = "num_agent_steps_per_sample" +NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME = "num_agent_steps_per_sample_lifetime" +NUM_AGENT_STEPS_STORED = "num_agent_steps" +NUM_ENV_STEPS_STORED = "num_env_steps" +NUM_ENV_STEPS_ADDED = "num_env_steps_added" +NUM_ENV_STEPS_ADDED_LIFETIME = "num_env_steps_added_lifetime" +NUM_ENV_STEPS_EVICTED = "num_env_steps_evicted" +NUM_ENV_STEPS_EVICTED_LIFETIME = "num_env_steps_evicted_lifetime" +NUM_ENV_STEPS_PER_SAMPLE = "num_env_steps_per_sample" +NUM_ENV_STEPS_PER_SAMPLE_LIFETIME = "num_env_steps_per_sample_lifetime" +NUM_EPISODES_STORED = "num_episodes" +NUM_EPISODES_ADDED = "num_episodes_added" +NUM_EPISODES_ADDED_LIFETIME = "num_episodes_added_lifetime" +NUM_EPISODES_EVICTED = "num_episodes_evicted" +NUM_EPISODES_EVICTED_LIFETIME = "num_episodes_evicted_lifetime" +NUM_EPISODES_PER_SAMPLE = "num_episodes_per_sample" +NUM_RESAMPLES = "num_resamples" + EPISODE_DURATION_SEC_MEAN = "episode_duration_sec_mean" EPISODE_LEN_MEAN = "episode_len_mean" EPISODE_LEN_MAX = "episode_len_max" diff --git a/rllib/utils/replay_buffers/episode_replay_buffer.py b/rllib/utils/replay_buffers/episode_replay_buffer.py index d524cd013aa55..52197e5de0e05 100644 --- a/rllib/utils/replay_buffers/episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/episode_replay_buffer.py @@ -1,16 +1,58 @@ from collections import deque import copy +import hashlib from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import scipy +from ray.rllib.core import DEFAULT_AGENT_ID from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.env.utils.infinite_lookback_buffer import InfiniteLookbackBuffer -from ray.rllib.utils.annotations import override -from ray.rllib.utils.replay_buffers.base import ReplayBufferInterface -from ray.rllib.utils.typing import SampleBatchType from ray.rllib.utils import force_list +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.metrics import ( + ACTUAL_N_STEP, + AGENT_ACTUAL_N_STEP, + AGENT_STEP_UTILIZATION, + ENV_STEP_UTILIZATION, + NUM_AGENT_EPISODES_STORED, + NUM_AGENT_EPISODES_ADDED, + NUM_AGENT_EPISODES_ADDED_LIFETIME, + NUM_AGENT_EPISODES_EVICTED, + NUM_AGENT_EPISODES_EVICTED_LIFETIME, + NUM_AGENT_EPISODES_PER_SAMPLE, + NUM_AGENT_STEPS_STORED, + NUM_AGENT_STEPS_ADDED, + NUM_AGENT_STEPS_ADDED_LIFETIME, + NUM_AGENT_STEPS_EVICTED, + NUM_AGENT_STEPS_EVICTED_LIFETIME, + NUM_AGENT_STEPS_PER_SAMPLE, + NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_STORED, + NUM_ENV_STEPS_ADDED, + NUM_ENV_STEPS_ADDED_LIFETIME, + NUM_ENV_STEPS_EVICTED, 
+ NUM_ENV_STEPS_EVICTED_LIFETIME, + NUM_ENV_STEPS_PER_SAMPLE, + NUM_ENV_STEPS_PER_SAMPLE_LIFETIME, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_EPISODES_STORED, + NUM_EPISODES_ADDED, + NUM_EPISODES_ADDED_LIFETIME, + NUM_EPISODES_EVICTED, + NUM_EPISODES_EVICTED_LIFETIME, + NUM_EPISODES_PER_SAMPLE, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.replay_buffers.base import ReplayBufferInterface +from ray.rllib.utils.typing import SampleBatchType, ResultDict class EpisodeReplayBuffer(ReplayBufferInterface): @@ -65,6 +107,7 @@ def __init__( *, batch_size_B: int = 16, batch_length_T: int = 64, + metrics_num_episodes_for_smoothing: int = 100, ): """Initializes an EpisodeReplayBuffer instance. @@ -112,6 +155,10 @@ def __init__( self.rng = np.random.default_rng(seed=None) + # Initialize the metrics. + self.metrics = MetricsLogger() + self._metrics_num_episodes_for_smoothing = metrics_num_episodes_for_smoothing + @override(ReplayBufferInterface) def __len__(self) -> int: return self.get_num_timesteps() @@ -124,6 +171,12 @@ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]) """ episodes = force_list(episodes) + # Set up some counters for metrics. + num_env_steps_added = 0 + num_episodes_added = 0 + num_episodes_evicted = 0 + num_env_steps_evicted = 0 + for eps in episodes: # Make sure we don't change what's coming in from the user. # TODO (sven): It'd probably be better to make sure in the EnvRunner to not @@ -134,8 +187,12 @@ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]) # actually preferred). eps = copy.deepcopy(eps) - self._num_timesteps += len(eps) - self._num_timesteps_added += len(eps) + eps_len = len(eps) + # TODO (simon): Check, if we can deprecate these two + # variables and instead peek into the metrics. + self._num_timesteps += eps_len + self._num_timesteps_added += eps_len + num_env_steps_added += eps_len # Ongoing episode, concat to existing record. if eps.id_ in self.episode_id_to_index: @@ -146,6 +203,7 @@ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]) existing_eps.concat_episode(eps) # New episode. Add to end of our episodes deque. else: + num_episodes_added += 1 self.episodes.append(eps) eps_idx = len(self.episodes) - 1 + self._num_episodes_evicted self.episode_id_to_index[eps.id_] = eps_idx @@ -157,6 +215,8 @@ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]) # Eject oldest episode. evicted_eps = self.episodes.popleft() evicted_eps_len = len(evicted_eps) + num_episodes_evicted += 1 + num_env_steps_evicted += evicted_eps_len # Correct our size. self._num_timesteps -= evicted_eps_len @@ -206,6 +266,168 @@ def add(self, episodes: Union[List["SingleAgentEpisode"], "SingleAgentEpisode"]) # Increase episode evicted counter. self._num_episodes_evicted += 1 + self._update_add_metrics( + num_env_steps_added, + num_episodes_added, + num_episodes_evicted, + num_env_steps_evicted, + ) + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def _update_add_metrics( + self, + num_timesteps_added: int, + num_episodes_added: int, + num_episodes_evicted: int, + num_env_steps_evicted: int, + **kwargs, + ) -> None: + """Updates the replay buffer's adding metrics. + + Args: + num_timesteps_added: The total number of environment steps added to the + buffer in the `EpisodeReplayBuffer.add` call. + num_episodes_added: The total number of episodes added to the + buffer in the `EpisodeReplayBuffer.add` call. 
+            num_episodes_evicted: The total number of episodes evicted from
+                the buffer in the `EpisodeReplayBuffer.add` call. Note, this
+                does not include the number of episodes evicted before ever
+                being added to the buffer (i.e. this can happen if a lot of
+                episodes were added and the buffer's capacity is not large enough).
+            num_env_steps_evicted: The total number of environment steps evicted from
+                the buffer in the `EpisodeReplayBuffer.add` call. Note, this
+                does not include the number of steps evicted before ever
+                being added to the buffer (i.e. this can happen if a lot of
+                episodes were added and the buffer's capacity is not large enough).
+        """
+        # Get the actual number of agent steps residing in the buffer.
+        # TODO (simon): Write the same counters and getters as for the
+        # multi-agent buffers.
+        self.metrics.log_value(
+            (NUM_AGENT_STEPS_STORED, DEFAULT_AGENT_ID),
+            self.get_num_timesteps(),
+            reduce="mean",
+            window=self._metrics_num_episodes_for_smoothing,
+        )
+        # Number of timesteps added.
+        self.metrics.log_value(
+            (NUM_AGENT_STEPS_ADDED, DEFAULT_AGENT_ID),
+            num_timesteps_added,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_STEPS_ADDED_LIFETIME, DEFAULT_AGENT_ID),
+            num_timesteps_added,
+            reduce="sum",
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_STEPS_EVICTED, DEFAULT_AGENT_ID),
+            num_env_steps_evicted,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_STEPS_EVICTED_LIFETIME, DEFAULT_AGENT_ID),
+            num_env_steps_evicted,
+            reduce="sum",
+        )
+        # Whole buffer step metrics.
+        self.metrics.log_value(
+            NUM_ENV_STEPS_STORED,
+            self.get_num_timesteps(),
+            reduce="mean",
+            window=self._metrics_num_episodes_for_smoothing,
+        )
+        self.metrics.log_value(
+            NUM_ENV_STEPS_ADDED,
+            num_timesteps_added,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            NUM_ENV_STEPS_ADDED_LIFETIME,
+            num_timesteps_added,
+            reduce="sum",
+        )
+        self.metrics.log_value(
+            NUM_ENV_STEPS_EVICTED,
+            num_env_steps_evicted,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            NUM_ENV_STEPS_EVICTED_LIFETIME,
+            num_env_steps_evicted,
+            reduce="sum",
+        )
+
+        # Episode metrics.
+
+        # Number of episodes in the buffer.
+        self.metrics.log_value(
+            (NUM_AGENT_EPISODES_STORED, DEFAULT_AGENT_ID),
+            self.get_num_episodes(),
+            reduce="mean",
+            window=self._metrics_num_episodes_for_smoothing,
+        )
+        # Number of new episodes added. Note, this metric could
+        # be zero.
+        self.metrics.log_value(
+            (NUM_AGENT_EPISODES_ADDED, DEFAULT_AGENT_ID),
+            num_episodes_added,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_EPISODES_ADDED_LIFETIME, DEFAULT_AGENT_ID),
+            num_episodes_added,
+            reduce="sum",
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_EPISODES_EVICTED, DEFAULT_AGENT_ID),
+            num_episodes_evicted,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            (NUM_AGENT_EPISODES_EVICTED_LIFETIME, DEFAULT_AGENT_ID),
+            num_episodes_evicted,
+            reduce="sum",
+        )
+
+        # Whole buffer episode metrics.
+        self.metrics.log_value(
+            NUM_EPISODES_STORED,
+            self.get_num_episodes(),
+            reduce="mean",
+            window=self._metrics_num_episodes_for_smoothing,
+        )
+        # Number of new episodes added. Note, this metric could
+        # be zero.
+        self.metrics.log_value(
+            NUM_EPISODES_ADDED,
+            num_episodes_added,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            NUM_EPISODES_ADDED_LIFETIME,
+            num_episodes_added,
+            reduce="sum",
+        )
+        self.metrics.log_value(
+            NUM_EPISODES_EVICTED,
+            num_episodes_evicted,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            NUM_EPISODES_EVICTED_LIFETIME,
+            num_episodes_evicted,
+            reduce="sum",
+        )
+
     @override(ReplayBufferInterface)
     def sample(
         self,
@@ -349,6 +571,11 @@ def _sample_batch(
         is_terminated = [[False] * batch_length_T for _ in range(batch_size_B)]
         is_truncated = [[False] * batch_length_T for _ in range(batch_size_B)]
 
+        # Record all the env step buffer indices that are contained in the sample.
+        sampled_env_step_idxs = set()
+        # Record all the episode buffer indices that are contained in the sample.
+        sampled_episode_idxs = set()
+
         B = 0
         T = 0
         while B < batch_size_B:
@@ -413,10 +640,24 @@ def _sample_batch(
                 # Start filling the next row.
                 B += 1
                 T = 0
+                # Add the episode buffer index to the set of episode indices.
+                sampled_episode_idxs.add(episode_idx)
+                # Record a hash for the episode ID and timestep inside of the episode.
+                sampled_env_step_idxs.add(
+                    hashlib.sha256(f"{episode.id_}-{episode_ts}".encode()).hexdigest()
+                )
 
         # Update our sampled counter.
        self.sampled_timesteps += batch_size_B * batch_length_T
 
+        # Update the sample metrics.
+        self._update_sample_metrics(
+            num_env_steps_sampled=batch_size_B * batch_length_T,
+            num_episodes_per_sample=len(sampled_episode_idxs),
+            num_env_steps_per_sample=len(sampled_env_step_idxs),
+            sampled_n_step=None,
+        )
+
         # TODO: Return SampleBatch instead of this simpler dict.
         ret = {
             "obs": np.array(observations),
@@ -532,6 +773,12 @@ def _sample_episodes(
             self._last_sampled_indices = []
 
         sampled_episodes = []
+        # Record all the env step buffer indices that are contained in the sample.
+        sampled_env_step_idxs = set()
+        # Record all the episode buffer indices that are contained in the sample.
+        sampled_episode_idxs = set()
+        # Record all n-steps that have been used.
+        sampled_n_steps = []
 
         B = 0
         while B < batch_size_B:
@@ -619,7 +866,10 @@ def _sample_episodes(
                 ),
                 len_lookback_buffer=lookback,
             )
-
+            # Record a hash for the episode ID and timestep inside of the episode.
+            sampled_env_step_idxs.add(
+                hashlib.sha256(f"{episode.id_}-{episode_ts}".encode()).hexdigest()
+            )
             # Remove reference to sampled episode.
             del episode
 
@@ -636,15 +886,135 @@ def _sample_episodes(
 
             # Append the sampled episode.
             sampled_episodes.append(sampled_episode)
+            sampled_episode_idxs.add(episode_idx)
+            sampled_n_steps.append(actual_n_step)
 
             # Increment counter.
-            B += (actual_length - episode_ts + 1) or 1
+            B += (actual_length - episode_ts - (actual_n_step - 1) + 1) or 1
 
         # Update the metric.
         self.sampled_timesteps += batch_size_B
 
+        # Update the sample metrics.
+        self._update_sample_metrics(
+            batch_size_B,
+            len(sampled_episode_idxs),
+            len(sampled_env_step_idxs),
+            sum(sampled_n_steps) / batch_size_B,
+        )
+
         return sampled_episodes
 
+    @OverrideToImplementCustomLogic_CallToSuperRecommended
+    def _update_sample_metrics(
+        self,
+        num_env_steps_sampled: int,
+        num_episodes_per_sample: int,
+        num_env_steps_per_sample: int,
+        sampled_n_step: Optional[float],
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        """Updates the replay buffer's sample metrics.
+
+        Args:
+            num_env_steps_sampled: The number of environment steps sampled
+                this iteration in the `sample` method.
+            num_episodes_per_sample: The number of unique episodes in the
+                sample.
+ num_env_steps_per_sample: The number of unique environment steps + in the sample. + sampled_n_step: The mean n-step used in the sample. Note, this + is constant, if the n-step is not sampled. + """ + if sampled_n_step: + self.metrics.log_value( + ACTUAL_N_STEP, + sampled_n_step, + reduce="mean", + window=self._metrics_num_episodes_for_smoothing, + ) + self.metrics.log_value( + (AGENT_ACTUAL_N_STEP, DEFAULT_AGENT_ID), + sampled_n_step, + reduce="mean", + window=self._metrics_num_episodes_for_smoothing, + ) + self.metrics.log_value( + (NUM_AGENT_EPISODES_PER_SAMPLE, DEFAULT_AGENT_ID), + num_episodes_per_sample, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_PER_SAMPLE, DEFAULT_AGENT_ID), + num_env_steps_per_sample, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME, DEFAULT_AGENT_ID), + num_env_steps_per_sample, + reduce="sum", + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED, DEFAULT_AGENT_ID), + num_env_steps_sampled, + reduce="sum", + clear_on_reduce=True, + ) + # TODO (simon): Check, if we can then deprecate + # self.sampled_timesteps. + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID), + num_env_steps_sampled, + reduce="sum", + ) + self.metrics.log_value( + (AGENT_STEP_UTILIZATION, DEFAULT_AGENT_ID), + self.metrics.peek((NUM_AGENT_STEPS_PER_SAMPLE_LIFETIME, DEFAULT_AGENT_ID)) + / self.metrics.peek((NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID)), + reduce="mean", + window=self._metrics_num_episodes_for_smoothing, + ) + # Whole buffer sampled env steps metrics. + self.metrics.log_value( + NUM_EPISODES_PER_SAMPLE, + num_episodes_per_sample, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + NUM_ENV_STEPS_PER_SAMPLE, + num_env_steps_per_sample, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + NUM_ENV_STEPS_PER_SAMPLE_LIFETIME, + num_env_steps_per_sample, + reduce="sum", + ) + self.metrics.log_value( + NUM_ENV_STEPS_SAMPLED, + num_env_steps_sampled, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + NUM_ENV_STEPS_SAMPLED_LIFETIME, + num_env_steps_sampled, + reduce="sum", + ) + self.metrics.log_value( + ENV_STEP_UTILIZATION, + self.metrics.peek(NUM_ENV_STEPS_PER_SAMPLE_LIFETIME) + / self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME), + reduce="mean", + window=self._metrics_num_episodes_for_smoothing, + ) + + # TODO (simon): Check, if we can instead peek into the metrics + # and deprecate all variables. def get_num_episodes(self) -> int: """Returns number of episodes (completed or truncated) stored in the buffer.""" return len(self.episodes) @@ -665,6 +1035,10 @@ def get_added_timesteps(self) -> int: """Returns number of timesteps that have been added in buffer's lifetime.""" return self._num_timesteps_added + def get_metrics(self) -> ResultDict: + """Returns the metrics of the buffer and reduces them.""" + return self.metrics.reduce() + @override(ReplayBufferInterface) def get_state(self) -> Dict[str, Any]: """Gets a pickable state of the buffer. 
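
As a rough usage sketch (not part of the patch itself), the metrics added to
`EpisodeReplayBuffer` above could be consumed as follows; `episodes` is assumed
to be a list of already collected `SingleAgentEpisode` objects, and the key
strings mentioned in the comments are the constants defined in
`rllib/utils/metrics/__init__.py` above:

    from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer

    # Build a buffer with the smoothing-window parameter introduced in this patch.
    buffer = EpisodeReplayBuffer(
        capacity=10000,
        metrics_num_episodes_for_smoothing=100,
    )

    # `episodes`: a list of previously collected SingleAgentEpisode objects (assumed).
    buffer.add(episodes)  # Logs "num_env_steps_added", "num_episodes_added", etc.
    batch = buffer.sample(batch_size_B=16, batch_length_T=64)  # Logs sampling metrics.

    # Reduce and return the logged metrics as a result dict, containing for example
    # "num_env_steps_added" and "env_step_utilization".
    stats = buffer.get_metrics()
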
diff --git a/rllib/utils/replay_buffers/prioritized_episode_buffer.py b/rllib/utils/replay_buffers/prioritized_episode_buffer.py index 02982d51ef6a3..f6ca7e548c487 100644 --- a/rllib/utils/replay_buffers/prioritized_episode_buffer.py +++ b/rllib/utils/replay_buffers/prioritized_episode_buffer.py @@ -1,4 +1,5 @@ import copy +import hashlib import numpy as np import scipy @@ -6,11 +7,19 @@ from numpy.typing import NDArray from typing import Any, Dict, List, Optional, Tuple, Union +from ray.rllib.core import DEFAULT_AGENT_ID from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.execution.segment_tree import MinSegmentTree, SumSegmentTree from ray.rllib.utils import force_list +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.metrics import ( + NUM_AGENT_RESAMPLES, + NUM_RESAMPLES, +) from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer -from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import ModuleID, SampleBatchType @@ -118,6 +127,7 @@ def __init__( batch_size_B: int = 16, batch_length_T: int = 1, alpha: float = 1.0, + metrics_num_episodes_for_smoothing: int = 100, **kwargs, ): """Initializes a `PrioritizedEpisodeReplayBuffer` object @@ -132,7 +142,10 @@ def __init__( prioritization, `alpha=0.0` means no prioritization. """ super().__init__( - capacity=capacity, batch_size_B=batch_size_B, batch_length_T=batch_length_T + capacity=capacity, + batch_size_B=batch_size_B, + batch_length_T=batch_length_T, + metrics_num_episodes_for_smoothing=metrics_num_episodes_for_smoothing, ) # `alpha` should be non-negative. @@ -196,6 +209,12 @@ def add( episodes = force_list(episodes) + # Set up some counters for metrics. + num_env_steps_added = 0 + num_episodes_added = 0 + num_episodes_evicted = 0 + num_env_steps_evicted = 0 + # Add first the timesteps of new episodes to have info about how many # episodes should be evicted to stay below capacity. new_episode_ids = [] @@ -215,6 +234,8 @@ def add( eps_evicted.append(self.episodes.popleft()) eps_evicted_ids.append(eps_evicted[-1].id_) eps_evicted_idxs.append(self.episode_id_to_index.pop(eps_evicted_ids[-1])) + num_episodes_evicted += 1 + num_env_steps_evicted += len(eps_evicted[-1]) # If this episode has a new chunk in the new episodes added, # we subtract it again. # TODO (sven, simon): Should we just treat such an episode chunk @@ -282,6 +303,7 @@ def add( existing_eps.concat_episode(eps) # Otherwise, create a new entry. else: + num_episodes_added += 1 self.episodes.append(eps) eps_idx = len(self.episodes) - 1 + self._num_episodes_evicted self.episode_id_to_index[eps.id_] = eps_idx @@ -295,9 +317,18 @@ def add( for i in range(len(eps)) ] ) + num_env_steps_added += len(eps) # Increase index to the new length of `self._indices`. j = len(self._indices) + # Increase metrics. + self._update_add_metrics( + num_env_steps_added, + num_episodes_added, + num_episodes_evicted, + num_env_steps_evicted, + ) + @override(EpisodeReplayBuffer) def sample( self, @@ -391,6 +422,14 @@ def sample( self._last_sampled_indices = [] sampled_episodes = [] + # Record the sampled episode buffer indices to check the number of + # episodes per sample. + sampled_episode_idxs = set() + # Record sampled env step hashes to check the number of different + # env steps per sample. 
+        sampled_env_steps_idxs = set()
+        num_resamples = 0
+        sampled_n_steps = []
 
         # Sample proportionally from replay buffer's segments using the weights.
         total_segment_sum = self._sum_segment.sum()
@@ -429,6 +468,7 @@ def sample(
             # Skip, if we are too far to the end and `episode_ts` + n_step would go
             # beyond the episode's end.
             if episode_ts + actual_n_step > len(episode):
+                num_resamples += 1
                 continue
 
             # Note, this will be the reward after executing action
@@ -492,20 +532,88 @@ def sample(
                 len_lookback_buffer=0,
                 t_started=episode_ts,
             )
+            # Record the episode ID and time step via a hash code.
+            sampled_env_steps_idxs.add(
+                hashlib.sha256(f"{episode.id_}-{episode_ts}".encode()).hexdigest()
+            )
+
             # Convert to numpy arrays, if required.
             if to_numpy:
                 sampled_episode.to_numpy()
 
             sampled_episodes.append(sampled_episode)
+            # Add the episode buffer index to the sampled indices.
+            sampled_episode_idxs.add(episode_idx)
+            # Record the actual n-step for this sample.
+            sampled_n_steps.append(actual_n_step)
 
             # Increment counter.
             B += 1
 
             # Keep track of sampled indices for updating priorities later.
             self._last_sampled_indices.append(idx)
 
+        # Add to the sampled timesteps counter of the buffer.
         self.sampled_timesteps += batch_size_B
 
+        # Update the sample metrics.
+        self._update_sample_metrics(
+            batch_size_B,
+            len(sampled_episode_idxs),
+            len(sampled_env_steps_idxs),
+            sum(sampled_n_steps) / batch_size_B,
+            num_resamples,
+        )
+
         return sampled_episodes
 
+    @override(EpisodeReplayBuffer)
+    @OverrideToImplementCustomLogic_CallToSuperRecommended
+    def _update_sample_metrics(
+        self,
+        num_env_steps_sampled: int,
+        num_episodes_per_sample: int,
+        num_env_steps_per_sample: int,
+        sampled_n_step: Optional[float],
+        num_resamples: int,
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        """Updates the replay buffer's sample metrics.
+
+        Args:
+            num_env_steps_sampled: The number of environment steps sampled
+                this iteration in the `sample` method.
+            num_episodes_per_sample: The number of unique episodes in the
+                sample.
+            num_env_steps_per_sample: The number of unique environment steps
+                in the sample.
+            sampled_n_step: The mean n-step used in the sample. Note, this
+                is constant, if the n-step is not sampled.
+            num_resamples: The total number of times environment steps needed to
+                be resampled. Resampling happens if the sampled time step is
+                too near to the episode's end to cover the complete n-step.
+        """
+        # Call the super's method to increase all regular sample metrics.
+        super()._update_sample_metrics(
+            num_env_steps_sampled,
+            num_episodes_per_sample,
+            num_env_steps_per_sample,
+            sampled_n_step,
+        )
+
+        # Add the metrics for resamples.
+        self.metrics.log_value(
+            (NUM_AGENT_RESAMPLES, DEFAULT_AGENT_ID),
+            num_resamples,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+        self.metrics.log_value(
+            NUM_RESAMPLES,
+            num_resamples,
+            reduce="sum",
+            clear_on_reduce=True,
+        )
+
     @override(EpisodeReplayBuffer)
     def get_state(self) -> Dict[str, Any]:
         """Gets the state of a `PrioritizedEpisodeReplayBuffer`.

From 243927c335b474dd685e8f857bdffa8fa5f00ce3 Mon Sep 17 00:00:00 2001
From: Sven Mika 
Date: Mon, 20 Jan 2025 11:59:17 +0100
Subject: [PATCH 2/2] [RLlib; docs] Docs do-over (new API stack): Move "new API
 stack" info box right below page titles for better visibility.
(#49921) --- doc/source/rllib/algorithm-config.rst | 4 +- doc/source/rllib/checkpoints.rst | 6 +- doc/source/rllib/external-envs.rst | 5 +- doc/source/rllib/hierarchical-envs.rst | 5 +- doc/source/rllib/key-concepts.rst | 6 +- doc/source/rllib/multi-agent-envs.rst | 4 +- .../rllib/new-api-stack-migration-guide.rst | 5 +- .../rllib/package_ref/algorithm-config.rst | 5 +- doc/source/rllib/package_ref/algorithm.rst | 4 +- doc/source/rllib/package_ref/callback.rst | 4 +- .../rllib/package_ref/distributions.rst | 5 +- doc/source/rllib/package_ref/env.rst | 5 +- .../rllib/package_ref/env/env_runner.rst | 5 +- .../rllib/package_ref/env/multi_agent_env.rst | 5 +- .../env/multi_agent_env_runner.rst | 4 +- .../package_ref/env/multi_agent_episode.rst | 4 +- .../env/single_agent_env_runner.rst | 4 +- .../package_ref/env/single_agent_episode.rst | 4 +- doc/source/rllib/package_ref/env/utils.rst | 4 +- doc/source/rllib/package_ref/index.rst | 5 +- doc/source/rllib/package_ref/learner.rst | 6 +- doc/source/rllib/package_ref/offline.rst | 6 +- .../rllib/package_ref/replay-buffers.rst | 5 +- doc/source/rllib/package_ref/rl_modules.rst | 4 +- doc/source/rllib/package_ref/utils.rst | 5 +- doc/source/rllib/rllib-algorithms.rst | 4 +- doc/source/rllib/rllib-callback.rst | 4 +- doc/source/rllib/rllib-env.rst | 6 +- doc/source/rllib/rllib-examples.rst | 5 +- doc/source/rllib/rllib-offline.rst | 69 +++++++++---------- doc/source/rllib/rllib-rlmodule.rst | 4 +- doc/source/rllib/scaling-guide.rst | 4 +- doc/source/rllib/single-agent-episode.rst | 6 +- doc/source/rllib/user-guides.rst | 6 +- 34 files changed, 99 insertions(+), 128 deletions(-) diff --git a/doc/source/rllib/algorithm-config.rst b/doc/source/rllib/algorithm-config.rst index 5e5336ebe0bc7..d6b53763a54f6 100644 --- a/doc/source/rllib/algorithm-config.rst +++ b/doc/source/rllib/algorithm-config.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-algo-configuration-docs: AlgorithmConfig API =================== +.. include:: /_includes/rllib/new_api_stack.rst + RLlib's :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` API is the auto-validated and type-safe gateway into configuring and building an RLlib :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`. diff --git a/doc/source/rllib/checkpoints.rst b/doc/source/rllib/checkpoints.rst index f7acaf0c24669..ed98b263ad405 100644 --- a/doc/source/rllib/checkpoints.rst +++ b/doc/source/rllib/checkpoints.rst @@ -1,14 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _rllib-checkpoints-docs: Checkpointing ============= +.. include:: /_includes/rllib/new_api_stack.rst + RLlib offers a powerful checkpointing system for all its major classes, allowing you to save the states of :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instances and their subcomponents to local disk or cloud storage, and restore previously run experiment states and individual subcomponents. diff --git a/doc/source/rllib/external-envs.rst b/doc/source/rllib/external-envs.rst index d9e4392b01bd7..5de952db46d1c 100644 --- a/doc/source/rllib/external-envs.rst +++ b/doc/source/rllib/external-envs.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-external-env-setups-doc: - External Environments and Applications ====================================== +.. 
include:: /_includes/rllib/new_api_stack.rst + In many situations, it doesn't make sense for an RL environment to be "stepped" by RLlib. For example, if you train a policy inside a complex simulator that operates its own execution loop, like a game engine or a robotics simulation. A natural and user friendly approach is to flip this setup around diff --git a/doc/source/rllib/hierarchical-envs.rst b/doc/source/rllib/hierarchical-envs.rst index f9c52d5d2675e..70c160b90e2c3 100644 --- a/doc/source/rllib/hierarchical-envs.rst +++ b/doc/source/rllib/hierarchical-envs.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-hierarchical-environments-doc: - Hierarchical Environments ========================= +.. include:: /_includes/rllib/new_api_stack.rst + You can implement hierarchical training as a special case of multi-agent RL. For example, consider a two-level hierarchy of policies, where a top-level policy issues high level tasks that are executed at a finer timescale by one or more low-level policies. The following timeline shows one step of the top-level policy, which corresponds to four low-level actions: diff --git a/doc/source/rllib/key-concepts.rst b/doc/source/rllib/key-concepts.rst index e3d80e11be4ee..b5496dd38279c 100644 --- a/doc/source/rllib/key-concepts.rst +++ b/doc/source/rllib/key-concepts.rst @@ -1,14 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _rllib-key-concepts: Key concepts ============ +.. include:: /_includes/rllib/new_api_stack.rst + To help you get a high-level understanding of how the library works, on this page, you learn about the key concepts and general architecture of RLlib. diff --git a/doc/source/rllib/multi-agent-envs.rst b/doc/source/rllib/multi-agent-envs.rst index f5fdefe52d9a4..9944a1a619451 100644 --- a/doc/source/rllib/multi-agent-envs.rst +++ b/doc/source/rllib/multi-agent-envs.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-multi-agent-environments-doc: Multi-Agent Environments ======================== +.. include:: /_includes/rllib/new_api_stack.rst + In a multi-agent environment, multiple "agents" act simultaneously, in a turn-based sequence, or through an arbitrary combination of both. diff --git a/doc/source/rllib/new-api-stack-migration-guide.rst b/doc/source/rllib/new-api-stack-migration-guide.rst index 650aa369396cc..9ba4e3e63f632 100644 --- a/doc/source/rllib/new-api-stack-migration-guide.rst +++ b/doc/source/rllib/new-api-stack-migration-guide.rst @@ -1,8 +1,5 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _rllib-new-api-stack-migration-guide: .. testcode:: @@ -15,6 +12,8 @@ New API stack migration guide ============================= +.. include:: /_includes/rllib/new_api_stack.rst + This page explains, step by step, how to convert and translate your existing old API stack RLlib classes and code to RLlib's new API stack. diff --git a/doc/source/rllib/package_ref/algorithm-config.rst b/doc/source/rllib/package_ref/algorithm-config.rst index e290606ef4b9f..725252c61df98 100644 --- a/doc/source/rllib/package_ref/algorithm-config.rst +++ b/doc/source/rllib/package_ref/algorithm-config.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. 
_algorithm-config-reference-docs: - Algorithm Configuration API =========================== +.. include:: /_includes/rllib/new_api_stack.rst + .. currentmodule:: ray.rllib.algorithms.algorithm_config Constructor diff --git a/doc/source/rllib/package_ref/algorithm.rst b/doc/source/rllib/package_ref/algorithm.rst index bc72e6be3e29f..0555a1a1f510a 100644 --- a/doc/source/rllib/package_ref/algorithm.rst +++ b/doc/source/rllib/package_ref/algorithm.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _algorithm-reference-docs: Algorithms ========== +.. include:: /_includes/rllib/new_api_stack.rst + The :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class is the highest-level API in RLlib responsible for **WHEN** and **WHAT** of RL algorithms. Things like **WHEN** should we sample the algorithm, **WHEN** should we perform a neural network update, and so on. The **HOW** will be delegated to components such as ``RolloutWorker``, etc.. diff --git a/doc/source/rllib/package_ref/callback.rst b/doc/source/rllib/package_ref/callback.rst index 0636ad3386ac4..9b7cdbffc1ed6 100644 --- a/doc/source/rllib/package_ref/callback.rst +++ b/doc/source/rllib/package_ref/callback.rst @@ -1,10 +1,12 @@ -.. include:: /_includes/rllib/new_api_stack.rst +.. include:: /_includes/rllib/we_are_hiring.rst .. _rllib-callback-reference-docs: Callback APIs ============= +.. include:: /_includes/rllib/new_api_stack.rst + Callback APIs enable you to inject code into an experiment, an Algorithm, and the subcomponents of an Algorithm. diff --git a/doc/source/rllib/package_ref/distributions.rst b/doc/source/rllib/package_ref/distributions.rst index 954e4ded09f81..f01fa27f92c28 100644 --- a/doc/source/rllib/package_ref/distributions.rst +++ b/doc/source/rllib/package_ref/distributions.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _rllib-distributions-reference-docs: Distribution API ================ +.. include:: /_includes/rllib/new_api_stack.rst + .. currentmodule:: ray.rllib.models.distributions Base Distribution class diff --git a/doc/source/rllib/package_ref/env.rst b/doc/source/rllib/package_ref/env.rst index b5eaf762abbc8..b8a49f1965087 100644 --- a/doc/source/rllib/package_ref/env.rst +++ b/doc/source/rllib/package_ref/env.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _env-reference-docs: Environments ============ +.. include:: /_includes/rllib/new_api_stack.rst + RLlib mainly supports the `Farama gymnasium API `__ for single-agent environments, and RLlib's own :py:class:`~ray.rllib.env.multi_agent_env.MultiAgentEnv` API for multi-agent setups. diff --git a/doc/source/rllib/package_ref/env/env_runner.rst b/doc/source/rllib/package_ref/env/env_runner.rst index 6dc1f9fee626a..b1f7fb8401adb 100644 --- a/doc/source/rllib/package_ref/env/env_runner.rst +++ b/doc/source/rllib/package_ref/env/env_runner.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _env-runner-reference-docs: EnvRunner API ============= +.. 
include:: /_includes/rllib/new_api_stack.rst + rllib.env.env_runner.EnvRunner ------------------------------ diff --git a/doc/source/rllib/package_ref/env/multi_agent_env.rst b/doc/source/rllib/package_ref/env/multi_agent_env.rst index 496be69237667..18702f75b3fec 100644 --- a/doc/source/rllib/package_ref/env/multi_agent_env.rst +++ b/doc/source/rllib/package_ref/env/multi_agent_env.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _multi-agent-env-reference-docs: MultiAgentEnv API ================= +.. include:: /_includes/rllib/new_api_stack.rst + rllib.env.multi_agent_env.MultiAgentEnv --------------------------------------- diff --git a/doc/source/rllib/package_ref/env/multi_agent_env_runner.rst b/doc/source/rllib/package_ref/env/multi_agent_env_runner.rst index 8e21dd456f233..270a0150e3b24 100644 --- a/doc/source/rllib/package_ref/env/multi_agent_env_runner.rst +++ b/doc/source/rllib/package_ref/env/multi_agent_env_runner.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _multi-agent-env-runner-reference-docs: MultiAgentEnvRunner API ======================= +.. include:: /_includes/rllib/new_api_stack.rst + rllib.env.multi_agent_env_runner.MultiAgentEnvRunner ---------------------------------------------------- diff --git a/doc/source/rllib/package_ref/env/multi_agent_episode.rst b/doc/source/rllib/package_ref/env/multi_agent_episode.rst index a22e11bb42002..892a3242e3780 100644 --- a/doc/source/rllib/package_ref/env/multi_agent_episode.rst +++ b/doc/source/rllib/package_ref/env/multi_agent_episode.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _multi-agent-episode-reference-docs: MultiAgentEpisode API ===================== +.. include:: /_includes/rllib/new_api_stack.rst + rllib.env.multi_agent_episode.MultiAgentEpisode ----------------------------------------------- diff --git a/doc/source/rllib/package_ref/env/single_agent_env_runner.rst b/doc/source/rllib/package_ref/env/single_agent_env_runner.rst index 2ac9e8d0afff7..4127bce127f42 100644 --- a/doc/source/rllib/package_ref/env/single_agent_env_runner.rst +++ b/doc/source/rllib/package_ref/env/single_agent_env_runner.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _single-agent-env-runner-reference-docs: SingleAgentEnvRunner API ======================== +.. include:: /_includes/rllib/new_api_stack.rst + rllib.env.single_agent_env_runner.SingleAgentEnvRunner ------------------------------------------------------ diff --git a/doc/source/rllib/package_ref/env/single_agent_episode.rst b/doc/source/rllib/package_ref/env/single_agent_episode.rst index c0006a46adfd5..c48f1dba68bde 100644 --- a/doc/source/rllib/package_ref/env/single_agent_episode.rst +++ b/doc/source/rllib/package_ref/env/single_agent_episode.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _single-agent-episode-reference-docs: SingleAgentEpisode API ====================== +.. 
include:: /_includes/rllib/new_api_stack.rst + rllib.env.single_agent_episode.SingleAgentEpisode ------------------------------------------------- diff --git a/doc/source/rllib/package_ref/env/utils.rst b/doc/source/rllib/package_ref/env/utils.rst index adf293b676808..49a884bd6bc45 100644 --- a/doc/source/rllib/package_ref/env/utils.rst +++ b/doc/source/rllib/package_ref/env/utils.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _env-utils-reference-docs: Env Utils ========= +.. include:: /_includes/rllib/new_api_stack.rst + rllib.env.utils --------------- diff --git a/doc/source/rllib/package_ref/index.rst b/doc/source/rllib/package_ref/index.rst index 7775df5327aa3..5638c44be5098 100644 --- a/doc/source/rllib/package_ref/index.rst +++ b/doc/source/rllib/package_ref/index.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-reference-docs: Ray RLlib API ============= +.. include:: /_includes/rllib/new_api_stack.rst + .. tip:: We'd love to hear your feedback on using RLlib - `sign up to our forum and start asking questions `_! This section contains an overview of RLlib's package- and API reference. diff --git a/doc/source/rllib/package_ref/learner.rst b/doc/source/rllib/package_ref/learner.rst index ef2b5edfeee43..9d7ca2e126aca 100644 --- a/doc/source/rllib/package_ref/learner.rst +++ b/doc/source/rllib/package_ref/learner.rst @@ -1,14 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _learner-reference-docs: - LearnerGroup API ================ +.. include:: /_includes/rllib/new_api_stack.rst + Configuring a LearnerGroup and Learner actors --------------------------------------------- diff --git a/doc/source/rllib/package_ref/offline.rst b/doc/source/rllib/package_ref/offline.rst index 00e55feff9ce3..fb38e2dabfd03 100644 --- a/doc/source/rllib/package_ref/offline.rst +++ b/doc/source/rllib/package_ref/offline.rst @@ -1,14 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _new-api-offline-reference-docs: - Offline RL API ============== +.. include:: /_includes/rllib/new_api_stack.rst + Configuring Offline RL ---------------------- diff --git a/doc/source/rllib/package_ref/replay-buffers.rst b/doc/source/rllib/package_ref/replay-buffers.rst index cc9d0ce055981..89f79c843aef1 100644 --- a/doc/source/rllib/package_ref/replay-buffers.rst +++ b/doc/source/rllib/package_ref/replay-buffers.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _replay-buffer-api-reference-docs: Replay Buffer API ================= +.. include:: /_includes/rllib/new_api_stack.rst + The following classes don't take into account the separation of experiences from different policies, multi-agent replay buffers will be explained further below. Replay Buffer Base Classes diff --git a/doc/source/rllib/package_ref/rl_modules.rst b/doc/source/rllib/package_ref/rl_modules.rst index 405b2b67fc261..03c4b9250603c 100644 --- a/doc/source/rllib/package_ref/rl_modules.rst +++ b/doc/source/rllib/package_ref/rl_modules.rst @@ -1,13 +1,11 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rlmodule-reference-docs: RLModule APIs ============= +.. 
include:: /_includes/rllib/new_api_stack.rst RLModule specifications and configurations ------------------------------------------- diff --git a/doc/source/rllib/package_ref/utils.rst b/doc/source/rllib/package_ref/utils.rst index d0bfca886353c..4418e1b0d5b22 100644 --- a/doc/source/rllib/package_ref/utils.rst +++ b/doc/source/rllib/package_ref/utils.rst @@ -1,13 +1,12 @@ - .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _utils-reference-docs: RLlib Utilities =============== +.. include:: /_includes/rllib/new_api_stack.rst + Here is a list of all the utilities available in RLlib. Scheduler API diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 4c4a5dd1cd5b9..3b88c4c265007 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-algorithms-doc: Algorithms ========== +.. include:: /_includes/rllib/new_api_stack.rst + The following table is an overview of all available algorithms in RLlib. Note that all algorithms support multi-GPU training on a single (GPU) node in `Ray (open-source) `__ (|multi_gpu|) as well as multi-GPU training on multi-node (GPU) clusters when using the `Anyscale platform `__ diff --git a/doc/source/rllib/rllib-callback.rst b/doc/source/rllib/rllib-callback.rst index 611ea0b5e3a33..cd26d543d07d9 100644 --- a/doc/source/rllib/rllib-callback.rst +++ b/doc/source/rllib/rllib-callback.rst @@ -1,10 +1,12 @@ -.. include:: /_includes/rllib/new_api_stack.rst +.. include:: /_includes/rllib/we_are_hiring.rst .. _rllib-callback-docs: Callbacks ========= +.. include:: /_includes/rllib/new_api_stack.rst + Callbacks are the most straightforward way to inject code into experiments. You can define the code to execute at certain events and pass it to your :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig`. diff --git a/doc/source/rllib/rllib-env.rst b/doc/source/rllib/rllib-env.rst index 77289711955de..d42576bcc734e 100644 --- a/doc/source/rllib/rllib-env.rst +++ b/doc/source/rllib/rllib-env.rst @@ -1,7 +1,5 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-environments-doc: Environments @@ -14,9 +12,7 @@ Environments hierarchical-envs external-envs - -Overview --------- +.. include:: /_includes/rllib/new_api_stack.rst .. grid:: 1 2 3 4 :gutter: 1 diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 2a8df0d58b053..566b125380eb6 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _rllib-examples-overview-docs: Examples ======== +.. include:: /_includes/rllib/new_api_stack.rst + This page contains an index of all the python scripts in the `examples folder `__ of RLlib, demonstrating the different use cases and features of the library. diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst index c587fef055fe6..6630bdf497452 100644 --- a/doc/source/rllib/rllib-offline.rst +++ b/doc/source/rllib/rllib-offline.rst @@ -1,12 +1,9 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. 
include:: /_includes/rllib/new_api_stack.rst - -Working with Offline Data +Working with offline data ========================= -Getting Started ---------------- +.. include:: /_includes/rllib/new_api_stack.rst RLlib's offline RL API enables you to work with experiences read from offline storage (for example, disk, cloud storage, streaming systems, Hadoop Distributed File System (HDFS). For example, you might want to read experiences saved from previous training runs, collected @@ -42,7 +39,7 @@ of the API is to apply as many data transformations as possible on-the-fly prior :py:class:`~ray.rllib.policy.sample_batch.SampleBatch` data recorded with the old API stack. To enable this feature set ``config.offline_data(input_read_sample_batches=True)``. -Example: Training an Expert Policy +Example: Training an expert policy ---------------------------------- In this example you train a PPO agent on the ``CartPole-v1`` environment until it reaches an episode mean return of ``450.0``. You checkpoint this agent and then use its policy to record expert data to local disk. @@ -130,7 +127,7 @@ this agent and then use its policy to record expert data to local disk. In this example, you saved a checkpoint from an agent that has become an expert at playing ``CartPole-v1``. You use this checkpoint in the next example to record expert data to disk, which is later utilized for offline training to clone another agent. -Example: Record Expert Data to Local Disk +Example: Record expert data to local disk ----------------------------------------- After you train an expert policy to play `CartPole-v1` you load its policy here to record expert data during evaluation. You use ``5`` :py:class:`~ray.rllib.offline.offline_env_runner.OfflineSingleAgentEnvRunner` instances to collect ``50`` complete episodes per `sample()` call. In this @@ -242,7 +239,7 @@ evaluation, enabling parallel data writing. You can explore the folder to review .. note:: The number of write operations per worker may vary because policy rollouts aren't evenly distributed. Faster workers collect more episodes, leading to differences in write operation counts. As a result, the second numbering may differ across files generated by different env-runner instances. -Example: Training on Previously Saved Experiences +Example: Training on previously saved experiences ------------------------------------------------- In this example you are using behavior cloning with the previously recorded Parquet data from your expert policy playing ``CartPole-v1``. The data needs to be linked in the configuration of the algorithm (through the ``input_`` attribute). @@ -346,7 +343,7 @@ complex environments, which require more sophisticated agents and significantly can be highly beneficial. Combining behavior cloning with subsequent fine-tuning using a reinforcement learning algorithm can substantially reduce training time, resource consumption, and associated costs. -Using External Expert Experiences +Using external expert experiences --------------------------------- Your expert data is often already available, either recorded from an operational system or directly provided by human experts. Typically, you might store this data in a tabular (columnar) format. RLlib's new Offline RL API simplifies the use of such data by allowing direct ingestion @@ -381,7 +378,7 @@ you provide this schema as follows: .. note:: Internally, the legacy ``gym``'s ``done`` signals are mapped to ``gymnasium``'s ``terminated`` signals, with ``truncated`` values defaulting to ``False``. 
RLlib's :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` structures align with ``gymnasium``, adhering to the updated environment API standards in reinforcement learning. -Converting Tabular Data to RLlib's Episode Format +Converting tabular data to RLlib's episode format ------------------------------------------------- While the tabular format is widely compatible and seamlessly integrates with RLlib's new Offline RL API, there are cases where you may prefer to use RLlib's native episode format. As briefly mentioned earlier, such scenarios typically arise when full expert trajectories are required. @@ -592,7 +589,7 @@ your own data into RLlib's :py:class:`~ray.rllib.env.single_agent_episode.Single del episodes_ds episodes.clear() -Using Old API Stack ``SampleBatch`` Recordings +Using old API stack ``SampleBatch`` recordings ---------------------------------------------- If you have expert data previously recorded using RLlib's old API stack, it can be seamlessly utilized in the new stack's Offline RL API by setting ``input_read_sample_batches=True``. Alternatively, you can convert your ``SampleBatch`` recordings into :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` format using RLlib's @@ -630,7 +627,7 @@ you can convert your ``SampleBatch`` recordings into :py:class:`~ray.rllib.env.s .. note:: RLlib considers your :py:class:`~ray.rllib.policy.sample_batch.SampleBatch` to represent a terminated/truncated episode and builds its :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` according to this assumption. -Pre-Processing, Filtering and Post-Processing +Pre-processing, filtering and post-processing --------------------------------------------- During recording, your expert policy may utilize pre-processing techniques for observations, such as *frame-stacking*, or filtering methods like *mean-std filtering*. Similarly, actions may undergo pre-processing, such as *action @@ -643,7 +640,7 @@ the expert policy uses *mean-std filtering* for observations, it learns a strate policy, it's essential to use the exact same filter during evaluation to avoid performance degradation. Similarly, a policy trained through behavior cloning may also require a *mean-std filter* for observations to accurately replicate the behavior of the expert policy. -Scaling I/O Throughput +Scaling I/O throughput ---------------------- Just as online training can be scaled, offline recording I/O throughput can also be increased by configuring the number of RLlib env-runners. Use the ``num_env_runners`` setting to scale recording during training or ``evaluation_num_env_runners`` @@ -701,7 +698,7 @@ You scale the number of learners in RLlib's :py:meth:`~ray.rllib.algorithms.algo With this configuration you start an application with 4 (remote) :py:class:`~ray.rllib.core.learner.learner.Learner`s (see :ref:`Learner (Alpha)` for more details about RLlib's learners) each of them using a single GPU. -Using Cloud Storage +Using cloud storage ------------------- Unlike RLlib's previous stack, the new Offline RL API is cloud-agnostic and fully integrates with PyArrow. You can utilize any available cloud storage path or PyArrow-compatible filesystem. If using a PyArrow or compatible filesystem, ensure that your ``input_`` path is a relative path within this filesystem. 
Similar to Ray Data, you can also use placeholders, lists of files @@ -744,7 +741,7 @@ use the following syntax: You can learn more about PyArrow's filesystems, particularly regarding cloud filesystems and required authentication, in `PyArrow Filesystem Interface `__. -Using Cloud Storage for Recording +Using cloud storage for recording ********************************* You can use cloud storage in a similar way when recording experiences from an expert policy: @@ -777,14 +774,14 @@ in denied write access, causing the recording process to stop. .. _how-to-tune-performance: -How to Tune Performance +How to tune performance ----------------------- In RLlib's Offline RL API the various key layers are managed by distinct modules and configurations, making it non-trivial to scale these layers effectively. It's important to understand the specific parameters and their respective impact on system performance. .. _how-to-tune-reading-operations: -How to Tune Reading Operations +How to tune reading operations ****************************** As noted earlier, the **Reading Operations** layer is automatically handled and dynamically optimized by :ref:`Ray Data `. It's strongly recommended to avoid modifying this process. However, there are certain parameters that can enhance performance on this layer to some extent, including: @@ -794,7 +791,7 @@ layer to some extent, including: #. Data sharding. #. Data pruning. -Available Resources +Available resources ~~~~~~~~~~~~~~~~~~~ The scheduling strategy employed by :ref:`Ray Data ` operates independently of any existing placement group, scheduling tasks and actors separately. Consequently, it's essential to reserve adequate resources for other tasks and actors within your job. To optimize :ref:`Ray Data `'s scalability for read operations and improve reading performance, consider increasing the available resources in your cluster while preserving the resource allocation for existing tasks and actors. The key resources to monitor and @@ -806,7 +803,7 @@ include: - Independent connections to the network backbone: Nodes utilize dedicated bandwidth, avoiding shared up-links and potential bottlenecks (see for ex. `here `__ for AWS and `here `__ for GCP network bandwidth documentations). - Optimized cloud access: Employing features like `S3 Transfer Acceleration `__, `Google Cloud Storage FUSE `__ , or parallel and accelerated data transfer methods to enhance performance. -Data Locality +Data locality ~~~~~~~~~~~~~ Data locality is a critical factor in achieving fast data processing. For instance, if your data resides on GCP, running a Ray cluster on AWS S3 or a local machine inevitably results in low transfer rates and slow data processing. To ensure optimal performance, storing data within the same region, same zone and cloud provider as the Ray cluster is generally sufficient to enable efficient streaming for RLlib's Offline RL API. Additional adjustments to consider include: @@ -814,14 +811,14 @@ sufficient to enable efficient streaming for RLlib's Offline RL API. Additional - Multi-Region Buckets: Use multi-region storage to improve data availability and potentially enhance access speeds for distributed systems. - Storage class optimization within buckets: Use **standard storage** for frequent access and low-latency streaming. Avoid archival storage classes like AWS Glacier or GCP Archive for streaming workloads due to high retrieval times. 
-Data Sharding +Data sharding ~~~~~~~~~~~~~ Data sharding improves the efficiency of fetching, transferring, and reading data by balancing chunk sizes. If chunks are too large, they can cause delays during transfer and processing, leading to bottlenecks. Conversely, chunks that are too small can result in high metadata fetching overhead, slowing down overall performance. Finding an optimal chunk size is critical for balancing these trade-offs and maximizing throughput. - As a rule-of-thumb keep data file sizes in between 64MiB to 256MiB. -Data Pruning +Data pruning ~~~~~~~~~~~~ If your data is in **Parquet** format (the recommended offline data format for RLlib), you can leverage data pruning to optimize performance. :ref:`Ray Data ` supports pruning in its :py:meth:`~ray.data.read_parquet` method through projection pushdown (column filtering) and filter pushdown (row filtering). These filters are applied directly during file scans, reducing the amount of unnecessary data loaded into memory. @@ -869,7 +866,7 @@ Similarly, if you only require specific rows from your dataset, you can apply pu ) ) -How to Tune Post-Processing (PreLearner) +How to tune post-processing (PreLearner) **************************************** When enabling high throughput in Read Operations, it's essential to ensure sufficient processing capacity in the Post-Processing (Pre-Learner) stage. Insufficient capacity in this stage can cause backpressure, leading to increased memory usage and, in severe cases, object spilling to disk or even Out-Of-Memory (see :ref:`Out-Of-Memory Prevention `) errors. @@ -880,7 +877,7 @@ Tuning the **Post-Processing (Pre-Learner)** layer is generally more straightfor - Allocated Resources - Read Batch and Buffer Sizes. -Actor Pool Size +Actor pool size ~~~~~~~~~~~~~~~ Internally, the **Post-Processing (PreLearner)** layer is defined by a :py:meth:`~ray.data.Dataset.map_batches` operation that starts an :py:class:`~ray.data._internal.execution.operators.actor_pool_map_operator._ActorPool`. Each actor in this pool runs an :py:class:`~ray.rllib.offline.offline_prelearner.OfflinePreLearner` instances to transform batches on their way from disk to RLlib's :py:class:`~ray.rllib.core.learner.learner.Learner`. Obviously, the size of this :py:class:`~ray.data._internal.execution.operators.actor_pool_map_operator._ActorPool` defines the throughput of this layer and needs to be fine-tuned in regard to the pervious layer's @@ -927,7 +924,7 @@ This allows :ref:`Ray Data ` to start up to ``8`` post-processing actors t optimized for streaming data, which typically exhibits stable throughput and resource usage, except in cases of imbalances between upstream and downstream tasks. As a rule of thumb, consider using autoscaling only under the following conditions: (1) throughput is expected to be highly variable, (2) Cluster resources are subject to fluctuations (for example, in shared or dynamic environments), and/or (3) workload characteristics are highly unpredictable. -Allocated Resources +Allocated resources ~~~~~~~~~~~~~~~~~~~ Other than the number of post-processing actors you can tune performance on the **Post-Processing (PreLearner)** layer through defining resources to be allocated to each :py:class:`~ray.rllib.offline.offline_prelearner.OffLinePreLearner` in the actor pool. Such resources can be defined either through ``num_cpus`` and ``num_gpus`` or in the ``ray_remote_args``. @@ -955,7 +952,7 @@ As an example, to provide each of your ``4`` :py:class:`~ray.rllib.offline.offli .. 
.. warning:: Don't override the ``batch_size`` in RLlib's ``map_batches_kwargs``. This usually leads to severe performance degradation. Note that this ``batch_size`` differs from the `train_batch_size_per_learner`: the former specifies the batch size in transformations of the streaming pipeline, while the latter defines the batch size used for training within each :py:class:`~ray.rllib.core.learner.learner.Learner` (the batch size of the actual model forward- and backward passes performed for training). -Read Batch- and Buffer Sizes +Read batch- and buffer sizes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When working with data from :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` or the legacy :py:class:`~ray.rllib.policy.sample_batch.SampleBatch` format, fine-tuning the `input_read_batch_size` parameter provides additional optimization opportunities. This parameter controls the size of batches retrieved from data files. Its effectiveness is particularly notable when handling episodic or legacy :py:class:`~ray.rllib.policy.sample_batch.SampleBatch` data because, for this data, the streaming pipeline utilizes an :py:class:`~ray.rllib.utils.replay_buffers.episode_replay_buffer.EpisodeReplayBuffer` to handle the multiple timesteps contained in each @@ -1049,7 +1046,7 @@ results in lower sampling variation because many timesteps are repeatedly sample .. tip:: To choose an adequate `input_read_batch_size`, take a look at the length of your recorded episodes. In some cases, each episode is long enough to fill the `train_batch_size_per_learner`, and you could choose an `input_read_batch_size` of ``1``. Most of the time it isn't, and you need to consider how many episodes should be buffered to balance the amount of data ingested from the read input and the variation of data sampled from the :py:class:`~ray.rllib.utils.replay_buffers.episode_replay_buffer.EpisodeReplayBuffer` instances in the :py:class:`~ray.rllib.offline.offline_prelearner.OfflinePreLearner`. -How to Tune Updating (Learner) +How to tune updating (Learner) ****************************** **Updating (Learner)** is the final downstream task in RLlib's Offline RL pipeline, and its consumption speed determines the overall throughput of the data pipeline. If the learning process is slow, it can cause backpressure in upstream layers, potentially leading to object spilling or Out-Of-Memory (OOM) errors. Therefore, it's essential to fine-tune this @@ -1064,7 +1061,7 @@ layer in coordination with the upstream components. Several parameters can be ad .. _actor-pool-size: -Actor Pool Size +Actor pool size *************** RLlib supports scaling :py:class:`~ray.rllib.core.learner.learner.Learner` instances through the parameter `num_learners`. When this value is ``0``, RLlib uses a Learner instance in the local process, whereas for values ``>0``, RLlib scales out using a :py:class:`~ray.train._internal.backend_executor.BackendExecutor`. This executor spawns your specified @@ -1086,7 +1083,7 @@ For example to set the number of learners to ``4``, you use the following syntax .. tip:: For performance optimization, choose between using a single local :py:class:`~ray.rllib.core.learner.learner.Learner` or multiple remote :py:class:`~ray.rllib.core.learner.learner.Learner` instances. If your dataset is small, scale the number of :py:class:`~ray.rllib.core.learner.learner.Learner` instances with caution, as doing so adds significant overhead and splits the data pipeline into multiple streams.
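As a rough sketch of the scaling discussed in this section, the Learner actor pool could be configured as follows; the ``BCConfig`` algorithm choice and the concrete values are illustrative placeholders only:

.. code-block:: python

    from ray.rllib.algorithms.bc import BCConfig

    config = (
        BCConfig()
        .learners(
            # Spawn four remote Learner actors instead of a single local Learner.
            num_learners=4,
            # Optionally reserve compute per Learner; see the next section.
            num_cpus_per_learner=1,
        )
    )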
-Allocated Resources +Allocated resources ~~~~~~~~~~~~~~~~~~~ Just as with the Post-Processing (Pre-Learner) layer, allocating additional resources can help address slow training issues. The primary resource to leverage is the GPU, as training involves forward and backward passes through the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`, which GPUs can accelerate significantly. If your training already utilizes GPUs and performance still remains an issue, consider scaling up by either adding more GPUs to each :py:class:`~ray.rllib.core.learner.learner.Learner` to increase GPU memory and computational capacity (set `config.learners(num_gpus_per_learner=...)`), or by adding additional :py:class:`~ray.rllib.core.learner.learner.Learner` workers to further distribute the workload (by setting `config.learners(num_learners=...)`). Additionally, ensure that data @@ -1107,7 +1104,7 @@ To provide your learners with more compute use ``num_gpus_per_learner`` or ``num .. tip:: If you experience backpressure in the **Post-Processing (Pre-Learner)** stage of your pipeline, consider enabling GPU training before scaling up the number of your :py:class:`~ray.rllib.core.learner.learner.Learner` instances. -Scheduling Strategy +Scheduling strategy ~~~~~~~~~~~~~~~~~~~ The scheduling strategy in Ray plays a key role in task and actor placement by attempting to distribute them across multiple nodes in a cluster, thereby maximizing resource utilization and fault tolerance. When running on a single-node cluster (that is, one large head node), the scheduling strategy has little to no noticeable impact. However, in a multi-node cluster, scheduling can significantly influence the performance of your Offline Data pipeline due to the importance of data locality. Data processing occurs across all nodes, and maintaining data locality during training can enhance performance. @@ -1160,7 +1157,7 @@ Here is an example of how you can change the scheduling strategy: .. warning:: Changing scheduling strategies in RLlib's Offline RL API is experimental; use with caution. -Batch Size +Batch size ~~~~~~~~~~ Batch size is one of the simplest parameters to adjust for optimizing performance in RLlib's new Offline RL API. Small batch sizes may under-utilize hardware, leading to inefficiencies, while overly large batch sizes can exceed memory limits. In a streaming pipeline, the selected batch size impacts how data is partitioned and processed across parallel workers. Larger batch sizes reduce the overhead of frequent task coordination, but if they exceed hardware constraints, they can slow down the entire pipeline. You can configure the training batch size using the `train_batch_size_per_learner` attribute as shown below. @@ -1180,7 +1177,7 @@ batch sizes reduce the overhead of frequent task coordination, but if they excee In `Ray Data `, it's common practice to use batch sizes that are powers of two. However, you are free to select any integer value for the batch size based on your needs. -Batch Prefetching +Batch prefetching ~~~~~~~~~~~~~~~~~ Batch prefetching allows you to control data consumption on the downstream side of your offline data pipeline. The primary goal is to ensure that learners remain active, maintaining a continuous flow of data. This is achieved by preparing the next batch while the learner processes the current one. Prefetching determines how many batches are kept ready for learners and should be tuned based on the time required to produce the next batch and the learner's update speed.
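To make the batch size and prefetching knobs concrete, here is a hedged sketch, assuming that ``iter_batches_kwargs`` is exposed through ``config.offline_data()`` as described in the following paragraphs and forwarded to Ray Data's ``iter_batches()``; the algorithm choice and all values are illustrative only:

.. code-block:: python

    from ray.rllib.algorithms.bc import BCConfig

    config = (
        BCConfig()
        .training(
            # Batch size each Learner uses for a single update.
            train_batch_size_per_learner=1024,
        )
        .offline_data(
            # Keep two batches ready while the Learner processes the current one.
            iter_batches_kwargs={"prefetch_batches": 2},
        )
    )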
Prefetching too many batches can lead to memory inefficiencies and, in some cases, backpressure in upstream tasks. @@ -1205,7 +1202,7 @@ You can configure batch prefetching in the `iter_batches_kwargs`: .. warning:: Don't override the ``batch_size`` in RLlib's `map_batches_kwargs`. This usually leads to severe performance degradation. Note that this ``batch_size`` differs from the `train_batch_size_per_learner`: the former specifies the batch size when iterating over the data output of the streaming pipeline, while the latter defines the batch size used for training within each :py:class:`~ray.rllib.core.learner.learner.Learner`. -Learner Iterations +Learner iterations ~~~~~~~~~~~~~~~~~~ This tuning parameter is available only when using multiple :py:class:`~ray.rllib.core.learner.learner.Learner` instances. In distributed learning, each :py:class:`~ray.rllib.core.learner.learner.Learner` instance processes a sub-stream of the offline streaming pipeline, iterating over batches from that sub-stream. You can control the number of iterations each :py:class:`~ray.rllib.core.learner.learner.Learner` instance runs per RLlib training iteration. Result reporting occurs after each RLlib training iteration. Setting this parameter too low results in inefficiencies, while excessively high values can hinder training monitoring and, in some cases - such as in RLlib's :py:class:`~ray.rllib.algorithms.marwil.marwil.MARWIL` @@ -1239,7 +1236,7 @@ Customization of the Offline RL components in RLlib, such as the :py:class:`~ray - PreLearner Level - Pipeline Level. -Connector Level +Connector level *************** Small data transformations on instances of :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` can be easily implemented by modifying the :py:class:`~ray.rllib.connectors.connector_pipeline_v2.ConnectorPipelineV2`, which is part of the :py:class:`~ray.rllib.offline.offline_prelearner.OfflinePreLearner` and prepares episodes for training. You can leverage any connector from RLlib's library (see `RLlib's default connectors `__) or create a custom connector (see `RLlib's ConnectorV2 examples `__) to integrate into the :py:class:`~ray.rllib.core.learner.learner.Learner`'s :py:class:`~ray.rllib.connectors.connector_pipeline_v2.ConnectorPipelineV2`. @@ -1296,7 +1293,7 @@ Below is the example code snippet from `RLlib's MARWIL algorithm ` (for example, :py:meth:`~ray.data.read_binary_files`) to manage the ingestion process. To ensure @@ -1693,7 +1690,7 @@ To integrate your custom :py:class:`~ray.rllib.offline.offline_prelearner.Offlin If these customization capabilities still don't meet your requirements, consider moving to the **Pipeline Level** for even greater flexibility. -Pipeline Level +Pipeline level ~~~~~~~~~~~~~~ On this level of RLlib's Offline RL API, you can redefine your complete pipeline from data reading to batch iteration by overriding the :py:class:`~ray.rllib.offline.offline_data.OfflineData` class. In most cases, however, the other two levels should be sufficient for your requirements. Manipulating the complete pipeline requires careful handling because it can significantly degrade pipeline performance. Study the :py:class:`~ray.rllib.offline.offline_data.OfflineData` class carefully to gain a good understanding of how the default pipeline works before programming your own.
There are mainly two methods that define this pipeline: diff --git a/doc/source/rllib/rllib-rlmodule.rst b/doc/source/rllib/rllib-rlmodule.rst index 5e875f3bfc929..7af433ecfd8b1 100644 --- a/doc/source/rllib/rllib-rlmodule.rst +++ b/doc/source/rllib/rllib-rlmodule.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rlmodule-guide: RL Modules ========== +.. include:: /_includes/rllib/new_api_stack.rst + The :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` class in RLlib's new API stack allows you to write custom models, including highly complex multi-network setups often found in multi-agent or model-based algorithms. diff --git a/doc/source/rllib/scaling-guide.rst b/doc/source/rllib/scaling-guide.rst index e6d75e8028f0f..e30770bd08113 100644 --- a/doc/source/rllib/scaling-guide.rst +++ b/doc/source/rllib/scaling-guide.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-scaling-guide: RLlib scaling guide =================== +.. include:: /_includes/rllib/new_api_stack.rst + RLlib is a distributed and scalable RL library, based on `Ray `__. An RLlib :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` uses `Ray actors `__ wherever parallelization of its sub-components can speed up sample and learning throughput. diff --git a/doc/source/rllib/single-agent-episode.rst b/doc/source/rllib/single-agent-episode.rst index 1ae9feb22eb74..3faa014239b6e 100644 --- a/doc/source/rllib/single-agent-episode.rst +++ b/doc/source/rllib/single-agent-episode.rst @@ -1,14 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - - .. _single-agent-episode-docs: - Episodes ======== +.. include:: /_includes/rllib/new_api_stack.rst + RLlib stores and transports all trajectory data in the form of `Episodes`, in particular :py:class:`~ray.rllib.env.single_agent_episode.SingleAgentEpisode` for single-agent setups and :py:class:`~ray.rllib.env.multi_agent_episode.MultiAgentEpisode` for multi-agent setups. diff --git a/doc/source/rllib/user-guides.rst b/doc/source/rllib/user-guides.rst index eac69b9fcd8a1..340be7ed93c6f 100644 --- a/doc/source/rllib/user-guides.rst +++ b/doc/source/rllib/user-guides.rst @@ -1,13 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-guides: -=========== User Guides =========== +.. include:: /_includes/rllib/new_api_stack.rst + .. toctree:: :hidden: @@ -25,7 +24,6 @@ User Guides rllib-dev scaling-guide - .. _rllib-feature-guide: RLlib Feature Guides