Skip to content

Commit

Permalink
fix(PocketIC): safely drop StateMachine (#3450)
Browse files Browse the repository at this point in the history
This PR safely drops every StateMachine in PocketIC to prevent its state
directory from being deleted before every Arc to its state manager is
dropped.
  • Loading branch information
mraszyk authored Jan 15, 2025
1 parent f8f274d commit 2828131
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 6 deletions.
34 changes: 32 additions & 2 deletions rs/pocket_ic_server/src/pocket_ic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,9 @@ impl SubnetsImpl {
pub(crate) fn get_all(&self) -> Vec<Arc<Subnet>> {
self.subnets.read().unwrap().values().cloned().collect()
}
fn clear(&self) {
self.subnets.write().unwrap().clear();
}
}

impl Subnets for SubnetsImpl {
Expand Down Expand Up @@ -421,8 +424,8 @@ pub struct PocketIc {

impl Drop for PocketIc {
fn drop(&mut self) {
let subnets = self.subnets.get_all();
if let Some(ref state_dir) = self.state_dir {
let subnets = self.subnets.get_all();
for subnet in &subnets {
subnet.state_machine.checkpointed_tick();
}
Expand Down Expand Up @@ -452,9 +455,36 @@ impl Drop for PocketIc {
let topology_json = serde_json::to_string(&raw_topology).unwrap();
topology_file.write_all(topology_json.as_bytes()).unwrap();
}
for subnet in subnets {
for subnet in self.subnets.get_all() {
subnet.state_machine.drop_payload_builder();
}
let state_machines: Vec<_> = self
.subnets
.get_all()
.into_iter()
.map(|subnet| subnet.state_machine.clone())
.collect();
self.subnets.clear();
// for every StateMachine, wait until nobody else has an Arc to that StateMachine
// and then drop that StateMachine
let start = std::time::Instant::now();
for state_machine in state_machines {
let mut state_machine = Some(state_machine);
while state_machine.is_some() {
match Arc::try_unwrap(state_machine.take().unwrap()) {
Ok(sm) => {
sm.drop();
break;
}
Err(sm) => {
state_machine = Some(sm);
}
}
if start.elapsed() > std::time::Duration::from_secs(5 * 60) {
panic!("Timed out while dropping PocketIC.");
}
}
}
}
}

Expand Down
14 changes: 10 additions & 4 deletions rs/state_machine_tests/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1826,20 +1826,26 @@ impl StateMachine {
fn into_components(self) -> (Box<dyn StateMachineStateDir>, u64, Time, u64) {
let state_manager = Arc::downgrade(&self.state_manager);
let result = self.into_components_inner();
let mut i = 0i32;
// StateManager is owned by an Arc, that is cloned into multiple components and different
// threads. If we return before all the asynchronous components release the Arc, we may
// end up with to StateManagers writing to the same directory, resulting in a crash.
let start = std::time::Instant::now();
while state_manager.upgrade().is_some() {
std::thread::sleep(std::time::Duration::from_millis(50));
i += 1;
if i >= 100 {
panic!("Failed to wait for StateManager drop");
if start.elapsed() > std::time::Duration::from_secs(5 * 60) {
panic!("Timed out while dropping StateMachine.");
}
}
result
}

/// Safely drops this `StateMachine`. We cannot achieve this functionality by implementing `Drop`
/// since we have to wait until there are no more `Arc`s for the state manager and
/// this is infeasible in a `Drop` implementation.
pub fn drop(self) {
let _ = self.into_components();
}

/// Emulates a node restart, including checkpoint recovery.
pub fn restart_node(self) -> Self {
// We must drop self before setup_form_dir so that we don't have two StateManagers pointing
Expand Down

0 comments on commit 2828131

Please sign in to comment.