From 4eadb0de25d27d77f6c8896d4f22db4e548bca84 Mon Sep 17 00:00:00 2001 From: "Ericson \"Fogo\" Soares" Date: Sun, 30 Jun 2024 00:59:01 -0300 Subject: [PATCH] [ENG-1809] Make core more resilient to crashes (#2574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Catching panics * Re-enable unwind on release profile * Reverting deps due to bad assertion * Comment why the deps where reverted --------- Co-authored-by: VĂ­tor Vasconcellos --- Cargo.lock | Bin 308583 -> 302174 bytes Cargo.toml | 8 +- .../heavy-lifting/src/job_system/error.rs | 3 + .../heavy-lifting/src/job_system/job.rs | 78 +++++++++++++----- .../media_processor/helpers/thumbnailer.rs | 26 ++++-- crates/task-system/src/system.rs | 2 +- crates/task-system/src/worker/runner.rs | 41 ++++++--- rust-toolchain.toml | 2 +- 8 files changed, 114 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bafb8147947572f5042bac09c25f09e0d090e5fc..3f0c76e7d7602ff1abbdf36c1bb4d0575c085fb4 100644 GIT binary patch delta 463 zcmWO2J!?}@7yw|-xmU0$sV2o}s}MV=o8EKp`2;l%B7!(HAlO0We2TQV=p@z%aVkQj z8Qw)Kv6G8dqaFn*6dfE3QYi=(6;Zo&sNiP)!Nc=wv%2x7y7XkIKT~>8>VK%21N}p+ z$NF>AYNUVn)`_&;ZpC&BU8;BWyJmxNm61ZZJTNA?^qI^uB8q}}Y816bP(pJ-t)PK$ zuB@cNu;7i(tqNSh){$f{-1&@4Fu#B@SlpFVVEqCv!^Q%tCO5Bz?n>Mb$9w2~Tz-rW z4#)3L(Tx;Ny+qs4nL^EY^);H^3GN;mg)3vY9{(+)N27866Z*qI4dY7p_l{INHItVf}PciLW}i+JN?bT#I-WpWPkH-|*o|T-(N*`+%dSfhg;_E~C3M+t~}W zd?-qcMkJw*$>9vt-*UpQw;)FBxbg-=R*B?z~UM`=vt zCi)PR=1OWElW<08!emI&lOU9dw72JLJA3U-8%{hq@%2sT_||IIbr1KRc*Zq#l#VGw zMKm#Z%RNcbk(_x@(GX{h;NE4=sOL7BM2%CKNTI3GBvD%EH^-(XTz6qUwbz#B%Y9?@ zMZH_+=9EF~h1aM23mgm9XRn0m)9zxjl@ zFznvAPG=UnuGKEjo?p49QrcKzqX{w!aBvYK0( zDJ4uKX;m!8x73s6=*!jSUV36@@4o$w6AzPz<;2QrH@T&mR;OG(jl=6eS)@j)ga?sm zsCI_SoTN08Td9MQ!312QCX@=oEEPQROqtSD2vx3ItEW!be9e~f#*s0%KPaaHL0AGIAYhB9vG(&!ZGQ5Y>t0)$1GMoo-Cs(*6?;@%rKV zyh5&Te7tEg%?UltndmHhBr0%XjY!I7>4mX2NU9u9S&NLGXXu5+jE+h!g)PUPucq#P z>W=40Q7Mu%w9d4$EJ%iVa4vF1U2xi<2^srqPC~*RLml3PeKw}eUpu{*7V_2l zrlt@%MaLyHMx}KcYSdIyH{b!2(PbqlO^j+Uf`TX{3>MnN{-D;X|yQS1S@RDl=E)2zgU@<^eR#UJgFswj?kXp<>k-!7xSwU{r zGi%ZKWHm!uohk?U*i<3Ms%!guzgw*|#^+Z%oyBsJRB_+nq8Jk<8EuuP5qeb#jk8&k zAiV|DqNmDd%TYfdA`Oikj2d9j_YO@=mMiWaJFlYU-KVSF<=ErZd*!ZMtGTA95Gr?4 z83lOy?4$(Aq~ejFbWMC_*ieY*AV-aN9v$#fP!j+u*{i-ZS#Er@p6Gq%$j<(uC#ttL z_P$JZ^zS)QJ-mHGUORFGy5IBZ#)W0=o%&$YSwJUBDRuIOW#$bwiNZ>#R0TUgHK!J! zAebpdJ;BIQurE@glg@*uZFAkzcpi!@08r)l_p8hLm;JnYM>aa0Wm{gk3koCI;kG~o zKpvMKSs=h%ummm;byRQ!$+O4dEe0cHMs#EtLTMC6S`Cr2p*0j#seW1QDt|jwZSB1^ zKG$D6RlRiHMt9gJK=5!mc}u;$*F5*eeI_}Y2Xhr8Mi`v6QYa`6USPly!v#(m8GuX^ zr!p}IVF%%vh{4L@Ua5DM*MCuMZVX1Ocl@zk{R5lpd-zyqdE^`A!1iixKT%qF&zZ9( zP$w`yN%ABIRHE<*NKbm6b+8N_Gyw#)!GkL#5|24I11j}@x3B)~M49_zwWG9IJ-07G zuc(x|L6b>9am2E0gE2Z{Og+s(8O{kDB&+0=G9tr9BuIpFltVAoQ@#DCF793M+H|?^ zlXav2gHP1Imyp{VhwDq*xwFuTY5l6@m|FwqNNwP7cA>S}?56c=lXi~cPZk0b0}}

%QI#uWjG5-0kGn zVynF{gLk1ka_`vm2iFemsJEBD-&x<>G=xz&D6S#}8w6(|2PT^{`f|M%OJNpM9~ua_iVSP34)-)$pwd-6w<- z4TNF919Aw>#!-}&xn+0=0?GjYE@F=u0ij?Z&`Z=x~P@bQk=gnEiMCde__DfM{jRjaLypxPgbm#Y1bY@P(P7@}}7=Y8bAfIyyS-S+-sdy;|??-|~9> z_SA-HI?HDHU}aZ-{=J&C>o{K?Wn;U_uRh%PRMSf2&1^LHDM)3J>yRiJm)hve0alf0 z0!27s9e8VtHrt-ZB1jnc{Q0E*{j0+JK zEexj4dZ!R12e68gVo7Sjg3XDTkO2^+^5{7NcA@{QCoe(}egEge@YH8IUQW(7uIwM1 zYJBTMBebAqnH9w;4Al>f>n4Itu2S<#wPo} z;f>G6#vllltN&5?{TwliI|iqPtyv}T(;#HzE?|{%Sijd1xOEEo&ceKz5?LUyS%e=a zsBi;hm)ZLpZ%+*Z79!QcSrLcKic&oZGapQ`^7x6sU_?<-l#Psvc*%pKo@$!~lB~5! qBk^Sq7lKhxL$kxJ3DJB0waLaHwUrxhsxK+a4>dOSFL|KR9Q!ZLxF1^p diff --git a/Cargo.toml b/Cargo.toml index 349d571ad..1cc1388c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ gix-ignore = "0.11.2" globset = "0.4.14" http = "0.2" # Update blocked by axum hyper = "0.14" # Update blocked due to API breaking changes -image = "0.25.1" +image = "0.24.9" # Update blocked due to https://github.com/image-rs/image/issues/2230 itertools = "0.13.0" lending-stream = "1.0" libc = "0.2" @@ -66,7 +66,7 @@ tracing-subscriber = "0.3.18" tracing-test = "0.2.5" uhlc = "0.6.0" # Must follow version used by specta uuid = "1.8" -webp = "0.3.0" +webp = "0.2.6" # Update blocked by image [workspace.dependencies.prisma-client-rust] git = "https://github.com/brendonovich/prisma-client-rust" @@ -101,8 +101,6 @@ libp2p-stream = { git = "https://github.com/spacedriveapp/rust-libp2p.git", rev blake3 = { git = "https://github.com/spacedriveapp/blake3.git", rev = "d3aab416c12a75c2bfabce33bcd594e428a79069" } -# Due to image crate version bump -pdfium-render = { git = "https://github.com/fogodev/pdfium-render.git", rev = "e7aa1111f441c49e857cebda15b4e51b24356aaa" } [profile.dev] # Make compilation faster on macOS @@ -143,7 +141,7 @@ incremental = false # Optimize release builds [profile.release] -panic = "abort" # Strip expensive panic clean-up logic +panic = "unwind" # Sadly we need unwind to avoid unexpected crashes on third party crates codegen-units = 1 # Compile crates one after another so the compiler can optimize better lto = true # Enables link to optimizations opt-level = "s" # Optimize for binary size diff --git a/core/crates/heavy-lifting/src/job_system/error.rs b/core/crates/heavy-lifting/src/job_system/error.rs index 98c5b8f8e..4ae944b2e 100644 --- a/core/crates/heavy-lifting/src/job_system/error.rs +++ b/core/crates/heavy-lifting/src/job_system/error.rs @@ -30,6 +30,9 @@ pub enum JobSystemError { #[error(transparent)] Report(#[from] ReportError), + + #[error("internal job panic! ")] + Panic(JobId), } impl From for rspc::Error { diff --git a/core/crates/heavy-lifting/src/job_system/job.rs b/core/crates/heavy-lifting/src/job_system/job.rs index 91b3173b1..0a3642797 100644 --- a/core/crates/heavy-lifting/src/job_system/job.rs +++ b/core/crates/heavy-lifting/src/job_system/job.rs @@ -13,6 +13,7 @@ use std::{ hash::{Hash, Hasher}, marker::PhantomData, ops::{Deref, DerefMut}, + panic::AssertUnwindSafe, path::Path, pin::pin, sync::Arc, @@ -21,7 +22,7 @@ use std::{ use async_channel as chan; use chrono::{DateTime, Utc}; -use futures::{stream, Future, StreamExt}; +use futures::{stream, Future, FutureExt, StreamExt}; use futures_concurrency::{ future::{Join, TryJoin}, stream::Merge, @@ -750,15 +751,29 @@ where trace!("Dispatching job"); - spawn(to_spawn_job::( - self.id, - self.job, - ctx.clone(), - None, - base_dispatcher, - commands_rx, - done_tx, - )); + spawn({ + let id = self.id; + let job = self.job; + let ctx = ctx.clone(); + + async move { + if AssertUnwindSafe(to_spawn_job::( + id, + job, + ctx, + None, + base_dispatcher, + commands_rx, + done_tx, + )) + .catch_unwind() + .await + .is_err() + { + error!("job panicked"); + } + } + }); JobHandle { id: self.id, @@ -791,15 +806,29 @@ where trace!("Resuming job"); - spawn(to_spawn_job::( - self.id, - self.job, - ctx.clone(), - serialized_tasks, - base_dispatcher, - commands_rx, - done_tx, - )); + spawn({ + let id = self.id; + let job = self.job; + let ctx = ctx.clone(); + + async move { + if AssertUnwindSafe(to_spawn_job::( + id, + job, + ctx, + serialized_tasks, + base_dispatcher, + commands_rx, + done_tx, + )) + .catch_unwind() + .await + .is_err() + { + error!("job panicked"); + } + } + }); JobHandle { id: self.id, @@ -855,9 +884,14 @@ async fn to_spawn_job( spawn( async move { - tx.send(job.run::(dispatcher, ctx).await) - .await - .expect("job run channel closed"); + tx.send( + AssertUnwindSafe(job.run::(dispatcher, ctx)) + .catch_unwind() + .await + .unwrap_or(Err(Error::JobSystem(JobSystemError::Panic(job_id)))), + ) + .await + .expect("job run channel closed"); } .in_current_span(), ); diff --git a/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs b/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs index 5f636f606..caa06a3fe 100644 --- a/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs +++ b/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs @@ -14,6 +14,7 @@ use sd_file_ext::extensions::{VideoExtension, ALL_VIDEO_EXTENSIONS}; use std::{ ops::Deref, + panic, path::{Path, PathBuf}, str::FromStr, time::Duration, @@ -313,9 +314,9 @@ pub async fn generate_thumbnail( } fn inner_generate_image_thumbnail( - file_path: PathBuf, + file_path: &PathBuf, ) -> Result, thumbnailer::NonCriticalThumbnailerError> { - let mut img = format_image(&file_path).map_err(|e| { + let mut img = format_image(file_path).map_err(|e| { thumbnailer::NonCriticalThumbnailerError::FormatImage(file_path.clone(), e.to_string()) })?; @@ -336,7 +337,7 @@ fn inner_generate_image_thumbnail( // this corrects the rotation/flip of the image based on the *available* exif data // not all images have exif data, so we don't error. we also don't rotate HEIF as that's against the spec - if let Some(orientation) = Orientation::from_path(&file_path) { + if let Some(orientation) = Orientation::from_path(file_path) { if ConvertibleExtension::try_from(file_path.as_ref()) .expect("we already checked if the image was convertible") .should_rotate() @@ -347,7 +348,10 @@ fn inner_generate_image_thumbnail( // Create the WebP encoder for the above image let encoder = Encoder::from_image(&img).map_err(|reason| { - thumbnailer::NonCriticalThumbnailerError::WebPEncoding(file_path, reason.to_string()) + thumbnailer::NonCriticalThumbnailerError::WebPEncoding( + file_path.clone(), + reason.to_string(), + ) })?; // Type `WebPMemory` is !Send, which makes the `Future` in this function `!Send`, @@ -378,7 +382,19 @@ async fn generate_image_thumbnail( move || { // Handling error on receiver side - let _ = tx.send(inner_generate_image_thumbnail(file_path)); + + let _ = tx.send( + panic::catch_unwind(|| inner_generate_image_thumbnail(&file_path)).unwrap_or_else( + move |_| { + Err( + thumbnailer::NonCriticalThumbnailerError::PanicWhileGeneratingThumbnail( + file_path, + "Internal panic on third party crate".to_string(), + ), + ) + }, + ), + ); } }); diff --git a/crates/task-system/src/system.rs b/crates/task-system/src/system.rs index 904b0a5eb..f64887bc3 100644 --- a/crates/task-system/src/system.rs +++ b/crates/task-system/src/system.rs @@ -44,7 +44,7 @@ impl System { let workers_count = usize::max( std::thread::available_parallelism().map_or_else( |e| { - error!("Failed to get available parallelism in the job system: {e:#?}"); + error!(?e, "Failed to get available parallelism in the job system"); 1 }, NonZeroUsize::get, diff --git a/crates/task-system/src/worker/runner.rs b/crates/task-system/src/worker/runner.rs index d99981558..e9ab8da9e 100644 --- a/crates/task-system/src/worker/runner.rs +++ b/crates/task-system/src/worker/runner.rs @@ -1,6 +1,8 @@ use std::{ + any::Any, collections::{HashMap, VecDeque}, future::pending, + panic::AssertUnwindSafe, pin::pin, sync::{ atomic::{AtomicBool, Ordering}, @@ -53,7 +55,7 @@ struct AbortAndSuspendSignalers { struct RunningTask { id: TaskId, kind: PendingTaskKind, - handle: JoinHandle<()>, + handle: JoinHandle>>, } enum WaitingSuspendedTask { @@ -150,7 +152,7 @@ impl Runner { &mut self, task_id: TaskId, task_work_state: TaskWorkState, - ) -> JoinHandle<()> { + ) -> JoinHandle>> { let (abort_tx, abort_rx) = oneshot::channel(); let (suspend_tx, suspend_rx) = oneshot::channel(); @@ -163,13 +165,16 @@ impl Runner { ); let handle = spawn( - run_single_task( - task_work_state, - self.task_output_tx.clone(), - suspend_rx, - abort_rx, + AssertUnwindSafe( + run_single_task( + task_work_state, + self.task_output_tx.clone(), + suspend_rx, + abort_rx, + ) + .in_current_span(), ) - .in_current_span(), + .catch_unwind(), ); trace!("Task runner spawned"); @@ -624,8 +629,14 @@ impl Runner { } } - if let Err(e) = handle.await { - error!(%task_id, ?e, "Task failed to join"); + match handle.await { + Ok(Ok(())) => { /* Everything is Awesome! */ } + Ok(Err(_)) => { + error!(%task_id, "Task panicked"); + } + Err(e) => { + error!(%task_id, ?e, "Task failed to join"); + } } stolen_task_tx.close(); @@ -806,8 +817,14 @@ impl Runner { assert_eq!(*finished_task_id, old_task_id, "Task output id mismatch"); // Sanity check - if let Err(e) = handle.await { - error!(?e, "Task failed to join"); + match handle.await { + Ok(Ok(())) => { /* Everything is Awesome! */ } + Ok(Err(_)) => { + error!("Task panicked"); + } + Err(e) => { + error!(?e, "Task failed to join"); + } } if let Some((next_task_kind, task_work_state)) = self.get_next_task() { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 744175d52..c6e4d7d50 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "1.78" +channel = "1.79"