From 54075012aa2f5deddbc1c39b9a05821ce893c7d8 Mon Sep 17 00:00:00 2001 From: Matteo Settenvini Date: Sun, 26 Jan 2025 02:33:25 +0100 Subject: [PATCH] feat(dso): implement unused DSO cleaner This is the first implementation of a recursive DSO cleaner that is reasonably fast. The implementation does not have unit tests yet. Of course, modules loaded via dlopen() will always escape such a tool; handling them will require a separate cleaner plugin with a whitelist. --- Cargo.lock | 10 ++ Cargo.toml | 3 +- src/cleaners.rs | 46 ++++++-- src/cleaners/dso.rs | 255 ++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 6 +- 5 files changed, 295 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3bd19fc..a5522c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -298,6 +298,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "miniz_oxide" version = "0.8.3" @@ -539,6 +548,7 @@ dependencies = [ "env_logger", "goblin", "log", + "memmap2", "nix", "petgraph", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 44b67e0..7f0527e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,8 @@ clap = { version = "4.5", features = ["derive"] } env_logger = { version = "0.11" } goblin = { version = "0.9" } log = { version = "0.4" } -nix = { version = "0.29" } +memmap2 = { version = "0.9" } +nix = { version = "0.29", features = ["fs"] } petgraph = { version = "0.7" } tokio = { version = "1", features = ["full"] } walkdir = { version = "2" } diff --git a/src/cleaners.rs b/src/cleaners.rs index 8628ebf..bd58fdc 100644 --- a/src/cleaners.rs +++ b/src/cleaners.rs @@ -9,7 +9,7 @@ use crate::{ }; use anyhow::Result; use
async_trait::async_trait; -use dso::Dso; +use dso::DsoCleaner; use nix::libc::EXDEV; use std::{ collections::HashMap, @@ -20,7 +20,7 @@ use tokio::{ sync::{broadcast, mpsc}, task::JoinSet, }; -use walkdir::WalkDir; +use walkdir::{DirEntry, WalkDir}; #[async_trait] pub trait Cleaner { @@ -38,18 +38,21 @@ pub struct Runner { removal_fn: RemovalFn, } +const CHANNEL_SIZE: usize = 100; +const CHANNEL_MAX_LOAD: usize = CHANNEL_SIZE * 3 / 4; + impl Runner { pub fn new(args: Args) -> Self { let removal_fn = Self::new_removal_fn(&args); Self { - cleaners: vec![Box::new(Dso::new())], + cleaners: vec![Box::new(DsoCleaner::default())], removal_fn, } } pub async fn run(self) -> Result<()> { - let input_tx = broadcast::Sender::new(100); - let (output_tx, output_rx) = mpsc::channel(100); + let input_tx = broadcast::Sender::new(CHANNEL_SIZE); + let (output_tx, output_rx) = mpsc::channel(CHANNEL_SIZE); let mut tasks = JoinSet::new(); // Processors @@ -76,14 +79,13 @@ impl Runner { } async fn input_producer(input_tx: broadcast::Sender) -> Result<()> { - let walker = WalkDir::new("."); + let walker = WalkDir::new(".").follow_links(false); for entry in walker { match entry { - Ok(e) if !e.file_type().is_dir() => { - if input_tx.len() >= 75 { - // TODO: FIXME: make this better, e.g. 
use backoff, this is a quick - // hack - tokio::time::sleep(std::time::Duration::from_millis(100)).await; + Ok(e) if !Self::is_dir(&e) => { + if input_tx.len() >= CHANNEL_MAX_LOAD { + // TODO: FIXME: make this better, this is a quick hack + tokio::time::sleep(std::time::Duration::from_millis(50)).await; } input_tx.send(e.into_path())?; } @@ -96,6 +98,28 @@ impl Runner { Ok(()) } + fn is_dir(entry: &DirEntry) -> bool { + let ty = entry.file_type(); + if ty.is_dir() { + true + } else if ty.is_file() { + false + } else { + // it is a symlink + match std::fs::metadata(entry.path()) { + Ok(metadata) => metadata.is_dir(), + Err(e) => { + log::debug!( + "unable to resolve symlink {}: {}", + entry.path().display(), + e + ); + false + } + } + } + } + async fn output_consumer(removal_fn: RemovalFn, mut output_rx: mpsc::Receiver) { let mut to_remove = HashMap::new(); while let Some(decision) = output_rx.recv().await { diff --git a/src/cleaners/dso.rs b/src/cleaners/dso.rs index 582165a..ac962d1 100644 --- a/src/cleaners/dso.rs +++ b/src/cleaners/dso.rs @@ -5,31 +5,87 @@ use super::Cleaner; use crate::decision::{Action, Decision}; use anyhow::Result; use async_trait::async_trait; -use std::path::PathBuf; -use tokio::sync::{broadcast, mpsc}; +use goblin::elf::Elf; +use memmap2::Mmap; +use nix::{errno::Errno, libc::ino_t}; +use petgraph::{prelude::DiGraphMap, visit::Dfs}; +use std::{ + collections::{HashMap, HashSet}, + fs::File, + io::{ErrorKind, Read, Seek}, + path::{Path, PathBuf}, +}; +use tokio::sync::{ + broadcast::{self, error::RecvError}, + mpsc, +}; + +type InodeMap = HashMap>; +type InodeGraph = DiGraphMap; /// Cleans up unused shared libraries /// and warns about broken dependencies as well -pub struct Dso {} +#[derive(Default)] +pub struct DsoCleaner {} -impl Dso { - pub fn new() -> Self { - Self {} - } +#[derive(Default)] +struct State { + paths_map: InodeMap, + graph: InodeGraph, } +const INODE_ANY_EXECUTABLE: ino_t = 0; +const ELF_MAGIC_HEADER: &[u8; 4] = 
b"\x7fELF"; + #[async_trait] -impl Cleaner for Dso { +impl Cleaner for DsoCleaner { async fn run( &mut self, mut files: broadcast::Receiver, decisions: mpsc::Sender, ) -> Result<()> { - while let Ok(file) = files.recv().await { - // TODO: handle Lagged? + let mut state = State::default(); + + loop { + match files.recv().await { + Ok(file) => { + if let Err(e) = Self::process_file(&mut state, &file) { + log::warn!("{}: {}", file.display(), e); + } + } + Err(RecvError::Closed) => break, + e => { + e?; + } + } + } + + // println!( + // "{:?}", + // petgraph::dot::Dot::with_config(&state.graph, &[petgraph::dot::Config::EdgeNoLabel]) + // ); + + let mut dfs = Dfs::empty(&state.graph); + if state.graph.contains_node(INODE_ANY_EXECUTABLE) { + dfs.move_to(INODE_ANY_EXECUTABLE); + } + while let Some(_) = dfs.next(&state.graph) {} + + for path in state + .paths_map + .into_iter() + .filter_map(|(n, paths)| { + if !dfs.discovered.contains(&n) { + Some(paths) + } else { + None + } + }) + .flatten() + { decisions .send(Decision { - path: file, + path, action: Action::Remove, }) .await?; @@ -38,3 +94,180 @@ impl Cleaner for Dso { Ok(()) } } + +impl DsoCleaner { + fn process_file(state: &mut State, path: &Path) -> Result<()> { + let mut f = File::open(path)?; + let mut hdr = [0u8; 4]; + if let Err(e) = f.read_exact(&mut hdr) { + if e.kind() != ErrorKind::UnexpectedEof { + anyhow::bail!(e) + } + + return Ok(()); // not ELF, ignore + }; + + let is_elf = &hdr == ELF_MAGIC_HEADER; + if !is_elf { + return Ok(()); + } + + f.rewind()?; + let mmap = unsafe { Mmap::map(&f)? }; + let elf = Elf::parse(&mmap)?; + + if path.is_symlink() { + if !elf.is_lib { + // we don't care about symlinks to + // executables in our graph, as we + // are cleaning up only DSOs. 
+ Ok(()) + } else { + Self::process_elf_symlink(state, path) + } + } else { + Self::process_elf_file(state, path, &elf) + } + } + + fn process_elf_symlink(state: &mut State, path: &Path) -> Result<()> { + let src = nix::sys::stat::lstat(path)?; + let dst = nix::sys::stat::stat(path)?; + if src.st_dev != dst.st_dev { + log::warn!( + "{} points outside of the sysroot filesystem, check if this is intended", + path.display() + ); + return Ok(()); + } + let current_dir = std::env::current_dir()?; + let dst_path = std::fs::canonicalize(path)? + .strip_prefix(current_dir)? + .to_path_buf(); + + log::trace!( + "dso: adding to graph symlink: '{}' to '{}'", + path.display(), + dst_path.display() + ); + + state + .paths_map + .entry(src.st_ino) + .or_default() + .insert(path.into()); + + state + .paths_map + .entry(dst.st_ino) + .or_default() + .insert(dst_path); + + state.graph.add_edge(src.st_ino, dst.st_ino, ()); + Ok(()) + } + + fn process_elf_file(state: &mut State, path: &Path, elf: &Elf) -> Result<()> { + log::trace!("dso: adding to graph elf file '{}'", path.display()); + + let current_dir = std::env::current_dir()?; + let origin = std::fs::canonicalize(path)? + .parent() + .unwrap() + .strip_prefix(current_dir)? 
+ .to_path_buf() + .into_os_string() + .into_string() + .map_err(|s| anyhow::anyhow!("cannot represent {:?} as a UTF-8 string", s))?; + + let mut search_paths = vec![]; + + if elf.rpaths != vec![""] { + if elf.runpaths != vec![""] { + let mut rpaths = elf + .rpaths + .iter() + .map(|p| p.replace("$ORIGIN", &origin)) + .collect::>(); + search_paths.append(&mut rpaths); + } + + let ld_config_path = std::env::var("LD_LIBRARY_PATH"); + let mut env_paths = ld_config_path + .as_ref() + .map(|env| { + env.split(':') + .filter_map(|dir| { + if dir.is_empty() { + None + } else { + Some(dir.to_string()) + } + }) + .collect::>() + }) + .unwrap_or_default(); + search_paths.append(&mut env_paths); + } + + if elf.runpaths != vec![""] { + let mut runpaths = elf + .runpaths + .iter() + .map(|p| p.replace("$ORIGIN", &origin)) + .collect::>(); + search_paths.append(&mut runpaths); + } + + // Standard dirs: + search_paths.push("/usr/local/lib".into()); + search_paths.push("/lib".into()); + search_paths.push("/usr/lib".into()); + + let src_stat = nix::sys::stat::stat(path)?; + let src_inode = if elf.is_lib { + src_stat.st_ino + } else { + // We put all executables in the same node + INODE_ANY_EXECUTABLE + }; + + 'next_lib: for &library in elf.libraries.iter() { + for lib_path in search_paths.iter() { + let tentative_path = PathBuf::from(lib_path).strip_prefix("/")?.join(library); + let dst = match nix::sys::stat::stat(&tentative_path) { + Ok(dst) => dst, + Err(Errno::ENOENT) => continue, + Err(e) => anyhow::bail!( + "got errno {} while accessing {}", + e, + tentative_path.display() + ), + }; + + if src_stat.st_dev != dst.st_dev { + continue; // These are not the droids you are looking for. 
+ } + + state + .paths_map + .entry(src_inode) + .or_default() + .insert(path.into()); + + state + .paths_map + .entry(dst.st_ino) + .or_default() + .insert(tentative_path); + + state.graph.add_edge(src_inode, dst.st_ino, ()); + continue 'next_lib; + } + + anyhow::bail!("{}: unable to find library {}", path.display(), library); + } + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index 8f9b255..1d7751a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,10 +13,12 @@ use env_logger::Env; #[tokio::main] async fn main() -> Result<()> { - let logging_env = Env::default().filter_or("LOG_LEVEL", "warn"); + let args = Args::try_parse()?; + + let logging_env = + Env::default().filter_or("LOG_LEVEL", if args.dry_run { "info" } else { "warn" }); env_logger::Builder::from_env(logging_env).init(); - let args = Args::try_parse()?; std::env::set_current_dir(&args.sysroot_location)?; let runner = Runner::new(args);