From 54075012aa2f5deddbc1c39b9a05821ce893c7d8 Mon Sep 17 00:00:00 2001
From: Matteo Settenvini <matteo.settenvini@montecristosoftware.eu>
Date: Sun, 26 Jan 2025 02:33:25 +0100
Subject: [PATCH] feat(dso): implement unused DSO cleaner

This is the first implementation of a recursive
DSO cleaner which is reasonably fast.

The implementation is not yet covered by unit tests.

Of course, modules that are only loaded via dlopen()
cannot be detected by such a tool; handling them
will require another cleaner plugin with a
whitelist.
---
 Cargo.lock          |  10 ++
 Cargo.toml          |   3 +-
 src/cleaners.rs     |  46 ++++++--
 src/cleaners/dso.rs | 255 ++++++++++++++++++++++++++++++++++++++++++--
 src/main.rs         |   6 +-
 5 files changed, 295 insertions(+), 25 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3bd19fc..a5522c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -298,6 +298,15 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
+[[package]]
+name = "memmap2"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "miniz_oxide"
 version = "0.8.3"
@@ -539,6 +548,7 @@ dependencies = [
  "env_logger",
  "goblin",
  "log",
+ "memmap2",
  "nix",
  "petgraph",
  "tokio",
diff --git a/Cargo.toml b/Cargo.toml
index 44b67e0..7f0527e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,8 @@ clap = { version = "4.5", features = ["derive"] }
 env_logger = { version = "0.11" }
 goblin = { version = "0.9" }
 log = { version = "0.4" }
-nix = { version = "0.29" }
+memmap2 = { version = "0.9" }
+nix = { version = "0.29", features = ["fs"] }
 petgraph = { version = "0.7" }
 tokio = { version = "1", features = ["full"] }
 walkdir = { version = "2" }
diff --git a/src/cleaners.rs b/src/cleaners.rs
index 8628ebf..bd58fdc 100644
--- a/src/cleaners.rs
+++ b/src/cleaners.rs
@@ -9,7 +9,7 @@ use crate::{
 };
 use anyhow::Result;
 use async_trait::async_trait;
-use dso::Dso;
+use dso::DsoCleaner;
 use nix::libc::EXDEV;
 use std::{
     collections::HashMap,
@@ -20,7 +20,7 @@ use tokio::{
     sync::{broadcast, mpsc},
     task::JoinSet,
 };
-use walkdir::WalkDir;
+use walkdir::{DirEntry, WalkDir};
 
 #[async_trait]
 pub trait Cleaner {
@@ -38,18 +38,21 @@ pub struct Runner {
     removal_fn: RemovalFn,
 }
 
+const CHANNEL_SIZE: usize = 100;
+const CHANNEL_MAX_LOAD: usize = CHANNEL_SIZE * 3 / 4;
+
 impl Runner {
     pub fn new(args: Args) -> Self {
         let removal_fn = Self::new_removal_fn(&args);
         Self {
-            cleaners: vec![Box::new(Dso::new())],
+            cleaners: vec![Box::new(DsoCleaner::default())],
             removal_fn,
         }
     }
 
     pub async fn run(self) -> Result<()> {
-        let input_tx = broadcast::Sender::new(100);
-        let (output_tx, output_rx) = mpsc::channel(100);
+        let input_tx = broadcast::Sender::new(CHANNEL_SIZE);
+        let (output_tx, output_rx) = mpsc::channel(CHANNEL_SIZE);
         let mut tasks = JoinSet::new();
 
         // Processors
@@ -76,14 +79,13 @@ impl Runner {
     }
 
     async fn input_producer(input_tx: broadcast::Sender<PathBuf>) -> Result<()> {
-        let walker = WalkDir::new(".");
+        let walker = WalkDir::new(".").follow_links(false);
         for entry in walker {
             match entry {
-                Ok(e) if !e.file_type().is_dir() => {
-                    if input_tx.len() >= 75 {
-                        // TODO: FIXME: make this better, e.g. use backoff, this is a quick
-                        // hack
-                        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+                Ok(e) if !Self::is_dir(&e) => {
+                    if input_tx.len() >= CHANNEL_MAX_LOAD {
+                        // TODO: FIXME: make this better, this is a quick hack
+                        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
                     }
                     input_tx.send(e.into_path())?;
                 }
@@ -96,6 +98,28 @@ impl Runner {
         Ok(())
     }
 
+    /// Tells whether `entry` is a directory, resolving symlinks.
+    ///
+    /// Entries that are neither directories nor regular files are looked up
+    /// with `std::fs::metadata`; when that fails the error is logged at debug
+    /// level and the entry is reported as not being a directory.
+    fn is_dir(entry: &DirEntry) -> bool {
+        let kind = entry.file_type();
+        if kind.is_dir() || kind.is_file() {
+            return kind.is_dir();
+        }
+
+        // it is a symlink: the answer depends on
+        // what the link resolves to
+        std::fs::metadata(entry.path())
+            .map(|target| target.is_dir())
+            .unwrap_or_else(|err| {
+                let link = entry.path().display();
+                log::debug!("unable to resolve symlink {}: {}", link, err);
+                false
+            })
+    }
+
     async fn output_consumer(removal_fn: RemovalFn, mut output_rx: mpsc::Receiver<Decision>) {
         let mut to_remove = HashMap::new();
         while let Some(decision) = output_rx.recv().await {
diff --git a/src/cleaners/dso.rs b/src/cleaners/dso.rs
index 582165a..ac962d1 100644
--- a/src/cleaners/dso.rs
+++ b/src/cleaners/dso.rs
@@ -5,31 +5,87 @@ use super::Cleaner;
 use crate::decision::{Action, Decision};
 use anyhow::Result;
 use async_trait::async_trait;
-use std::path::PathBuf;
-use tokio::sync::{broadcast, mpsc};
+use goblin::elf::Elf;
+use memmap2::Mmap;
+use nix::{errno::Errno, libc::ino_t};
+use petgraph::{prelude::DiGraphMap, visit::Dfs};
+use std::{
+    collections::{HashMap, HashSet},
+    fs::File,
+    io::{ErrorKind, Read, Seek},
+    path::{Path, PathBuf},
+};
+use tokio::sync::{
+    broadcast::{self, error::RecvError},
+    mpsc,
+};
+
+type InodeMap = HashMap<ino_t, HashSet<PathBuf>>;
+type InodeGraph = DiGraphMap<ino_t, ()>;
 
 /// Cleans up unused shared libraries
 /// and warns about broken dependencies as well
-pub struct Dso {}
+#[derive(Default)]
+pub struct DsoCleaner {}
 
-impl Dso {
-    pub fn new() -> Self {
-        Self {}
-    }
+#[derive(Default)]
+struct State {
+    paths_map: InodeMap,
+    graph: InodeGraph,
 }
 
+const INODE_ANY_EXECUTABLE: ino_t = 0;
+const ELF_MAGIC_HEADER: &[u8; 4] = b"\x7fELF";
+
 #[async_trait]
-impl Cleaner for Dso {
+impl Cleaner for DsoCleaner {
     async fn run(
         &mut self,
         mut files: broadcast::Receiver<PathBuf>,
         decisions: mpsc::Sender<Decision>,
     ) -> Result<()> {
-        while let Ok(file) = files.recv().await {
-            // TODO: handle Lagged?
+        let mut state = State::default();
+
+        loop {
+            match files.recv().await {
+                Ok(file) => {
+                    if let Err(e) = Self::process_file(&mut state, &file) {
+                        log::warn!("{}: {}", file.display(), e);
+                    }
+                }
+                Err(RecvError::Closed) => break,
+                e => {
+                    e?;
+                }
+            }
+        }
+
+        // println!(
+        //     "{:?}",
+        //     petgraph::dot::Dot::with_config(&state.graph, &[petgraph::dot::Config::EdgeNoLabel])
+        // );
+
+        let mut dfs = Dfs::empty(&state.graph);
+        if state.graph.contains_node(INODE_ANY_EXECUTABLE) {
+            dfs.move_to(INODE_ANY_EXECUTABLE);
+        }
+        while let Some(_) = dfs.next(&state.graph) {}
+
+        for path in state
+            .paths_map
+            .into_iter()
+            .filter_map(|(n, paths)| {
+                if !dfs.discovered.contains(&n) {
+                    Some(paths)
+                } else {
+                    None
+                }
+            })
+            .flatten()
+        {
             decisions
                 .send(Decision {
-                    path: file,
+                    path,
                     action: Action::Remove,
                 })
                 .await?;
@@ -38,3 +94,180 @@ impl Cleaner for Dso {
         Ok(())
     }
 }
+
+impl DsoCleaner {
+    /// Inspects `path` and, when it is an ELF object, records it in `state`.
+    ///
+    /// Files too short for, or not starting with, the ELF magic are ignored.
+    /// Symlinks to libraries and regular ELF files are passed on to their
+    /// handlers; symlinks to executables are skipped because only DSOs are
+    /// cleaned up. I/O and parse errors are returned to the caller.
+    fn process_file(state: &mut State, path: &Path) -> Result<()> {
+        let mut file = File::open(path)?;
+        let mut magic = [0u8; 4];
+        match file.read_exact(&mut magic) {
+            Ok(()) => {}
+            // shorter than the magic itself: not ELF, ignore
+            Err(e) if e.kind() == ErrorKind::UnexpectedEof => return Ok(()),
+            Err(e) => anyhow::bail!(e),
+        }
+        if &magic != ELF_MAGIC_HEADER {
+            return Ok(());
+        }
+
+        file.rewind()?;
+        // SAFETY: NOTE(review): sound only while nothing truncates or rewrites
+        // the file during the scan -- presumably true for a sysroot; confirm.
+        let mapping = unsafe { Mmap::map(&file)? };
+        let elf = Elf::parse(&mapping)?;
+
+        match (path.is_symlink(), elf.is_lib) {
+            (true, true) => Self::process_elf_symlink(state, path),
+            // we don't care about symlinks to executables in our
+            // graph, as we are cleaning up only DSOs.
+            (true, false) => Ok(()),
+            (false, _) => Self::process_elf_file(state, path, &elf),
+        }
+    }
+
+    /// Records a symlink that resolves to a shared library.
+    ///
+    /// The link and its fully resolved target (as a path relative to the
+    /// current directory) are registered under their inodes, and an edge
+    /// from the link to the target is added to the graph. Links that
+    /// resolve to another device are only reported with a warning.
+    fn process_elf_symlink(state: &mut State, path: &Path) -> Result<()> {
+        let link = nix::sys::stat::lstat(path)?;
+        let target = nix::sys::stat::stat(path)?;
+        if link.st_dev != target.st_dev {
+            log::warn!(
+                "{} points outside of the sysroot filesystem, check if this is intended",
+                path.display()
+            );
+            return Ok(());
+        }
+
+        // Express the resolved target relative to the current directory.
+        let cwd = std::env::current_dir()?;
+        let resolved = std::fs::canonicalize(path)?;
+        let target_path = resolved.strip_prefix(cwd)?.to_path_buf();
+        log::trace!(
+            "dso: adding to graph symlink: '{}' to '{}'",
+            path.display(),
+            target_path.display()
+        );
+
+        // Remember which paths belong to each inode, then connect
+        // the link to the file it points to.
+        let (link_ino, target_ino) = (link.st_ino, target.st_ino);
+        let paths = &mut state.paths_map;
+        paths.entry(link_ino).or_default().insert(path.into());
+        paths.entry(target_ino).or_default().insert(target_path);
+        state.graph.add_edge(link_ino, target_ino, ());
+        Ok(())
+    }
+
+    /// Records the direct dependencies of the ELF file at `path`.
+    ///
+    /// Every needed library is looked up below the current directory in
+    /// the order used by `ld.so`: `DT_RPATH` (only without `DT_RUNPATH`),
+    /// `LD_LIBRARY_PATH`, `DT_RUNPATH`, then the standard directories. The
+    /// first hit on the same device is linked in the graph; executables all
+    /// share the `INODE_ANY_EXECUTABLE` node.
+    ///
+    /// # Errors
+    ///
+    /// Fails on I/O errors, on a non UTF-8 location and, once every other
+    /// dependency has been recorded, when some library could not be found.
+    fn process_elf_file(state: &mut State, path: &Path, elf: &Elf) -> Result<()> {
+        log::trace!("dso: adding to graph elf file '{}'", path.display());
+
+        let current_dir = std::env::current_dir()?;
+        let canonical = std::fs::canonicalize(path)?;
+        let origin = canonical
+            .parent()
+            .ok_or_else(|| anyhow::anyhow!("{} has no parent directory", path.display()))?
+            .strip_prefix(current_dir)?
+            .to_path_buf()
+            .into_os_string()
+            .into_string()
+            .map_err(|s| anyhow::anyhow!("cannot represent {:?} as a UTF-8 string", s))?;
+
+        // Dynamic tags hold colon-separated lists that may refer to $ORIGIN.
+        let expand = |entries: &[&str]| {
+            entries
+                .iter()
+                .flat_map(|entry| entry.split(':'))
+                .filter(|dir| !dir.is_empty())
+                .map(|dir| dir.replace("${ORIGIN}", &origin).replace("$ORIGIN", &origin))
+                .collect::<Vec<_>>()
+        };
+
+        let mut search_paths = vec![];
+        if elf.runpaths.is_empty() {
+            // The loader ignores DT_RPATH when DT_RUNPATH is present.
+            search_paths.extend(expand(&elf.rpaths));
+        }
+        if let Ok(env) = std::env::var("LD_LIBRARY_PATH") {
+            search_paths.extend(env.split(':').filter(|d| !d.is_empty()).map(String::from));
+        }
+        search_paths.extend(expand(&elf.runpaths));
+
+        // Standard dirs:
+        search_paths.extend(["/usr/local/lib", "/lib", "/usr/lib"].map(String::from));
+
+        let src_stat = nix::sys::stat::stat(path)?;
+        // NOTE(review): `is_lib` presumably mirrors ET_DYN, which PIE
+        // executables also use -- confirm they are not treated as libraries.
+        let src_inode = if elf.is_lib {
+            src_stat.st_ino
+        } else {
+            // We put all executables in the same node
+            INODE_ANY_EXECUTABLE
+        };
+
+        let mut missing = vec![];
+        'next_lib: for &library in elf.libraries.iter() {
+            for lib_path in search_paths.iter() {
+                // Search paths are either absolute or, after $ORIGIN
+                // expansion, already relative to the current directory.
+                let dir = Path::new(lib_path);
+                let tentative_path = dir.strip_prefix("/").unwrap_or(dir).join(library);
+                let dst = match nix::sys::stat::stat(&tentative_path) {
+                    Ok(dst) => dst,
+                    Err(Errno::ENOENT | Errno::ENOTDIR) => continue,
+                    Err(e) => anyhow::bail!(
+                        "got errno {} while accessing {}",
+                        e,
+                        tentative_path.display()
+                    ),
+                };
+
+                if src_stat.st_dev != dst.st_dev {
+                    continue; // These are not the droids you are looking for.
+                }
+
+                // A dependency found through a symlink needs the link too:
+                // keep both the link and the file it resolves to reachable.
+                let link_inode = nix::sys::stat::lstat(&tentative_path)?.st_ino;
+
+                let paths = &mut state.paths_map;
+                paths.entry(src_inode).or_default().insert(path.into());
+                paths.entry(dst.st_ino).or_default().insert(tentative_path);
+
+                state.graph.add_edge(src_inode, link_inode, ());
+                state.graph.add_edge(src_inode, dst.st_ino, ());
+                continue 'next_lib;
+            }
+
+            missing.push(library);
+        }
+
+        if !missing.is_empty() {
+            let names = missing.join(", ");
+            anyhow::bail!("{}: unable to find library {}", path.display(), names);
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 8f9b255..1d7751a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -13,10 +13,12 @@ use env_logger::Env;
 
 #[tokio::main]
 async fn main() -> Result<()> {
-    let logging_env = Env::default().filter_or("LOG_LEVEL", "warn");
+    let args = Args::try_parse()?;
+
+    let logging_env =
+        Env::default().filter_or("LOG_LEVEL", if args.dry_run { "info" } else { "warn" });
     env_logger::Builder::from_env(logging_env).init();
 
-    let args = Args::try_parse()?;
     std::env::set_current_dir(&args.sysroot_location)?;
 
     let runner = Runner::new(args);