feat(dso): implement unused DSO cleaner

This is the first implementation of a recursive
DSO cleaner which is reasonably fast.

The implementation does not have unit tests yet.

Of course, modules that are only loaded via dlopen() will always
escape such a tool; handling them will require
another cleaner plugin driven by a
whitelist.
This commit is contained in:
Matteo Settenvini 2025-01-26 02:33:25 +01:00
parent 5507a1dd21
commit 54075012aa
Signed by: matteo
GPG Key ID: 1C1B12600D81DE05
5 changed files with 295 additions and 25 deletions

10
Cargo.lock generated
View File

@ -298,6 +298,15 @@ version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memmap2"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
dependencies = [
"libc",
]
[[package]]
name = "miniz_oxide"
version = "0.8.3"
@ -539,6 +548,7 @@ dependencies = [
"env_logger",
"goblin",
"log",
"memmap2",
"nix",
"petgraph",
"tokio",

View File

@ -16,7 +16,8 @@ clap = { version = "4.5", features = ["derive"] }
env_logger = { version = "0.11" }
goblin = { version = "0.9" }
log = { version = "0.4" }
nix = { version = "0.29" }
memmap2 = { version = "0.9" }
nix = { version = "0.29", features = ["fs"] }
petgraph = { version = "0.7" }
tokio = { version = "1", features = ["full"] }
walkdir = { version = "2" }

View File

@ -9,7 +9,7 @@ use crate::{
};
use anyhow::Result;
use async_trait::async_trait;
use dso::Dso;
use dso::DsoCleaner;
use nix::libc::EXDEV;
use std::{
collections::HashMap,
@ -20,7 +20,7 @@ use tokio::{
sync::{broadcast, mpsc},
task::JoinSet,
};
use walkdir::WalkDir;
use walkdir::{DirEntry, WalkDir};
#[async_trait]
pub trait Cleaner {
@ -38,18 +38,21 @@ pub struct Runner {
removal_fn: RemovalFn,
}
/// Capacity used for both the broadcast channel that carries input paths and
/// the mpsc channel that carries decisions.
const CHANNEL_SIZE: usize = 100;
/// Queue length (three quarters of `CHANNEL_SIZE`) at or above which the input
/// producer sleeps briefly before sending the next path.
const CHANNEL_MAX_LOAD: usize = CHANNEL_SIZE * 3 / 4;
impl Runner {
pub fn new(args: Args) -> Self {
let removal_fn = Self::new_removal_fn(&args);
Self {
cleaners: vec![Box::new(Dso::new())],
cleaners: vec![Box::new(DsoCleaner::default())],
removal_fn,
}
}
pub async fn run(self) -> Result<()> {
let input_tx = broadcast::Sender::new(100);
let (output_tx, output_rx) = mpsc::channel(100);
let input_tx = broadcast::Sender::new(CHANNEL_SIZE);
let (output_tx, output_rx) = mpsc::channel(CHANNEL_SIZE);
let mut tasks = JoinSet::new();
// Processors
@ -76,14 +79,13 @@ impl Runner {
}
async fn input_producer(input_tx: broadcast::Sender<PathBuf>) -> Result<()> {
let walker = WalkDir::new(".");
let walker = WalkDir::new(".").follow_links(false);
for entry in walker {
match entry {
Ok(e) if !e.file_type().is_dir() => {
if input_tx.len() >= 75 {
// TODO: FIXME: make this better, e.g. use backoff, this is a quick
// hack
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
Ok(e) if !Self::is_dir(&e) => {
if input_tx.len() >= CHANNEL_MAX_LOAD {
// TODO: FIXME: make this better, this is a quick hack
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}
input_tx.send(e.into_path())?;
}
@ -96,6 +98,28 @@ impl Runner {
Ok(())
}
/// Tells whether a walked entry is a directory, looking through symlinks.
///
/// Plain directories and plain files are answered straight from the entry's
/// own file type. Anything else is resolved with `std::fs::metadata`, which
/// follows symlinks; if that lookup fails the entry is reported as "not a
/// directory" and the failure is logged at debug level.
fn is_dir(entry: &DirEntry) -> bool {
    let kind = entry.file_type();
    if kind.is_dir() || kind.is_file() {
        // The entry's own type already answers the question.
        return kind.is_dir();
    }

    // Neither a plain file nor a directory: treated as a symlink and resolved.
    std::fs::metadata(entry.path())
        .map(|target| target.is_dir())
        .unwrap_or_else(|e| {
            log::debug!(
                "unable to resolve symlink {}: {}",
                entry.path().display(),
                e
            );
            false
        })
}
async fn output_consumer(removal_fn: RemovalFn, mut output_rx: mpsc::Receiver<Decision>) {
let mut to_remove = HashMap::new();
while let Some(decision) = output_rx.recv().await {

View File

@ -5,31 +5,87 @@ use super::Cleaner;
use crate::decision::{Action, Decision};
use anyhow::Result;
use async_trait::async_trait;
use std::path::PathBuf;
use tokio::sync::{broadcast, mpsc};
use goblin::elf::Elf;
use memmap2::Mmap;
use nix::{errno::Errno, libc::ino_t};
use petgraph::{prelude::DiGraphMap, visit::Dfs};
use std::{
collections::{HashMap, HashSet},
fs::File,
io::{ErrorKind, Read, Seek},
path::{Path, PathBuf},
};
use tokio::sync::{
broadcast::{self, error::RecvError},
mpsc,
};
/// Maps an inode number to the set of paths recorded for that inode.
type InodeMap = HashMap<ino_t, HashSet<PathBuf>>;
/// Directed graph whose nodes are inode numbers; edges carry no payload.
type InodeGraph = DiGraphMap<ino_t, ()>;
/// Cleans up unused shared libraries
/// and warns about broken dependencies as well
pub struct Dso {}
#[derive(Default)]
pub struct DsoCleaner {}
impl Dso {
pub fn new() -> Self {
Self {}
}
/// Bookkeeping accumulated while the incoming files are processed.
#[derive(Default)]
struct State {
/// Paths recorded for each inode that takes part in the graph.
paths_map: InodeMap,
/// Edges run from a symlink to its target, and from an ELF file to each
/// library it was found to link against.
graph: InodeGraph,
}
/// Graph node shared by every ELF file that is not a library; the
/// reachability search over the graph starts from this node.
const INODE_ANY_EXECUTABLE: ino_t = 0;
/// The four bytes a file must start with to be treated as ELF.
const ELF_MAGIC_HEADER: &[u8; 4] = b"\x7fELF";
#[async_trait]
impl Cleaner for Dso {
impl Cleaner for DsoCleaner {
async fn run(
&mut self,
mut files: broadcast::Receiver<PathBuf>,
decisions: mpsc::Sender<Decision>,
) -> Result<()> {
while let Ok(file) = files.recv().await {
// TODO: handle Lagged?
let mut state = State::default();
loop {
match files.recv().await {
Ok(file) => {
if let Err(e) = Self::process_file(&mut state, &file) {
log::warn!("{}: {}", file.display(), e);
}
}
Err(RecvError::Closed) => break,
e => {
e?;
}
}
}
// println!(
// "{:?}",
// petgraph::dot::Dot::with_config(&state.graph, &[petgraph::dot::Config::EdgeNoLabel])
// );
let mut dfs = Dfs::empty(&state.graph);
if state.graph.contains_node(INODE_ANY_EXECUTABLE) {
dfs.move_to(INODE_ANY_EXECUTABLE);
}
while let Some(_) = dfs.next(&state.graph) {}
for path in state
.paths_map
.into_iter()
.filter_map(|(n, paths)| {
if !dfs.discovered.contains(&n) {
Some(paths)
} else {
None
}
})
.flatten()
{
decisions
.send(Decision {
path: file,
path,
action: Action::Remove,
})
.await?;
@ -38,3 +94,180 @@ impl Cleaner for Dso {
Ok(())
}
}
impl DsoCleaner {
/// Inspects one path and, when it is an ELF file, records it in `state`.
///
/// Files that are shorter than the ELF magic number, or that do not start
/// with it, are silently ignored. Symlinks are only recorded when they
/// resolve to a shared library; every other ELF file is handed to
/// `process_elf_file`.
///
/// # Errors
///
/// Fails when the file cannot be opened, read, mapped or parsed, or when the
/// delegated processing step fails.
fn process_file(state: &mut State, path: &Path) -> Result<()> {
    let mut file = File::open(path)?;

    // Sniff the magic number first so non-ELF files are never mapped or parsed.
    let mut magic = [0u8; 4];
    match file.read_exact(&mut magic) {
        Ok(()) => {}
        // Too short to hold the magic number: not ELF, ignore.
        Err(err) if err.kind() == ErrorKind::UnexpectedEof => return Ok(()),
        Err(err) => anyhow::bail!(err),
    }
    if &magic != ELF_MAGIC_HEADER {
        return Ok(());
    }

    file.rewind()?;
    // NOTE(review): the mapping is only sound while nobody truncates or
    // rewrites the file underneath us — confirm this holds for the sysroot.
    let contents = unsafe { Mmap::map(&file)? };
    let elf = Elf::parse(&contents)?;

    if !path.is_symlink() {
        return Self::process_elf_file(state, path, &elf);
    }

    if elf.is_lib {
        Self::process_elf_symlink(state, path)
    } else {
        // Symlinks to executables are irrelevant to the graph: only DSOs
        // are being cleaned up.
        Ok(())
    }
}
/// Records a symlink that resolves to a shared library.
///
/// Both the link and its target are stored in the inode-to-paths map, and an
/// edge from the link's inode to the target's inode is added to the graph.
/// A link whose target lives on a different device is only reported with a
/// warning and otherwise skipped.
///
/// # Errors
///
/// Fails when the link or its target cannot be stat'ed, when the current
/// directory cannot be determined, or when the resolved target does not lie
/// below the current directory.
fn process_elf_symlink(state: &mut State, path: &Path) -> Result<()> {
    let link = nix::sys::stat::lstat(path)?;
    let target = nix::sys::stat::stat(path)?;

    if link.st_dev != target.st_dev {
        log::warn!(
            "{} points outside of the sysroot filesystem, check if this is intended",
            path.display()
        );
        return Ok(());
    }

    // Express the resolved target relative to the current directory, like
    // every other path that is stored in the map.
    let base_dir = std::env::current_dir()?;
    let resolved = std::fs::canonicalize(path)?;
    let target_path = resolved.strip_prefix(base_dir)?.to_path_buf();

    log::trace!(
        "dso: adding to graph symlink: '{}' to '{}'",
        path.display(),
        target_path.display()
    );

    for (inode, recorded) in [
        (link.st_ino, path.to_path_buf()),
        (target.st_ino, target_path),
    ] {
        state.paths_map.entry(inode).or_default().insert(recorded);
    }
    state.graph.add_edge(link.st_ino, target.st_ino, ());

    Ok(())
}
/// Records an ELF file (not a symlink) and the libraries it links against.
///
/// For every needed library the search directories are tried in the order
/// used by the dynamic linker: `DT_RPATH` (only when the file has no
/// `DT_RUNPATH`), `LD_LIBRARY_PATH`, `DT_RUNPATH`, then the standard
/// directories. All directories are interpreted relative to the current
/// directory, so an absolute `/usr/lib` is looked up as `usr/lib`. The first
/// candidate that exists on the same device as `path` is recorded in the
/// inode-to-paths map and linked in the graph.
///
/// Shared libraries are keyed by their own inode; every other ELF file
/// shares the `INODE_ANY_EXECUTABLE` node.
///
/// NOTE(review): a library only enters `paths_map` here once one of its own
/// dependencies has been resolved (or when something else links to it), so an
/// unreferenced library without any needed libraries never becomes a removal
/// candidate — confirm whether that is intended.
///
/// # Errors
///
/// Fails when `path` cannot be canonicalized or stat'ed, when its directory
/// is not below the current directory or is not valid UTF-8, when a
/// candidate cannot be stat'ed for a reason other than "does not exist", or
/// when a needed library is not found in any search directory.
fn process_elf_file(state: &mut State, path: &Path, elf: &Elf) -> Result<()> {
    /// Splits colon-separated `DT_RPATH` / `DT_RUNPATH` values into single
    /// directories, drops empty ones and expands the `$ORIGIN` token.
    fn expand_entries(entries: &[&str], origin: &str) -> Vec<String> {
        entries
            .iter()
            .flat_map(|entry| entry.split(':'))
            .filter(|dir| !dir.is_empty())
            .map(|dir| dir.replace("${ORIGIN}", origin).replace("$ORIGIN", origin))
            .collect()
    }

    log::trace!("dso: adding to graph elf file '{}'", path.display());

    let current_dir = std::env::current_dir()?;
    // Directory holding the file, relative to the current directory.
    let origin = std::fs::canonicalize(path)?
        .parent()
        .ok_or_else(|| anyhow::anyhow!("{}: has no parent directory", path.display()))?
        .strip_prefix(current_dir)?
        .to_path_buf()
        .into_os_string()
        .into_string()
        .map_err(|s| anyhow::anyhow!("cannot represent {:?} as a UTF-8 string", s))?;

    let rpaths = expand_entries(&elf.rpaths, &origin);
    let runpaths = expand_entries(&elf.runpaths, &origin);

    let mut search_paths: Vec<String> = Vec::new();
    // DT_RPATH is only honoured when the file carries no DT_RUNPATH.
    if runpaths.is_empty() {
        search_paths.extend(rpaths);
    }
    // LD_LIBRARY_PATH applies regardless of which dynamic tags are present.
    if let Ok(env) = std::env::var("LD_LIBRARY_PATH") {
        search_paths.extend(
            env.split(':')
                .filter(|dir| !dir.is_empty())
                .map(str::to_string),
        );
    }
    search_paths.extend(runpaths);

    // Standard dirs:
    search_paths.extend(["/usr/local/lib", "/lib", "/usr/lib"].map(String::from));

    let src_stat = nix::sys::stat::stat(path)?;
    let src_inode = if elf.is_lib {
        src_stat.st_ino
    } else {
        // We put all executables in the same node
        INODE_ANY_EXECUTABLE
    };

    'next_lib: for &library in elf.libraries.iter() {
        for lib_path in search_paths.iter() {
            // `$ORIGIN` expands to a directory that is already relative to
            // the current directory, so only strip a leading "/" when there
            // is one instead of failing on relative entries.
            let lib_dir = Path::new(lib_path);
            let tentative_path = lib_dir.strip_prefix("/").unwrap_or(lib_dir).join(library);

            let dst = match nix::sys::stat::stat(&tentative_path) {
                Ok(dst) => dst,
                // Missing file, or a search directory that is not a
                // directory at all: try the next one.
                Err(Errno::ENOENT | Errno::ENOTDIR) => continue,
                Err(e) => anyhow::bail!(
                    "got errno {} while accessing {}",
                    e,
                    tentative_path.display()
                ),
            };

            if src_stat.st_dev != dst.st_dev {
                continue; // These are not the droids you are looking for.
            }

            state
                .paths_map
                .entry(src_inode)
                .or_default()
                .insert(path.into());
            state
                .paths_map
                .entry(dst.st_ino)
                .or_default()
                .insert(tentative_path);
            state.graph.add_edge(src_inode, dst.st_ino, ());
            continue 'next_lib;
        }

        anyhow::bail!("{}: unable to find library {}", path.display(), library);
    }

    Ok(())
}
}

View File

@ -13,10 +13,12 @@ use env_logger::Env;
#[tokio::main]
async fn main() -> Result<()> {
let logging_env = Env::default().filter_or("LOG_LEVEL", "warn");
let args = Args::try_parse()?;
let logging_env =
Env::default().filter_or("LOG_LEVEL", if args.dry_run { "info" } else { "warn" });
env_logger::Builder::from_env(logging_env).init();
let args = Args::try_parse()?;
std::env::set_current_dir(&args.sysroot_location)?;
let runner = Runner::new(args);