summary history branches tags files
src/main.rs
use crates_io_api::{SyncClient, Crate, Error, Sort, ListOptions};
use serde::{Deserialize};
use serde_json::{Deserializer};
use std::process::{Command, Stdio};
use std::io::Write;

#[derive(Deserialize, Debug)]
struct BuildMetadataTarget {
    kind: Vec<String>,
    crate_types: Vec<String>,
    name: String,
}

#[derive(Deserialize, Debug)]
struct BuildMetadata {
    reason: String,
    package_id: String,
    target: Option<BuildMetadataTarget>,
    filenames: Option<Vec<String>>,
    executable: Option<String>
}

#[derive(Debug, PartialEq)]
enum ArtifactType {
    Binary,
    RustLibrary,
}

#[derive(Debug)]
struct Artifact {
    filename: String,
    kind: ArtifactType,
    size: u64,
}

#[derive(Debug)]
struct CrateResult {
    deps: usize,
    artifacts: Vec<Artifact>,
}
impl std::fmt::Display for CrateResult {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let libs: Vec<&Artifact> = self.artifacts.iter().filter(|a| a.kind == ArtifactType::RustLibrary).collect();
        let bins: Vec<&Artifact> = self.artifacts.iter().filter(|a| a.kind == ArtifactType::Binary).collect();
        let lib = libs.first().map(|a| a.size).unwrap_or(0);
        let bin = bins.first().map(|a| a.size).unwrap_or(0);
        let lib_str = match lib {
            0 => "        ".to_string(),
            _ => format!("{:>8.2}", lib as f64 / 1024. / 1024.),
        };
        let bin_str = match bin {
            0 => "        ".to_string(),
            _ => format!("{:>8.2}", bin as f64 / 1024. / 1024.),
        };
        let res = write!(f, " {:>6} \t {} \t {} ", self.deps, lib_str, bin_str);
        if libs.len() > 1 {
            let _ = write!(f, "  WARNING: >1 lib");
        }
        res
    }
}

#[derive(Debug)]
struct PkgId {
    name: String,
    version: String,
}

fn top_crates(count: usize, category: Option<String>, filter_fn: Option<fn(&Crate) -> bool>) -> Result<Vec<Crate>, Error> {
    let mut remaining = count;
    let mut page = 1;
    let mut all_crates = Vec::<Crate>::with_capacity(count);
    while remaining > 0 {
        let client = SyncClient::new();
        let options = ListOptions {
            sort: Sort::Downloads,
            per_page: 100,
            page: page,
            query: None,
            category: category.clone(),
        };
        let mut crates = client.crates(options)?;

        crates.crates.retain(|c| match filter_fn {
            Some(f) => f(&c),
            None => true,
        });
        all_crates.append(&mut crates.crates);
        remaining -= match remaining {
            x if x > 100 => 100,
            x => x,
        };
        page += 1;
    }
    Ok(all_crates)
}

#[allow(dead_code)]
fn search_crates(query: &str) -> Result<Vec<Crate>, Error> {
    let client = SyncClient::new();
    let options = ListOptions {
        sort: Sort::Downloads,
        per_page: 100,
        page: 1,
        query: Some(query.to_string()),
        category: None,
    };
    let crates = client.crates(options)?;
    Ok(crates.crates)
}

fn pkgid(crt: &Crate) -> Result<PkgId, std::io::Error> {
    let dir = format!("clone_{}", crt.id);
    let output = Command::new("cargo")
        .args(&["pkgid"])
        .current_dir(&dir)
        .output()
        .expect("pkgid failed");
    let output = String::from_utf8(output.stdout).expect("Unreadable pkgid output");
    let substr = output.split("#").nth(1).expect("pkgid missing #");
    let mut bits = substr.split(":");

    Ok(PkgId {
        name: bits.next().expect("pkgid missing crate").trim().into(),
        version: bits.next().expect("pkgid missing version").trim().into(),
    })
}

fn artifacts(_crt: &Crate, metadata: &str, pkgid: &PkgId) -> Result<Vec<Artifact>, std::io::Error> {
    let mut artifacts: Vec<Artifact> = vec!();
    let pkgid_str = format!("{} {}", pkgid.name, pkgid.version);
    let stream  = Deserializer::from_str(metadata).into_iter::<BuildMetadata>();
    let mut meta_objs: Vec<BuildMetadata> = vec!();
    for value in stream {
        let m: BuildMetadata = value.expect("Fail parsing metadata json");
        if m.package_id.starts_with(&pkgid_str) &&
            m.reason == "compiler-artifact" &&
            m.target.is_some() {
            meta_objs.push(m);
        }
    }
    for m in &meta_objs {
        let target = m.target.as_ref().unwrap();
        if target.kind.iter().any(|x| x == "bin") {
            if let Some(exe) = &m.executable {
                artifacts.push(Artifact {
                    filename: exe.clone(),
                    kind: ArtifactType::Binary,
                    size: 0,
                });
            }
        }
        if target.kind.iter().any(|x| x == "lib" || x == "rlib") {
            if let Some(filenames) = &m.filenames {
                for file in filenames {
                    if file.ends_with(".rlib") {
                        artifacts.push(Artifact {
                            filename: file.clone(),
                            kind: ArtifactType::RustLibrary,
                            size: 0,
                        });
                    }
                }
            }
        }
    }
    for mut arty in &mut artifacts {
        if let Ok(file) = std::fs::File::open(&arty.filename) {
            if let Ok(stat) = file.metadata() {
                arty.size = stat.len();
            }
        }
    }
    Ok(artifacts)
}

fn analyze_crate(crt: &Crate) -> Result<CrateResult, std::io::Error> {
    let dir = format!("clone_{}", crt.id);
    let repo = crt.repository.as_ref().ok_or(std::io::ErrorKind::NotFound)?;
    // Always provide a username/password so git fails fast if one is required.
    let repo = repo.replace("https://", "https://dummy_user:dummy_password@");
    let _result = Command::new("git")
        .args(&["clone", "--recursive", "--quiet", &repo, &dir])
        .stdin(Stdio::null())
        .stderr(Stdio::null())
        .status()
        .expect("clone failed");
    if !std::path::Path::new(&dir).exists() {
        return Err(std::io::ErrorKind::Other.into());
    }

    let cargo_toml_path = format!("{}/Cargo.toml", dir);
    if !std::path::Path::new(&cargo_toml_path).exists() {
        return Err(std::io::ErrorKind::Other.into());
    }

    let result = Command::new("cargo")
        .args(&["build", "--release", "--message-format=json"])
        .current_dir(&dir)
        .stderr(Stdio::null())
        .output()
        .expect("build failed");
    if !result.status.success() {
        return Err(std::io::ErrorKind::Other.into());
    }
    let metadata = String::from_utf8(result.stdout).expect("Unreadable pkgid output");

    // $ cargo tree --no-indent -a |sort |uniq -c |sort -nr |wc -l
    let mut cargo_result = Command::new("cargo")
        .current_dir(&dir)
        .args(&["tree", "--no-indent", "--no-dev-dependencies", "-a"])
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .expect("tree failed");
    let cargo_out = cargo_result.stdout.take().expect("Cargo tree stdout failed");
    let mut sort_result = Command::new("sort")
        .current_dir(&dir)
        .stdin(Stdio::from(cargo_out))
        .stdout(Stdio::piped())
        .spawn()
        .expect("sort failed");
    let sort_out = sort_result.stdout.take().expect("sort stdout failed");
    let mut awk_result = Command::new("awk")
        .current_dir(&dir)
        .args(&["{print $1}"])
        .stdin(Stdio::from(sort_out))
        .stdout(Stdio::piped())
        .spawn()
        .expect("awk failed");
    let awk_out = awk_result.stdout.take().expect("awk stdout failed");
    let mut uniq_result = Command::new("uniq")
        .current_dir(&dir)
        .args(&["-c"])
        .stdin(Stdio::from(awk_out))
        .stdout(Stdio::piped())
        .spawn()
        .expect("uniq failed");
    let uniq_out = uniq_result.stdout.take().expect("uniq stdout failed");
    let sort2_result = Command::new("sort")
        .current_dir(&dir)
        .args(&["-nr"])
        .stdin(Stdio::from(uniq_out))
        .stdout(Stdio::piped())
        .spawn()
        .expect("sort failed");
    let output = sort2_result.wait_with_output().expect("sort failed");
    let _ = cargo_result.wait();
    let _ = sort_result.wait();
    let _ = awk_result.wait();
    let _ = uniq_result.wait();
    let output = String::from_utf8(output.stdout).expect("Unreadable output");

    // Subtract 1 for the root crate.
    let dep_count = match output.lines().count() {
        e if e > 0 => e - 1,
        _ => return Err(std::io::ErrorKind::Other.into()),
    };

    let artifacts = artifacts(crt, &metadata, &pkgid(crt)?)?;

    Ok(CrateResult {
        deps: dep_count,
        artifacts: artifacts,
    })
}

#[derive(Debug, Default)]
struct Statistics {
    count: usize,
    mean: f64,
    median: f64,
    stddev: f64,
    max: usize,
}

#[derive(Debug)]
struct BatchStatistics {
    deps: Statistics,
    libs: Statistics,
    bins: Statistics,
}

fn statistics(crates: &Vec<CrateResult>) -> Result<BatchStatistics, std::io::Error> {
    let deps: Vec<usize> = crates.iter().filter_map(|c| match c.deps {
        0 => None,
        c => Some(c),
    }).collect();
    let deps_f64: Vec<f64> = deps.iter().map(|v| *v as f64).collect();

    let libs: Vec<u64> = crates.iter().filter_map(|c| {
        c.artifacts.iter().filter_map(|a| {
            match a.kind {
                ArtifactType::RustLibrary => Some(a.size),
                _ => None,
            }
        }).next()
    }).collect();
    let libs_f64: Vec<f64> = libs.iter().map(|v| *v as f64).collect();

    let bins: Vec<u64> = crates.iter().filter_map(|c| {
        c.artifacts.iter().filter_map(|a| {
            match a.kind {
                ArtifactType::Binary => Some(a.size),
                _ => None,
            }
        }).next()
    }).collect();
    let bins_f64: Vec<f64> = bins.iter().map(|v| *v as f64).collect();

    Ok(BatchStatistics {
        deps: Statistics {
            count: deps.len(),
            mean: statistical::mean(deps_f64.as_slice()),
            median: statistical::median(deps_f64.as_slice()),
            stddev: statistical::standard_deviation(deps_f64.as_slice(), None),
            max: *deps.iter().max().unwrap_or(&0),
        },
        libs: Statistics {
            count: libs.len(),
            mean: statistical::mean(libs_f64.as_slice()),
            median: statistical::median(libs_f64.as_slice()),
            stddev: statistical::standard_deviation(libs_f64.as_slice(), None),
            max: *libs.iter().max().unwrap_or(&0) as usize,
        },
        bins: Statistics {
            count: bins.len(),
            mean: statistical::mean(bins_f64.as_slice()),
            median: statistical::median(bins_f64.as_slice()),
            stddev: statistical::standard_deviation(bins_f64.as_slice(), None),
            max: *bins.iter().max().unwrap_or(&0) as usize,
        },
    })
}

fn analyze(crates: Vec<Crate>) {
    let blacklist = [
        "rustc-ap-rustc_cratesio_shim", // all of rust compiler
        "rustc-ap-rustc_target",
        "rustc-ap-serialize",
        "rustc-ap-rustc_data_structures",
        "rustc-ap-syntax_pos",
        "rustc-ap-syntax",
        "rustc-ap-rustc_errors",
        // these are identical to rls-analysis
        "rls-data",
        "rls-span",
        "rls-vfs",
        // these are identical to actix-http
        "actix-files",
        "actix-http-test",
        "actix-web",
        "actix-web-httpauth",
        // identical to winapi
        "winapi-build",
        "winapi-i686-pc-windows-gnu",
        "winapi-x86_64-pc-windows-gnu",
        // identical to rand
        "rand_xorshift",
        "rand_pcg",
        "rand_os",
        "rand_jitter",
        "rand_isaac",
        "rand_hc",
        "rand_core",
        "rand_chacha",
        // identical to wayland-client
        "wayland-commons",
        "wayland-kbd",
        "wayland-protocols",
        "wayland-scanner",
        "wayland-server",
        "wayland-window",
        // identical to clone_tokio
        "clone_tokio-codec",
        "clone_tokio-core",
        "clone_tokio-curl",
        "clone_tokio-current-thread",
        "clone_tokio-executor",
        "clone_tokio-fs",
        "clone_tokio-io",
        "clone_tokio-proto",
        "clone_tokio-reactor",
        "clone_tokio-service",
        "clone_tokio-signal",
        "clone_tokio-sync",
        "clone_tokio-tcp",
        "clone_tokio-threadpool",
        "clone_tokio-timer",
        "clone_tokio-tls",
        "clone_tokio-trace-core",
        "clone_tokio-tungstenite",
        "clone_tokio-udp",
        "clone_tokio-uds",
    ];

    // Buckets of 1, up to 20
    let mut buckets: [u8; 22] = [
        0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,
        0,
        0,
    ];

    let mut results: Vec<CrateResult> = vec!();
    println!("");
    println!("{:<32}:  {:>6} \t {:>7} \t {:>7} ", "CRATE", "DEPS", "LIB (MB)", "BIN (MB)");
    println!("{}", std::iter::repeat("-").take(73).collect::<String>());
    let crate_count = crates.len();
    for (idx,c) in crates.iter().enumerate() {
        if !blacklist.contains(&c.id.as_str()) {
            let progress = format!("[{:>3}/{}]", idx, crate_count);
            print!("{:<10} {:<21}: ", progress, c.id.chars().take(21).collect::<String>());
            let _ = std::io::stdout().flush();
            match analyze_crate(c) {
                Err(_) => {
                    println!("");
                },
                Ok(res) => {
                    println!("{}", res);
                    match res.deps {
                        e if e <= 20 => buckets[e] += 1,
                        _ => buckets[21] += 1,
                    }
                    results.push(res);
                },
            }
        }
    }

    let stats = statistics(&results).expect("failed to generate statistics");

    println!("");
    println!("Number of crates analyzed: {}", results.len());
    println!("");
    println!("Dependencies:");
    println!("    count: {}", stats.deps.count);
    println!("     mean: {:.2} +/- {:.2}", stats.deps.mean, stats.deps.stddev);
    println!("   median: {:.2}", stats.deps.median);
    println!("  maximum: {}", stats.deps.max);
    println!("");
    println!("Library size:");
    println!("    count: {}", stats.libs.count);
    println!("     mean: {:.2} +/- {:.2} [{:.2} MB + / {:.2} MB]",
             stats.libs.mean, stats.libs.stddev,
             stats.libs.mean / 1024. / 1024., stats.libs.stddev / 1024. / 1024.);
    println!("   median: {:.2} [{:.2} MB]", stats.libs.median, stats.libs.median / 1024. / 1024.);
    println!("  maximum: {} [{:.2} MB]", stats.libs.max, stats.libs.max as f64 / 1024. / 1024.);
    println!("");
    println!("Binary size:");
    println!("    count: {}", stats.bins.count);
    println!("     mean: {:.2} +/- {:.2} [{:.2} MB + / {:.2} MB]",
             stats.bins.mean, stats.bins.stddev,
             stats.bins.mean / 1024. / 1024., stats.bins.stddev / 1024. / 1024.);
    println!("   median: {:.2} [{:.2} MB]", stats.bins.median, stats.bins.median / 1024. / 1024.);
    println!("  maximum: {} [{:.2} MB]", stats.bins.max, stats.bins.max as f64 / 1024. / 1024.);

    println!("");
    println!("Dependency count histogram (buckets 0-20 by 1, 20+):");
    for (i, count) in buckets.iter().enumerate() {
        let idx = match i {
            21 => "> 20".to_string(),
            _ => format!("{:>4}", i),
        };
        print!("{} ({:>5.1}%): ", idx, 100.0 * (*count as f64) / results.len() as f64);
        println!("{}", ['*'].iter().cycle().take(*count as usize).collect::<String>());
    }

    // Buckets of 10, up to 200
    let mut buckets: [u8; 21] = [
        0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,
        0,
    ];
    for res in &results {
        match res.deps {
            e if e < 200 => buckets[e / 10] += 1,
            _ => buckets[20] += 1,
        }
    }

    println!("");
    println!("Dependency count histogram (buckets 0-200 by 10, 200+):");
    for (i, count) in buckets.iter().enumerate() {
        let idx = match i {
            20 => "    > 200".to_string(),
            _ => format!("{:>3} - {:>3}", 10*i, 10*(i+1)),
        };
        print!("{} ({:>5.1}%): ", idx, 100.0 * (*count as f64) / results.len() as f64);
        println!("{}", ['*'].iter().cycle().take(std::cmp::min(50, *count as usize)).collect::<String>());
    }
    println!("");
}

fn main() {
    println!("========== 200 command-line-utilities crates ==========");
    let crates = top_crates(200, Some("command-line-utilities".into()), None).unwrap();
    analyze(crates);

    println!("========== 100 graphics crates ==========");
    let crates = top_crates(100, Some("graphics".into()), None).unwrap();
    analyze(crates);

    println!("========== 100 gui crates ==========");
    let crates = top_crates(100, Some("gui".into()), None).unwrap();
    analyze(crates);

    println!("========== 100 web-programming crates ==========");
    let crates = top_crates(100, Some("web-programming".into()), None).unwrap();
    analyze(crates);

    println!("========== Top 400 crates ==========");
    let crates = top_crates(400, None, None).unwrap();
    analyze(crates);
}