---
title: "Benchmarking rgbio Performance"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Benchmarking rgbio Performance}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


# Benchmarking rgbio Performance

## Benchmark Setup

To evaluate real-world performance, we downloaded 17 archaeal type strain assemblies from the order "Methanococcales" directly from NCBI using the `datasets` command-line tool. These represent real-world GenBank files with varying sizes and complexity.


```
#> The Benchmark was run on 2026-05-27 using rgbio version 0.4.0
```

### Data Download Command
```bash
# Download 17 archaeal type strain assemblies in GenBank format
datasets download genome taxon "methanococcales" \
  --from-type \
  --reference \
  --assembly-source RefSeq \
  --include gbff \
  --filename methanococcales_type_strains.zip --dehydrated && \
  unzip methanococcales_type_strains.zip && \
  datasets rehydrate --directory .
```

## Packages Tested

We compared `rgbio` against three other popular GenBank reading packages:


``` r
library(bench)
library(dplyr)
library(ggplot2)

# Packages being benchmarked
library(rgbio)      # High-performance GenBank I/O using Rust gb-io crate
library(read.gb)    # Specifically made for reading gb files  
library(geneviewer) # Has a function to read gb files
library(genbankr)   # Traditional Bioconductor GenBank reader. Only available up to Bioconductor version: 3.16 ≙ R version 4.2
```

- **rgbio**: Our Rust-powered package with two output formats (`tidy` and `bioconductor`)
- **geneviewer**: A visualization-focused package with GenBank reading capabilities
- **genbankr**: The traditional Bioconductor solution for GenBank parsing
- **read.gb**: A dedicated GenBank file reader

## Running the Benchmarks


``` r
# Set up test data paths
gbk_dir <- "./ncbi_dataset/data"
gbk_files <- list.files(gbk_dir, pattern = "\\.gbff$", recursive = TRUE, full.names = TRUE)

# Define reader functions for consistent testing
readers <- list(
    "rgbio::read_gbk(format = 'tidy')" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "tidy")
        }, error = function(e) {
            message("rgbio (tidy) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "rgbio::read_gbk(format = 'bioconductor')" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "bioconductor")
        }, error = function(e) {
            message("rgbio (bioconductor) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "rgbio::read_gbk(format = 'base')" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "base")
        }, error = function(e) {
            message("rgbio (base) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "rgbio::read_gbk(format = 'tidy', validate = FALSE)" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "tidy", validate = FALSE)
        }, error = function(e) {
            message("rgbio (tidy) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "rgbio::read_gbk(format = 'bioconductor', validate = FALSE)" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "bioconductor", validate = FALSE)
        }, error = function(e) {
            message("rgbio (bioconductor) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "rgbio::read_gbk(format = 'base', validate = FALSE)" = function(file) {
        tryCatch({
            rgbio::read_gbk(file, format = "base", validate = FALSE)
        }, error = function(e) {
            message("rgbio (base) failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "geneviewer::read_gbk()" = function(file) {
        tryCatch({
            geneviewer::read_gbk(file)
        }, error = function(e) {
            message("geneviewer failed on ", file, ": ", e$message)
            return(NULL)
        })
    },
    "genbankr::readGenBank()" = function(file) {
        tryCatch({
            genbankr::readGenBank(file)
        }, error = function(e) {
            message("genbankr failed on ", file, ": ", e$message)
            return(NULL)
        })
    }
)
```

We use the `bench` package to get accurate timing measurements, testing each parser on every file exactly once to simulate real-world usage patterns.


``` r
# Run comprehensive benchmarks across all file × reader combinations
benchmark_results <- press(
    file = gbk_files,
    reader = names(readers),
    {
        result <- bench::mark(
            result = readers[[reader]](file),
            iterations = 1,
            check = FALSE
        )
    }
)
```

To benchmark the writing, we pre-read each file once per `rgbio` format (without benchmarking), store those parsed objects, and then benchmark only the write step.


``` r
# Pre-read objects once (no benchmarking) for each file × rgbio format
write_cache_grid <- expand.grid(
    file = gbk_files,
    write_format = c("tidy", "bioconductor"),
    stringsAsFactors = FALSE
)

write_cache_read_once <- function(file, write_format) {
    tryCatch({
        rgbio::read_gbk(file, format = write_format)
    }, error = function(e) {
        message("rgbio ", write_format, " pre-read failed on ", file, ": ", e$message)
        return(NULL)
    })
}

rgbio_cached_reads <- write_cache_grid %>%
    rowwise() %>%
    mutate(parsed_obj = list(write_cache_read_once(file, write_format))) %>%
    ungroup()

# Benchmark write phase only, reusing cached parsed objects
write_benchmark_results <- press(
    row_id = seq_len(nrow(rgbio_cached_reads)),
    {
        file <- rgbio_cached_reads$file[[row_id]]
        write_format <- rgbio_cached_reads$write_format[[row_id]]
        obj <- rgbio_cached_reads$parsed_obj[[row_id]]

        result <- bench::mark(
            result = {
                tmp_file <- tempfile(pattern = "rgbio_write_benchmark_", fileext = ".gb")

                tryCatch({
                    if (is.null(obj)) {
                        stop("cached read object is NULL")
                    }

                    if (write_format == "tidy") {
                        seqs <- setNames(as.character(obj$sequences$sequence), obj$sequences$record_id)
                        rgbio::write_gbk(
                            file = tmp_file,
                            sequences = seqs,
                            features = obj$features,
                            metadata = obj$metadata
                        )
                    } else {
                        rgbio::write_gbk(
                            file = tmp_file,
                            sequences = obj$sequences,
                            features = obj$features,
                            metadata = obj$metadata
                        )
                    }
                }, error = function(e) {
                    message("rgbio ", write_format, " write failed on ", file, ": ", e$message)
                    return(FALSE)
                }, finally = {
                    unlink(tmp_file)
                })
            },
            iterations = 1,
            check = FALSE
        ) %>%
            mutate(
                file = file,
                write_format = write_format,
                operation = "write"
            )
    }
)

# Store write benchmark results for downstream processing
write_results <- write_benchmark_results
```

## Processing Results


``` r
# Clean up results and add useful metadata
benchmark_df <- benchmark_results %>%
    mutate(
        file_id = as.numeric(factor(file)),
        file_name = dirname(file),
        file_size_mb = file.info(file)$size / (1024^2),
        success = !is.na(median),
        time_ms = ifelse(success, as.numeric(median) * 1000, NA),
        memory_mb = ifelse(success, as.numeric(gsub("[^0-9.]", "", mem_alloc)), NA)
    ) %>%
    select(reader, file_id, file_name, file_size_mb, success, time_ms, memory_mb)

# Parse write benchmark results with matching metadata fields
write_benchmark_df <- write_results %>%
    mutate(
        file_id = as.numeric(factor(file)),
        file_name = dirname(file),
        file_size_mb = file.info(file)$size / (1024^2),
        success = !is.na(median),
        time_ms = ifelse(success, as.numeric(median) * 1000, NA),
        memory_mb = ifelse(success, as.numeric(gsub("[^0-9.]", "", mem_alloc)), NA)
    ) %>%
    select(write_format, operation, file_id, file_name, file_size_mb, success, time_ms, memory_mb)
```


## Results

### Performance Visualization


``` r
p1 <- ggplot(benchmark_df, aes(x = factor(reader, levels = names(readers)), y = time_ms, fill = reader)) +
    geom_violin(alpha = 0.7) + 
    geom_boxplot(width = 0.2, alpha = 0.8) +
    scale_y_log10() + 
    labs(
        title = "Time Reading GenBank Files (lower is better)", 
        y = "Time (ms, log scale)",
        subtitle = paste("Based on", 
                        benchmark_df %>% 
                        filter(success) %>% 
                        group_by(reader) %>% 
                        summarise(n = n()) %>% 
                        pull(n) %>% 
                        min(), 
                        "files per parser")
    ) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
    ) +
    xlab("Parsing function")

p1
```

![Parsing time comparison across different GenBank readers. Lower values indicate better performance.](figure/performance-plot-1.png)

### Summary Statistics


``` r
# Calculate performance summary with speedup factors
summary_stats <- benchmark_df %>%
    group_by(reader) %>%
    summarise(
        n_files = n(),
        median_time_ms = round(median(time_ms, na.rm = TRUE), 1),
        mean_time_ms = round(mean(time_ms, na.rm = TRUE), 1),
        median_memory_mb = round(median(memory_mb, na.rm = TRUE), 1),
        .groups = "drop"
    ) %>%
    arrange(factor(reader, levels = names(readers))) %>%
    mutate(
        max_time = max(median_time_ms),
        speedup = case_when(
            median_time_ms == max_time ~ "baseline",
            TRUE ~ paste0(round(max_time / median_time_ms, 1), "x")
        )
    ) %>% 
    select(reader, speedup, median_time_ms, mean_time_ms, median_memory_mb, n_files)

knitr::kable(summary_stats, 
             col.names = c("Parser", "Speed vs Slowest", "Median Time (ms)", 
                          "Mean Time (ms)", "Median Memory (MB)", "Files Tested"),
             caption = "Performance comparison summary")
```


Table: Performance comparison summary

|Parser                                                     |Speed vs Slowest | Median Time (ms)| Mean Time (ms)| Median Memory (MB)| Files Tested|
|:----------------------------------------------------------|:----------------|----------------:|--------------:|------------------:|------------:|
|rgbio::read_gbk(format = 'tidy')                           |45.3x            |             42.2|           42.5|                4.4|           17|
|rgbio::read_gbk(format = 'bioconductor')                   |38.2x            |             50.1|           48.8|                6.2|           17|
|rgbio::read_gbk(format = 'base')                           |45.4x            |             42.1|           41.6|                4.3|           17|
|rgbio::read_gbk(format = 'tidy', validate = FALSE)         |44.8x            |             42.7|           44.2|                5.0|           17|
|rgbio::read_gbk(format = 'bioconductor', validate = FALSE) |37.2x            |             51.4|           68.3|                5.0|           17|
|rgbio::read_gbk(format = 'base', validate = FALSE)         |45.6x            |             41.9|           42.5|                4.1|           17|
|geneviewer::read_gbk()                                     |1.5x             |           1261.0|         1387.9|                1.5|           17|
|genbankr::readGenBank()                                    |baseline         |           1912.1|         1688.6|               38.7|           17|


## Key Findings

The benchmarks reveal several important insights:

1. **rgbio is significantly faster**: Both `rgbio` formats substantially outperform traditional R-based parsers
2. **Consistent performance**: `rgbio` shows reliable performance across different file sizes and complexities  
3. **Format flexibility**: The `tidy` format offers the best performance, while `bioconductor` format provides compatibility with existing workflows
4. **Memory efficiency**: `rgbio` maintains competitive memory usage while delivering superior speed

## Technical Details


``` r
sessionInfo()
#> R version 4.2.3 (2023-03-15)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS 15.7.7
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] C/UTF-8/C/C/C/C
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] genbankr_1.26.0   geneviewer_0.1.11 read.gb_2.2       rgbio_0.4.0      
#> [5] ggplot2_4.0.3     dplyr_1.2.1       bench_1.1.4      
#> 
#> loaded via a namespace (and not attached):
#>  [1] bitops_1.0-9                matrixStats_1.5.0          
#>  [3] fontawesome_0.5.3           bit64_4.8.2                
#>  [5] filelock_1.0.3              RColorBrewer_1.1-3         
#>  [7] progress_1.2.3              httr_1.4.8                 
#>  [9] GenomeInfoDb_1.34.9         tools_4.2.3                
#> [11] utf8_1.2.6                  R6_2.6.1                   
#> [13] otel_0.2.0                  DBI_1.3.0                  
#> [15] BiocGenerics_0.44.0         withr_3.0.2                
#> [17] tidyselect_1.2.1            prettyunits_1.2.0          
#> [19] bit_4.6.0                   curl_7.1.0                 
#> [21] compiler_4.2.3              cli_3.6.6                  
#> [23] Biobase_2.58.0              xml2_1.5.2                 
#> [25] DelayedArray_0.24.0         rtracklayer_1.58.0         
#> [27] scales_1.4.0                S7_0.2.2                   
#> [29] rappdirs_0.3.4              stringr_1.6.0              
#> [31] digest_0.6.39               Rsamtools_2.14.0           
#> [33] rentrez_1.2.4               XVector_0.38.0             
#> [35] pkgconfig_2.0.3             htmltools_0.5.9            
#> [37] MatrixGenerics_1.10.0       dbplyr_2.5.2               
#> [39] fastmap_1.2.0               BSgenome_1.66.3            
#> [41] htmlwidgets_1.6.4           rlang_1.2.0                
#> [43] RSQLite_3.52.0              BiocIO_1.8.0               
#> [45] farver_2.1.2                generics_0.1.4             
#> [47] jsonlite_2.0.0              BiocParallel_1.32.6        
#> [49] VariantAnnotation_1.44.1    RCurl_1.98-1.18            
#> [51] magrittr_2.0.5              GenomeInfoDbData_1.2.9     
#> [53] Matrix_1.5-3                Rcpp_1.1.1-1.1             
#> [55] S4Vectors_0.36.2            lifecycle_1.0.5            
#> [57] stringi_1.8.7               yaml_2.3.12                
#> [59] SummarizedExperiment_1.28.0 zlibbioc_1.44.0            
#> [61] BiocFileCache_2.6.1         grid_4.2.3                 
#> [63] blob_1.3.0                  parallel_4.2.3             
#> [65] crayon_1.5.3                profmem_0.7.0              
#> [67] lattice_0.22-9              Biostrings_2.66.0          
#> [69] GenomicFeatures_1.50.4      hms_1.1.4                  
#> [71] KEGGREST_1.38.0             knitr_1.51                 
#> [73] pillar_1.11.1               GenomicRanges_1.50.2       
#> [75] rjson_0.2.23                codetools_0.2-20           
#> [77] biomaRt_2.54.1              stats4_4.2.3               
#> [79] XML_3.99-0.23               glue_1.8.1                 
#> [81] evaluate_1.0.5              png_0.1-9                  
#> [83] vctrs_0.7.3                 gtable_0.3.6               
#> [85] purrr_1.2.2                 tidyr_1.3.2                
#> [87] cachem_1.1.0                xfun_0.57                  
#> [89] restfulr_0.0.16             tibble_3.3.1               
#> [91] GenomicAlignments_1.34.1    AnnotationDbi_1.60.2       
#> [93] memoise_2.0.1               IRanges_2.32.0
```

## Conclusion

These benchmarks demonstrate that `rgbio` provides substantial performance improvements for GenBank file parsing in R, making it an excellent choice for both interactive analysis and large-scale genomic data processing workflows. The Rust-powered backend delivers the speed improvements while maintaining the familiar R interface that researchers expect.

For the most up-to-date benchmarks and methodology, visit the [rgbio GitHub repository](https://github.com/richardstoeckl/rgbio).