Getting Started • romicidx

library(omicidx)
#> Loading required package: DBI
#> Loading required package: duckdb
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> Loading required package: dbplyr
#> 
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#> 
#>     ident, sql
#> Loading required package: R6

The omicidx package provides an interface to the OmicIDX DuckDB database, which contains a wealth of omics data. This vignette demonstrates how to establish a connection to the database and perform basic queries.

Establishing a Connection

To connect to the OmicIDX DuckDB database, use the omicidx_duckdb_connection() function:

con <- omicidx_duckdb_connection()
con
#> <duckdb_connection 7ba50 driver=<duckdb_driver dbdir=':memory:' read_only=FALSE bigint=numeric>>

Note that this function sets up a read-only connection to the remote DuckDB database hosted remotely. The database doesn’t actually contain any data. Instead it contains links to data stored as parquet files in cloud storage.

Listing Tables

Once connected, you can list the available tables in the database:

tables <- DBI::dbListTables(con)
print(tables)
#>  [1] "_catalog"             "ncbi_biosamples"      "sra_experiments"     
#>  [4] "sra_metadata"         "sra_runs"             "sra_samples"         
#>  [7] "sra_studies"          "stg_geo_platforms"    "stg_geo_samples"     
#> [10] "stg_geo_series"       "stg_ncbi_bioprojects" "stg_ncbi_biosamples" 
#> [13] "stg_sra_accessions"   "stg_sra_experiments"  "stg_sra_runs"        
#> [16] "stg_sra_samples"      "stg_sra_studies"

Querying Data

Each table in the database can be queried using standard SQL. For example, to retrieve the first 10 rows from the sra_runs table:

res <- DBI::dbGetQuery(con, "SELECT * FROM staging.stg_sra_runs LIMIT 10")
print(res)
#>     accession experiment_accession title run_center run_date center_name
#> 1  SRR4652455           SRX2310319  <NA>       <NA>     <NA>        <NA>
#> 2  SRR4661781           SRX2310320  <NA>       <NA>     <NA>        <NA>
#> 3  SRR4661782           SRX2310321  <NA>       <NA>     <NA>        <NA>
#> 4  SRR4661783           SRX2310322  <NA>       <NA>     <NA>        <NA>
#> 5  SRR4661784           SRX2310323  <NA>       <NA>     <NA>        <NA>
#> 6  SRR4661785           SRX2310324  <NA>       <NA>     <NA>        <NA>
#> 7  SRR4661786           SRX2310325  <NA>       <NA>     <NA>        <NA>
#> 8  SRR4661787           SRX2310326  <NA>       <NA>     <NA>        <NA>
#> 9  SRR4661788           SRX2310327  <NA>       <NA>     <NA>        <NA>
#> 10 SRR4661789           SRX2310328  <NA>       <NA>     <NA>        <NA>
#>    broker_name                                        alias  GEO
#> 1         <NA> P0028_MT_I2722_TTCATACG_L001_R1_001.fastq.gz <NA>
#> 2         <NA>             FSL_E2-0214_R1.trimmedP.fastq.gz <NA>
#> 3         <NA>             FSL_K6-1142_R2.trimmedP.fastq.gz <NA>
#> 4         <NA>             FSL_K6-1030_R2.trimmedP.fastq.gz <NA>
#> 5         <NA>             FSL_H8-0481_R1.trimmedP.fastq.gz <NA>
#> 6         <NA>             FSL_H7-0909_R1.trimmedP.fastq.gz <NA>
#> 7         <NA>             FSL_H7-0676_R1.trimmedP.fastq.gz <NA>
#> 8         <NA>             FSL_F4-0079_R2.trimmedP.fastq.gz <NA>
#> 9         <NA>             FSL_H8-0063_R1.trimmedP.fastq.gz <NA>
#> 10        <NA>             FSL_H8-0534_R2.trimmedP.fastq.gz <NA>
#>                                                     identifiers attributes
#> 1  P0028_MT_I2722_TTCATACG_L001_R1_001.fastq.gz, SUB2043882, NA       NULL
#> 2              FSL_E2-0214_R1.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 3              FSL_K6-1142_R2.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 4              FSL_K6-1030_R2.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 5              FSL_H8-0481_R1.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 6              FSL_H7-0909_R1.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 7              FSL_H7-0676_R1.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 8              FSL_F4-0079_R2.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 9              FSL_H8-0063_R1.trimmedP.fastq.gz, SUB2043904, NA       NULL
#> 10             FSL_H8-0534_R2.trimmedP.fastq.gz, SUB2043904, NA       NULL
#>    qualities has_complete_run_info          _loaded_at
#> 1                            FALSE 2025-10-30 23:06:34
#> 2                            FALSE 2025-10-30 23:06:34
#> 3                            FALSE 2025-10-30 23:06:34
#> 4                            FALSE 2025-10-30 23:06:34
#> 5                            FALSE 2025-10-30 23:06:34
#> 6                            FALSE 2025-10-30 23:06:34
#> 7                            FALSE 2025-10-30 23:06:34
#> 8                            FALSE 2025-10-30 23:06:34
#> 9                            FALSE 2025-10-30 23:06:34
#> 10                           FALSE 2025-10-30 23:06:34

Usage with dplyr

The connection can also be used with the dplyr package for more convenient data manipulation. For example, to get the first 10 entries from the sra_studies table:

library(dplyr)

sra_studies_tbl <- tbl(con, "staging.stg_sra_studies")

result <- 
  sra_studies_tbl |>
  head(10) |>
  collect()
print(result)
#> # A tibble: 10 × 17
#>    accession study_accession title   description abstract study_type center_name
#>    <chr>     <chr>           <chr>   <chr>       <chr>    <chr>      <chr>      
#>  1 SRP009395 SRP009395       Subtro… <NA>        The sco… Metagenom… Biodiversi…
#>  2 SRP009396 SRP009396       Identi… <NA>        The aim… Other      GEO        
#>  3 SRP009397 SRP009397       Entamo… <NA>        Parasit… Whole Gen… JCVI       
#>  4 SRP009399 SRP009399       Genome… <NA>        Backgro… Whole Gen… Umass Medi…
#>  5 SRP009401 SRP009401       Transc… <NA>        Sacchar… Transcrip… GEO        
#>  6 ERP001015 ERP001015       Antarc… Pyrosequen… The ext… Metagenom… ICTAR      
#>  7 SRP009402 SRP009402       Charac… <NA>        The pre… Transcrip… Graduate U…
#>  8 SRP009404 SRP009404       Transc… <NA>        To unde… Transcrip… Shanghai O…
#>  9 SRP009407 SRP009407       Cheese… <NA>        Grana T… Metagenom… IASMA rese…
#> 10 SRP009408 SRP009408       MicroR… <NA>        Backgro… Transcrip… GEO        
#> # ℹ 10 more variables: broker_name <chr>, alias <chr>, BioProject <chr>,
#> #   GEO <chr>, pubmed_ids <list>, has_complete_metadata <lgl>,
#> #   attributes <list>, identifiers <list>, xrefs <list>, `_loaded_at` <dttm>

We can do more complex queries as well, such as grouping, counting, and arranging data. For example, to count the number of studies by study type:

studies_by_type <-
  sra_studies_tbl |>
  group_by(study_type) |>
  count() |>
  arrange(desc(n)) |>
  head(20) |>
  collect()
print(studies_by_type)
#> # A tibble: 12 × 2
#> # Groups:   study_type [12]
#>    study_type                      n
#>    <chr>                       <dbl>
#>  1 "Other"                    364910
#>  2 "Whole Genome Sequencing"  119980
#>  3 "Transcriptome Analysis"   112895
#>  4 "Metagenomics"              64548
#>  5 "Population Genomics"         835
#>  6 "Epigenetics"                 702
#>  7 "Cancer Genomics"             429
#>  8 "Exome Sequencing"            372
#>  9 "Pooled Clone Sequencing"      31
#> 10 "Synthetic Genomics"           12
#> 11 "Transcriptome Sequencing"      2
#> 12 "Whole Genome Sequencing "      1