Reading padloc-db data • padlocdev

Reading padloc-db files

You can read profile HMMs, system models, the HMM metadata table, system metadata table, and system group information with the following functions.

Metadata tables

Read hmm_meta.txt with read_hmm_meta().

path <- padlocdev_example("padloc-db/hmm_meta.txt")
hmm_meta <- read_hmm_meta(path)
head(hmm_meta)
#> # A tibble: 6 × 15
#>   hmm.acc hmm.name hmm.description protein.name secondary.name author number.seq
#>   <chr>   <chr>    <chr>           <chr>        <chr>          <chr>       <dbl>
#> 1 PDLC00… HamA1_0… DUF1837 domain  HamA1|HamA   NA             Payne…         49
#> 2 PDLC00… HamA1_0… DUF1837 domain  HamA1|HamA   NA             Payne…          6
#> 3 PDLC00… HamB1_0… DEAD-box helic… HamB1|HamB   NA             Payne…         56
#> 4 PDLC00… HamB1_0… DEAD-box helic… HamB1|HamB   NA             Payne…          5
#> 5 PDLC00… HamA2_0… DUF1837 domain  HamA2|HamA   NA             Payne…          7
#> 6 PDLC00… HamA2_0… DUF1837 domain  HamA2|HamA   NA             Payne…        144
#> # ℹ 8 more variables: length.hmm <dbl>, e.value.threshold <dbl>,
#> #   hmm.coverage.threshold <dbl>, target.coverage.threshold <dbl>,
#> #   system <chr>, literature.ref <chr>, database.ref <chr>, comments <chr>

Read sys_meta.txt with read_sys_meta().

path <- padlocdev_example("padloc-db/sys_meta.txt")
sys_meta <- read_sys_meta(path)
head(sys_meta)
#> # A tibble: 6 × 5
#>   system type     yaml.name     search notes
#>   <chr>  <chr>    <chr>         <lgl>  <chr>
#> 1 BREX   other    brex_other    TRUE   NA   
#> 2 BREX   type I   brex_type_I   TRUE   NA   
#> 3 BREX   type II  brex_type_II  TRUE   NA   
#> 4 BREX   type III brex_type_III TRUE   NA   
#> 5 BREX   type IV  brex_type_IV  TRUE   NA   
#> 6 BREX   type V   brex_type_V   TRUE   NA

Read sys_groups.txt with read_sys_groups().

path <- padlocdev_example("padloc-db/sys_groups.txt")
sys_groups <- read_sys_groups(path)
head(sys_groups)
#> # A tibble: 6 × 2
#>   yaml.name     group  
#>   <chr>         <chr>  
#> 1 brex_other    group_1
#> 2 brex_type_I   group_1
#> 3 brex_type_II  group_1
#> 4 brex_type_III group_2
#> 5 brex_type_IV  group_2
#> 6 brex_type_V   group_2

All metadata tables are stored as tibble objects, and can be operated-on the same as any other tibble (see ?tibble::tbl_df for more information).

Profile HMMs

Read a full profile HMM, including header and model information with read_hmm().

path <- padlocdev_example("padloc-db/hmm/PDLC00150.hmm")
hmm <- read_hmm(path)

Profile HMMs are stored as list objects with two elements, header and model.

names(hmm)
#> [1] "header" "model"

names(hmm[["header"]])
#>  [1] "HMMER3/f"            "NAME"                "LENG"               
#>  [4] "ALPH"                "ACC"                 "DESC"               
#>  [7] "MAXL"                "RF"                  "MM"                 
#> [10] "CONS"                "CS"                  "MAP"                
#> [13] "DATE"                "COM"                 "NSEQ"               
#> [16] "EFFN"                "CKSUM"               "GA"                 
#> [19] "TC"                  "NC"                  "STATS LOCAL MSV"    
#> [22] "STATS LOCAL VITERBI" "STATS LOCAL FORWARD"

names(hmm[["model"]])
#> [1] "compo"             "match_emissions"   "insert_emissions" 
#> [4] "state_transitions"

Accessing elements works the same as with any other list.

hmm[["header"]][["NAME"]]
#> [1] "HamA1_00001"

# Or...
hmm$header$NAME
#> [1] "HamA1_00001"

# Or...
purrr::pluck(hmm, "header", "NAME")
#> [1] "HamA1_00001"

# Etc...

The model information for an HMM can be quite large, which can slow down reading.

object.size(hmm[["header"]]) |> print(units = "Kb")
#> 4.2 Kb
object.size(hmm[["model"]]) |> print(units = "Kb")
#> 519.8 Kb

Currently, there is no use for reading in the model information anyway, at least for the purpose of data validation. To read just the header information use read_hmm_header() instead.

hmm_header <- read_hmm_header(path)

The object returned by read_hmm_header() is equivalent to the list returned by read_hmm(), but only with the header element.

names(hmm_header)
#> [1] "header"

names(hmm_header[["header"]])
#>  [1] "HMMER3/f"            "NAME"                "LENG"               
#>  [4] "ALPH"                "ACC"                 "DESC"               
#>  [7] "MAXL"                "RF"                  "MM"                 
#> [10] "CONS"                "CS"                  "MAP"                
#> [13] "DATE"                "COM"                 "NSEQ"               
#> [16] "EFFN"                "CKSUM"               "GA"                 
#> [19] "TC"                  "NC"                  "STATS LOCAL MSV"    
#> [22] "STATS LOCAL VITERBI" "STATS LOCAL FORWARD"

System models

Read system models with read_padloc_model().

path <- padlocdev_example("padloc-db/sys/brex_type_I.yaml")
model <- read_padloc_model(path)

System models are also stored as list objects with each element corresponding to a field in the original model file.

names(model)
#> [1] "force_strand"       "maximum_separation" "minimum_core"      
#> [4] "minimum_total"      "core_genes"         "secondary_genes"   
#> [7] "neutral_genes"      "prohibited_genes"

model[["core_genes"]]
#> [1] "BrxA" "BrxB" "BrxC" "PglX" "PglZ" "BrxL"

Reading multiple files

Wrappers are provided for reading all of the system model or profile HMM files in a directory into a list. The name of each element in the list is derived from the name of the file read-in.

Use multi_read_padloc_model() to read all system models.

path <- padlocdev_example("padloc-db/sys")
models <- multi_read_padloc_model(path)
names(models)
#>  [1] "brex_other"       "brex_type_I"      "brex_type_II"     "brex_type_III"   
#>  [5] "brex_type_IV"     "brex_type_V"      "brex_type_VI"     "DRT_class_I"     
#>  [9] "DRT_class_II"     "DRT_class_III"    "DRT_other"        "DRT_type_I"      
#> [13] "DRT_type_II"      "DRT_type_III"     "DRT_type_IV"      "DRT_type_V"      
#> [17] "hachiman_other"   "hachiman_type_I"  "hachiman_type_II"

models[["brex_type_I"]]
#> $force_strand
#> [1] FALSE
#> 
#> $maximum_separation
#> [1] 3
#> 
#> $minimum_core
#> [1] 6
#> 
#> $minimum_total
#> [1] 6
#> 
#> $core_genes
#> [1] "BrxA" "BrxB" "BrxC" "PglX" "PglZ" "BrxL"
#> 
#> $secondary_genes
#> [1] "NA"
#> 
#> $neutral_genes
#> [1] "NA"
#> 
#> $prohibited_genes
#> [1] "BrxE"

Use multi_read_hmm() to read all profile HMMs.

path <- padlocdev_example("padloc-db/hmm")
hmms <- multi_read_hmm(path)
names(hmms)
#>  [1] "PDLC00150" "PDLC00151" "PDLC00174" "PDLC00175" "PDLC00195" "PDLC00196"
#>  [7] "PDLC00199" "PDLC00200" "PDLC00202" "PDLC00203" "PDLC02171" "PDLC02172"
#> [13] "PDLC02173" "PDLC02174" "PDLC02175" "PDLC02176" "PDLC02177" "PDLC02178"
#> [19] "PDLC02181" "PDLC02182" "PDLC02437" "PDLC02438" "PDLC02459" "PDLC02460"
#> [25] "PDLC02471" "PDLC02472" "PDLC02494" "PDLC02495" "PDLC02496" "PDLC02497"
#> [31] "PDLC02500" "PDLC02501" "PDLC02504" "PDLC02505" "PDLC02506" "PDLC02510"
#> [37] "PDLC02511" "PDLC02512" "PDLC02513" "PDLC02515" "PDLC02516" "PDLC02541"
#> [43] "PDLC02542"

For larger databases, it’s more efficient to just read the header information of each HMM.

hmm_headers <- multi_read_hmm_header(path)
names(hmm_headers)
#>  [1] "PDLC00150" "PDLC00151" "PDLC00174" "PDLC00175" "PDLC00195" "PDLC00196"
#>  [7] "PDLC00199" "PDLC00200" "PDLC00202" "PDLC00203" "PDLC02171" "PDLC02172"
#> [13] "PDLC02173" "PDLC02174" "PDLC02175" "PDLC02176" "PDLC02177" "PDLC02178"
#> [19] "PDLC02181" "PDLC02182" "PDLC02437" "PDLC02438" "PDLC02459" "PDLC02460"
#> [25] "PDLC02471" "PDLC02472" "PDLC02494" "PDLC02495" "PDLC02496" "PDLC02497"
#> [31] "PDLC02500" "PDLC02501" "PDLC02504" "PDLC02505" "PDLC02506" "PDLC02510"
#> [37] "PDLC02511" "PDLC02512" "PDLC02513" "PDLC02515" "PDLC02516" "PDLC02541"
#> [43] "PDLC02542"