Reading padloc-db files
You can read profile HMMs, system models, the HMM metadata table, system metadata table, and system group information with the following functions.
Metadata tables
Read hmm_meta.txt with read_hmm_meta().
path <- padlocdev_example("padloc-db/hmm_meta.txt")
hmm_meta <- read_hmm_meta(path)
head(hmm_meta)
#> # A tibble: 6 × 15
#> hmm.acc hmm.name hmm.description protein.name secondary.name author number.seq
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 PDLC00… HamA1_0… DUF1837 domain HamA1|HamA NA Payne… 49
#> 2 PDLC00… HamA1_0… DUF1837 domain HamA1|HamA NA Payne… 6
#> 3 PDLC00… HamB1_0… DEAD-box helic… HamB1|HamB NA Payne… 56
#> 4 PDLC00… HamB1_0… DEAD-box helic… HamB1|HamB NA Payne… 5
#> 5 PDLC00… HamA2_0… DUF1837 domain HamA2|HamA NA Payne… 7
#> 6 PDLC00… HamA2_0… DUF1837 domain HamA2|HamA NA Payne… 144
#> # ℹ 8 more variables: length.hmm <dbl>, e.value.threshold <dbl>,
#> # hmm.coverage.threshold <dbl>, target.coverage.threshold <dbl>,
#> # system <chr>, literature.ref <chr>, database.ref <chr>, comments <chr>Read sys_meta.txt with read_sys_meta().
path <- padlocdev_example("padloc-db/sys_meta.txt")
sys_meta <- read_sys_meta(path)
head(sys_meta)
#> # A tibble: 6 × 5
#> system type yaml.name search notes
#> <chr> <chr> <chr> <lgl> <chr>
#> 1 BREX other brex_other TRUE NA
#> 2 BREX type I brex_type_I TRUE NA
#> 3 BREX type II brex_type_II TRUE NA
#> 4 BREX type III brex_type_III TRUE NA
#> 5 BREX type IV brex_type_IV TRUE NA
#> 6 BREX type V brex_type_V TRUE NARead sys_groups.txt with
read_sys_groups().
path <- padlocdev_example("padloc-db/sys_groups.txt")
sys_groups <- read_sys_groups(path)
head(sys_groups)
#> # A tibble: 6 × 2
#> yaml.name group
#> <chr> <chr>
#> 1 brex_other group_1
#> 2 brex_type_I group_1
#> 3 brex_type_II group_1
#> 4 brex_type_III group_2
#> 5 brex_type_IV group_2
#> 6 brex_type_V group_2All metadata tables are stored as tibble objects, and
can be operated-on the same as any other tibble (see
?tibble::tbl_df for more information).
Profile HMMs
Read a full profile HMM, including header and model information with
read_hmm().
path <- padlocdev_example("padloc-db/hmm/PDLC00150.hmm")
hmm <- read_hmm(path)Profile HMMs are stored as list objects with two
elements, header and model.
names(hmm)
#> [1] "header" "model"
names(hmm[["header"]])
#> [1] "HMMER3/f" "NAME" "LENG"
#> [4] "ALPH" "ACC" "DESC"
#> [7] "MAXL" "RF" "MM"
#> [10] "CONS" "CS" "MAP"
#> [13] "DATE" "COM" "NSEQ"
#> [16] "EFFN" "CKSUM" "GA"
#> [19] "TC" "NC" "STATS LOCAL MSV"
#> [22] "STATS LOCAL VITERBI" "STATS LOCAL FORWARD"
names(hmm[["model"]])
#> [1] "compo" "match_emissions" "insert_emissions"
#> [4] "state_transitions"Accessing elements works the same as with any other list.
hmm[["header"]][["NAME"]]
#> [1] "HamA1_00001"
# Or...
hmm$header$NAME
#> [1] "HamA1_00001"
# Or...
purrr::pluck(hmm, "header", "NAME")
#> [1] "HamA1_00001"
# Etc...The model information for an HMM can be quite large, which can slow down reading.
object.size(hmm[["header"]]) |> print(units = "Kb")
#> 4.2 Kb
object.size(hmm[["model"]]) |> print(units = "Kb")
#> 519.8 KbCurrently, there is no use for reading in the model information
anyway, at least for the purpose of data validation. To read just the
header information use read_hmm_header() instead.
hmm_header <- read_hmm_header(path)The object returned by read_hmm_header() is equivalent
to the list returned by read_hmm(), but only
with the header element.
names(hmm_header)
#> [1] "header"
names(hmm_header[["header"]])
#> [1] "HMMER3/f" "NAME" "LENG"
#> [4] "ALPH" "ACC" "DESC"
#> [7] "MAXL" "RF" "MM"
#> [10] "CONS" "CS" "MAP"
#> [13] "DATE" "COM" "NSEQ"
#> [16] "EFFN" "CKSUM" "GA"
#> [19] "TC" "NC" "STATS LOCAL MSV"
#> [22] "STATS LOCAL VITERBI" "STATS LOCAL FORWARD"System models
Read system models with read_padloc_model().
path <- padlocdev_example("padloc-db/sys/brex_type_I.yaml")
model <- read_padloc_model(path)System models are also stored as list objects with each
element corresponding to a field in the original model file.
names(model)
#> [1] "force_strand" "maximum_separation" "minimum_core"
#> [4] "minimum_total" "core_genes" "secondary_genes"
#> [7] "neutral_genes" "prohibited_genes"
model[["core_genes"]]
#> [1] "BrxA" "BrxB" "BrxC" "PglX" "PglZ" "BrxL"Reading multiple files
Wrappers are provided for reading all of the system model or profile
HMM files in a directory into a list. The name of each
element in the list is derived from the name of the file
read-in.
Use multi_read_padloc_model() to read all system
models.
path <- padlocdev_example("padloc-db/sys")
models <- multi_read_padloc_model(path)
names(models)
#> [1] "brex_other" "brex_type_I" "brex_type_II" "brex_type_III"
#> [5] "brex_type_IV" "brex_type_V" "brex_type_VI" "DRT_class_I"
#> [9] "DRT_class_II" "DRT_class_III" "DRT_other" "DRT_type_I"
#> [13] "DRT_type_II" "DRT_type_III" "DRT_type_IV" "DRT_type_V"
#> [17] "hachiman_other" "hachiman_type_I" "hachiman_type_II"
models[["brex_type_I"]]
#> $force_strand
#> [1] FALSE
#>
#> $maximum_separation
#> [1] 3
#>
#> $minimum_core
#> [1] 6
#>
#> $minimum_total
#> [1] 6
#>
#> $core_genes
#> [1] "BrxA" "BrxB" "BrxC" "PglX" "PglZ" "BrxL"
#>
#> $secondary_genes
#> [1] "NA"
#>
#> $neutral_genes
#> [1] "NA"
#>
#> $prohibited_genes
#> [1] "BrxE"Use multi_read_hmm() to read all profile HMMs.
path <- padlocdev_example("padloc-db/hmm")
hmms <- multi_read_hmm(path)
names(hmms)
#> [1] "PDLC00150" "PDLC00151" "PDLC00174" "PDLC00175" "PDLC00195" "PDLC00196"
#> [7] "PDLC00199" "PDLC00200" "PDLC00202" "PDLC00203" "PDLC02171" "PDLC02172"
#> [13] "PDLC02173" "PDLC02174" "PDLC02175" "PDLC02176" "PDLC02177" "PDLC02178"
#> [19] "PDLC02181" "PDLC02182" "PDLC02437" "PDLC02438" "PDLC02459" "PDLC02460"
#> [25] "PDLC02471" "PDLC02472" "PDLC02494" "PDLC02495" "PDLC02496" "PDLC02497"
#> [31] "PDLC02500" "PDLC02501" "PDLC02504" "PDLC02505" "PDLC02506" "PDLC02510"
#> [37] "PDLC02511" "PDLC02512" "PDLC02513" "PDLC02515" "PDLC02516" "PDLC02541"
#> [43] "PDLC02542"For larger databases, it’s more efficient to just read the header information of each HMM.
hmm_headers <- multi_read_hmm_header(path)
names(hmm_headers)
#> [1] "PDLC00150" "PDLC00151" "PDLC00174" "PDLC00175" "PDLC00195" "PDLC00196"
#> [7] "PDLC00199" "PDLC00200" "PDLC00202" "PDLC00203" "PDLC02171" "PDLC02172"
#> [13] "PDLC02173" "PDLC02174" "PDLC02175" "PDLC02176" "PDLC02177" "PDLC02178"
#> [19] "PDLC02181" "PDLC02182" "PDLC02437" "PDLC02438" "PDLC02459" "PDLC02460"
#> [25] "PDLC02471" "PDLC02472" "PDLC02494" "PDLC02495" "PDLC02496" "PDLC02497"
#> [31] "PDLC02500" "PDLC02501" "PDLC02504" "PDLC02505" "PDLC02506" "PDLC02510"
#> [37] "PDLC02511" "PDLC02512" "PDLC02513" "PDLC02515" "PDLC02516" "PDLC02541"
#> [43] "PDLC02542"