# Generate files • padlocdev

library(padlocdev)

# Load the master system table from sys_master.txt
sys_master <- read_sys_master("~/Developer/active/padloc_internal/sys_master/sys_master.txt")

# Leave out the "adjacent" and "cas_type_arrays" entries, then keep only the
# columns used for the system information page
filtered <- dplyr::select(
  dplyr::filter(
    sys_master,
    yaml.name != "adjacent",
    yaml.name != "cas_type_arrays"
  ),
  system, yaml.name, notes, references
)

# Split the references column in two steps:
#   1. one row per ";"-delimited entry
#   2. each entry into the text before ":" (authordate) and after it
#      (doi_short); entries with no ":" are aligned to the first column
separated <- tidyr::separate_wider_delim(
  tidyr::separate_longer_delim(filtered, cols = references, delim = ";"),
  cols = references,
  delim = ":",
  names = c("authordate", "doi_short"),
  too_few = "align_start"
)

# Collect the unique short DOIs, discard missing values and literal "NA"
# strings, then convert them to long DOIs (takes about a minute)
long_dois <- doi_to_long_df(
  dplyr::filter(
    dplyr::distinct(separated, doi_short),
    !is.na(doi_short),
    doi_short != "NA"
  )
)

# Fetch the CrossRef data for those DOIs (takes a couple of minutes)
crossref_data <- get_crossref_data(long_dois)

# Run the padlocdev pull helpers (author, date, title, journal) one after the
# other, then keep just those fields together with the short DOI.
# local() keeps the intermediate object out of the global environment.
crossref_data_selected <- local({
  records <- crossref_data_pull_author(crossref_data)
  records <- crossref_data_pull_date(records)
  records <- crossref_data_pull_title(records)
  records <- crossref_data_pull_journal(records)
  dplyr::select(records, author, date, title, journal, doi_short)
})

# Build the markdown reference list: one block-quoted entry per DOI, sorted by
# author, with entries separated by a blank line. Each entry carries an HTML
# anchor named after the short DOI so that table cells can link to it.
reference_list <- local({
  by_author <- dplyr::arrange(crossref_data_selected, author)
  with_entry <- dplyr::mutate(
    by_author,
    joined = paste0(
      "> <a name=\"", doi_short, "\">",
      author, ", *et al.* (", date, ")</a><br>**",
      title, "**<br>*", journal, "*<br>https://doi.org/", doi_short
    )
  )
  stringr::str_flatten(with_entry[["joined"]], collapse = "\n\n")
})

# Join the CrossRef fields onto the per-reference system rows, format the YAML
# name and reference columns as markdown links, then collapse back to one row
# per system.
system_table <- separated %>%
  dplyr::left_join(crossref_data_selected, by = dplyr::join_by(doi_short)) %>%
  dplyr::mutate(
    # Link each YAML name to its file in the padloc-db repository
    yaml.name = paste0(
      "[", yaml.name, "](https://github.com/padlocbio/padloc-db/blob/master/sys/", yaml.name, ".yaml)"
    ),
    references = dplyr::case_when(
      # A usable short DOI: link to the matching anchor in the reference list.
      # Literal "NA" strings are excluded, mirroring the filter applied when
      # the DOI list was built.
      !is.na(doi_short) & doi_short != "NA" ~ paste0(
        "[", author, " (", date, ")](#", doi_short, ")"
      ),
      stringr::str_detect(authordate, "UNPUBLISHED") ~ "Payne (unpublished)"
      # Anything else stays NA and is dropped when the references are collapsed
    )
  ) %>%
  dplyr::summarise(
    # Drop NA entries before collapsing: paste0() would otherwise turn them
    # into the text "NA" inside a multi-reference cell. A system with no
    # usable reference ends up with an empty string.
    references = paste0(references[!is.na(references)], collapse = "<br>"),
    .by = c(system, yaml.name, notes)
  ) %>%
  dplyr::rename(System = system, `YAML name` = yaml.name, Comments = notes, References = references)

# Render the system table as a single markdown string. Missing values and
# literal "NA" strings are blanked out in every column first.
system_table_md <- local({
  blank_missing <- function(col) {
    dplyr::case_when(is.na(col) | col == "NA" ~ "", .default = col)
  }
  cleaned <- dplyr::mutate(
    system_table,
    dplyr::across(dplyr::everything(), blank_missing)
  )
  table_lines <- as.character(knitr::kable(cleaned))
  stringr::str_flatten(table_lines, collapse = "\n")
})

# Assemble the page (heading, intro sentence, system table, reference list)
# and write it to system_info.md
readr::write_file(
  stringr::str_flatten(
    paste0(
      "# Defence system information\n",
      "Click on a YAML name to inspect the relevant file. Click on a reference to see the full reference information.\n",
      system_table_md,
      "\n## References\n",
      reference_list
    )
  ),
  "~/Developer/active/padloc-db/system_info.md"
)


# Placeholder: not implemented yet.
#
# sys_master: currently unused.
#
# Returns NULL.
generate_sys_meta <- function(sys_master) {
  NULL
}

# Build the defence-system model table from the master system table.
#
# sys_master: data frame providing the columns system, yaml.name, notes,
#   example, references and order.
#
# Returns a data frame with the columns system_type, yaml_name, comment,
# genome_accession, genome_id, citation_list and order. Rows whose yaml.name
# matches "adjacent" or "cas_type_arrays" are removed.
generate_defence_system_model_data <- function(sys_master) {
  sys_master %>%
    dplyr::filter(!stringr::str_detect(yaml.name, "adjacent|cas_type_arrays")) %>%
    dplyr::select(
      system_type = system,
      yaml_name = yaml.name,
      comment = notes,
      genome_accession = example,
      citation_list = references,
      order
    ) %>%
    dplyr::mutate(
      # Keyed SHA-256 digest of the example genome accession
      genome_id = openssl::sha256(genome_accession, "padloc-key"),
      # Turn "authordate:doi;authordate:doi" into "doi | doi". Whitespace
      # around ";" is absorbed and the result is squished last, so the
      # separator is always exactly " | " however the input was spaced.
      citation_list = citation_list %>%
        stringr::str_replace_all("\\s*;\\s*", " | ") %>%
        stringr::str_remove_all("\\w{1,}\\d{4}:") %>%
        stringr::str_squish(),
      # NOTE(review): a list containing "UNPUBLISHED" is replaced as a whole,
      # so any DOI listed alongside it is dropped - confirm this is intended.
      citation_list = dplyr::if_else(
        stringr::str_detect(citation_list, "UNPUBLISHED"),
        "Payne (unpublished)",
        citation_list
      )
    ) %>%
    dplyr::select(
      system_type, yaml_name, comment, genome_accession, genome_id,
      citation_list, order
    )
}

# Build the defence-system model data and save it as a CSV file
model_data_defence_system <- generate_defence_system_model_data(sys_master)
readr::write_csv(
  model_data_defence_system,
  "~/Developer/active/padloc_internal/webserver_systems_padloc-db_v2.0.0.csv"
)

# Build the citation model table from CrossRef data.
#
# crossref_data: CrossRef records. After the padlocdev author/date helpers
#   have run, the columns doi_short, author and issued are read.
#
# Returns a data frame with the columns:
#   short - the short DOI
#   long  - "<author> (<year>)", where the year is the first four-digit run
#           found in `issued`
#   link  - "https://doi.org/" followed by the short DOI
generate_citation_model_data <- function(crossref_data) {
  records <- crossref_data_pull_author(crossref_data)
  records <- crossref_data_clean_author(records)
  records <- crossref_data_clean_date(records)

  citations <- dplyr::mutate(
    records,
    short = doi_short,
    link = paste0("https://doi.org/", doi_short),
    long = paste0(author, " (", stringr::str_extract(issued, "[0-9]{4}"), ")")
  )

  dplyr::select(citations, short, long, link)
}

# Build the citation model data and save it as a CSV file
model_data_citation <- generate_citation_model_data(crossref_data)
readr::write_csv(
  model_data_citation,
  "D:/payle484/working/model_data_citations_2023-10-10.csv"
)