文章SameStr(三):图3代码

“Publication Figure 3”

百度云盘链接: https://pan.baidu.com/s/15g7caZp354zIWktpnWzWhQ

提取码: 4sh7

Libraries

Standard Import

library(tidyverse)
library(cowplot)
library(scales)
library(ggpubr)

Special

library(ggridges)
library(grid)
# library(Hmisc)
# library(ggExtra)

Paths

bin_dir = '../bin/'
data_dir = '../data/'
results_dir = '../results/'

Custom Scripts / Theme

source(paste0(bin_dir, 'theme_settings.R'))
source (paste0(bin_dir, 'utilities.R'))
source(paste0(bin_dir, 'species_competition_functions.R'))
source(paste0(bin_dir, 'distmat_utils.R'))
source(paste0(bin_dir, 'analysis_metadata.R'))
date <- format(Sys.time(), "%d%m%y")

Import Tables

Metadata

samples.metadata <- read_tsv(paste0(data_dir, 'samples.metadata.tsv'))
microbe.taxonomy <- read_tsv(paste0(data_dir, 'microbe.taxonomy.tsv'))
microbe.metadata <- microbe.taxonomy %>% 
  right_join(., read_tsv(paste0(data_dir, 'microbe.metadata.tsv'))) 

Taxonomy

MetaPhlAn2 tables
combine with taxonomy

mp_species.long <- microbe.taxonomy %>% 
  right_join(., read_tsv(paste0(data_dir, 'samples.mp_profiles.species.tsv')), 
             by = 'species') %>% 
  left_join(., samples.metadata, by = 'Name')

rCDI subset
annotate with metadata

mp_species.rcdi.long <- 
  mp_species.long %>%
  filter(Study %in% c(rcdi.studies)) 

SameStr Data

SameStr Case-Summary table

samples.sstr_data <- read_tsv(paste0(data_dir, 'samples.sstr_data_wmetadata.tsv')) %>% 
  left_join(., microbe.taxonomy, by = 'species')

Strain Finder Data

Strain Finder Tables

Sys.setenv("VROOM_CONNECTION_SIZE" = 10485760)

sf_strain_table <- read_tsv(paste0(data_dir, 'ALM.Strain_Table.tsv')) %>% 
  rename(Sample = '...1') # X1
sf_species_table <- read_tsv(paste0(data_dir, 'ALM.MG_OTU_Bacteria.tsv')) %>% 
  rename(bact = `BACT ID`) %>% 
  rename_all(.funs = funs(str_to_lower(.)))

Mock Data

Multi-Strain Species Population

mock.sstr_data <- read_tsv(paste0(data_dir, 'mock.sstr_data_wmetadata.tsv')) %>% 
  left_join(., microbe.taxonomy, by = 'species')

Fast ANI

FastAni Comparison Data

refs <- read_tsv(paste0(data_dir, 'references.mp_fastani.tsv'))

Set Figure

fig = paste0('Fig_3.')

Specificity of Shared Taxa

Comparing distributions of shared (genera, species, and) strains in related and unrelated sample pairs in order to assess the specificity with which methods call shared strains.

Strain Finder

Format

Taxa per sample matrix

# mg-OTU strain-level matrix
sf_strain_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  column_to_rownames('Sample') %>% 
  as.matrix()

colnames(sf_strain_table.matrix) <- 
  tibble(tax = colnames(sf_strain_table.matrix)) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  group_by(bact) %>% 
  mutate(strain_id = paste0(bact, '.', dense_rank(gdna))) %>% 
  pull(strain_id)

# mg-OTU species-level matrix
sf_species_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>% 
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  pivot_wider(id_cols = 'Sample', names_from = 'bact', values_from = 'rel_abund') %>% 
  column_to_rownames('Sample') %>% 
  as.matrix()

# mg-OTU genus-level matrix
sf_genus_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genus), by = 'bact') %>% 
  group_by(Sample, genus) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  pivot_wider(id_cols = 'Sample', names_from = 'genus', values_from = 'rel_abund') %>% 
  column_to_rownames('Sample') %>% 
  as.matrix()

Sample/Sample co-occurrence matrix at mg-OTU species- and strain-level

# genus
sf_genus_table.dist <- sf_genus_table.matrix %*% t(sf_genus_table.matrix)
sf_genus_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_genus_table.dist), 
                  value_name = 'n.shared_genus',
                  rm_diag = T)
sf_genus_counts <- diag(sf_genus_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.genus = '.')

# species
sf_species_table.dist <- sf_species_table.matrix %*% t(sf_species_table.matrix)
sf_species_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_species_table.dist), 
                  value_name = 'n.shared_species',
                  rm_diag = T)
sf_species_counts <- diag(sf_species_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.species = '.')

# strain
sf_strain_table.dist <- sf_strain_table.matrix %*% t(sf_strain_table.matrix)
sf_strain_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_strain_table.dist), 
                  value_name = 'n.shared_strains',
                  rm_diag = T)
sf_strain_counts <- diag(sf_strain_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.strains = '.')

join data

sf <-
  full_join(sf_species_table.long, sf_strain_table.long, by = c('row','col')) %>% 
  full_join(., sf_genus_table.long, by = c('row','col')) %>% 
  
  left_join(., sf_genus_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_genus_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  left_join(., sf_species_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_species_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  left_join(., sf_strain_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_strain_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  rename(Sample.row = row, 
         Sample.col = col, 
         n.shared_species.sf = n.shared_species,
         n.shared_strains.sf = n.shared_strains, 
         n.shared_genus.sf = n.shared_genus) %>% 
  
  rename_at(.vars = vars(matches('n.*col'), matches('n.*row')),
            .funs = funs(gsub(gsub(., pattern = '.row', replacement = '.sf.row'), 
                              pattern = '.col', replacement = '.sf.col'))) %>%
  left_join(., samples.metadata, by = c('Sample.row' = 'Sample')) %>% 
  left_join(., samples.metadata, by = c('Sample.col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  filter(!is.na(Name.row), !is.na(Name.col)) %>% 
  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  mutate(n.not_shared_strains.sf = (n.strains.sf.row - n.shared_strains.sf) + (n.strains.sf.col - n.shared_strains.sf), 
         n.not_shared_species.sf = (n.species.sf.row - n.shared_species.sf) + (n.species.sf.col - n.shared_species.sf), 
         n.not_shared_genus.sf = (n.genus.sf.row - n.shared_genus.sf) + (n.genus.sf.col - n.shared_genus.sf), 
         n.total_strains.sf = n.shared_strains.sf + n.not_shared_strains.sf,
         n.total_species.sf = n.shared_species.sf + n.not_shared_species.sf, 
         n.total_genus.sf = n.shared_genus.sf + n.not_shared_genus.sf, 
         f.shared_strains.sf = n.shared_strains.sf / n.total_strains.sf,
         f.shared_species.sf = n.shared_species.sf / n.total_species.sf,
         f.shared_genus.sf = n.shared_genus.sf / n.total_genus.sf)

Shared Taxa

Density Ridge Plot based on Strain Finder Data published with the Alm dataset, showing co-occurrence of genera, species, and strains in samples which are common in unrelated but more common in related samples.

sf %>% 
  mutate(related = ifelse(replace_na(Donor.Subject.row == Donor.Subject.col, F), 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('n.shared_genus.sf', 'n.shared_species.sf', 'n.shared_strains.sf')) %>% 
  mutate(metric = case_when(
         metric == 'n.shared_strains.sf' ~ 'StrainFinder/\nmg-OTUs (strain)', 
         metric == 'n.shared_species.sf' ~ 'StrainFinder/\nmg-OTUs (species)',
         metric == 'n.shared_genus.sf' ~ 'StrainFinder/\nmg-OTUs (genus)',
         T ~ metric
       )) %>% 
ggplot(aes(value, fct_relevel(metric, 
                              'StrainFinder/\nmg-OTUs (strain)', 
                              'StrainFinder/\nmg-OTUs (species)', 
                              'StrainFinder/\nmg-OTUs (genus)'), fill = related)) + 
  geom_density_ridges(panel_scaling = F, alpha = 0.75, scale = 5) + 
  scale_fill_manual(values = c('black','white')) +
  scale_x_continuous(trans = pseudo_log_trans(), breaks = c(0, 3, 10, 30, 100, 300)) +
  theme_cowplot() +
  labs(x = 'Shared Bacteria') +
  theme(legend.key.size = unit(c(.75, .5, .75, .5), "lines"),
        legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )

在这里插入图片描述

SameStr

Format

Generate MetaPhlAn co-occurrence table, taxa counts

cooccurrence <- mp_cooccurrence(mp_species.long)
taxa_counts <- cooccurrence$mp_taxa_counts
taxa_cooccurrence <- cooccurrence$mp_taxa_table.long

Merge with SameStr strain-level data

sstr <-
  samples.sstr_data %>% 
  
  group_by(Study.pair, Name.row, Name.col, 
           Case_Name.pair, Case_Name.row, Case_Name.col, Same_Case.label, 
           Related.label, Related.bool, Pair_Type) %>%
  summarize(.groups = 'drop',
            n.analyzed_strains = sum(overlap > min_overlap, na.rm = T),
            n.shared_strains.mvs = sum(distance.mvs > min_similarity & overlap > min_overlap, na.rm = T),
            n.shared_strains.cvs = sum(distance.cvs > min_similarity & overlap > min_overlap, na.rm = T)) %>%

  # tag successful fmt cases
  mutate(fmt_success = !Case_Name.pair %in% cases_failed) %>% 
  mutate(Study.row = str_split_fixed(Name.row, pattern = '_', 2)[,1], 
         Study.col = str_split_fixed(Name.col, pattern = '_', 2)[,1]) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated')) %>% 

  rowwise() %>%
  mutate(Sample_Pair = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  left_join(., taxa_cooccurrence, by = 'Sample_Pair') %>% 
  left_join(., taxa_counts, by = c('Name.row' = 'Sample')) %>% 
  left_join(., taxa_counts, by = c('Name.row' = 'Sample'), suffix = c('.row','.col')) %>% 
  
  mutate(n.not_shared_species = (n.species.row - n.shared_species) + (n.species.col - n.shared_species), 
         n.not_shared_genus = (n.genus.row - n.shared_genus) + (n.genus.col - n.shared_genus), 
         n.not_shared_family = (n.family.row - n.shared_family) + (n.family.col - n.shared_family), 

         n.total_species = n.shared_species + n.not_shared_species, 
         n.total_genus = n.shared_genus + n.not_shared_genus, 
         n.total_family = n.shared_family + n.not_shared_family, 

         f.shared_strains.mvs = replace_na(n.shared_strains.mvs / n.analyzed_strains, 0), 
         f.shared_strains.cvs = replace_na(n.shared_strains.cvs / n.analyzed_strains, 0), 
         f.shared_species = n.shared_species / n.total_species,
         f.shared_genus = n.shared_genus / n.total_genus, 
         f.shared_family = n.shared_family / n.total_family)
  

Subset rCDI Alm data

sstr.alm <-
  sstr %>%
  # ALM only, same as above
  filter(Study.pair == 'ALM')

sstr.control <- 
  sstr %>% 
  # Control only
  filter(Study.row %in% control.studies &
         Study.col %in% control.studies) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated'))

Shared Taxa

Density Ridge Plot based on SameStr Data for the same Alm dataset shown above, showing co-occurrence of genera, species, and strains in samples. Co-occurrence is common in unrelated genera and species, but more separable compared to SF. For shared strains, distributions divide related from unrelated sample pairs which mostly don’t share any strains.

Alm
sstr.alm %>% 

  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('n.shared_genus', 'n.shared_species', 'n.shared_strains.mvs')) %>% 
  mutate(metric = case_when(
         metric == 'n.shared_strains.mvs' ~ 'SameStr/\nMetaPhlAn2 (strain)', 
         metric == 'n.shared_species' ~ 'SameStr/\nMetaPhlAn2 (species)', 
         metric == 'n.shared_genus' ~ 'SameStr/\nMetaPhlAn2 (genus)', 
         T ~ metric
       )) %>% 
ggplot(aes(value, fct_relevel(metric, 
                              'SameStr/\nMetaPhlAn2 (strain)',
                              'SameStr/\nMetaPhlAn2 (species)', 
                              'SameStr/\nMetaPhlAn2 (genus)',), fill = Related.bool)) + 
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  # scale = 4
  scale_x_continuous(trans = pseudo_log_trans(), breaks = c(0, 3, 10, 30, 100, 300)) +

  scale_fill_manual(values = c('black','white')) +
  theme_cowplot() +
  labs(x = 'Shared Bacteria') +
  theme(legend.key.size = unit(c(.75, .5, .75, .5), "lines"),
        legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )

在这里插入图片描述

Most unrelated strains

samples.sstr_data %>% 
  filter(Study.pair == 'ALM') %>% 
  filter(overlap > min_overlap) %>%
  group_by(genus, species, Related.bool) %>% 
  summarize(count = sum(Shared_Strain.bool == T, na.rm = T),
            total = sum(!is.na(Shared_Strain.bool))) %>% 
  mutate(fraction = count / total) %>% 
  filter(total > 20) %>% 
  arrange(Related.bool, desc(fraction))
Control
plot.control <-
  sstr %>% 
  filter(Study.row %in% control.studies &
         Study.col %in% control.studies) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('f.shared_strains.mvs', 'f.shared_strains.cvs')) %>% 
mutate(metric = case_when(
  metric == 'f.shared_strains.mvs' ~ 'SameStr', 
  metric == 'f.shared_strains.cvs' ~ 'Consensus', 
)) %>% 
ggplot(aes(value, Related.text, fill = metric)) + 
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  # scale = 4
  scale_x_continuous(labels = percent_format(), expand = c(0,0), breaks = c(0, 0.5, 1)) + 
  scale_fill_manual(values = c('black','white')) +
  theme_cowplot() +
  labs(x = 'as [%] of analyzed Spp.') + 
  theme(legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )
plot.control + theme(legend.position = 'none')

Exporting Plot

output_name = 'SharedStrains.Control'

ggsave(plot.control + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 2.85, height = 3.25,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))
sstr.control %>% 
  dplyr::select(Related.text, n.shared_strains.mvs) %>% 
  group_by(Related.text) %>%
  summarize_all(.funs = funs(n(), min, max, median, mean, sd))

Tool Comparison

Format

Merge Tool Data

alm_table <-
  sf %>% 
  left_join(., sstr.alm %>% 
  rename_at(.vars = vars(n.shared_strains.mvs,
                         n.shared_species,
                         n.shared_genus,
                         n.shared_family,
                         f.shared_strains.mvs,
                         f.shared_species,
                         f.shared_genus,
                         f.shared_family),
            .funs = funs(paste0(gsub(., pattern = '.mvs', replacement = ''), '.sstr'))) %>% 

  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  dplyr::select(-Name.row, -Name.col, -Case_Name.row, -Case_Name.col)) %>% 
  
  # label study, case
  mutate(Case_Split.row = str_split(Case_Name.row, ';'), 
         Case_Split.col = str_split(Case_Name.col, ';')) %>%
  rowwise() %>%
  mutate(Study.same = Study.row == Study.col, 
         Study.pair = ifelse(Study.same, Study.row, 'Unrelated'),
         Case_Name.same = (Case_Name.col %in% Case_Split.row) | (Case_Name.row %in% Case_Split.col), 
         Case_Name.pair = ifelse(Case_Name.same, intersect(Case_Split.row, Case_Split.col), 'Unrelated')) %>% 
  dplyr::select(-starts_with('Case_Split.')) %>% 
  ungroup() %>% 
  
  # label comparison type + study, case 
  mutate(
    Pair_Type = 'No-Pair',
    Pair_Type = case_when(

      
    (Sample_Type.row == 'recipient' & Sample_Type.col == 'donor') | 
    (Sample_Type.col == 'recipient' & Sample_Type.row == 'donor') ~ 'Pre-FMT/Donor',
    
    (Sample_Type.row == 'post' & Sample_Type.col == 'recipient') | 
    (Sample_Type.col == 'post' & Sample_Type.row == 'recipient') ~ 'Pre-FMT/Post-FMT',
    
    (Sample_Type.row == 'post' & Sample_Type.col == 'donor') | 
    (Sample_Type.col == 'post' & Sample_Type.row == 'donor') ~ 'Donor/Post-FMT',
    
    Sample_Type.row == 'recipient' & Sample_Type.col == 'recipient' ~ 'Pre-FMT/Pre-FMT',
    Sample_Type.row == 'post' & Sample_Type.col == 'post' ~ 'Post-FMT/Post-FMT',
    Sample_Type.row == 'donor' & Sample_Type.col == 'donor' ~ 'Donor/Donor')) %>% 
  
  # label family relationship
  mutate(Same_Case.label = ifelse(Case_Name.same, 'Same Case', 'Distinct Case'),
         Same_Study.label = ifelse(Study.same, 'Same Study', 'Distinct Study'),
         ) %>% 
  
  dplyr::select(Name.paste, contains('n.shared'), starts_with('f.'), ends_with('.pair'), 
         Related.label, Related.bool, Pair_Type) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated'))
alm_table.n <-
  alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('n.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  mutate(taxonomic_level = gsub(taxonomic_level, pattern = 'shared_', replacement = ''),
         taxonomic_level = gsub(taxonomic_level, pattern = 'strains', replacement = 'strain')) %>% 
  dplyr::select(-metric)

alm_table.f <-
  alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('f.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  mutate(taxonomic_level = gsub(taxonomic_level, pattern = 'shared_', replacement = ''),
         taxonomic_level = gsub(taxonomic_level, pattern = 'strains', replacement = 'strain')) %>% 
  dplyr::select(-metric)

Shared Strains n

Number of shared strains between related and unrelated samples of the ALM rCDI cohort based on CVS, MVS of MetaPhlAn2 alignments and Strain Finder on AMPHORA markers

plot.strain_count <-

  alm_table.n %>% 
  filter(taxonomic_level == 'strain') %>% 
  
  mutate(tool = case_when(
    tool == 'sstr' ~ 'MVS',
    tool == 'sf' ~ 'SF',
    tool == 'cvs' ~ 'CVS',
    T ~ tool)) %>%

  mutate(tool = fct_relevel(tool,'SF','CVS','MVS')) %>%

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(1, base = 10),
                     breaks = c(0, 1, 10, 30, 100, 300)
) + 
  theme_cowplot() +
  labs(x = 'Count [n]') +  
  scale_fill_manual(values = c('black', 'white')) + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.6, 0.9))


legend <- cowplot::get_legend(plot.strain_count)
plot.strain_count + theme(legend.position = 'none')
grid.newpage()
grid.draw(legend)

Whereas Strain Finder is finding a lot of co-occurring strains in unrelated samples, the consensus approach is very conservative. Compared to SameStr MVS, consensus finds slightly fewer potential false-positives (means ~.45 vs. .22 strains in unrelated samples) but also misses on a large fraction of potential true-positives (means ~15 vs. 8 in related samples)

alm_table.n %>% 
  filter(taxonomic_level == 'strain') %>% 
  ungroup() %>% 
  mutate(tool = case_when(
    tool == 'sstr' ~ 'MVS',
    tool == 'sf' ~ 'SF',
    tool == 'cvs' ~ 'CVS',
    T ~ tool)) %>% 
  dplyr::select(Name.paste, Related.text, tool, value) %>% 
  pivot_wider(names_from = 'tool', values_from = 'value') %>% 
  mutate(`SF-MVS` = SF - MVS) %>% 
  dplyr::select(-Name.paste) %>% 
  group_by(Related.text) %>% 
  summarize_all(.funs = funs(n(), min, max, median, mean, sd)) %>% 
  group_by(Related.text) %>%
  mutate_all(.funs = funs(round(., 2))) %>% 
  pivot_longer(names_to = 'names', values_to = 'value', cols = matches('.*_.*')) %>% 
  separate(names, into = c('tool','metric'), sep = '_') %>% 
  pivot_wider(names_from = 'metric', values_from = 'value')

Related
CVS 7.99±6.83
MVS 14.77±11.34
SF 78.35±73.50
DIFF 93.13±82.59

Unrelated
CVS 0.22±0.55
MVS 0.45±0.75
SF 34.71±37.52
DIFF 35.16±37.68

Shared Strains %

Fraction of shared strains betweent related and unrelated samples of the ALM rCDI cohort

plot.strain_fraction <-

  alm_table.f %>% 
  filter(taxonomic_level == 'strain') %>% 

  mutate(tool = case_when(
    tool == 'sstr' ~ 'SameStr MVS/\nMetaPhlAn2', 
    tool == 'sf' ~ 'StrainFinder/\nmg-OTUs', 
    tool == 'cvs' ~ 'Consensus/\nMetaPhlAn2', 
    T ~ tool)) %>% 
  
  mutate(tool = fct_relevel(tool,
    'StrainFinder/\nmg-OTUs', 
    'Consensus/\nMetaPhlAn2', 
    'SameStr MVS/\nMetaPhlAn2', 
  )) %>% 

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(base = 10),
                     labels = percent_format()
) + 
  theme_cowplot() +
  labs(x = 'Taxa Shared between Samples [%]') + 
  scale_fill_manual(values = c('black', 'white')) + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.6, 0.9))

plot.strain_fraction + theme(legend.position = 'none')

Exporting Plot

output_name = 'SharedStrains.Alm'

ggsave(plot.strain_count + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 3.5, height = 3.25,
       filename = paste0(results_dir, fig, output_name, '.Counts', '.RidgePlot.pdf'))
ggsave(plot.strain_fraction + theme(legend.position = 'none', axis.line.y = element_blank()), 
       device = 'pdf', dpi = 300, width = 5, height = 4,
       filename = paste0(results_dir, fig, output_name, '.Fraction', '.RidgePlot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 5, height = 5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))

Shared F>G>Spp. %

alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('f.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  dplyr::select(-metric) %>% 
  filter(tool != 'cvs') %>% 
  mutate(taxonomic_level = str_to_title(gsub(taxonomic_level, pattern = 'shared_', replacement = ''))) %>% 
  mutate(tool = case_when(tool == 'sstr' ~ 'SameStr/\nMetaPhlAn2', tool == 'sf' ~ 'StrainFinder/\nmg-OTUs', T ~ tool)) %>% 

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 3) + 
  scale_x_continuous(limits = c(0, 1), labels = percent_format(), expand = c(0,0)) + 
  theme_cowplot() +
  labs(x = 'Fraction of Taxa Shared between Samples') + 
  scale_fill_manual(values = c('black', 'white')) + 
  facet_wrap(~taxonomic_level, nrow = 1, scales = 'free_x') + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.8, 0.9))

Sensitivity of Taxa Detection

Format

Comparison of genera found with MP2/Amphora

counts <-
  full_join(sf_genus_counts, sf_species_counts, by = 'Sample') %>%  
  mutate(Name = paste0('ALM_', Sample, '.pair')) %>% 
  full_join(., taxa_counts %>% 
              rename(Name = 'Sample') %>% dplyr::select(-n.family), 
            by = 'Name', suffix = c('.sf','.mp')) %>% 
  filter(!is.na(n.species.mp), !is.na(n.species.sf))
lm_eqn <- function(df){
    m <- lm(mp ~ sf, df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)~"="~rv, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             rv = format(sqrt(summary(m)$r.squared), digits = 2)))
    as.character(as.expression(eq));
}
counts.data <- counts %>% 
  pivot_longer(names_to = 'metric', values_to = 'n', cols = starts_with('n.')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  dplyr::select(-metric) %>% 
  pivot_wider(names_from = 'tool', values_from = 'n') %>% 
  mutate(difference = mp - sf) 

counts.data.genus <-
  counts.data %>% 
  filter(taxonomic_level == 'genus')

counts.data.species <-
  counts.data %>% 
  filter(taxonomic_level == 'species')
get_stats <- function(df, digits = 2) {
  df.mean <- mean(df, na.rm = T)
  df.sd <- sd(df, na.rm = T)
  df.format <- paste('µ=', round(df.mean, digits = digits),  '±', round(df.sd, digits = digits), sep = '')
  return(
    list(
      'mean' = df.mean, 
      'sd' = df.sd,
      'format' = df.format))
}
digits = 1
stats.g.diff <- get_stats(counts.data.genus$difference, digits = digits)
stats.g.mp <- get_stats(counts.data.genus$mp, digits = digits)
stats.g.sf <- get_stats(counts.data.genus$sf, digits = digits)
stats.s.diff <- get_stats(counts.data.species$difference, digits = digits)
stats.s.mp <- get_stats(counts.data.species$mp, digits = digits)
stats.s.sf <- get_stats(counts.data.species$sf, digits = digits)

Statistics

counts.data %>% 
  dplyr::select(-Sample, -Name) %>% 
  group_by(taxonomic_level) %>% 
  summarize_all(.funs = funs(mean, sd)) %>% 
  dplyr::select(taxonomic_level, starts_with('sf_'), starts_with('mp_'), starts_with('dif')) %>% 
  mutate_at(.vars = vars(everything(), -taxonomic_level),
            .funs = funs(round(., digits = 2)))

genus:
sf = 23.78 ± 16.67
mp = 50.54 ± 15.0

species:
sf = 48.48 ± 33.88
mp = 97.62 ± 39.6

Detected Taxa per Sample

yy.s = 80
yy.g = 30
nudge = 70

plot <-
  counts.data %>% 
  ggplot(aes(y = mp, x = sf, fill = taxonomic_level)) +
  geom_point(shape = 21, col = 'black', size = 5) +

  annotate(geom = 'text', y = yy.s, x = yy.s + nudge + 5, label = stats.s.diff$format, size = 5, parse = F) +
  annotate(geom = 'text', y = yy.g, x = yy.g + nudge, label = stats.g.diff$format, size = 5, parse = F) +
  
  annotate("segment", show.legend = F, 
           arrow = arrow(length = unit(0.3, "cm"), type = "closed"), size = 1, col = 'black',
   y=yy.s, yend=yy.s + stats.s.diff$mean, x=yy.s, xend=yy.s) + 
  annotate("segment", show.legend = F, 
           arrow = arrow(length = unit(0.3, "cm"), type = "closed"), size = 1, col = 'black',
   y=yy.g, yend=yy.g + stats.g.diff$mean, x=yy.g, xend=yy.g) + 
    
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
  
  theme_cowplot() +
  scale_x_continuous(limits = c(0, 200)) + 
  scale_y_continuous(limits = c(0, 200)) + 
  scale_fill_manual(values = c('grey25', 'white')) +
  scale_color_manual(values = c('black', 'black')) + 
  theme(aspect.ratio = 1,
        plot.subtitle = element_text(hjust = 0.5), 
        legend.title = element_blank(), 
        legend.position = c(0.02, 0.9)) +
  labs(y = 'SameStr / MetaPhlAn2', x = 'Strain Finder / mg-OTUs', subtitle = 'Detected Taxa per Sample')
plot

Exporting Plot

output_name = 'DetectedTaxa.Species.Alm'

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 3.75, height = 3.75,
       filename = paste0(results_dir, fig, output_name, '.CorrPlot.pdf'))

Taxa Differences

Number of detected mg-OTUs:
species == 306
genus == 116

sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>% 
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(bact) %>% 
  summarize(present = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  summarize(count = sum(present, na.rm = T)) 

sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genus), by = 'bact') %>% 
  group_by(genus) %>% 
  summarize(present = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  summarize(count = sum(present, na.rm = T)) 

Number of detected MetaPhlAn2 species:
species == 399
genus == 154

mp_species.long %>% 
  dplyr::select(species, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  filter(!grepl(species, pattern = 'unclassified')) %>% 
  group_by(species) %>% 
  summarize(present = sum(rel_abund > 0, na.rm = T)) %>% 
  summarize(count = sum(present > 0, na.rm = T))

mp_species.long %>% 
  dplyr::select(genus, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  filter(!grepl(genus, pattern = 'unclassified')) %>% 
  group_by(genus) %>% 
  summarize(present = sum(rel_abund > 0, na.rm = T)) %>% 
  summarize(count = sum(present > 0, na.rm = T))

Format

found_sf <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genome, genus), by = 'bact') %>% 
  group_by(Sample, genus) %>% 
  summarize(species_count.sf = sum(rel_abund, na.rm = T), .groups = 'drop')
found_mp <-
  mp_species.long %>% 
  dplyr::select(genus, species, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  mutate(genus = gsub(genus, pattern = '_noname', replacement = '')) %>% 
  group_by(genus, Name) %>% 
  summarize(species_count.mp = sum(rel_abund > 0, na.rm =T), .groups = 'drop') %>% 
  mutate(Sample = gsub(str_split_fixed(Name, pattern = '_', 2)[,2], pattern = '.pair', replacement = '')) %>% 
  dplyr::select(-Name)

taxa.genus.important <-
  full_join(found_sf, found_mp) %>% 
  mutate_at(.vars = vars(starts_with('species_count')),
            .funs = funs(ifelse(is.na(.), 0, .))) %>% 
  group_by(genus) %>% 
  summarize(species_count.sf = sum(species_count.sf, na.rm = T), 
            species_count.mp = sum(species_count.mp, na.rm = T), .groups = 'drop') %>% 
  mutate(total = species_count.mp + species_count.sf) %>% 
  arrange(desc(total)) %>% 
  top_n(30, total) %>% 
  pull(genus)

found <- 
  full_join(found_sf, found_mp) %>% 
  mutate_at(.vars = vars(starts_with('species_count')),
            .funs = funs(ifelse(is.na(.), 0, .))) %>% 
  filter(genus %in% taxa.genus.important)

Ridge Plot

plot <- 
  found %>% 
  pivot_longer(names_to = 'metric', values_to = 'count', cols = c('species_count.sf', 'species_count.mp')) %>% 
  mutate(metric = case_when(
    metric == 'species_count.sf' ~ 'mg-OTUs', 
    metric == 'species_count.mp' ~ 'MetaPhlAn2', 
  )) %>% 

  ggplot(aes(count, genus, fill = metric)) + 
  geom_density_ridges(panel_scaling = F, alpha = 0.75, scale = 2) + 
  scale_x_continuous(trans = pseudo_log_trans()) + 
  labs(x = 'Detected Species') + 
  theme_cowplot() + 
  theme(axis.title.y = element_blank()) +
  scale_fill_manual(values = c('black','white')) 

plot + theme(legend.position = 'none')

legend <- cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)

Exporting Plot

output_name = 'DetectedTaxa.Species.Alm'

ggsave(plot + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 4, height = 6,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 1.25, height = 1.5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))
found %>% 
  ungroup() %>% 
  dplyr::select(-Sample) %>% 
  group_by(genus) %>% 
  summarize_all(.funs = funs(mean, sd)) %>% 
  mutate(diff = species_count.sf_mean - species_count.mp_mean) %>% 
  group_by(genus) %>% 
  mutate_all(.funs = funs(round(., 2))) %>% 
  ungroup() %>% 
  rename_at(.vars = vars(starts_with('species_')), 
            .funs = funs(gsub(., pattern = 'species_count.', replacement = ''))) %>% 
  dplyr::select(genus, starts_with('sf'), starts_with('mp'), diff) %>% 
  arrange(diff) %>% 
  head(10)

Genus SF vs. MP

Streptococcus 2.68±4.02 vs. 8.15±2.90 (-5.47)
Lachnospiraceae 0.00±0.00 vs. 4.83±2.54 (-4.83)
Bacteroides 3.87±4.70 vs. 8.44±4.70 (-4.57)
Lactobacillus 1.41±2.37 vs. 5.89±2.69 (-4.48)
Clostridium 2.43±3.25 vs. 6.18±3.57 (-3.75)

SF Majority Strain

sf_min_abund <- function(min_abund) {

  sf_strain_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > min_abund))) %>% # set > .5 filter here to compare SF majority strains
  column_to_rownames('Sample') %>% 
  as.matrix()

colnames(sf_strain_table.matrix) <- 
  tibble(tax = colnames(sf_strain_table.matrix)) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  group_by(bact) %>% 
  mutate(strain_id = paste0(bact, '.', dense_rank(gdna))) %>% 
  pull(strain_id)

# strain
sf_strain_table.dist <- sf_strain_table.matrix %*% t(sf_strain_table.matrix)
sf_strain_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_strain_table.dist), 
                  value_name = paste0('n.shared_strains.', 'min_abund_', min_abund),
                  rm_diag = T)
return(sf_strain_table.long)
}

df <- sf_min_abund(0)
for (m in c(5e-3, 5e-2, 5e-1)) {
  df <- full_join(df, sf_min_abund(m), by = c('row','col')) 
}


related.v <-
  alm_table.n %>% 
  filter(taxonomic_level == 'strain', tool == 'sf') %>% 
  dplyr::select(Name.paste, Related.text)

sf <-
  df %>% 
  
  rename(Sample.row = row, 
         Sample.col = col, 
         ) %>% 
  
  rename_at(.vars = vars(matches('n.*col'), matches('n.*row')),
            .funs = funs(gsub(gsub(., pattern = '.row', replacement = '.sf.row'), 
                              pattern = '.col', replacement = '.sf.col'))) %>%
  left_join(., samples.metadata, by = c('Sample.row' = 'Sample')) %>% 
  left_join(., samples.metadata, by = c('Sample.col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  filter(!is.na(Name.row), !is.na(Name.col)) %>% 
  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  left_join(., related.v) %>% 
  # mutate(related = ifelse(replace_na(Donor.Subject.row == Donor.Subject.col, F), 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('n.shared_strains')) %>% 
  separate(metric, into = c('metric','min_abund'), sep = '.abund_') %>% 
  mutate(min_abund = as.numeric(min_abund))
  

Strain Finder shows a high number of cooccurring strains in unrelated sample pairs. Cooccurrence distributions of related / unrelated sample pairs overlap widely, even when considering only strains that were detected at >=50% strain abundance in the species population - a problem that might be due to the amphora markers.

plot <- 
  sf %>% 
  
  mutate(metric = paste0('>=', round(min_abund * 100, 5), '%')) %>% 

  ggplot(aes(value, fct_reorder(metric, min_abund, .desc = T), fill = Related.text)) + 
  geom_density_ridges(alpha = 0.75, scale = 2) +
  scale_fill_manual(values = c('black','white')) +
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(1, base = 10),
                     breaks = c(0, 1, 10, 30, 100, 300)
) + 
  theme_cowplot() +
  theme(aspect.ratio = 1) + 
  labs(x = 'Shared Strains', y = 'Strain Finder\nStrain Abundance') +
  theme(legend.title = element_blank(),
        legend.position = 'top')
plot

Exporting plot

output_name = 'StrainFinder.StrainCooccurrence'

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 4, height = 4,
       filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))

Mock Multi-Strain Species Population

Detecting strains hidden in Mock Multi-Strain Species Population with varying numbers of strains, target coverage, noise coverage and total coverage.

Format

percent_format_signif <-
  function(x, label.precision=3, label.percent=T) {
    if (label.percent == T) {
      paste0(signif(100 * x, label.precision), "%")
    } else {
      signif(digits = label.precision)
    }
  }

Calculate metrics

df <- 
  mock.sstr_data %>% 

  pivot_longer(names_to = 'metric', 
               values_to = 'distance', 
               cols = c('distance.mvs', 'distance.cvs')) %>% 
  mutate(shared = distance > min_similarity & overlap > min_overlap, 
         should = tsc > 0,
         TP = shared & should,
         FP = shared & !should, 
         TN = !shared & !should, 
         FN = !shared & should, 
         event = case_when(
           TP ~ 'TP', 
           FP ~ 'FP', 
           TN ~ 'TN', 
           FN ~ 'FN',
           T ~ 'Unknown'
         )) %>% 
  mutate(metric = case_when(
    metric == 'distance.cvs' ~ 'Consensus (CVS)',
    metric == 'distance.mvs' ~ 'SameStr (MVS)',
    T ~ 'Unknown'
  )) %>% 
  group_by(metric, tsc, ngc, total_coverage, rel_abund.target) %>% 
  summarize(TP = sum(TP, na.rm = T), 
            FP = sum(FP, na.rm = T), 
            TN = sum(TN, na.rm = T),
            FN = sum(FN, na.rm = T), 
            .groups = 'drop') %>% 
  mutate(Precision = TP / (TP + FP), 
    Recall = TP / (TP + FN), 
    `Precision/Recall` = Precision / Recall, 
    Accuracy = (TP+TN)/(TP+FP+FN+TN),
    `Error Rate` = 1 - Accuracy)

N Mock Observations

nrow(mock.sstr_data)

Accuracy Heatmap

plot <-
  df %>% 
  mutate(`Error Rate` = 
         Hmisc::cut2(`Error Rate`, 
                   cuts = seq(0, 1, 0.2), 
                   minmax = T, 
                   formatfun = scales::percent, 
                   oneval = F) %>% 
         as.character() %>% 
         gsub(pattern = '\\[', replacement = '') %>% 
         gsub(pattern = '\\]', replacement = '') %>% 
         gsub(pattern = '\\)', replacement = '') %>% 
         gsub(pattern = ',', replacement = '-')) %>% 
  filter(ngc > 0) %>% 
  ggplot(aes(
    tsc, 
    as.factor(ngc))
    ) +  
  geom_tile(aes(fill = `Error Rate`), col = 'black') +
  scale_x_continuous(expand = c(0,0),
                     breaks = seq(0, 20, 4)) + 
  scale_y_discrete(expand = c(0,0)) + 
  facet_wrap(~ metric, scales = 'free_y') +
  theme_cowplot() + 
  theme(aspect.ratio = 1,
        strip.background = element_blank(),
        panel.spacing.x = unit(1, "lines"),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(hjust = 0.5),
        legend.position = 'right',
        legend.title = element_text(size = 12),
        legend.key = element_rect(colour = "black")) + 
  labs(x = 'Mean Fold-Coverage of the Target Strain', 
       y = 'Noise Coverage') + 
  scale_fill_manual(values = c('white', 'gray75', 'gray50', 'gray25', 'black'))

plot + theme(legend.position = 'none')

legend <- cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)

Exporting Plot

output_name = 'AccuracyHeatmap.Mock'

ggsave(plot + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 5.5, height = 3,
       filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 1.25, height = 1.5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))

Comparison of average detection accuracy using Consensus vs. SameStr MVS with at least 5x coverage when target strain is dominant (>50% species population) or sub-dominant (15-50% species population)

df %>% 
  mutate(miss_rate = TP / (FN + TP)) %>% 
  filter(rel_abund.target > .5, total_coverage > 5) %>% 
  group_by(metric) %>% 
  summarize(Accuracy = 1 - mean(`Error Rate`, na.rm = T))

df %>% 
  filter(rel_abund.target > .15) %>% 
  filter(rel_abund.target <= .5, total_coverage > 5) %>%
  group_by(metric) %>% 
  summarize(Accuracy = 1 - mean(`Error Rate`, na.rm = T))

Phylogenetic Resolution of MP Markers

Comparing full-genome based Fast-ANI scores to MetaPhlAn2 based similarity

Reference Genomes

Similarity Distribution of Reference Genomes

percent_format_signif <- function(x, label.precision=3, label.percent=T) {
    if (label.percent == T) {
      paste0(signif(100 * x, label.precision), "%")
    } else {
      signif(digits = label.precision)
    }
}

species_ridge <-
  refs %>% 
  filter(distance > .97 | ani > .97) %>%
  pivot_longer(values_to = 'value', names_to = 'metric', cols = c('distance', 'ani')) %>% 
  mutate(metric = case_when(
    metric == 'distance' ~ 'Nucleotide Similarity of\nMetaPhlAn2 Markers',
    metric == 'ani' ~ 'fastANI Identity of\nFull Sequences',
    T ~ metric
  )) %>% 
  ggplot(aes(x = value, 
             y = fct_reorder(genus, genus_avg_marker_len), 
             group = species, 
             fill = species)) +
  geom_density_ridges(scale = 3, 
                     alpha = 0.5, stat = 'binline', show.legend = F,
                     ) + 
  scale_x_reverse(labels = percent_format_signif, 
                  breaks=c(min_similarity, 0.99, 0.98, 0.97),
                  limits = c(NA, 0.97), 
                  expand = c(0.1, 0)) +
  theme_cowplot() + 
  geom_vline(xintercept = min_similarity, show.legend = T, linetype = 'dashed') + 
  guides(fill = guide_legend(ncol = 1)) +
  labs(x = 'Sequence Identity') +
  facet_grid(cols = vars(metric)) +
  theme(strip.background = element_blank(),
        axis.title.y = element_blank())

species_ridge

refs %>% 
  pivot_longer(values_to = 'value', names_to = 'metric', cols = c('distance', 'ani')) %>% 
  group_by(metric) %>% 
  summarize(not_shared = sum(value < min_similarity, na.rm = T),
            shared = sum(value > min_similarity, na.rm = T)) %>% 
  mutate(total = not_shared + shared,
         rate = shared / total * 100)

Correlation of MP marker vs. fastANI based similarity

min = .97
refs.cor <- 
  cor.test(~ ani + distance,
           data = refs %>% 
             filter(!is.na(ani), !is.na(distance)) %>%
             filter(ani > min & distance > min),
           method = 'spearman') %>% 
  broom::tidy(.) %>% 
  mutate(estimate.round = round(estimate, 2), 
         sig = cut(p.value, 
                   breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
                   labels = c("***", "**", "*", "")))
refs.cor

plot.scatter <-
  ggscatter(data = refs %>% 
  filter(!is.na(ani), !is.na(distance)) %>%
  filter(ani > min & distance > min),
  'ani', 'distance', 
  color = 'black',  
  fill = 'black',
  alpha = 0.025,
  xlab = 'Full-Sequence fastANI', ylab = 'MetaPhlAn2 Markers',
  ggtheme = theme_cowplot()) + 
  theme(aspect.ratio = 1) +
  scale_x_continuous(labels = percent_format_signif, breaks=c(min_similarity, 0.99, 0.98, 0.97)) +
  scale_y_continuous(labels = percent_format_signif, breaks=c(min_similarity, 0.99, 0.98, 0.97)) + 
  annotate('text', label = paste0('r=', 
                                  refs.cor$estimate.round,
                                  ' ', 
                                  refs.cor$sig), 
           hjust = 0, 
           x = .97, y = .999, size = 6) + 
  geom_density_2d(col = 'black')

plot <-
  ggExtra::ggMarginal(
    plot.scatter,
    type = 'density',
    color = 'black',
    fill = 'grey75', 
    size = 10)
plot

在这里插入图片描述

Exporting Plots

output_name = 'MetaPhlAn2_FastANI.References'
ggsave(species_ridge, 
       device = 'pdf', dpi = 300, width = 6, height = 8,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 4, height = 4,
       filename = paste0(results_dir, fig, output_name, '.CorrPlot.pdf'))
  • 26
    点赞
  • 30
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

生信学习者2

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值