vignettes/analysis.Rmd
analysis.Rmd
Beginning an microbiome analysis starts with importing data and wrangling it into a proper format.
# Get the data files from package
input_files <- pkg.data$input_files
biom_file <- input_files$biom_files$silva # Path to silva biom file
tree_file <- input_files$tree_files$silva # Path to silva tree file
metadata_file <- input_files$metadata$two_groups # Path to Nephele metadata
parse_func <- parse_taxonomy_silva_128 # A custom phyloseq parsing function for silva annotations
# Get the phyloseq object
phy_obj <- create_phyloseq(biom_file = biom_file,
tree_file = tree_file,
metadata_file = metadata_file,
parse_func = parse_func)
# Get the taxmap object in the raw format
raw_metacoder <- as_MicrobiomeR_format(obj = phy_obj, format = "raw_format")
After importing data and formatting it, data should be filtered to reduce noise and if desired, to subset data.
# Remove Archaea from the taxmap object
metacoder_obj <- taxa::filter_taxa(obj = raw_metacoder,
taxon_names == "Archaea",
subtaxa = TRUE,
invert = TRUE)
# Ambiguous Annotation Filter - Remove taxonomies with ambiguous names
metacoder_obj <- metacoder::filter_ambiguous_taxa(metacoder_obj,
subtaxa = TRUE)
# Low Sample Filter - Remove the low samples
# The sample filter should generally be implemented first
metacoder_obj <- sample_id_filter(obj = metacoder_obj,
.f_filter = ~sum(.),
.f_condition = ~.>= 20,
validated = TRUE)
# Master Threshold Filter - Add the otu_proportions table and then filter OTUs based on min %
metacoder_obj <- otu_proportion_filter(obj = metacoder_obj,
otu_percentage = 0.00001)
# Taxon Prevalence Filter - Add taxa_abundance and taxa_proportions and then filter OTUs that do not
# appear more than a certain amount of times in a certain percentage of samples at the specified
# agglomerated rank. This is considered a supervised method, because it relies on intermediate
# taxonomies to filter the data.
# The default minimum abundance is 5 and the sample percentage is 0.5 (5%).
# Phylum
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Phylum")
# Class
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Class",
validated = TRUE)
# Order
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Order",
validated = TRUE)
# OTU Prevalence Filter - Filter OTUs that do not appear more than a certian amount of times in a
# certain percentage of samples. This is considered an unsupervised method, because it relies only
# on the leaf OTU ids to filter the data.
metacoder_obj <- otu_prevalence_filter(obj = metacoder_obj,
validated = TRUE)
# Coefficient of Variation Filter - Filter OTUs based on the coefficient of variation
metacoder_obj <- cov_filter(obj = metacoder_obj,
coefficient_of_variation = 3,
validated = TRUE)
Analysis is primarily done with metacoder, MicrobiomeR, and ggplot2. Before beginning the analysis it’s wise to create an output directory. Use end_path=FALSE
with MicrobiomeR’s output_dir()
function to avoid the creation of a date formatted directory.
# Create a directory for whichever plot you want to save
heat_tree_path <- output_dir(end_path = FALSE, start_path = "output", plot_type = "heat_tree")
corr_plot_path <- output_dir(end_path = FALSE, start_path = "output", plot_type = "correlation")
sb_plot_path <- output_dir(end_path = FALSE, start_path = "output", plot_type = "stacked_barplot")
ord_plot_path <- output_dir(end_path = FALSE, start_path = "output", plot_type = "ordination")
Statistical analysis is primarily done with the help of metacoder style functions such as the calc_*()
group of functions, and compare_groups()
. The taxa function taxonomy_table()
is also useful for matching stats with the proper taxonomic annotation. MicrobiomeR creates the proper tables with as_MicrobiomeR_format(format = "analyzed_format", ...)
.
# Get the statistical observation data.
metacoder_obj <- as_MicrobiomeR_format(obj = metacoder_obj, format = "analyzed_format")
In addition to standard statistical analysis provided by metacoder, MicrobiomeR
simplifies alpha diversity, ordination, and PERMANOVA analysis.
# Generate alpha diversity measures
measures <- alpha_diversity_measures(obj = metacoder_obj)
measures$Shannon
#> [1] 5.977645 5.807758 5.924257 5.651563 5.712745 5.677156 6.107177
#> [8] 5.263318 6.040767 5.963451 6.011352 6.238364 6.061033 6.213808
#> [15] 6.111409 5.930385 5.638817 5.670399 5.665353 5.949696 6.213651
#> [22] 5.674047 6.053098 5.818605 6.134535 6.175986 5.681311 5.892901
#> [29] 5.561349 6.118321 6.033960 4.994411 6.048751 5.474699 6.004132
#> [36] 5.720120 5.309873 5.752245 5.526904 5.698600 5.380529 5.641993
#> [43] 5.469683 5.868113 5.453688 5.777398 5.952376 4.249344
# Get only ordination data for the first Axis based on principal coordinates analysis using weighted unifrac.
ordination <- ordination_plot(obj = metacoder_obj, method = "PCoA", distance = "wunifrac", only_data = TRUE)
ordination$Axis.1
#> [1] 0.1480991727 -0.0530880826 0.1192335362 -0.1045700493 0.0491129449
#> [6] 0.1265867917 -0.0345322993 -0.1174505365 -0.0377552818 -0.0927269385
#> [11] -0.0235620393 0.0008706629 -0.0723557471 -0.0282243478 0.0740555158
#> [16] -0.0524377243 -0.1037144750 -0.1418732833 -0.1202257911 0.0617099739
#> [21] -0.0390451135 -0.0432635870 -0.0058600684 -0.0552540161 0.0415570399
#> [26] -0.0462610529 0.2380605736 0.1145329575 0.0690412299 -0.0336046151
#> [31] -0.0495665644 0.1856307251 -0.0617423548 0.1775513193 0.2054672317
#> [36] -0.0777818491 0.0076295756 -0.1458834521 0.0797228129 -0.0696594057
#> [41] -0.0257490009 -0.1183574855 -0.0721215576 0.0042024229 0.1547426859
#> [46] 0.1580175051 -0.0155816171 -0.1735763413
# Generate permanova statistics for data
permanova <- permanova(obj = metacoder_obj, group = "TreatmentGroup")
permanova$permanova$aov.tab
#> Permutation: free
#> Number of permutations: 99
#>
#> Terms added sequentially (first to last)
#>
#> Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
#> TreatmentGroup 1 0.7051 0.70506 4.8736 0.0958 0.01 **
#> Residuals 46 6.6548 0.14467 0.9042
#> Total 47 7.3598 1.0000
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Visualization of taxmap objects can be done in several ways, and MicrobiomeR
offers the most complete visualizations of any other existing microbiome package. The metacoder package primarily produces heat_tree()
s for visualization, which can be used for any taxmap object. MicrobiomeR does this as well, but takes care of creating default values that we enjoyed in our heat tree plots.
MicrobiomeR also uses ggplot2 to create correlation_plot()
s.
# Generate heat_trees
heat_tree_plots <- heat_tree_plots(metacoder_obj,
rank_list = c("Phylum", "Class", "Order"),
node_label = ifelse(wilcox_p_value > 0.05, taxon_ids, NA),
node_label_size = 2,
node_label_color = c("darkgreen"))
names(heat_tree_plots)
#> [1] "metacoder_object" "heat_trees" "taxmaps"
# Generate correlation_plots
corr_plots <- correlation_plots(metacoder_obj, primary_ranks = c("Phylum", "Class", "Order"))
names(corr_plots)
#> [1] "Phylum" "Class" "Order"
# Generate stacked_barplots
sb_plots <- stacked_barplots(metacoder_obj, tax_levels = c("Phylum", "Class", "Order"))
names(sb_plots)
#> [1] "Phylum" "Class" "Order"
# Save plots with a custom output path
save_heat_tree_plots(htrees = heat_tree_plots, custom_path = heat_tree_path)
save_correlation_plots(corr = corr_plots, custom_path = corr_plot_path)
save_stacked_barplots(sb_plots = sb_plots, custom_path = sb_plot_path)
save_ordination_plots(ord_plots = ordination, custom_path = ord_plot_path)
# Heat Trees
input_files <- pkg.data$input_files
metadata_file <- input_files$metadata$three_groups
biom_file <- input_files$biom_files$silva # Path to silva biom file
tree_file <- input_files$tree_files$silva # Path to silva tree file
parse_func <- parse_taxonomy_silva_128 # A custom phyloseq parsing function for silva annotations
# Get the phyloseq object
phy_obj <- create_phyloseq(biom_file = biom_file,
tree_file = tree_file,
metadata_file = metadata_file,
parse_func = parse_func)
# Get the taxmap object in the raw format
raw_metacoder <- as_MicrobiomeR_format(obj = phy_obj, format = "raw_format")
# Remove Archaea from the taxmap object
metacoder_obj <- taxa::filter_taxa(obj = raw_metacoder,
taxon_names == "Archaea",
subtaxa = TRUE,
invert = TRUE)
# Ambiguous Annotation Filter - Remove taxonomies with ambiguous names
metacoder_obj <- metacoder::filter_ambiguous_taxa(metacoder_obj,
subtaxa = TRUE)
# Low Sample Filter - Remove the low samples
# The sample filter should generally be implemented first
metacoder_obj <- sample_id_filter(obj = metacoder_obj,
.f_filter = ~sum(.),
.f_condition = ~.>= 20,
validated = TRUE)
# Master Threshold Filter - Add the otu_proportions table and then filter OTUs based on min %
metacoder_obj <- otu_proportion_filter(obj = metacoder_obj,
otu_percentage = 0.00001)
# Taxon Prevalence Filter - Add taxa_abundance and taxa_proportions and then filter OTUs that do not
# appear more than a certian amount of times in a certain percentage of samples at the specified
# agglomerated rank. This is considered a supervised method, because it relies on intermediate
# taxonomies to filter the data.
# The default minimum abundance is 5 and the sample percentage is 0.5 (5%).
# Phylum
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Phylum")
# Class
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Class",
validated = TRUE)
# Order
metacoder_obj <- taxa_prevalence_filter(obj = metacoder_obj,
rank = "Order",
validated = TRUE)
# OTU Prevalence Filter - Filter OTUs that do not appear more than a certian amount of times in a
# certain percentage of samples. This is considered an unsupervised method, because it relies only
# on the leaf OTU ids to filter the data.
metacoder_obj <- otu_prevalence_filter(obj = metacoder_obj,
validated = TRUE)
# Coefficient of Variation Filter - Filter OTUs based on the coefficient of variation
metacoder_obj <- cov_filter(obj = metacoder_obj,
coefficient_of_variation = 3,
validated = TRUE)
metacoder_obj <- as_MicrobiomeR_format(obj = metacoder_obj, format = "analyzed_format")
# Generate heat_trees
heat_tree_plots <- heat_tree_plots(metacoder_obj,
rank_list = c("Phylum", "Class", "Order"),
node_label = ifelse(wilcox_p_value < 0.05, taxon_ids, NA),
node_label_size = 2,
node_label_color = c("darkgreen"))
heat_trees <- heat_tree_plots$heat_trees
heat_trees$Phylum