Bo Yuan Oct 2019 updated


The R code has been developed with reference to R for Data Science (2e), and the official documentation of tidyverse, and DataBrewer.co. See breakdown of modules below:

  • Data visualization with ggplot2 (tutorial of the fundamentals; and data viz. gallery).

  • Data wrangling with the following packages: tidyr, transform (e.g., pivoting) the dataset into tidy structure; dplyr, the basic tools to work with data frames; stringr, work with strings; regular expression: search and match a string pattern; purrr, functional programming (e.g., iterating functions across elements of columns); and tibble, work with data frames in the modern tibble structure.


library(readxl)
library(tidyr)
library(dplyr)
library(stringr)
library(rebus)
library(ggplot2)
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
library(gridExtra)
library(cowplot)
path = "/Users/Boyuan/Desktop/Manuscript files/5th_AIV leaf QQQ_JFC/Oct 2019 JFC/NS_leaf_PhytochemQQQ dataset.xlsx"

1 Contrast analysis

1.1 Tidy up

d = read_excel(path, sheet = "Contrast.analysis")

# gather compounds
d = d %>% 
  gather(`chlorogenic acid`, quercetin, kaempferol, rhamnetin, isorhamnetin, solasodine, diosgenin, tigogenin,
         key = compound, value = data) 
# replace N.D. with zero values
d$data = d$data %>% str_replace(pattern = "N.D.", replacement = "0 ± 0")
# plit data into mean and standard deviation
d = d %>% separate(data, into = c("mean", "std"), sep = " ± ")
# convert mean and std to numeric value
d$mean = d$mean %>% as.numeric()
d$std = d$std %>% as.numeric()
# arrange order of display
d$compound  = d$compound %>%
  factor(levels = (c("chlorogenic acid", "quercetin", "kaempferol", "rhamnetin", 
                     "isorhamnetin", "solasodine", "diosgenin", "tigogenin")), 
         ordered = T)

# NOTICE HERE!!
d = d %>% filter(ID != "PI 312110" & compound != "rhamnetin")
d = d %>% filter(compound != "rhamnetin")

1.2 Visualization

1.2.1 Define plotting function

# define plot function
myplot = function(dataset, category){
  dataset %>%
    ggplot(aes_string(x = 1, y = "mean", color = category, fill = category), alpha = 0.2) + 
    geom_boxplot(alpha = .2, outlier.alpha = 0) +
    facet_wrap(~compound, nrow = 1, strip.position = "bottom", scales = "free") +
    # format
    theme_classic() + 
    theme(axis.text = element_text(color = "black", size = 12), 
          axis.title = element_text(face = "bold"),
          legend.title =  element_blank(), legend.position = "right", 
          legend.text = element_text(colour = "black", size = 12),
          strip.background = element_blank(), strip.text = element_text(face = "bold"), 
          # remove all y-axis related (not for plot, not for aesthetic)
          axis.text.x = element_blank(), axis.line.x = element_blank(), 
          axis.ticks.x = element_blank(), axis.title.x = element_blank(), 
          axis.title.y = element_text(size = 12, face = "bold")) +
    # add mean!
    stat_summary(fun.y = mean, geom = "point", shape = 23, 
                 position = position_jitterdodge(0), 
                 color = "black", size = 3, stroke = 0.8)+
    scale_y_continuous(breaks = scales::pretty_breaks(7), 
                       limits = c(NA, NA)) + labs(y = "Content (mg/100g DW)") +
    scale_color_brewer(palette = ifelse(category == "Species", "Set1", "Set2")) +
    scale_fill_brewer(palette = ifelse(category == "Species", "Set1", "Set2")) +
    
    geom_text(aes(label = `Sample No.`), 
              position = position_jitterdodge(0.25), size = 3.4, fontface = "bold")
}

1.2.2 Plotting

# SPECIES CONTRAST  
# There are four accessions of S. nigrum. Only one planted in Kenya, while all four planted in RU. 
# THus compare those planted in RU.
# This dataframe contains species planted in RU: 3 nigrumw with the peculiar nigrum USDA 312110 excluded), & 8 scabrum
species.contrast.df = d %>% filter(`CultivationSite` == "RU" & Species != "N.D.")
plt.species = myplot(species.contrast.df, category = "Species") 

# Cultivation site (environment) CONTRAST  
# Only those IDs cultivated both in RU and Kenya are included for comparison.
# The comparison includes a total of 6 comparisions, all from from scabru 
selected.ID = c("BG 29", "Ex Hai", "BG 16", "SS 49 (Olevolosi)", "SS 04.2", "SS 52")
cultivationSite.contrast.df = d %>% filter(ID %in% selected.ID)
plt.cultivationSite = myplot(cultivationSite.contrast.df, category = "CultivationSite")

# combining two plot together
grid.arrange(plt.species, plt.cultivationSite, nrow = 2)

1.3 Scheffe’s contrast

1.3.1 Define contrast function

# category = "Species", group1 = "S. nigrum", group2 = "S. scabrum", alpha = 0.5, etc.
# There should be only two groups in a given category
calculateP = function(loop.df, alpha, category, group1, group2){ 
  n.group1 = ( loop.df[[category]] == group1 ) %>% sum()
  n.group2 = ( loop.df[[category]] == group2) %>% sum() 
  
  # print contrasted groups of the given category
  print(paste("Contrast =", group1, "-", group2))
  
  # set up Ci and Ci-related terms
  Ci = c()
  for (i in 1:nrow(loop.df)){
    if( loop.df[[category]] [i] == group1){Ci[i] = (1/n.group1) 
    } else { Ci[i] = - 1/ n.group2 }
  }
  
  Contrast = sum(Ci * loop.df$mean)
  `Sum.Ci^2` = sum(Ci^2)
  
  # MSE and critical value
  n.treatment = nrow(loop.df)
  df.error = (3-1) * n.treatment
  SSE = ((loop.df$std)^2*(3-1)) %>% sum()
  MSE = SSE / df.error
  critic.S = sqrt(  MSE/3 * `Sum.Ci^2`  ) * 
    sqrt( (n.treatment - 1) * qf(p = 1- alpha, df1 = (n.treatment - 1), df2 = df.error) ) 
  # notice alpha input
  
  # Significant or not?
  ifsignificant = ifelse(
    abs(Contrast) >= critic.S, "Yes! Significant!", "NO...."
  )
  
  # print
  
  cat(unique(loop.df$compound) %>% 
        as.character(), # factor as character, otherwise output level of factor
      ": Critical value = ", round(critic.S, 3), 
      ", Contrast =", round(Contrast, 3), ifsignificant, "\n\n") 
}

1.3.2 Calculate contrast statistics

# compounds to loop through
myCompounds = d$compound %>% unique()

1.3.2.1 contrast between species

for (i in myCompounds){
  loop.df = species.contrast.df %>% filter(compound == i) 
  calculateP(loop.df, alpha = 0.6, 
             category = "Species", group1 = "S. nigrum", group2 = "S. scabrum")
}
## [1] "Contrast = S. nigrum - S. scabrum"
## chlorogenic acid : Critical value =  31.932 , Contrast = 11.372 NO.... 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## quercetin : Critical value =  26.757 , Contrast = 13.824 NO.... 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## kaempferol : Critical value =  9.031 , Contrast = -10.293 Yes! Significant! 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## isorhamnetin : Critical value =  0.703 , Contrast = 0.869 Yes! Significant! 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## solasodine : Critical value =  1.667 , Contrast = 9.723 Yes! Significant! 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## diosgenin : Critical value =  5.541 , Contrast = 8.8 Yes! Significant! 
## 
## [1] "Contrast = S. nigrum - S. scabrum"
## tigogenin : Critical value =  32.492 , Contrast = 32.646 Yes! Significant!

1.3.2.2 contrast between cultivation site

for (i in myCompounds){
  loop.df = cultivationSite.contrast.df %>% filter(compound == i) 
  calculateP(loop.df, alpha = 0.001, 
             category = "CultivationSite", group1 = "RU", group2 = "Kenya")
}
## [1] "Contrast = RU - Kenya"
## chlorogenic acid : Critical value =  77.88 , Contrast = 58.585 NO.... 
## 
## [1] "Contrast = RU - Kenya"
## quercetin : Critical value =  73.267 , Contrast = -62.11 NO.... 
## 
## [1] "Contrast = RU - Kenya"
## kaempferol : Critical value =  17.43 , Contrast = -3.432 NO.... 
## 
## [1] "Contrast = RU - Kenya"
## isorhamnetin : Critical value =  1.609 , Contrast = -0.672 NO.... 
## 
## [1] "Contrast = RU - Kenya"
## solasodine : Critical value =  0.014 , Contrast = 0.018 Yes! Significant! 
## 
## [1] "Contrast = RU - Kenya"
## diosgenin : Critical value =  3.233 , Contrast = 9.95 Yes! Significant! 
## 
## [1] "Contrast = RU - Kenya"
## tigogenin : Critical value =  80.1 , Contrast = 141.198 Yes! Significant!

2 Variance partition

2.1 Effect of plant varieties

# SS between accessions
SS.accessions = species.contrast.df %>% 
  group_by(compound, Species) %>%
  mutate(Species.mean = mean(mean), 
         SS.accessions = (mean - Species.mean)^2*3) %>%
  group_by(compound) %>%  
  summarise(SS.accessions = sum(SS.accessions))

# SS between species
species.count = (species.contrast.df %>% 
                   filter(compound == "chlorogenic acid"))$Species %>% 
  table() %>% as.data.frame() %>%
  rename(Species = ".", counts = Freq)

SS.species = species.contrast.df %>% 
  group_by(compound) %>% mutate(grandmean = mean(mean)) %>%
  dplyr::group_by(compound, Species) %>% 
  summarise(Species.mean = mean(mean), 
            grandmean = unique(grandmean)) %>%
  
  inner_join(species.count, by = "Species") %>% 
  mutate(SS = (Species.mean - grandmean)^2 * counts * 3) %>%
  group_by(compound) %>% summarise(SS.species = sum(SS))
## Warning: Column `Species` joining character vector and factor, coercing
## into character vector
## SSE 
SSE = species.contrast.df %>% mutate(diff.square = std^2 * (3-1)) %>%
  group_by(compound) %>% summarise(SSE = sum(diff.square))


## combine SS together to get the SST
SS.df = cbind(SS.species, SS.accessions, SSE)
SS.df = SS.df[, !(SS.df %>% colnames() %>% duplicated())] %>% 
  mutate(SST = SS.species + SS.accessions + SSE)
# import most original data (showing triplicated measurements) 
raw = read_excel(path, sheet = "R_plant conc.mg 100g.DW") %>%
  filter(ID != "PI 312110")

# Calculate the generic SST to confirm calculation correctness
raw.species = raw %>% filter(`CultivationSite` == "RU" & Species != "N.D." ) %>%
  gather(quercetin, kaempferol, rhamnein, isorhamnetin, solasodine, diosgenin, tigogenin, 
         key = compound, value = content)

raw.species.SST = raw.species %>% 
  group_by(compound) %>% 
  mutate(SST = (content - mean(content))^2 ) %>%  
  summarise(SST.generic = sum(SST))

# This basically confirms the correctness of my calculation. 
# The larger the content magnitude, the less susceptible the calculation is to rounding error

inner_join(SS.df, raw.species.SST) %>% 
  mutate(error.percent = (SST - SST.generic)/SST * 100)
## Joining, by = "compound"
## Warning: Column `compound` joining factor and character vector, coercing
## into character vector
##       compound  SS.species SS.accessions        SSE         SST
## 1    quercetin 1250.886005   429603.7033 12322.1750 443176.7643
## 2   kaempferol  693.396368     6155.0267  1403.8180   8252.2410
## 3 isorhamnetin    4.940028      405.0009     8.5116    418.4525
## 4   solasodine  618.775256      704.4920    47.8488   1371.1160
## 5    diosgenin  506.832001     8972.3888   528.4644  10007.6852
## 6    tigogenin 6975.821023   735140.5134 18170.2368 760286.5713
##   SST.generic error.percent
## 1 443172.1504   0.001041076
## 2   9316.0197 -12.890784976
## 3    418.6207  -0.040209016
## 4    999.7155  27.087460461
## 5  10008.1637  -0.004781342
## 6 760276.0548   0.001383225
# Now plot the partitioned variance for each compound!!!
SS.df = SS.df %>% gather(-1, key = Source, value = value)

SS.df$Source = SS.df$Source %>% 
  str_replace(pattern = "SS.species", replacement = "Species")

SS.df$Source = SS.df$Source %>% 
  str_replace(pattern = "SS.accessions", replacement = "Accessions")

SS.df$Source = SS.df$Source %>%
  str_replace(pattern = "SSE", replacement = "Measurement")

source.levels = data.frame(Source = c("Species", "Accessions", "Measurement"), 
                           level = c(1, 2, 3))

SS.df = inner_join(SS.df, source.levels)

SS.df$Source = SS.df$Source %>% 
  factor(levels = c("Species",  "Accessions", "SST", "Measurement"))

SS.df$compound  = SS.df$compound %>% 
  factor(levels = (c("chlorogenic acid", "quercetin", "kaempferol", "rhamnetin", 
                     "isorhamnetin", "solasodine", "diosgenin", "tigogenin")), 
         ordered = T)

# add contribution percent
SS.df = SS.df %>% filter(Source != "SST") %>% 
  group_by(compound) %>% 
  mutate(percent = (value/sum(value)* 100) %>% round(1))

plt.species.vairance.partition = SS.df %>% filter(Source != "SST") %>%
  ggplot(aes(x = compound, y = percent, fill = Source)) + 
  geom_bar(stat = "identity", position = "stack", alpha = 0.7, 
           color = "black", size = 0.1)  + 
  theme_classic() + 
  theme(axis.text = element_text(color = "black", size = 12),
        axis.title = element_blank(), 
        legend.title = element_blank(),
        legend.text = element_text(size = 12)) +
  scale_fill_brewer(palette = "OrRd") +
  geom_text(aes(label = percent), 
            position = position_stack(0.5), 
            color = "black",  size = 3.1)

plt.species.vairance.partition

2.2 Effect of cultivation environment

# SS of accessions
cultivationSite.contrast.df = cultivationSite.contrast.df %>% 
  group_by(compound) %>%  
  mutate(grandmean = mean(mean)) %>% 
  group_by(compound, ID) %>% 
  mutate(accession.mean = mean(mean), 
         SS.accession = (accession.mean - grandmean)^2*3)

# SS of environment
cultivationSite.contrast.df = cultivationSite.contrast.df %>% 
  group_by(compound, CultivationSite) %>% 
  mutate(cultivationSite.mean = mean(mean), 
         SS.cultivationSite = (cultivationSite.mean - grandmean)^2*3)

# SS of accession & environment interaction
cultivationSite.contrast.df = cultivationSite.contrast.df %>% 
  group_by(compound, CultivationSite, ID) %>%
  mutate(SS.interaction = (mean + grandmean - accession.mean - cultivationSite.mean)^2*3)

# SSE
cultivationSite.contrast.df = cultivationSite.contrast.df %>% 
  mutate(SSE = (std^2) * (3-1))

# SST
cultivationSite.contrast.df = cultivationSite.contrast.df %>% 
  mutate(SST = SSE + SS.cultivationSite + SS.accession + SS.interaction)

# Sum rows up for final SS
cultivationSite.SS.summary = cultivationSite.contrast.df %>% 
  group_by(compound) %>% 
  summarise(SSE = sum(SSE), 
            SS.cultivationSite = sum(SS.cultivationSite),
            SS.accession = sum(SS.accession), 
            SS.interaction = sum(SS.interaction),
            SST = sum(SST))


# check with generic SST to confirm correctness
# recall "selected.ID" variable storing selected IDs for cultivation/environment comparision
raw.CultivationSite = raw %>% 
  filter(ID %in% selected.ID) %>%
  gather(quercetin, kaempferol, rhamnein, isorhamnetin, solasodine, diosgenin, tigogenin,
         key = compound, value = content)

raw.CultivationSite.SST = raw.CultivationSite %>% 
  group_by(compound) %>% mutate(SST = (content - mean(content))^2 ) %>%  
  summarise(SST.generic = sum(SST))

# plot Variance partition !!!
cultivationSite.SS.summary = cultivationSite.SS.summary %>% 
  gather(-1, key = source, value = SS)

cultivationSite.SS.summary$source = cultivationSite.SS.summary$source %>% 
  str_replace(pattern = "SSE", replacement = "Measurement")

cultivationSite.SS.summary$source = cultivationSite.SS.summary$source %>% 
  str_replace(pattern = "SS.cultivationSite", replacement = "Environment")

cultivationSite.SS.summary$source = cultivationSite.SS.summary$source %>% 
  str_replace(pattern = "SS.accession", replacement = "Accession")

cultivationSite.SS.summary$source = cultivationSite.SS.summary$source %>% 
  str_replace(pattern = "SS.interaction", replacement = "Interaction")

# display order
cultivationSite.SS.summary$source = cultivationSite.SS.summary$source %>% 
  factor(levels = c("Environment",  "Accession", "Interaction", "Measurement"))

cultivationSite.SS.summary$compound  = cultivationSite.SS.summary$compound %>% 
  factor(levels = (c("chlorogenic acid", "quercetin", "kaempferol", "rhamnetin", 
                     "isorhamnetin", "solasodine", "diosgenin", "tigogenin")), ordered = T)
# contribution percent
cultivationSite.SS.summary = cultivationSite.SS.summary %>% 
  filter(source != "SST") %>%
  group_by(compound) %>%
  mutate(percent = (SS/sum(SS)*100) %>% round(1))

# plotting
plt.cultivationSite.variance.partition =  cultivationSite.SS.summary  %>%  
  ggplot(aes(x = compound, y = percent, fill = source)) +
  geom_bar(stat = "identity", position = "stack", alpha = 0.7, 
           color = "black", size = 0.1)  + 
  theme_classic() + 
  theme(axis.text = element_text(color = "black", size = 12),
        axis.title = element_blank(),
        legend.title = element_blank(),
        legend.text = element_text(size = 12))+
  scale_fill_brewer(palette = "YlGn") + 
  geom_text(aes(label = percent), position = position_stack(0.5), color = "black", size = 3.1)
plt.cultivationSite.variance.partition

3 Content profile heatmap

3.1 Warm up

# set uniform parameter for heatmap
bar_width = 3 # for row and column color side bar

# read data
path <- "/Users/Boyuan/Desktop/My publication/5th. N.S. leaf quant QqQ/R data analysis manuscript data.xlsx"
data <- lapply(excel_sheets(path), read_excel, path = path)
content_df <- data[[1]] %>% as.data.frame()
sample_df <- data[[2]] %>% as.data.frame()

# simplify sample ID name in sample_df
sample_df$ID[7:8] <- rep("SS 49", 2)
sample_df$ID <-  str_replace_all(sample_df$ID, pattern = "-", replacement = "NA")


# remove deviation values in content_df
pattern = "±" %R% optional(SPC) %R% one_or_more(DGT) %R% DOT %R% one_or_more(DGT)
content_mat <-  vapply(content_df, str_replace, pattern = pattern, replacement = "",  
                       character(length = nrow(content_df))) # vapply convert dataframe to matrix

# order columns in alphabetical order
hm_mat <- content_mat[, -1][, colnames(content_df)[2: ncol(content_df)] %>% order()] 

# add row names
# matrix allow identical row names but data frame does not allow identical row names
rownames(hm_mat) <- sample_df$ID 

# add sample code number
rownames(hm_mat) = str_c(rownames(hm_mat), paste0("_", 1:20))

# type conversion: character to numeric
# set up matrix to be fill up 
hm_mat_dbl <- matrix(1: (ncol(hm_mat) * nrow(hm_mat)), nrow = nrow(hm_mat)) 
rownames(hm_mat_dbl) <- rownames(hm_mat)
colnames(hm_mat_dbl) <- colnames(hm_mat)

for (i in 1: nrow(hm_mat)) {
  for (j in 1: ncol(hm_mat)) {
    hm_mat_dbl[i, j] <- as.numeric(hm_mat[i, j])
  }
}

# check warning numbers equal to counts of undetected entries
# total number of "N.D." values for undetected levels
# sum(hm_mat == "N.D.") 

# value transformation
minimum_value <- min(hm_mat_dbl, na.rm = T)

hm_mat_dbl_NAzero <- replace_na(hm_mat_dbl, replace = 0)
hm_mat_transformed <- log10(hm_mat_dbl_NAzero + minimum_value/2) %>% round(digits = 3)

# color of main plot

myblue <- colorRampPalette(c("white",brewer.pal(9, "Blues"), "black"))(50)
# pie(rep(1, length(myblue)), col = myblue)

myblue_ramp2 <- colorRamp2(seq(from = log10(minimum_value), 
                               to = max(hm_mat_transformed), 
                               length.out = 50), myblue)

3.2 column annotation

# convert compound to class
cmpd_class_code <- c("chlorogenic acid" = "phenolic acid", 
                     "diosgenin" = "saponin", 
                     "isorhamnetin" = "flavonol",
                     "kaempferol" = "flavonol", 
                     "quercetin" = "flavonol", 
                     "rhamnetin" = "flavonol",
                     "solasodine" = "alkaloid", 
                     "tigogenin" = "saponin")
cmpd_class <-  cmpd_class_code[colnames(hm_mat_transformed)]

# convert class to color
Set1 <- brewer.pal(9, "Set1")
# pie(rep(1, length(Set1)), col = Set1) # 9 being the darkest color

class_color_code <- c("phenolic acid" = Set1[5],
                      "flavonol" = Set1[3],
                      "saponin" = "steelblue",
                      "alkaloid" = "firebrick")
class_color <- class_color_code[cmpd_class]


# make column sidebar 
col_anno <- HeatmapAnnotation(
  cmpd.class = cmpd_class,
  col = list(cmpd_class = class_color_code),
  annotation_legend_param = list(title = "compound category"),
  
  # boxplot 
  boxplot = anno_boxplot(hm_mat_transformed, 
                         gp = gpar(fill = class_color),axis = T),
  annotation_height = unit.c(unit(bar_width, "mm"), unit(20, "mm")))

3.3 Row annotations

3.3.1 species

red <- brewer.pal(9, "YlOrRd")
# pie(rep(1, length(red)), col = red)
species_color_code <- c("S. scabrum" = "#800026",
                        "S. nigrum" = "#FEB24C", 
                        "N.D." = "#FFFFCC")
species_anno <- rowAnnotation(species = sample_df$Species,
                              col = list(`sample_df$Species` = species_color_code),
                              width = unit(bar_width, "mm"),
                              annotation_legend_param = list(title = "species"))

3.3.2 institution source

sample_df$Source[18:19] <- rep("Simlaw Kenya",2)
sample_df$Source[20] <- "Baker Creek Heirloom"

institute_color_code <- c(
  "WorldVeg" = "#66C2A5",
  "USDA" = "#FC8D62", 
  "Kenyan local market" = "#8DA0CB", 
  "Simlaw Kenya" = "#E78AC3", 
  "Baker Creek Heirloom" = "#A6D854")

institute_anno <- rowAnnotation(source = sample_df$Source,
                                col = list(`sample_df$Source` = institute_color_code),
                                width = unit(bar_width, "mm"),
                                annotation_legend_param = list(title ="seed source"))

3.3.3 cultivation site

greys <- brewer.pal(9, "Greys")
# pie(rep(1, length(greys)), col = greys)
site_color_code <- c("Kenya" = greys[4], "RU" = greys[8])  

cultivation_site_anno <- 
  rowAnnotation(sites = sample_df$`Cultivation site`,
                col = list("sample_df$`Cultivation site`" = site_color_code),
                width = unit(bar_width, "mm"),
                annotation_legend_param = list(title = "cultivation site"))

3.3.4 cultivation year

sample_df$`Harvest time` <-  str_replace_all(sample_df$`Harvest time`, 
                                             pattern = exactly("2017-03-14"),
                                             replacement = "2017 March")

sample_df$`Harvest time` <- str_replace_all(sample_df$`Harvest time`, 
                                            pattern = exactly("2016-07-08"),
                                            replacement = "2016 July")
time <- sample_df$`Harvest time` %>% as.character()
paired <- brewer.pal(12, "Paired")
# pie(rep(1, length(paired)), col = paired)
time_color_code <- c("2017 March" = paired[5],
                     "2016 July" = paired[6])

harvest_time_anno <- rowAnnotation(time = time,
                                   col = list(time = time_color_code),
                                   width = unit(bar_width, "mm"),
                                   annotation_legend_param = list(title = "harvest time"))
# make row stacked bar plot
rowname <- rownames(hm_mat_dbl_NAzero)
rownames(hm_mat_dbl_NAzero) <- NULL
barplot_df <-  hm_mat_dbl_NAzero %>% as.data.frame()
barplot_df$ID <- rowname
barplot_df <- cbind(barplot_df["ID"], barplot_df[1: (ncol(barplot_df) -1) ]) 

barplot_mat <- barplot_df %>% 
  mutate(`phenolic acid` = `chlorogenic acid`,
         flavonol = isorhamnetin + kaempferol + quercetin + rhamnetin,
         alkaloid = solasodine,
         saponin = diosgenin + tigogenin) %>%
  select(`phenolic acid` : `saponin`) %>% 
  data.matrix()

rownames(barplot_mat) <- rowname

barplot_anno <- rowAnnotation(
  barplot = row_anno_barplot(
    barplot_mat, axis = T, 
    gp = gpar(col = NA, fill = c("phenolic acid" = Set1[5],
                                 "flavonol" = Set1[3],
                                 "alkaloid" = "firebrick",
                                 "saponin" = "steelblue"))),
  annotation_width = unit(50, "mm"))

3.4 Draw heatmap main body

# make heatmap  ------------------------------------------
main_plot <- Heatmap(hm_mat_transformed, col = myblue_ramp2,
                     heatmap_legend_param = list(title = "log10 [mg/100 g DW]", 
                                                 color_bar = "continuous"),
                     row_names_gp = gpar(fontsize = 10),
                     row_dend_width = unit(15, "mm"),
                     column_dend_height = unit(15, "mm"),
                     top_annotation = col_anno,
                     cluster_columns = T,
                     cluster_rows = T,
                     rect_gp = gpar(col = "black", lwd = 0.3))
draw(species_anno + institute_anno + 
       cultivation_site_anno + harvest_time_anno + main_plot + barplot_anno, 
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom",
     row_dend_side = "left")

4 Principle component analysis

# recall that sample_df is the sample label data set
rownames(hm_mat_dbl_NAzero) = str_c(sample_df$`Sample No.`, "_", sample_df$ID)

4.1 All accessions

plt.PCA.ALL.accessions = hm_mat_dbl_NAzero %>% scale() %>% prcomp() %>% 
  ggbiplot::ggbiplot(labels = rowname, groups = as.character(sample_df$`Sample No.`)) +
  theme_classic()  + # theme(legend.position = "None") +
  scale_x_continuous(breaks = seq(-2, 3, by = 1)) +
  labs(title = "All 20 accessions") +
  theme(legend.position = "none") +
  stat_ellipse()
plt.PCA.ALL.accessions

4.2 Comparison by species

By Species (planted in RU; all S. nigrum were cultivated in RU, no nigrum in Kenya trail)

RU.site = sample_df$`Cultivation site` == "RU"

plt.PCA.species = hm_mat_dbl_NAzero[RU.site, ] %>% # selecting only those planted in RU 
  scale() %>%prcomp() %>%
  ggbiplot::ggbiplot(labels = rownames(hm_mat_dbl_NAzero[RU.site, ]),
                     groups = (sample_df %>% filter(`Cultivation site` == "RU"))$Species,
                     ellipse = T) +
  theme_classic() + 
  scale_color_brewer(palette = "Dark2") +
  scale_x_continuous() +
  labs(title = "species comparison in RU trail") 

plt.PCA.species

4.3 Comparison by cultivation site

By Cultivation site (comprising 6 pairwise comparision of S. scabrum)

plt.PCA.cultivationSite = 
  # select 6 pairwise S. scabrum in two sites; remove rhamnetin, found lacking in selected cultivars
  hm_mat_dbl_NAzero[1:12, -6] %>% 
  scale() %>% prcomp() %>%
  ggbiplot::ggbiplot(labels = rownames(hm_mat_dbl_NAzero[1:12, -6]), 
                     groups = sample_df$`Cultivation site`[1:12], ellipse = T) +
  theme_classic() + 
  scale_color_brewer(palette = "Set1") + 
  scale_x_continuous() +
  labs(title = "S.scabrum cultivation site comparison")
plt.PCA.cultivationSite

By Cultivation site–grouped by accessions

plt.PCA.cultivationSite_grouped.by.accessions = 
  # select 6 pairwise S. scabrum in two sites; remove rhamnetin, found lacking in selected cultivars
  hm_mat_dbl_NAzero[1:12, -6] %>% 
  scale() %>% prcomp() %>%
  ggbiplot::ggbiplot(var.axes = T,
                     labels = rownames(hm_mat_dbl_NAzero[1:12, -6]), 
                     groups = sample_df$ID[1:12]) +
  theme_classic() + # theme(legend.position = "None") + 
  scale_x_continuous() +  
  labs(title = "S.scabrum cultivation site comparison") 
plt.PCA.cultivationSite_grouped.by.accessions