In this novel work Method development and validation for analysis of phenolic compounds in fatty complex matrices using enhanced matrix removal (EMR) lipid cleanup and UHPLC-QqQ-MS/MS published in Food Chemistry, we developed a robust method to analyze 55 phenolic compounds in fatty matrices, utilizing QuEChERS and Enhanced Matrix Removal (EMR)-lipid cleanup in 96-well plates, along with UHPLC-QqQ-MS/MS. Method validation across seven high-fat matrices yielded promising recoveries and accuracy, showing potential for high-throughput and sensitive analysis in diverse applications.

This documentation records the R code developed for data analysis and visualization used in the publication.

The R code has been developed with reference to R for Data Science (2e), and the official documentation of tidyverse, and DataBrewer.co. See breakdown of modules below:


Method validation

library(tidyverse)
library(readxl)
library(stringr)
library(rebus)
library(RColorBrewer)
library(cowplot)
library(viridis)
library(ggrepel)

# global plot setup
theme_set(theme_bw() + 
            theme(
              # strip.text = element_text(face = "bold"),
              strip.text = element_blank(),
              strip.background = element_blank(),
              strip.placement = "outside",
              axis.text = element_text(colour = "black", size = 17),
              axis.title = element_text(colour = "black", face = "bold", size = 18),
              panel.grid = element_blank())) 

# input data
path = "/Users/Boyuan/Desktop/My publication/18. Polyphenol EMR J. Food Chem (Zhiya)/ready files/Validation result.xlsx"

d = read_excel(path, sheet = "all")
# str(d)


# remove Catechin analogues of poor recovery at drying step
d2 = d %>% select(-c("C", "EC", "PAC-B2", "4-O-Me-C", "3-O-Me-EC", "4-O-Me-EC"))
# str(d2)



# tidy up
d2.tidy = d2 %>%
  gather(-c(matrix, validation, mean.sd, IScorrection), 
         key = compound, value = value) 

# convert value to percentage
d2.tidy = d2.tidy %>% mutate(value = value * 100) %>%
  spread(key = mean.sd, value = value)

d2.tidy$compound %>% n_distinct()
## [1] 49
# accuracy
d.accuracy = d2.tidy %>% 
  filter(validation %>% str_detect("Accuracy"))

# add spike level
d.accuracy$level = d.accuracy$validation %>% 
  str_extract(pattern = DOT %R% WRD) %>%
  str_extract(WRD) 

d.accuracy = d.accuracy %>%
  mutate(validation = "Accuracy")

# define plot numbers
color.compound = (c(brewer.pal(8, "Dark2"),
                    brewer.pal(11, "Spectral")[-c(5, 6, 7)],# remove light colors
                    "black", "grey") %>% sort() %>%
                    colorRampPalette())(d2.tidy$compound %>% n_distinct()) 
n.matrix = d.accuracy$matrix %>% n_distinct()

# plot accuracy
dg = .5

d.badAccuracy = d.accuracy %>% filter(mean < 0 | mean > 200 | std > 50) 
d.badAccuracy
## # A tibble: 12 x 7
##    matrix      validation IScorrection compound    mean     std level
##    <chr>       <chr>      <chr>        <chr>      <dbl>   <dbl> <chr>
##  1 Avocado     Accuracy   T            4-HCA      -55.2  434.   A    
##  2 Avocado     Accuracy   T            CA         173.    80.0  A    
##  3 Avocado     Accuracy   T            FA         123.   604.   A    
##  4 Avocado     Accuracy   T            4-HCA    -1561.  2106.   B    
##  5 Avocado     Accuracy   T            CA         332.   454.   B    
##  6 Avocado     Accuracy   T            FA       -1042.  2593.   B    
##  7 Beef        Accuracy   T            DHF        236.     9.73 A    
##  8 Horse serum Accuracy   T            HA          95.8   60.1  A    
##  9 Horse serum Accuracy   T            HA          60.9  215.   B    
## 10 Horse serum Accuracy   T            PA         102.    57.9  B    
## 11 Pork liver  Accuracy   T            DHF        234.     9.63 A    
## 12 Pork liver  Accuracy   T            HA         172.    71.6  B
# Showing what data points are removed from plot

plt.accuracy = d.accuracy %>%
  anti_join(d.badAccuracy) %>%
  ggplot(aes(x = level, y = mean, color = compound)) +
  
  
  
  geom_boxplot(outlier.alpha = 0, aes(group = level), fill = "white", width = .8) +
  
  annotate("rect", alpha = .15, fill = "springgreen4",
           xmin = .5, xmax = 2.5, ymin = 80, ymax = 120) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = level), fill = "white", width = .8) +
  
  geom_errorbar(size = .3, 
                aes(ymin = mean - std, 
                    ymax = mean + std),
                position = position_dodge(dg)) +
  
  geom_point(size = 4, position = position_dodge(dg), fill = "white", shape = 21) +
  geom_point(size = 4, position = position_dodge(dg), alpha = 0.3) +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  facet_grid(matrix ~ validation, scales = "free_y", switch = "y") +
  labs(x = "Spike level") +
  theme(legend.position = "none", axis.title.y = element_blank()) +
  geom_segment(aes(x = .5, xend = 2.5, y = 100, yend = 100),
               color = "darkgreen", linetype = "dashed", size = .1)
# plt.accuracy





# recovery
d.recovery = d2.tidy %>%
  filter(validation %>% str_detect("recovery")) 


# cleanup and put in order sample prep step names 
d.recovery$validation = d.recovery$validation %>% str_remove(pattern = "recovery" %R% DOT)

d.recovery$validation = 
  d.recovery$validation %>% factor(
    levels = d.recovery$validation %>% unique() %>% rev(),
    ordered = T)


# check outlier numbers
d.badRecovery = d.recovery %>% filter(mean > 200 | std > 50 | std < 0 )
d.badRecovery
## # A tibble: 28 x 6
##    matrix  validation       IScorrection compound   mean     std
##    <chr>   <ord>            <chr>        <chr>     <dbl>   <dbl>
##  1 Avocado Overall recovery F            4-HCA     1213.  4953. 
##  2 Avocado Overall recovery F            FA       -1101. -7884. 
##  3 Avocado Overall recovery T            4-HBA      133.    54.5
##  4 Avocado Overall recovery T            CA         255.  1249. 
##  5 Avocado Drying           F            4-HCA      553.  2294. 
##  6 Avocado Drying           F            CA         117.    97.3
##  7 Avocado Drying           F            FA        -524. -3814. 
##  8 Avocado Drying           T            4-HCA      120.    56.2
##  9 Avocado Drying           T            CA         185.   978. 
## 10 Avocado Drying           T            FA         126.    75.1
## # … with 18 more rows
# plot
dg2 = .5

plt.recovery = d.recovery %>%
  anti_join(d.badRecovery) %>%
  
  ggplot(aes(x = IScorrection, y = mean, color = compound)) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  annotate("rect", alpha = .15, fill = "springgreen4",
           xmin = .5, xmax = 2.5, ymin = 80, ymax = 120) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  geom_errorbar(size = .3, aes(ymin = mean - std, 
                               ymax = mean + std),
                position = position_dodge(dg2)) +
  
  geom_point(size = 4, position = position_dodge(dg2), fill = "white", shape = 21) +
  geom_point(size = 4, position = position_dodge(dg2), alpha = 0.1) +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  facet_grid(matrix ~ validation, 
             scales = "free_y", switch = "y") +
  labs(x = "IS-correction") +
  theme(legend.position = "none", axis.title.y = element_blank()) +
  geom_segment(aes(x = .5, xend = 2.5, y = 100, yend = 100), 
               color = "darkgreen", linetype = "dashed", size = .1)

# plt.recovery 




# matrix effect
d.matrixEffect = d2.tidy %>%
  filter(validation %>% str_detect("Matrix effect")) 


# check outlier numbers
d.badMatrix = d.matrixEffect %>% filter(mean > 200 | std > 50 | std < 0 )
d.badMatrix
## # A tibble: 14 x 6
##    matrix      validation    IScorrection compound    mean      std
##    <chr>       <chr>         <chr>        <chr>      <dbl>    <dbl>
##  1 Avocado     Matrix effect F            4-HCA     -157.   -638.  
##  2 Avocado     Matrix effect F            CA         425.    295.  
##  3 Avocado     Matrix effect F            FA         161.   1145.  
##  4 Avocado     Matrix effect T            4-HCA    -7085.  -2539.  
##  5 Avocado     Matrix effect T            CA         113.    534.  
##  6 Avocado     Matrix effect T            FA       -7321.  -3416.  
##  7 Beef        Matrix effect F            R3G        417.     22.8 
##  8 Beef        Matrix effect T            DHF        274.      9.25
##  9 Beef        Matrix effect T            R3G        609.     31.8 
## 10 Horse serum Matrix effect F            HA        1579.    370.  
## 11 Horse serum Matrix effect F            PA         216.     56.9 
## 12 Horse serum Matrix effect T            HA          91.2   148.  
## 13 Horse serum Matrix effect T            PA         105.     56.4 
## 14 Pork liver  Matrix effect T            DHF        274.      9.25
# plot
dg3 = .5

plt.MatrixEffect = d.matrixEffect %>%
  anti_join(d.badMatrix) %>%
  
  ggplot(aes(x = IScorrection, y = mean, color = compound)) +
  
  
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  annotate("rect", alpha = .15, fill = "springgreen4",
           xmin = .5, xmax = 2.5, ymin = 80, ymax = 120) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  geom_errorbar(size = .3, aes(ymin = mean - std, 
                               ymax = mean + std),
                position = position_dodge(dg3)) +
  
  
  
  geom_point(size = 4, position = position_dodge(dg3), fill = "white", shape = 21) +
  geom_point(size = 4, position = position_dodge(dg3), alpha = 0.3) +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  facet_grid(matrix ~ validation, 
             scales = "free_y", switch = "y") +
  labs(x = "IS-correction") +
  theme(legend.position = "none", axis.title.y = element_blank()) +
  geom_segment(aes(x = .5, xend = 2.5, y = 100, yend = 100),
               color = "darkgreen", linetype = "dashed", size = .1)


# plt.MatrixEffect




# Processing efficiency
d.processEfficiency = d2.tidy %>%
  filter(validation %>% str_detect("Overall process efficiency")) 


# check outlier numbers
d.badProcessEfficiency = d.processEfficiency %>% filter(mean > 200 | std > 50 | std < 0 )
d.badProcessEfficiency
## # A tibble: 12 x 6
##    matrix      validation                 IScorrection compound    mean      std
##    <chr>       <chr>                      <chr>        <chr>      <dbl>    <dbl>
##  1 Avocado     Overall process efficiency F            4-HCA    -1901.   -674.  
##  2 Avocado     Overall process efficiency F            CA          47.6   197.  
##  3 Avocado     Overall process efficiency F            FA       -1767.  -1063.  
##  4 Avocado     Overall process efficiency T            4-HCA    -1471.  -1985.  
##  5 Avocado     Overall process efficiency T            CA         289.    395.  
##  6 Avocado     Overall process efficiency T            FA        -917.  -2283.  
##  7 Beef        Overall process efficiency T            DHF        203.      9.02
##  8 Horse serum Overall process efficiency F            HA         281.    316.  
##  9 Horse serum Overall process efficiency F            PA          93.6    56.2 
## 10 Horse serum Overall process efficiency T            HA          52.0   184.  
## 11 Horse serum Overall process efficiency T            PA          99.5    56.5 
## 12 Pork liver  Overall process efficiency T            DHF        203.      9.02
# plot
dg4 = .5

plt.processEfficiency = d.processEfficiency %>%
  anti_join(d.badProcessEfficiency) %>%
  
  ggplot(aes(x = IScorrection, y = mean, color = compound)) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  annotate("rect", alpha = .15, fill = "springgreen4",
           xmin = .5, xmax = 2.5, ymin = 80, ymax = 120) +
  
  geom_boxplot(outlier.alpha = 0, aes(group = IScorrection), fill = "white", width = .8) +
  
  
  geom_errorbar(size = .3, aes(ymin = mean - std, 
                               ymax = mean + std),
                position = position_dodge(dg4)) +
  
  geom_point(size = 4, position = position_dodge(dg4), fill = "white", shape = 21) +
  geom_point(size = 4, position = position_dodge(dg4), alpha = 0.3) +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  facet_grid(matrix ~ validation, 
             scales = "free_y", switch = "y") +
  labs(x = "IS-correction") +
  theme(legend.position = "none", axis.title.y = element_blank()) +
  geom_segment(aes(x = 0.5, xend = 2.5, y = 100, yend = 100),
               color = "darkgreen", linetype = "dashed", size = .1)

# plt.processEfficiency





# -------# -------# -------# -------# -------# -------
p1 = plot_grid(plt.accuracy, 
               plt.recovery,
               plt.MatrixEffect, 
               plt.processEfficiency,
               
               nrow = 1,
               rel_widths = c(1, 3.5, 1, 1)) 


# Check outlier compounds and associated matrix
d.badALL = d.badAccuracy %>% select(-level) %>% rbind(d.badMatrix) %>%
  rbind(d.badRecovery) %>% rbind(d.badProcessEfficiency)

# d.badALL %>% as.data.frame()

d.badALL %>% select(matrix, compound) %>%
  table()
##              compound
## matrix        4-HBA 4-HCA CA DHF FA HA PA R3G
##   Avocado         2    12 12   0 12  0  0   0
##   Beef            0     0  0   3  0  0  0   2
##   Horse serum     0     0  0   0  0 10  9   0
##   Pork liver      0     0  0   3  0  1  0   0
# bad criteria: filter(mean > 200 | std > 50 | std < 0 )



# new global setup
theme_set(theme_bw() + 
            theme(
              strip.text = element_text(face = "bold"),
              # strip.text = element_blank(),
              strip.background = element_blank(),
              strip.placement = "outside",
              axis.text = element_text(colour = "black", size = 15.5),
              axis.title = element_text(colour = "black", face = "bold", size = 15.5),
              panel.grid = element_blank(),
              legend.text = element_text(size = 15),
              legend.title = element_text(size = 15, face = "bold"))) 

# ----# -----

# plt.allMatrix.accuracy = d2.tidy %>%
#   filter(validation %>% str_detect("Accuracy")) %>%
#   
#   ggplot(aes(x =  mean, color = validation, fill = validation)) +
#   geom_histogram(binwidth = 5, alpha = .4, position = "dodge") +
#   scale_x_continuous(limits = c(0, 200), breaks = seq(0, 200, 20)) +
#   scale_y_continuous(breaks = seq(0, 200, 10)) +
#   theme(legend.position = "bottom")  
# 
# plt.allMatrix.accuracy


#  2D density plot
d2.tidy$validation = d2.tidy$validation %>% 
  str_replace(pattern = "Accuracy.A", replacement = "Accuracy")

d2.tidy$validation = d2.tidy$validation %>% 
  str_replace(pattern = "Accuracy.B", replacement = "Accuracy")


plt.allMatrix.accuracy.2Ddensity =
  d2.tidy %>% 
  filter(validation %>% str_detect("Accuracy")) %>%
  
  ggplot(aes(x = mean, y = std)) +
  stat_density_2d(aes(fill = stat(density)), geom = "raster", contour = F) +
  geom_point(color = "white", shape = ".", alpha = .4) + 
  # scale_fill_viridis(option = "inferno") + 
  scale_fill_viridis(option = "D") +
  geom_rug(sides = "tr", color = "black", alpha = 0.2) +
  scale_x_log10() + scale_y_log10() + 
  annotation_logticks() +
  
  # label outliers not presented in the box-scatter plot
  geom_text_repel(data = d.badAccuracy,
                  aes(label = paste(compound,"_", matrix, "_", level)),
                  size = 4, color = "snow3")

# plt.allMatrix.accuracy.2Ddensity






# D2 density plot for recovery (Absolute, without IS-correction; extraction, EMR, and drying)
d2.tidy$validation %>% unique()
## [1] "Accuracy"                   "Matrix effect"              "Overall process efficiency" "Overall recovery"          
## [5] "recovery.Drying"            "recovery.EMR"               "recovery.Extraction"
d2.tidy.stepwiseRecovery.absolute = d2.tidy %>%
  filter(validation %in% c("recovery.Extraction", "recovery.Drying", "recovery.EMR")) %>%
  filter(IScorrection == "F")

d2.tidy.stepwiseRecovery.absolute$validation =
  d2.tidy.stepwiseRecovery.absolute$validation %>% str_remove(pattern = "recovery.") %>%
  str_replace(pattern = "Drying", replacement = "Dry")


# simplify bad recovery database
d.badRecoverySimplified = d.badRecovery %>%
  filter(validation != "Overall recovery")

d.badRecoverySimplified$validation = 
  d.badRecoverySimplified$validation %>% 
  str_replace(pattern = "Drying", replacement = "Dry")

# plot
plt.allMatrix.Recovery.2Ddensity = 
  d2.tidy.stepwiseRecovery.absolute %>%
  ggplot(aes(x = mean, y = std)) +
  stat_density_2d(aes(fill = stat(density)), geom = "raster", contour = F) +
  geom_point(color = "white", shape = ".", alpha = .4) + 
  # scale_fill_viridis(option = "inferno") + 
  scale_fill_viridis(option = "D") +
  geom_rug(sides = "tr", color = "black", alpha = 0.2) +
  scale_x_log10() + scale_y_log10() + 
  annotation_logticks() +
  
  # label outliers not presented in the box-scatter plot
  geom_text_repel(data = d.badRecoverySimplified,
                  aes(label = paste(compound,"_", matrix, "_", validation)),
                  size = 2, color = "snow3", arrow = F) +
  facet_wrap(~validation, nrow = 1)

# plt.allMatrix.Recovery.2Ddensity



# d2.tidy.stepwiseRecovery.absolute %>%
#   ggplot(aes(x =  mean, color = IScorrection, fill = IScorrection)) +
#   geom_histogram(binwidth = 5, alpha = .4, position = "dodge") +
#   scale_x_continuous(limits = c(0, 200), breaks = seq(0, 200, 20)) +
#   scale_y_continuous(breaks = seq(0, 200, 10)) +
#   theme(legend.position = "bottom")  +
#   facet_wrap(~validation, nrow = 1)




# check validation success vs. background interference
d.background = read_excel(path, sheet = "background.vs.spike")
d.background = d.background %>%
  gather(-c(matrix, level), key = compound, value = background.spk.ratio)

d.background = d.accuracy %>%
  filter(IScorrection == "T") %>%
  left_join(d.background, by = c("matrix", "compound", "level")) 

# also add back accuracy dataset
d.background = d.background %>% left_join(
  d.badAccuracy %>% select(-mean, -std) %>% mutate(bad = "bad"))


# plot
plt.background.vs.spk = 
  d.background %>%
  ggplot(aes(x = background.spk.ratio, y = std, 
             shape = level, color = matrix, fill = matrix)) +
  scale_x_log10(breaks = c(.001, .01, .1, 1, 10, 100, 1000)) +
  scale_y_log10() +
  annotation_logticks() +
  geom_smooth(aes(group = 1), 
              se = F, color = "black", show.legend = F, size = 2) +
  
  geom_point(size = 4, alpha = .7) +
  geom_point(size = 4, alpha = .7, fill = "white") +
  
  geom_text_repel(data = d.background[!d.background$bad %>% is.na(), ], 
                  aes(label = paste(compound, "-", matrix), 
                      x = background.spk.ratio), fontface = "bold",
                  size = 4.5) +
  
  scale_shape_manual(values = c("A" = 21, "B" = 25)) +
  
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") 

# plt.background.vs.spk



p2 = plot_grid(plt.allMatrix.accuracy.2Ddensity,
               plt.background.vs.spk)
p2

# plot_grid(p1, p2, nrow = 2,
#           rel_heights = c(7, 2))
# p1 small screen 17 X 12
p1

EMR method development

library(rebus)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(cowplot)
library(ggrepel)
library(readxl)
library(RColorBrewer)
library(tidyverse)

theme_set(theme_bw() + theme(axis.text = element_text(colour = "black"),
                             axis.title = element_text(colour = "black", face = "bold"),
                             strip.background = element_blank(),
                             strip.text = element_text(face = "bold")))

path = "/Users/Boyuan/Desktop/My publication/18. Polyphenol EMR J. Food Chem (Zhiya)/Polyphenol EMR All Data.xlsx"

# ACN percentage 
d.opt.pct = read_excel(path, sheet = "ACN percent opt2")
d.opt.pct = d.opt.pct %>%
  gather(-c(`Data File`, Name, `Acq. Date-Time`, solvent.pct), key = compound, value = resp)

x = d.opt.pct %>% filter(solvent.pct == "ctrl") %>%
  group_by(compound) %>%
  summarise(resp.ctrl.mean = mean(resp),resp.ctrl.sd = sd(resp))

y = d.opt.pct %>% filter(solvent.pct != "ctrl") %>%
  group_by(solvent.pct, compound) %>%
  summarise(resp.mean = mean(resp), resp.sd = sd(resp))

d.opt.pct.recovery = y %>% left_join(x, by = c("compound")) %>%
  mutate(recovery.mean = resp.mean / resp.ctrl.mean * 100,
         recovery.sd = sqrt((resp.sd/resp.mean)^2 + (resp.ctrl.sd/resp.ctrl.mean)^2) * recovery.mean  )
d.opt.pct.recovery
## # A tibble: 378 x 8
## # Groups:   solvent.pct [7]
##    solvent.pct compound               resp.mean resp.sd resp.ctrl.mean resp.ctrl.sd recovery.mean recovery.sd
##    <chr>       <chr>                      <dbl>   <dbl>          <dbl>        <dbl>         <dbl>       <dbl>
##  1 60          3-HBA                      862.     46.5          935.          99.9          92.1       11.0 
##  2 60          3-HPAA                    1101.     44.8         1258.          12.5          87.5        3.67
##  3 60          3-HPPA                    1595.     95.6         1804.         104.           88.4        7.35
##  4 60          3-HPVA                    1944.     95.2         2285.          22.4          85.1        4.25
##  5 60          3-Hydroxycinnamic acid    1857.     21.7         2090.          45.6          88.9        2.20
##  6 60          3-Hydroxyhippuric acid    1396.     41.1         1611.          46.5          86.6        3.58
##  7 60          3-Hydroxytyrosol(-)       3021.    128.          3675.         470.           82.2       11.1 
##  8 60          3-O-Me-EC                 2971.     68.6         3302.         106.           90.0        3.56
##  9 60          3,4-diHBA                  256.     63.4          374.         190.           68.5       38.7 
## 10 60          3,4-diHPAA                  27.2    13.1           48.1         28.4          56.6       43.2 
## # … with 368 more rows
plt.ACN.pct = d.opt.pct.recovery %>%
  ggplot(aes(x = solvent.pct, y = recovery.mean, color = compound)) +
  geom_boxplot(outlier.alpha = 0, aes(group = solvent.pct)) +
  # geom_errorbar(aes(ymin = recovery.mean - recovery.sd, ymax = recovery.mean + recovery.sd),
  #               position = position_dodge(.3), size = .2, width = 2) +
  geom_point(position = position_dodge(.4), shape = 21, fill = "white", size = 1) +
  theme(legend.position = "none") +
  # coord_flip(ylim = c(0, 130)) +
  scale_y_continuous(breaks = seq(0, 150, by = 10)) +
  labs(y = "Recovery", x= "Acetonitrile percentage") +
  coord_cartesian(ylim = c(0, 130)) 
# Activation with 200 uL varied ACN percent, without secondary wash

# plt.ACN.pct



# Secondary wash volume
d.opt.wash = read_excel(path, sheet = "2nd wash")
d.opt.wash = d.opt.wash %>% 
  gather(-c(`Data File`, Name, `Acq. Date-Time`, volume), key = compound, value = resp)

x = d.opt.wash %>% filter(volume == "ctrl") 
x = x %>% group_by(compound) %>%
  summarise(resp.ctrl = mean(resp), resp.ctrl.sd = sd(resp))

y = d.opt.wash %>% filter(volume != "ctrl") %>%
  group_by(volume, compound) %>%
  summarise(resp.mean = mean(resp), resp.sd = sd(resp))

d.opt.wash = y %>% left_join(x, by = "compound") %>%
  mutate(recovery.mean = resp.mean / resp.ctrl * 100,
         recovery.sd = sqrt((resp.sd/resp.mean)^2 + (resp.ctrl.sd/resp.ctrl)^2) * recovery.mean )

plt.wash = d.opt.wash %>% 
  ggplot(aes(x = volume, y = recovery.mean, color = compound)) +
  geom_boxplot(outlier.alpha = 0, aes(group = volume)) +
  geom_point(position = position_dodge(.5), shape = 21, fill = "white", size = 1) +
  # geom_errorbar(aes(ymin = recovery.mean - recovery.sd,
  #                   ymax = recovery.mean + recovery.sd),
  #               size = .1, width = .5, position = position_dodge(.5)) +
  theme(legend.position = "None") +
  labs(y = "Recovery", x = "Secondary wash volume (uL)") + 
  coord_cartesian(ylim = c(0, 130)) +
  scale_y_continuous(breaks = seq(0, 130, 10)) 
# 85% ACN elution, activation with 200 uL 85% ACN"

# plt.wash

# d.opt.wash$recovery.sd %>% hist(breaks = 100)

# grid.arrange(plt.ACN.pct, plt.wash, nrow = 1)





# Activation
d.opt.act = read_excel(path, sheet = "Activation")
d.opt.act = d.opt.act %>%
  gather(-c(`Data File`, Name, `Acq. Date-Time`, Volume), key = compound, value = resp)

x = d.opt.act %>% filter(Volume != "ctrl")
x = x %>% group_by(compound, Volume) %>%
  summarise(resp.mean = mean(resp),resp.sd = sd(resp))

y = d.opt.act %>% filter(Volume == "ctrl") %>%
  group_by(compound) %>%
  summarise(resp.ctrl.mean = mean(resp), resp.ctrl.sd = sd(resp))

d.opt.act = x %>% left_join(y, by = "compound") %>%
  mutate(recovery = resp.mean / resp.ctrl.mean * 100,
         recovery.sd = sqrt((resp.sd / resp.mean)^2 +  (resp.ctrl.sd/ resp.ctrl.mean)^2) * recovery )

plt.act = d.opt.act %>%
  ggplot(aes(x = Volume, y = recovery, color = compound)) +
  geom_boxplot(outlier.alpha = 0, aes(group = Volume)) +
  geom_point(position = position_dodge(.5), shape = 21, fill = "white") +
  theme(legend.position = "None")
# plt.act


# draw together
o = d.opt.pct.recovery %>% select(solvent.pct, compound, recovery.mean) %>%
  rename(recovery = recovery.mean, level = solvent.pct) %>%
  mutate(experiment = "ACN percentage")

p = d.opt.wash %>% select(volume, compound, recovery.mean) %>%
  rename(recovery = recovery.mean, level = volume) %>%
  mutate(experiment = "Secondary wash")

q = d.opt.act %>% select(Volume, compound, recovery) %>%
  rename(level = Volume) %>%
  mutate(experiment = "Activation")


k = rbind(o, p) %>% rbind(q)
k$experiment = k$experiment %>% 
  factor(levels = c("ACN percentage", "Secondary wash", "Activation"), ordered = T)

library(RColorBrewer)
color.compound = (c(brewer.pal(8, "Dark2"),
                    brewer.pal(11, "Spectral")[-c(5, 6, 7)],# remove light colors
                    "black", "grey") %>% sort() %>%
                    colorRampPalette())(k$compound %>% n_distinct()) 


plt.EMR.optimization = k %>% group_by(experiment) %>% mutate(n = n_distinct(level)) %>%  
  ggplot(aes(x = level, y = recovery, color = compound, fill = compound)) +
  geom_boxplot(aes(group = level), outlier.alpha = 0, fill = "snow4") +
  
  geom_point(position = position_dodge(.4), 
             shape = 21, fill = "white", size = 2) +
  
  geom_point(position = position_dodge(.4), 
             shape = 21, size = 2, alpha = .2) +
  
  facet_wrap(~experiment, scales = "free_x") +
  scale_y_continuous(limits = c(20, 120), breaks = seq(20, 120, 10)) +
  theme(legend.position = "None",
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 13),
        strip.text = element_text(size = 12)) +
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound)
plt.EMR.optimization
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).
## Warning: Removed 27 rows containing missing values (geom_point).

## Warning: Removed 27 rows containing missing values (geom_point).

# EMR opt 9.4 X 5.03
#-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-&=$$$$$$$$$$-----




# silica gel optimization 
# first check the recovery without silica gel

# No silica gel
d.gelFree = read_excel(path, sheet = "silica gel 1", range = "C19:BF20")
d.gelFree = d.gelFree %>% 
  gather(-c(1:2), key = compound, value = recovery) 


# compound color (for low recovery compounds)

d.low = d.gelFree %>% filter(recovery < 60)
lowRcmpd = d.low$compound

color.compound = colorRampPalette( brewer.pal(8, "Dark2") %>% sort())(lowRcmpd %>% length())
names(color.compound) = lowRcmpd


plt.sorbentFree = d.gelFree %>% 
  anti_join(d.low, by = "compound") %>%
  ggplot(aes(x = `mass (mg)`, y = recovery)) +
  geom_boxplot(outlier.alpha = 0, aes(group = `mass (mg)`), 
               data = d.gelFree) + # boxplot showing the distribution of all 
  geom_point(shape = 21, fill = "white", color = "black",
             position = position_jitter(.2)) +
  theme(legend.position = "None") +
  facet_wrap(~`mass (mg)`) +
  scale_y_continuous(breaks = seq(0, 130, 10)) +
  coord_cartesian(ylim = c(0, 130))  +
  
  # low recovery points
  geom_point(data =d.low,
             aes(color = compound), position = position_dodge(.2),
             shape = 21, stroke = .5, size = 2) +
  
  geom_point(data = d.low,
             aes(color = compound, fill = compound), position = position_dodge(.2),
             shape = 21, size = 2, alpha = .2) +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  
  geom_text_repel(data = d.low, 
                  aes(label = compound, color = compound), size = 2,
                  position = position_dodge(.2))

plt.sorbentFree

# -<>--<>--<>--<>--<>--<>--<>--<>--<>--<>--<>-



# Now add the silica
d.silica = read_excel(path, sheet = "silica gel 1", range = "A1:BF12")

d.silica = d.silica %>%
  select(-c(`Data File`, `Acq. Date-Time`)) %>%
  gather(-c(`mass (mg)`, ratio), key = compound, value = resp)

x = d.silica %>%
  filter(`mass (mg)` == "control") %>%
  select(- c(ratio, `mass (mg)`)) %>%
  rename(resp.ctrl = resp)

d.silica = d.silica %>% 
  filter(`mass (mg)` != "control") %>%
  left_join(x, by = "compound") %>%
  mutate(recovery = resp/resp.ctrl * 100)



# Plot
# first convert levels to ordered factor

d.silica$ratio = d.silica$ratio %>% 
  factor(levels = c("NP", "NP:RP=8:2", "NP:RP=5:5", "NP:RP=3:7", "RP"), ordered = T)
d.silica$`mass (mg)`  = d.silica$`mass (mg)` %>% 
  factor(levels = d.silica$`mass (mg)` %>% unique(), ordered = T)


plt.silica = d.silica %>%
  filter(! compound %in% lowRcmpd) %>%
  
  ggplot(aes(x = `mass (mg)`, y = recovery)) +
  geom_boxplot(outlier.alpha = 0, aes(group = c(`mass (mg)`)),
               data = d.silica) + # boxplot showing the distribution of all
  geom_point(position = position_jitter(.1), shape = 21, fill = "white") +
  theme(legend.position = "None") +
  facet_wrap(~ratio, nrow = 1) +
  scale_y_continuous(breaks = seq(0, 140, 10)) +
  coord_cartesian(ylim = c(0, 130)) +
  
  
  geom_point(data = d.silica %>% filter(compound %in% lowRcmpd),
             position = position_dodge(.2), size = 2, stroke = .5,
             aes(color = compound), shape = 21, fill = "white") +
  
  geom_point(data = d.silica %>% filter(compound %in% lowRcmpd),
             position = position_dodge(.2),
             aes(color = compound, fill = compound), size = 2, alpha = .2) +
  
  geom_text_repel(data = d.silica %>% filter(compound %in% lowRcmpd),
                  aes(label = compound, color = compound), size = 2,
                  position = position_dodge(.2)) +
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound)

# plt.silica



# combine sorbent free + with sorbent
plt.silica.all = plot_grid(plt.sorbentFree, plt.silica, nrow = 1, rel_widths = c(1, 6))
plt.silica.all

# For those special low recovery compounds, check the condition of their best performance

plt.lowRcmpd = d.silica %>% filter(compound %in% lowRcmpd) %>%
  ggplot(aes(x = ratio, y = recovery, color = compound)) +
  geom_line(aes(group = compound), position = position_dodge(.2)) +
  geom_point(shape = 21, fill = "white", stroke = 1, size= 2, 
             position = position_dodge(.2)) +
  facet_wrap(~`mass (mg)`) +
  scale_y_continuous(breaks = seq(10, 80, 10)) +
  theme(legend.position = "bottom") +
  scale_color_manual(values = color.compound)
plt.lowRcmpd

plt.silica.all

# -《》--《》--《》--《》--《》--《》--《》--《》--《》--《》--《》--《》--《》--《》-


# Repeat the silica gel experiment in a more consistent manner July 18
d.silica2 = read_excel(path, sheet = "silica gel 2", range = "B2:BD16")
d.silica2 = d.silica2 %>% gather(-c(treatment), key = compound, value = resp)
x = d.silica2 %>% filter(treatment == "ctrl") %>% 
  group_by(compound) %>%
  summarise(resp.ctrl.mean = mean(resp),
            resp.ctrl.sd = sd(resp))

d.silica2.recovery = d.silica2 %>% filter(treatment != "ctrl") %>% 
  group_by(compound, treatment) %>%
  summarise(resp.mean = mean(resp),
            resp.sd = sd(resp)) %>%
  # augment with control response
  left_join(x, by = "compound") %>%
  mutate(recovery.mean = resp.mean / resp.ctrl.mean * 100,
         recovery.sd = sqrt((resp.sd / resp.mean)^2 + (resp.ctrl.sd/resp.ctrl.mean)^2) * recovery.mean)

d.silica2.recovery$treatment = d.silica2.recovery$treatment %>% 
  factor(levels = c("Sorbent free", "NP", "NP:RP=8:2","NP:RP=5:5","NP:RP=3:7", "RP" ), ordered = T)



# high recovery compounds
d.silica2.recovery.normal = d.silica2.recovery %>% filter(!(compound %in% lowRcmpd)) 

# low recovery compounds
d.silica2.recovery.Low = d.silica2.recovery %>% filter(compound %in% lowRcmpd)

# plot
dg2 = .2

plt.sorbent2 = d.silica2.recovery.normal %>%
  ggplot(aes(x = treatment, y = recovery.mean)) +
  geom_boxplot(data = d.silica2.recovery, outlier.alpha = 0, 
               aes(group = treatment), width = .6, fill = "snow1") +
  geom_point(position = position_jitter(.1), alpha = .5, size = 3)  +
  theme(legend.position = "None",panel.grid = element_blank()) +
  coord_cartesian(ylim = c(0, 140)) +
  scale_y_continuous(breaks = seq(0, 150, 10)) +
  
  # highlight low recovery compounds
  geom_errorbar(data = d.silica2.recovery.Low, 
                aes(ymin = recovery.mean - recovery.sd,
                    ymax = recovery.mean + recovery.sd, color = compound),
                size = .2, width = .5, position = position_dodge(dg2)) +
  
  
  
  
  geom_text_repel(data = d.silica2.recovery %>% filter(compound %in% lowRcmpd),
                  aes(label = compound, color = compound), 
                  position = position_dodge(dg2), size = 3.5, fontface = "bold") +
  
  geom_line(data = d.silica2.recovery.Low,
            aes(color = compound, fill = compound, group = compound), size = .1,
            position = position_dodge(dg2)) +
  
  geom_point(data = d.silica2.recovery.Low,
             aes(color = compound), shape = 21, fill = "white",
             position = position_dodge(dg2), size = 4)  +
  
  geom_point(data = d.silica2.recovery.Low,
             aes(color = compound, fill = compound), shape = 21,
             position = position_dodge(dg2), size = 4, alpha = .2)  +
  
  labs(y = "Recovery")  +
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound)
## Warning: Ignoring unknown aesthetics: fill
plt.sorbent2

# plt.silica.all
ggdraw() + 
  draw_plot(plt.EMR.optimization, y = .5, height = .5) +
  draw_plot(plt.sorbent2, x = .1, width = .8, height = .5)

#<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-<>-




# silica gel with matrix. July 19 experiment
# Using IS correction
d.silica3 = read_excel(path, sheet = "silica with matrix", range = "A49:BC55")
d.silica3 = d.silica3 %>%
  gather(-treatment, key = compound, value = recovery) 

d.silica3$treatment = d.silica3$treatment %>%
  factor(levels = d.silica3$treatment %>% unique(), ordered = T)

normalx = d.silica3 %>% filter(!(compound  %in% lowRcmpd))
lowx = d.silica3 %>% filter(compound  %in% lowRcmpd)

plt.silica.matrix.IS.Corrected = 
  normalx %>% ggplot(aes(x = treatment, y = recovery)) +
  geom_boxplot(data = d.silica3, outlier.alpha = 0, width = .6,
               fill = "snow1") +
  geom_point(position = position_jitter(.1), alpha = .6) +
  theme(legend.position = "None", panel.grid = element_blank()) +
  
  # highlight low recovery compounds
  geom_point(data = lowx, aes(color = compound),
             position = position_dodge(.2), size = 3) +
  
  geom_line(data = lowx, aes(group = compound, color = compound), size = .2,
            position = position_dodge(.2)) +
  
  geom_text_repel(data = lowx, aes(color = compound, label = compound),
                  position = position_dodge(.2), size = 2.5, fontface = "bold") +
  scale_color_manual(values = color.compound) +
  coord_cartesian(ylim = c(0, 140)) +
  scale_y_continuous(breaks = seq(0, 150, 10))

plt.silica.matrix.IS.Corrected

d.silica3$recovery %>% hist(breaks = 50)

# without IS correction
d.silica3.notCorrected = read_excel(path, sheet = "silica with matrix", range = "A1:BC17")

# extract treatment name
d.silica3.notCorrected$treatment = d.silica3.notCorrected$Sample %>%
  str_remove(pattern = DOT %R% "r") %>%
  str_remove(pattern = DOT %R% zero_or_more("2") %R% zero_or_more(DOT) %R% "d") 

# mean and std
d.silica3.notCorrected = d.silica3.notCorrected %>%
  select(-Sample) %>%
  gather(-treatment, key = compound, value = area) %>%
  group_by(treatment, compound) %>%
  summarise(area.mean = mean(area), area.sd = sd(area))


# background, contrl and sorbent
blk = d.silica3.notCorrected %>% filter(treatment == "1B") %>%
  ungroup() %>% select(-treatment) %>%
  rename(area.mean.blk = area.mean,
         area.sd.blk = area.sd)

ctrl = d.silica3.notCorrected %>% filter(treatment == "control") %>%
  ungroup() %>% select(-treatment) %>%
  rename(area.mean.ctrl = area.mean,
         area.sd.ctrl = area.sd)

# calculate recovery (not IS corrected)
d.silica3.notCorrected =d.silica3.notCorrected %>%
  filter(!(treatment %in% c("1B", "control"))) %>%
  left_join(ctrl, by = "compound") %>%
  left_join(blk, by = "compound") %>%
  mutate(area.net.mean = area.mean - area.mean.blk,
         recovery = area.net.mean / area.mean.ctrl * 100)

d.silica3.notCorrected$treatment = d.silica3.notCorrected$treatment %>% 
  factor(levels = c("gel free", "NP", "NP:RP=8:2", "NP:RP=5:5", "NP:RP=3:7", "RP"), ordered = T)


low3 = d.silica3.notCorrected %>% filter(compound %in% lowRcmpd)
normal3 = d.silica3.notCorrected %>% filter(!(compound %in% lowRcmpd))

plt.silica.matrix.Notcorrected = 
  normal3  %>%
  ggplot(aes(x = treatment, y = recovery)) +
  geom_boxplot(data = d.silica3.notCorrected, 
               outlier.alpha = 0, width = .6,
               fill = "snow1") +
  
  geom_point(position = position_jitter(.1), alpha = .6, size = 3) +
  coord_cartesian(ylim = c(0, 140)) +
  scale_y_continuous(breaks = seq(0, 150, 10)) +
  
  
  geom_line(data = low3, aes(group = compound, color = compound), 
            size = .2, position = position_dodge(.2)) +
  
  geom_point(data = low3, aes(color = compound), shape = 21, fill = "white",
             position = position_dodge(.2), size = 4) +
  
  geom_point(data = low3, aes(color = compound, fill = compound),
             position = position_dodge(.2), size = 4, alpha = .2) +
  
  geom_text_repel(data = low3,
                  aes(label = compound, color =compound),
                  position = position_dodge(.2), 
                  size = 3.5, fontface = "bold") +
  
  scale_color_manual(values = color.compound) +
  scale_fill_manual(values = color.compound) +
  theme(legend.position = "None", panel.grid = element_blank())
plt.silica.matrix.Notcorrected

# plot_grid(plt.sorbent2,
#           
#           plot_grid(plt.silica.matrix.Notcorrected, 
#                     plt.silica.matrix.IS.Corrected, nrow = 1),
#           nrow = 2)
ggdraw() +
  draw_plot(plt.sorbent2, y = .5, height = .5, x = .2, width = .6) +
  draw_plot(plot_grid(plt.silica.matrix.Notcorrected,
                      plt.silica.matrix.IS.Corrected, nrow = 1),
            x = 0, width = 1, height = .5, y = 0)

theme_set(theme_bw() + theme(axis.text = element_text(size = 19),
                axis.title = element_text(size = 19))) 

plot_grid(plt.sorbent2, 
          plt.silica.matrix.Notcorrected,
          #plt.silica.matrix.IS.Corrected, 
          nrow = 1, 
          # labels = c("A", "B"#, "C"
          #            ), 
          label_size = 18)

# device size 19 X 9.5