1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
| library(BSgenome.Drerio.UCSC.danRer7) library(CrispRVariants) library(ggplot2) library(reshape2) library(rtracklayer)
danRer7 <- BSgenome.Drerio.UCSC.danRer7 guides <- import("../annotation/shah_guides.bed") guides <- guides + 5 references <- getSeq(danRer7, guides)
parseCRISPResso <- function(results_dir){ results_f <- file.path(results_dir, "Quantification_of_editing_frequency.txt") system(paste0("echo '\n' >>", results_f)) f <- file(results_f) lns <- readLines(f) close(f) nhej <- lns[grep(".* NHEJ:", lns)] total <- lns[grep("Total", lns)] counts <- as.numeric(gsub(".*:([0-9]+)\ .*", "\\1", c(nhej, total))) result <- counts[1]/counts[2]*100 c(result, counts[2]) } parseCRISPResso2 <- function(results_dir){ results_f <- file.path(results_dir, "CRISPResso_quantification_of_editing_frequency.txt") rate_table <- read.table(results_f, header = TRUE, sep = "\t") rate_table["Modified."] # Modified% } parseMaChIAto <- function(results_dir){ results_f <- file.path(results_dir, "ALL_read_countdata.csv") if(file.exists(results_f)){ cnt_table <- read.csv(results_f, header = TRUE) return( (cnt_table["DELETION_EDITING"] + cnt_table["INSERT_EDITING"])# + cnt_table["SUBSTITUTION_EDITING"]) / (cnt_table["DELETION_EDITING"] + cnt_table["INSERT_EDITING"] + cnt_table["SUBSTITUTION_EDITING"] + cnt_table["UNTREATED"] + cnt_table["PRECISE_KNOCK_IN"] + cnt_table["IMPRECISE_KNOCK_IN"]) * 100 ) }else{ return(NA) } }
# Parse CRISPResso pooled results pooled_dirs <- list.files("../simulation/merged", recursive = TRUE, pattern = "CRISPResso_on", include.dirs = TRUE, full.names = TRUE) condition <- gsub(".*CRISPResso_on_", "", pooled_dirs) pooled_results <- sapply(pooled_dirs, function(x) parseCRISPResso(x)[1]) noff <- gsub(".*_([0-9]+)offtarget.*", "\\1", pooled_dirs) nmut <- as.numeric(gsub(".*_([0-9]+)mut.*", "\\1", pooled_dirs)) nwt <- as.numeric(gsub(".*_([0-9]+)wt.*", "\\1", pooled_dirs)) pooled_results <- data.frame(Guide = condition, Truth = nmut/(nmut+nwt) * 100, NOfftargets = noff, variable = "CRISPRessoPooled", value = unname(pooled_results)) # Parse ampliconDIVider results adiv_files <- list.files("../simulation/amplicondivider", full.names = TRUE) adiv_results <- sapply(adiv_files, function(fn){ tt <- read.table(fn, sep = "\t")[1,c(6)]*100 }) exclude <- adiv_results > 100 adiv_results <- adiv_results[!exclude] print(sprintf("excluded %s", length(exclude))) ## [1] "excluded 240" adiv_gd <- gsub("_.*","", basename(adiv_files)[!exclude]) adiv_noff <- gsub(".*_([0-9]+)offtarget.*", "\\1", adiv_files[!exclude]) adiv_nmut <- as.numeric(gsub(".*_([0-9]+)mut.*", "\\1", adiv_files[!exclude])) adiv_nwt <- as.numeric(gsub(".*_([0-9]+)wt.*", "\\1", adiv_files[!exclude])) adiv_results <- data.frame(Guide = adiv_gd, Truth = adiv_nmut/(adiv_nmut+adiv_nwt)*100, NOfftargets = adiv_noff, variable = "AmpliconDIVider", value = unname(adiv_results)) # CrispRVariants, CRISPResso, CRISPResso2, CRISPResso-MaChIAto and CRISPResso-MaChIAto2 base <- gsub(".fa|.gz","", list.files("../simulation", pattern = "*.fa")) bams <- file.path("../simulation", paste0(base, ".bam")) noff <- as.integer(gsub(".*wt_([0-9]+)offtarget.*", "\\1", base)) frag_len <- as.integer(gsub(".*offtarget_([0-9]+)readlen.*", "\\1", base)) sim_guides <- gsub("(.*)_[0-9]+mut.*","\\1", base) nmut <- as.numeric(gsub(".*_([0-9]+)mut.*", "\\1", base)) nwt <- as.numeric(gsub(".*_([0-9]+)wt.*", "\\1", base)) truth <- nmut/(nmut+nwt) * 100 crispresso_dirs <- list.files("../simulation/crispresso", full.names = TRUE) crispresso2_dirs <- list.files("../simulation/crispresso2", full.names = TRUE) #crispresso_20_machiato_dirs <- list.files("../simulation/machiato_homologylen20", full.names = TRUE) #crispresso_30_machiato_dirs <- list.files("../simulation/machiato_homologylen30", full.names = TRUE) #crispresso_40_machiato_dirs <- list.files("../simulation/machiato_homologylen40", full.names = TRUE) #crispresso_50_machiato_dirs <- list.files("../simulation/machiato_homologylen50", full.names = TRUE) crispresso2_20_machiato_dirs <- list.files("../simulation/machiato2_homologylen20", full.names = TRUE) #crispresso2_30_machiato_dirs <- list.files("../simulation/machiato2_homologylen30", full.names = TRUE) #crispresso2_40_machiato_dirs <- list.files("../simulation/machiato2_homologylen40", full.names = TRUE) #crispresso2_50_machiato_dirs <- list.files("../simulation/machiato2_homologylen50", full.names = TRUE) # # Check they are in the same order identical(unname(sapply(base, function(bb) grep(bb, crispresso_dirs))),c(1:length(base))) identical(unname(sapply(base, function(bb) grep(bb, crispresso2_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso_20_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso_30_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso_40_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso_50_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso2_20_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso2_30_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso2_40_machiato_dirs))),c(1:length(base))) # identical(unname(sapply(base, function(bb) grep(bb, crispresso2_50_machiato_dirs))),c(1:length(base))) ## [1] TRUE result <- lapply(seq_along(base), function(i){ print(i) sim_guide <- sim_guides[i] guide <- guides[guides$name == sim_guide] reference <- references[guides$name == sim_guide][[1]] cset <- readsToTarget(bams[i], target.loc = 22, target = guide, reference = reference, collapse.pairs = TRUE, verbose = FALSE) crispresso <- parseCRISPResso(crispresso_dirs[i]) crispresso2 <- parseCRISPResso2(crispresso2_dirs[i]) #name1 <- names(sapply(base, function(bb) grep(bb, crispresso_dirs))[i]) name2 <- names(sapply(base, function(bb) grep(bb, crispresso2_dirs))[i]) #machiato_20 <- parseMaChIAto(crispresso_20_machiato_dirs[grep(name1, crispresso_20_machiato_dirs)]) #machiato_30 <- parseMaChIAto(crispresso_30_machiato_dirs[grep(name1, crispresso_30_machiato_dirs)]) #machiato_40 <- parseMaChIAto(crispresso_40_machiato_dirs[grep(name1, crispresso_40_machiato_dirs)]) #machiato_50 <- parseMaChIAto(crispresso_50_machiato_dirs[grep(name1, crispresso_50_machiato_dirs)]) machiato2_20 <- parseMaChIAto(crispresso2_20_machiato_dirs[grep(name2, crispresso2_20_machiato_dirs)]) #machiato2_30 <- parseMaChIAto(crispresso2_30_machiato_dirs[grep(name2, crispresso2_30_machiato_dirs)]) #machiato2_40 <- parseMaChIAto(crispresso2_40_machiato_dirs[grep(name2, crispresso2_40_machiato_dirs)]) #machiato2_50 <- parseMaChIAto(crispresso2_50_machiato_dirs[grep(name2, crispresso2_50_machiato_dirs)]) c(sim_guide, noff[i], truth[i], mutationEfficiency(cset)[["Average"]], crispresso[[1]], crispresso2[[1]], machiato2_20[[1]]) })
browser() result <- data.frame(do.call(rbind, result)) colnames(result) <- c("Guide", "NOfftargets", "Truth", "CrispRVariants", "CRISPResso", "CRISPResso2", "CRISPResso2-MaChIAto") result <- melt(result, id.vars=c("Guide", "Truth", "NOfftargets"))
result <- rbind(result, pooled_results, adiv_results) result$NOfftargets <- factor(result$NOfftargets, levels = c(0, 33, 100)) levels(result$NOfftargets) <- c("0", "10", "25") class(result$value) <- "numeric" class(result$Truth) <- "factor" truths <- c("0","33.3%", "66.7%", "90%") levels(result$Truth) <- truths levels(result$variable) <- c("CrispRVariants", "CRISPResso", "CRISPResso2", "CRISPResso2-MaChIAto", "CRISPRessoPooled", "AmpliconDIVider") cols <- c("#007F94","#4B385E","#C22B26") tr <- data.frame(Truth = levels(result$Truth), TrNum = as.numeric(gsub("%", "", levels(result$Truth))))
ggplot(result[result$variable %in% c("CRISPResso", "CRISPResso2", "CRISPResso2-MaChIAto"),]) + geom_hline(data = tr, aes(yintercept = TrNum), linetype = "dotted") + facet_wrap(~Truth, nrow = 2) + geom_point(aes(x=NOfftargets, y=value, colour=variable), alpha = 0.3, size = 3, position = position_dodge(width = 0.3)) + theme_bw() + xlab("Percentage of off-target reads") + ylab("Estimated mutation efficiency %") + theme(axis.text = element_text(size = 12), axis.title.y = element_text(margin = margin(0,20,0,0), size = 14), axis.title.x = element_text(margin = margin(15,0,10,0), size = 14), strip.text.x = element_text(size = 14), legend.key = element_blank(), legend.title = element_blank(), legend.position = "bottom", strip.background = element_blank()) + scale_colour_manual(values = cols)
|