Update Khd4-Ada-Gfp editing analysis

838a6a8d · Srimeenakshi Sankaranarayanan · 855fbed5 · 838a6a8d · 838a6a8d · 838a6a8d
Commit 838a6a8d authored 11 months ago by Srimeenakshi Sankaranarayanan
--- a/.gitattributes
+++ b/.gitattributes
@@ -13,3 +13,5 @@ assays/S3_A2_HyperTRIBE_Rrm4_analysis/dataset/RNAseq_files_raw/control-Ada_rep2.
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep1.fastq.gz filter=lfs diff=lfs merge=lfs -text
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep2.fastq.gz filter=lfs diff=lfs merge=lfs -text
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz filter=lfs diff=lfs merge=lfs -text
+assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text
+assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/Khd4_Ada_Gfp_editing_5%_rep1.bedgraph
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/Khd4_Ada_Gfp_editing_5%_rep1.bedgraph
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/Khd4_Ada_Gfp_editing_5%_rep2.bedgraph
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/Khd4_Ada_Gfp_editing_5%_rep2.bedgraph
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/control_Ada_editing_5%_rep1.bedgraph
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/control_Ada_editing_5%_rep1.bedgraph
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/control_Ada_editing_5%_rep2.bedgraph
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Editing_bedgraph_files/control_Ada_editing_5%_rep2.bedgraph
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.qmd
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.qmd
+---
+title: "01_Characterization of editing sites"
+date: last-modified
+author: 
+    - name:  "Srimeenakshi Sankaranarayanan"
+title-block-banner: true
+format: 
+    html:
+        theme: flatly
+        self-contained: true
+        code-fold: true
+        code-tools: true
+        code-summary: "Show the code"
+        toc: true
+        number-sections: true
+        anchor-sections: true
+editor: visual
+execute:
+  echo: true
+  error: false
+  warning: false
+  message: false
+---
+```{r}
+# library
+library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
+library(ggplot2)
+library(BindingSiteFinder)
+library(rtracklayer)
+library(ComplexHeatmap)
+library(GenomicFeatures)
+library(forcats)
+library(tidyr)
+library(dplyr)
+library(tidyverse)
+library(GenomicRanges)
+library(magick)
+library(magrittr)
+library(gridExtra)
+library(IRanges)
+library(Biostrings)
+library(ggpp)
+library(gginnards)
+library(ggrepel)
+library(ggpubr)
+library(ggforce)
+library(ggrastr)
+library(viridis)
+library(reshape2)
+library(gprofiler2)
+library(ggsci)
+library(ggh4x)
+library(ggplotify)
+library(gridExtra)
+library(circlize)
+library(EnrichedHeatmap)
+library(UpSetR)
+library(kableExtra)
+library(cowplot)
+library(rstatix)
+library(beeswarm)
+library(clusterProfiler)
+library(ggseqlogo)
+library(tidyHeatmap)
+library(paletteer)
+library(ggvenn)
+library(colorspace)
+library(ggpointdensity)
+library(lookup)
+library(rstatix)
+library(ggrepel)
+```
+```{r}
+load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
+```
+# Characterise editing sites
+The editing events across transcriptome was identified as described here https://hypertribe.readthedocs.io/en/latest/
+It is important to note that a few editing sites needed to be removed for the following reasons: - 6 sites were not on A - due to the gene extension by 300-nt, a handful of editing sites overlapped with 2 genes
+```{r}
+# Import hyperTRIBE data
+hyper.dir = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/"
+# get all bedgraph files
+hyper.files = list.files(hyper.dir, pattern="bedgraph", full.names=TRUE)
+# assign sample names (ATTENTION!)
+names(hyper.files) = c("khd4_rep1", "khd4_rep2", "ctrl_rep1", "ctrl_rep2")
+```
+```{r}
+# combine the hyperTRIBE results into a list
+hyper.list = lapply(hyper.files, function(i){
+  # import data
+  hyper = read.table(i, sep="\t")
+  # keep only first 8 columns
+  hyper = hyper[,c(1:2,4:5,8)]
+  # assign column names
+  colnames(hyper) = c("chr", "start", "perc.editing", "no.reads", "gene.id")
+  # extract number of reads
+  hyper$no.reads = strsplit(hyper$no.reads, "_") %>% sapply(., "[[", 2) %>% sub("r", "", .) %>% as.integer(.)
+  # add strand (from annotation)
+  pos = match(hyper$gene.id, genes.GR$gene_id)
+  hyper$strand = as.vector(strand(genes.GR))[pos]
+  # add id
+  hyper$id = paste(hyper$chr, hyper$start, sep="_")
+  # add number of motifs
+  sel = match(hyper$gene.id, genes.GR$gene_id)
+  hyper$AUACCC = genes.GR$AUACCC[sel]
+  hyper$AGAUCU = genes.GR$AGAUCU[sel]
+  hyper$GGGTAT = genes.GR$GGGTAT[sel]
+  hyper$ACACUC = genes.GR$ACACUC[sel]
+  return(hyper)
+})
+#head(hyper.list[[1]])
+# CHECK FOR DUPLICATED SITES -> overlapping genes due to extension
+# these genes are removed at the moment (could also be resolved differently if needed)
+sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
+for (i in seq_along(hyper.list)){
+  hyper = hyper.list[[i]]
+  dupls = hyper$id[duplicated(hyper$id)]
+  print( hyper[hyper$id %in% dupls,] )
+}
+# REMOVE DUPLICATED SITES
+hyper.list = lapply(hyper.list, function(hyper){
+  dupls = hyper$id[duplicated(hyper$id)]
+  hyper = subset(hyper, !(id %in% dupls))
+  return(hyper)
+})
+# test
+# sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
+```
+```{r}
+# convert editing sites into GRanges object
+hyper.GR = lapply(hyper.list, function(hyper){
+  gr = GRanges(
+    seqnames=hyper$chr,
+    ranges=IRanges(start=hyper$start, width=1),
+    strand=hyper$strand,
+    id=hyper$id,
+    gene.id=hyper$gene.id,
+    no.reads = hyper$no.reads,
+    perc.editing = hyper$perc.editing
+  )
+  return(gr)
+}) %>% as(., "GRangesList")
+```
+## Get sequence of editing sites
+```{r}
+# get sequence of all editing sites
+seqs = lapply(hyper.GR, function(gr){
+  seq = getSeq(Umaydis, gr)
+  return(seq)
+})
+# check distribution
+sapply(seqs, table)
+#### REMOVE INDIVIDUAL SITES THAT ARE NOT ON A
+hyper.GR = lapply(hyper.GR, function(gr){
+  seqs = getSeq(Umaydis, gr)
+  gr = subset(gr, seqs == "A")
+  return(gr)
+})
+# check that only A's remain
+seqs = lapply(hyper.GR, function(gr){
+  seq = getSeq(Umaydis, gr)
+  return(seq)
+})
+sapply(seqs, table)
+```
+## Compare between replicates
+Reproducible editing sites overlapped between the two independent biological replicates of the Khd4-Ada-Gfp and control-Ada data were compared with each other (#1,2- replicates 1 and 2). Only editing events with 5% editing were used for the analysis. (
+```{r}
+# Venn diagram of Khd4-ada-gfp replicates
+venn.list = lapply(hyper.GR[1:2], function(hyper){
+  return(hyper$id)
+})
+pl1 = ggvenn(venn.list, names(hyper.GR)[1:2], fill_color = c(lighten(khd4.col, 0.2), darken(khd4.col, 0.2)), set_name_size = 2) +theme_paper()+labs(x = NULL, y = NULL) + 
+  guides(x = "none", y = "none")
+# Venn diagram of control-Ada replicates
+venn.list = lapply(hyper.GR[3:4], function(hyper){
+  return(hyper$id)
+})
+pl2 = ggvenn(venn.list, names(hyper.GR)[3:4], fill_color = c(lighten(ada.col, 0.2), darken(ada.col, 0.2)), set_name_size = 2) +theme_paper()+labs(x = NULL, y = NULL) + 
+  guides(x = "none", y = "none")
+#FigS6A
+vennrep<-ggarrange(pl1, pl2, 
+          labels = c("A", "B"),
+          ncol = 2, nrow = 1)
+vennrep
+```
+```{r}
+# KEEP ONLY REPRODUCIBLE SITES FOR FURTHER ANALYSES
+# merge Khd4-Ada-Gfp replicates
+khd4.tab = merge(hyper.GR$khd4_rep1 %>% mcols(), hyper.GR$khd4_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
+# select only sites from overlap and add combined info of both replicates
+sel = match(khd4.tab$id, hyper.GR$khd4_rep1$id)
+khd4.rep = hyper.GR$khd4_rep1[sel]
+mcols(khd4.rep) = khd4.tab
+# merge control-Ada replicates
+ctrl.tab = merge(hyper.GR$ctrl_rep1 %>% mcols(), hyper.GR$ctrl_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
+# select only sites from overlap and add combined info of both replicates
+sel = match(ctrl.tab$id, hyper.GR$ctrl_rep1$id)
+ctrl.rep = hyper.GR$ctrl_rep1[sel]
+mcols(ctrl.rep) = ctrl.tab
+## NEW GRANGES OBJECT WITH ONLY REPRODUCIBLE SITES
+hyper.rep = GRangesList(khd4 = khd4.rep, ctrl = ctrl.rep)
+# number of reproducible editing sites
+elementNROWS(hyper.rep)
+# genes with reproducible sites
+sapply(hyper.rep, function(hyper) length(unique(hyper$gene.id)))
+elementNROWS(hyper.rep)
+```
+```{r}
+# Venn diaram
+venn.list = lapply(hyper.rep, function(hyper){
+  return(hyper$id)
+})
+# FigS6A and B
+ggvenn(venn.list, names(venn.list), fill_color = c(khd4.col, ada.col)) +
+  ggtitle("Overlap between Khd4.Ada and Ada.ctrl")
+```
+# Studying the editing events
+```{r}
+# distribution of % editing
+pf = data.frame(
+  perc.editing = c(khd4.tab$perc.editing.rep1, khd4.tab$perc.editing.rep2, ctrl.tab$perc.editing.rep1, ctrl.tab$perc.editing.rep2),
+  sample = rep(c("khd4.rep1", "khd4.rep2", "ctrl.rep1", "ctrl.rep2"), rep(c(nrow(khd4.tab), nrow(ctrl.tab)), each=2)),
+  condition = rep(c("khd4", "khd4", "ctrl", "ctrl"), rep(c(nrow(khd4.tab), nrow(ctrl.tab)), each=2)))
+# khd4-ada
+#FigS6C
+pl1 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  scale_fill_manual(values=c(khd4.rep1 = lighten(khd4.col, 0.2), khd4.rep2 = darken(khd4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+ 
+  xlab("% editing")+
+  ylab("density")+
+  myTheme1
+pl1
+```
+Note that I used coefficient determination to check the editing reproducibility because the R2 indicates the distance of points between 1:1 line, on the other hand magnitude. Therefore, I did not use pearson (the analysis is independent of magnitude, in our case, editing percentage) and spearman (as the editing relationship is not monotonic).
+```{r}
+# Note: used coefficient determination because the R2 indicates the distance of 
+# points between 1:1 line, on the other hand magnitude; didnt use pearson (as it 
+# is is independent of magnitude, in our case, independent of editing percentage)
+# no spearman (as the editing relationship is not monotonic)
+# scatter plots of % editing in overlapping sites
+# Khd4-Ada-Gfp
+pl3 = ggplot(khd4.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2)) +
+  geom_point(adjust=0.1, color="grey") +
+  ggtitle("Reproducible editing sites in Khd4-Ada-Gfp") +
+  theme_bw() +
+  labs(x="Khd4-Ada-Gfp #rep1 (editing %)",
+       y = "Khd4-Ada-Gfp #rep2 (editing %)")+
+  theme(aspect.ratio = 1) +
+  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
+  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
+  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ myTheme1 
+pl3
+# Control-Ada
+pl4 = ggplot(ctrl.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2))+ 
+   geom_point(adjust=0.1, color="grey")+
+  ggtitle("Reproducible editing sites in control-Ada") +
+  labs(x="Control-Ada #rep1 (editing %)",
+       y = "Control-Ada #rep2 (editing %)")+
+  theme_bw() +
+  theme(aspect.ratio = 1) +
+  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
+  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
+  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ myTheme1 
+pl4  
+```
+## De novo motif discovery analysis on sequences carrying unique control-Ada editing sites.
+XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Control-Ada editing sites (501-nt) as an input
+```{r}
+# unique to control-Ada
+Motif.frame.con = data.frame( Motif = c("GCGCGUG","CGABGACGAVGAVG","RVCAAGAWGRRCAAG","GUGAYUCGUGAYUGU","AGCAGCAGCAG","CGACUUGCG","WSVYCAAGMWGMUCR","ACCGCCAUCU","CAGCUCWAGCC","AWUCAYGAAUS"),
+absolute = c(43.7,40.9,21.7,4.9,30.8,72.5,1.7,19.9,76.7,26.3),
+EnR = c(1.25,1.17,1.20,1.58,1.09,1.03,1.44,1.01,1.00,0.97),
+set=rep("ctrl",10))
+# make a ggplot
+pl.con = ggplot(Motif.frame.con, aes(x=absolute, y=EnR, color=set))+
+  geom_point(alpha=1, size=2)+
+  scale_color_manual(values = ada.col)+
+  coord_cartesian(ylim=c(0,10))+
+  labs(subtitle= "control-Ada editing sites",
+    x="sequences with motif (%)",
+       y = "motif enrichment ratio")+
+  theme_paper()+
+  theme(legend.position = "none")
+pl.con
+```
+```{r}
+sessionInfo()
+```
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.qmd
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.qmd
+---
+title: "HyperTRIBE_identifies_highly_specific_targets_of_Khd4"
+date: last-modified
+author: 
+    - name:  "Srimeenakshi Sankaranarayanan"
+title-block-banner: true
+format: 
+    html:
+        theme: flatly
+        self-contained: true
+        code-fold: true
+        code-tools: true
+        code-summary: "Show the code"
+        toc: true
+        number-sections: true
+        anchor-sections: true
+editor: visual
+execute:
+  echo: true
+  error: false
+  warning: false
+  message: false
+---
+```{r}
+# library
+library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
+library(ggplot2)
+library(BindingSiteFinder)
+library(rtracklayer)
+library(ComplexHeatmap)
+library(GenomicFeatures)
+library(forcats)
+library(tidyr)
+library(dplyr)
+library(tidyverse)
+library(GenomicRanges)
+library(magick)
+library(magrittr)
+library(gridExtra)
+library(IRanges)
+library(Biostrings)
+library(ggpp)
+library(gginnards)
+library(ggrepel)
+library(ggpubr)
+library(ggforce)
+library(ggrastr)
+library(viridis)
+library(reshape2)
+library(gprofiler2)
+library(ggsci)
+library(ggh4x)
+library(ggplotify)
+library(gridExtra)
+library(circlize)
+library(EnrichedHeatmap)
+library(UpSetR)
+library(kableExtra)
+library(cowplot)
+library(rstatix)
+library(beeswarm)
+library(clusterProfiler)
+library(ggseqlogo)
+library(tidyHeatmap)
+library(paletteer)
+library(ggvenn)
+library(colorspace)
+library(ggpointdensity)
+library(lookup)
+library(rstatix)
+library(ggrepel)
+```
+```{r}
+load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
+```
+```{r}
+# reproducible editing sites in Khd4-Ada-Gfp and control-Ada
+hyper.rep
+```
+```{r}
+# Venn diaram
+venn.list = lapply(hyper.rep, function(hyper){
+  return(hyper$id)
+})
+names(venn.list) = c("Khd4-Ada-Gfp", "control-Ada")
+# FigS6A and B
+ggvenn(venn.list, names(venn.list), 
+       fill_color = c("dodgerblue3", ada.col),  
+       set_name_size = 6) +
+  ggtitle("Overlap between Khd4-Ada-Gfp and control-Ada")+
+  theme(plot.title = element_text(size = 20, hjust = 0.5, vjust = 3))
+```
+```{r}
+# Identifying the RNA sequence motif
+################################################################################
+#AUACCC
+################################################################################
+# get gene sequences
+gene.seqs = getSeq(Umaydis, genes.GR)
+# find AUACCC motifs
+motif.auaccc = DNAString("ATACCC")
+# find all AUACCC motifs in the genome
+auaccc.gr = vmatchPattern(motif.auaccc, Umaydis)
+# count motifs in genes
+genes.GR$AUACCC = countOverlaps(genes.GR, auaccc.gr)
+#table(genes.GR$AUACCC)
+# assign motifs to genes
+ov = findOverlaps(auaccc.gr, genes.GR)
+# check whether motifs overlap with more than one gene
+table(duplicated(from(ov)))
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+auaccc.gr = auaccc.gr[from(ov)]
+auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+################################################################################
+#AGAUCU
+################################################################################
+# find AGAUCU motifs
+motif.ctrl = DNAString("AGATCT")
+agaucu.gr = vmatchPattern(motif.ctrl, Umaydis)
+genes.GR$AGAUCU = countOverlaps(genes.GR, agaucu.gr)
+table(genes.GR$AGAUCU)
+# assign motifs to genes
+ov = findOverlaps(agaucu.gr, genes.GR)
+# check whether motifs overlap with more than one gene
+#table(duplicated(from(ov)))
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+agaucu.gr = agaucu.gr[from(ov)]
+agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+```
+## De novo motif discovery analysis on sequences carrying unique Khd4–Ada–Gfp editing sites.
+XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Khd4-Ada-Gfp editing sites (501-nt) as an input.
+First, the Khd4-Ada-Gfp editing sites were extended on each side by 250nt to make a 501-nt window.
+```{r eval=FALSE}
+# keep only id column
+hyper.spe = lapply(hyper.rep, function (x){
+  mcols(x) = x[,c(1)]
+  names(x) = x$id
+  return(x)
+})
+# split the reproducible editing sites that are unique to Khd4 and Control , and shared
+hyper.specific = list(
+  Khd4.specific  = hyper.spe$khd4[!(hyper.spe$khd4$id %in% hyper.spe$ctrl$id)],
+  shared = hyper.spe$khd4[hyper.spe$khd4$id %in% hyper.spe$ctrl$id],
+  ctrl.specific  = hyper.spe$ctrl[!(hyper.spe$ctrl$id %in% hyper.spe$khd4$id)]
+)
+# as a GRanges list
+hyper.specific = as(hyper.specific, "GRangesList")
+# extend the window by 250 bp on both sides
+hyper_250 = lapply(hyper.specific, function(x){
+  x = x+250 # window 250 bp
+})
+# get fastafiles
+# get sequence
+khd4.specific = Biostrings::getSeq(Umaydis, hyper_250$Khd4.specific)
+khd4.ctrl.shared = Biostrings::getSeq(Umaydis, hyper_250$shared)
+ctrl.specific = Biostrings::getSeq(Umaydis, hyper_250$ctrl.specific)
+# export the fasta files
+output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"
+writeXStringSet(khd4.specific, filepath = paste0(output,"khd4.specific_250.fasta"))
+writeXStringSet(khd4.ctrl.shared, filepath = paste0(output,"khd4.ctrl.shared_250.fasta"))
+writeXStringSet(ctrl.specific, filepath = paste0(output,"ctrl.specific_250.fasta"))
+```
+Scatter plot comparing the relative enrichment ratio of enriched motifs with the percentage of each motif in the tested sequences. 
+```{r}
+# making a scatter plot
+# unique to Khd4-Ada-Gfp
+Motif.frame.khd4 = data.frame(Motif = 
+c("HMHHKAUACCC","HMKAUACCCVH","UGSAGGCGCURSWGS","UHCCUCUYCHCCMUC","CAGCARCARCADCAG","CCUCKAAACSNCGC","CYAAKSMUGCUYCUR","CYCCCW","CUUGAYUUGUWWYCC","YYUUGUAYUUGUA","UYGCUGCUCUC", "ACMUCUCCAAGVYCA","CUCCUGAUCCUGCCU", "AGCGSUCRUUUGASC", "CCUCUKU", "CAACGCUCCAC","CCAGCAACAGCARSM", "GCAGAUGRUAYCCAK", "RASAAGMVSAWCAAG", "AVUGGUGCUGU"),
+absolute = c(60.9, 59.3, 16.0, 65.8, 13.6, 37.4, 11.1, 37.9, 2.5, 6.2, 18.5, 6.2, 5.3, 3.7, 16.0, 50.6, 29.2, 3.7, 38.3, 11.1),
+EnR = c(9.40,9.60,8.35,1.46,4.01,1.80,3.30,1.60,37.98,4.82,2.08, 4.34, 5.06, 4.93, 1.81, 1.27, 1.43, 3.88	, 1.10, 1.05),
+set=rep("khd4",20))
+pl.khd4 = ggplot(Motif.frame.khd4, aes(x=absolute, y=EnR, color=set))+
+  geom_point(alpha=1, size=2)+
+  scale_color_manual(values = khd4.col)+
+  coord_cartesian(ylim=c(0,10))+
+  labs(subtitle= "Khd4-Ada-Gfp",
+       x="sequences with motif (%)",
+       y = "motif enrichment ratio")+
+  theme_paper()+
+  theme(legend.position = "none")
+pl.khd4
+```
+Make the sequence logo of the most enriched two motifs
+```{r}
+################################################################################
+# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts
+################################################################################
+# motif logo with u in it
+# Motif-1 TP% = 60.9   ENR-Ratio = 9.40 HMHHKAUACCC
+A= c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524
+)
+C= c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476
+)
+G = c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
+)
+U = c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000
+)
+pwDF1bU = rbind(A,C,G,U)
+# MoUif-2 UP% = 59.3   ENR-RaUio = 9.60 HMKAUACCCVH
+A= c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404
+)
+C= c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852
+)
+G = c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894
+)
+U = c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849
+)
+pwDF2bU = rbind(A,C,G,U)
+# make ggseqlogo
+m1 = ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")
+m2 = ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")
+pl = ggarrange(m1,m2,
+              labels = LETTERS[1:2],
+              nrow = 1)
+pl
+```
+Next I want to select mRNA targets that are specific to Khd4-Ada-Gfp and control-Ada data
+```{r}
+# Khd4.spe
+Khd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]
+# mRNAs specific to control
+ctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]
+# make a list with mRNAs specific to each data set
+# list
+hyper.spe = list(
+  Khd4_Ada_Gfp = Khd4.spe,
+  control_Ada = ctrl.spe.tx
+) %>% as(., "GRangesList")
+```
+## Distribution of editing sites from the nearest AUACCC motif
+Only editing sites on mRNAs specific to each dataset were considered
+```{r}
+pf = lapply(names(hyper.spe), function(i){
+  #i = "khd4"
+  hyper = hyper.spe[[i]]
+  head(hyper)
+  # calculate distance to nearest AUACCC motif
+  dists.auacc = distanceToNearest(hyper, auaccc.gr)
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]
+  table(sel)
+  dists.auacc= subset(dists.auacc, sel)
+  mcols(dists.auacc)$motif = "AUACCC"
+  # calculate distance to nearest AGAUCU motif
+  dists.agaucu = distanceToNearest(hyper, agaucu.gr)
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]
+  table(sel)
+  dists.agaucu= subset(dists.agaucu, sel)
+  mcols(dists.agaucu)$motif = "AGAUCU"
+  # get stats of distances
+  summary(mcols(dists.auacc)$distance)
+  summary(mcols(dists.agaucu)$distance)
+  pf = rbind(mcols(dists.auacc), mcols(dists.agaucu)) %>% as.data.frame()
+  pf$set = i
+  return(pf)
+})
+pf = do.call(rbind, pf)
+# FigS7B
+# Histogram
+pl4 = ggplot(pf, aes(x=distance, fill=set)) +
+  geom_histogram( binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey60")) +
+  labs(x="distance of editing sites to the nearest motif (nt)",
+       y="motif density")+
+  theme_bw() +
+  facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+myTheme1
+pl4
+```
+```{r}
+# median distance between editing sites and the motif
+#Khd4-Ada-Gfp
+Khd4_auaccc= filter(pf, set == "khd4", motif == "AUACCC") 
+med_K_auaccc = median(Khd4_auaccc$distance)
+#Control-Ada
+Ctrl_auaccc= filter(pf, set == "ctrl", motif == "AUACCC") 
+med_C_auaccc = median(Ctrl_auaccc$distance)
+data.frame(Khd4_Ada_Gfp = med_K_auaccc,
+           control_Ada = med_C_auaccc) %>% 
+  kbl(caption = "median distance between auaccc and editing events",html_font = "Cambria", bold = T, color="black") %>% 
+  kable_styling(bootstrap_options = "striped", "hover", full_width = F, html_font = "Cambria") %>%
+  column_spec(1, bold = T) 
+```
+## Studying the enrichment of motifs in mRNAs specific to each datasets
+Here, we consider only the editing events on Khd4-bound mRNAs and on mRNAs specific to control-Ada. 
+Khd4-bound mRNAs- All mRNAs that carry reproducible Khd4-Ada-Gfp editing sites with atleast one AUACCC motif.
+Control-Ada mRNAs- mRNAs that have reproducible control-Ada editing sites but no Khd4-Ada-Gfp sites 
+```{r}
+# Khd4-bound mRNAs
+Khd4_boundRNA= hyper.rep$khd4[(hyper.rep$khd4$gene.id %in% auaccc.gr$gene.id)]
+# mRNAs specific to control
+ctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]
+#check if the numbers are similar to the venn diagram
+#n_distinct(khd4.spe.tx$gene.id)
+# make a list with mRNAs specific to each data set
+Khd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]
+# list
+hyper.rep.txspe = list(
+  khd4 = Khd4_boundRNA,
+  ctrl = ctrl.spe.tx
+) %>% as(., "GRangesList")
+```
+# Analyis of the high-confidence targets
+```{r}
+# only Khd4
+khd4.rep.edit=hyper.rep$khd4
+motifs=genes.GR
+khd4.rep.edit$AUACCC=lookup(khd4.rep.edit$gene.id, motifs$gene_id,motifs$AUACCC)
+#granges of high confidence Khd4
+high.conf.khd4=subset(khd4.rep.edit,AUACCC>0)
+high.conf = high.conf.khd4
+```
+## Transcript region enrichment of motif
+Since Khd4 binds AUACCC specifically, we first want to check the transcript region that the AUACCC motif is enriched in high-confidence mRNA targets. Besides using the AGAUCU motif as a control, I also want to use the all hyphae expressed AUACCC-containing mRNAs as a putative targets 
+Furthermore, I will use the already available AUACCC and AGAUCU motif position in  *U. maydis* genome, as these already include the transcript region locations of the motif.  
+```{r}
+# load DGE data to identify hyphae expressed genes
+# Use the lookup function to identify geneIDs overlapping with the high confident targets  
+Fi_wtvskd_DEG$high.conf_targets=lookup(Fi_wtvskd_DEG$gene.ID,high.conf$gene.id, high.conf$gene.id )
+# auaccc motif
+Fi_wtvskd_DEG$auaccc=lookup(Fi_wtvskd_DEG$gene.ID,auaccc.gr$gene.id, auaccc.gr$gene.id )
+# agaucu motif
+Fi_wtvskd_DEG$agaucu=lookup(Fi_wtvskd_DEG$gene.ID,agaucu.gr$gene.id, agaucu.gr$gene.id )
+# only expressed genes
+Fi_wtvskd_BM = Fi_wtvskd_DEG[Fi_wtvskd_DEG$baseMean>10,]
+```
+```{r}
+auaccc_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$auaccc),] 
+agaucu_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$agaucu),] 
+```
+```{r}
+# AUACCC
+auaccc.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AUACCCmotif.csv")
+auaccc.mot$high.conf.targets = lookup::lookup(auaccc.mot$Gene_ID,high.conf$gene.id,high.conf$gene.id)
+auaccc.mot$put.targets = lookup::lookup(auaccc.mot$Gene_ID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)
+# AGAUCU
+agaucu.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AGAUCUmotif.csv")
+agaucu.mot$high.conf.targets = lookup::lookup(agaucu.mot$GeneID,high.conf$gene.id,high.conf$gene.id)
+agaucu.mot$put.targets = lookup::lookup(agaucu.mot$GeneID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)
+```
+```{r}
+# highconfident targets (auaccc motif)
+df_high_conf_auaccc = auaccc.mot[!is.na(auaccc.mot$high.conf.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AUACCC", set = "Khd4_bound_mRNA")
+# highconfident targets (agaucu motif)
+df_high_conf_agaucuc = agaucu.mot[!is.na(agaucu.mot$high.conf.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AGAUCU", set = "Khd4_bound_mRNA")
+# putative targets (auaccc motif)
+df_put_auaccc = auaccc.mot[!is.na(auaccc.mot$put.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AUACCC", set = "putative_targets")
+# putative targets (agaucu motif)
+df_put_agaucuc = agaucu.mot[!is.na(agaucu.mot$put.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AGAUCU", set = "putative_targets")
+df = rbind(df_high_conf_auaccc,df_high_conf_agaucuc,df_put_auaccc,df_put_agaucuc)
+ggplot(df, aes(x=factor(set, levels=c("Khd4_bound_mRNA", "putative_targets")), y = Freq, fill = factor(variable, levels=c("5UTR", "ORF", "3UTR"))))+
+  geom_bar(position = position_fill(), stat = "identity")+
+  scale_fill_brewer(palette = "Greens")+
+  facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+
+  labs(x="",
+       y = "% with motif",
+       fill= "transcript region")+
+  myTheme1
+```
+```{r}
+sessionInfo()
+```