upload motif analysis report

432c04ca · Srimeenakshi Sankaranarayanan · 838a6a8d · 432c04ca · 432c04ca · 432c04ca
Commit 432c04ca authored 10 months ago by Srimeenakshi Sankaranarayanan
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,3 +15,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep2.
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz filter=lfs diff=lfs merge=lfs -text
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text
 assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text
+assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.html
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.html
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.qmd
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.qmd
+---
+title: "Khd4-Ada-Gfp editing sites are proximal to AUACCC motif"
+date: last-modified
+author: 
+    - name:  "Srimeenakshi Sankaranarayanan"
+      
+title-block-banner: true
+format: 
+    html:
+        theme: flatly
+        self-contained: true
+        code-fold: true
+        code-tools: true
+        code-summary: "Show the code"
+        toc: true
+        number-sections: true
+        anchor-sections: true
+editor: visual
+execute:
+  echo: true
+  error: false
+  warning: false
+  message: false
+---
+
+```{r}
+# library
+library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
+library(ggplot2)
+library(BindingSiteFinder)
+library(rtracklayer)
+library(ComplexHeatmap)
+library(GenomicFeatures)
+library(forcats)
+library(tidyr)
+library(dplyr)
+library(tidyverse)
+library(GenomicRanges)
+library(magick)
+library(magrittr)
+library(gridExtra)
+library(IRanges)
+library(Biostrings)
+library(ggpp)
+library(gginnards)
+library(ggrepel)
+library(ggpubr)
+library(ggforce)
+library(ggrastr)
+library(viridis)
+library(reshape2)
+library(gprofiler2)
+library(ggsci)
+library(ggh4x)
+library(ggplotify)
+library(gridExtra)
+library(circlize)
+library(EnrichedHeatmap)
+library(UpSetR)
+library(kableExtra)
+library(cowplot)
+library(rstatix)
+library(beeswarm)
+library(clusterProfiler)
+library(ggseqlogo)
+library(tidyHeatmap)
+library(paletteer)
+library(ggvenn)
+library(colorspace)
+library(ggpointdensity)
+library(lookup)
+library(rstatix)
+library(ggrepel)
+
+```
+
+```{r}
+load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
+```
+
+Identifying the positions of RNA sequence to be tested across the genome.
+
+```{r}
+################################################################################
+#AUACCC
+################################################################################
+
+# get gene sequences
+gene.seqs = getSeq(Umaydis, genes.GR)
+
+# find AUACCC motifs
+motif.auaccc = DNAString("ATACCC")
+
+# find all AUACCC motifs in the genome
+auaccc.gr = vmatchPattern(motif.auaccc, Umaydis)
+
+# count motifs in genes
+genes.GR$AUACCC = countOverlaps(genes.GR, auaccc.gr)
+#table(genes.GR$AUACCC)
+
+# assign motifs to genes
+ov = findOverlaps(auaccc.gr, genes.GR)
+
+# check whether motifs overlap with more than one gene
+#table(duplicated(from(ov)))
+
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+auaccc.gr = auaccc.gr[from(ov)]
+auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]
+
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+
+
+################################################################################
+#AGAUCU
+################################################################################
+
+# find AGAUCU motifs
+motif.ctrl = DNAString("AGATCT")
+agaucu.gr = vmatchPattern(motif.ctrl, Umaydis)
+genes.GR$AGAUCU = countOverlaps(genes.GR, agaucu.gr)
+#table(genes.GR$AGAUCU)
+
+# assign motifs to genes
+ov = findOverlaps(agaucu.gr, genes.GR)
+
+# check whether motifs overlap with more than one gene
+#table(duplicated(from(ov)))
+
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+agaucu.gr = agaucu.gr[from(ov)]
+agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]
+
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+
+
+################################################################################
+#GGGTAT
+################################################################################
+
+# find GGGTAT motifs
+motif.ctrl = DNAString("GGGTAT")
+gggtat.gr = vmatchPattern(motif.ctrl, Umaydis)
+genes.GR$GGGTAT = countOverlaps(genes.GR, gggtat.gr)
+#table(genes.GR$GGGTAT)
+
+# assign motifs to genes
+ov = findOverlaps(gggtat.gr, genes.GR)
+
+# check whether motifs overlap with more than one gene
+#table(duplicated(from(ov)))
+
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+gggtat.gr = gggtat.gr[from(ov)]
+gggtat.gr$gene.id = genes.GR$gene_id[to(ov)]
+
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(gggtat.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+
+
+################################################################################
+#ACACUC
+################################################################################
+
+# find AGAUCU motifs
+motif.ctrl = DNAString("ACACTC")
+acacuc.gr = vmatchPattern(motif.ctrl, Umaydis)
+genes.GR$ACACUC = countOverlaps(genes.GR, acacuc.gr)
+#table(genes.GR$ACACUC)
+
+# assign motifs to genes
+ov = findOverlaps(acacuc.gr, genes.GR)
+
+# check whether motifs overlap with more than one gene
+#table(duplicated(from(ov)))
+
+# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
+acacuc.gr = acacuc.gr[from(ov)]
+acacuc.gr$gene.id = genes.GR$gene_id[to(ov)]
+
+# count motifs in transcript regions
+#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
+#findOverlaps(acacuc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
+```
+
+## Motif enrichment
+
+```{r}
+# check that numbers are same as in Venn
+#sapply(khd4.vs.ctrl, length)
+
+names(khd4.vs.ctrl) = c("Khd4-Ada-Gfp-unique", "overlap", "control-Ada unique", "all mRNAs")
+
+# table of AUACCC
+tab.auaccc = sapply(khd4.vs.ctrl, function(gene.set){
+  motifs = genes.GR$AUACCC[genes.GR$gene_id %in% gene.set]
+  tab = table(motifs > 0)
+  return(tab)
+}) %>% melt(.)
+names(tab.auaccc) = c("with.motif", "set", "no.genes")
+
+# table of AGAUCU
+tab.agaucu = sapply(khd4.vs.ctrl, function(gene.set){
+  motifs = genes.GR$AGAUCU[genes.GR$gene_id %in% gene.set]
+  tab = table(motifs > 0)
+  return(tab)
+}) %>% melt(.)
+names(tab.agaucu) = c("with.motif", "set", "no.genes")
+
+# table of GGGTAT
+tab.gggtat = sapply(khd4.vs.ctrl, function(gene.set){
+  motifs = genes.GR$GGGTAT[genes.GR$gene_id %in% gene.set]
+  tab = table(motifs > 0)
+  return(tab)
+}) %>% melt(.)
+names(tab.gggtat) = c("with.motif", "set", "no.genes")
+
+# table of CCCAUA
+tab.acacuc = sapply(khd4.vs.ctrl, function(gene.set){
+  motifs = genes.GR$ACACUC[genes.GR$gene_id %in% gene.set]
+  tab = table(motifs > 0)
+  return(tab)
+}) %>% melt(.)
+names(tab.acacuc) = c("with.motif", "set", "no.genes")
+
+
+# stacked barchart (scaled to 100%)
+pl3 = ggplot(tab.auaccc, aes(x=set, y=no.genes, fill=with.motif)) +
+  geom_bar(position="fill", stat="identity") +
+  scale_fill_manual(values=c('TRUE' = auaccc.col, 'FALSE' = "lightgrey")) +
+  ggtitle("AUACCC")+
+  labs(x="",
+       y="% with motif")+
+  myTheme1+
+  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
+
+
+# stacked barchart (scaled to 100%)
+pl4 = ggplot(tab.agaucu, aes(x=set, y=no.genes, fill=with.motif)) +
+  geom_bar(position="fill", stat="identity") +
+  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
+  ggtitle("AGAUCU")+
+  labs(x="",
+       y="% with motif")+
+  myTheme1+
+  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
+
+# stacked barchart (scaled to 100%)
+pl5 = ggplot(tab.gggtat, aes(x=set, y=no.genes, fill=with.motif)) +
+  geom_bar(position="fill", stat="identity") +
+  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
+  ggtitle("GGGTAT")+
+  labs(x="",
+       y="% with motif")+
+  myTheme1+
+  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
+
+# stacked barchart (scaled to 100%)
+pl6 = ggplot(tab.acacuc, aes(x=set, y=no.genes, fill=with.motif)) +
+  geom_bar(position="fill", stat="identity") +
+  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
+  ggtitle("ACAUCU")+
+  labs(x="",
+       y="% with motif")+
+  myTheme1+
+  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
+
+# FigS7A
+
+pl<-ggarrange(pl3, pl4, pl5, pl6, 
+          labels = LETTERS[1:4],
+         nrow = 1, common.legend = TRUE)
+
+pl
+```
+
+Fischer's exact test was performed to calculate the enrichment of motif between datasets from the previous section. Note that I made the dataframe by manually entering the number from tab.motif variable from above.
+
+```{r}
+#Pvalue calculation
+# isolating the motif number in each set that is with or without motif
+
+################################################################################
+#AUACCC motif
+################################################################################
+
+khd4.uni=c(157,36)
+shared=c(60,42)
+ctrl.uni=c(226,682)
+all=c(1675,5256)
+
+enrmotif= data.frame(khd4.uni,shared,ctrl.uni,all) # make dataframe
+
+row.names(enrmotif)=c("auaccc", "nonauaccc") # assign rownames
+
+# fischers test
+chisq.khd4Mot=enrmotif[,c(1,4)]
+chisq.sharedMot=enrmotif[,c(2,4)]
+chisq.ctrlMot=enrmotif[,c(3,4)]
+
+#fischers test
+fisher.test(chisq.khd4Mot) # khd4unique vs all
+fisher.test(chisq.sharedMot) # shared vs all
+fisher.test(chisq.ctrlMot) # conrol vs all
+
+################################################################################
+#AGAUCU
+################################################################################
+
+mkhd4.uni=c(102,91)
+mshared=c(50,52)
+mctrl.uni=c(438,470)
+mall=c(2931,4000)
+
+menrmotif= data.frame(mkhd4.uni,mshared,mctrl.uni,mall) # make dataframe
+
+row.names(menrmotif)=c("agaucu", "nonagaucu") # assign rownames
+
+# fischers test
+chisq.mkhd4Mot=menrmotif[,c(1,4)]
+chisq.msharedMot=menrmotif[,c(2,4)]
+chisq.mctrlMot=menrmotif[,c(3,4)]
+
+
+#fischers test
+fisher.test(chisq.mkhd4Mot) # khd4unique vs all
+fisher.test(chisq.msharedMot) # shared vs all
+fisher.test(chisq.mctrlMot) # conrol vs all
+
+################################################################################
+#GGGUAU motif
+################################################################################
+
+m2khd4.uni=c(47,146)
+m2shared=c(31,71)
+m2ctrl.uni=c(232,676)
+m2all=c(1496,5435)
+
+m2enrmotif= data.frame(m2khd4.uni,m2shared,m2ctrl.uni,m2all) # make dataframe
+
+row.names(m2enrmotif)=c("cccaua", "nonagaucu") # assign rownames
+
+# fischers test
+chisq.m2khd4Mot=m2enrmotif[,c(1,4)]
+chisq.m2sharedMot=m2enrmotif[,c(2,4)]
+chisq.m2ctrlMot=m2enrmotif[,c(3,4)]
+chisq.m2ctrlMot_unimot=m2enrmotif[,c(1,3)]
+
+
+#fischers test
+fisher.test(chisq.m2khd4Mot) # khd4unique vs all
+fisher.test(chisq.m2sharedMot) # shared vs all
+fisher.test(chisq.m2ctrlMot) # conrol vs all
+fisher.test(chisq.m2ctrlMot_unimot) # khd4unique vs ctrl
+
+################################################################################
+#ACACUC motif
+################################################################################
+
+m3khd4.uni=c(115,78)
+m3shared=c(49,53)
+m3ctrl.uni=c(521,387)
+m3all=c(3482,3449)
+
+m3enrmotif= data.frame(m3khd4.uni,m3shared,m3ctrl.uni,m3all) # make dataframe
+
+row.names(m3enrmotif)=c("cccaua", "nonagaucu") # assign rownames
+
+# fischers test
+chisq.m3khd4Mot=m3enrmotif[,c(1,4)]
+chisq.m3sharedMot=m3enrmotif[,c(2,4)]
+chisq.m3ctrlMot=m3enrmotif[,c(3,4)]
+chisq.m3ctrlMot_unimot=m3enrmotif[,c(1,3)]
+
+
+#fischers test
+fisher.test(chisq.m3khd4Mot) # khd4unique vs all
+fisher.test(chisq.m3sharedMot) # shared vs all
+fisher.test(chisq.m3ctrlMot) # conrol vs all
+fisher.test(chisq.m3ctrlMot_unimot) # khd4unique vs ctrl
+```
+
+## Distance from editing sites to motifs
+
+Only editing sites on mRNAs specific to each dataset were considered
+
+```{r}
+names(hyper.rep) = c("Khd4_Ada_Gfp", "control_Ada")
+
+
+pf = lapply(names(hyper.rep), function(i){
+  #i = "khd4"
+  hyper = hyper.rep[[i]]
+  head(hyper)
+  
+  # calculate distance to nearest AUACCC motif
+  dists.auacc = distanceToNearest(hyper, auaccc.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]
+  table(sel)
+  dists.auacc= subset(dists.auacc, sel)
+  mcols(dists.auacc)$motif = "AUACCC"
+  
+  # calculate distance to nearest AGAUCU motif
+  dists.agaucu = distanceToNearest(hyper, agaucu.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]
+  table(sel)
+  dists.agaucu= subset(dists.agaucu, sel)
+  mcols(dists.agaucu)$motif = "AGAUCU"
+  
+  # calculate distance to nearest CCCAUA motif
+  dists.gggtat = distanceToNearest(hyper, gggtat.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.gggtat)] == gggtat.gr$gene.id[to(dists.gggtat)]
+  table(sel)
+  dists.gggtat = subset(dists.gggtat, sel)
+  mcols(dists.gggtat)$motif = "GGGTAT"
+  
+  # calculate distance to nearest ACACUC motif
+  dists.acacuc = distanceToNearest(hyper, acacuc.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.acacuc)] == acacuc.gr$gene.id[to(dists.acacuc)]
+  table(sel)
+  dists.acacuc = subset(dists.acacuc, sel)
+  mcols(dists.acacuc)$motif = "ACACUC"
+  
+  # get stats of distances
+  summary(mcols(dists.auacc)$distance)
+  summary(mcols(dists.agaucu)$distance)
+  summary(mcols(dists.gggtat)$distance)
+  summary(mcols(dists.acacuc)$distance)
+  
+  pf = rbind(mcols(dists.auacc), mcols(dists.agaucu), mcols(dists.gggtat), mcols(dists.acacuc)) %>% as.data.frame()
+  pf$set = i
+  
+  return(pf)
+})
+
+pf = do.call(rbind, pf)
+
+# Histogram
+pl4 = ggplot(pf, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) +
+  theme_bw() +
+  facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU", "GGGTAT", "ACACUC")))+myTheme1
+pl4
+```
+
+```{r}
+################################################################################
+# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts
+################################################################################
+
+# motif logo with u in it
+# Motif-1 TP% = 60.9   ENR-Ratio = 9.40 HMHHKAUACCC
+A= c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524
+)
+C= c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476
+)
+G = c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
+)
+U = c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000
+)
+ 
+pwDF1bU = rbind(A,C,G,U)
+
+
+# MoUif-2 UP% = 59.3   ENR-RaUio = 9.60 HMKAUACCCVH
+
+A= c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404
+)
+C= c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852
+)
+G = c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894
+)
+U = c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849
+)
+ 
+pwDF2bU = rbind(A,C,G,U)
+
+
+# MoUif-4 UP% = 65.8  ENR-RaUio = 1.46 UHCCUCUYCHCCMUC 
+A= c(0.197917,0.229167,0.000000,0.000000,0.072917,0.000000,0.052083,0.000000,0.020833,0.416667,0.000000,0.145833,0.458333,0.000000,0.000000)
+C= c( 0.208333,0.427083,0.770833,0.906250,0.270833,0.739583,0.145833,0.333333,0.979167,0.260417,0.677083,0.697917,0.354167,0.312500,0.979167)
+G = c(0.125000,0.000000,0.062500,0.000000,0.093750,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.156250,0.114583,0.000000,0.020833)
+U = c(0.468750,0.343750,0.166667,0.093750,0.562500,0.260417,0.739583,0.666667,0.000000,0.322917,0.322917,0.000000,0.072917,0.687500,0.000000)
+ 
+pwDF4bU = rbind(A,C,G,U)
+
+
+
+
+
+m1 = ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")
+
+m2 = ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")
+
+m4 = ggseqlogo(pwDF4bU, col_scheme="nucleotide", font="helvetica_bold")
+
+pl = ggarrange(m1,m2,m4,
+               labels = LETTERS[1:3],
+          nrow = 1)
+
+pl = annotate_figure(pl, top = text_grob("de novo discovered motif on Khd4-Ada-Gfp edited transcripts", color = "red", face = "bold", size = 14))
+
+pl
+
+
+```
+
+## Using motif position weight matrix to look for their occurence in the genome
+
+As a next step, we will check the distribution of the identified motif. For this purpose, i will use *matchPWM()* function for motif occurence. First I will use AUACCC to see if the function can indeed work!
+
+```{r}
+
+# function to find motif occurence
+# define a function for identifying the motif occurence
+getmotgr=function(seq, pwm, gr_plus, gr_minus){
+  
+# motif in sense strand
+matches_list_strand = lapply(seq, matchPWM, pwm=pwm)
+# apply to all 
+test = lapply(names(matches_list_strand), function (i){
+  uc = matches_list_strand[[i]]
+  df= as.data.frame(uc) 
+  df$chr = i # make a column specifying chromosome name
+  df = cbind.DataFrame(df)
+  return(df)
+})
+# rbind the resulting df
+df1 = do.call(rbind, test) %>% as(., "GRanges")
+# find overlaps
+ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE)
+df1 = df1[from(ov)]
+df1$gene.id = gr_plus$gene_id[to(ov)]
+strand(df1) =strand(gr_plus)[to(ov)]
+
+# motif in antisense strand
+matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm))
+# apply to all 
+test1 = lapply(names(matches_list_antistrand), function (i){
+  uc1 = matches_list_antistrand[[i]]
+  df1= as.data.frame(uc1) 
+  df1$chr = i
+  df1 = cbind.DataFrame(df1)
+  return(df1)
+})
+# rbind the resulting df
+df2 = do.call(rbind, test1) %>% as(., "GRanges")
+# find overlaps
+ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE)
+df2= df2[from(ov2)]
+df2$gene.id = gr_minus$gene_id[to(ov2)]
+strand(df2) =strand(gr_minus)[to(ov2)]
+df3 = c(df1,df2) %>% as(., "GRanges")
+return(df3)
+}
+
+```
+
+```{r, warning=FALSE}
+
+# get sequneces of chromosome
+uma.seq = getSeq(Umaydis) 
+
+# isolate + and - strand separtely
+# + strand
+# strand specific data of GTF file
+# plus strands
+gns_plus = genes.GR[strand(genes.GR) == "+"]
+
+#minus strand
+gns_minus = genes.GR[strand(genes.GR) == "-"]
+
+```
+
+```{r}
+
+# convert RNA string to DNA string
+rownames(pwDF1bU) = c("A", "C", "G", "T")
+rownames(pwDF4bU) = c("A", "C", "G", "T")
+```
+
+```{r, warning=FALSE}
+# remove the um_scaf_contigs because they causes problem downstream
+uma.seq = uma.seq[-c(24,25,26,27)]
+
+#HMHHKAUACCC.gr 
+HMHHKAUACCC.gr = getmotgr(uma.seq, pwm=pwDF1bU, gns_plus, gns_minus)
+
+#UHCCUCUYCHCCMUC.gr
+UHCCUCUYCHCCMUC.gr = getmotgr(uma.seq, pwm=pwDF4bU, gns_plus, gns_minus)
+
+```
+
+Here I am going to find the distribution of the most enriched **HMHHKAUACCC** and frequently occurring **UHCCUCUYCHCCMUC** motifs from Khd4-Ada-Gfp editing sites.
+
+```{r}
+pf = lapply(names(hyper.rep), function(i){
+  #i = "khd4"
+  hyper = hyper.rep[[i]]
+  head(hyper)
+  
+  # calculate distance to nearest AUACCC motif
+  dists.HMHHKAUACCC = distanceToNearest(hyper, HMHHKAUACCC.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.HMHHKAUACCC)] == HMHHKAUACCC.gr$gene.id[to(dists.HMHHKAUACCC)]
+  table(sel)
+  dists.HMHHKAUACCC= subset(dists.HMHHKAUACCC, sel)
+  mcols(dists.HMHHKAUACCC)$motif = "HMHHKAUACCC"
+  
+  # calculate distance to nearest UHCCUCUYCHCCMUC.gr motif
+  dists.UHCCUCUYCHCCMUC = distanceToNearest(hyper, UHCCUCUYCHCCMUC.gr)
+  
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.UHCCUCUYCHCCMUC)] == UHCCUCUYCHCCMUC.gr$gene.id[to(dists.UHCCUCUYCHCCMUC)]
+  table(sel)
+  dists.agcagcagc = subset(dists.UHCCUCUYCHCCMUC, sel)
+  mcols(dists.UHCCUCUYCHCCMUC)$motif = "UHCCUCUYCHCCMUC"
+  
+  
+  # get stats of distances
+  summary(mcols(dists.HMHHKAUACCC)$distance)
+  summary(mcols(dists.UHCCUCUYCHCCMUC)$distance)
+  
+   
+  pf = rbind(mcols(dists.HMHHKAUACCC),mcols(dists.UHCCUCUYCHCCMUC)) %>% as.data.frame()
+  pf$set = i
+  
+  return(pf)
+})
+
+pf = do.call(rbind, pf)
+
+
+pl4 = ggplot(pf, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500), ylim=c(0,0.004)) +
+  scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) +
+  labs(x="Distance to the neares motif (nt)",
+       y = "motif density")+
+  theme_bw() +
+  facet_wrap(~motif)+myTheme1
+
+
+ggarrange(pl4,  
+          labels = LETTERS[1],
+          ncol = 1, nrow = 1)
+
+```
+
+```{r}
+sessionInfo()
+```