diff --git a/.gitattributes b/.gitattributes index ba00e9cb77c8f4ef85dc9ca6a6c8c1fb57547a5a..8c348403415d8557d3f2b1684d649ad833cac78d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,3 +15,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep2. assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz filter=lfs diff=lfs merge=lfs -text assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text +assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text diff --git a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.html b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.html new file mode 100644 index 0000000000000000000000000000000000000000..4b6eaf6eac240235bef8d2a0f594a94a6a576491 --- /dev/null +++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.html @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac165db84f0b1ccd9ade281fcfe67e582559acc09e488e301d516d218728a621 +size 1528260 diff --git a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.qmd b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.qmd new file mode 100644 index 0000000000000000000000000000000000000000..de0f6003c4a8dcd8b528a79c8d1c1e2b853d99ef --- /dev/null +++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp editing sites are proximal to the AUACCC motif.qmd @@ -0,0 +1,668 @@ +--- +title: "Khd4-Ada-Gfp editing sites are proximal to AUACCC motif" +date: last-modified +author: + - name: "Srimeenakshi Sankaranarayanan" + +title-block-banner: true +format: + html: + theme: flatly + self-contained: true + code-fold: true + code-tools: true + code-summary: "Show the code" + toc: true + number-sections: true + anchor-sections: true +editor: visual +execute: + echo: true + error: false + warning: false + message: false +--- + +```{r} +# library +library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome +library(ggplot2) +library(BindingSiteFinder) +library(rtracklayer) +library(ComplexHeatmap) +library(GenomicFeatures) +library(forcats) +library(tidyr) +library(dplyr) +library(tidyverse) +library(GenomicRanges) +library(magick) +library(magrittr) +library(gridExtra) +library(IRanges) +library(Biostrings) +library(ggpp) +library(gginnards) +library(ggrepel) +library(ggpubr) +library(ggforce) +library(ggrastr) +library(viridis) +library(reshape2) +library(gprofiler2) +library(ggsci) +library(ggh4x) +library(ggplotify) +library(gridExtra) +library(circlize) +library(EnrichedHeatmap) +library(UpSetR) +library(kableExtra) +library(cowplot) +library(rstatix) +library(beeswarm) +library(clusterProfiler) +library(ggseqlogo) +library(tidyHeatmap) +library(paletteer) +library(ggvenn) +library(colorspace) +library(ggpointdensity) +library(lookup) +library(rstatix) +library(ggrepel) + +``` + +```{r} +load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds") +``` + +Identifying the positions of RNA sequence to be tested across the genome. + +```{r} +################################################################################ +#AUACCC +################################################################################ + +# get gene sequences +gene.seqs = getSeq(Umaydis, genes.GR) + +# find AUACCC motifs +motif.auaccc = DNAString("ATACCC") + +# find all AUACCC motifs in the genome +auaccc.gr = vmatchPattern(motif.auaccc, Umaydis) + +# count motifs in genes +genes.GR$AUACCC = countOverlaps(genes.GR, auaccc.gr) +#table(genes.GR$AUACCC) + +# assign motifs to genes +ov = findOverlaps(auaccc.gr, genes.GR) + +# check whether motifs overlap with more than one gene +#table(duplicated(from(ov))) + +# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes +auaccc.gr = auaccc.gr[from(ov)] +auaccc.gr$gene.id = genes.GR$gene_id[to(ov)] + +# count motifs in transcript regions +#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.) +#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.) + + +################################################################################ +#AGAUCU +################################################################################ + +# find AGAUCU motifs +motif.ctrl = DNAString("AGATCT") +agaucu.gr = vmatchPattern(motif.ctrl, Umaydis) +genes.GR$AGAUCU = countOverlaps(genes.GR, agaucu.gr) +#table(genes.GR$AGAUCU) + +# assign motifs to genes +ov = findOverlaps(agaucu.gr, genes.GR) + +# check whether motifs overlap with more than one gene +#table(duplicated(from(ov))) + +# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes +agaucu.gr = agaucu.gr[from(ov)] +agaucu.gr$gene.id = genes.GR$gene_id[to(ov)] + +# count motifs in transcript regions +#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.) +#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.) + + +################################################################################ +#GGGTAT +################################################################################ + +# find GGGTAT motifs +motif.ctrl = DNAString("GGGTAT") +gggtat.gr = vmatchPattern(motif.ctrl, Umaydis) +genes.GR$GGGTAT = countOverlaps(genes.GR, gggtat.gr) +#table(genes.GR$GGGTAT) + +# assign motifs to genes +ov = findOverlaps(gggtat.gr, genes.GR) + +# check whether motifs overlap with more than one gene +#table(duplicated(from(ov))) + +# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes +gggtat.gr = gggtat.gr[from(ov)] +gggtat.gr$gene.id = genes.GR$gene_id[to(ov)] + +# count motifs in transcript regions +#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.) +#findOverlaps(gggtat.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.) + + +################################################################################ +#ACACUC +################################################################################ + +# find AGAUCU motifs +motif.ctrl = DNAString("ACACTC") +acacuc.gr = vmatchPattern(motif.ctrl, Umaydis) +genes.GR$ACACUC = countOverlaps(genes.GR, acacuc.gr) +#table(genes.GR$ACACUC) + +# assign motifs to genes +ov = findOverlaps(acacuc.gr, genes.GR) + +# check whether motifs overlap with more than one gene +#table(duplicated(from(ov))) + +# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes +acacuc.gr = acacuc.gr[from(ov)] +acacuc.gr$gene.id = genes.GR$gene_id[to(ov)] + +# count motifs in transcript regions +#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.) +#findOverlaps(acacuc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.) +``` + +## Motif enrichment + +```{r} +# check that numbers are same as in Venn +#sapply(khd4.vs.ctrl, length) + +names(khd4.vs.ctrl) = c("Khd4-Ada-Gfp-unique", "overlap", "control-Ada unique", "all mRNAs") + +# table of AUACCC +tab.auaccc = sapply(khd4.vs.ctrl, function(gene.set){ + motifs = genes.GR$AUACCC[genes.GR$gene_id %in% gene.set] + tab = table(motifs > 0) + return(tab) +}) %>% melt(.) +names(tab.auaccc) = c("with.motif", "set", "no.genes") + +# table of AGAUCU +tab.agaucu = sapply(khd4.vs.ctrl, function(gene.set){ + motifs = genes.GR$AGAUCU[genes.GR$gene_id %in% gene.set] + tab = table(motifs > 0) + return(tab) +}) %>% melt(.) +names(tab.agaucu) = c("with.motif", "set", "no.genes") + +# table of GGGTAT +tab.gggtat = sapply(khd4.vs.ctrl, function(gene.set){ + motifs = genes.GR$GGGTAT[genes.GR$gene_id %in% gene.set] + tab = table(motifs > 0) + return(tab) +}) %>% melt(.) +names(tab.gggtat) = c("with.motif", "set", "no.genes") + +# table of CCCAUA +tab.acacuc = sapply(khd4.vs.ctrl, function(gene.set){ + motifs = genes.GR$ACACUC[genes.GR$gene_id %in% gene.set] + tab = table(motifs > 0) + return(tab) +}) %>% melt(.) +names(tab.acacuc) = c("with.motif", "set", "no.genes") + + +# stacked barchart (scaled to 100%) +pl3 = ggplot(tab.auaccc, aes(x=set, y=no.genes, fill=with.motif)) + + geom_bar(position="fill", stat="identity") + + scale_fill_manual(values=c('TRUE' = auaccc.col, 'FALSE' = "lightgrey")) + + ggtitle("AUACCC")+ + labs(x="", + y="% with motif")+ + myTheme1+ + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) + + +# stacked barchart (scaled to 100%) +pl4 = ggplot(tab.agaucu, aes(x=set, y=no.genes, fill=with.motif)) + + geom_bar(position="fill", stat="identity") + + scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) + + ggtitle("AGAUCU")+ + labs(x="", + y="% with motif")+ + myTheme1+ + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) + +# stacked barchart (scaled to 100%) +pl5 = ggplot(tab.gggtat, aes(x=set, y=no.genes, fill=with.motif)) + + geom_bar(position="fill", stat="identity") + + scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) + + ggtitle("GGGTAT")+ + labs(x="", + y="% with motif")+ + myTheme1+ + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) + +# stacked barchart (scaled to 100%) +pl6 = ggplot(tab.acacuc, aes(x=set, y=no.genes, fill=with.motif)) + + geom_bar(position="fill", stat="identity") + + scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) + + ggtitle("ACAUCU")+ + labs(x="", + y="% with motif")+ + myTheme1+ + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) + +# FigS7A + +pl<-ggarrange(pl3, pl4, pl5, pl6, + labels = LETTERS[1:4], + nrow = 1, common.legend = TRUE) + +pl +``` + +Fischer's exact test was performed to calculate the enrichment of motif between datasets from the previous section. Note that I made the dataframe by manually entering the number from tab.motif variable from above. + +```{r} +#Pvalue calculation +# isolating the motif number in each set that is with or without motif + +################################################################################ +#AUACCC motif +################################################################################ + +khd4.uni=c(157,36) +shared=c(60,42) +ctrl.uni=c(226,682) +all=c(1675,5256) + +enrmotif= data.frame(khd4.uni,shared,ctrl.uni,all) # make dataframe + +row.names(enrmotif)=c("auaccc", "nonauaccc") # assign rownames + +# fischers test +chisq.khd4Mot=enrmotif[,c(1,4)] +chisq.sharedMot=enrmotif[,c(2,4)] +chisq.ctrlMot=enrmotif[,c(3,4)] + +#fischers test +fisher.test(chisq.khd4Mot) # khd4unique vs all +fisher.test(chisq.sharedMot) # shared vs all +fisher.test(chisq.ctrlMot) # conrol vs all + +################################################################################ +#AGAUCU +################################################################################ + +mkhd4.uni=c(102,91) +mshared=c(50,52) +mctrl.uni=c(438,470) +mall=c(2931,4000) + +menrmotif= data.frame(mkhd4.uni,mshared,mctrl.uni,mall) # make dataframe + +row.names(menrmotif)=c("agaucu", "nonagaucu") # assign rownames + +# fischers test +chisq.mkhd4Mot=menrmotif[,c(1,4)] +chisq.msharedMot=menrmotif[,c(2,4)] +chisq.mctrlMot=menrmotif[,c(3,4)] + + +#fischers test +fisher.test(chisq.mkhd4Mot) # khd4unique vs all +fisher.test(chisq.msharedMot) # shared vs all +fisher.test(chisq.mctrlMot) # conrol vs all + +################################################################################ +#GGGUAU motif +################################################################################ + +m2khd4.uni=c(47,146) +m2shared=c(31,71) +m2ctrl.uni=c(232,676) +m2all=c(1496,5435) + +m2enrmotif= data.frame(m2khd4.uni,m2shared,m2ctrl.uni,m2all) # make dataframe + +row.names(m2enrmotif)=c("cccaua", "nonagaucu") # assign rownames + +# fischers test +chisq.m2khd4Mot=m2enrmotif[,c(1,4)] +chisq.m2sharedMot=m2enrmotif[,c(2,4)] +chisq.m2ctrlMot=m2enrmotif[,c(3,4)] +chisq.m2ctrlMot_unimot=m2enrmotif[,c(1,3)] + + +#fischers test +fisher.test(chisq.m2khd4Mot) # khd4unique vs all +fisher.test(chisq.m2sharedMot) # shared vs all +fisher.test(chisq.m2ctrlMot) # conrol vs all +fisher.test(chisq.m2ctrlMot_unimot) # khd4unique vs ctrl + +################################################################################ +#ACACUC motif +################################################################################ + +m3khd4.uni=c(115,78) +m3shared=c(49,53) +m3ctrl.uni=c(521,387) +m3all=c(3482,3449) + +m3enrmotif= data.frame(m3khd4.uni,m3shared,m3ctrl.uni,m3all) # make dataframe + +row.names(m3enrmotif)=c("cccaua", "nonagaucu") # assign rownames + +# fischers test +chisq.m3khd4Mot=m3enrmotif[,c(1,4)] +chisq.m3sharedMot=m3enrmotif[,c(2,4)] +chisq.m3ctrlMot=m3enrmotif[,c(3,4)] +chisq.m3ctrlMot_unimot=m3enrmotif[,c(1,3)] + + +#fischers test +fisher.test(chisq.m3khd4Mot) # khd4unique vs all +fisher.test(chisq.m3sharedMot) # shared vs all +fisher.test(chisq.m3ctrlMot) # conrol vs all +fisher.test(chisq.m3ctrlMot_unimot) # khd4unique vs ctrl +``` + +## Distance from editing sites to motifs + +Only editing sites on mRNAs specific to each dataset were considered + +```{r} +names(hyper.rep) = c("Khd4_Ada_Gfp", "control_Ada") + + +pf = lapply(names(hyper.rep), function(i){ + #i = "khd4" + hyper = hyper.rep[[i]] + head(hyper) + + # calculate distance to nearest AUACCC motif + dists.auacc = distanceToNearest(hyper, auaccc.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)] + table(sel) + dists.auacc= subset(dists.auacc, sel) + mcols(dists.auacc)$motif = "AUACCC" + + # calculate distance to nearest AGAUCU motif + dists.agaucu = distanceToNearest(hyper, agaucu.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)] + table(sel) + dists.agaucu= subset(dists.agaucu, sel) + mcols(dists.agaucu)$motif = "AGAUCU" + + # calculate distance to nearest CCCAUA motif + dists.gggtat = distanceToNearest(hyper, gggtat.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.gggtat)] == gggtat.gr$gene.id[to(dists.gggtat)] + table(sel) + dists.gggtat = subset(dists.gggtat, sel) + mcols(dists.gggtat)$motif = "GGGTAT" + + # calculate distance to nearest ACACUC motif + dists.acacuc = distanceToNearest(hyper, acacuc.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.acacuc)] == acacuc.gr$gene.id[to(dists.acacuc)] + table(sel) + dists.acacuc = subset(dists.acacuc, sel) + mcols(dists.acacuc)$motif = "ACACUC" + + # get stats of distances + summary(mcols(dists.auacc)$distance) + summary(mcols(dists.agaucu)$distance) + summary(mcols(dists.gggtat)$distance) + summary(mcols(dists.acacuc)$distance) + + pf = rbind(mcols(dists.auacc), mcols(dists.agaucu), mcols(dists.gggtat), mcols(dists.acacuc)) %>% as.data.frame() + pf$set = i + + return(pf) +}) + +pf = do.call(rbind, pf) + +# Histogram +pl4 = ggplot(pf, aes(x=distance, fill=set)) + + geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") + + coord_cartesian(xlim=c(0, 2500)) + + scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) + + theme_bw() + + facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU", "GGGTAT", "ACACUC")))+myTheme1 +pl4 +``` + +```{r} +################################################################################ +# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts +################################################################################ + +# motif logo with u in it +# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCC +A= c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524 +) +C= c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476 +) +G = c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000 +) +U = c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000 +) + +pwDF1bU = rbind(A,C,G,U) + + +# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVH + +A= c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404 +) +C= c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852 +) +G = c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894 +) +U = c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849 +) + +pwDF2bU = rbind(A,C,G,U) + + +# MoUif-4 UP% = 65.8 ENR-RaUio = 1.46 UHCCUCUYCHCCMUC +A= c(0.197917,0.229167,0.000000,0.000000,0.072917,0.000000,0.052083,0.000000,0.020833,0.416667,0.000000,0.145833,0.458333,0.000000,0.000000) +C= c( 0.208333,0.427083,0.770833,0.906250,0.270833,0.739583,0.145833,0.333333,0.979167,0.260417,0.677083,0.697917,0.354167,0.312500,0.979167) +G = c(0.125000,0.000000,0.062500,0.000000,0.093750,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.156250,0.114583,0.000000,0.020833) +U = c(0.468750,0.343750,0.166667,0.093750,0.562500,0.260417,0.739583,0.666667,0.000000,0.322917,0.322917,0.000000,0.072917,0.687500,0.000000) + +pwDF4bU = rbind(A,C,G,U) + + + + + +m1 = ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold") + +m2 = ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold") + +m4 = ggseqlogo(pwDF4bU, col_scheme="nucleotide", font="helvetica_bold") + +pl = ggarrange(m1,m2,m4, + labels = LETTERS[1:3], + nrow = 1) + +pl = annotate_figure(pl, top = text_grob("de novo discovered motif on Khd4-Ada-Gfp edited transcripts", color = "red", face = "bold", size = 14)) + +pl + + +``` + +## Using motif position weight matrix to look for their occurence in the genome + +As a next step, we will check the distribution of the identified motif. For this purpose, i will use *matchPWM()* function for motif occurence. First I will use AUACCC to see if the function can indeed work! + +```{r} + +# function to find motif occurence +# define a function for identifying the motif occurence +getmotgr=function(seq, pwm, gr_plus, gr_minus){ + +# motif in sense strand +matches_list_strand = lapply(seq, matchPWM, pwm=pwm) +# apply to all +test = lapply(names(matches_list_strand), function (i){ + uc = matches_list_strand[[i]] + df= as.data.frame(uc) + df$chr = i # make a column specifying chromosome name + df = cbind.DataFrame(df) + return(df) +}) +# rbind the resulting df +df1 = do.call(rbind, test) %>% as(., "GRanges") +# find overlaps +ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE) +df1 = df1[from(ov)] +df1$gene.id = gr_plus$gene_id[to(ov)] +strand(df1) =strand(gr_plus)[to(ov)] + +# motif in antisense strand +matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm)) +# apply to all +test1 = lapply(names(matches_list_antistrand), function (i){ + uc1 = matches_list_antistrand[[i]] + df1= as.data.frame(uc1) + df1$chr = i + df1 = cbind.DataFrame(df1) + return(df1) +}) +# rbind the resulting df +df2 = do.call(rbind, test1) %>% as(., "GRanges") +# find overlaps +ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE) +df2= df2[from(ov2)] +df2$gene.id = gr_minus$gene_id[to(ov2)] +strand(df2) =strand(gr_minus)[to(ov2)] +df3 = c(df1,df2) %>% as(., "GRanges") +return(df3) +} + +``` + +```{r, warning=FALSE} + +# get sequneces of chromosome +uma.seq = getSeq(Umaydis) + +# isolate + and - strand separtely +# + strand +# strand specific data of GTF file +# plus strands +gns_plus = genes.GR[strand(genes.GR) == "+"] + +#minus strand +gns_minus = genes.GR[strand(genes.GR) == "-"] + +``` + +```{r} + +# convert RNA string to DNA string +rownames(pwDF1bU) = c("A", "C", "G", "T") +rownames(pwDF4bU) = c("A", "C", "G", "T") +``` + +```{r, warning=FALSE} +# remove the um_scaf_contigs because they causes problem downstream +uma.seq = uma.seq[-c(24,25,26,27)] + +#HMHHKAUACCC.gr +HMHHKAUACCC.gr = getmotgr(uma.seq, pwm=pwDF1bU, gns_plus, gns_minus) + +#UHCCUCUYCHCCMUC.gr +UHCCUCUYCHCCMUC.gr = getmotgr(uma.seq, pwm=pwDF4bU, gns_plus, gns_minus) + +``` + +Here I am going to find the distribution of the most enriched **HMHHKAUACCC** and frequently occurring **UHCCUCUYCHCCMUC** motifs from Khd4-Ada-Gfp editing sites. + +```{r} +pf = lapply(names(hyper.rep), function(i){ + #i = "khd4" + hyper = hyper.rep[[i]] + head(hyper) + + # calculate distance to nearest AUACCC motif + dists.HMHHKAUACCC = distanceToNearest(hyper, HMHHKAUACCC.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.HMHHKAUACCC)] == HMHHKAUACCC.gr$gene.id[to(dists.HMHHKAUACCC)] + table(sel) + dists.HMHHKAUACCC= subset(dists.HMHHKAUACCC, sel) + mcols(dists.HMHHKAUACCC)$motif = "HMHHKAUACCC" + + # calculate distance to nearest UHCCUCUYCHCCMUC.gr motif + dists.UHCCUCUYCHCCMUC = distanceToNearest(hyper, UHCCUCUYCHCCMUC.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.UHCCUCUYCHCCMUC)] == UHCCUCUYCHCCMUC.gr$gene.id[to(dists.UHCCUCUYCHCCMUC)] + table(sel) + dists.agcagcagc = subset(dists.UHCCUCUYCHCCMUC, sel) + mcols(dists.UHCCUCUYCHCCMUC)$motif = "UHCCUCUYCHCCMUC" + + + # get stats of distances + summary(mcols(dists.HMHHKAUACCC)$distance) + summary(mcols(dists.UHCCUCUYCHCCMUC)$distance) + + + pf = rbind(mcols(dists.HMHHKAUACCC),mcols(dists.UHCCUCUYCHCCMUC)) %>% as.data.frame() + pf$set = i + + return(pf) +}) + +pf = do.call(rbind, pf) + + +pl4 = ggplot(pf, aes(x=distance, fill=set)) + + geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") + + coord_cartesian(xlim=c(0, 2500), ylim=c(0,0.004)) + + scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) + + labs(x="Distance to the neares motif (nt)", + y = "motif density")+ + theme_bw() + + facet_wrap(~motif)+myTheme1 + + +ggarrange(pl4, + labels = LETTERS[1], + ncol = 1, nrow = 1) + +``` + +```{r} +sessionInfo() +```