# Venn diaramvenn.list =lapply(hyper.rep, function(hyper){return(hyper$id)})names(venn.list) =c("Khd4-Ada-Gfp", "control-Ada")# FigS6A and Bggvenn(venn.list, names(venn.list), fill_color =c("dodgerblue3", ada.col), set_name_size =6) +ggtitle("Overlap between Khd4-Ada-Gfp and control-Ada")+theme(plot.title =element_text(size =20, hjust =0.5, vjust =3))
Show the code
# Identifying the RNA sequence motif#################################################################################AUACCC################################################################################# get gene sequencesgene.seqs =getSeq(Umaydis, genes.GR)# find AUACCC motifsmotif.auaccc =DNAString("ATACCC")# find all AUACCC motifs in the genomeauaccc.gr =vmatchPattern(motif.auaccc, Umaydis)# count motifs in genesgenes.GR$AUACCC =countOverlaps(genes.GR, auaccc.gr)#table(genes.GR$AUACCC)# assign motifs to genesov =findOverlaps(auaccc.gr, genes.GR)# check whether motifs overlap with more than one genetable(duplicated(from(ov)))
FALSE TRUE
1975 18
Show the code
# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesauaccc.gr = auaccc.gr[from(ov)]auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################AGAUCU################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("AGATCT")agaucu.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$AGAUCU =countOverlaps(genes.GR, agaucu.gr)table(genes.GR$AGAUCU)
# assign motifs to genesov =findOverlaps(agaucu.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesagaucu.gr = agaucu.gr[from(ov)]agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
0.1 De novo motif discovery analysis on sequences carrying unique Khd4–Ada–Gfp editing sites.
XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Khd4-Ada-Gfp editing sites (501-nt) as an input.
First, the Khd4-Ada-Gfp editing sites were extended on each side by 250nt to make a 501-nt window.
Show the code
# keep only id columnhyper.spe =lapply(hyper.rep, function (x){mcols(x) = x[,c(1)]names(x) = x$idreturn(x)})# split the reproducible editing sites that are unique to Khd4 and Control , and sharedhyper.specific =list(Khd4.specific = hyper.spe$khd4[!(hyper.spe$khd4$id %in% hyper.spe$ctrl$id)],shared = hyper.spe$khd4[hyper.spe$khd4$id %in% hyper.spe$ctrl$id],ctrl.specific = hyper.spe$ctrl[!(hyper.spe$ctrl$id %in% hyper.spe$khd4$id)])# as a GRanges listhyper.specific =as(hyper.specific, "GRangesList")# extend the window by 250 bp on both sideshyper_250 =lapply(hyper.specific, function(x){ x = x+250# window 250 bp})# get fastafiles# get sequencekhd4.specific = Biostrings::getSeq(Umaydis, hyper_250$Khd4.specific)khd4.ctrl.shared = Biostrings::getSeq(Umaydis, hyper_250$shared)ctrl.specific = Biostrings::getSeq(Umaydis, hyper_250$ctrl.specific)# export the fasta filesoutput ="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"writeXStringSet(khd4.specific, filepath =paste0(output,"khd4.specific_250.fasta"))writeXStringSet(khd4.ctrl.shared, filepath =paste0(output,"khd4.ctrl.shared_250.fasta"))writeXStringSet(ctrl.specific, filepath =paste0(output,"ctrl.specific_250.fasta"))
Scatter plot comparing the relative enrichment ratio of enriched motifs with the percentage of each motif in the tested sequences.
Make the sequence logo of the most enriched two motifs
Show the code
################################################################################# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts################################################################################# motif logo with u in it# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCCA=c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524)C=c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476)G =c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000)U =c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000)pwDF1bU =rbind(A,C,G,U)# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVHA=c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404)C=c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852)G =c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894)U =c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849)pwDF2bU =rbind(A,C,G,U)# make ggseqlogom1 =ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")m2 =ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")pl =ggarrange(m1,m2,labels = LETTERS[1:2],nrow =1)pl
Next I want to select mRNA targets that are specific to Khd4-Ada-Gfp and control-Ada data
Show the code
# Khd4.speKhd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]# mRNAs specific to controlctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]# make a list with mRNAs specific to each data set# listhyper.spe =list(Khd4_Ada_Gfp = Khd4.spe,control_Ada = ctrl.spe.tx) %>%as(., "GRangesList")
0.2 Distribution of editing sites from the nearest AUACCC motif
Only editing sites on mRNAs specific to each dataset were considered
Show the code
pf =lapply(names(hyper.spe), function(i){#i = "khd4" hyper = hyper.spe[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.auacc =distanceToNearest(hyper, auaccc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]table(sel) dists.auacc=subset(dists.auacc, sel)mcols(dists.auacc)$motif ="AUACCC"# calculate distance to nearest AGAUCU motif dists.agaucu =distanceToNearest(hyper, agaucu.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]table(sel) dists.agaucu=subset(dists.agaucu, sel)mcols(dists.agaucu)$motif ="AGAUCU"# get stats of distancessummary(mcols(dists.auacc)$distance)summary(mcols(dists.agaucu)$distance) pf =rbind(mcols(dists.auacc), mcols(dists.agaucu)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)# FigS7B# Histogrampl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram( binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey60")) +labs(x="distance of editing sites to the nearest motif (nt)",y="motif density")+theme_bw() +facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+myTheme1pl4
Show the code
# median distance between editing sites and the motif#Khd4-Ada-GfpKhd4_auaccc=filter(pf, set =="khd4", motif =="AUACCC") med_K_auaccc =median(Khd4_auaccc$distance)#Control-AdaCtrl_auaccc=filter(pf, set =="ctrl", motif =="AUACCC") med_C_auaccc =median(Ctrl_auaccc$distance)data.frame(Khd4_Ada_Gfp = med_K_auaccc,control_Ada = med_C_auaccc) %>%kbl(caption ="median distance between auaccc and editing events",html_font ="Cambria", bold = T, color="black") %>%kable_styling(bootstrap_options ="striped", "hover", full_width = F, html_font ="Cambria") %>%column_spec(1, bold = T)
median distance between auaccc and editing events
Khd4_Ada_Gfp
control_Ada
NA
NA
0.3 Studying the enrichment of motifs in mRNAs specific to each datasets
Here, we consider only the editing events on Khd4-bound mRNAs and on mRNAs specific to control-Ada. Khd4-bound mRNAs- All mRNAs that carry reproducible Khd4-Ada-Gfp editing sites with atleast one AUACCC motif. Control-Ada mRNAs- mRNAs that have reproducible control-Ada editing sites but no Khd4-Ada-Gfp sites
Show the code
# Khd4-bound mRNAsKhd4_boundRNA= hyper.rep$khd4[(hyper.rep$khd4$gene.id %in% auaccc.gr$gene.id)]# mRNAs specific to controlctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]#check if the numbers are similar to the venn diagram#n_distinct(khd4.spe.tx$gene.id)# make a list with mRNAs specific to each data setKhd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]# listhyper.rep.txspe =list(khd4 = Khd4_boundRNA,ctrl = ctrl.spe.tx) %>%as(., "GRangesList")
1 Analyis of the high-confidence targets
Show the code
# only Khd4khd4.rep.edit=hyper.rep$khd4motifs=genes.GRkhd4.rep.edit$AUACCC=lookup(khd4.rep.edit$gene.id, motifs$gene_id,motifs$AUACCC)#granges of high confidence Khd4high.conf.khd4=subset(khd4.rep.edit,AUACCC>0)high.conf = high.conf.khd4
1.1 Transcript region enrichment of motif
Since Khd4 binds AUACCC specifically, we first want to check the transcript region that the AUACCC motif is enriched in high-confidence mRNA targets. Besides using the AGAUCU motif as a control, I also want to use the all hyphae expressed AUACCC-containing mRNAs as a putative targets
Furthermore, I will use the already available AUACCC and AGAUCU motif position in U. maydis genome, as these already include the transcript region locations of the motif.
Show the code
# load DGE data to identify hyphae expressed genes# Use the lookup function to identify geneIDs overlapping with the high confident targets Fi_wtvskd_DEG$high.conf_targets=lookup(Fi_wtvskd_DEG$gene.ID,high.conf$gene.id, high.conf$gene.id )# auaccc motifFi_wtvskd_DEG$auaccc=lookup(Fi_wtvskd_DEG$gene.ID,auaccc.gr$gene.id, auaccc.gr$gene.id )# agaucu motifFi_wtvskd_DEG$agaucu=lookup(Fi_wtvskd_DEG$gene.ID,agaucu.gr$gene.id, agaucu.gr$gene.id )# only expressed genesFi_wtvskd_BM = Fi_wtvskd_DEG[Fi_wtvskd_DEG$baseMean>10,]
---title: "HyperTRIBE_identifies_highly_specific_targets_of_Khd4"date: last-modifiedauthor: - name: "Srimeenakshi Sankaranarayanan"title-block-banner: trueformat: html: theme: flatly self-contained: true code-fold: true code-tools: true code-summary: "Show the code" toc: true number-sections: true anchor-sections: trueeditor: visualexecute: echo: true error: false warning: false message: false---```{r}# librarylibrary(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genomelibrary(ggplot2)library(BindingSiteFinder)library(rtracklayer)library(ComplexHeatmap)library(GenomicFeatures)library(forcats)library(tidyr)library(dplyr)library(tidyverse)library(GenomicRanges)library(magick)library(magrittr)library(gridExtra)library(IRanges)library(Biostrings)library(ggpp)library(gginnards)library(ggrepel)library(ggpubr)library(ggforce)library(ggrastr)library(viridis)library(reshape2)library(gprofiler2)library(ggsci)library(ggh4x)library(ggplotify)library(gridExtra)library(circlize)library(EnrichedHeatmap)library(UpSetR)library(kableExtra)library(cowplot)library(rstatix)library(beeswarm)library(clusterProfiler)library(ggseqlogo)library(tidyHeatmap)library(paletteer)library(ggvenn)library(colorspace)library(ggpointdensity)library(lookup)library(rstatix)library(ggrepel)``````{r}load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")``````{r}# reproducible editing sites in Khd4-Ada-Gfp and control-Adahyper.rep``````{r}# Venn diaramvenn.list =lapply(hyper.rep, function(hyper){return(hyper$id)})names(venn.list) =c("Khd4-Ada-Gfp", "control-Ada")# FigS6A and Bggvenn(venn.list, names(venn.list), fill_color =c("dodgerblue3", ada.col), set_name_size =6) +ggtitle("Overlap between Khd4-Ada-Gfp and control-Ada")+theme(plot.title =element_text(size =20, hjust =0.5, vjust =3))``````{r}# Identifying the RNA sequence motif#################################################################################AUACCC################################################################################# get gene sequencesgene.seqs =getSeq(Umaydis, genes.GR)# find AUACCC motifsmotif.auaccc =DNAString("ATACCC")# find all AUACCC motifs in the genomeauaccc.gr =vmatchPattern(motif.auaccc, Umaydis)# count motifs in genesgenes.GR$AUACCC =countOverlaps(genes.GR, auaccc.gr)#table(genes.GR$AUACCC)# assign motifs to genesov =findOverlaps(auaccc.gr, genes.GR)# check whether motifs overlap with more than one genetable(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesauaccc.gr = auaccc.gr[from(ov)]auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################AGAUCU################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("AGATCT")agaucu.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$AGAUCU =countOverlaps(genes.GR, agaucu.gr)table(genes.GR$AGAUCU)# assign motifs to genesov =findOverlaps(agaucu.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesagaucu.gr = agaucu.gr[from(ov)]agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)```## De novo motif discovery analysis on sequences carrying unique Khd4–Ada–Gfp editing sites.XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Khd4-Ada-Gfp editing sites (501-nt) as an input.First, the Khd4-Ada-Gfp editing sites were extended on each side by 250nt to make a 501-nt window.```{r eval=FALSE}# keep only id columnhyper.spe =lapply(hyper.rep, function (x){mcols(x) = x[,c(1)]names(x) = x$idreturn(x)})# split the reproducible editing sites that are unique to Khd4 and Control , and sharedhyper.specific =list(Khd4.specific = hyper.spe$khd4[!(hyper.spe$khd4$id %in% hyper.spe$ctrl$id)],shared = hyper.spe$khd4[hyper.spe$khd4$id %in% hyper.spe$ctrl$id],ctrl.specific = hyper.spe$ctrl[!(hyper.spe$ctrl$id %in% hyper.spe$khd4$id)])# as a GRanges listhyper.specific =as(hyper.specific, "GRangesList")# extend the window by 250 bp on both sideshyper_250 =lapply(hyper.specific, function(x){ x = x+250# window 250 bp})# get fastafiles# get sequencekhd4.specific = Biostrings::getSeq(Umaydis, hyper_250$Khd4.specific)khd4.ctrl.shared = Biostrings::getSeq(Umaydis, hyper_250$shared)ctrl.specific = Biostrings::getSeq(Umaydis, hyper_250$ctrl.specific)# export the fasta filesoutput ="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"writeXStringSet(khd4.specific, filepath =paste0(output,"khd4.specific_250.fasta"))writeXStringSet(khd4.ctrl.shared, filepath =paste0(output,"khd4.ctrl.shared_250.fasta"))writeXStringSet(ctrl.specific, filepath =paste0(output,"ctrl.specific_250.fasta"))```Scatter plot comparing the relative enrichment ratio of enriched motifs with the percentage of each motif in the tested sequences. ```{r}# making a scatter plot# unique to Khd4-Ada-GfpMotif.frame.khd4 =data.frame(Motif =c("HMHHKAUACCC","HMKAUACCCVH","UGSAGGCGCURSWGS","UHCCUCUYCHCCMUC","CAGCARCARCADCAG","CCUCKAAACSNCGC","CYAAKSMUGCUYCUR","CYCCCW","CUUGAYUUGUWWYCC","YYUUGUAYUUGUA","UYGCUGCUCUC", "ACMUCUCCAAGVYCA","CUCCUGAUCCUGCCU", "AGCGSUCRUUUGASC", "CCUCUKU", "CAACGCUCCAC","CCAGCAACAGCARSM", "GCAGAUGRUAYCCAK", "RASAAGMVSAWCAAG", "AVUGGUGCUGU"),absolute =c(60.9, 59.3, 16.0, 65.8, 13.6, 37.4, 11.1, 37.9, 2.5, 6.2, 18.5, 6.2, 5.3, 3.7, 16.0, 50.6, 29.2, 3.7, 38.3, 11.1),EnR =c(9.40,9.60,8.35,1.46,4.01,1.80,3.30,1.60,37.98,4.82,2.08, 4.34, 5.06, 4.93, 1.81, 1.27, 1.43, 3.88 , 1.10, 1.05),set=rep("khd4",20))pl.khd4 =ggplot(Motif.frame.khd4, aes(x=absolute, y=EnR, color=set))+geom_point(alpha=1, size=2)+scale_color_manual(values = khd4.col)+coord_cartesian(ylim=c(0,10))+labs(subtitle="Khd4-Ada-Gfp",x="sequences with motif (%)",y ="motif enrichment ratio")+theme_paper()+theme(legend.position ="none")pl.khd4```Make the sequence logo of the most enriched two motifs```{r}################################################################################# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts################################################################################# motif logo with u in it# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCCA=c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524)C=c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476)G =c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000)U =c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000)pwDF1bU =rbind(A,C,G,U)# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVHA=c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404)C=c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852)G =c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894)U =c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849)pwDF2bU =rbind(A,C,G,U)# make ggseqlogom1 =ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")m2 =ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")pl =ggarrange(m1,m2,labels = LETTERS[1:2],nrow =1)pl```Next I want to select mRNA targets that are specific to Khd4-Ada-Gfp and control-Ada data```{r}# Khd4.speKhd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]# mRNAs specific to controlctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]# make a list with mRNAs specific to each data set# listhyper.spe =list(Khd4_Ada_Gfp = Khd4.spe,control_Ada = ctrl.spe.tx) %>%as(., "GRangesList")```## Distribution of editing sites from the nearest AUACCC motifOnly editing sites on mRNAs specific to each dataset were considered```{r}pf =lapply(names(hyper.spe), function(i){#i = "khd4" hyper = hyper.spe[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.auacc =distanceToNearest(hyper, auaccc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]table(sel) dists.auacc=subset(dists.auacc, sel)mcols(dists.auacc)$motif ="AUACCC"# calculate distance to nearest AGAUCU motif dists.agaucu =distanceToNearest(hyper, agaucu.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]table(sel) dists.agaucu=subset(dists.agaucu, sel)mcols(dists.agaucu)$motif ="AGAUCU"# get stats of distancessummary(mcols(dists.auacc)$distance)summary(mcols(dists.agaucu)$distance) pf =rbind(mcols(dists.auacc), mcols(dists.agaucu)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)# FigS7B# Histogrampl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram( binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey60")) +labs(x="distance of editing sites to the nearest motif (nt)",y="motif density")+theme_bw() +facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+myTheme1pl4``````{r}# median distance between editing sites and the motif#Khd4-Ada-GfpKhd4_auaccc=filter(pf, set =="khd4", motif =="AUACCC") med_K_auaccc =median(Khd4_auaccc$distance)#Control-AdaCtrl_auaccc=filter(pf, set =="ctrl", motif =="AUACCC") med_C_auaccc =median(Ctrl_auaccc$distance)data.frame(Khd4_Ada_Gfp = med_K_auaccc,control_Ada = med_C_auaccc) %>%kbl(caption ="median distance between auaccc and editing events",html_font ="Cambria", bold = T, color="black") %>%kable_styling(bootstrap_options ="striped", "hover", full_width = F, html_font ="Cambria") %>%column_spec(1, bold = T) ```## Studying the enrichment of motifs in mRNAs specific to each datasetsHere, we consider only the editing events on Khd4-bound mRNAs and on mRNAs specific to control-Ada. Khd4-bound mRNAs- All mRNAs that carry reproducible Khd4-Ada-Gfp editing sites with atleast one AUACCC motif.Control-Ada mRNAs- mRNAs that have reproducible control-Ada editing sites but no Khd4-Ada-Gfp sites ```{r}# Khd4-bound mRNAsKhd4_boundRNA= hyper.rep$khd4[(hyper.rep$khd4$gene.id %in% auaccc.gr$gene.id)]# mRNAs specific to controlctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]#check if the numbers are similar to the venn diagram#n_distinct(khd4.spe.tx$gene.id)# make a list with mRNAs specific to each data setKhd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]# listhyper.rep.txspe =list(khd4 = Khd4_boundRNA,ctrl = ctrl.spe.tx) %>%as(., "GRangesList")```# Analyis of the high-confidence targets```{r}# only Khd4khd4.rep.edit=hyper.rep$khd4motifs=genes.GRkhd4.rep.edit$AUACCC=lookup(khd4.rep.edit$gene.id, motifs$gene_id,motifs$AUACCC)#granges of high confidence Khd4high.conf.khd4=subset(khd4.rep.edit,AUACCC>0)high.conf = high.conf.khd4```## Transcript region enrichment of motifSince Khd4 binds AUACCC specifically, we first want to check the transcript region that the AUACCC motif is enriched in high-confidence mRNA targets. Besides using the AGAUCU motif as a control, I also want to use the all hyphae expressed AUACCC-containing mRNAs as a putative targets Furthermore, I will use the already available AUACCC and AGAUCU motif position in *U. maydis* genome, as these already include the transcript region locations of the motif. ```{r}# load DGE data to identify hyphae expressed genes# Use the lookup function to identify geneIDs overlapping with the high confident targets Fi_wtvskd_DEG$high.conf_targets=lookup(Fi_wtvskd_DEG$gene.ID,high.conf$gene.id, high.conf$gene.id )# auaccc motifFi_wtvskd_DEG$auaccc=lookup(Fi_wtvskd_DEG$gene.ID,auaccc.gr$gene.id, auaccc.gr$gene.id )# agaucu motifFi_wtvskd_DEG$agaucu=lookup(Fi_wtvskd_DEG$gene.ID,agaucu.gr$gene.id, agaucu.gr$gene.id )# only expressed genesFi_wtvskd_BM = Fi_wtvskd_DEG[Fi_wtvskd_DEG$baseMean>10,]``````{r}auaccc_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$auaccc),] agaucu_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$agaucu),] ``````{r}# AUACCCauaccc.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AUACCCmotif.csv")auaccc.mot$high.conf.targets = lookup::lookup(auaccc.mot$Gene_ID,high.conf$gene.id,high.conf$gene.id)auaccc.mot$put.targets = lookup::lookup(auaccc.mot$Gene_ID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)# AGAUCUagaucu.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AGAUCUmotif.csv")agaucu.mot$high.conf.targets = lookup::lookup(agaucu.mot$GeneID,high.conf$gene.id,high.conf$gene.id)agaucu.mot$put.targets = lookup::lookup(agaucu.mot$GeneID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)``````{r}# highconfident targets (auaccc motif)df_high_conf_auaccc = auaccc.mot[!is.na(auaccc.mot$high.conf.targets),] %>% .[,"Position"] %>%table(.) %>% as.data.frame %>%set_colnames(.,c("variable", "Freq")) %>%mutate(motif="AUACCC", set ="Khd4_bound_mRNA")# highconfident targets (agaucu motif)df_high_conf_agaucuc = agaucu.mot[!is.na(agaucu.mot$high.conf.targets),] %>% .[,"Position"] %>%table(.) %>% as.data.frame %>%set_colnames(.,c("variable", "Freq")) %>%mutate(motif="AGAUCU", set ="Khd4_bound_mRNA")# putative targets (auaccc motif)df_put_auaccc = auaccc.mot[!is.na(auaccc.mot$put.targets),] %>% .[,"Position"] %>%table(.) %>% as.data.frame %>%set_colnames(.,c("variable", "Freq")) %>%mutate(motif="AUACCC", set ="putative_targets")# putative targets (agaucu motif)df_put_agaucuc = agaucu.mot[!is.na(agaucu.mot$put.targets),] %>% .[,"Position"] %>%table(.) %>% as.data.frame %>%set_colnames(.,c("variable", "Freq")) %>%mutate(motif="AGAUCU", set ="putative_targets")df =rbind(df_high_conf_auaccc,df_high_conf_agaucuc,df_put_auaccc,df_put_agaucuc)ggplot(df, aes(x=factor(set, levels=c("Khd4_bound_mRNA", "putative_targets")), y = Freq, fill =factor(variable, levels=c("5UTR", "ORF", "3UTR"))))+geom_bar(position =position_fill(), stat ="identity")+scale_fill_brewer(palette ="Greens")+facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+labs(x="",y ="% with motif",fill="transcript region")+ myTheme1``````{r}sessionInfo()```