Identifying the positions of RNA sequence to be tested across the genome.
Show the code
#################################################################################AUACCC################################################################################# get gene sequencesgene.seqs =getSeq(Umaydis, genes.GR)# find AUACCC motifsmotif.auaccc =DNAString("ATACCC")# find all AUACCC motifs in the genomeauaccc.gr =vmatchPattern(motif.auaccc, Umaydis)# count motifs in genesgenes.GR$AUACCC =countOverlaps(genes.GR, auaccc.gr)#table(genes.GR$AUACCC)# assign motifs to genesov =findOverlaps(auaccc.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesauaccc.gr = auaccc.gr[from(ov)]auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################AGAUCU################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("AGATCT")agaucu.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$AGAUCU =countOverlaps(genes.GR, agaucu.gr)#table(genes.GR$AGAUCU)# assign motifs to genesov =findOverlaps(agaucu.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesagaucu.gr = agaucu.gr[from(ov)]agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################GGGTAT################################################################################# find GGGTAT motifsmotif.ctrl =DNAString("GGGTAT")gggtat.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$GGGTAT =countOverlaps(genes.GR, gggtat.gr)#table(genes.GR$GGGTAT)# assign motifs to genesov =findOverlaps(gggtat.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesgggtat.gr = gggtat.gr[from(ov)]gggtat.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(gggtat.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################ACACUC################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("ACACTC")acacuc.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$ACACUC =countOverlaps(genes.GR, acacuc.gr)#table(genes.GR$ACACUC)# assign motifs to genesov =findOverlaps(acacuc.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesacacuc.gr = acacuc.gr[from(ov)]acacuc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(acacuc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
Fischer’s exact test was performed to calculate the enrichment of motif between datasets from the previous section. Note that I made the dataframe by manually entering the number from tab.motif variable from above.
Show the code
#Pvalue calculation# isolating the motif number in each set that is with or without motif#################################################################################AUACCC motif################################################################################khd4.uni=c(157,36)shared=c(60,42)ctrl.uni=c(226,682)all=c(1675,5256)enrmotif=data.frame(khd4.uni,shared,ctrl.uni,all) # make dataframerow.names(enrmotif)=c("auaccc", "nonauaccc") # assign rownames# fischers testchisq.khd4Mot=enrmotif[,c(1,4)]chisq.sharedMot=enrmotif[,c(2,4)]chisq.ctrlMot=enrmotif[,c(3,4)]#fischers testfisher.test(chisq.khd4Mot) # khd4unique vs all
Fisher's Exact Test for Count Data
data: chisq.khd4Mot
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
9.427969 20.322387
sample estimates:
odds ratio
13.67828
Show the code
fisher.test(chisq.sharedMot) # shared vs all
Fisher's Exact Test for Count Data
data: chisq.sharedMot
p-value = 1.529e-13
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
2.958915 6.842948
sample estimates:
odds ratio
4.481005
Show the code
fisher.test(chisq.ctrlMot) # conrol vs all
Fisher's Exact Test for Count Data
data: chisq.ctrlMot
p-value = 0.6506
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.8818158 1.2228807
sample estimates:
odds ratio
1.039832
Show the code
#################################################################################AGAUCU################################################################################mkhd4.uni=c(102,91)mshared=c(50,52)mctrl.uni=c(438,470)mall=c(2931,4000)menrmotif=data.frame(mkhd4.uni,mshared,mctrl.uni,mall) # make dataframerow.names(menrmotif)=c("agaucu", "nonagaucu") # assign rownames# fischers testchisq.mkhd4Mot=menrmotif[,c(1,4)]chisq.msharedMot=menrmotif[,c(2,4)]chisq.mctrlMot=menrmotif[,c(3,4)]#fischers testfisher.test(chisq.mkhd4Mot) # khd4unique vs all
Fisher's Exact Test for Count Data
data: chisq.mkhd4Mot
p-value = 0.003924
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.136806 2.060719
sample estimates:
odds ratio
1.529565
Show the code
fisher.test(chisq.msharedMot) # shared vs all
Fisher's Exact Test for Count Data
data: chisq.msharedMot
p-value = 0.1896
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.869246 1.979156
sample estimates:
odds ratio
1.312167
Show the code
fisher.test(chisq.mctrlMot) # conrol vs all
Fisher's Exact Test for Count Data
data: chisq.mctrlMot
p-value = 0.0007047
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.104319 1.464441
sample estimates:
odds ratio
1.271753
Show the code
#################################################################################GGGUAU motif################################################################################m2khd4.uni=c(47,146)m2shared=c(31,71)m2ctrl.uni=c(232,676)m2all=c(1496,5435)m2enrmotif=data.frame(m2khd4.uni,m2shared,m2ctrl.uni,m2all) # make dataframerow.names(m2enrmotif)=c("cccaua", "nonagaucu") # assign rownames# fischers testchisq.m2khd4Mot=m2enrmotif[,c(1,4)]chisq.m2sharedMot=m2enrmotif[,c(2,4)]chisq.m2ctrlMot=m2enrmotif[,c(3,4)]chisq.m2ctrlMot_unimot=m2enrmotif[,c(1,3)]#fischers testfisher.test(chisq.m2khd4Mot) # khd4unique vs all
Fisher's Exact Test for Count Data
data: chisq.m2khd4Mot
p-value = 0.3754
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.8191105 1.6439441
sample estimates:
odds ratio
1.169544
Show the code
fisher.test(chisq.m2sharedMot) # shared vs all
Fisher's Exact Test for Count Data
data: chisq.m2sharedMot
p-value = 0.03911
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.001058 2.461645
sample estimates:
odds ratio
1.58613
Show the code
fisher.test(chisq.m2ctrlMot) # conrol vs all
Fisher's Exact Test for Count Data
data: chisq.m2ctrlMot
p-value = 0.007319
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.057883 1.465752
sample estimates:
odds ratio
1.246811
Show the code
fisher.test(chisq.m2ctrlMot_unimot) # khd4unique vs ctrl
Fisher's Exact Test for Count Data
data: chisq.m2ctrlMot_unimot
p-value = 0.7848
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.6387363 1.3598368
sample estimates:
odds ratio
0.9380518
Show the code
#################################################################################ACACUC motif################################################################################m3khd4.uni=c(115,78)m3shared=c(49,53)m3ctrl.uni=c(521,387)m3all=c(3482,3449)m3enrmotif=data.frame(m3khd4.uni,m3shared,m3ctrl.uni,m3all) # make dataframerow.names(m3enrmotif)=c("cccaua", "nonagaucu") # assign rownames# fischers testchisq.m3khd4Mot=m3enrmotif[,c(1,4)]chisq.m3sharedMot=m3enrmotif[,c(2,4)]chisq.m3ctrlMot=m3enrmotif[,c(3,4)]chisq.m3ctrlMot_unimot=m3enrmotif[,c(1,3)]#fischers testfisher.test(chisq.m3khd4Mot) # khd4unique vs all
Fisher's Exact Test for Count Data
data: chisq.m3khd4Mot
p-value = 0.01063
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.081501 1.980467
sample estimates:
odds ratio
1.460302
Show the code
fisher.test(chisq.m3sharedMot) # shared vs all
Fisher's Exact Test for Count Data
data: chisq.m3sharedMot
p-value = 0.6906
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.6062636 1.3809297
sample estimates:
odds ratio
0.9157924
Show the code
fisher.test(chisq.m3ctrlMot) # conrol vs all
Fisher's Exact Test for Count Data
data: chisq.m3ctrlMot
p-value = 5.612e-05
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.157002 1.537709
sample estimates:
odds ratio
1.333439
Show the code
fisher.test(chisq.m3ctrlMot_unimot) # khd4unique vs ctrl
Fisher's Exact Test for Count Data
data: chisq.m3ctrlMot_unimot
p-value = 0.6303
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.7894775 1.5244677
sample estimates:
odds ratio
1.095037
2 Distance from editing sites to motifs
Only editing sites on mRNAs specific to each dataset were considered
Show the code
names(hyper.rep) =c("Khd4_Ada_Gfp", "control_Ada")pf =lapply(names(hyper.rep), function(i){#i = "khd4" hyper = hyper.rep[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.auacc =distanceToNearest(hyper, auaccc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]table(sel) dists.auacc=subset(dists.auacc, sel)mcols(dists.auacc)$motif ="AUACCC"# calculate distance to nearest AGAUCU motif dists.agaucu =distanceToNearest(hyper, agaucu.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]table(sel) dists.agaucu=subset(dists.agaucu, sel)mcols(dists.agaucu)$motif ="AGAUCU"# calculate distance to nearest CCCAUA motif dists.gggtat =distanceToNearest(hyper, gggtat.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.gggtat)] == gggtat.gr$gene.id[to(dists.gggtat)]table(sel) dists.gggtat =subset(dists.gggtat, sel)mcols(dists.gggtat)$motif ="GGGTAT"# calculate distance to nearest ACACUC motif dists.acacuc =distanceToNearest(hyper, acacuc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.acacuc)] == acacuc.gr$gene.id[to(dists.acacuc)]table(sel) dists.acacuc =subset(dists.acacuc, sel)mcols(dists.acacuc)$motif ="ACACUC"# get stats of distancessummary(mcols(dists.auacc)$distance)summary(mcols(dists.agaucu)$distance)summary(mcols(dists.gggtat)$distance)summary(mcols(dists.acacuc)$distance) pf =rbind(mcols(dists.auacc), mcols(dists.agaucu), mcols(dists.gggtat), mcols(dists.acacuc)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)# Histogrampl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey")) +theme_bw() +facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU", "GGGTAT", "ACACUC")))+myTheme1pl4
Show the code
################################################################################# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts################################################################################# motif logo with u in it# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCCA=c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524)C=c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476)G =c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000)U =c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000)pwDF1bU =rbind(A,C,G,U)# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVHA=c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404)C=c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852)G =c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894)U =c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849)pwDF2bU =rbind(A,C,G,U)# MoUif-4 UP% = 65.8 ENR-RaUio = 1.46 UHCCUCUYCHCCMUC A=c(0.197917,0.229167,0.000000,0.000000,0.072917,0.000000,0.052083,0.000000,0.020833,0.416667,0.000000,0.145833,0.458333,0.000000,0.000000)C=c( 0.208333,0.427083,0.770833,0.906250,0.270833,0.739583,0.145833,0.333333,0.979167,0.260417,0.677083,0.697917,0.354167,0.312500,0.979167)G =c(0.125000,0.000000,0.062500,0.000000,0.093750,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.156250,0.114583,0.000000,0.020833)U =c(0.468750,0.343750,0.166667,0.093750,0.562500,0.260417,0.739583,0.666667,0.000000,0.322917,0.322917,0.000000,0.072917,0.687500,0.000000)pwDF4bU =rbind(A,C,G,U)m1 =ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")m2 =ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")m4 =ggseqlogo(pwDF4bU, col_scheme="nucleotide", font="helvetica_bold")pl =ggarrange(m1,m2,m4,labels = LETTERS[1:3],nrow =1)pl =annotate_figure(pl, top =text_grob("de novo discovered motif on Khd4-Ada-Gfp edited transcripts", color ="red", face ="bold", size =14))pl
3 Using motif position weight matrix to look for their occurence in the genome
As a next step, we will check the distribution of the identified motif. For this purpose, i will use matchPWM() function for motif occurence. First I will use AUACCC to see if the function can indeed work!
Show the code
# function to find motif occurence# define a function for identifying the motif occurencegetmotgr=function(seq, pwm, gr_plus, gr_minus){# motif in sense strandmatches_list_strand =lapply(seq, matchPWM, pwm=pwm)# apply to all test =lapply(names(matches_list_strand), function (i){ uc = matches_list_strand[[i]] df=as.data.frame(uc) df$chr = i # make a column specifying chromosome name df =cbind.DataFrame(df)return(df)})# rbind the resulting dfdf1 =do.call(rbind, test) %>%as(., "GRanges")# find overlapsov =findOverlaps(df1, gr_plus, ignore.strand=FALSE)df1 = df1[from(ov)]df1$gene.id = gr_plus$gene_id[to(ov)]strand(df1) =strand(gr_plus)[to(ov)]# motif in antisense strandmatches_list_antistrand =lapply(seq, matchPWM, pwm =reverseComplement(pwm))# apply to all test1 =lapply(names(matches_list_antistrand), function (i){ uc1 = matches_list_antistrand[[i]] df1=as.data.frame(uc1) df1$chr = i df1 =cbind.DataFrame(df1)return(df1)})# rbind the resulting dfdf2 =do.call(rbind, test1) %>%as(., "GRanges")# find overlapsov2 =findOverlaps(df2, gr_minus, ignore.strand=FALSE)df2= df2[from(ov2)]df2$gene.id = gr_minus$gene_id[to(ov2)]strand(df2) =strand(gr_minus)[to(ov2)]df3 =c(df1,df2) %>%as(., "GRanges")return(df3)}
Show the code
# get sequneces of chromosomeuma.seq =getSeq(Umaydis) # isolate + and - strand separtely# + strand# strand specific data of GTF file# plus strandsgns_plus = genes.GR[strand(genes.GR) =="+"]#minus strandgns_minus = genes.GR[strand(genes.GR) =="-"]
Show the code
# convert RNA string to DNA stringrownames(pwDF1bU) =c("A", "C", "G", "T")rownames(pwDF4bU) =c("A", "C", "G", "T")
Show the code
# remove the um_scaf_contigs because they causes problem downstreamuma.seq = uma.seq[-c(24,25,26,27)]#HMHHKAUACCC.gr HMHHKAUACCC.gr =getmotgr(uma.seq, pwm=pwDF1bU, gns_plus, gns_minus)#UHCCUCUYCHCCMUC.grUHCCUCUYCHCCMUC.gr =getmotgr(uma.seq, pwm=pwDF4bU, gns_plus, gns_minus)
Here I am going to find the distribution of the most enriched HMHHKAUACCC and frequently occurring UHCCUCUYCHCCMUC motifs from Khd4-Ada-Gfp editing sites.
Show the code
pf =lapply(names(hyper.rep), function(i){#i = "khd4" hyper = hyper.rep[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.HMHHKAUACCC =distanceToNearest(hyper, HMHHKAUACCC.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.HMHHKAUACCC)] == HMHHKAUACCC.gr$gene.id[to(dists.HMHHKAUACCC)]table(sel) dists.HMHHKAUACCC=subset(dists.HMHHKAUACCC, sel)mcols(dists.HMHHKAUACCC)$motif ="HMHHKAUACCC"# calculate distance to nearest UHCCUCUYCHCCMUC.gr motif dists.UHCCUCUYCHCCMUC =distanceToNearest(hyper, UHCCUCUYCHCCMUC.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.UHCCUCUYCHCCMUC)] == UHCCUCUYCHCCMUC.gr$gene.id[to(dists.UHCCUCUYCHCCMUC)]table(sel) dists.agcagcagc =subset(dists.UHCCUCUYCHCCMUC, sel)mcols(dists.UHCCUCUYCHCCMUC)$motif ="UHCCUCUYCHCCMUC"# get stats of distancessummary(mcols(dists.HMHHKAUACCC)$distance)summary(mcols(dists.UHCCUCUYCHCCMUC)$distance) pf =rbind(mcols(dists.HMHHKAUACCC),mcols(dists.UHCCUCUYCHCCMUC)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)pl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500), ylim=c(0,0.004)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey")) +labs(x="Distance to the neares motif (nt)",y ="motif density")+theme_bw() +facet_wrap(~motif)+myTheme1ggarrange(pl4, labels = LETTERS[1],ncol =1, nrow =1)
---title: "Khd4-Ada-Gfp editing sites are proximal to AUACCC motif"date: last-modifiedauthor: - name: "Srimeenakshi Sankaranarayanan"title-block-banner: trueformat: html: theme: flatly self-contained: true code-fold: true code-tools: true code-summary: "Show the code" toc: true number-sections: true anchor-sections: trueeditor: visualexecute: echo: true error: false warning: false message: false---```{r}# librarylibrary(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genomelibrary(ggplot2)library(BindingSiteFinder)library(rtracklayer)library(ComplexHeatmap)library(GenomicFeatures)library(forcats)library(tidyr)library(dplyr)library(tidyverse)library(GenomicRanges)library(magick)library(magrittr)library(gridExtra)library(IRanges)library(Biostrings)library(ggpp)library(gginnards)library(ggrepel)library(ggpubr)library(ggforce)library(ggrastr)library(viridis)library(reshape2)library(gprofiler2)library(ggsci)library(ggh4x)library(ggplotify)library(gridExtra)library(circlize)library(EnrichedHeatmap)library(UpSetR)library(kableExtra)library(cowplot)library(rstatix)library(beeswarm)library(clusterProfiler)library(ggseqlogo)library(tidyHeatmap)library(paletteer)library(ggvenn)library(colorspace)library(ggpointdensity)library(lookup)library(rstatix)library(ggrepel)``````{r}load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")```Identifying the positions of RNA sequence to be tested across the genome.```{r}#################################################################################AUACCC################################################################################# get gene sequencesgene.seqs =getSeq(Umaydis, genes.GR)# find AUACCC motifsmotif.auaccc =DNAString("ATACCC")# find all AUACCC motifs in the genomeauaccc.gr =vmatchPattern(motif.auaccc, Umaydis)# count motifs in genesgenes.GR$AUACCC =countOverlaps(genes.GR, auaccc.gr)#table(genes.GR$AUACCC)# assign motifs to genesov =findOverlaps(auaccc.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesauaccc.gr = auaccc.gr[from(ov)]auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################AGAUCU################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("AGATCT")agaucu.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$AGAUCU =countOverlaps(genes.GR, agaucu.gr)#table(genes.GR$AGAUCU)# assign motifs to genesov =findOverlaps(agaucu.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesagaucu.gr = agaucu.gr[from(ov)]agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################GGGTAT################################################################################# find GGGTAT motifsmotif.ctrl =DNAString("GGGTAT")gggtat.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$GGGTAT =countOverlaps(genes.GR, gggtat.gr)#table(genes.GR$GGGTAT)# assign motifs to genesov =findOverlaps(gggtat.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesgggtat.gr = gggtat.gr[from(ov)]gggtat.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(gggtat.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)#################################################################################ACACUC################################################################################# find AGAUCU motifsmotif.ctrl =DNAString("ACACTC")acacuc.gr =vmatchPattern(motif.ctrl, Umaydis)genes.GR$ACACUC =countOverlaps(genes.GR, acacuc.gr)#table(genes.GR$ACACUC)# assign motifs to genesov =findOverlaps(acacuc.gr, genes.GR)# check whether motifs overlap with more than one gene#table(duplicated(from(ov)))# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genesacacuc.gr = acacuc.gr[from(ov)]acacuc.gr$gene.id = genes.GR$gene_id[to(ov)]# count motifs in transcript regions#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)#findOverlaps(acacuc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)```## Motif enrichment```{r}# check that numbers are same as in Venn#sapply(khd4.vs.ctrl, length)names(khd4.vs.ctrl) =c("Khd4-Ada-Gfp-unique", "overlap", "control-Ada unique", "all mRNAs")# table of AUACCCtab.auaccc =sapply(khd4.vs.ctrl, function(gene.set){ motifs = genes.GR$AUACCC[genes.GR$gene_id %in% gene.set] tab =table(motifs >0)return(tab)}) %>%melt(.)names(tab.auaccc) =c("with.motif", "set", "no.genes")# table of AGAUCUtab.agaucu =sapply(khd4.vs.ctrl, function(gene.set){ motifs = genes.GR$AGAUCU[genes.GR$gene_id %in% gene.set] tab =table(motifs >0)return(tab)}) %>%melt(.)names(tab.agaucu) =c("with.motif", "set", "no.genes")# table of GGGTATtab.gggtat =sapply(khd4.vs.ctrl, function(gene.set){ motifs = genes.GR$GGGTAT[genes.GR$gene_id %in% gene.set] tab =table(motifs >0)return(tab)}) %>%melt(.)names(tab.gggtat) =c("with.motif", "set", "no.genes")# table of CCCAUAtab.acacuc =sapply(khd4.vs.ctrl, function(gene.set){ motifs = genes.GR$ACACUC[genes.GR$gene_id %in% gene.set] tab =table(motifs >0)return(tab)}) %>%melt(.)names(tab.acacuc) =c("with.motif", "set", "no.genes")# stacked barchart (scaled to 100%)pl3 =ggplot(tab.auaccc, aes(x=set, y=no.genes, fill=with.motif)) +geom_bar(position="fill", stat="identity") +scale_fill_manual(values=c('TRUE'= auaccc.col, 'FALSE'="lightgrey")) +ggtitle("AUACCC")+labs(x="",y="% with motif")+ myTheme1+theme(axis.text.x =element_text(angle =45, hjust=1, vjust=1))# stacked barchart (scaled to 100%)pl4 =ggplot(tab.agaucu, aes(x=set, y=no.genes, fill=with.motif)) +geom_bar(position="fill", stat="identity") +scale_fill_manual(values=c('TRUE'= agaucu.col, 'FALSE'="lightgrey")) +ggtitle("AGAUCU")+labs(x="",y="% with motif")+ myTheme1+theme(axis.text.x =element_text(angle =45, hjust=1, vjust=1))# stacked barchart (scaled to 100%)pl5 =ggplot(tab.gggtat, aes(x=set, y=no.genes, fill=with.motif)) +geom_bar(position="fill", stat="identity") +scale_fill_manual(values=c('TRUE'= agaucu.col, 'FALSE'="lightgrey")) +ggtitle("GGGTAT")+labs(x="",y="% with motif")+ myTheme1+theme(axis.text.x =element_text(angle =45, hjust=1, vjust=1))# stacked barchart (scaled to 100%)pl6 =ggplot(tab.acacuc, aes(x=set, y=no.genes, fill=with.motif)) +geom_bar(position="fill", stat="identity") +scale_fill_manual(values=c('TRUE'= agaucu.col, 'FALSE'="lightgrey")) +ggtitle("ACAUCU")+labs(x="",y="% with motif")+ myTheme1+theme(axis.text.x =element_text(angle =45, hjust=1, vjust=1))# FigS7Apl<-ggarrange(pl3, pl4, pl5, pl6, labels = LETTERS[1:4],nrow =1, common.legend =TRUE)pl```Fischer's exact test was performed to calculate the enrichment of motif between datasets from the previous section. Note that I made the dataframe by manually entering the number from tab.motif variable from above.```{r}#Pvalue calculation# isolating the motif number in each set that is with or without motif#################################################################################AUACCC motif################################################################################khd4.uni=c(157,36)shared=c(60,42)ctrl.uni=c(226,682)all=c(1675,5256)enrmotif=data.frame(khd4.uni,shared,ctrl.uni,all) # make dataframerow.names(enrmotif)=c("auaccc", "nonauaccc") # assign rownames# fischers testchisq.khd4Mot=enrmotif[,c(1,4)]chisq.sharedMot=enrmotif[,c(2,4)]chisq.ctrlMot=enrmotif[,c(3,4)]#fischers testfisher.test(chisq.khd4Mot) # khd4unique vs allfisher.test(chisq.sharedMot) # shared vs allfisher.test(chisq.ctrlMot) # conrol vs all#################################################################################AGAUCU################################################################################mkhd4.uni=c(102,91)mshared=c(50,52)mctrl.uni=c(438,470)mall=c(2931,4000)menrmotif=data.frame(mkhd4.uni,mshared,mctrl.uni,mall) # make dataframerow.names(menrmotif)=c("agaucu", "nonagaucu") # assign rownames# fischers testchisq.mkhd4Mot=menrmotif[,c(1,4)]chisq.msharedMot=menrmotif[,c(2,4)]chisq.mctrlMot=menrmotif[,c(3,4)]#fischers testfisher.test(chisq.mkhd4Mot) # khd4unique vs allfisher.test(chisq.msharedMot) # shared vs allfisher.test(chisq.mctrlMot) # conrol vs all#################################################################################GGGUAU motif################################################################################m2khd4.uni=c(47,146)m2shared=c(31,71)m2ctrl.uni=c(232,676)m2all=c(1496,5435)m2enrmotif=data.frame(m2khd4.uni,m2shared,m2ctrl.uni,m2all) # make dataframerow.names(m2enrmotif)=c("cccaua", "nonagaucu") # assign rownames# fischers testchisq.m2khd4Mot=m2enrmotif[,c(1,4)]chisq.m2sharedMot=m2enrmotif[,c(2,4)]chisq.m2ctrlMot=m2enrmotif[,c(3,4)]chisq.m2ctrlMot_unimot=m2enrmotif[,c(1,3)]#fischers testfisher.test(chisq.m2khd4Mot) # khd4unique vs allfisher.test(chisq.m2sharedMot) # shared vs allfisher.test(chisq.m2ctrlMot) # conrol vs allfisher.test(chisq.m2ctrlMot_unimot) # khd4unique vs ctrl#################################################################################ACACUC motif################################################################################m3khd4.uni=c(115,78)m3shared=c(49,53)m3ctrl.uni=c(521,387)m3all=c(3482,3449)m3enrmotif=data.frame(m3khd4.uni,m3shared,m3ctrl.uni,m3all) # make dataframerow.names(m3enrmotif)=c("cccaua", "nonagaucu") # assign rownames# fischers testchisq.m3khd4Mot=m3enrmotif[,c(1,4)]chisq.m3sharedMot=m3enrmotif[,c(2,4)]chisq.m3ctrlMot=m3enrmotif[,c(3,4)]chisq.m3ctrlMot_unimot=m3enrmotif[,c(1,3)]#fischers testfisher.test(chisq.m3khd4Mot) # khd4unique vs allfisher.test(chisq.m3sharedMot) # shared vs allfisher.test(chisq.m3ctrlMot) # conrol vs allfisher.test(chisq.m3ctrlMot_unimot) # khd4unique vs ctrl```## Distance from editing sites to motifsOnly editing sites on mRNAs specific to each dataset were considered```{r}names(hyper.rep) =c("Khd4_Ada_Gfp", "control_Ada")pf =lapply(names(hyper.rep), function(i){#i = "khd4" hyper = hyper.rep[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.auacc =distanceToNearest(hyper, auaccc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]table(sel) dists.auacc=subset(dists.auacc, sel)mcols(dists.auacc)$motif ="AUACCC"# calculate distance to nearest AGAUCU motif dists.agaucu =distanceToNearest(hyper, agaucu.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]table(sel) dists.agaucu=subset(dists.agaucu, sel)mcols(dists.agaucu)$motif ="AGAUCU"# calculate distance to nearest CCCAUA motif dists.gggtat =distanceToNearest(hyper, gggtat.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.gggtat)] == gggtat.gr$gene.id[to(dists.gggtat)]table(sel) dists.gggtat =subset(dists.gggtat, sel)mcols(dists.gggtat)$motif ="GGGTAT"# calculate distance to nearest ACACUC motif dists.acacuc =distanceToNearest(hyper, acacuc.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.acacuc)] == acacuc.gr$gene.id[to(dists.acacuc)]table(sel) dists.acacuc =subset(dists.acacuc, sel)mcols(dists.acacuc)$motif ="ACACUC"# get stats of distancessummary(mcols(dists.auacc)$distance)summary(mcols(dists.agaucu)$distance)summary(mcols(dists.gggtat)$distance)summary(mcols(dists.acacuc)$distance) pf =rbind(mcols(dists.auacc), mcols(dists.agaucu), mcols(dists.gggtat), mcols(dists.acacuc)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)# Histogrampl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey")) +theme_bw() +facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU", "GGGTAT", "ACACUC")))+myTheme1pl4``````{r}################################################################################# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts################################################################################# motif logo with u in it# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCCA=c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524)C=c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476)G =c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000)U =c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000)pwDF1bU =rbind(A,C,G,U)# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVHA=c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404)C=c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852)G =c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894)U =c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849)pwDF2bU =rbind(A,C,G,U)# MoUif-4 UP% = 65.8 ENR-RaUio = 1.46 UHCCUCUYCHCCMUC A=c(0.197917,0.229167,0.000000,0.000000,0.072917,0.000000,0.052083,0.000000,0.020833,0.416667,0.000000,0.145833,0.458333,0.000000,0.000000)C=c( 0.208333,0.427083,0.770833,0.906250,0.270833,0.739583,0.145833,0.333333,0.979167,0.260417,0.677083,0.697917,0.354167,0.312500,0.979167)G =c(0.125000,0.000000,0.062500,0.000000,0.093750,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.156250,0.114583,0.000000,0.020833)U =c(0.468750,0.343750,0.166667,0.093750,0.562500,0.260417,0.739583,0.666667,0.000000,0.322917,0.322917,0.000000,0.072917,0.687500,0.000000)pwDF4bU =rbind(A,C,G,U)m1 =ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")m2 =ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")m4 =ggseqlogo(pwDF4bU, col_scheme="nucleotide", font="helvetica_bold")pl =ggarrange(m1,m2,m4,labels = LETTERS[1:3],nrow =1)pl =annotate_figure(pl, top =text_grob("de novo discovered motif on Khd4-Ada-Gfp edited transcripts", color ="red", face ="bold", size =14))pl```## Using motif position weight matrix to look for their occurence in the genomeAs a next step, we will check the distribution of the identified motif. For this purpose, i will use *matchPWM()* function for motif occurence. First I will use AUACCC to see if the function can indeed work!```{r}# function to find motif occurence# define a function for identifying the motif occurencegetmotgr=function(seq, pwm, gr_plus, gr_minus){# motif in sense strandmatches_list_strand =lapply(seq, matchPWM, pwm=pwm)# apply to all test =lapply(names(matches_list_strand), function (i){ uc = matches_list_strand[[i]] df=as.data.frame(uc) df$chr = i # make a column specifying chromosome name df =cbind.DataFrame(df)return(df)})# rbind the resulting dfdf1 =do.call(rbind, test) %>%as(., "GRanges")# find overlapsov =findOverlaps(df1, gr_plus, ignore.strand=FALSE)df1 = df1[from(ov)]df1$gene.id = gr_plus$gene_id[to(ov)]strand(df1) =strand(gr_plus)[to(ov)]# motif in antisense strandmatches_list_antistrand =lapply(seq, matchPWM, pwm =reverseComplement(pwm))# apply to all test1 =lapply(names(matches_list_antistrand), function (i){ uc1 = matches_list_antistrand[[i]] df1=as.data.frame(uc1) df1$chr = i df1 =cbind.DataFrame(df1)return(df1)})# rbind the resulting dfdf2 =do.call(rbind, test1) %>%as(., "GRanges")# find overlapsov2 =findOverlaps(df2, gr_minus, ignore.strand=FALSE)df2= df2[from(ov2)]df2$gene.id = gr_minus$gene_id[to(ov2)]strand(df2) =strand(gr_minus)[to(ov2)]df3 =c(df1,df2) %>%as(., "GRanges")return(df3)}``````{r, warning=FALSE}# get sequneces of chromosomeuma.seq =getSeq(Umaydis) # isolate + and - strand separtely# + strand# strand specific data of GTF file# plus strandsgns_plus = genes.GR[strand(genes.GR) =="+"]#minus strandgns_minus = genes.GR[strand(genes.GR) =="-"]``````{r}# convert RNA string to DNA stringrownames(pwDF1bU) =c("A", "C", "G", "T")rownames(pwDF4bU) =c("A", "C", "G", "T")``````{r, warning=FALSE}# remove the um_scaf_contigs because they causes problem downstreamuma.seq = uma.seq[-c(24,25,26,27)]#HMHHKAUACCC.gr HMHHKAUACCC.gr =getmotgr(uma.seq, pwm=pwDF1bU, gns_plus, gns_minus)#UHCCUCUYCHCCMUC.grUHCCUCUYCHCCMUC.gr =getmotgr(uma.seq, pwm=pwDF4bU, gns_plus, gns_minus)```Here I am going to find the distribution of the most enriched **HMHHKAUACCC** and frequently occurring **UHCCUCUYCHCCMUC** motifs from Khd4-Ada-Gfp editing sites.```{r}pf =lapply(names(hyper.rep), function(i){#i = "khd4" hyper = hyper.rep[[i]]head(hyper)# calculate distance to nearest AUACCC motif dists.HMHHKAUACCC =distanceToNearest(hyper, HMHHKAUACCC.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.HMHHKAUACCC)] == HMHHKAUACCC.gr$gene.id[to(dists.HMHHKAUACCC)]table(sel) dists.HMHHKAUACCC=subset(dists.HMHHKAUACCC, sel)mcols(dists.HMHHKAUACCC)$motif ="HMHHKAUACCC"# calculate distance to nearest UHCCUCUYCHCCMUC.gr motif dists.UHCCUCUYCHCCMUC =distanceToNearest(hyper, UHCCUCUYCHCCMUC.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.UHCCUCUYCHCCMUC)] == UHCCUCUYCHCCMUC.gr$gene.id[to(dists.UHCCUCUYCHCCMUC)]table(sel) dists.agcagcagc =subset(dists.UHCCUCUYCHCCMUC, sel)mcols(dists.UHCCUCUYCHCCMUC)$motif ="UHCCUCUYCHCCMUC"# get stats of distancessummary(mcols(dists.HMHHKAUACCC)$distance)summary(mcols(dists.UHCCUCUYCHCCMUC)$distance) pf =rbind(mcols(dists.HMHHKAUACCC),mcols(dists.UHCCUCUYCHCCMUC)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)pl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(binwidth =150, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500), ylim=c(0,0.004)) +scale_fill_manual(values=c(Khd4_Ada_Gfp ="dodgerblue4", control_Ada ="grey")) +labs(x="Distance to the neares motif (nt)",y ="motif density")+theme_bw() +facet_wrap(~motif)+myTheme1ggarrange(pl4, labels = LETTERS[1],ncol =1, nrow =1)``````{r}sessionInfo()```