81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, do novo motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC.
1 Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif
Show the code
# Khd4.repKrep_no_auaccc = khd4.rep# make an auaccc columnKrep_no_auaccc$auaccc =NAKrep_no_auaccc$auaccc =lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id)# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)]# no. of genes with no AUACCC motif#n_distinct(Krep_no_auaccc$gene.id)names(Krep_no_auaccc) = Krep_no_auaccc$id# extend both sides by 250 ntKrep_no_auaccc_ext = Krep_no_auaccc+250# get sequenceKrep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext)# export the fasta files#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta"))
2 XSTREME analysis
Show the code
## make a scatter plot# unique to Khd4-Ada-GfpMotif.frame.khd4.noauaccc =data.frame(Motif =c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"),absolute =c(13.70, 5.48, 9.59, 5.48, 46.58, 19.18),EnR =c(98.4, 89.5, 11, 44.7, 1.34, 1.65))pl.khd4.noauaccc =ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+geom_point(alpha=1, size=2, color="grey20")+scale_color_manual(values = khd4.col)+coord_cartesian(xlim=c(0,70))+labs(subtitle="Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif",x="sequences with motif (%)",y ="motif enrichment ratio")+theme_paper()+theme(legend.position ="none")
3 Most prevalent motif
Show the code
#UAGGCUUUGG (high prevalency)A=c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471)C=c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692)G =c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403)U =c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435)pwD02U =rbind(A,C,G,U)# make the motif logoggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold")
4 Enrichment of the UAGGCUUUGG motif
Show the code
# function1# define a function for identifying the motif occurencegetmotgr=function(seq, pwm, gr_plus, gr_minus){# motif in sense strandmatches_list_strand =lapply(seq, matchPWM, pwm=pwm)# apply to all test =lapply(names(matches_list_strand), function (i){ uc = matches_list_strand[[i]] df=as.data.frame(uc) df$chr = i # make a column specifying chromosome name df =cbind.DataFrame(df)return(df)})# rbind the resulting dfdf1 =do.call(rbind, test) %>%as(., "GRanges")# find overlapsov =findOverlaps(df1, gr_plus, ignore.strand=FALSE)df1 = df1[from(ov)]df1$gene.id = gr_plus$gene_id[to(ov)]strand(df1) =strand(gr_plus)[to(ov)]# motif in antisense strandmatches_list_antistrand =lapply(seq, matchPWM, pwm =reverseComplement(pwm))# apply to all test1 =lapply(names(matches_list_antistrand), function (i){ uc1 = matches_list_antistrand[[i]] df1=as.data.frame(uc1) df1$chr = i df1 =cbind.DataFrame(df1)return(df1)})# rbind the resulting dfdf2 =do.call(rbind, test1) %>%as(., "GRanges")# find overlapsov2 =findOverlaps(df2, gr_minus, ignore.strand=FALSE)df2= df2[from(ov2)]df2$gene.id = gr_minus$gene_id[to(ov2)]strand(df2) =strand(gr_minus)[to(ov2)]df3 =c(df1,df2) %>%as(., "GRanges")return(df3)}
Show the code
# as found in meme suite analysis# remove the um_scaf_contigs because they causes problem downstreamuma.seq =getSeq(Umaydis)uma.seq = uma.seq[-c(24,25,26,27)]# plus strandsgns_plus = genes.GR[strand(genes.GR) =="+"]#minus strandgns_minus = genes.GR[strand(genes.GR) =="-"]
Show the code
# change U to Trownames(pwD02U) =c("A", "C", "G", "T")#UAGGCUUUGGUAGGCUUUGG.gr =getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus)
Show the code
#change genes.gr column namekhd4vcntrl =list(khd4 =as.data.frame(unique(hyper.rep$khd4$gene.id)) %>%set_colnames(.,"gene.id"),ctrl =as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>%set_colnames(., "gene.id"),all =as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype =="protein_coding"]) %>%set_colnames(., "gene.id"))# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.grkhd4vcntrl =lapply(khd4vcntrl, function(i){# add gucuugcuvy column and look for matching geneID i$UAGGCUUUGG =lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id)return(i)})# counts the total number of sites in each datasetdf2=lapply (names(khd4vcntrl), function(i){#i is khd4 hyp = khd4vcntrl[[i]] df2 =table(!is.na(hyp$UAGGCUUUGG)) %>%as.data.frame() %>%mutate(set = i, motif ="UAGGCUUUGG") %>%set_colnames(.,c("with_motif", "no.of.genes","set","motif")) return(df2)}) df1_mot2 =do.call(rbind,df2)pldf =ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+geom_bar(stat="identity", position="fill")+scale_fill_manual(values=c("TRUE"="grey30", "FALSE"="white"))+labs(x="",y ="transcripts (%) ")+facet_wrap(~motif)+ myTheme1pldf
Show the code
pf =lapply(names(hyper.rep), function(i){#i= khd4 hyper = hyper.rep[[i]]# calculate distance to nearest UAGGCUUUGG motif dists.GUCUUGCUVY =distanceToNearest(hyper, UAGGCUUUGG.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)]table(sel) dists.GUCUUGCUVY =subset(dists.GUCUUGCUVY, sel)mcols(dists.GUCUUGCUVY)$motif ="UAGGCUUUGG"# get stats of distancessummary(mcols(dists.GUCUUGCUVY)$distance) pf =rbind(mcols(dists.GUCUUGCUVY)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)pl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(alpha=0.5, binwidth =100, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) +labs(x="Distance to the neares motif (nt)",y ="motif density")+coord_cartesian(ylim=c(0.000,0.004),xlim =c(0,2500))+theme_bw() +facet_wrap(~motif)+myTheme1pl4
---title: "Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs"date: last-modifiedauthor: - name: "Srimeenakshi Sankaranarayanan"title-block-banner: trueformat: html: theme: flatly self-contained: true code-fold: true code-tools: true code-summary: "Show the code" toc: true number-sections: true anchor-sections: trueeditor: visualexecute: echo: true error: false warning: false message: false---```{r}# librarylibrary(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genomelibrary(ggplot2)library(BindingSiteFinder)library(rtracklayer)library(ComplexHeatmap)library(GenomicFeatures)library(forcats)library(tidyr)library(dplyr)library(tidyverse)library(GenomicRanges)library(magick)library(magrittr)library(gridExtra)library(IRanges)library(Biostrings)library(ggpp)library(gginnards)library(ggrepel)library(ggpubr)library(ggforce)library(ggrastr)library(viridis)library(reshape2)library(gprofiler2)library(ggsci)library(ggh4x)library(ggplotify)library(gridExtra)library(circlize)library(EnrichedHeatmap)library(UpSetR)library(kableExtra)library(cowplot)library(rstatix)library(beeswarm)library(clusterProfiler)library(ggseqlogo)library(tidyHeatmap)library(paletteer)library(ggvenn)library(colorspace)library(ggpointdensity)library(lookup)library(rstatix)library(ggrepel)``````{r}load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")```81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, *do novo* motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC.## Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif```{r}# Khd4.repKrep_no_auaccc = khd4.rep# make an auaccc columnKrep_no_auaccc$auaccc =NAKrep_no_auaccc$auaccc =lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id)# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)]# no. of genes with no AUACCC motif#n_distinct(Krep_no_auaccc$gene.id)names(Krep_no_auaccc) = Krep_no_auaccc$id# extend both sides by 250 ntKrep_no_auaccc_ext = Krep_no_auaccc+250# get sequenceKrep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext)# export the fasta files#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta"))```## XSTREME analysis```{r}## make a scatter plot# unique to Khd4-Ada-GfpMotif.frame.khd4.noauaccc =data.frame(Motif =c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"),absolute =c(13.70, 5.48, 9.59, 5.48, 46.58, 19.18),EnR =c(98.4, 89.5, 11, 44.7, 1.34, 1.65))pl.khd4.noauaccc =ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+geom_point(alpha=1, size=2, color="grey20")+scale_color_manual(values = khd4.col)+coord_cartesian(xlim=c(0,70))+labs(subtitle="Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif",x="sequences with motif (%)",y ="motif enrichment ratio")+theme_paper()+theme(legend.position ="none")```## Most prevalent motif```{r}#UAGGCUUUGG (high prevalency)A=c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471)C=c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692)G =c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403)U =c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435)pwD02U =rbind(A,C,G,U)# make the motif logoggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold")```## Enrichment of the UAGGCUUUGG motif```{r}# function1# define a function for identifying the motif occurencegetmotgr=function(seq, pwm, gr_plus, gr_minus){# motif in sense strandmatches_list_strand =lapply(seq, matchPWM, pwm=pwm)# apply to all test =lapply(names(matches_list_strand), function (i){ uc = matches_list_strand[[i]] df=as.data.frame(uc) df$chr = i # make a column specifying chromosome name df =cbind.DataFrame(df)return(df)})# rbind the resulting dfdf1 =do.call(rbind, test) %>%as(., "GRanges")# find overlapsov =findOverlaps(df1, gr_plus, ignore.strand=FALSE)df1 = df1[from(ov)]df1$gene.id = gr_plus$gene_id[to(ov)]strand(df1) =strand(gr_plus)[to(ov)]# motif in antisense strandmatches_list_antistrand =lapply(seq, matchPWM, pwm =reverseComplement(pwm))# apply to all test1 =lapply(names(matches_list_antistrand), function (i){ uc1 = matches_list_antistrand[[i]] df1=as.data.frame(uc1) df1$chr = i df1 =cbind.DataFrame(df1)return(df1)})# rbind the resulting dfdf2 =do.call(rbind, test1) %>%as(., "GRanges")# find overlapsov2 =findOverlaps(df2, gr_minus, ignore.strand=FALSE)df2= df2[from(ov2)]df2$gene.id = gr_minus$gene_id[to(ov2)]strand(df2) =strand(gr_minus)[to(ov2)]df3 =c(df1,df2) %>%as(., "GRanges")return(df3)}``````{r warning=FALSE}# as found in meme suite analysis# remove the um_scaf_contigs because they causes problem downstreamuma.seq =getSeq(Umaydis)uma.seq = uma.seq[-c(24,25,26,27)]# plus strandsgns_plus = genes.GR[strand(genes.GR) =="+"]#minus strandgns_minus = genes.GR[strand(genes.GR) =="-"]``````{r warning=FALSE}# change U to Trownames(pwD02U) =c("A", "C", "G", "T")#UAGGCUUUGGUAGGCUUUGG.gr =getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus)``````{r}#change genes.gr column namekhd4vcntrl =list(khd4 =as.data.frame(unique(hyper.rep$khd4$gene.id)) %>%set_colnames(.,"gene.id"),ctrl =as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>%set_colnames(., "gene.id"),all =as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype =="protein_coding"]) %>%set_colnames(., "gene.id"))# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.grkhd4vcntrl =lapply(khd4vcntrl, function(i){# add gucuugcuvy column and look for matching geneID i$UAGGCUUUGG =lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id)return(i)})# counts the total number of sites in each datasetdf2=lapply (names(khd4vcntrl), function(i){#i is khd4 hyp = khd4vcntrl[[i]] df2 =table(!is.na(hyp$UAGGCUUUGG)) %>%as.data.frame() %>%mutate(set = i, motif ="UAGGCUUUGG") %>%set_colnames(.,c("with_motif", "no.of.genes","set","motif")) return(df2)}) df1_mot2 =do.call(rbind,df2)pldf =ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+geom_bar(stat="identity", position="fill")+scale_fill_manual(values=c("TRUE"="grey30", "FALSE"="white"))+labs(x="",y ="transcripts (%) ")+facet_wrap(~motif)+ myTheme1pldf``````{r}pf =lapply(names(hyper.rep), function(i){#i= khd4 hyper = hyper.rep[[i]]# calculate distance to nearest UAGGCUUUGG motif dists.GUCUUGCUVY =distanceToNearest(hyper, UAGGCUUUGG.gr)# keep only pairs on same genes sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)]table(sel) dists.GUCUUGCUVY =subset(dists.GUCUUGCUVY, sel)mcols(dists.GUCUUGCUVY)$motif ="UAGGCUUUGG"# get stats of distancessummary(mcols(dists.GUCUUGCUVY)$distance) pf =rbind(mcols(dists.GUCUUGCUVY)) %>%as.data.frame() pf$set = ireturn(pf)})pf =do.call(rbind, pf)pl4 =ggplot(pf, aes(x=distance, fill=set)) +geom_histogram(alpha=0.5, binwidth =100, aes(y = ..density..), position="dodge") +coord_cartesian(xlim=c(0, 2500)) +scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) +labs(x="Distance to the neares motif (nt)",y ="motif density")+coord_cartesian(ylim=c(0.000,0.004),xlim =c(0,2500))+theme_bw() +facet_wrap(~motif)+myTheme1pl4```## Session Info```{r}sessionInfo()```