Skip to content
Snippets Groups Projects
Commit 838a6a8d authored by Srimeenakshi Sankaranarayanan's avatar Srimeenakshi Sankaranarayanan
Browse files

Update Khd4-Ada-Gfp editing analysis

parent 855fbed5
No related branches found
No related tags found
No related merge requests found
Showing
with 8572 additions and 0 deletions
...@@ -13,3 +13,5 @@ assays/S3_A2_HyperTRIBE_Rrm4_analysis/dataset/RNAseq_files_raw/control-Ada_rep2. ...@@ -13,3 +13,5 @@ assays/S3_A2_HyperTRIBE_Rrm4_analysis/dataset/RNAseq_files_raw/control-Ada_rep2.
assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep1.fastq.gz filter=lfs diff=lfs merge=lfs -text assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep1.fastq.gz filter=lfs diff=lfs merge=lfs -text
assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep2.fastq.gz filter=lfs diff=lfs merge=lfs -text assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Ada-Gfp_rep2.fastq.gz filter=lfs diff=lfs merge=lfs -text
assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz filter=lfs diff=lfs merge=lfs -text assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz filter=lfs diff=lfs merge=lfs -text
assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text
assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
---
title: "01_Characterization of editing sites"
date: last-modified
author:
- name: "Srimeenakshi Sankaranarayanan"
title-block-banner: true
format:
html:
theme: flatly
self-contained: true
code-fold: true
code-tools: true
code-summary: "Show the code"
toc: true
number-sections: true
anchor-sections: true
editor: visual
execute:
echo: true
error: false
warning: false
message: false
---
```{r}
# library
library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
library(ggplot2)
library(BindingSiteFinder)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(forcats)
library(tidyr)
library(dplyr)
library(tidyverse)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpp)
library(gginnards)
library(ggrepel)
library(ggpubr)
library(ggforce)
library(ggrastr)
library(viridis)
library(reshape2)
library(gprofiler2)
library(ggsci)
library(ggh4x)
library(ggplotify)
library(gridExtra)
library(circlize)
library(EnrichedHeatmap)
library(UpSetR)
library(kableExtra)
library(cowplot)
library(rstatix)
library(beeswarm)
library(clusterProfiler)
library(ggseqlogo)
library(tidyHeatmap)
library(paletteer)
library(ggvenn)
library(colorspace)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggrepel)
```
```{r}
load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
```
# Characterise editing sites
The editing events across transcriptome was identified as described here https://hypertribe.readthedocs.io/en/latest/
It is important to note that a few editing sites needed to be removed for the following reasons: - 6 sites were not on A - due to the gene extension by 300-nt, a handful of editing sites overlapped with 2 genes
```{r}
# Import hyperTRIBE data
hyper.dir = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/"
# get all bedgraph files
hyper.files = list.files(hyper.dir, pattern="bedgraph", full.names=TRUE)
# assign sample names (ATTENTION!)
names(hyper.files) = c("khd4_rep1", "khd4_rep2", "ctrl_rep1", "ctrl_rep2")
```
```{r}
# combine the hyperTRIBE results into a list
hyper.list = lapply(hyper.files, function(i){
# import data
hyper = read.table(i, sep="\t")
# keep only first 8 columns
hyper = hyper[,c(1:2,4:5,8)]
# assign column names
colnames(hyper) = c("chr", "start", "perc.editing", "no.reads", "gene.id")
# extract number of reads
hyper$no.reads = strsplit(hyper$no.reads, "_") %>% sapply(., "[[", 2) %>% sub("r", "", .) %>% as.integer(.)
# add strand (from annotation)
pos = match(hyper$gene.id, genes.GR$gene_id)
hyper$strand = as.vector(strand(genes.GR))[pos]
# add id
hyper$id = paste(hyper$chr, hyper$start, sep="_")
# add number of motifs
sel = match(hyper$gene.id, genes.GR$gene_id)
hyper$AUACCC = genes.GR$AUACCC[sel]
hyper$AGAUCU = genes.GR$AGAUCU[sel]
hyper$GGGTAT = genes.GR$GGGTAT[sel]
hyper$ACACUC = genes.GR$ACACUC[sel]
return(hyper)
})
#head(hyper.list[[1]])
# CHECK FOR DUPLICATED SITES -> overlapping genes due to extension
# these genes are removed at the moment (could also be resolved differently if needed)
sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
for (i in seq_along(hyper.list)){
hyper = hyper.list[[i]]
dupls = hyper$id[duplicated(hyper$id)]
print( hyper[hyper$id %in% dupls,] )
}
# REMOVE DUPLICATED SITES
hyper.list = lapply(hyper.list, function(hyper){
dupls = hyper$id[duplicated(hyper$id)]
hyper = subset(hyper, !(id %in% dupls))
return(hyper)
})
# test
# sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
```
```{r}
# convert editing sites into GRanges object
hyper.GR = lapply(hyper.list, function(hyper){
gr = GRanges(
seqnames=hyper$chr,
ranges=IRanges(start=hyper$start, width=1),
strand=hyper$strand,
id=hyper$id,
gene.id=hyper$gene.id,
no.reads = hyper$no.reads,
perc.editing = hyper$perc.editing
)
return(gr)
}) %>% as(., "GRangesList")
```
## Get sequence of editing sites
```{r}
# get sequence of all editing sites
seqs = lapply(hyper.GR, function(gr){
seq = getSeq(Umaydis, gr)
return(seq)
})
# check distribution
sapply(seqs, table)
#### REMOVE INDIVIDUAL SITES THAT ARE NOT ON A
hyper.GR = lapply(hyper.GR, function(gr){
seqs = getSeq(Umaydis, gr)
gr = subset(gr, seqs == "A")
return(gr)
})
# check that only A's remain
seqs = lapply(hyper.GR, function(gr){
seq = getSeq(Umaydis, gr)
return(seq)
})
sapply(seqs, table)
```
## Compare between replicates
Reproducible editing sites overlapped between the two independent biological replicates of the Khd4-Ada-Gfp and control-Ada data were compared with each other (#1,2- replicates 1 and 2). Only editing events with 5% editing were used for the analysis. (
```{r}
# Venn diagram of Khd4-ada-gfp replicates
venn.list = lapply(hyper.GR[1:2], function(hyper){
return(hyper$id)
})
pl1 = ggvenn(venn.list, names(hyper.GR)[1:2], fill_color = c(lighten(khd4.col, 0.2), darken(khd4.col, 0.2)), set_name_size = 2) +theme_paper()+labs(x = NULL, y = NULL) +
guides(x = "none", y = "none")
# Venn diagram of control-Ada replicates
venn.list = lapply(hyper.GR[3:4], function(hyper){
return(hyper$id)
})
pl2 = ggvenn(venn.list, names(hyper.GR)[3:4], fill_color = c(lighten(ada.col, 0.2), darken(ada.col, 0.2)), set_name_size = 2) +theme_paper()+labs(x = NULL, y = NULL) +
guides(x = "none", y = "none")
#FigS6A
vennrep<-ggarrange(pl1, pl2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
vennrep
```
```{r}
# KEEP ONLY REPRODUCIBLE SITES FOR FURTHER ANALYSES
# merge Khd4-Ada-Gfp replicates
khd4.tab = merge(hyper.GR$khd4_rep1 %>% mcols(), hyper.GR$khd4_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
# select only sites from overlap and add combined info of both replicates
sel = match(khd4.tab$id, hyper.GR$khd4_rep1$id)
khd4.rep = hyper.GR$khd4_rep1[sel]
mcols(khd4.rep) = khd4.tab
# merge control-Ada replicates
ctrl.tab = merge(hyper.GR$ctrl_rep1 %>% mcols(), hyper.GR$ctrl_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
# select only sites from overlap and add combined info of both replicates
sel = match(ctrl.tab$id, hyper.GR$ctrl_rep1$id)
ctrl.rep = hyper.GR$ctrl_rep1[sel]
mcols(ctrl.rep) = ctrl.tab
## NEW GRANGES OBJECT WITH ONLY REPRODUCIBLE SITES
hyper.rep = GRangesList(khd4 = khd4.rep, ctrl = ctrl.rep)
# number of reproducible editing sites
elementNROWS(hyper.rep)
# genes with reproducible sites
sapply(hyper.rep, function(hyper) length(unique(hyper$gene.id)))
elementNROWS(hyper.rep)
```
```{r}
# Venn diaram
venn.list = lapply(hyper.rep, function(hyper){
return(hyper$id)
})
# FigS6A and B
ggvenn(venn.list, names(venn.list), fill_color = c(khd4.col, ada.col)) +
ggtitle("Overlap between Khd4.Ada and Ada.ctrl")
```
# Studying the editing events
```{r}
# distribution of % editing
pf = data.frame(
perc.editing = c(khd4.tab$perc.editing.rep1, khd4.tab$perc.editing.rep2, ctrl.tab$perc.editing.rep1, ctrl.tab$perc.editing.rep2),
sample = rep(c("khd4.rep1", "khd4.rep2", "ctrl.rep1", "ctrl.rep2"), rep(c(nrow(khd4.tab), nrow(ctrl.tab)), each=2)),
condition = rep(c("khd4", "khd4", "ctrl", "ctrl"), rep(c(nrow(khd4.tab), nrow(ctrl.tab)), each=2)))
# khd4-ada
#FigS6C
pl1 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
geom_density(adjust = 1/5, alpha=0.5) +
scale_fill_manual(values=c(khd4.rep1 = lighten(khd4.col, 0.2), khd4.rep2 = darken(khd4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+
xlab("% editing")+
ylab("density")+
myTheme1
pl1
```
Note that I used coefficient determination to check the editing reproducibility because the R2 indicates the distance of points between 1:1 line, on the other hand magnitude. Therefore, I did not use pearson (the analysis is independent of magnitude, in our case, editing percentage) and spearman (as the editing relationship is not monotonic).
```{r}
# Note: used coefficient determination because the R2 indicates the distance of
# points between 1:1 line, on the other hand magnitude; didnt use pearson (as it
# is is independent of magnitude, in our case, independent of editing percentage)
# no spearman (as the editing relationship is not monotonic)
# scatter plots of % editing in overlapping sites
# Khd4-Ada-Gfp
pl3 = ggplot(khd4.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2)) +
geom_point(adjust=0.1, color="grey") +
ggtitle("Reproducible editing sites in Khd4-Ada-Gfp") +
theme_bw() +
labs(x="Khd4-Ada-Gfp #rep1 (editing %)",
y = "Khd4-Ada-Gfp #rep2 (editing %)")+
theme(aspect.ratio = 1) +
expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ myTheme1
pl3
# Control-Ada
pl4 = ggplot(ctrl.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2))+
geom_point(adjust=0.1, color="grey")+
ggtitle("Reproducible editing sites in control-Ada") +
labs(x="Control-Ada #rep1 (editing %)",
y = "Control-Ada #rep2 (editing %)")+
theme_bw() +
theme(aspect.ratio = 1) +
expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ myTheme1
pl4
```
## De novo motif discovery analysis on sequences carrying unique control-Ada editing sites.
XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Control-Ada editing sites (501-nt) as an input
```{r}
# unique to control-Ada
Motif.frame.con = data.frame( Motif = c("GCGCGUG","CGABGACGAVGAVG","RVCAAGAWGRRCAAG","GUGAYUCGUGAYUGU","AGCAGCAGCAG","CGACUUGCG","WSVYCAAGMWGMUCR","ACCGCCAUCU","CAGCUCWAGCC","AWUCAYGAAUS"),
absolute = c(43.7,40.9,21.7,4.9,30.8,72.5,1.7,19.9,76.7,26.3),
EnR = c(1.25,1.17,1.20,1.58,1.09,1.03,1.44,1.01,1.00,0.97),
set=rep("ctrl",10))
# make a ggplot
pl.con = ggplot(Motif.frame.con, aes(x=absolute, y=EnR, color=set))+
geom_point(alpha=1, size=2)+
scale_color_manual(values = ada.col)+
coord_cartesian(ylim=c(0,10))+
labs(subtitle= "control-Ada editing sites",
x="sequences with motif (%)",
y = "motif enrichment ratio")+
theme_paper()+
theme(legend.position = "none")
pl.con
```
```{r}
sessionInfo()
```
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
---
title: "HyperTRIBE_identifies_highly_specific_targets_of_Khd4"
date: last-modified
author:
- name: "Srimeenakshi Sankaranarayanan"
title-block-banner: true
format:
html:
theme: flatly
self-contained: true
code-fold: true
code-tools: true
code-summary: "Show the code"
toc: true
number-sections: true
anchor-sections: true
editor: visual
execute:
echo: true
error: false
warning: false
message: false
---
```{r}
# library
library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
library(ggplot2)
library(BindingSiteFinder)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(forcats)
library(tidyr)
library(dplyr)
library(tidyverse)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpp)
library(gginnards)
library(ggrepel)
library(ggpubr)
library(ggforce)
library(ggrastr)
library(viridis)
library(reshape2)
library(gprofiler2)
library(ggsci)
library(ggh4x)
library(ggplotify)
library(gridExtra)
library(circlize)
library(EnrichedHeatmap)
library(UpSetR)
library(kableExtra)
library(cowplot)
library(rstatix)
library(beeswarm)
library(clusterProfiler)
library(ggseqlogo)
library(tidyHeatmap)
library(paletteer)
library(ggvenn)
library(colorspace)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggrepel)
```
```{r}
load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
```
```{r}
# reproducible editing sites in Khd4-Ada-Gfp and control-Ada
hyper.rep
```
```{r}
# Venn diaram
venn.list = lapply(hyper.rep, function(hyper){
return(hyper$id)
})
names(venn.list) = c("Khd4-Ada-Gfp", "control-Ada")
# FigS6A and B
ggvenn(venn.list, names(venn.list),
fill_color = c("dodgerblue3", ada.col),
set_name_size = 6) +
ggtitle("Overlap between Khd4-Ada-Gfp and control-Ada")+
theme(plot.title = element_text(size = 20, hjust = 0.5, vjust = 3))
```
```{r}
# Identifying the RNA sequence motif
################################################################################
#AUACCC
################################################################################
# get gene sequences
gene.seqs = getSeq(Umaydis, genes.GR)
# find AUACCC motifs
motif.auaccc = DNAString("ATACCC")
# find all AUACCC motifs in the genome
auaccc.gr = vmatchPattern(motif.auaccc, Umaydis)
# count motifs in genes
genes.GR$AUACCC = countOverlaps(genes.GR, auaccc.gr)
#table(genes.GR$AUACCC)
# assign motifs to genes
ov = findOverlaps(auaccc.gr, genes.GR)
# check whether motifs overlap with more than one gene
table(duplicated(from(ov)))
# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
auaccc.gr = auaccc.gr[from(ov)]
auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]
# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
################################################################################
#AGAUCU
################################################################################
# find AGAUCU motifs
motif.ctrl = DNAString("AGATCT")
agaucu.gr = vmatchPattern(motif.ctrl, Umaydis)
genes.GR$AGAUCU = countOverlaps(genes.GR, agaucu.gr)
table(genes.GR$AGAUCU)
# assign motifs to genes
ov = findOverlaps(agaucu.gr, genes.GR)
# check whether motifs overlap with more than one gene
#table(duplicated(from(ov)))
# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
agaucu.gr = agaucu.gr[from(ov)]
agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]
# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)
```
## De novo motif discovery analysis on sequences carrying unique Khd4–Ada–Gfp editing sites.
XSTREME analysis was done using the random genome sequences of lenght 501-nt as a background and unique-Khd4-Ada-Gfp editing sites (501-nt) as an input.
First, the Khd4-Ada-Gfp editing sites were extended on each side by 250nt to make a 501-nt window.
```{r eval=FALSE}
# keep only id column
hyper.spe = lapply(hyper.rep, function (x){
mcols(x) = x[,c(1)]
names(x) = x$id
return(x)
})
# split the reproducible editing sites that are unique to Khd4 and Control , and shared
hyper.specific = list(
Khd4.specific = hyper.spe$khd4[!(hyper.spe$khd4$id %in% hyper.spe$ctrl$id)],
shared = hyper.spe$khd4[hyper.spe$khd4$id %in% hyper.spe$ctrl$id],
ctrl.specific = hyper.spe$ctrl[!(hyper.spe$ctrl$id %in% hyper.spe$khd4$id)]
)
# as a GRanges list
hyper.specific = as(hyper.specific, "GRangesList")
# extend the window by 250 bp on both sides
hyper_250 = lapply(hyper.specific, function(x){
x = x+250 # window 250 bp
})
# get fastafiles
# get sequence
khd4.specific = Biostrings::getSeq(Umaydis, hyper_250$Khd4.specific)
khd4.ctrl.shared = Biostrings::getSeq(Umaydis, hyper_250$shared)
ctrl.specific = Biostrings::getSeq(Umaydis, hyper_250$ctrl.specific)
# export the fasta files
output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"
writeXStringSet(khd4.specific, filepath = paste0(output,"khd4.specific_250.fasta"))
writeXStringSet(khd4.ctrl.shared, filepath = paste0(output,"khd4.ctrl.shared_250.fasta"))
writeXStringSet(ctrl.specific, filepath = paste0(output,"ctrl.specific_250.fasta"))
```
Scatter plot comparing the relative enrichment ratio of enriched motifs with the percentage of each motif in the tested sequences.
```{r}
# making a scatter plot
# unique to Khd4-Ada-Gfp
Motif.frame.khd4 = data.frame(Motif =
c("HMHHKAUACCC","HMKAUACCCVH","UGSAGGCGCURSWGS","UHCCUCUYCHCCMUC","CAGCARCARCADCAG","CCUCKAAACSNCGC","CYAAKSMUGCUYCUR","CYCCCW","CUUGAYUUGUWWYCC","YYUUGUAYUUGUA","UYGCUGCUCUC", "ACMUCUCCAAGVYCA","CUCCUGAUCCUGCCU", "AGCGSUCRUUUGASC", "CCUCUKU", "CAACGCUCCAC","CCAGCAACAGCARSM", "GCAGAUGRUAYCCAK", "RASAAGMVSAWCAAG", "AVUGGUGCUGU"),
absolute = c(60.9, 59.3, 16.0, 65.8, 13.6, 37.4, 11.1, 37.9, 2.5, 6.2, 18.5, 6.2, 5.3, 3.7, 16.0, 50.6, 29.2, 3.7, 38.3, 11.1),
EnR = c(9.40,9.60,8.35,1.46,4.01,1.80,3.30,1.60,37.98,4.82,2.08, 4.34, 5.06, 4.93, 1.81, 1.27, 1.43, 3.88 , 1.10, 1.05),
set=rep("khd4",20))
pl.khd4 = ggplot(Motif.frame.khd4, aes(x=absolute, y=EnR, color=set))+
geom_point(alpha=1, size=2)+
scale_color_manual(values = khd4.col)+
coord_cartesian(ylim=c(0,10))+
labs(subtitle= "Khd4-Ada-Gfp",
x="sequences with motif (%)",
y = "motif enrichment ratio")+
theme_paper()+
theme(legend.position = "none")
pl.khd4
```
Make the sequence logo of the most enriched two motifs
```{r}
################################################################################
# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts
################################################################################
# motif logo with u in it
# Motif-1 TP% = 60.9 ENR-Ratio = 9.40 HMHHKAUACCC
A= c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524
)
C= c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476
)
G = c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
)
U = c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000
)
pwDF1bU = rbind(A,C,G,U)
# MoUif-2 UP% = 59.3 ENR-RaUio = 9.60 HMKAUACCCVH
A= c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404
)
C= c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852
)
G = c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894
)
U = c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849
)
pwDF2bU = rbind(A,C,G,U)
# make ggseqlogo
m1 = ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")
m2 = ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")
pl = ggarrange(m1,m2,
labels = LETTERS[1:2],
nrow = 1)
pl
```
Next I want to select mRNA targets that are specific to Khd4-Ada-Gfp and control-Ada data
```{r}
# Khd4.spe
Khd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]
# mRNAs specific to control
ctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]
# make a list with mRNAs specific to each data set
# list
hyper.spe = list(
Khd4_Ada_Gfp = Khd4.spe,
control_Ada = ctrl.spe.tx
) %>% as(., "GRangesList")
```
## Distribution of editing sites from the nearest AUACCC motif
Only editing sites on mRNAs specific to each dataset were considered
```{r}
pf = lapply(names(hyper.spe), function(i){
#i = "khd4"
hyper = hyper.spe[[i]]
head(hyper)
# calculate distance to nearest AUACCC motif
dists.auacc = distanceToNearest(hyper, auaccc.gr)
# keep only pairs on same genes
sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]
table(sel)
dists.auacc= subset(dists.auacc, sel)
mcols(dists.auacc)$motif = "AUACCC"
# calculate distance to nearest AGAUCU motif
dists.agaucu = distanceToNearest(hyper, agaucu.gr)
# keep only pairs on same genes
sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]
table(sel)
dists.agaucu= subset(dists.agaucu, sel)
mcols(dists.agaucu)$motif = "AGAUCU"
# get stats of distances
summary(mcols(dists.auacc)$distance)
summary(mcols(dists.agaucu)$distance)
pf = rbind(mcols(dists.auacc), mcols(dists.agaucu)) %>% as.data.frame()
pf$set = i
return(pf)
})
pf = do.call(rbind, pf)
# FigS7B
# Histogram
pl4 = ggplot(pf, aes(x=distance, fill=set)) +
geom_histogram( binwidth = 150, aes(y = ..density..), position="dodge") +
coord_cartesian(xlim=c(0, 2500)) +
scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey60")) +
labs(x="distance of editing sites to the nearest motif (nt)",
y="motif density")+
theme_bw() +
facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+myTheme1
pl4
```
```{r}
# median distance between editing sites and the motif
#Khd4-Ada-Gfp
Khd4_auaccc= filter(pf, set == "khd4", motif == "AUACCC")
med_K_auaccc = median(Khd4_auaccc$distance)
#Control-Ada
Ctrl_auaccc= filter(pf, set == "ctrl", motif == "AUACCC")
med_C_auaccc = median(Ctrl_auaccc$distance)
data.frame(Khd4_Ada_Gfp = med_K_auaccc,
control_Ada = med_C_auaccc) %>%
kbl(caption = "median distance between auaccc and editing events",html_font = "Cambria", bold = T, color="black") %>%
kable_styling(bootstrap_options = "striped", "hover", full_width = F, html_font = "Cambria") %>%
column_spec(1, bold = T)
```
## Studying the enrichment of motifs in mRNAs specific to each datasets
Here, we consider only the editing events on Khd4-bound mRNAs and on mRNAs specific to control-Ada.
Khd4-bound mRNAs- All mRNAs that carry reproducible Khd4-Ada-Gfp editing sites with atleast one AUACCC motif.
Control-Ada mRNAs- mRNAs that have reproducible control-Ada editing sites but no Khd4-Ada-Gfp sites
```{r}
# Khd4-bound mRNAs
Khd4_boundRNA= hyper.rep$khd4[(hyper.rep$khd4$gene.id %in% auaccc.gr$gene.id)]
# mRNAs specific to control
ctrl.spe.tx = hyper.rep$ctrl[!(hyper.rep$ctrl$gene.id %in% hyper.rep$khd4$gene.id)]
#check if the numbers are similar to the venn diagram
#n_distinct(khd4.spe.tx$gene.id)
# make a list with mRNAs specific to each data set
Khd4.spe= hyper.rep$khd4[!(hyper.rep$khd4$gene.id %in% hyper.rep$ctrl$gene.id)]
# list
hyper.rep.txspe = list(
khd4 = Khd4_boundRNA,
ctrl = ctrl.spe.tx
) %>% as(., "GRangesList")
```
# Analyis of the high-confidence targets
```{r}
# only Khd4
khd4.rep.edit=hyper.rep$khd4
motifs=genes.GR
khd4.rep.edit$AUACCC=lookup(khd4.rep.edit$gene.id, motifs$gene_id,motifs$AUACCC)
#granges of high confidence Khd4
high.conf.khd4=subset(khd4.rep.edit,AUACCC>0)
high.conf = high.conf.khd4
```
## Transcript region enrichment of motif
Since Khd4 binds AUACCC specifically, we first want to check the transcript region that the AUACCC motif is enriched in high-confidence mRNA targets. Besides using the AGAUCU motif as a control, I also want to use the all hyphae expressed AUACCC-containing mRNAs as a putative targets
Furthermore, I will use the already available AUACCC and AGAUCU motif position in *U. maydis* genome, as these already include the transcript region locations of the motif.
```{r}
# load DGE data to identify hyphae expressed genes
# Use the lookup function to identify geneIDs overlapping with the high confident targets
Fi_wtvskd_DEG$high.conf_targets=lookup(Fi_wtvskd_DEG$gene.ID,high.conf$gene.id, high.conf$gene.id )
# auaccc motif
Fi_wtvskd_DEG$auaccc=lookup(Fi_wtvskd_DEG$gene.ID,auaccc.gr$gene.id, auaccc.gr$gene.id )
# agaucu motif
Fi_wtvskd_DEG$agaucu=lookup(Fi_wtvskd_DEG$gene.ID,agaucu.gr$gene.id, agaucu.gr$gene.id )
# only expressed genes
Fi_wtvskd_BM = Fi_wtvskd_DEG[Fi_wtvskd_DEG$baseMean>10,]
```
```{r}
auaccc_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$auaccc),]
agaucu_DGE = Fi_wtvskd_BM[!is.na(Fi_wtvskd_BM$agaucu),]
```
```{r}
# AUACCC
auaccc.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AUACCCmotif.csv")
auaccc.mot$high.conf.targets = lookup::lookup(auaccc.mot$Gene_ID,high.conf$gene.id,high.conf$gene.id)
auaccc.mot$put.targets = lookup::lookup(auaccc.mot$Gene_ID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)
# AGAUCU
agaucu.mot=read.csv("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/AGAUCUmotif.csv")
agaucu.mot$high.conf.targets = lookup::lookup(agaucu.mot$GeneID,high.conf$gene.id,high.conf$gene.id)
agaucu.mot$put.targets = lookup::lookup(agaucu.mot$GeneID, auaccc_DGE$gene.ID, auaccc_DGE$gene.ID)
```
```{r}
# highconfident targets (auaccc motif)
df_high_conf_auaccc = auaccc.mot[!is.na(auaccc.mot$high.conf.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AUACCC", set = "Khd4_bound_mRNA")
# highconfident targets (agaucu motif)
df_high_conf_agaucuc = agaucu.mot[!is.na(agaucu.mot$high.conf.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AGAUCU", set = "Khd4_bound_mRNA")
# putative targets (auaccc motif)
df_put_auaccc = auaccc.mot[!is.na(auaccc.mot$put.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AUACCC", set = "putative_targets")
# putative targets (agaucu motif)
df_put_agaucuc = agaucu.mot[!is.na(agaucu.mot$put.targets),] %>% .[,"Position"] %>% table(.) %>% as.data.frame %>% set_colnames(.,c("variable", "Freq")) %>% mutate(motif= "AGAUCU", set = "putative_targets")
df = rbind(df_high_conf_auaccc,df_high_conf_agaucuc,df_put_auaccc,df_put_agaucuc)
ggplot(df, aes(x=factor(set, levels=c("Khd4_bound_mRNA", "putative_targets")), y = Freq, fill = factor(variable, levels=c("5UTR", "ORF", "3UTR"))))+
geom_bar(position = position_fill(), stat = "identity")+
scale_fill_brewer(palette = "Greens")+
facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU")))+
labs(x="",
y = "% with motif",
fill= "transcript region")+
myTheme1
```
```{r}
sessionInfo()
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment