Contents

In this analysis, the validation of Rrm4-Ada-Gfp analyis were carried out by comparing the hyperTRIBE editing events with the available iCLIP data.

For this purpose, both iCLIP datasets from Olgeiser et.al.(2019) as well as Stoffel et.al(2024) (under preparation) were used. But only the results from Olgeiser et.al. (2019) were used in Sankaranarayanan et.al(2023).

#Loading all the required libraries for the analyses.
library(BSgenome.Umaydis.ENSMBL.UM1)
library(ggplot2)
library(reshape2)
library(dplyr)
library(GenomicRanges)
library(magrittr)
library(ggvenn)
library(colorspace)
library(ggpubr)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggpmisc)
library(ggrepel)
library(RColorBrewer)
library(tibble)
library(stringr)
library(gprofiler2)
library(tidyr)
library(stats)
library(DESeq2)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpmisc)
library(ggrepel)
library(ggpubr)
library(ggseqlogo)
library(rtracklayer)
library(circlize)
#Theme for ggplots
myTheme1 <- theme_bw() +
    theme(axis.text = element_text(size = 8, colour="black"),
          axis.title = element_text(size=8, colour="black", face="bold"),
          axis.ticks=element_line(color="black"),
          axis.ticks.length=unit(.15, "cm"),
          panel.border=element_rect(color="black", fill = NA),
          panel.background = element_blank(),
          plot.background = element_blank(),
          legend.text = element_text(size=12),
          legend.position = "top")
#Define colors for plots
# different target sets
# high-confidence from hyperTRIBE
tribe.col = "#3EAE96"


# Khd4 & control
Rrm4.col = "#3EAE96"
ada.col = "#D7D7D7"


### with the functions lighten() and darken() from the package colorspace you can modulate the colours e.g. in Venn diagrams

1 Preparing U. maydis gene annotations

First, we will import gene annotation and sequence information Here I used the genome annotation of U. maydis from Ensembl. Note that in contrast to Pedant, the chromosomes are indicated by integer (eg: 1,2 etc) as opposed to character (Um_chr1 etc).

# Define GTF file location
ann = "C:/Users/Sri/Documents/annotation/genome/Ensembl/Ensembl_annotation/Ustilago.gtf"

# Target gene assignment
##make TxDb object for storing transcript annotations
annoDb= makeTxDbFromGFF(file=ann, format="gtf") 
## import the gtf file as GRanges
annoInfo = import(ann, format = "gtf")
## get genes as GRanges
gns = genes(annoDb) # genes() retreives the positions of all genes 
## matching the positions of gene_id in annoInfo
idx = match(gns$gene_id, annoInfo$gene_id) 
## add Metadata from annoInfo to gns 
elementMetadata(gns) = cbind(elementMetadata(gns),
                             elementMetadata(annoInfo)[idx,]) 

# find all duplicated genes. In case of U. maydis genes its only tRNAs that were duplicated
dup = gns[duplicated(gns)]
##all tRNAs with their duplicated annotation
dup_tRNAs = subsetByOverlaps(gns, dup, type = "equal") 
##only tRNAs with UMAG_Ids using the string "UMAG"
dup_UMAG_tRNAs = dup_tRNAs[grepl("UMAG", dup_tRNAs$gene_id)]
##remove all annotation to tRNAs from gns
gns = gns[!(gns %in% dup_tRNAs)]
##combine tRNA annotation having UMAG number with gns 
genes.GR = c(gns, dup_UMAG_tRNAs) %>%
      sort()


# remove objects from workspace
rm(dup, dup_tRNAs, dup_UMAG_tRNAs, idx)
#### Since we manaullay extend the genes by 300 nt on either side for UTRs, the following code removes genes that are bothersome to the analysis (MAY BE NEEDED TO CHANGE THIS IF IMPORTANT FACTORS GET LOST)

# remove genes in the first 300 nt of chromosomes (6 genes)
sel = start(genes.GR) < 300
cat(sum(sel), "genes were removed because they were to close to the chromosome start:", paste(genes.GR$gene_id[sel], collapse=", "), fill=TRUE)
## 6 genes were removed because they were to close to the chromosome start: 
## UMAG_00827, UMAG_12058, UMAG_04308, UMAG_05621, UMAG_05798, UMAG_06513
genes.GR = genes.GR[!sel]

# remove genes in last 300 nt (12 genes)
chr.length = seqlengths(Umaydis)[as.vector(seqnames(genes.GR))]
sel = end(genes.GR) + 300 > chr.length
cat(sum(sel), "genes were removed because they were to close to the chromosome end:", paste(genes.GR$gene_id[sel], collapse=", "), fill=TRUE)
## 12 genes were removed because they were to close to the chromosome end: 
## UMAG_11874, UMAG_06476, UMAG_10949, UMAG_06505, UMAG_03635, UMAG_11109, UMAG_04307, UMAG_11979, UMAG_12288, UMAG_05977, UMAG_10981, UMAG_12076
genes.GR = genes.GR[!sel]
# define transcript regions
tx.regions = GRangesList(
  UTR3 = flank(genes.GR, 300, start=FALSE),
  ORF = genes.GR,
  UTR5 = flank(genes.GR, 300)
)

# extend genes by 300 nt on both sides
genes.GR = genes.GR + 300

2 Import iCLIP dataset

In order to compare the specificity of hyperTRIBE with Rrm4, the comparison with the existing iCLIP data is critical.

# loading the recent iCLIP data of Rrm4-Gfp
iCLIP_NKS= import("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/bdsWTmerge.gtf", format = "gtf")

#loading iCLIP data of Lilli
iCLIP_lilli = import("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/iCLIP_lilli/new_peaks_Rrm4G_SOB_0.bed")
# find overlaps
OV= findOverlaps(iCLIP_lilli, genes.GR)
#assign gene id
iCLIP_lilli=iCLIP_lilli[from(OV)]
iCLIP_lilli$gene_id= genes.GR$gene_id[to(OV)]

# make a column for Rrm4 binding site
genes.GR$rrm4_bs_nks = countOverlaps(genes.GR, iCLIP_NKS )
genes.GR$rrm4_bs_LO = countOverlaps(genes.GR, iCLIP_lilli)

3 Characterise editing sites

Note that a few editing sites needed to be removed for different reasons: - 6 sites were not on A - due to the gene extension by 300 nt, a handful of editing sites overlapped with 2 genes

3.1 Import hyperTRIBE data

hyper.dir = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/bedgraph_rrm4_5%"

# get all bedgraph files
hyper.files = list.files(hyper.dir, pattern="bedgraph", full.names=TRUE)

# assign sample names (ATTENTION!)
names(hyper.files) = c("Rrm4_rep1", "Rrm4_rep2", "ctrl_rep1", "ctrl_rep2")

First, a list containing hyperTRIBE files was created. For simplicity, only the integer in “no.reads” was kept. The strand information was added from the annotation file.

The hyperTRIBE pipeline generates the list of all genomic coordinates overlapping with the corresponding editing events. To resolve this overlapping problem, all duplicated editing events were removed from the analysis.

hyper.list = lapply(hyper.files, function(i){
  # import data
  hyper = read.table(i, sep="\t")
  
  # keep only first 8 columns
  hyper = hyper[,c(1:2,4:5,8)]
  
  # assign column names
  colnames(hyper) = c("chr", "start", "perc.editing", "no.reads", "gene.id")
  
  # extract number of reads
  hyper$no.reads = strsplit(hyper$no.reads, "_") %>% sapply(., "[[", 2) %>% sub("r", "", .) %>% as.integer(.)
  
  # add strand (from annotation)
  pos = match(hyper$gene.id, genes.GR$gene_id)
  hyper$strand = as.vector(strand(genes.GR))[pos]
  
  # add id
  hyper$id = paste(hyper$chr, hyper$start, sep="_")
  
  
  return(hyper)
})

#head(hyper.list[[1]])

# CHECK FOR DUPLICATED SITES -> overlapping genes due to extension
# these genes are removed at the moment (could also be resolved differently if needed)
sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
##       Rrm4_rep1 Rrm4_rep2 ctrl_rep1 ctrl_rep2
## FALSE      4146      4139      2936      2730
## TRUE         12        15         8         5
for (i in seq_along(hyper.list)){
  hyper = hyper.list[[i]]
  dupls = hyper$id[duplicated(hyper$id)]
  #print( hyper[hyper$id %in% dupls,] )
}

# REMOVE DUPLICATED SITES
hyper.list = lapply(hyper.list, function(hyper){
  dupls = hyper$id[duplicated(hyper$id)]
  hyper = subset(hyper, !(id %in% dupls))
  return(hyper)
})

# check the duplicated events
sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
## Rrm4_rep1.FALSE Rrm4_rep2.FALSE ctrl_rep1.FALSE ctrl_rep2.FALSE 
##            4134            4124            2928            2725

For those gene that are removed from the gene annotation, if they contained editing events, their strand information will be marked with “*” which were also removed.

# convert editing sites into GRanges object
hyper.GR = lapply(hyper.list, function(hyper){
  gr = GRanges(
    seqnames=hyper$chr,
    ranges=IRanges(start=hyper$start, width=1),
    strand=hyper$strand,
    id=hyper$id,
    gene.id=hyper$gene.id,
    no.reads = hyper$no.reads,
    perc.editing = hyper$perc.editing
  )
  return(gr)
}) %>% as(., "GRangesList")


# how many rows have * for granges (this is expected because, genes at the chromosomes boarder are removed from genes.GR)
hyper.GR_star_strand = lapply (hyper.GR, function(i){
 star = which(strand(i) == "*")
  return(star)
})

# remove rows with * for strand
hyper.GR= lapply(hyper.GR, function(i){
  i = subset(i, strand != "*")
  return(i)
})

Although the pipeline specifically detects Adenosine, a minor fraction of editing events were from nucleotides other than A. All editing events observed in nucleotides other than Adenosine was removed.

3.2 Get sequence of editing sites

# get sequence of all editing sites
seqs = lapply(hyper.GR, function(gr){
  seq = getSeq(Umaydis, gr)
  return(seq)
})

# check distribution
sapply(seqs, table)
## $Rrm4_rep1
## 
##    A    C    G 
## 4128    1    3 
## 
## $Rrm4_rep2
## 
##    A    G 
## 4117    4 
## 
## $ctrl_rep1
## 
##    A    G 
## 2926    2 
## 
## $ctrl_rep2
## 
##    A    G 
## 2724    1
# REMOVE INDIVIDUAL SITES THAT ARE NOT ON A
hyper.GR = lapply(hyper.GR, function(gr){
  seqs = getSeq(Umaydis, gr)
  gr = subset(gr, seqs == "A")
  return(gr)
})

# check that only A's remain
seqs = lapply(hyper.GR, function(gr){
  seq = getSeq(Umaydis, gr)
  return(seq)
})

sapply(seqs, table)
## Rrm4_rep1.A Rrm4_rep2.A ctrl_rep1.A ctrl_rep2.A 
##        4128        4117        2926        2724

To this end, challenges related to editing event detection, including overlapping genomic coordinates, were resolved. Subsequently, editing events that demonstrated reproducibility across two biological replicates were identified and utilized for all subsequent analyses.

4 Compare between replicates

# Reproducibilty of editing events between two biological replicates
# rrm4-ada-gfp
venn.list = lapply(hyper.GR[1:2], function(hyper){
  return(hyper$id)
})
pl1 = ggvenn(venn.list, names(hyper.GR)[1:2], fill_color = c(lighten(Rrm4.col, 0.2), darken(Rrm4.col, 0.2)), set_name_size = 2) +myTheme1+
  labs(x = NULL, y = NULL) + 
  guides(x = "none", y = "none")

# control-ada
venn.list = lapply(hyper.GR[3:4], function(hyper){
  return(hyper$id)
})
pl2 = ggvenn(venn.list, names(hyper.GR)[3:4], fill_color = c(lighten(ada.col, 0.2), darken(ada.col, 0.2)), set_name_size = 2) +myTheme1+labs(x = NULL, y = NULL) + 
  guides(x = "none", y = "none")
  
# ggarrange to arranging multiple ggplots in the same plot
vennrep<-ggarrange(pl1, pl2, 
          labels = c("A", "B"),
          ncol = 2, nrow = 1)
vennrep

# ggsave
ggsave(vennrep, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename="FigEV3B_rev_minor_changereps.pdf", height=2.6, width=3.2, unit= "cm")
# Keep only reproducible editing events were kept.
# merge rrm4-ada-gfp replicates
rrm4.tab = merge(hyper.GR$Rrm4_rep1 %>% mcols(), hyper.GR$Rrm4_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
# select only sites from overlap and add combined info of both replicates
sel = match(rrm4.tab$id, hyper.GR$Rrm4_rep1$id)
rrm4.rep = hyper.GR$Rrm4_rep1[sel]
mcols(rrm4.rep) = rrm4.tab

# merge control-ada replicates
ctrl.tab = merge(hyper.GR$ctrl_rep1 %>% mcols(), hyper.GR$ctrl_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
# select only sites from overlap and add combined info of both replicates
sel = match(ctrl.tab$id, hyper.GR$ctrl_rep1$id)
ctrl.rep = hyper.GR$ctrl_rep1[sel]
mcols(ctrl.rep) = ctrl.tab

## NEW GRANGES OBJECT WITH ONLY REPRODUCIBLE SITES
hyper.rep = GRangesList(rrm4 = rrm4.rep, ctrl = ctrl.rep)

# number of reproducible editing sites
elementNROWS(hyper.rep)
## rrm4 ctrl 
## 1604 1129
# number of genes with reproducible sites
sapply(hyper.rep, function(hyper) length(unique(hyper$gene.id)))
## rrm4 ctrl 
## 1168  990
elementNROWS(hyper.rep)
## rrm4 ctrl 
## 1604 1129

5 Studying the editing events

In this analysis, the editing events were analysed for their distribution and reproducibility between two biological replicates. The correlation of determination was used to study the editing reproducibility as it indicates the distance of points between 1:1 line, on the other hand magnitud. And therefore, both pearson (the analysis is independent of magnitude, in our case, editing percentage ) and spearman (as the editing relationship is not monotonic) was not used.

# distribution of % editing
pf = data.frame(
  perc.editing = c(rrm4.tab$perc.editing.rep1, rrm4.tab$perc.editing.rep2, ctrl.tab$perc.editing.rep1, ctrl.tab$perc.editing.rep2),
  sample = rep(c("rrm4.rep1", "rrm4.rep2", "ctrl.rep1", "ctrl.rep2"), rep(c(nrow(rrm4.tab), nrow(ctrl.tab)), each=2)),
  condition = rep(c("rrm4", "rrm4", "ctrl", "ctrl"), rep(c(nrow(rrm4.tab), nrow(ctrl.tab)), each=2)))
  
# rrm4-ada-gfp
# density
####FigS4B
pl1 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
  geom_density(adjust = 1/5, alpha=0.5) +
  scale_fill_manual(values=c(rrm4.rep1 = lighten(Rrm4.col, 0.2), rrm4.rep2 = darken(Rrm4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+ 
  xlab("% editing")+
  ylab("density")+
  myTheme1

# histogram
pl2 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
  geom_histogram(aes(y = ..density..), position="dodge") +
  scale_fill_manual(values=c(rrm4.rep1 = lighten(Rrm4.col, 0.2), rrm4.rep2 = darken(Rrm4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+
  myTheme1

# scatter plots of % editing in overlapping sites
# rrm4-ada-gfp
#### FigS4C
pl3 = ggplot(rrm4.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2)) +
  geom_pointdensity(adjust=0.1) +
  ggtitle("Shared sites in Rrm4.ada") +
  theme_bw() +
  theme(aspect.ratio = 1) +
  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ 
  labs(subtitle="FigS4C")+
  myTheme1 
pl3

# control-ada
#### FigS4D
pl4 = ggplot(ctrl.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2))+ 
  geom_pointdensity(adjust=0.1) +
  ggtitle("Shared sites in Ada.ctrl") +
  theme_bw() +
  theme(aspect.ratio = 1) +
  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ 
  labs(subtitle="FigS4D")+
  myTheme1
  
# ggarrange
figEVC<-ggarrange(pl1, pl2, pl3, pl4,
          labels = c("A", "B", "C", "D"),
          ncol = 2, nrow = 2)
figEVC

# ggsave
ggsave(figEVC, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename="FigEV3c.pdf", height=23, width=16, unit= "cm")

To check the distance between the editing sites and iCLIP binding site, I will use distancetonearest() function in GenomicRanges.

# Using the iCLIP data of Nina
dis_df = lapply(names(hyper.rep), function(i){
  
  hyper = hyper.rep[[i]]
  head(hyper)
  
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, iCLIP_NKS)
  
  # keep only pairs on same genes
  
  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_NKS$geneID[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  
  return(df)
})

# rowbind the list objects
dis_df = do.call(rbind, dis_df)

# distance between editing sites and iCLIP binding sites
# density
pl1 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_density(adjust = 1/5, alpha=0.5) +
  coord_cartesian(xlim=c(0, 2000)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw() +
  facet_wrap(~BS)+myTheme1

# histogram
pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 100, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2000)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw() +
  facet_wrap(~BS)+myTheme1

# ggarrange
nks<- ggarrange(pl1, pl2,  
          labels = LETTERS[1:4],
          ncol = 2, nrow = 1)
nks

# ggsave
ggsave(nks, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev.pdf", height=22, width=16, unit= "cm")
# using the iCLIP data of Lilli
dis_df = lapply(names(hyper.rep), function(i){
  
  hyper = hyper.rep[[i]]
  head(hyper)
  
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, iCLIP_lilli)
  
  # kepp only pairs on same genes
  
  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_lilli$gene_id[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  
  return(df)
})

# row bind the list objects
dis_df = do.call(rbind, dis_df)

# distance between editing sites and iCLIP binding sites
# density
pl1 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_density(adjust = 1/5, alpha=0.5) +
  coord_cartesian(xlim=c(0, 2000)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw() +
  facet_wrap(~BS)+myTheme1

# histogram
pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 100, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2000)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw() +
  facet_wrap(~BS)+myTheme1

# ggarrange
lilli<- ggarrange(pl1, pl2,  
          labels = LETTERS[1:4],
          ncol = 2, nrow = 1)
lilli

# ggsave
ggsave(lilli, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_lilli.pdf", height=22, width=16, unit= "cm")

6 Enrichment of Rrm4-Ada-Gfp targets in iCLIP

In this analysis, I compared the overlap between high-confidence Rrm4-Ada-Gfp targets and Rrm4-iCLIP targets.

# comparing the reproducible editing sites between rrm4-ada and control
#### Fig2B
venn.list = lapply(hyper.rep, function(hyper){
  return(hyper$id)
})

pl1=ggvenn(venn.list, names(venn.list), fill_color=c(Rrm4.col, ada.col))+
    ggtitle("Fig2B")+myTheme1


# define sets
rrm4.vs.ctrl=list(
  rrm4.specific=hyper.rep$rrm4$gene.id[hyper.rep$rrm4$id %in% setdiff(hyper.rep$rrm4$id, hyper.rep$ctrl$id)],
  shared=hyper.rep$ctrl$gene.id[hyper.rep$ctrl$id %in% intersect(hyper.rep$ctrl$id, hyper.rep$rrm4$id)],
  ctr.specific=hyper.rep$ctrl$gene.id[hyper.rep$ctrl$id %in% setdiff(hyper.rep$ctrl$id, hyper.rep$rrm4$id)],
  all=genes.GR$gene_id
)


# iCLIP_lilli vs hyperTRIBE genes
tab.Rrm4Bs_LO = lapply(rrm4.vs.ctrl, function(i){
  df = genes.GR$rrm4_bs_LO[genes.GR$gene_id %in% i]
  tabGn = table(df>0)
  return(tabGn)
}) %>% melt(.) %>% set_colnames(., c("with.bs","no.of.genes", "set"))

#### FigS4F
pl3 = ggplot(tab.Rrm4Bs_LO, aes(x=factor(set, levels=c("rrm4.specific", "shared", "ctr.specific", "all" )), y= no.of.genes, fill=with.bs))+
  geom_bar(stat="identity", position="fill", color="black")+
  scale_fill_manual(values=c('TRUE' = "skyblue", 'FALSE' = "lightgrey"))+
  myTheme1


# iCLIP_NKS vs hyperTRIBE genes
tab.Rrm4Bs_NKS = sapply(rrm4.vs.ctrl, function(i){
  df = genes.GR$rrm4_bs_nks[genes.GR$gene_id %in% i]
  tabGn = table(df>0)
  return(tabGn)
}) %>% melt(.) %>%set_colnames(., c("with.bs","set", "no.of.genes"))

# Not used for publication
pl2 = ggplot(tab.Rrm4Bs_NKS, aes(x=set, y= no.of.genes, fill=with.bs))+
  geom_bar(stat="identity", position="fill", color="black")+
  scale_fill_manual(values=c('TRUE' = "skyblue", 'FALSE' = "lightgrey"))+
  myTheme1

# ggarrange
Nks_lO_HT<- ggarrange(pl1, pl2, pl3,  
          labels = LETTERS[1:3],
          ncol = 2, nrow = 2)
Nks_lO_HT

# ggsave
ggsave(Nks_lO_HT, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Nks_lO_HT.pdf", height=22, width=16, unit= "cm")

The significance of mRNA target enrichment in comparison to all mRNAs present in U. maydis was carried out using the Fisher’s exact test

# make data frame 
bs_enrich = data.frame(rrm4.uni = c(683,130),
                       shared = c(377,79),
                       ctrl.uni = c(430,157),
                       all = c(4461,2470)) %>% 
  set_rownames(.,c("with iCLIP BS", "no iCLIP BS"))
  
# Fisher exact test
# rrm4-ada-gfp unique
chisq.rrm4bs=bs_enrich[,c(1,4)]
fisher.test(chisq.rrm4bs)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  chisq.rrm4bs
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.391688 3.558903
## sample estimates:
## odds ratio 
##   2.908602
# overlap
chisq.sharedbst=bs_enrich[,c(2,4)]
fisher.test(chisq.sharedbst)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  chisq.sharedbst
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.056910 3.428465
## sample estimates:
## odds ratio 
##   2.641978
#control-Ada unique
chisq.ctrlbs=bs_enrich[,c(3,4)]
fisher.test(chisq.ctrlbs)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  chisq.ctrlbs
## p-value = 1.171e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.251550 1.844536
## sample estimates:
## odds ratio 
##   1.516389

7 separate Rrm4-Ada-Gfp specific and control-Ada specific

In here, we will select editing sites that are specific to Rrm4-Ada-Gfp and control-Ada. Note that some mRNAs could be still present in both data sets.

#hyper.rep
# Rrm4-Ada-Gfp specific
Rrm4_spe= hyper.rep$rrm4[!hyper.rep$rrm4$id %in% hyper.rep$ctrl$id]

# editing events shared between Rrm4 and ctrl
Rrm4_con_sh = hyper.rep$rrm4[hyper.rep$rrm4$id %in% hyper.rep$ctrl$id]

# control-Ada specific
con_spe= hyper.rep$ctrl[!hyper.rep$ctrl$id %in% hyper.rep$rrm4$id]

8 Find mRNAs specific to each category

#venn list
venn_mRNAs = list(
  Rrm4Ada_mRNAs = Rrm4_spe$gene.id,
  controlAda_mRNAs = con_spe$gene.id)

# isolate mRNAs specific to each data set
# rrm4-ada-gfp
Rrm4Ada_txspe = Rrm4_spe[!Rrm4_spe$gene.id %in% con_spe$gene.id]
# shared
sh.ada = Rrm4_spe[Rrm4_spe$gene.id %in% con_spe$gene.id]
# control-ada
conAda_txspe = con_spe[!con_spe$gene.id %in% Rrm4_spe$gene.id]

9 Study the distance between editing sites and iCLIP binding sites in mRNAs specific to each datasets

9.1 iCLIP data of LO

# find binding sites with motif
# UAUG bindinsites
uaug = DNAString("TATG")
uaug.gr = vmatchPattern(uaug, Umaydis)
ov= findOverlaps(uaug.gr, genes.GR)
uaug.gr = uaug.gr[from(ov)] 
uaug.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_lilli, uaug.gr)
# make a new uaug column
iCLIP_lilli$uaug = NA
# find the binding sites having UAUG motif
iCLIP_lilli$uaug[from(ov)] = uaug.gr$gene.id[to(ov)]

# UCUC bindinsites
ucuc= DNAString("TCTC")
ucuc.gr = vmatchPattern(ucuc, Umaydis)
ov= findOverlaps(ucuc.gr, genes.GR)
ucuc.gr = ucuc.gr[from(ov)] 
ucuc.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_lilli, ucuc.gr)
# make a new uaug column
iCLIP_lilli$ucuc = NA
# find the binding sites having UAUG motif
iCLIP_lilli$ucuc[from(ov)] = ucuc.gr$gene.id[to(ov)]


# CAUG bindinsites
caug= DNAString("CATG")
caug.gr = vmatchPattern(caug, Umaydis)
ov= findOverlaps(caug.gr, genes.GR)
caug.gr = caug.gr[from(ov)] 
caug.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_lilli, caug.gr)
# make a new uaug column
iCLIP_lilli$caug = NA
# find the binding sites having UAUG motif
iCLIP_lilli$caug[from(ov)] = caug.gr$gene.id[to(ov)]

9.1.1 mRNAs specific to each hyperTRIBE dataset vs iCLIP site with UAUG

Here, the distance between editing sites and UAUG-containing Rrm4-iCLIP binding sites were calculated in mRNAs specific to each set.

# make a list 
hyper.rep.mRNA = list(
  rrm4 = Rrm4Ada_txspe,
  ctrl = conAda_txspe
) %>% as(., "GRangesList")

# only those Rrm4 bs with UAUG 
Rrm4_bs_UAUG = iCLIP_lilli[!is.na(iCLIP_lilli$uaug)]

## DIstance between bs with uaug ( iCLIP data of lilli) and editing site
dis = lapply(names(hyper.rep.mRNA), function(i){
  
  hyper = hyper.rep.mRNA[[i]]
  head(hyper)
  
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, Rrm4_bs_UAUG)
  
  # kepp only pairs on same genes
  
  sel = hyper$gene.id[from(dists.rrm4)] == Rrm4_bs_UAUG$gene_id[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  
  return(df)
})

dis_df = do.call(rbind, dis)

#######Not used in publication
pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_density(adjust = 1/5, alpha=0.5) +
  coord_cartesian(xlim=c(0, 2500)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw()+
  myTheme1

### Fig2C right panel
pl3 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.002)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw()+
  myTheme1

# ggarrange
HTrrm4_UAUG<- ggarrange(pl2, pl3, 
          labels = LETTERS[1:4],
          ncol = 2, nrow = 1)
HTrrm4_UAUG

# ggsave
ggsave(HTrrm4_UAUG, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_uaug.pdf", height=16, width=16, unit= "cm")
# calculate median distance iCLIP site with uaug
# rrm4-ada-gfp
med_rrm4bs = median(dis[[1]]$distance)
# control-ada
med_ctrlbs = median(dis[[2]]$distance)

9.1.2 mRNAs specific to each hyperTRIBE dataset vs all BS

Here, the distance between editing sites and all Rrm4-iCLIP binding sites were calculated in mRNAs specific to each set.

# distance a-Gbwtween editing sites and all BS in mRNAs specific to Rrm4-Ada-Gfp and control-Ada

dis = lapply(names(hyper.rep.mRNA), function(i){
  hyper = hyper.rep.mRNA[[i]]
  head(hyper)
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, iCLIP_lilli)
  # kepp only pairs on same genes
  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_lilli$gene_id[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  return(df)
})

dis_df = do.call(rbind, dis)

#######Not used in publication
pl4 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_density(adjust = 1/5, alpha=0.5) +
  coord_cartesian(xlim=c(0, 2500)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw()+
  myTheme1

### Fig2C left
pl5 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500)) +
  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
  theme_bw() +
  myTheme1

HTrrm4_spemRNA<- ggarrange(pl4, pl5, 
          labels = LETTERS[1:2],
          ncol = 2, nrow = 1)
HTrrm4_spemRNA

ggsave(HTrrm4_spemRNA, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_all_BS_specificmRNA.pdf", height=8, width=16, unit= "cm")
# calculate median distance all iCLIP site
# for rrm4
med_rrm4bs = median(dis[[1]]$distance)

med_ctrlbs = median(dis[[2]]$distance)

9.2 In the iCLIP data of NKS

# UAUG bindinsites
uaug = DNAString("TATG")
uaug.gr = vmatchPattern(uaug, Umaydis)
ov= findOverlaps(uaug.gr, genes.GR)
uaug.gr = uaug.gr[from(ov)] 
uaug.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_NKS, uaug.gr)
# make a new uaug column
iCLIP_NKS$uaug = NA
# find the binding sites having UAUG motif
iCLIP_NKS$uaug[from(ov)] = uaug.gr$gene.id[to(ov)]

# UCUC bindinsites
ucuc= DNAString("TCTC")
ucuc.gr = vmatchPattern(ucuc, Umaydis)
ov= findOverlaps(ucuc.gr, genes.GR)
ucuc.gr = ucuc.gr[from(ov)] 
ucuc.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_NKS, ucuc.gr)
# make a new uaug column
iCLIP_NKS$ucuc = NA
# find the binding sites having UAUG motif
iCLIP_NKS$ucuc[from(ov)] = ucuc.gr$gene.id[to(ov)]


# CAUG bindinsites
caug= DNAString("CATG")
caug.gr = vmatchPattern(caug, Umaydis)
ov= findOverlaps(caug.gr, genes.GR)
caug.gr = caug.gr[from(ov)] 
caug.gr$gene.id = genes.GR$gene_id[to(ov)]

# assign uaug motif to the iCLIP binding sites
ov = findOverlaps(iCLIP_NKS, caug.gr)
# make a new uaug column
iCLIP_NKS$caug = NA
# find the binding sites having UAUG motif
iCLIP_NKS$caug[from(ov)] = caug.gr$gene.id[to(ov)]

9.2.1 mRNAs specific to each hyperTRIBE dataset vs iCLIP site with UAUG

# make a list 
hyper.rep.mRNA = list(
  rrm4 = Rrm4Ada_txspe,
  ctrl = conAda_txspe
) %>% as(., "GRangesList")

# only those Rrm4 bs with UAUG 
Rrm4_bs_UAUG = iCLIP_NKS[!is.na(iCLIP_NKS$uaug)]

## DIstance between bs with uaug ( iCLIP data of lilli) and editing site
dis = lapply(names(hyper.rep.mRNA), function(i){
  
  hyper = hyper.rep.mRNA[[i]]
  head(hyper)
  
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, Rrm4_bs_UAUG)
  
  # kepp only pairs on same genes
  
  sel = hyper$gene.id[from(dists.rrm4)] == Rrm4_bs_UAUG$geneID[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  
  return(df)
})

dis_df = do.call(rbind, dis)

pl3 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.0035)) +
  scale_fill_manual(values=c(rrm4 = "grey30", ctrl = "grey")) +
  labs(subtitle = "Rrm4-Ada-Gfp vs UAUG-containing iCLIP2 BS",
       y="",
       x=" ")+
  theme_bw()+myTheme1+
  theme(axis.text.y=element_blank())

9.2.2 mRNAs specific to each hyperTRIBE dataset vs all BS

# distance a-Gbwtween editing sites and all BS in mRNAs specific to Rrm4-Ada-Gfp and control-Ada

dis = lapply(names(hyper.rep.mRNA), function(i){
  
  hyper = hyper.rep.mRNA[[i]]
  head(hyper)
  
  # calculat distance to nearest iCLIP site
  dists.rrm4 = distanceToNearest(hyper, iCLIP_NKS)
  
  # kepp only pairs on same genes
  
  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_NKS$geneID[to(dists.rrm4)]
  table(sel)
  dists.rrm4 = subset(dists.rrm4, sel)
  mcols(dists.rrm4)$BS = "iCLIP"
  
  # make df
  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
  df$set = i
  
  return(df)
})

dis_df = do.call(rbind, dis)

pl5 = ggplot(dis_df, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.0035)) +
  scale_fill_manual(values=c(rrm4 = "grey30", ctrl = "grey")) +
  labs(subtitle="Rrm4-Ada-Gfp vs all iCLIP2 BS",
       y= "",
       x="")+
  theme_bw()+myTheme1

HTrrm4_spemRNA =  ggarrange(pl5, pl3,
          ncol = 2, nrow = 1, common.legend = TRUE)

HTrrm4_spemRNA = annotate_figure(HTrrm4_spemRNA, left = textGrob("binding site density (nt)", rot = 90), bottom = textGrob("distance to iCLIP2 binding site (nt)"))

HTrrm4_spemRNA

ggsave(HTrrm4_spemRNA, path="C:/Users/Sri/Documents/Bioinfo_analysis/iCLIP_course/iCLIP2_Methods_Stoffel_etal_2023/", filename = "iCLIP2vHT.pdf", height=8, width=16, unit= "cm")

# calculate median distance all iCLIP site
# for rrm4
med_rrm4bs = median(dis[[1]]$distance)
med_ctrlbs = median(dis[[2]]$distance)

10 Session Info

sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_Germany.utf8  LC_CTYPE=English_Germany.utf8   
## [3] LC_MONETARY=English_Germany.utf8 LC_NUMERIC=C                    
## [5] LC_TIME=English_Germany.utf8    
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] circlize_0.4.15                   ggseqlogo_0.1                    
##  [3] gridExtra_2.3                     magick_2.8.2                     
##  [5] GenomicFeatures_1.54.1            AnnotationDbi_1.64.1             
##  [7] ComplexHeatmap_2.18.0             DESeq2_1.42.0                    
##  [9] SummarizedExperiment_1.32.0       Biobase_2.62.0                   
## [11] MatrixGenerics_1.14.0             matrixStats_1.2.0                
## [13] tidyr_1.3.0                       gprofiler2_0.2.2                 
## [15] stringr_1.5.1                     tibble_3.2.1                     
## [17] RColorBrewer_1.1-3                ggrepel_0.9.4                    
## [19] ggpmisc_0.5.5                     ggpp_0.5.5                       
## [21] rstatix_0.7.2                     lookup_1.0                       
## [23] ggpointdensity_0.1.0              ggpubr_0.6.0                     
## [25] colorspace_2.1-0                  ggvenn_0.1.10                    
## [27] magrittr_2.0.3                    dplyr_1.1.4                      
## [29] reshape2_1.4.4                    ggplot2_3.4.4                    
## [31] BSgenome.Umaydis.ENSMBL.UM1_2.0.0 BSgenome_1.70.1                  
## [33] rtracklayer_1.62.0                BiocIO_1.12.0                    
## [35] Biostrings_2.70.1                 XVector_0.42.0                   
## [37] GenomicRanges_1.54.1              GenomeInfoDb_1.38.5              
## [39] IRanges_2.36.0                    S4Vectors_0.40.2                 
## [41] BiocGenerics_0.48.1               BiocStyle_2.30.0                 
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.15.0        jsonlite_1.8.8           shape_1.4.6             
##   [4] farver_2.1.1             rmarkdown_2.25           ragg_1.2.7              
##   [7] GlobalOptions_0.1.2      zlibbioc_1.48.0          vctrs_0.6.5             
##  [10] memoise_2.0.1            Rsamtools_2.18.0         RCurl_1.98-1.13         
##  [13] progress_1.2.3           htmltools_0.5.7          S4Arrays_1.2.0          
##  [16] polynom_1.4-1            curl_5.2.0               broom_1.0.5             
##  [19] SparseArray_1.2.3        sass_0.4.8               bslib_0.6.1             
##  [22] htmlwidgets_1.6.4        plyr_1.8.9               plotly_4.10.3           
##  [25] cachem_1.0.8             GenomicAlignments_1.38.0 lifecycle_1.0.4         
##  [28] iterators_1.0.14         pkgconfig_2.0.3          Matrix_1.6-4            
##  [31] R6_2.5.1                 fastmap_1.1.1            GenomeInfoDbData_1.2.11 
##  [34] clue_0.3-65              digest_0.6.33            textshaping_0.3.7       
##  [37] RSQLite_2.3.4            labeling_0.4.3           filelock_1.0.3          
##  [40] fansi_1.0.6              mgcv_1.9-1               httr_1.4.7              
##  [43] abind_1.4-5              compiler_4.3.1           bit64_4.0.5             
##  [46] withr_2.5.2              doParallel_1.0.17        backports_1.4.1         
##  [49] BiocParallel_1.36.0      carData_3.0-5            DBI_1.2.0               
##  [52] highr_0.10               biomaRt_2.58.0           ggsignif_0.6.4          
##  [55] MASS_7.3-60              quantreg_5.97            rappdirs_0.3.3          
##  [58] DelayedArray_0.28.0      rjson_0.2.21             tools_4.3.1             
##  [61] glue_1.6.2               restfulr_0.0.15          nlme_3.1-164            
##  [64] cluster_2.1.6            generics_0.1.3           gtable_0.3.4            
##  [67] hms_1.1.3                data.table_1.14.10       xml2_1.3.6              
##  [70] car_3.1-2                utf8_1.2.4               foreach_1.5.2           
##  [73] pillar_1.9.0             splines_4.3.1            BiocFileCache_2.10.1    
##  [76] lattice_0.22-5           survival_3.5-7           bit_4.0.5               
##  [79] SparseM_1.81             tidyselect_1.2.0         locfit_1.5-9.8          
##  [82] knitr_1.45               bookdown_0.37            xfun_0.39               
##  [85] stringi_1.8.3            lazyeval_0.2.2           yaml_2.3.7              
##  [88] evaluate_0.23            codetools_0.2-19         BiocManager_1.30.22     
##  [91] cli_3.6.1                systemfonts_1.0.5        munsell_0.5.0           
##  [94] jquerylib_0.1.4          Rcpp_1.0.11              dbplyr_2.4.0            
##  [97] png_0.1-8                XML_3.99-0.16            parallel_4.3.1          
## [100] MatrixModels_0.5-3       blob_1.2.4               prettyunits_1.2.0       
## [103] bitops_1.0-7             viridisLite_0.4.2        scales_1.3.0            
## [106] purrr_1.0.2              crayon_1.5.2             GetoptLong_1.0.5        
## [109] rlang_1.1.1              cowplot_1.1.2            KEGGREST_1.42.0