Uploaded the microscopic data for Study 1

3d208e20 · Srimeenakshi Sankaranarayanan · 0fc603bd · 3d208e20 · 3d208e20 · 3d208e20
Commit 3d208e20 authored 1 year ago by Srimeenakshi Sankaranarayanan
--- a/.gitattributes
+++ b/.gitattributes
@@ -4,3 +4,4 @@ publication/sankaranarayanan-et-al-2023-the-mrna-stability-factor-khd4-defines-a
 _publication/pnas.2301731120.sapp.pdf filter=lfs diff=lfs merge=lfs -text
 _publication/pnas.2301731120.sd05.xlsx filter=lfs diff=lfs merge=lfs -text
 _publication/sankaranarayanan-et-al-2023-the-mrna-stability-factor-khd4-defines-a-specific-mrna-regulon-for-membrane-trafficking-in.pdf filter=lfs diff=lfs merge=lfs -text
+assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/dataset/01_hyperTRIBE_v5_verfied_rrm4.html filter=lfs diff=lfs merge=lfs -text
--- a/assays/S10_A1_microscopy_FM4-64/isa.assay.xlsx
+++ b/assays/S10_A1_microscopy_FM4-64/isa.assay.xlsx
--- a/assays/S10_A2_microscopy_CMAG/isa.assay.xlsx
+++ b/assays/S10_A2_microscopy_CMAG/isa.assay.xlsx
--- a/assays/S1_A1_microscopy_yeast/isa.assay.xlsx
+++ b/assays/S1_A1_microscopy_yeast/isa.assay.xlsx
--- a/assays/S1_A2_microscopy_hyphae/isa.assay.xlsx
+++ b/assays/S1_A2_microscopy_hyphae/isa.assay.xlsx
--- a/assays/S2_A1_microscopy_DIC/isa.assay.xlsx
+++ b/assays/S2_A1_microscopy_DIC/isa.assay.xlsx
--- a/assays/S2_A2_microscopy_Rrm4-Gfp/isa.assay.xlsx
+++ b/assays/S2_A2_microscopy_Rrm4-Gfp/isa.assay.xlsx
--- a/assays/S2_A3_protein_expression/isa.assay.xlsx
+++ b/assays/S2_A3_protein_expression/isa.assay.xlsx
--- a/assays/S2_A4_microscopy_Rfp_Gfp/isa.assay.xlsx
+++ b/assays/S2_A4_microscopy_Rfp_Gfp/isa.assay.xlsx
--- a/assays/S3_A1_HyperTRIBE_Rrm4_workflow/isa.assay.xlsx
+++ b/assays/S3_A1_HyperTRIBE_Rrm4_workflow/isa.assay.xlsx
--- a/assays/S3_A2_HyperTRIBE_Rrm4_analysis/isa.assay.xlsx
+++ b/assays/S3_A2_HyperTRIBE_Rrm4_analysis/isa.assay.xlsx
--- a/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/dataset/01_hyperTRIBE_v5_verfied_rrm4.Rmd
+++ b/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/dataset/01_hyperTRIBE_v5_verfied_rrm4.Rmd
+---
+title: "Rrm4-Ada-Gfp analysis for Sankaranarayanan et.al 2023"
+author: "Srimeenakshi Sankaranarayanan"
+date: "`r Sys.Date()`"
+output: 
+    BiocStyle::html_document:
+    toc: true
+    toc_float: true
+    code_folding: hide
+    theme: Flatly
+---
+In this analysis, the validation of Rrm4-Ada-Gfp analyis were carried out by comparing the hyperTRIBE editing events with the available iCLIP data.
+For this purpose, both iCLIP datasets from Olgeiser et.al.(2019) as well as Stoffel et.al(2024) (under preparation) were used. But only the results from Olgeiser et.al. (2019) were used in Sankaranarayanan et.al(2023).
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE,
+                      results = TRUE, crop=NULL)
+```
+```{r, message=FALSE, warning=FALSE}
+#Loading all the required libraries for the analyses.
+library(BSgenome.Umaydis.ENSMBL.UM1)
+library(ggplot2)
+library(reshape2)
+library(dplyr)
+library(GenomicRanges)
+library(magrittr)
+library(ggvenn)
+library(colorspace)
+library(ggpubr)
+library(ggpointdensity)
+library(lookup)
+library(rstatix)
+library(ggpmisc)
+library(ggrepel)
+library(RColorBrewer)
+library(tibble)
+library(stringr)
+library(gprofiler2)
+library(tidyr)
+library(stats)
+library(DESeq2)
+library(rtracklayer)
+library(ComplexHeatmap)
+library(GenomicFeatures)
+library(GenomicRanges)
+library(magick)
+library(magrittr)
+library(gridExtra)
+library(IRanges)
+library(Biostrings)
+library(ggpmisc)
+library(ggrepel)
+library(ggpubr)
+library(ggseqlogo)
+library(rtracklayer)
+library(circlize)
+```
+```{r}
+#Theme for ggplots
+myTheme1 <- theme_bw() +
+    theme(axis.text = element_text(size = 8, colour="black"),
+          axis.title = element_text(size=8, colour="black", face="bold"),
+          axis.ticks=element_line(color="black"),
+          axis.ticks.length=unit(.15, "cm"),
+          panel.border=element_rect(color="black", fill = NA),
+          panel.background = element_blank(),
+          plot.background = element_blank(),
+          legend.text = element_text(size=12),
+          legend.position = "top")
+```
+```{r, colours}
+#Define colors for plots
+# different target sets
+# high-confidence from hyperTRIBE
+tribe.col = "#3EAE96"
+# Khd4 & control
+Rrm4.col = "#3EAE96"
+ada.col = "#D7D7D7"
+### with the functions lighten() and darken() from the package colorspace you can modulate the colours e.g. in Venn diagrams
+```
+# Preparing *U. maydis* gene annotations
+First, we will import gene annotation and sequence information Here I used the genome annotation of *U. maydis* from Ensembl. Note that in contrast to Pedant, the chromosomes are indicated by integer (eg: 1,2 etc) as opposed to character (Um_chr1 etc).
+```{r}
+# Define GTF file location
+ann = "C:/Users/Sri/Documents/annotation/genome/Ensembl/Ensembl_annotation/Ustilago.gtf"
+# Target gene assignment
+##make TxDb object for storing transcript annotations
+annoDb= makeTxDbFromGFF(file=ann, format="gtf") 
+## import the gtf file as GRanges
+annoInfo = import(ann, format = "gtf")
+## get genes as GRanges
+gns = genes(annoDb) # genes() retreives the positions of all genes 
+## matching the positions of gene_id in annoInfo
+idx = match(gns$gene_id, annoInfo$gene_id) 
+## add Metadata from annoInfo to gns 
+elementMetadata(gns) = cbind(elementMetadata(gns),
+                             elementMetadata(annoInfo)[idx,]) 
+# find all duplicated genes. In case of U. maydis genes its only tRNAs that were duplicated
+dup = gns[duplicated(gns)]
+##all tRNAs with their duplicated annotation
+dup_tRNAs = subsetByOverlaps(gns, dup, type = "equal") 
+##only tRNAs with UMAG_Ids using the string "UMAG"
+dup_UMAG_tRNAs = dup_tRNAs[grepl("UMAG", dup_tRNAs$gene_id)]
+##remove all annotation to tRNAs from gns
+gns = gns[!(gns %in% dup_tRNAs)]
+##combine tRNA annotation having UMAG number with gns 
+genes.GR = c(gns, dup_UMAG_tRNAs) %>%
+      sort()
+# remove objects from workspace
+rm(dup, dup_tRNAs, dup_UMAG_tRNAs, idx)
+```
+```{r}
+#### Since we manaullay extend the genes by 300 nt on either side for UTRs, the following code removes genes that are bothersome to the analysis (MAY BE NEEDED TO CHANGE THIS IF IMPORTANT FACTORS GET LOST)
+# remove genes in the first 300 nt of chromosomes (6 genes)
+sel = start(genes.GR) < 300
+cat(sum(sel), "genes were removed because they were to close to the chromosome start:", paste(genes.GR$gene_id[sel], collapse=", "), fill=TRUE)
+genes.GR = genes.GR[!sel]
+# remove genes in last 300 nt (12 genes)
+chr.length = seqlengths(Umaydis)[as.vector(seqnames(genes.GR))]
+sel = end(genes.GR) + 300 > chr.length
+cat(sum(sel), "genes were removed because they were to close to the chromosome end:", paste(genes.GR$gene_id[sel], collapse=", "), fill=TRUE)
+genes.GR = genes.GR[!sel]
+```
+```{r}
+# define transcript regions
+tx.regions = GRangesList(
+  UTR3 = flank(genes.GR, 300, start=FALSE),
+  ORF = genes.GR,
+  UTR5 = flank(genes.GR, 300)
+)
+# extend genes by 300 nt on both sides
+genes.GR = genes.GR + 300
+```
+# Import iCLIP dataset
+In order to compare the specificity of hyperTRIBE with Rrm4, the comparison with the existing iCLIP data is critical.
+```{r}
+# loading the recent iCLIP data of Rrm4-Gfp
+iCLIP_NKS= import("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/bdsWTmerge.gtf", format = "gtf")
+#loading iCLIP data of Lilli
+iCLIP_lilli = import("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/iCLIP_lilli/new_peaks_Rrm4G_SOB_0.bed")
+# find overlaps
+OV= findOverlaps(iCLIP_lilli, genes.GR)
+#assign gene id
+iCLIP_lilli=iCLIP_lilli[from(OV)]
+iCLIP_lilli$gene_id= genes.GR$gene_id[to(OV)]
+# make a column for Rrm4 binding site
+genes.GR$rrm4_bs_nks = countOverlaps(genes.GR, iCLIP_NKS )
+genes.GR$rrm4_bs_LO = countOverlaps(genes.GR, iCLIP_lilli)
+```
+# Characterise editing sites
+Note that a few editing sites needed to be removed for different reasons: - 6 sites were not on A - due to the gene extension by 300 nt, a handful of editing sites overlapped with 2 genes
+## Import hyperTRIBE data
+```{r}
+hyper.dir = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/Rrm4-Rscript/bedgraph_rrm4_5%"
+# get all bedgraph files
+hyper.files = list.files(hyper.dir, pattern="bedgraph", full.names=TRUE)
+# assign sample names (ATTENTION!)
+names(hyper.files) = c("Rrm4_rep1", "Rrm4_rep2", "ctrl_rep1", "ctrl_rep2")
+```
+First, a list containing hyperTRIBE files was created. For simplicity, only the integer in "no.reads" was kept. The strand  information was added from the annotation file.
+The hyperTRIBE pipeline generates the list of all genomic coordinates overlapping with the corresponding editing events. To resolve this overlapping problem,  all duplicated editing events were removed from the analysis.
+```{r}
+hyper.list = lapply(hyper.files, function(i){
+  # import data
+  hyper = read.table(i, sep="\t")
+  # keep only first 8 columns
+  hyper = hyper[,c(1:2,4:5,8)]
+  # assign column names
+  colnames(hyper) = c("chr", "start", "perc.editing", "no.reads", "gene.id")
+  # extract number of reads
+  hyper$no.reads = strsplit(hyper$no.reads, "_") %>% sapply(., "[[", 2) %>% sub("r", "", .) %>% as.integer(.)
+  # add strand (from annotation)
+  pos = match(hyper$gene.id, genes.GR$gene_id)
+  hyper$strand = as.vector(strand(genes.GR))[pos]
+  # add id
+  hyper$id = paste(hyper$chr, hyper$start, sep="_")
+  return(hyper)
+})
+#head(hyper.list[[1]])
+# CHECK FOR DUPLICATED SITES -> overlapping genes due to extension
+# these genes are removed at the moment (could also be resolved differently if needed)
+sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
+for (i in seq_along(hyper.list)){
+  hyper = hyper.list[[i]]
+  dupls = hyper$id[duplicated(hyper$id)]
+  #print( hyper[hyper$id %in% dupls,] )
+}
+# REMOVE DUPLICATED SITES
+hyper.list = lapply(hyper.list, function(hyper){
+  dupls = hyper$id[duplicated(hyper$id)]
+  hyper = subset(hyper, !(id %in% dupls))
+  return(hyper)
+})
+# check the duplicated events
+sapply(hyper.list, function(hyper) table(duplicated(hyper$id)))
+```
+For those gene that  are removed from the gene annotation, if they contained  editing events, their strand information will be marked with  "*" which were also removed. 
+```{r}
+# convert editing sites into GRanges object
+hyper.GR = lapply(hyper.list, function(hyper){
+  gr = GRanges(
+    seqnames=hyper$chr,
+    ranges=IRanges(start=hyper$start, width=1),
+    strand=hyper$strand,
+    id=hyper$id,
+    gene.id=hyper$gene.id,
+    no.reads = hyper$no.reads,
+    perc.editing = hyper$perc.editing
+  )
+  return(gr)
+}) %>% as(., "GRangesList")
+# how many rows have * for granges (this is expected because, genes at the chromosomes boarder are removed from genes.GR)
+hyper.GR_star_strand = lapply (hyper.GR, function(i){
+ star = which(strand(i) == "*")
+  return(star)
+})
+# remove rows with * for strand
+hyper.GR= lapply(hyper.GR, function(i){
+  i = subset(i, strand != "*")
+  return(i)
+})
+```
+Although the pipeline specifically detects Adenosine, a minor fraction of editing events were from nucleotides other than A. All editing events observed in nucleotides other than Adenosine was removed.
+## Get sequence of editing sites
+```{r}
+# get sequence of all editing sites
+seqs = lapply(hyper.GR, function(gr){
+  seq = getSeq(Umaydis, gr)
+  return(seq)
+})
+# check distribution
+sapply(seqs, table)
+# REMOVE INDIVIDUAL SITES THAT ARE NOT ON A
+hyper.GR = lapply(hyper.GR, function(gr){
+  seqs = getSeq(Umaydis, gr)
+  gr = subset(gr, seqs == "A")
+  return(gr)
+})
+# check that only A's remain
+seqs = lapply(hyper.GR, function(gr){
+  seq = getSeq(Umaydis, gr)
+  return(seq)
+})
+sapply(seqs, table)
+```
+To this end, challenges related to editing event detection, including overlapping genomic coordinates, were resolved. Subsequently, editing events that demonstrated reproducibility across two biological replicates were identified and utilized for all subsequent analyses.
+# Compare between replicates
+```{r}
+# Reproducibilty of editing events between two biological replicates
+# rrm4-ada-gfp
+venn.list = lapply(hyper.GR[1:2], function(hyper){
+  return(hyper$id)
+})
+pl1 = ggvenn(venn.list, names(hyper.GR)[1:2], fill_color = c(lighten(Rrm4.col, 0.2), darken(Rrm4.col, 0.2)), set_name_size = 2) +myTheme1+
+  labs(x = NULL, y = NULL) + 
+  guides(x = "none", y = "none")
+# control-ada
+venn.list = lapply(hyper.GR[3:4], function(hyper){
+  return(hyper$id)
+})
+pl2 = ggvenn(venn.list, names(hyper.GR)[3:4], fill_color = c(lighten(ada.col, 0.2), darken(ada.col, 0.2)), set_name_size = 2) +myTheme1+labs(x = NULL, y = NULL) + 
+  guides(x = "none", y = "none")
+# ggarrange to arranging multiple ggplots in the same plot
+vennrep<-ggarrange(pl1, pl2, 
+          labels = c("A", "B"),
+          ncol = 2, nrow = 1)
+vennrep
+# ggsave
+ggsave(vennrep, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename="FigEV3B_rev_minor_changereps.pdf", height=2.6, width=3.2, unit= "cm")
+```
+```{r}
+# Keep only reproducible editing events were kept.
+# merge rrm4-ada-gfp replicates
+rrm4.tab = merge(hyper.GR$Rrm4_rep1 %>% mcols(), hyper.GR$Rrm4_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
+# select only sites from overlap and add combined info of both replicates
+sel = match(rrm4.tab$id, hyper.GR$Rrm4_rep1$id)
+rrm4.rep = hyper.GR$Rrm4_rep1[sel]
+mcols(rrm4.rep) = rrm4.tab
+# merge control-ada replicates
+ctrl.tab = merge(hyper.GR$ctrl_rep1 %>% mcols(), hyper.GR$ctrl_rep2 %>% mcols(), by=c("id", "gene.id"), all=FALSE, suffixes=c(".rep1", ".rep2")) %>% as.data.frame(.)
+# select only sites from overlap and add combined info of both replicates
+sel = match(ctrl.tab$id, hyper.GR$ctrl_rep1$id)
+ctrl.rep = hyper.GR$ctrl_rep1[sel]
+mcols(ctrl.rep) = ctrl.tab
+## NEW GRANGES OBJECT WITH ONLY REPRODUCIBLE SITES
+hyper.rep = GRangesList(rrm4 = rrm4.rep, ctrl = ctrl.rep)
+# number of reproducible editing sites
+elementNROWS(hyper.rep)
+# number of genes with reproducible sites
+sapply(hyper.rep, function(hyper) length(unique(hyper$gene.id)))
+elementNROWS(hyper.rep)
+```
+# Studying the editing events
+In this analysis, the editing events were analysed for their distribution and reproducibility between two biological replicates. The correlation of determination was used to study the editing reproducibility as it indicates the distance of points between 1:1 line, on the other hand magnitud. And therefore, both  pearson (the analysis is independent of magnitude, in our case, editing percentage ) and  spearman (as the editing relationship is not monotonic) was not used.
+```{r}
+# distribution of % editing
+pf = data.frame(
+  perc.editing = c(rrm4.tab$perc.editing.rep1, rrm4.tab$perc.editing.rep2, ctrl.tab$perc.editing.rep1, ctrl.tab$perc.editing.rep2),
+  sample = rep(c("rrm4.rep1", "rrm4.rep2", "ctrl.rep1", "ctrl.rep2"), rep(c(nrow(rrm4.tab), nrow(ctrl.tab)), each=2)),
+  condition = rep(c("rrm4", "rrm4", "ctrl", "ctrl"), rep(c(nrow(rrm4.tab), nrow(ctrl.tab)), each=2)))
+# rrm4-ada-gfp
+# density
+####FigS4B
+pl1 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  scale_fill_manual(values=c(rrm4.rep1 = lighten(Rrm4.col, 0.2), rrm4.rep2 = darken(Rrm4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+ 
+  xlab("% editing")+
+  ylab("density")+
+  myTheme1
+# histogram
+pl2 = ggplot(pf, aes(x=perc.editing, fill=sample)) +
+  geom_histogram(aes(y = ..density..), position="dodge") +
+  scale_fill_manual(values=c(rrm4.rep1 = lighten(Rrm4.col, 0.2), rrm4.rep2 = darken(Rrm4.col, 0.2), ctrl.rep1 = lighten(ada.col, 0.2), ctrl.rep2 = darken(ada.col, 0.2)))+
+  myTheme1
+# scatter plots of % editing in overlapping sites
+# rrm4-ada-gfp
+#### FigS4C
+pl3 = ggplot(rrm4.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2)) +
+  geom_pointdensity(adjust=0.1) +
+  ggtitle("Shared sites in Rrm4.ada") +
+  theme_bw() +
+  theme(aspect.ratio = 1) +
+  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
+  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
+  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ 
+  labs(subtitle="FigS4C")+
+  myTheme1 
+pl3
+# control-ada
+#### FigS4D
+pl4 = ggplot(ctrl.tab, aes(x=perc.editing.rep1, y=perc.editing.rep2))+ 
+  geom_pointdensity(adjust=0.1) +
+  ggtitle("Shared sites in Ada.ctrl") +
+  theme_bw() +
+  theme(aspect.ratio = 1) +
+  expand_limits(x=c(0,100), y=c(0,100))+geom_smooth(method = "lm", size=1)+
+  stat_regline_equation(label.y = 95, aes(label = ..eq.label..)) +
+  stat_regline_equation(label.y = 85, aes(label = ..rr.label..))+ 
+  labs(subtitle="FigS4D")+
+  myTheme1
+# ggarrange
+figEVC<-ggarrange(pl1, pl2, pl3, pl4,
+          labels = c("A", "B", "C", "D"),
+          ncol = 2, nrow = 2)
+figEVC
+# ggsave
+ggsave(figEVC, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename="FigEV3c.pdf", height=23, width=16, unit= "cm")
+```
+To check the distance between the editing sites and iCLIP binding site, I will use distancetonearest() function in GenomicRanges.
+```{r}
+# Using the iCLIP data of Nina
+dis_df = lapply(names(hyper.rep), function(i){
+  hyper = hyper.rep[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, iCLIP_NKS)
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_NKS$geneID[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+# rowbind the list objects
+dis_df = do.call(rbind, dis_df)
+# distance between editing sites and iCLIP binding sites
+# density
+pl1 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  coord_cartesian(xlim=c(0, 2000)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw() +
+  facet_wrap(~BS)+myTheme1
+# histogram
+pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 100, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2000)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw() +
+  facet_wrap(~BS)+myTheme1
+# ggarrange
+nks<- ggarrange(pl1, pl2,  
+          labels = LETTERS[1:4],
+          ncol = 2, nrow = 1)
+nks
+# ggsave
+ggsave(nks, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev.pdf", height=22, width=16, unit= "cm")
+```
+```{r}
+# using the iCLIP data of Lilli
+dis_df = lapply(names(hyper.rep), function(i){
+  hyper = hyper.rep[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, iCLIP_lilli)
+  # kepp only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_lilli$gene_id[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+# row bind the list objects
+dis_df = do.call(rbind, dis_df)
+# distance between editing sites and iCLIP binding sites
+# density
+pl1 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  coord_cartesian(xlim=c(0, 2000)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw() +
+  facet_wrap(~BS)+myTheme1
+# histogram
+pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 100, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2000)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw() +
+  facet_wrap(~BS)+myTheme1
+# ggarrange
+lilli<- ggarrange(pl1, pl2,  
+          labels = LETTERS[1:4],
+          ncol = 2, nrow = 1)
+lilli
+# ggsave
+ggsave(lilli, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_lilli.pdf", height=22, width=16, unit= "cm")
+```
+# Enrichment of Rrm4-Ada-Gfp targets in iCLIP
+In this analysis, I compared the overlap between high-confidence Rrm4-Ada-Gfp targets and Rrm4-iCLIP targets. 
+```{r}
+# comparing the reproducible editing sites between rrm4-ada and control
+#### Fig2B
+venn.list = lapply(hyper.rep, function(hyper){
+  return(hyper$id)
+})
+pl1=ggvenn(venn.list, names(venn.list), fill_color=c(Rrm4.col, ada.col))+
+    ggtitle("Fig2B")+myTheme1
+# define sets
+rrm4.vs.ctrl=list(
+  rrm4.specific=hyper.rep$rrm4$gene.id[hyper.rep$rrm4$id %in% setdiff(hyper.rep$rrm4$id, hyper.rep$ctrl$id)],
+  shared=hyper.rep$ctrl$gene.id[hyper.rep$ctrl$id %in% intersect(hyper.rep$ctrl$id, hyper.rep$rrm4$id)],
+  ctr.specific=hyper.rep$ctrl$gene.id[hyper.rep$ctrl$id %in% setdiff(hyper.rep$ctrl$id, hyper.rep$rrm4$id)],
+  all=genes.GR$gene_id
+)
+# iCLIP_lilli vs hyperTRIBE genes
+tab.Rrm4Bs_LO = lapply(rrm4.vs.ctrl, function(i){
+  df = genes.GR$rrm4_bs_LO[genes.GR$gene_id %in% i]
+  tabGn = table(df>0)
+  return(tabGn)
+}) %>% melt(.) %>% set_colnames(., c("with.bs","no.of.genes", "set"))
+#### FigS4F
+pl3 = ggplot(tab.Rrm4Bs_LO, aes(x=factor(set, levels=c("rrm4.specific", "shared", "ctr.specific", "all" )), y= no.of.genes, fill=with.bs))+
+  geom_bar(stat="identity", position="fill", color="black")+
+  scale_fill_manual(values=c('TRUE' = "skyblue", 'FALSE' = "lightgrey"))+
+  myTheme1
+# iCLIP_NKS vs hyperTRIBE genes
+tab.Rrm4Bs_NKS = sapply(rrm4.vs.ctrl, function(i){
+  df = genes.GR$rrm4_bs_nks[genes.GR$gene_id %in% i]
+  tabGn = table(df>0)
+  return(tabGn)
+}) %>% melt(.) %>%set_colnames(., c("with.bs","set", "no.of.genes"))
+# Not used for publication
+pl2 = ggplot(tab.Rrm4Bs_NKS, aes(x=set, y= no.of.genes, fill=with.bs))+
+  geom_bar(stat="identity", position="fill", color="black")+
+  scale_fill_manual(values=c('TRUE' = "skyblue", 'FALSE' = "lightgrey"))+
+  myTheme1
+# ggarrange
+Nks_lO_HT<- ggarrange(pl1, pl2, pl3,  
+          labels = LETTERS[1:3],
+          ncol = 2, nrow = 2)
+Nks_lO_HT
+# ggsave
+ggsave(Nks_lO_HT, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Nks_lO_HT.pdf", height=22, width=16, unit= "cm")
+```
+The significance of mRNA target enrichment in comparison to all mRNAs present in *U. maydis*  was carried out using the Fisher's exact test
+```{r}
+# make data frame 
+bs_enrich = data.frame(rrm4.uni = c(683,130),
+                       shared = c(377,79),
+                       ctrl.uni = c(430,157),
+                       all = c(4461,2470)) %>% 
+  set_rownames(.,c("with iCLIP BS", "no iCLIP BS"))
+# Fisher exact test
+# rrm4-ada-gfp unique
+chisq.rrm4bs=bs_enrich[,c(1,4)]
+fisher.test(chisq.rrm4bs)
+# overlap
+chisq.sharedbst=bs_enrich[,c(2,4)]
+fisher.test(chisq.sharedbst)
+#control-Ada unique
+chisq.ctrlbs=bs_enrich[,c(3,4)]
+fisher.test(chisq.ctrlbs)
+```
+# separate Rrm4-Ada-Gfp specific and control-Ada specific
+In here, we will select editing sites that are specific to Rrm4-Ada-Gfp and control-Ada. *Note that some mRNAs could be still present in both data sets.*
+```{r}
+#hyper.rep
+# Rrm4-Ada-Gfp specific
+Rrm4_spe= hyper.rep$rrm4[!hyper.rep$rrm4$id %in% hyper.rep$ctrl$id]
+# editing events shared between Rrm4 and ctrl
+Rrm4_con_sh = hyper.rep$rrm4[hyper.rep$rrm4$id %in% hyper.rep$ctrl$id]
+# control-Ada specific
+con_spe= hyper.rep$ctrl[!hyper.rep$ctrl$id %in% hyper.rep$rrm4$id]
+```
+# Find mRNAs specific to each category
+```{r}
+#venn list
+venn_mRNAs = list(
+  Rrm4Ada_mRNAs = Rrm4_spe$gene.id,
+  controlAda_mRNAs = con_spe$gene.id)
+# isolate mRNAs specific to each data set
+# rrm4-ada-gfp
+Rrm4Ada_txspe = Rrm4_spe[!Rrm4_spe$gene.id %in% con_spe$gene.id]
+# shared
+sh.ada = Rrm4_spe[Rrm4_spe$gene.id %in% con_spe$gene.id]
+# control-ada
+conAda_txspe = con_spe[!con_spe$gene.id %in% Rrm4_spe$gene.id]
+```
+# Study the distance between editing sites and iCLIP binding sites in mRNAs specific to each datasets
+## iCLIP data of LO
+```{r}
+# find binding sites with motif
+# UAUG bindinsites
+uaug = DNAString("TATG")
+uaug.gr = vmatchPattern(uaug, Umaydis)
+ov= findOverlaps(uaug.gr, genes.GR)
+uaug.gr = uaug.gr[from(ov)] 
+uaug.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_lilli, uaug.gr)
+# make a new uaug column
+iCLIP_lilli$uaug = NA
+# find the binding sites having UAUG motif
+iCLIP_lilli$uaug[from(ov)] = uaug.gr$gene.id[to(ov)]
+# UCUC bindinsites
+ucuc= DNAString("TCTC")
+ucuc.gr = vmatchPattern(ucuc, Umaydis)
+ov= findOverlaps(ucuc.gr, genes.GR)
+ucuc.gr = ucuc.gr[from(ov)] 
+ucuc.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_lilli, ucuc.gr)
+# make a new uaug column
+iCLIP_lilli$ucuc = NA
+# find the binding sites having UAUG motif
+iCLIP_lilli$ucuc[from(ov)] = ucuc.gr$gene.id[to(ov)]
+# CAUG bindinsites
+caug= DNAString("CATG")
+caug.gr = vmatchPattern(caug, Umaydis)
+ov= findOverlaps(caug.gr, genes.GR)
+caug.gr = caug.gr[from(ov)] 
+caug.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_lilli, caug.gr)
+# make a new uaug column
+iCLIP_lilli$caug = NA
+# find the binding sites having UAUG motif
+iCLIP_lilli$caug[from(ov)] = caug.gr$gene.id[to(ov)]
+```
+### mRNAs specific to each hyperTRIBE dataset vs iCLIP site with UAUG
+Here, the distance between editing sites and UAUG-containing Rrm4-iCLIP binding sites were calculated in mRNAs specific to each set. 
+```{r}
+# make a list 
+hyper.rep.mRNA = list(
+  rrm4 = Rrm4Ada_txspe,
+  ctrl = conAda_txspe
+) %>% as(., "GRangesList")
+# only those Rrm4 bs with UAUG 
+Rrm4_bs_UAUG = iCLIP_lilli[!is.na(iCLIP_lilli$uaug)]
+## DIstance between bs with uaug ( iCLIP data of lilli) and editing site
+dis = lapply(names(hyper.rep.mRNA), function(i){
+  hyper = hyper.rep.mRNA[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, Rrm4_bs_UAUG)
+  # kepp only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == Rrm4_bs_UAUG$gene_id[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+dis_df = do.call(rbind, dis)
+#######Not used in publication
+pl2 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw()+
+  myTheme1
+### Fig2C right panel
+pl3 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.002)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw()+
+  myTheme1
+# ggarrange
+HTrrm4_UAUG<- ggarrange(pl2, pl3, 
+          labels = LETTERS[1:4],
+          ncol = 2, nrow = 1)
+HTrrm4_UAUG
+# ggsave
+ggsave(HTrrm4_UAUG, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_uaug.pdf", height=16, width=16, unit= "cm")
+```
+```{r}
+# calculate median distance iCLIP site with uaug
+# rrm4-ada-gfp
+med_rrm4bs = median(dis[[1]]$distance)
+# control-ada
+med_ctrlbs = median(dis[[2]]$distance)
+```
+### mRNAs specific to each hyperTRIBE dataset vs all BS
+Here, the distance between editing sites and all Rrm4-iCLIP binding sites were calculated in mRNAs specific to each set.
+```{r}
+# distance a-Gbwtween editing sites and all BS in mRNAs specific to Rrm4-Ada-Gfp and control-Ada
+dis = lapply(names(hyper.rep.mRNA), function(i){
+  hyper = hyper.rep.mRNA[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, iCLIP_lilli)
+  # kepp only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_lilli$gene_id[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+dis_df = do.call(rbind, dis)
+#######Not used in publication
+pl4 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_density(adjust = 1/5, alpha=0.5) +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw()+
+  myTheme1
+### Fig2C left
+pl5 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(rrm4 = Rrm4.col, ctrl = ada.col)) +
+  theme_bw() +
+  myTheme1
+HTrrm4_spemRNA<- ggarrange(pl4, pl5, 
+          labels = LETTERS[1:2],
+          ncol = 2, nrow = 1)
+HTrrm4_spemRNA
+ggsave(HTrrm4_spemRNA, path="C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Rrm4/Plots_new_revision", filename = "Fig3D_rev_all_BS_specificmRNA.pdf", height=8, width=16, unit= "cm")
+```
+```{r}
+# calculate median distance all iCLIP site
+# for rrm4
+med_rrm4bs = median(dis[[1]]$distance)
+med_ctrlbs = median(dis[[2]]$distance)
+```
+## In the iCLIP data of NKS
+```{r}
+# UAUG bindinsites
+uaug = DNAString("TATG")
+uaug.gr = vmatchPattern(uaug, Umaydis)
+ov= findOverlaps(uaug.gr, genes.GR)
+uaug.gr = uaug.gr[from(ov)] 
+uaug.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_NKS, uaug.gr)
+# make a new uaug column
+iCLIP_NKS$uaug = NA
+# find the binding sites having UAUG motif
+iCLIP_NKS$uaug[from(ov)] = uaug.gr$gene.id[to(ov)]
+# UCUC bindinsites
+ucuc= DNAString("TCTC")
+ucuc.gr = vmatchPattern(ucuc, Umaydis)
+ov= findOverlaps(ucuc.gr, genes.GR)
+ucuc.gr = ucuc.gr[from(ov)] 
+ucuc.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_NKS, ucuc.gr)
+# make a new uaug column
+iCLIP_NKS$ucuc = NA
+# find the binding sites having UAUG motif
+iCLIP_NKS$ucuc[from(ov)] = ucuc.gr$gene.id[to(ov)]
+# CAUG bindinsites
+caug= DNAString("CATG")
+caug.gr = vmatchPattern(caug, Umaydis)
+ov= findOverlaps(caug.gr, genes.GR)
+caug.gr = caug.gr[from(ov)] 
+caug.gr$gene.id = genes.GR$gene_id[to(ov)]
+# assign uaug motif to the iCLIP binding sites
+ov = findOverlaps(iCLIP_NKS, caug.gr)
+# make a new uaug column
+iCLIP_NKS$caug = NA
+# find the binding sites having UAUG motif
+iCLIP_NKS$caug[from(ov)] = caug.gr$gene.id[to(ov)]
+```
+### mRNAs specific to each hyperTRIBE dataset vs iCLIP site with UAUG
+```{r}
+# make a list 
+hyper.rep.mRNA = list(
+  rrm4 = Rrm4Ada_txspe,
+  ctrl = conAda_txspe
+) %>% as(., "GRangesList")
+# only those Rrm4 bs with UAUG 
+Rrm4_bs_UAUG = iCLIP_NKS[!is.na(iCLIP_NKS$uaug)]
+## DIstance between bs with uaug ( iCLIP data of lilli) and editing site
+dis = lapply(names(hyper.rep.mRNA), function(i){
+  hyper = hyper.rep.mRNA[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, Rrm4_bs_UAUG)
+  # kepp only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == Rrm4_bs_UAUG$geneID[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+dis_df = do.call(rbind, dis)
+pl3 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.0035)) +
+  scale_fill_manual(values=c(rrm4 = "grey30", ctrl = "grey")) +
+  labs(subtitle = "Rrm4-Ada-Gfp vs UAUG-containing iCLIP2 BS",
+       y="",
+       x=" ")+
+  theme_bw()+myTheme1+
+  theme(axis.text.y=element_blank())
+```
+### mRNAs specific to each hyperTRIBE dataset vs all BS
+```{r}
+# distance a-Gbwtween editing sites and all BS in mRNAs specific to Rrm4-Ada-Gfp and control-Ada
+dis = lapply(names(hyper.rep.mRNA), function(i){
+  hyper = hyper.rep.mRNA[[i]]
+  head(hyper)
+  # calculat distance to nearest iCLIP site
+  dists.rrm4 = distanceToNearest(hyper, iCLIP_NKS)
+  # kepp only pairs on same genes
+  sel = hyper$gene.id[from(dists.rrm4)] == iCLIP_NKS$geneID[to(dists.rrm4)]
+  table(sel)
+  dists.rrm4 = subset(dists.rrm4, sel)
+  mcols(dists.rrm4)$BS = "iCLIP"
+  # make df
+  df = rbind(mcols(dists.rrm4)) %>% as.data.frame()
+  df$set = i
+  return(df)
+})
+dis_df = do.call(rbind, dis)
+pl5 = ggplot(dis_df, aes(x=distance, fill=set)) +
+  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500),ylim=c(0, 0.0035)) +
+  scale_fill_manual(values=c(rrm4 = "grey30", ctrl = "grey")) +
+  labs(subtitle="Rrm4-Ada-Gfp vs all iCLIP2 BS",
+       y= "",
+       x="")+
+  theme_bw()+myTheme1
+HTrrm4_spemRNA =  ggarrange(pl5, pl3,
+          ncol = 2, nrow = 1, common.legend = TRUE)
+HTrrm4_spemRNA = annotate_figure(HTrrm4_spemRNA, left = textGrob("binding site density (nt)", rot = 90), bottom = textGrob("distance to iCLIP2 binding site (nt)"))
+HTrrm4_spemRNA
+ggsave(HTrrm4_spemRNA, path="C:/Users/Sri/Documents/Bioinfo_analysis/iCLIP_course/iCLIP2_Methods_Stoffel_etal_2023/", filename = "iCLIP2vHT.pdf", height=8, width=16, unit= "cm")
+# calculate median distance all iCLIP site
+# for rrm4
+med_rrm4bs = median(dis[[1]]$distance)
+med_ctrlbs = median(dis[[2]]$distance)
+```
+# Session Info
+```{r}
+sessionInfo()
+```
--- a/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/dataset/01_hyperTRIBE_v5_verfied_rrm4.html
+++ b/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/dataset/01_hyperTRIBE_v5_verfied_rrm4.html
--- a/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/isa.assay.xlsx
+++ b/assays/S3_A3_HyperTRIBE_Rrm4_iCLIP/isa.assay.xlsx
--- a/assays/S4_A1_microscopy_DIC_Gfp/isa.assay.xlsx
+++ b/assays/S4_A1_microscopy_DIC_Gfp/isa.assay.xlsx
--- a/assays/S4_A2_protein_expression/isa.assay.xlsx
+++ b/assays/S4_A2_protein_expression/isa.assay.xlsx
--- a/assays/S5_A1_HyperTRIBE_Khd4_workflow/isa.assay.xlsx
+++ b/assays/S5_A1_HyperTRIBE_Khd4_workflow/isa.assay.xlsx
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/isa.assay.xlsx
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/isa.assay.xlsx
--- a/assays/S5_A3_GO-term/isa.assay.xlsx
+++ b/assays/S5_A3_GO-term/isa.assay.xlsx
--- a/assays/S6_A1_RNAseq_data/isa.assay.xlsx
+++ b/assays/S6_A1_RNAseq_data/isa.assay.xlsx