uploaded qmd file

0e42568b · Srimeenakshi Sankaranarayanan · 432c04ca · 0e42568b · 0e42568b · 0e42568b
Commit 0e42568b authored 9 months ago by Srimeenakshi Sankaranarayanan
--- a/.gitattributes
+++ b/.gitattributes
@@ -16,3 +16,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz
 assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text
 assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text
 assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text
+assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]edited[[:space:]]transcripts[[:space:]]lacking[[:space:]]the[[:space:]]AUACCC[[:space:]]motif[[:space:]]do[[:space:]]not[[:space:]]show[[:space:]]enrichment[[:space:]]for[[:space:]]other[[:space:]]motifs.html filter=lfs diff=lfs merge=lfs -text
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/.Rhistory
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/.Rhistory
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.html
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.html
--- a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.qmd
+++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.qmd
+---
+title: "Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs"
+date: last-modified
+author: 
+    - name:  "Srimeenakshi Sankaranarayanan"
+title-block-banner: true
+format: 
+    html:
+        theme: flatly
+        self-contained: true
+        code-fold: true
+        code-tools: true
+        code-summary: "Show the code"
+        toc: true
+        number-sections: true
+        anchor-sections: true
+editor: visual
+execute:
+  echo: true
+  error: false
+  warning: false
+  message: false
+---
+```{r}
+# library
+library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
+library(ggplot2)
+library(BindingSiteFinder)
+library(rtracklayer)
+library(ComplexHeatmap)
+library(GenomicFeatures)
+library(forcats)
+library(tidyr)
+library(dplyr)
+library(tidyverse)
+library(GenomicRanges)
+library(magick)
+library(magrittr)
+library(gridExtra)
+library(IRanges)
+library(Biostrings)
+library(ggpp)
+library(gginnards)
+library(ggrepel)
+library(ggpubr)
+library(ggforce)
+library(ggrastr)
+library(viridis)
+library(reshape2)
+library(gprofiler2)
+library(ggsci)
+library(ggh4x)
+library(ggplotify)
+library(gridExtra)
+library(circlize)
+library(EnrichedHeatmap)
+library(UpSetR)
+library(kableExtra)
+library(cowplot)
+library(rstatix)
+library(beeswarm)
+library(clusterProfiler)
+library(ggseqlogo)
+library(tidyHeatmap)
+library(paletteer)
+library(ggvenn)
+library(colorspace)
+library(ggpointdensity)
+library(lookup)
+library(rstatix)
+library(ggrepel)
+```
+```{r}
+load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
+```
+81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, *do novo* motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC.
+## Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif
+```{r}
+# Khd4.rep
+Krep_no_auaccc = khd4.rep
+# make an auaccc column
+Krep_no_auaccc$auaccc = NA
+Krep_no_auaccc$auaccc = lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id)
+# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81
+Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)]
+# no. of genes with no AUACCC motif
+#n_distinct(Krep_no_auaccc$gene.id)
+names(Krep_no_auaccc) = Krep_no_auaccc$id
+# extend both sides by 250 nt
+Krep_no_auaccc_ext = Krep_no_auaccc+250
+# get sequence
+Krep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext)
+# export the fasta files
+#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"
+#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta"))
+```
+## XSTREME analysis
+```{r}
+## make a scatter plot
+# unique to Khd4-Ada-Gfp
+Motif.frame.khd4.noauaccc = data.frame(Motif = 
+c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"),
+absolute = c(13.70,	5.48,	9.59,	5.48,	46.58,	19.18),
+EnR = c(98.4,	89.5,	11,	44.7,	1.34,	1.65))
+pl.khd4.noauaccc = ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+
+  geom_point(alpha=1, size=2, color="grey20")+
+  scale_color_manual(values = khd4.col)+
+  coord_cartesian(xlim=c(0,70))+
+  labs(subtitle= "Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif",
+       x="sequences with motif (%)",
+       y = "motif enrichment ratio")+
+  theme_paper()+
+  theme(legend.position = "none")
+```
+## Most prevalent motif
+```{r}
+#UAGGCUUUGG (high prevalency)
+A= c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471)
+C= c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692)
+G = c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403)
+U = c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435)
+pwD02U = rbind(A,C,G,U)
+# make the motif logo
+ggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold")
+```
+## Enrichment of the UAGGCUUUGG motif
+```{r}
+# function1
+# define a function for identifying the motif occurence
+getmotgr=function(seq, pwm, gr_plus, gr_minus){
+# motif in sense strand
+matches_list_strand = lapply(seq, matchPWM, pwm=pwm)
+# apply to all 
+test = lapply(names(matches_list_strand), function (i){
+  uc = matches_list_strand[[i]]
+  df= as.data.frame(uc) 
+  df$chr = i # make a column specifying chromosome name
+  df = cbind.DataFrame(df)
+  return(df)
+})
+# rbind the resulting df
+df1 = do.call(rbind, test) %>% as(., "GRanges")
+# find overlaps
+ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE)
+df1 = df1[from(ov)]
+df1$gene.id = gr_plus$gene_id[to(ov)]
+strand(df1) =strand(gr_plus)[to(ov)]
+# motif in antisense strand
+matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm))
+# apply to all 
+test1 = lapply(names(matches_list_antistrand), function (i){
+  uc1 = matches_list_antistrand[[i]]
+  df1= as.data.frame(uc1) 
+  df1$chr = i
+  df1 = cbind.DataFrame(df1)
+  return(df1)
+})
+# rbind the resulting df
+df2 = do.call(rbind, test1) %>% as(., "GRanges")
+# find overlaps
+ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE)
+df2= df2[from(ov2)]
+df2$gene.id = gr_minus$gene_id[to(ov2)]
+strand(df2) =strand(gr_minus)[to(ov2)]
+df3 = c(df1,df2) %>% as(., "GRanges")
+return(df3)
+}
+```
+```{r warning=FALSE}
+# as found in meme suite analysis
+# remove the um_scaf_contigs because they causes problem downstream
+uma.seq = getSeq(Umaydis)
+uma.seq = uma.seq[-c(24,25,26,27)]
+# plus strands
+gns_plus = genes.GR[strand(genes.GR) == "+"]
+#minus strand
+gns_minus = genes.GR[strand(genes.GR) == "-"]
+```
+```{r warning=FALSE}
+# change U to T
+rownames(pwD02U) = c("A", "C", "G", "T")
+#UAGGCUUUGG
+UAGGCUUUGG.gr = getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus)
+```
+```{r}
+#change genes.gr column name
+khd4vcntrl = list(
+khd4 = as.data.frame(unique(hyper.rep$khd4$gene.id)) %>% set_colnames(.,"gene.id"),
+ctrl = as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>% set_colnames(., "gene.id"),
+all = as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype == "protein_coding"]) %>% set_colnames(., "gene.id"))
+# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.gr
+khd4vcntrl = lapply(khd4vcntrl, function(i){
+  # add gucuugcuvy column and look for matching geneID
+  i$UAGGCUUUGG = lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id)
+  return(i)
+})
+# counts the total number of sites in each dataset
+df2= lapply (names(khd4vcntrl), function(i){
+  #i is khd4
+  hyp  = khd4vcntrl[[i]]
+  df2 = table(!is.na(hyp$UAGGCUUUGG)) %>%
+                as.data.frame() %>%
+                mutate(set = i, motif = "UAGGCUUUGG") %>%
+                set_colnames(.,c("with_motif", "no.of.genes","set","motif")) 
+ return(df2)}) 
+df1_mot2 = do.call(rbind,df2)
+pldf = ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+
+  geom_bar(stat="identity", position="fill")+
+  scale_fill_manual(values=c("TRUE" = "grey30", "FALSE" = "white"))+
+  labs(x="",
+       y = "transcripts (%) ")+
+  facet_wrap(~motif)+
+  myTheme1
+pldf
+```
+```{r}
+pf = lapply(names(hyper.rep), function(i){
+  #i= khd4
+  hyper = hyper.rep[[i]]
+  # calculate distance to nearest UAGGCUUUGG motif
+  dists.GUCUUGCUVY = distanceToNearest(hyper, UAGGCUUUGG.gr)
+  # keep only pairs on same genes
+  sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)]
+  table(sel)
+  dists.GUCUUGCUVY = subset(dists.GUCUUGCUVY, sel)
+  mcols(dists.GUCUUGCUVY)$motif = "UAGGCUUUGG"
+  # get stats of distances
+  summary(mcols(dists.GUCUUGCUVY)$distance)
+  pf = rbind(mcols(dists.GUCUUGCUVY)) %>% as.data.frame()
+  pf$set = i
+  return(pf)
+})
+pf = do.call(rbind, pf)
+pl4 = ggplot(pf, aes(x=distance, fill=set)) +
+  geom_histogram(alpha=0.5, binwidth = 100, aes(y = ..density..), position="dodge") +
+  coord_cartesian(xlim=c(0, 2500)) +
+  scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) +
+  labs(x="Distance to the neares motif (nt)",
+       y = "motif density")+
+  coord_cartesian(ylim=c(0.000,0.004),
+                  xlim = c(0,2500))+
+  theme_bw() +
+  facet_wrap(~motif)+myTheme1
+pl4
+```
+## Session Info
+```{r}
+sessionInfo()
+```