Skip to content
Snippets Groups Projects
Commit 0e42568b authored by Srimeenakshi Sankaranarayanan's avatar Srimeenakshi Sankaranarayanan
Browse files

uploaded qmd file

parent 432c04ca
No related branches found
No related tags found
No related merge requests found
...@@ -16,3 +16,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz ...@@ -16,3 +16,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz
assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text
assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text
assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text
assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]edited[[:space:]]transcripts[[:space:]]lacking[[:space:]]the[[:space:]]AUACCC[[:space:]]motif[[:space:]]do[[:space:]]not[[:space:]]show[[:space:]]enrichment[[:space:]]for[[:space:]]other[[:space:]]motifs.html filter=lfs diff=lfs merge=lfs -text
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
---
title: "Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs"
date: last-modified
author:
- name: "Srimeenakshi Sankaranarayanan"
title-block-banner: true
format:
html:
theme: flatly
self-contained: true
code-fold: true
code-tools: true
code-summary: "Show the code"
toc: true
number-sections: true
anchor-sections: true
editor: visual
execute:
echo: true
error: false
warning: false
message: false
---
```{r}
# library
library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
library(ggplot2)
library(BindingSiteFinder)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(forcats)
library(tidyr)
library(dplyr)
library(tidyverse)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpp)
library(gginnards)
library(ggrepel)
library(ggpubr)
library(ggforce)
library(ggrastr)
library(viridis)
library(reshape2)
library(gprofiler2)
library(ggsci)
library(ggh4x)
library(ggplotify)
library(gridExtra)
library(circlize)
library(EnrichedHeatmap)
library(UpSetR)
library(kableExtra)
library(cowplot)
library(rstatix)
library(beeswarm)
library(clusterProfiler)
library(ggseqlogo)
library(tidyHeatmap)
library(paletteer)
library(ggvenn)
library(colorspace)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggrepel)
```
```{r}
load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")
```
81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, *do novo* motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC.
## Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif
```{r}
# Khd4.rep
Krep_no_auaccc = khd4.rep
# make an auaccc column
Krep_no_auaccc$auaccc = NA
Krep_no_auaccc$auaccc = lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id)
# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81
Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)]
# no. of genes with no AUACCC motif
#n_distinct(Krep_no_auaccc$gene.id)
names(Krep_no_auaccc) = Krep_no_auaccc$id
# extend both sides by 250 nt
Krep_no_auaccc_ext = Krep_no_auaccc+250
# get sequence
Krep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext)
# export the fasta files
#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"
#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta"))
```
## XSTREME analysis
```{r}
## make a scatter plot
# unique to Khd4-Ada-Gfp
Motif.frame.khd4.noauaccc = data.frame(Motif =
c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"),
absolute = c(13.70, 5.48, 9.59, 5.48, 46.58, 19.18),
EnR = c(98.4, 89.5, 11, 44.7, 1.34, 1.65))
pl.khd4.noauaccc = ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+
geom_point(alpha=1, size=2, color="grey20")+
scale_color_manual(values = khd4.col)+
coord_cartesian(xlim=c(0,70))+
labs(subtitle= "Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif",
x="sequences with motif (%)",
y = "motif enrichment ratio")+
theme_paper()+
theme(legend.position = "none")
```
## Most prevalent motif
```{r}
#UAGGCUUUGG (high prevalency)
A= c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471)
C= c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692)
G = c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403)
U = c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435)
pwD02U = rbind(A,C,G,U)
# make the motif logo
ggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold")
```
## Enrichment of the UAGGCUUUGG motif
```{r}
# function1
# define a function for identifying the motif occurence
getmotgr=function(seq, pwm, gr_plus, gr_minus){
# motif in sense strand
matches_list_strand = lapply(seq, matchPWM, pwm=pwm)
# apply to all
test = lapply(names(matches_list_strand), function (i){
uc = matches_list_strand[[i]]
df= as.data.frame(uc)
df$chr = i # make a column specifying chromosome name
df = cbind.DataFrame(df)
return(df)
})
# rbind the resulting df
df1 = do.call(rbind, test) %>% as(., "GRanges")
# find overlaps
ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE)
df1 = df1[from(ov)]
df1$gene.id = gr_plus$gene_id[to(ov)]
strand(df1) =strand(gr_plus)[to(ov)]
# motif in antisense strand
matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm))
# apply to all
test1 = lapply(names(matches_list_antistrand), function (i){
uc1 = matches_list_antistrand[[i]]
df1= as.data.frame(uc1)
df1$chr = i
df1 = cbind.DataFrame(df1)
return(df1)
})
# rbind the resulting df
df2 = do.call(rbind, test1) %>% as(., "GRanges")
# find overlaps
ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE)
df2= df2[from(ov2)]
df2$gene.id = gr_minus$gene_id[to(ov2)]
strand(df2) =strand(gr_minus)[to(ov2)]
df3 = c(df1,df2) %>% as(., "GRanges")
return(df3)
}
```
```{r warning=FALSE}
# as found in meme suite analysis
# remove the um_scaf_contigs because they causes problem downstream
uma.seq = getSeq(Umaydis)
uma.seq = uma.seq[-c(24,25,26,27)]
# plus strands
gns_plus = genes.GR[strand(genes.GR) == "+"]
#minus strand
gns_minus = genes.GR[strand(genes.GR) == "-"]
```
```{r warning=FALSE}
# change U to T
rownames(pwD02U) = c("A", "C", "G", "T")
#UAGGCUUUGG
UAGGCUUUGG.gr = getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus)
```
```{r}
#change genes.gr column name
khd4vcntrl = list(
khd4 = as.data.frame(unique(hyper.rep$khd4$gene.id)) %>% set_colnames(.,"gene.id"),
ctrl = as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>% set_colnames(., "gene.id"),
all = as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype == "protein_coding"]) %>% set_colnames(., "gene.id"))
# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.gr
khd4vcntrl = lapply(khd4vcntrl, function(i){
# add gucuugcuvy column and look for matching geneID
i$UAGGCUUUGG = lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id)
return(i)
})
# counts the total number of sites in each dataset
df2= lapply (names(khd4vcntrl), function(i){
#i is khd4
hyp = khd4vcntrl[[i]]
df2 = table(!is.na(hyp$UAGGCUUUGG)) %>%
as.data.frame() %>%
mutate(set = i, motif = "UAGGCUUUGG") %>%
set_colnames(.,c("with_motif", "no.of.genes","set","motif"))
return(df2)})
df1_mot2 = do.call(rbind,df2)
pldf = ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+
geom_bar(stat="identity", position="fill")+
scale_fill_manual(values=c("TRUE" = "grey30", "FALSE" = "white"))+
labs(x="",
y = "transcripts (%) ")+
facet_wrap(~motif)+
myTheme1
pldf
```
```{r}
pf = lapply(names(hyper.rep), function(i){
#i= khd4
hyper = hyper.rep[[i]]
# calculate distance to nearest UAGGCUUUGG motif
dists.GUCUUGCUVY = distanceToNearest(hyper, UAGGCUUUGG.gr)
# keep only pairs on same genes
sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)]
table(sel)
dists.GUCUUGCUVY = subset(dists.GUCUUGCUVY, sel)
mcols(dists.GUCUUGCUVY)$motif = "UAGGCUUUGG"
# get stats of distances
summary(mcols(dists.GUCUUGCUVY)$distance)
pf = rbind(mcols(dists.GUCUUGCUVY)) %>% as.data.frame()
pf$set = i
return(pf)
})
pf = do.call(rbind, pf)
pl4 = ggplot(pf, aes(x=distance, fill=set)) +
geom_histogram(alpha=0.5, binwidth = 100, aes(y = ..density..), position="dodge") +
coord_cartesian(xlim=c(0, 2500)) +
scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) +
labs(x="Distance to the neares motif (nt)",
y = "motif density")+
coord_cartesian(ylim=c(0.000,0.004),
xlim = c(0,2500))+
theme_bw() +
facet_wrap(~motif)+myTheme1
pl4
```
## Session Info
```{r}
sessionInfo()
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment