diff --git a/.gitattributes b/.gitattributes index 8c348403415d8557d3f2b1684d649ad833cac78d..87d93a28bf95d149612897b6df23465bef5007de 100644 --- a/.gitattributes +++ b/.gitattributes @@ -16,3 +16,4 @@ assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/RNAseq_rawfiles/Khd4-Gfp.fastq.gz assays/S5_A1_HyperTRIBE_Khd4_workflow/dataset/Fig_S6_characterization_of_Khd4_Ada_GFP_editing_events.html filter=lfs diff=lfs merge=lfs -text assays/S5_A2_HyperTRIBE_motif_analysis/HyperTRIBE_identifies_highly_specific_mRNA_Targets_of_Khd4.html filter=lfs diff=lfs merge=lfs -text assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]editing[[:space:]]sites[[:space:]]are[[:space:]]proximal[[:space:]]to[[:space:]]the[[:space:]]AUACCC[[:space:]]motif.html filter=lfs diff=lfs merge=lfs -text +assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp[[:space:]]edited[[:space:]]transcripts[[:space:]]lacking[[:space:]]the[[:space:]]AUACCC[[:space:]]motif[[:space:]]do[[:space:]]not[[:space:]]show[[:space:]]enrichment[[:space:]]for[[:space:]]other[[:space:]]motifs.html filter=lfs diff=lfs merge=lfs -text diff --git a/assays/S5_A2_HyperTRIBE_motif_analysis/.Rhistory b/assays/S5_A2_HyperTRIBE_motif_analysis/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.html b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.html new file mode 100644 index 0000000000000000000000000000000000000000..c24842a34a4c0785dc4d32ae109a03fed43db575 --- /dev/null +++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.html @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404cff6b9d8c72283db329d622477b4e3893bcd5b0f4c8f106103579bfa1c300 +size 1353103 diff --git a/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.qmd b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.qmd new file mode 100644 index 0000000000000000000000000000000000000000..723c0770dcdff22c106a3aba265535a37e5aa286 --- /dev/null +++ b/assays/S5_A2_HyperTRIBE_motif_analysis/Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs.qmd @@ -0,0 +1,310 @@ +--- +title: "Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs" +date: last-modified +author: + - name: "Srimeenakshi Sankaranarayanan" + +title-block-banner: true +format: + html: + theme: flatly + self-contained: true + code-fold: true + code-tools: true + code-summary: "Show the code" + toc: true + number-sections: true + anchor-sections: true +editor: visual +execute: + echo: true + error: false + warning: false + message: false +--- + +```{r} +# library +library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome +library(ggplot2) +library(BindingSiteFinder) +library(rtracklayer) +library(ComplexHeatmap) +library(GenomicFeatures) +library(forcats) +library(tidyr) +library(dplyr) +library(tidyverse) +library(GenomicRanges) +library(magick) +library(magrittr) +library(gridExtra) +library(IRanges) +library(Biostrings) +library(ggpp) +library(gginnards) +library(ggrepel) +library(ggpubr) +library(ggforce) +library(ggrastr) +library(viridis) +library(reshape2) +library(gprofiler2) +library(ggsci) +library(ggh4x) +library(ggplotify) +library(gridExtra) +library(circlize) +library(EnrichedHeatmap) +library(UpSetR) +library(kableExtra) +library(cowplot) +library(rstatix) +library(beeswarm) +library(clusterProfiler) +library(ggseqlogo) +library(tidyHeatmap) +library(paletteer) +library(ggvenn) +library(colorspace) +library(ggpointdensity) +library(lookup) +library(rstatix) +library(ggrepel) + +``` + +```{r} +load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds") +``` + + +81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, *do novo* motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC. + +## Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif + +```{r} +# Khd4.rep +Krep_no_auaccc = khd4.rep +# make an auaccc column +Krep_no_auaccc$auaccc = NA +Krep_no_auaccc$auaccc = lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id) + +# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81 +Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)] +# no. of genes with no AUACCC motif +#n_distinct(Krep_no_auaccc$gene.id) +names(Krep_no_auaccc) = Krep_no_auaccc$id + +# extend both sides by 250 nt +Krep_no_auaccc_ext = Krep_no_auaccc+250 +# get sequence +Krep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext) +# export the fasta files +#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/" +#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta")) +``` + + +## XSTREME analysis +```{r} +## make a scatter plot +# unique to Khd4-Ada-Gfp +Motif.frame.khd4.noauaccc = data.frame(Motif = +c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"), +absolute = c(13.70, 5.48, 9.59, 5.48, 46.58, 19.18), +EnR = c(98.4, 89.5, 11, 44.7, 1.34, 1.65)) + +pl.khd4.noauaccc = ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+ + geom_point(alpha=1, size=2, color="grey20")+ + scale_color_manual(values = khd4.col)+ + coord_cartesian(xlim=c(0,70))+ + labs(subtitle= "Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif", + x="sequences with motif (%)", + y = "motif enrichment ratio")+ + theme_paper()+ + theme(legend.position = "none") +``` + + +## Most prevalent motif + +```{r} +#UAGGCUUUGG (high prevalency) +A= c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471) +C= c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692) +G = c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403) +U = c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435) + +pwD02U = rbind(A,C,G,U) + +# make the motif logo +ggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold") + +``` + +## Enrichment of the UAGGCUUUGG motif + +```{r} +# function1 +# define a function for identifying the motif occurence +getmotgr=function(seq, pwm, gr_plus, gr_minus){ + +# motif in sense strand +matches_list_strand = lapply(seq, matchPWM, pwm=pwm) +# apply to all +test = lapply(names(matches_list_strand), function (i){ + uc = matches_list_strand[[i]] + df= as.data.frame(uc) + df$chr = i # make a column specifying chromosome name + df = cbind.DataFrame(df) + return(df) +}) +# rbind the resulting df +df1 = do.call(rbind, test) %>% as(., "GRanges") +# find overlaps +ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE) +df1 = df1[from(ov)] +df1$gene.id = gr_plus$gene_id[to(ov)] +strand(df1) =strand(gr_plus)[to(ov)] + +# motif in antisense strand +matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm)) +# apply to all +test1 = lapply(names(matches_list_antistrand), function (i){ + uc1 = matches_list_antistrand[[i]] + df1= as.data.frame(uc1) + df1$chr = i + df1 = cbind.DataFrame(df1) + return(df1) +}) +# rbind the resulting df +df2 = do.call(rbind, test1) %>% as(., "GRanges") +# find overlaps +ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE) +df2= df2[from(ov2)] +df2$gene.id = gr_minus$gene_id[to(ov2)] +strand(df2) =strand(gr_minus)[to(ov2)] +df3 = c(df1,df2) %>% as(., "GRanges") +return(df3) +} + +``` + + +```{r warning=FALSE} +# as found in meme suite analysis +# remove the um_scaf_contigs because they causes problem downstream +uma.seq = getSeq(Umaydis) + +uma.seq = uma.seq[-c(24,25,26,27)] + +# plus strands +gns_plus = genes.GR[strand(genes.GR) == "+"] + +#minus strand +gns_minus = genes.GR[strand(genes.GR) == "-"] +``` + + +```{r warning=FALSE} +# change U to T +rownames(pwD02U) = c("A", "C", "G", "T") + +#UAGGCUUUGG +UAGGCUUUGG.gr = getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus) + +``` + +```{r} +#change genes.gr column name +khd4vcntrl = list( +khd4 = as.data.frame(unique(hyper.rep$khd4$gene.id)) %>% set_colnames(.,"gene.id"), +ctrl = as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>% set_colnames(., "gene.id"), +all = as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype == "protein_coding"]) %>% set_colnames(., "gene.id")) + +# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.gr +khd4vcntrl = lapply(khd4vcntrl, function(i){ + # add gucuugcuvy column and look for matching geneID + i$UAGGCUUUGG = lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id) + return(i) +}) + +# counts the total number of sites in each dataset +df2= lapply (names(khd4vcntrl), function(i){ + #i is khd4 + hyp = khd4vcntrl[[i]] + df2 = table(!is.na(hyp$UAGGCUUUGG)) %>% + as.data.frame() %>% + mutate(set = i, motif = "UAGGCUUUGG") %>% + set_colnames(.,c("with_motif", "no.of.genes","set","motif")) + return(df2)}) +df1_mot2 = do.call(rbind,df2) + + +pldf = ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+ + geom_bar(stat="identity", position="fill")+ + scale_fill_manual(values=c("TRUE" = "grey30", "FALSE" = "white"))+ + labs(x="", + y = "transcripts (%) ")+ + facet_wrap(~motif)+ + myTheme1 + +pldf + + +``` + +```{r} +pf = lapply(names(hyper.rep), function(i){ + + #i= khd4 + hyper = hyper.rep[[i]] + + # calculate distance to nearest UAGGCUUUGG motif + dists.GUCUUGCUVY = distanceToNearest(hyper, UAGGCUUUGG.gr) + + # keep only pairs on same genes + sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)] + table(sel) + dists.GUCUUGCUVY = subset(dists.GUCUUGCUVY, sel) + mcols(dists.GUCUUGCUVY)$motif = "UAGGCUUUGG" + + + + + # get stats of distances + summary(mcols(dists.GUCUUGCUVY)$distance) + + + pf = rbind(mcols(dists.GUCUUGCUVY)) %>% as.data.frame() + pf$set = i + + return(pf) +}) + +pf = do.call(rbind, pf) + + +pl4 = ggplot(pf, aes(x=distance, fill=set)) + + geom_histogram(alpha=0.5, binwidth = 100, aes(y = ..density..), position="dodge") + + coord_cartesian(xlim=c(0, 2500)) + + scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) + + labs(x="Distance to the neares motif (nt)", + y = "motif density")+ + coord_cartesian(ylim=c(0.000,0.004), + xlim = c(0,2500))+ + theme_bw() + + facet_wrap(~motif)+myTheme1 + + +pl4 + +``` +## Session Info + +```{r} +sessionInfo() +``` + +