Khd4-Ada-Gfp edited transcripts lacking the AUACCC motif do not show enrichment for other motifs

Author

Srimeenakshi Sankaranarayanan

Published

August 8, 2024

Show the code
# library
library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
library(ggplot2)
library(BindingSiteFinder)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(forcats)
library(tidyr)
library(dplyr)
library(tidyverse)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpp)
library(gginnards)
library(ggrepel)
library(ggpubr)
library(ggforce)
library(ggrastr)
library(viridis)
library(reshape2)
library(gprofiler2)
library(ggsci)
library(ggh4x)
library(ggplotify)
library(gridExtra)
library(circlize)
library(EnrichedHeatmap)
library(UpSetR)
library(kableExtra)
library(cowplot)
library(rstatix)
library(beeswarm)
library(clusterProfiler)
library(ggseqlogo)
library(tidyHeatmap)
library(paletteer)
library(ggvenn)
library(colorspace)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggrepel)
Show the code
load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")

81 of 377 editing events were removed from the analysis as the transcripts hosting the editing sites lack the AUACCC motif. Here, do novo motif discovery analysis was performed around these editing sites to check if Khd4 can bind to motif other than the AUACCC.

1 Khd4-Ada-Gfp editing sites on transcripts lacking the AUACCC motif

Show the code
# Khd4.rep
Krep_no_auaccc = khd4.rep
# make an auaccc column
Krep_no_auaccc$auaccc = NA
Krep_no_auaccc$auaccc = lookup(Krep_no_auaccc$gene.id, auaccc.gr$gene.id, auaccc.gr$gene.id)

# Khd4-Ada-Gfp editing sites without AUACCC motif in them. n=81
Krep_no_auaccc = Krep_no_auaccc[is.na(Krep_no_auaccc$auaccc)]
# no. of genes with no AUACCC motif
#n_distinct(Krep_no_auaccc$gene.id)
names(Krep_no_auaccc) = Krep_no_auaccc$id

# extend both sides by 250 nt
Krep_no_auaccc_ext = Krep_no_auaccc+250
# get sequence
Krep_no_auaccc_ext = Biostrings::getSeq(Umaydis, Krep_no_auaccc_ext)
# export the fasta files
#output = "C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/TRIBE_data/Editing_events_all/Khd4/bedgraphs/MEME/fasta/"
#writeXStringSet(Krep_no_auaccc_ext, filepath = paste0(output,"Krep_no_auaccc_ext_250.fasta"))

2 XSTREME analysis

Show the code
## make a scatter plot
# unique to Khd4-Ada-Gfp
Motif.frame.khd4.noauaccc = data.frame(Motif = 
c("CWCUUKUGYCUUGY", "UCAUCUCUCGU", "UCAUSWYUCGU", "YMUAUAUCCCAGHC", "UAGGCUUUGG", "GCUUCACUAGCAGSC"),
absolute = c(13.70, 5.48,   9.59,   5.48,   46.58,  19.18),
EnR = c(98.4,   89.5,   11, 44.7,   1.34,   1.65))

pl.khd4.noauaccc = ggplot(Motif.frame.khd4.noauaccc, aes(x=absolute, y=EnR))+
  geom_point(alpha=1, size=2, color="grey20")+
  scale_color_manual(values = khd4.col)+
  coord_cartesian(xlim=c(0,70))+
  labs(subtitle= "Khd4-Ada-Gfp editing events in transcripts lacking AUAUCCC motif",
       x="sequences with motif (%)",
       y = "motif enrichment ratio")+
  theme_paper()+
  theme(legend.position = "none")

3 Most prevalent motif

Show the code
#UAGGCUUUGG (high prevalency)
A= c(0.000836,0.914690,0.000836,0.000836,0.000836,0.107471,0.107471,0.000836,0.000836,0.107471)
C= c(0.001057,0.083589,0.001057,0.107692,0.997443,0.001057,0.001057,0.001057,0.001057,0.107692)
G = c(0.060271,0.000921,0.997308,0.890673,0.000921,0.000921,0.214191,0.000921,0.997308,0.677403)
U = c(0.937836,0.000800,0.000800,0.000800,0.000800,0.890551,0.677281,0.997186,0.000800,0.107435)
 
pwD02U = rbind(A,C,G,U)

# make the motif logo
ggseqlogo(pwD02U, col_scheme="nucleotide", font="helvetica_bold")

4 Enrichment of the UAGGCUUUGG motif

Show the code
# function1
# define a function for identifying the motif occurence
getmotgr=function(seq, pwm, gr_plus, gr_minus){
  
# motif in sense strand
matches_list_strand = lapply(seq, matchPWM, pwm=pwm)
# apply to all 
test = lapply(names(matches_list_strand), function (i){
  uc = matches_list_strand[[i]]
  df= as.data.frame(uc) 
  df$chr = i # make a column specifying chromosome name
  df = cbind.DataFrame(df)
  return(df)
})
# rbind the resulting df
df1 = do.call(rbind, test) %>% as(., "GRanges")
# find overlaps
ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE)
df1 = df1[from(ov)]
df1$gene.id = gr_plus$gene_id[to(ov)]
strand(df1) =strand(gr_plus)[to(ov)]

# motif in antisense strand
matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm))
# apply to all 
test1 = lapply(names(matches_list_antistrand), function (i){
  uc1 = matches_list_antistrand[[i]]
  df1= as.data.frame(uc1) 
  df1$chr = i
  df1 = cbind.DataFrame(df1)
  return(df1)
})
# rbind the resulting df
df2 = do.call(rbind, test1) %>% as(., "GRanges")
# find overlaps
ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE)
df2= df2[from(ov2)]
df2$gene.id = gr_minus$gene_id[to(ov2)]
strand(df2) =strand(gr_minus)[to(ov2)]
df3 = c(df1,df2) %>% as(., "GRanges")
return(df3)
}
Show the code
# as found in meme suite analysis
# remove the um_scaf_contigs because they causes problem downstream
uma.seq = getSeq(Umaydis)

uma.seq = uma.seq[-c(24,25,26,27)]

# plus strands
gns_plus = genes.GR[strand(genes.GR) == "+"]

#minus strand
gns_minus = genes.GR[strand(genes.GR) == "-"]
Show the code
# change U to T
rownames(pwD02U) = c("A", "C", "G", "T")

#UAGGCUUUGG
UAGGCUUUGG.gr = getmotgr(uma.seq, pwm=pwD02U, gns_plus, gns_minus)
Show the code
#change genes.gr column name
khd4vcntrl = list(
khd4 = as.data.frame(unique(hyper.rep$khd4$gene.id)) %>% set_colnames(.,"gene.id"),
ctrl = as.data.frame(unique(hyper.rep$ctrl$gene.id)) %>% set_colnames(., "gene.id"),
all = as.data.frame(genes.GR$gene_id[genes.GR$gene_biotype == "protein_coding"]) %>% set_colnames(., "gene.id"))

# add columns to determine number of genes with CGAGCAAG.GR or GUCUUGCUVY.gr
khd4vcntrl = lapply(khd4vcntrl, function(i){
  # add gucuugcuvy column and look for matching geneID
  i$UAGGCUUUGG = lookup(i$gene.id,UAGGCUUUGG.gr$gene.id,UAGGCUUUGG.gr$gene.id)
  return(i)
})

# counts the total number of sites in each dataset
df2= lapply (names(khd4vcntrl), function(i){
  #i is khd4
  hyp  = khd4vcntrl[[i]]
  df2 = table(!is.na(hyp$UAGGCUUUGG)) %>%
                as.data.frame() %>%
                mutate(set = i, motif = "UAGGCUUUGG") %>%
                set_colnames(.,c("with_motif", "no.of.genes","set","motif")) 
 return(df2)}) 
df1_mot2 = do.call(rbind,df2)


pldf = ggplot(df1_mot2, aes(x=set, y= no.of.genes, fill = with_motif))+
  geom_bar(stat="identity", position="fill")+
  scale_fill_manual(values=c("TRUE" = "grey30", "FALSE" = "white"))+
  labs(x="",
       y = "transcripts (%) ")+
  facet_wrap(~motif)+
  myTheme1

pldf

Show the code
pf = lapply(names(hyper.rep), function(i){
  
  #i= khd4
  hyper = hyper.rep[[i]]
  
  # calculate distance to nearest UAGGCUUUGG motif
  dists.GUCUUGCUVY = distanceToNearest(hyper, UAGGCUUUGG.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.GUCUUGCUVY)] == UAGGCUUUGG.gr$gene.id[to(dists.GUCUUGCUVY)]
  table(sel)
  dists.GUCUUGCUVY = subset(dists.GUCUUGCUVY, sel)
  mcols(dists.GUCUUGCUVY)$motif = "UAGGCUUUGG"
  
 
  
  
  # get stats of distances
  summary(mcols(dists.GUCUUGCUVY)$distance)
   
   
  pf = rbind(mcols(dists.GUCUUGCUVY)) %>% as.data.frame()
  pf$set = i
  
  return(pf)
})

pf = do.call(rbind, pf)


pl4 = ggplot(pf, aes(x=distance, fill=set)) +
  geom_histogram(alpha=0.5, binwidth = 100, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500)) +
  scale_fill_manual(values=c(khd4 = khd4.col, ctrl = ada.col)) +
  labs(x="Distance to the neares motif (nt)",
       y = "motif density")+
  coord_cartesian(ylim=c(0.000,0.004),
                  xlim = c(0,2500))+
  theme_bw() +
  facet_wrap(~motif)+myTheme1


pl4

5 Session Info

Show the code
sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=English_Germany.utf8  LC_CTYPE=English_Germany.utf8   
[3] LC_MONETARY=English_Germany.utf8 LC_NUMERIC=C                    
[5] LC_TIME=English_Germany.utf8    

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] lookup_1.0                        ggpointdensity_0.1.0             
 [3] colorspace_2.1-0                  ggvenn_0.1.10                    
 [5] paletteer_1.6.0                   tidyHeatmap_1.8.1                
 [7] ggseqlogo_0.2                     clusterProfiler_4.10.1           
 [9] beeswarm_0.4.0                    rstatix_0.7.2                    
[11] cowplot_1.1.3                     kableExtra_1.4.0                 
[13] UpSetR_1.4.0                      EnrichedHeatmap_1.32.0           
[15] circlize_0.4.16                   ggplotify_0.1.2                  
[17] ggh4x_0.2.8                       ggsci_3.2.0                      
[19] gprofiler2_0.2.3                  reshape2_1.4.4                   
[21] viridis_0.6.5                     viridisLite_0.4.2                
[23] ggrastr_1.0.2                     ggforce_0.4.2                    
[25] ggpubr_0.6.0                      ggrepel_0.9.5                    
[27] gginnards_0.2.0                   ggpp_0.5.7                       
[29] gridExtra_2.3                     magrittr_2.0.3                   
[31] magick_2.8.3                      lubridate_1.9.3                  
[33] stringr_1.5.1                     purrr_1.0.2                      
[35] readr_2.1.5                       tibble_3.2.1                     
[37] tidyverse_2.0.0                   dplyr_1.1.4                      
[39] tidyr_1.3.1                       forcats_1.0.0                    
[41] GenomicFeatures_1.54.4            AnnotationDbi_1.64.1             
[43] Biobase_2.62.0                    ComplexHeatmap_2.18.0            
[45] BindingSiteFinder_2.0.0           ggplot2_3.5.1                    
[47] BSgenome.Umaydis.ENSMBL.UM1_2.0.0 BSgenome_1.70.2                  
[49] rtracklayer_1.62.0                BiocIO_1.12.0                    
[51] Biostrings_2.70.1                 XVector_0.42.0                   
[53] GenomicRanges_1.54.1              GenomeInfoDb_1.38.8              
[55] IRanges_2.36.0                    S4Vectors_0.40.2                 
[57] BiocGenerics_0.48.1              

loaded via a namespace (and not attached):
  [1] fs_1.6.3                    matrixStats_1.2.0          
  [3] bitops_1.0-7                enrichplot_1.22.0          
  [5] HDO.db_0.99.1               httr_1.4.7                 
  [7] RColorBrewer_1.1-3          doParallel_1.0.17          
  [9] tools_4.3.1                 backports_1.5.0            
 [11] utf8_1.2.4                  R6_2.5.1                   
 [13] lazyeval_0.2.2              ggdist_3.3.2               
 [15] GetoptLong_1.0.5            withr_3.0.0                
 [17] prettyunits_1.2.0           cli_3.6.1                  
 [19] scatterpie_0.2.3            labeling_0.4.3             
 [21] Rsamtools_2.18.0            systemfonts_1.1.0          
 [23] yulab.utils_0.1.4           gson_0.1.0                 
 [25] DOSE_3.28.2                 svglite_2.1.3              
 [27] rstudioapi_0.16.0           RSQLite_2.3.7              
 [29] generics_0.1.3              gridGraphics_0.5-1         
 [31] shape_1.4.6.1               crosstalk_1.2.1            
 [33] car_3.1-2                   distributional_0.4.0       
 [35] dendextend_1.17.1           GO.db_3.18.0               
 [37] Matrix_1.6-4                ggbeeswarm_0.7.2           
 [39] fansi_1.0.6                 abind_1.4-5                
 [41] lifecycle_1.0.4             yaml_2.3.7                 
 [43] carData_3.0-5               SummarizedExperiment_1.32.0
 [45] qvalue_2.34.0               SparseArray_1.2.3          
 [47] BiocFileCache_2.10.2        blob_1.2.4                 
 [49] promises_1.3.0              crayon_1.5.3               
 [51] lattice_0.22-5              KEGGREST_1.42.0            
 [53] pillar_1.9.0                knitr_1.47                 
 [55] fgsea_1.28.0                rjson_0.2.21               
 [57] codetools_0.2-20            fastmatch_1.1-4            
 [59] glue_1.6.2                  ggfun_0.1.5                
 [61] data.table_1.15.4           vctrs_0.6.5                
 [63] png_0.1-8                   treeio_1.26.0              
 [65] gtable_0.3.5                rematch2_2.1.2             
 [67] cachem_1.0.8                xfun_0.45                  
 [69] mime_0.12                   S4Arrays_1.2.0             
 [71] tidygraph_1.3.1             iterators_1.0.14           
 [73] nlme_3.1-165                ggtree_3.10.1              
 [75] bit64_4.0.5                 progress_1.2.3             
 [77] filelock_1.0.3              vipor_0.4.7                
 [79] DBI_1.2.3                   DESeq2_1.42.1              
 [81] tidyselect_1.2.1            bit_4.0.5                  
 [83] compiler_4.3.1              curl_5.2.1                 
 [85] xml2_1.3.6                  DelayedArray_0.28.0        
 [87] plotly_4.10.4               shadowtext_0.1.3           
 [89] scales_1.3.0                rappdirs_0.3.3             
 [91] digest_0.6.33               rmarkdown_2.27             
 [93] htmltools_0.5.8.1           pkgconfig_2.0.3            
 [95] MatrixGenerics_1.14.0       dbplyr_2.5.0               
 [97] fastmap_1.1.1               rlang_1.1.1                
 [99] GlobalOptions_0.1.2         htmlwidgets_1.6.4          
[101] shiny_1.8.1.1               farver_2.1.2               
[103] jsonlite_1.8.8              BiocParallel_1.36.0        
[105] GOSemSim_2.28.1             RCurl_1.98-1.13            
[107] polynom_1.4-1               GenomeInfoDbData_1.2.11    
[109] patchwork_1.2.0             munsell_0.5.1              
[111] Rcpp_1.0.12                 ape_5.8                    
[113] stringi_1.8.3               ggraph_2.2.1               
[115] zlibbioc_1.48.0             MASS_7.3-60                
[117] plyr_1.8.9                  parallel_4.3.1             
[119] graphlayouts_1.1.1          splines_4.3.1              
[121] hms_1.1.3                   locfit_1.5-9.10            
[123] igraph_2.0.3                ggsignif_0.6.4             
[125] biomaRt_2.58.2              XML_3.99-0.16.1            
[127] evaluate_0.24.0             httpuv_1.6.15              
[129] tzdb_0.4.0                  foreach_1.5.2              
[131] tweenr_2.0.3                polyclip_1.10-6            
[133] clue_0.3-65                 xtable_1.8-4               
[135] broom_1.0.6                 restfulr_0.0.15            
[137] tidytree_0.4.6              later_1.3.2                
[139] aplot_0.2.3                 memoise_2.0.1              
[141] GenomicAlignments_1.38.0    cluster_2.1.6              
[143] timechange_0.3.0