Khd4-Ada-Gfp editing sites are proximal to AUACCC motif

Author

Srimeenakshi Sankaranarayanan

Published

August 4, 2024

Show the code
# library
library(BSgenome.Umaydis.ENSMBL.UM1) # forged ustilago genome
library(ggplot2)
library(BindingSiteFinder)
library(rtracklayer)
library(ComplexHeatmap)
library(GenomicFeatures)
library(forcats)
library(tidyr)
library(dplyr)
library(tidyverse)
library(GenomicRanges)
library(magick)
library(magrittr)
library(gridExtra)
library(IRanges)
library(Biostrings)
library(ggpp)
library(gginnards)
library(ggrepel)
library(ggpubr)
library(ggforce)
library(ggrastr)
library(viridis)
library(reshape2)
library(gprofiler2)
library(ggsci)
library(ggh4x)
library(ggplotify)
library(gridExtra)
library(circlize)
library(EnrichedHeatmap)
library(UpSetR)
library(kableExtra)
library(cowplot)
library(rstatix)
library(beeswarm)
library(clusterProfiler)
library(ggseqlogo)
library(tidyHeatmap)
library(paletteer)
library(ggvenn)
library(colorspace)
library(ggpointdensity)
library(lookup)
library(rstatix)
library(ggrepel)
Show the code
load("C:/Users/Sri/Documents/Khd4/DGE_analysis_SM_PhD/Kathis_lab/hyperTRIBE/hyperTRIBE.rds")

Identifying the positions of RNA sequence to be tested across the genome.

Show the code
################################################################################
#AUACCC
################################################################################

# get gene sequences
gene.seqs = getSeq(Umaydis, genes.GR)

# find AUACCC motifs
motif.auaccc = DNAString("ATACCC")

# find all AUACCC motifs in the genome
auaccc.gr = vmatchPattern(motif.auaccc, Umaydis)

# count motifs in genes
genes.GR$AUACCC = countOverlaps(genes.GR, auaccc.gr)
#table(genes.GR$AUACCC)

# assign motifs to genes
ov = findOverlaps(auaccc.gr, genes.GR)

# check whether motifs overlap with more than one gene
#table(duplicated(from(ov)))

# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
auaccc.gr = auaccc.gr[from(ov)]
auaccc.gr$gene.id = genes.GR$gene_id[to(ov)]

# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(auaccc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)


################################################################################
#AGAUCU
################################################################################

# find AGAUCU motifs
motif.ctrl = DNAString("AGATCT")
agaucu.gr = vmatchPattern(motif.ctrl, Umaydis)
genes.GR$AGAUCU = countOverlaps(genes.GR, agaucu.gr)
#table(genes.GR$AGAUCU)

# assign motifs to genes
ov = findOverlaps(agaucu.gr, genes.GR)

# check whether motifs overlap with more than one gene
#table(duplicated(from(ov)))

# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
agaucu.gr = agaucu.gr[from(ov)]
agaucu.gr$gene.id = genes.GR$gene_id[to(ov)]

# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(agaucu.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)


################################################################################
#GGGTAT
################################################################################

# find GGGTAT motifs
motif.ctrl = DNAString("GGGTAT")
gggtat.gr = vmatchPattern(motif.ctrl, Umaydis)
genes.GR$GGGTAT = countOverlaps(genes.GR, gggtat.gr)
#table(genes.GR$GGGTAT)

# assign motifs to genes
ov = findOverlaps(gggtat.gr, genes.GR)

# check whether motifs overlap with more than one gene
#table(duplicated(from(ov)))

# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
gggtat.gr = gggtat.gr[from(ov)]
gggtat.gr$gene.id = genes.GR$gene_id[to(ov)]

# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(gggtat.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)


################################################################################
#ACACUC
################################################################################

# find AGAUCU motifs
motif.ctrl = DNAString("ACACTC")
acacuc.gr = vmatchPattern(motif.ctrl, Umaydis)
genes.GR$ACACUC = countOverlaps(genes.GR, acacuc.gr)
#table(genes.GR$ACACUC)

# assign motifs to genes
ov = findOverlaps(acacuc.gr, genes.GR)

# check whether motifs overlap with more than one gene
#table(duplicated(from(ov)))

# keep only motifs in genes & duplicate motifs that overlap with 2 genes, so they count for both genes
acacuc.gr = acacuc.gr[from(ov)]
acacuc.gr$gene.id = genes.GR$gene_id[to(ov)]

# count motifs in transcript regions
#findOverlaps(hits, tx.regions) %>% to(.) %>% table(.)
#findOverlaps(acacuc.gr, tx.regions, select="first") %>% names(tx.regions)[.] %>% table(.)

1 Motif enrichment

Show the code
# check that numbers are same as in Venn
#sapply(khd4.vs.ctrl, length)

names(khd4.vs.ctrl) = c("Khd4-Ada-Gfp-unique", "overlap", "control-Ada unique", "all mRNAs")

# table of AUACCC
tab.auaccc = sapply(khd4.vs.ctrl, function(gene.set){
  motifs = genes.GR$AUACCC[genes.GR$gene_id %in% gene.set]
  tab = table(motifs > 0)
  return(tab)
}) %>% melt(.)
names(tab.auaccc) = c("with.motif", "set", "no.genes")

# table of AGAUCU
tab.agaucu = sapply(khd4.vs.ctrl, function(gene.set){
  motifs = genes.GR$AGAUCU[genes.GR$gene_id %in% gene.set]
  tab = table(motifs > 0)
  return(tab)
}) %>% melt(.)
names(tab.agaucu) = c("with.motif", "set", "no.genes")

# table of GGGTAT
tab.gggtat = sapply(khd4.vs.ctrl, function(gene.set){
  motifs = genes.GR$GGGTAT[genes.GR$gene_id %in% gene.set]
  tab = table(motifs > 0)
  return(tab)
}) %>% melt(.)
names(tab.gggtat) = c("with.motif", "set", "no.genes")

# table of CCCAUA
tab.acacuc = sapply(khd4.vs.ctrl, function(gene.set){
  motifs = genes.GR$ACACUC[genes.GR$gene_id %in% gene.set]
  tab = table(motifs > 0)
  return(tab)
}) %>% melt(.)
names(tab.acacuc) = c("with.motif", "set", "no.genes")


# stacked barchart (scaled to 100%)
pl3 = ggplot(tab.auaccc, aes(x=set, y=no.genes, fill=with.motif)) +
  geom_bar(position="fill", stat="identity") +
  scale_fill_manual(values=c('TRUE' = auaccc.col, 'FALSE' = "lightgrey")) +
  ggtitle("AUACCC")+
  labs(x="",
       y="% with motif")+
  myTheme1+
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))


# stacked barchart (scaled to 100%)
pl4 = ggplot(tab.agaucu, aes(x=set, y=no.genes, fill=with.motif)) +
  geom_bar(position="fill", stat="identity") +
  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
  ggtitle("AGAUCU")+
  labs(x="",
       y="% with motif")+
  myTheme1+
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))

# stacked barchart (scaled to 100%)
pl5 = ggplot(tab.gggtat, aes(x=set, y=no.genes, fill=with.motif)) +
  geom_bar(position="fill", stat="identity") +
  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
  ggtitle("GGGTAT")+
  labs(x="",
       y="% with motif")+
  myTheme1+
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))

# stacked barchart (scaled to 100%)
pl6 = ggplot(tab.acacuc, aes(x=set, y=no.genes, fill=with.motif)) +
  geom_bar(position="fill", stat="identity") +
  scale_fill_manual(values=c('TRUE' = agaucu.col, 'FALSE' = "lightgrey")) +
  ggtitle("ACAUCU")+
  labs(x="",
       y="% with motif")+
  myTheme1+
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))

# FigS7A

pl<-ggarrange(pl3, pl4, pl5, pl6, 
          labels = LETTERS[1:4],
         nrow = 1, common.legend = TRUE)

pl

Fischer’s exact test was performed to calculate the enrichment of motif between datasets from the previous section. Note that I made the dataframe by manually entering the number from tab.motif variable from above.

Show the code
#Pvalue calculation
# isolating the motif number in each set that is with or without motif

################################################################################
#AUACCC motif
################################################################################

khd4.uni=c(157,36)
shared=c(60,42)
ctrl.uni=c(226,682)
all=c(1675,5256)

enrmotif= data.frame(khd4.uni,shared,ctrl.uni,all) # make dataframe

row.names(enrmotif)=c("auaccc", "nonauaccc") # assign rownames

# fischers test
chisq.khd4Mot=enrmotif[,c(1,4)]
chisq.sharedMot=enrmotif[,c(2,4)]
chisq.ctrlMot=enrmotif[,c(3,4)]

#fischers test
fisher.test(chisq.khd4Mot) # khd4unique vs all

    Fisher's Exact Test for Count Data

data:  chisq.khd4Mot
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  9.427969 20.322387
sample estimates:
odds ratio 
  13.67828 
Show the code
fisher.test(chisq.sharedMot) # shared vs all

    Fisher's Exact Test for Count Data

data:  chisq.sharedMot
p-value = 1.529e-13
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 2.958915 6.842948
sample estimates:
odds ratio 
  4.481005 
Show the code
fisher.test(chisq.ctrlMot) # conrol vs all

    Fisher's Exact Test for Count Data

data:  chisq.ctrlMot
p-value = 0.6506
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.8818158 1.2228807
sample estimates:
odds ratio 
  1.039832 
Show the code
################################################################################
#AGAUCU
################################################################################

mkhd4.uni=c(102,91)
mshared=c(50,52)
mctrl.uni=c(438,470)
mall=c(2931,4000)

menrmotif= data.frame(mkhd4.uni,mshared,mctrl.uni,mall) # make dataframe

row.names(menrmotif)=c("agaucu", "nonagaucu") # assign rownames

# fischers test
chisq.mkhd4Mot=menrmotif[,c(1,4)]
chisq.msharedMot=menrmotif[,c(2,4)]
chisq.mctrlMot=menrmotif[,c(3,4)]


#fischers test
fisher.test(chisq.mkhd4Mot) # khd4unique vs all

    Fisher's Exact Test for Count Data

data:  chisq.mkhd4Mot
p-value = 0.003924
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.136806 2.060719
sample estimates:
odds ratio 
  1.529565 
Show the code
fisher.test(chisq.msharedMot) # shared vs all

    Fisher's Exact Test for Count Data

data:  chisq.msharedMot
p-value = 0.1896
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.869246 1.979156
sample estimates:
odds ratio 
  1.312167 
Show the code
fisher.test(chisq.mctrlMot) # conrol vs all

    Fisher's Exact Test for Count Data

data:  chisq.mctrlMot
p-value = 0.0007047
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.104319 1.464441
sample estimates:
odds ratio 
  1.271753 
Show the code
################################################################################
#GGGUAU motif
################################################################################

m2khd4.uni=c(47,146)
m2shared=c(31,71)
m2ctrl.uni=c(232,676)
m2all=c(1496,5435)

m2enrmotif= data.frame(m2khd4.uni,m2shared,m2ctrl.uni,m2all) # make dataframe

row.names(m2enrmotif)=c("cccaua", "nonagaucu") # assign rownames

# fischers test
chisq.m2khd4Mot=m2enrmotif[,c(1,4)]
chisq.m2sharedMot=m2enrmotif[,c(2,4)]
chisq.m2ctrlMot=m2enrmotif[,c(3,4)]
chisq.m2ctrlMot_unimot=m2enrmotif[,c(1,3)]


#fischers test
fisher.test(chisq.m2khd4Mot) # khd4unique vs all

    Fisher's Exact Test for Count Data

data:  chisq.m2khd4Mot
p-value = 0.3754
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.8191105 1.6439441
sample estimates:
odds ratio 
  1.169544 
Show the code
fisher.test(chisq.m2sharedMot) # shared vs all

    Fisher's Exact Test for Count Data

data:  chisq.m2sharedMot
p-value = 0.03911
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.001058 2.461645
sample estimates:
odds ratio 
   1.58613 
Show the code
fisher.test(chisq.m2ctrlMot) # conrol vs all

    Fisher's Exact Test for Count Data

data:  chisq.m2ctrlMot
p-value = 0.007319
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.057883 1.465752
sample estimates:
odds ratio 
  1.246811 
Show the code
fisher.test(chisq.m2ctrlMot_unimot) # khd4unique vs ctrl

    Fisher's Exact Test for Count Data

data:  chisq.m2ctrlMot_unimot
p-value = 0.7848
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.6387363 1.3598368
sample estimates:
odds ratio 
 0.9380518 
Show the code
################################################################################
#ACACUC motif
################################################################################

m3khd4.uni=c(115,78)
m3shared=c(49,53)
m3ctrl.uni=c(521,387)
m3all=c(3482,3449)

m3enrmotif= data.frame(m3khd4.uni,m3shared,m3ctrl.uni,m3all) # make dataframe

row.names(m3enrmotif)=c("cccaua", "nonagaucu") # assign rownames

# fischers test
chisq.m3khd4Mot=m3enrmotif[,c(1,4)]
chisq.m3sharedMot=m3enrmotif[,c(2,4)]
chisq.m3ctrlMot=m3enrmotif[,c(3,4)]
chisq.m3ctrlMot_unimot=m3enrmotif[,c(1,3)]


#fischers test
fisher.test(chisq.m3khd4Mot) # khd4unique vs all

    Fisher's Exact Test for Count Data

data:  chisq.m3khd4Mot
p-value = 0.01063
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.081501 1.980467
sample estimates:
odds ratio 
  1.460302 
Show the code
fisher.test(chisq.m3sharedMot) # shared vs all

    Fisher's Exact Test for Count Data

data:  chisq.m3sharedMot
p-value = 0.6906
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.6062636 1.3809297
sample estimates:
odds ratio 
 0.9157924 
Show the code
fisher.test(chisq.m3ctrlMot) # conrol vs all

    Fisher's Exact Test for Count Data

data:  chisq.m3ctrlMot
p-value = 5.612e-05
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.157002 1.537709
sample estimates:
odds ratio 
  1.333439 
Show the code
fisher.test(chisq.m3ctrlMot_unimot) # khd4unique vs ctrl

    Fisher's Exact Test for Count Data

data:  chisq.m3ctrlMot_unimot
p-value = 0.6303
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7894775 1.5244677
sample estimates:
odds ratio 
  1.095037 

2 Distance from editing sites to motifs

Only editing sites on mRNAs specific to each dataset were considered

Show the code
names(hyper.rep) = c("Khd4_Ada_Gfp", "control_Ada")


pf = lapply(names(hyper.rep), function(i){
  #i = "khd4"
  hyper = hyper.rep[[i]]
  head(hyper)
  
  # calculate distance to nearest AUACCC motif
  dists.auacc = distanceToNearest(hyper, auaccc.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.auacc)] == auaccc.gr$gene.id[to(dists.auacc)]
  table(sel)
  dists.auacc= subset(dists.auacc, sel)
  mcols(dists.auacc)$motif = "AUACCC"
  
  # calculate distance to nearest AGAUCU motif
  dists.agaucu = distanceToNearest(hyper, agaucu.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.agaucu)] == agaucu.gr$gene.id[to(dists.agaucu)]
  table(sel)
  dists.agaucu= subset(dists.agaucu, sel)
  mcols(dists.agaucu)$motif = "AGAUCU"
  
  # calculate distance to nearest CCCAUA motif
  dists.gggtat = distanceToNearest(hyper, gggtat.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.gggtat)] == gggtat.gr$gene.id[to(dists.gggtat)]
  table(sel)
  dists.gggtat = subset(dists.gggtat, sel)
  mcols(dists.gggtat)$motif = "GGGTAT"
  
  # calculate distance to nearest ACACUC motif
  dists.acacuc = distanceToNearest(hyper, acacuc.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.acacuc)] == acacuc.gr$gene.id[to(dists.acacuc)]
  table(sel)
  dists.acacuc = subset(dists.acacuc, sel)
  mcols(dists.acacuc)$motif = "ACACUC"
  
  # get stats of distances
  summary(mcols(dists.auacc)$distance)
  summary(mcols(dists.agaucu)$distance)
  summary(mcols(dists.gggtat)$distance)
  summary(mcols(dists.acacuc)$distance)
  
  pf = rbind(mcols(dists.auacc), mcols(dists.agaucu), mcols(dists.gggtat), mcols(dists.acacuc)) %>% as.data.frame()
  pf$set = i
  
  return(pf)
})

pf = do.call(rbind, pf)

# Histogram
pl4 = ggplot(pf, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500)) +
  scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) +
  theme_bw() +
  facet_wrap(~factor(motif, levels=c("AUACCC", "AGAUCU", "GGGTAT", "ACACUC")))+myTheme1
pl4

Show the code
################################################################################
# De novo discoevered motif on Khd4-Ada-Gfp edited transcripts
################################################################################

# motif logo with u in it
# Motif-1 TP% = 60.9   ENR-Ratio = 9.40 HMHHKAUACCC
A= c(0.220238,0.380952,0.369048,0.392857,0.041667,1000000,0.000000,1000000,0.000000,0.000000,0.059524
)
C= c(0.440476,0.321429,0.351190,0.261905,0.107143,0.000000,0.000000,0.000000,0.976190,1000000,0.940476
)
G = c(0.107143,0.113095,0.029762,0.119048,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
)
U = c(0.232143,0.184524,0.250000,0.226190,0.517857,0.000000,1000000,0.000000,0.023810,0.000000,0.000000
)
 
pwDF1bU = rbind(A,C,G,U)


# MoUif-2 UP% = 59.3   ENR-RaUio = 9.60 HMKAUACCCVH

A= c(0.366890,0.432382,0.026357,0.999729,0.000082,0.999729,0.019779,0.000082,0.050456,0.234389,0.213404
)
C= c( 0.304485,0.245517,0.122745,0.000103,0.000103,0.000103,0.927513,0.999751,0.949376,0.374600,0.306852
)
G = c(0.032919,0.116138,0.316355,0.000090,0.006656,0.000090,0.006656,0.000090,0.000090,0.211350,0.159894
)
U = c(0.295706,0.205963,0.534542,0.000078,0.993160,0.000078,0.046052,0.000078,0.000078,0.179661,0.319849
)
 
pwDF2bU = rbind(A,C,G,U)


# MoUif-4 UP% = 65.8  ENR-RaUio = 1.46 UHCCUCUYCHCCMUC 
A= c(0.197917,0.229167,0.000000,0.000000,0.072917,0.000000,0.052083,0.000000,0.020833,0.416667,0.000000,0.145833,0.458333,0.000000,0.000000)
C= c( 0.208333,0.427083,0.770833,0.906250,0.270833,0.739583,0.145833,0.333333,0.979167,0.260417,0.677083,0.697917,0.354167,0.312500,0.979167)
G = c(0.125000,0.000000,0.062500,0.000000,0.093750,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.156250,0.114583,0.000000,0.020833)
U = c(0.468750,0.343750,0.166667,0.093750,0.562500,0.260417,0.739583,0.666667,0.000000,0.322917,0.322917,0.000000,0.072917,0.687500,0.000000)
 
pwDF4bU = rbind(A,C,G,U)





m1 = ggseqlogo(pwDF1bU, col_scheme="nucleotide", font="helvetica_bold")

m2 = ggseqlogo(pwDF2bU, col_scheme="nucleotide", font="helvetica_bold")

m4 = ggseqlogo(pwDF4bU, col_scheme="nucleotide", font="helvetica_bold")

pl = ggarrange(m1,m2,m4,
               labels = LETTERS[1:3],
          nrow = 1)

pl = annotate_figure(pl, top = text_grob("de novo discovered motif on Khd4-Ada-Gfp edited transcripts", color = "red", face = "bold", size = 14))

pl

3 Using motif position weight matrix to look for their occurence in the genome

As a next step, we will check the distribution of the identified motif. For this purpose, i will use matchPWM() function for motif occurence. First I will use AUACCC to see if the function can indeed work!

Show the code
# function to find motif occurence
# define a function for identifying the motif occurence
getmotgr=function(seq, pwm, gr_plus, gr_minus){
  
# motif in sense strand
matches_list_strand = lapply(seq, matchPWM, pwm=pwm)
# apply to all 
test = lapply(names(matches_list_strand), function (i){
  uc = matches_list_strand[[i]]
  df= as.data.frame(uc) 
  df$chr = i # make a column specifying chromosome name
  df = cbind.DataFrame(df)
  return(df)
})
# rbind the resulting df
df1 = do.call(rbind, test) %>% as(., "GRanges")
# find overlaps
ov = findOverlaps(df1, gr_plus, ignore.strand=FALSE)
df1 = df1[from(ov)]
df1$gene.id = gr_plus$gene_id[to(ov)]
strand(df1) =strand(gr_plus)[to(ov)]

# motif in antisense strand
matches_list_antistrand = lapply(seq, matchPWM, pwm = reverseComplement(pwm))
# apply to all 
test1 = lapply(names(matches_list_antistrand), function (i){
  uc1 = matches_list_antistrand[[i]]
  df1= as.data.frame(uc1) 
  df1$chr = i
  df1 = cbind.DataFrame(df1)
  return(df1)
})
# rbind the resulting df
df2 = do.call(rbind, test1) %>% as(., "GRanges")
# find overlaps
ov2 = findOverlaps(df2, gr_minus, ignore.strand=FALSE)
df2= df2[from(ov2)]
df2$gene.id = gr_minus$gene_id[to(ov2)]
strand(df2) =strand(gr_minus)[to(ov2)]
df3 = c(df1,df2) %>% as(., "GRanges")
return(df3)
}
Show the code
# get sequneces of chromosome
uma.seq = getSeq(Umaydis) 

# isolate + and - strand separtely
# + strand
# strand specific data of GTF file
# plus strands
gns_plus = genes.GR[strand(genes.GR) == "+"]

#minus strand
gns_minus = genes.GR[strand(genes.GR) == "-"]
Show the code
# convert RNA string to DNA string
rownames(pwDF1bU) = c("A", "C", "G", "T")
rownames(pwDF4bU) = c("A", "C", "G", "T")
Show the code
# remove the um_scaf_contigs because they causes problem downstream
uma.seq = uma.seq[-c(24,25,26,27)]

#HMHHKAUACCC.gr 
HMHHKAUACCC.gr = getmotgr(uma.seq, pwm=pwDF1bU, gns_plus, gns_minus)

#UHCCUCUYCHCCMUC.gr
UHCCUCUYCHCCMUC.gr = getmotgr(uma.seq, pwm=pwDF4bU, gns_plus, gns_minus)

Here I am going to find the distribution of the most enriched HMHHKAUACCC and frequently occurring UHCCUCUYCHCCMUC motifs from Khd4-Ada-Gfp editing sites.

Show the code
pf = lapply(names(hyper.rep), function(i){
  #i = "khd4"
  hyper = hyper.rep[[i]]
  head(hyper)
  
  # calculate distance to nearest AUACCC motif
  dists.HMHHKAUACCC = distanceToNearest(hyper, HMHHKAUACCC.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.HMHHKAUACCC)] == HMHHKAUACCC.gr$gene.id[to(dists.HMHHKAUACCC)]
  table(sel)
  dists.HMHHKAUACCC= subset(dists.HMHHKAUACCC, sel)
  mcols(dists.HMHHKAUACCC)$motif = "HMHHKAUACCC"
  
  # calculate distance to nearest UHCCUCUYCHCCMUC.gr motif
  dists.UHCCUCUYCHCCMUC = distanceToNearest(hyper, UHCCUCUYCHCCMUC.gr)
  
  # keep only pairs on same genes
  sel = hyper$gene.id[from(dists.UHCCUCUYCHCCMUC)] == UHCCUCUYCHCCMUC.gr$gene.id[to(dists.UHCCUCUYCHCCMUC)]
  table(sel)
  dists.agcagcagc = subset(dists.UHCCUCUYCHCCMUC, sel)
  mcols(dists.UHCCUCUYCHCCMUC)$motif = "UHCCUCUYCHCCMUC"
  
  
  # get stats of distances
  summary(mcols(dists.HMHHKAUACCC)$distance)
  summary(mcols(dists.UHCCUCUYCHCCMUC)$distance)
  
   
  pf = rbind(mcols(dists.HMHHKAUACCC),mcols(dists.UHCCUCUYCHCCMUC)) %>% as.data.frame()
  pf$set = i
  
  return(pf)
})

pf = do.call(rbind, pf)


pl4 = ggplot(pf, aes(x=distance, fill=set)) +
  geom_histogram(binwidth = 150, aes(y = ..density..), position="dodge") +
  coord_cartesian(xlim=c(0, 2500), ylim=c(0,0.004)) +
  scale_fill_manual(values=c(Khd4_Ada_Gfp = "dodgerblue4", control_Ada = "grey")) +
  labs(x="Distance to the neares motif (nt)",
       y = "motif density")+
  theme_bw() +
  facet_wrap(~motif)+myTheme1


ggarrange(pl4,  
          labels = LETTERS[1],
          ncol = 1, nrow = 1)

Show the code
sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=English_Germany.utf8  LC_CTYPE=English_Germany.utf8   
[3] LC_MONETARY=English_Germany.utf8 LC_NUMERIC=C                    
[5] LC_TIME=English_Germany.utf8    

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] lookup_1.0                        ggpointdensity_0.1.0             
 [3] colorspace_2.1-0                  ggvenn_0.1.10                    
 [5] paletteer_1.6.0                   tidyHeatmap_1.8.1                
 [7] ggseqlogo_0.2                     clusterProfiler_4.10.1           
 [9] beeswarm_0.4.0                    rstatix_0.7.2                    
[11] cowplot_1.1.3                     kableExtra_1.4.0                 
[13] UpSetR_1.4.0                      EnrichedHeatmap_1.32.0           
[15] circlize_0.4.16                   ggplotify_0.1.2                  
[17] ggh4x_0.2.8                       ggsci_3.2.0                      
[19] gprofiler2_0.2.3                  reshape2_1.4.4                   
[21] viridis_0.6.5                     viridisLite_0.4.2                
[23] ggrastr_1.0.2                     ggforce_0.4.2                    
[25] ggpubr_0.6.0                      ggrepel_0.9.5                    
[27] gginnards_0.2.0                   ggpp_0.5.7                       
[29] gridExtra_2.3                     magrittr_2.0.3                   
[31] magick_2.8.3                      lubridate_1.9.3                  
[33] stringr_1.5.1                     purrr_1.0.2                      
[35] readr_2.1.5                       tibble_3.2.1                     
[37] tidyverse_2.0.0                   dplyr_1.1.4                      
[39] tidyr_1.3.1                       forcats_1.0.0                    
[41] GenomicFeatures_1.54.4            AnnotationDbi_1.64.1             
[43] Biobase_2.62.0                    ComplexHeatmap_2.18.0            
[45] BindingSiteFinder_2.0.0           ggplot2_3.5.1                    
[47] BSgenome.Umaydis.ENSMBL.UM1_2.0.0 BSgenome_1.70.2                  
[49] rtracklayer_1.62.0                BiocIO_1.12.0                    
[51] Biostrings_2.70.1                 XVector_0.42.0                   
[53] GenomicRanges_1.54.1              GenomeInfoDb_1.38.8              
[55] IRanges_2.36.0                    S4Vectors_0.40.2                 
[57] BiocGenerics_0.48.1              

loaded via a namespace (and not attached):
  [1] fs_1.6.3                    matrixStats_1.2.0          
  [3] bitops_1.0-7                enrichplot_1.22.0          
  [5] HDO.db_0.99.1               httr_1.4.7                 
  [7] RColorBrewer_1.1-3          doParallel_1.0.17          
  [9] tools_4.3.1                 backports_1.5.0            
 [11] utf8_1.2.4                  R6_2.5.1                   
 [13] lazyeval_0.2.2              ggdist_3.3.2               
 [15] GetoptLong_1.0.5            withr_3.0.0                
 [17] prettyunits_1.2.0           cli_3.6.1                  
 [19] scatterpie_0.2.3            labeling_0.4.3             
 [21] Rsamtools_2.18.0            systemfonts_1.1.0          
 [23] yulab.utils_0.1.4           gson_0.1.0                 
 [25] DOSE_3.28.2                 svglite_2.1.3              
 [27] rstudioapi_0.16.0           RSQLite_2.3.7              
 [29] generics_0.1.3              gridGraphics_0.5-1         
 [31] shape_1.4.6.1               crosstalk_1.2.1            
 [33] car_3.1-2                   distributional_0.4.0       
 [35] dendextend_1.17.1           GO.db_3.18.0               
 [37] Matrix_1.6-4                ggbeeswarm_0.7.2           
 [39] fansi_1.0.6                 abind_1.4-5                
 [41] lifecycle_1.0.4             yaml_2.3.7                 
 [43] carData_3.0-5               SummarizedExperiment_1.32.0
 [45] qvalue_2.34.0               SparseArray_1.2.3          
 [47] BiocFileCache_2.10.2        blob_1.2.4                 
 [49] promises_1.3.0              crayon_1.5.3               
 [51] lattice_0.22-5              KEGGREST_1.42.0            
 [53] pillar_1.9.0                knitr_1.47                 
 [55] fgsea_1.28.0                rjson_0.2.21               
 [57] codetools_0.2-20            fastmatch_1.1-4            
 [59] glue_1.6.2                  ggfun_0.1.5                
 [61] data.table_1.15.4           vctrs_0.6.5                
 [63] png_0.1-8                   treeio_1.26.0              
 [65] gtable_0.3.5                rematch2_2.1.2             
 [67] cachem_1.0.8                xfun_0.45                  
 [69] mime_0.12                   S4Arrays_1.2.0             
 [71] tidygraph_1.3.1             iterators_1.0.14           
 [73] nlme_3.1-165                ggtree_3.10.1              
 [75] bit64_4.0.5                 progress_1.2.3             
 [77] filelock_1.0.3              vipor_0.4.7                
 [79] DBI_1.2.3                   DESeq2_1.42.1              
 [81] tidyselect_1.2.1            bit_4.0.5                  
 [83] compiler_4.3.1              curl_5.2.1                 
 [85] xml2_1.3.6                  DelayedArray_0.28.0        
 [87] plotly_4.10.4               shadowtext_0.1.3           
 [89] scales_1.3.0                rappdirs_0.3.3             
 [91] digest_0.6.33               rmarkdown_2.27             
 [93] htmltools_0.5.8.1           pkgconfig_2.0.3            
 [95] MatrixGenerics_1.14.0       dbplyr_2.5.0               
 [97] fastmap_1.1.1               rlang_1.1.1                
 [99] GlobalOptions_0.1.2         htmlwidgets_1.6.4          
[101] shiny_1.8.1.1               farver_2.1.2               
[103] jsonlite_1.8.8              BiocParallel_1.36.0        
[105] GOSemSim_2.28.1             RCurl_1.98-1.13            
[107] polynom_1.4-1               GenomeInfoDbData_1.2.11    
[109] patchwork_1.2.0             munsell_0.5.1              
[111] Rcpp_1.0.12                 ape_5.8                    
[113] stringi_1.8.3               ggraph_2.2.1               
[115] zlibbioc_1.48.0             MASS_7.3-60                
[117] plyr_1.8.9                  parallel_4.3.1             
[119] graphlayouts_1.1.1          splines_4.3.1              
[121] hms_1.1.3                   locfit_1.5-9.10            
[123] igraph_2.0.3                ggsignif_0.6.4             
[125] biomaRt_2.58.2              XML_3.99-0.16.1            
[127] evaluate_0.24.0             httpuv_1.6.15              
[129] tzdb_0.4.0                  foreach_1.5.2              
[131] tweenr_2.0.3                polyclip_1.10-6            
[133] clue_0.3-65                 xtable_1.8-4               
[135] broom_1.0.6                 restfulr_0.0.15            
[137] tidytree_0.4.6              later_1.3.2                
[139] aplot_0.2.3                 memoise_2.0.1              
[141] GenomicAlignments_1.38.0    cluster_2.1.6              
[143] timechange_0.3.0