Single cell age in fly development
----------------------------------
In this tutorial, we will show you a slightly more complex application of EpiTrace on infering single cell age during fly development. This data is comprised of some 200k+ single cells, collected at defined time after egg laying, and sequenced by scATAC. The data is described in :cite:p:`Calderon2020`.
Remember from the bulk ATAC-seq demo that, the input data for EpiTrace are:
- a count matrix where rows are genomic loci, and columns are cells, and elements are ATAC-seq read numbers.
- a data frame or GRanges object corresponds to the genomic loci (we call it "peak sets")
Also, to run EpiTrace, you also need to choose a reference clock-like loci for age estimation. However, there is no active CpG methylation in *Drosophila* genome, prohibiting determination of putative clock-like differential methylated loci as we did in human.
Here we will show you how to perform single-cell age estimation from fly scATAC data, by leveraging high-level orthology relationship to map human clock-like regions to fly genome.
You need to download the fly downsampled scATAC data ("fly.all.downsampled_seurat_filtered_processed.rds") from `the WASHU website `_., and the human-fly orthology table ("dmel_human_orthologs_disease_fb_2022_05.tsv.gz") from `the flybase website `_, in order to run this demo. We assume you have already put these files in a ``fly_scATAC`` dir.
Step 1. Initialize environment
''''''''''''''''''''''''''''''
Load EpiTrace as::
library(dplyr)
library(tidyr)
library(readr)
library(GenomicRanges)
library(reshape2)
library(openxlsx)
library(ggplot2)
library(Matrix)
library(EpiTrace)
library(Seurat)
library(SeuratObject)
library(ggtree)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg19)
library(BSgenome.Dmelanogaster.UCSC.dm6)
library(patchwork)
library(ArchR)
library(parallel)
library(ChIPseeker)
library(TxDb.Dmelanogaster.UCSC.dm6.ensGene)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(org.Dm.eg.db)
Step 2. Load data
'''''''''''''''''
We load the human clock-like DML using::
data("clock_gr_list")
We then load the fly data (assume you put them in a ``fly_scATAC`` dir)::
setwd('fly_scATAC') # change this to your dir
datadir = 'fly_scATAC'
fly_dat <- readRDS('fly.all.downsampled_seurat_filtered_processed.rds')
We now load the fly-to-human orthology mapping data (assume you put them in the same `fly_scATAC` dir)::
dmel_mapped_to_human <- readr::read_tsv('dmel_human_orthologs_disease_fb_2022_05.tsv.gz',comment = "## ")
colnames(dmel_mapped_to_human) <- gsub('#','',colnames(dmel_mapped_to_human))
Step 3. Define clock-like loci in fly genome
''''''''''''''''''''''''''''''''''''''''''''
We first annotate the genomic location for each clockDML using `ChIPseeker`::
# map human clock in the promoter/TSS region to respective fly homolog
human_clock_hg19 <- plyranges::reduce_ranges(c(clock_gr_list[[1]],clock_gr_list[[2]]))
human_clock_hg19_anno <- ChIPseeker::annotatePeak(human_clock_hg19,TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene,assignGenomicAnnotation = T,annoDb = 'org.Hs.eg.db',tssRegion = c(3000,1000),level='gene')
# extract human annotation
human_clock_hg19_anno_df <- human_clock_hg19_anno@anno %>% as.data.frame
human_clock_hg19_anno_tss_df <- human_clock_hg19_anno_df[abs(human_clock_hg19_anno_df$distanceToTSS)< 100,]
human_clock_hg19_anno_tss_df$SYMBOL -> human_clock_hg19_anno_tss_df$Human_gene_symbol
We then annotate the genomic location for each fly scATAC peak, similarly using `ChIPseeker`::
# annotate fly ranges
fly_dat_ranges <- data.frame(peaks=rownames(fly_dat)) %>% separate(col=1,into=c('chr','start','end'),remove=F,convert=T) %>% makeGRangesFromDataFrame(keep.extra.columns = T)
fly_dat_ranges_anno <- ChIPseeker::annotatePeak(fly_dat_ranges,TxDb=TxDb.Dmelanogaster.UCSC.dm6.ensGene,assignGenomicAnnotation = T,annoDb = 'org.Dm.eg.db',tssRegion = c(3000,1000),level='gene')
# extract fly annotation
fly_dat_ranges_anno_df <- fly_dat_ranges_anno@anno %>% as.data.frame()
We now use the ortholog table to find "mappable" human ClockDML which is close to TSS of human-fly ortholog genes::
# find mappable (have ortholog) human annotation
human_clock_hg19_anno_tss_df <- left_join(human_clock_hg19_anno_tss_df,dmel_mapped_to_human)
human_clock_hg19_anno_tss_mappable_df <- human_clock_hg19_anno_tss_df[!is.na(human_clock_hg19_anno_tss_df$Dmel_gene_ID),]
Finally, we can define putative clock-like regions in fly, using such orthology::
# define putative clockAcc regions for fly
fly_dat_ranges_anno_df$clock_hit <- ifelse(abs(fly_dat_ranges_anno_df$distanceToTSS)<100 & fly_dat_ranges_anno_df$geneId %in% human_clock_hg19_anno_tss_mappable_df$Dmel_gene_ID, 'clock','non_clock')
fly_dat_ranges_putative_clock <- fly_dat_ranges_anno@anno[fly_dat_ranges_anno_df$clock_hit %in% 'clock',]
Step 4. Estimate cell age with fly clock-like loci
''''''''''''''''''''''''''''''''''''''''''''''''''
Now we can perform scATAC age estimation on fly genome, as we did for other data, just use a different reference genomic loci (setting the `clock_gr` option)::
## use the fly dataset to perform convergence analysis
EpiTrace::Init_Peakset(fly_dat_ranges) -> init_gr
EpiTrace::Init_Matrix(peakname = fly_dat_ranges$peaks,cellname = colnames(fly_dat),matrix = (GetAssayData(fly_dat))) -> init_mm
# allPeaks clock on fly
EpiTrace::EpiTraceAge_Convergence(peakSet = init_gr,matrix=init_mm,ref_genome='dm6',clock_gr=fly_dat_ranges_putative_clock,iterative_time = 5,min.cutoff = 0,non_standard_clock = T,qualnum = 10,ncore_lim = 40,mean_error_limit = 0.1) -> epitrace_obj_age_estimated
Step 5. Validating the result with the published sample time and cell age derived from neural network
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Note that we have exported the metadata from the fly scATAC dataset as a rds::
Shendure_atac_meta <- readRDS('Shendure_atac_meta.rds')
meta_compare <- left_join(epitrace_obj_age_estimated@meta.data,Shendure_atac_meta)
cor.test(meta_compare$lasso_age,meta_compare$NNv1_age)
cor.test(meta_compare$EpiTraceAge_iterative,meta_compare$NNv1_age)
meta_compare %>% dplyr::group_by(NNv1_time.new) %>% dplyr::summarise(meanAge=mean(EpiTraceAge_iterative,na.rm=T)) -> per_timepoint_mean_age
cor.test(as.numeric(per_timepoint_mean_age$NNv1_time.new),per_timepoint_mean_age$meanAge)
This reports::
Pearson's product-moment correlation
data: as.numeric(per_timepoint_mean_age$NNv1_time.new) and per_timepoint_mean_age$meanAge
t = 11.104, df = 8, p-value = 3.863e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8706589 0.9928825
sample estimates:
cor
0.9690577
We can also do the correlation between single cell ages::
meta_compare <- na.omit(meta_compare)
cor.test(as.numeric(meta_compare$NNv1_age),meta_compare$EpiTraceAge_iterative)
This reports::
Pearson's product-moment correlation
data: as.numeric(meta_compare$NNv1_age) and meta_compare$EpiTraceAge_iterative)
t = 286.89, df = 203240, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5337807 0.5399695
sample estimates:
cor
0.5368824
We can now test the association between mean EpiTrace age of the sample, vs sample development time::
ggplot(per_timepoint_mean_age,aes(x=NNv1_time.new,y=meanAge,group=1)) + geom_point(pch=21,size=5,aes(fill=NNv1_time.new)) + geom_smooth(method='lm',aes(group=1)) + ggpubr::stat_cor(size=6) + scale_fill_viridis_d() + theme_classic() + theme(text=element_text(size=20),axis.text.x = element_text(angle=45,vjust=1,hjust=1))
The result is summarised in this figure:
.. image:: _static/DMscATAC.svg
:width: 600px
:align: center