Prepare fragments file

bulk_peaks = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed", as = "GRanges")
peak_files = list.files("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", pattern = "^Sample-P2-", full.names = TRUE) 

fragments = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/bed/sc_H3K27ac_merged_bowtie2_rmDup.fragments.bed", as = "GRanges")

getFragmentsOverlapsIDs = function(file_path){
  cell = ChIPseeker::readPeakFile(file_path)
  cell_name = basename(file_path)
  overlap = GenomicRanges:::countOverlaps_GenomicRanges(query = fragments, subject = cell)
  fragments$name = cell_name
  fragments$score = overlap
  return(fragments)
}

results = lapply(peak_files, getFragmentsOverlapsIDs)
results_dfs = lapply(results, data.frame)
results_dfs2 = list()

for(i in 1:length(results_dfs)){
  results_dfs2[[i]] = results_dfs[[i]][results_dfs[[i]]$score > 0,]
}

results_bind = do.call("rbind", results_dfs2)
results_final = GenomicRanges::makeGRangesFromDataFrame(results_bind, keep.extra.columns = TRUE)

data.table::fwrite(data.frame(results_final), "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5.tsv", sep = '\t')
#system(paste("bgzip","/Volumes/la420/home/CUTnTAG/sc_H3K27ac/analysis/fragments.tsv"))
#system(paste("tabix", "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/analysis/fragments.tsv.gz"))

Prepare objects

basenameList = list.files("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", full.names = FALSE, pattern = "\\.stringent.bed$")
basenameList = basenameList[-64]
fpath = file.path("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5_filt_nohead_sort.tsv.gz")

fragments = CreateFragmentObject(
  path = fpath,
  cells = basenameList,
  validate.fragments = FALSE
)


bulk_peaks = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed", as = "GRanges")

peak_matrix = FeatureMatrix(
  fragments = fragments,
  features = bulk_peaks
)

chrom_assay = CreateChromatinAssay(
  counts = peak_matrix,
  genome = "hg19",
  fragments = fragments
)

scCT_H3K27ac = Seurat::CreateSeuratObject(
  counts = chrom_assay,
  assay = "peaks"
)
# extract gene annotations from EnsDb
annotations = GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)

# change to UCSC style since the data was mapped to hg19
seqlevelsStyle(annotations) = "UCSC"
genome(annotations) = "hg19"

# add the gene information to the object
Annotation(scCT_H3K27ac) = annotations

QC

total_fragments = CountFragments(fpath)
order = c(colnames(scCT_H3K27ac))
scCT_H3K27ac$fragments = total_fragments[colnames(scCT_H3K27ac), "frequency_count"]

# above may not work so do:
total_fragments = total_fragments[match(order, total_fragments$CB), ]

for(i in 1:length(colnames(scCT_H3K27ac))){
  scCT_H3K27ac$fragments[[i]] = total_fragments$frequency_count[i]
}


peak_matrix = FeatureMatrix(
  fragments = Fragments(scCT_H3K27ac),
  features = bulk_peaks
)

scCT_H3K27ac = FRiP(
  object = scCT_H3K27ac,
  assay = "peaks",
  total.fragments = "fragments"
)

scCT_H3K27ac = NucleosomeSignal(object = scCT_H3K27ac, assay = "peaks")

scCT_H3K27ac = TSSEnrichment(object = scCT_H3K27ac, fast = FALSE)

scCT_H3K27ac$blacklist_fraction = FractionCountsInRegion(
  object = scCT_H3K27ac, 
  assay = 'peaks',
  regions = blacklist_hg19
)

TSSPlot(scCT_H3K27ac) + NoLegend()

FragmentHistogram(object = scCT_H3K27ac)

Seurat::VlnPlot(
  object = scCT_H3K27ac,
  features = c("FRiP", "TSS.enrichment", "blacklist_fraction", "nucleosome_signal"),
  pt.size = 1,
  ncol = 5
)

UMAP

scCT_H3K27ac = RunTFIDF(scCT_H3K27ac)
scCT_H3K27ac = FindTopFeatures(scCT_H3K27ac, min.cutoff = 'q0')
scCT_H3K27ac = RunSVD(scCT_H3K27ac)
DepthCor(scCT_H3K27ac)

scCT_H3K27ac = RunUMAP(object = scCT_H3K27ac, reduction = "lsi", dims = 1:30)
scCT_H3K27ac = FindNeighbors(object = scCT_H3K27ac, reduction = "lsi", dims = 1:30)
scCT_H3K27ac = FindClusters(object = scCT_H3K27ac, verbose = FALSE, algorithm = 3)
DimPlot(object = scCT_H3K27ac, label = TRUE, reduction = 'lsi') + NoLegend()

gene.activities = GeneActivity(scCT_H3K27ac)
scCT_H3K27ac[['RNA']] = CreateAssayObject(counts = gene.activities)

scCT_H3K27ac = NormalizeData(
  object = scCT_H3K27ac,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(scCT_H3K27ac$nCount_RNA)
)

Motifs

pfm = getMatrixSet(
  x = JASPAR2020,
  opts = list(species = 9606, all_versions = FALSE)
)

# add motif information
scCT_H3K27ac = AddMotifs(
  object = scCT_H3K27ac,
  genome = BSgenome.Hsapiens.UCSC.hg19,
  pfm = pfm
)

motifs = Seurat::GetAssayData(scCT_H3K27ac, slot = "motifs")
MotifPlot(
  object = scCT_H3K27ac,
  motifs = head(colnames(motifs))
)

Covplot

peakdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/"
merged = ChIPseeker::readPeakFile(paste0(peakdir, "sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed"), as = "GRanges") %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)

merged_filt = merged[merged$V4 < 3000,]

ChIPseeker::covplot(merged_filt, weightCol = "V4")

Fragments in cells

#fragments_summary = read.csv(file = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5_filt_nohead_sort.tsv.gz", sep = "\t", header = FALSE)

fragdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/bed/"
fragfiles = list.files(fragdir, pattern = "\\.fragments.bed$", full.names = TRUE)
fragfiles = fragfiles[-64]

total_frags_list = c()

for(file in fragfiles){
  total_frags = read.table(file, sep = "\t", header = FALSE)
  total_frags = nrow(total_frags)
  total_frags_list = c(total_frags_list, total_frags)
}

frag_summary = data.frame(Cell = fragfiles, Total_unique_fragments = total_frags_list)
frag_summary$rank = rank(-frag_summary$Total_unique_fragments)

frag_summary %>% ggplot(aes(x = rank, y = Total_unique_fragments)) +
    theme_light() +
    geom_line(size = 1, linetype = 1) +
    ylab("Unique Fragments") +
    xlab("Cell rank")

Fragment lengths

fragdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/sam/fragmentLen/"
fraglenList = list.files(fragdir, full.names = FALSE) 

fragLen = c()
for(name in fraglenList){
  fragLen = read.table(paste0(fragdir, name), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = name) %>% rbind(fragLen, .) 
}

fig1 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
    geom_violin(bw = 5) +
    scale_y_continuous(breaks = seq(0, 800, 50)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 20) +
    ggpubr::rotate_x_text(angle = 90) +
    ylab("Fragment Length") +
    xlab("") +
    theme(plot.margin = unit(c(0.5,0,0,2), "cm")) +
    theme(legend.position = "none")
  
fig2 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
  geom_line(size = 1) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
  theme_bw(base_size = 20) +
  xlab("Fragment Length") +
  ylab("Count") +
  coord_cartesian(xlim = c(0, 500)) +
  theme(legend.position = "none")

fig1

fig2

Alignment summary

datadir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/sam/backup/bowtie2_summary/"
files = list.files(path = datadir, full.names = FALSE)

sampleList = c()
for(file in files){
  name = strsplit(file, split = '_', fixed = TRUE)
  name = paste0(name[[1]][1], "_", name[[1]][2])
  sampleList = c(sampleList, name)
}

alignResult = c()
for(sample in sampleList){
  alignRes = read.table(paste0(datadir, sample, "_trimmed_bowtie2.txt"), header = FALSE, fill = TRUE)
  alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
  alignResult = data.frame(Antibody = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric, 
                           MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
                           AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}

alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))

Duplicates

Bulk duplication rate is 81%

picarddir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/removeDuplicate/picard_summary/"
files = list.files(path = picarddir, full.names = FALSE, pattern = "\\_picard.rmDup.txt$")

dupResult = c()
for(sample in sampleList){
  dupRes = read.table(paste0(picarddir, sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
  dupResult = data.frame(Antibody = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}

alignDupSummary = left_join(alignResult, dupResult, by = c("Antibody", "MappedFragNum_hg19")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%")) %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))
alignDupSummary

Bulk ENCODE coverage

hg19_blacklist = ChIPseeker::readPeakFile("/Volumes/la420/home/hg19/other/hg19-blacklist.v2.bed.gz", as = "GRanges")
bulk_peaks_filt = bulk_peaks %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)

bulk_peaks_filt = IRanges::subsetByOverlaps(bulk_peaks_filt, hg19_blacklist, invert = TRUE)

ENCODE_H3K27ac = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27ac_ENCFF044JNJ.bed.narrowPeak", as = "GRanges")
ENCODE_H3K27me3 = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27me3_ENCFF126QYP.bed.narrowPeak", as = "GRanges")

# CT in ENCODE
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = bulk_peaks_filt, subject = ENCODE_H3K27ac)
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = bulk_peaks_filt, subject = ENCODE_H3K27me3)
CT_in_ENCODE_H3K27ac = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
CT_in_ENCODE_H3K27me3 = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
paste0("Proportion of CT in ENCODE H3K27ac: ", (CT_in_ENCODE_H3K27ac/length(bulk_peaks_filt)*100) %>% round(3), "%")
## [1] "Proportion of CT in ENCODE H3K27ac: 14.16%"
paste0("Proportion of CT in ENCODE H3K27me3: ", (CT_in_ENCODE_H3K27me3/length(bulk_peaks_filt)*100) %>% round(3), "%")
## [1] "Proportion of CT in ENCODE H3K27me3: 9.617%"
# ENCODE in CT
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = ENCODE_H3K27ac, subject = bulk_peaks_filt)
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = ENCODE_H3K27me3, subject = bulk_peaks_filt)
ENCODE_H3K27ac_captured_peaks = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
ENCODE_H3K27me3_captured_peaks = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
paste0("Proportion of ENCODE H3K27ac captured by CT: ", (CT_in_ENCODE_H3K27ac/length(ENCODE_H3K27ac)*100) %>% round(3), "%")
## [1] "Proportion of ENCODE H3K27ac captured by CT: 3.66%"
paste0("Proportion of ENCODE H3K27me3 captured by CT: ", (CT_in_ENCODE_H3K27me3/length(ENCODE_H3K27me3)*100) %>% round(3), "%")
## [1] "Proportion of ENCODE H3K27me3 captured by CT: 0.773%"

Bulk peak heatmap

promoters = ChIPseeker::getPromoters(TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, upstream = 0, downstream = 1)

matrix = EnrichedHeatmap::normalizeToMatrix(bulk_peaks_filt, promoters, value_column = "V4", extend = 3000, mean_mode = "w0", w = 50, keep = c(0, 0.99999))

SEACR_heatmap = EnrichedHeatmap(matrix, col = colorRamp2(c(0.99999, 0.999999), c("white", "black")), top_annotation = HeatmapAnnotation(enriched = anno_enriched(gp = gpar(col = "black", lty = 2, lwd = 2), col = "black")), axis_name = c("-3kb", "TSS", "TSS","3kb"))

ComplexHeatmap::draw(SEACR_heatmap, show_heatmap_legend = FALSE)

ChromHMM

peak_files = list.files(path = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", pattern = "^Sample-P2-", full.names = FALSE)

peakList = list()

for(file in peak_files){
  peak = ChIPseeker::readPeakFile(paste0(peakdir, file), as = "GRanges")
  peakList = c(peakList, peak)
}

peakList = setNames(peakList, sampleList)
chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
chrHMM = genomation::readBed(chrHMM_url)
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE) 

annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList), chrHMM_list)
genomation::heatTargetAnnotation(annotation)

Peak correlation

DBA_sc = NULL

for(i in 1:length(sampleList)){
   DBA_sc = DiffBind::dba.peakset(DBA = DBA_sc, peaks = peakList[[i]], sampID = sampleList[i])
}

DiffBind::dba.plotHeatmap(DBA_sc, margin = 15)