bulk_peaks = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed", as = "GRanges")
peak_files = list.files("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", pattern = "^Sample-P2-", full.names = TRUE)
fragments = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/bed/sc_H3K27ac_merged_bowtie2_rmDup.fragments.bed", as = "GRanges")
getFragmentsOverlapsIDs = function(file_path){
cell = ChIPseeker::readPeakFile(file_path)
cell_name = basename(file_path)
overlap = GenomicRanges:::countOverlaps_GenomicRanges(query = fragments, subject = cell)
fragments$name = cell_name
fragments$score = overlap
return(fragments)
}
results = lapply(peak_files, getFragmentsOverlapsIDs)
results_dfs = lapply(results, data.frame)
results_dfs2 = list()
for(i in 1:length(results_dfs)){
results_dfs2[[i]] = results_dfs[[i]][results_dfs[[i]]$score > 0,]
}
results_bind = do.call("rbind", results_dfs2)
results_final = GenomicRanges::makeGRangesFromDataFrame(results_bind, keep.extra.columns = TRUE)
data.table::fwrite(data.frame(results_final), "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5.tsv", sep = '\t')
#system(paste("bgzip","/Volumes/la420/home/CUTnTAG/sc_H3K27ac/analysis/fragments.tsv"))
#system(paste("tabix", "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/analysis/fragments.tsv.gz"))basenameList = list.files("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", full.names = FALSE, pattern = "\\.stringent.bed$")
basenameList = basenameList[-64]
fpath = file.path("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5_filt_nohead_sort.tsv.gz")
fragments = CreateFragmentObject(
path = fpath,
cells = basenameList,
validate.fragments = FALSE
)
bulk_peaks = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed", as = "GRanges")
peak_matrix = FeatureMatrix(
fragments = fragments,
features = bulk_peaks
)
chrom_assay = CreateChromatinAssay(
counts = peak_matrix,
genome = "hg19",
fragments = fragments
)
scCT_H3K27ac = Seurat::CreateSeuratObject(
counts = chrom_assay,
assay = "peaks"
)# extract gene annotations from EnsDb
annotations = GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
# change to UCSC style since the data was mapped to hg19
seqlevelsStyle(annotations) = "UCSC"
genome(annotations) = "hg19"
# add the gene information to the object
Annotation(scCT_H3K27ac) = annotationstotal_fragments = CountFragments(fpath)
order = c(colnames(scCT_H3K27ac))
scCT_H3K27ac$fragments = total_fragments[colnames(scCT_H3K27ac), "frequency_count"]
# above may not work so do:
total_fragments = total_fragments[match(order, total_fragments$CB), ]
for(i in 1:length(colnames(scCT_H3K27ac))){
scCT_H3K27ac$fragments[[i]] = total_fragments$frequency_count[i]
}
peak_matrix = FeatureMatrix(
fragments = Fragments(scCT_H3K27ac),
features = bulk_peaks
)
scCT_H3K27ac = FRiP(
object = scCT_H3K27ac,
assay = "peaks",
total.fragments = "fragments"
)
scCT_H3K27ac = NucleosomeSignal(object = scCT_H3K27ac, assay = "peaks")
scCT_H3K27ac = TSSEnrichment(object = scCT_H3K27ac, fast = FALSE)
scCT_H3K27ac$blacklist_fraction = FractionCountsInRegion(
object = scCT_H3K27ac,
assay = 'peaks',
regions = blacklist_hg19
)
TSSPlot(scCT_H3K27ac) + NoLegend()FragmentHistogram(object = scCT_H3K27ac)Seurat::VlnPlot(
object = scCT_H3K27ac,
features = c("FRiP", "TSS.enrichment", "blacklist_fraction", "nucleosome_signal"),
pt.size = 1,
ncol = 5
)scCT_H3K27ac = RunTFIDF(scCT_H3K27ac)
scCT_H3K27ac = FindTopFeatures(scCT_H3K27ac, min.cutoff = 'q0')
scCT_H3K27ac = RunSVD(scCT_H3K27ac)
DepthCor(scCT_H3K27ac)scCT_H3K27ac = RunUMAP(object = scCT_H3K27ac, reduction = "lsi", dims = 1:30)
scCT_H3K27ac = FindNeighbors(object = scCT_H3K27ac, reduction = "lsi", dims = 1:30)
scCT_H3K27ac = FindClusters(object = scCT_H3K27ac, verbose = FALSE, algorithm = 3)
DimPlot(object = scCT_H3K27ac, label = TRUE, reduction = 'lsi') + NoLegend()gene.activities = GeneActivity(scCT_H3K27ac)
scCT_H3K27ac[['RNA']] = CreateAssayObject(counts = gene.activities)
scCT_H3K27ac = NormalizeData(
object = scCT_H3K27ac,
assay = 'RNA',
normalization.method = 'LogNormalize',
scale.factor = median(scCT_H3K27ac$nCount_RNA)
)pfm = getMatrixSet(
x = JASPAR2020,
opts = list(species = 9606, all_versions = FALSE)
)
# add motif information
scCT_H3K27ac = AddMotifs(
object = scCT_H3K27ac,
genome = BSgenome.Hsapiens.UCSC.hg19,
pfm = pfm
)
motifs = Seurat::GetAssayData(scCT_H3K27ac, slot = "motifs")
MotifPlot(
object = scCT_H3K27ac,
motifs = head(colnames(motifs))
)peakdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/"
merged = ChIPseeker::readPeakFile(paste0(peakdir, "sc_H3K27ac_merged_seacr_top0.5.peaks.stringent.bed"), as = "GRanges") %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
merged_filt = merged[merged$V4 < 3000,]
ChIPseeker::covplot(merged_filt, weightCol = "V4")#fragments_summary = read.csv(file = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/fragments/fragments_str0.5_filt_nohead_sort.tsv.gz", sep = "\t", header = FALSE)
fragdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/bed/"
fragfiles = list.files(fragdir, pattern = "\\.fragments.bed$", full.names = TRUE)
fragfiles = fragfiles[-64]
total_frags_list = c()
for(file in fragfiles){
total_frags = read.table(file, sep = "\t", header = FALSE)
total_frags = nrow(total_frags)
total_frags_list = c(total_frags_list, total_frags)
}
frag_summary = data.frame(Cell = fragfiles, Total_unique_fragments = total_frags_list)
frag_summary$rank = rank(-frag_summary$Total_unique_fragments)
frag_summary %>% ggplot(aes(x = rank, y = Total_unique_fragments)) +
theme_light() +
geom_line(size = 1, linetype = 1) +
ylab("Unique Fragments") +
xlab("Cell rank")fragdir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/sam/fragmentLen/"
fraglenList = list.files(fragdir, full.names = FALSE)
fragLen = c()
for(name in fraglenList){
fragLen = read.table(paste0(fragdir, name), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = name) %>% rbind(fragLen, .)
}
fig1 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ggpubr::rotate_x_text(angle = 90) +
ylab("Fragment Length") +
xlab("") +
theme(plot.margin = unit(c(0.5,0,0,2), "cm")) +
theme(legend.position = "none")
fig2 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
geom_line(size = 1) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
theme_bw(base_size = 20) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500)) +
theme(legend.position = "none")
fig1fig2datadir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/alignment/sam/backup/bowtie2_summary/"
files = list.files(path = datadir, full.names = FALSE)
sampleList = c()
for(file in files){
name = strsplit(file, split = '_', fixed = TRUE)
name = paste0(name[[1]][1], "_", name[[1]][2])
sampleList = c(sampleList, name)
}
alignResult = c()
for(sample in sampleList){
alignRes = read.table(paste0(datadir, sample, "_trimmed_bowtie2.txt"), header = FALSE, fill = TRUE)
alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
alignResult = data.frame(Antibody = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric,
MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}
alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))Bulk duplication rate is 81%
picarddir = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/removeDuplicate/picard_summary/"
files = list.files(path = picarddir, full.names = FALSE, pattern = "\\_picard.rmDup.txt$")
dupResult = c()
for(sample in sampleList){
dupRes = read.table(paste0(picarddir, sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
dupResult = data.frame(Antibody = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}
alignDupSummary = left_join(alignResult, dupResult, by = c("Antibody", "MappedFragNum_hg19")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%")) %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))
alignDupSummaryhg19_blacklist = ChIPseeker::readPeakFile("/Volumes/la420/home/hg19/other/hg19-blacklist.v2.bed.gz", as = "GRanges")
bulk_peaks_filt = bulk_peaks %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
bulk_peaks_filt = IRanges::subsetByOverlaps(bulk_peaks_filt, hg19_blacklist, invert = TRUE)
ENCODE_H3K27ac = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27ac_ENCFF044JNJ.bed.narrowPeak", as = "GRanges")
ENCODE_H3K27me3 = ChIPseeker::readPeakFile("/Volumes/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27me3_ENCFF126QYP.bed.narrowPeak", as = "GRanges")
# CT in ENCODE
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = bulk_peaks_filt, subject = ENCODE_H3K27ac)
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = bulk_peaks_filt, subject = ENCODE_H3K27me3)
CT_in_ENCODE_H3K27ac = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
CT_in_ENCODE_H3K27me3 = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
paste0("Proportion of CT in ENCODE H3K27ac: ", (CT_in_ENCODE_H3K27ac/length(bulk_peaks_filt)*100) %>% round(3), "%")## [1] "Proportion of CT in ENCODE H3K27ac: 14.16%"
paste0("Proportion of CT in ENCODE H3K27me3: ", (CT_in_ENCODE_H3K27me3/length(bulk_peaks_filt)*100) %>% round(3), "%")## [1] "Proportion of CT in ENCODE H3K27me3: 9.617%"
# ENCODE in CT
overlaps_H3K27ac = GenomicRanges::countOverlaps(query = ENCODE_H3K27ac, subject = bulk_peaks_filt)
overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = ENCODE_H3K27me3, subject = bulk_peaks_filt)
ENCODE_H3K27ac_captured_peaks = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
ENCODE_H3K27me3_captured_peaks = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
paste0("Proportion of ENCODE H3K27ac captured by CT: ", (CT_in_ENCODE_H3K27ac/length(ENCODE_H3K27ac)*100) %>% round(3), "%")## [1] "Proportion of ENCODE H3K27ac captured by CT: 3.66%"
paste0("Proportion of ENCODE H3K27me3 captured by CT: ", (CT_in_ENCODE_H3K27me3/length(ENCODE_H3K27me3)*100) %>% round(3), "%")## [1] "Proportion of ENCODE H3K27me3 captured by CT: 0.773%"
promoters = ChIPseeker::getPromoters(TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, upstream = 0, downstream = 1)
matrix = EnrichedHeatmap::normalizeToMatrix(bulk_peaks_filt, promoters, value_column = "V4", extend = 3000, mean_mode = "w0", w = 50, keep = c(0, 0.99999))
SEACR_heatmap = EnrichedHeatmap(matrix, col = colorRamp2(c(0.99999, 0.999999), c("white", "black")), top_annotation = HeatmapAnnotation(enriched = anno_enriched(gp = gpar(col = "black", lty = 2, lwd = 2), col = "black")), axis_name = c("-3kb", "TSS", "TSS","3kb"))
ComplexHeatmap::draw(SEACR_heatmap, show_heatmap_legend = FALSE)peak_files = list.files(path = "/Volumes/la420/home/CUTnTAG/sc_H3K27ac/peakCalling/SEACR/", pattern = "^Sample-P2-", full.names = FALSE)
peakList = list()
for(file in peak_files){
peak = ChIPseeker::readPeakFile(paste0(peakdir, file), as = "GRanges")
peakList = c(peakList, peak)
}
peakList = setNames(peakList, sampleList)chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
chrHMM = genomation::readBed(chrHMM_url)
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE)
annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList), chrHMM_list)
genomation::heatTargetAnnotation(annotation)DBA_sc = NULL
for(i in 1:length(sampleList)){
DBA_sc = DiffBind::dba.peakset(DBA = DBA_sc, peaks = peakList[[i]], sampID = sampleList[i])
}
DiffBind::dba.plotHeatmap(DBA_sc, margin = 15)