root="/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq"
print(root)
## [1] "/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq"
datadir=file.path(
  root,"raw_data/H3K27ac_K562_cells/phase_1_05_jan_2022/X204SC21121860-Z01-F001/raw_data"
)
print(datadir) 
## [1] "/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq/raw_data/H3K27ac_K562_cells/phase_1_05_jan_2022/X204SC21121860-Z01-F001/raw_data"
fastq_files <- list.files(datadir, pattern = ".fastq.gz|.fq.gz",
                          full.names = TRUE, recursive = TRUE)

echoconda::yaml_to_env("epiprocess")
## echoconda:: Conda already installed.
## Identified yaml file stored in echoconda.
## Yaml contents:
## name: epiprocess
## channels:
##   - conda-forge
##   - bioconda
##   - nodefaults
## dependencies:
##   # Python
##   - python>=3.6.1
##   - pandas>=0.25.0
##   # Command line
##   - fastqc
##   - trim-galore
##   - bowtie2
##   - picard
##   - samtools
##   - bedtools
##   - seacr
##   - deeptools
##   - macs2
##   # R
##   - r>=4.0.0
##   - r-devtools
##   - r-patchwork
##   - r-gert
##   - r-r.oo
##   - r-data.table
##   - r-ggplot2
##   - r-tidyverse
##   # R: Bioconductor
##   - r-biocmanager
##   - radian
##   - pip
## 
## echoconda:: Conda environment already exists: epiprocess
## Time difference of 0.3 secs
## [1] "epiprocess"

Pre-processing

FastQC

 

Adapter trimming

Get bad gzip error if use zipped files, so gunzip first.

### Linux command

sampledir="rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/raw_data/HKTCKBBXY/IGFQ001138_hu_2-3-2021_CutandTag/IGF117517"
outdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/data_trimmed"

#mkdir -p $outdir

sample_name="..."

## need cutadapt
module load anaconda3/personal
source activate cutadaptenv

/rds/general/user/la420/home/TrimGalore-0.6.6/trim_galore -o $outdir --gzip --basename $sample_name --paired $sampledir/${sample_name}_R1_001.fastq $sampledir/${sample_name}_R2_001.fastq

Alignment

Index and other genome files obtained at https://emea.support.illumina.com/sequencing/sequencing_software/igenome.html

  • If adapter trimming is not required (e.g. in case of shorter-read sequencing, such as 25x25 paired-end sequencing), use the –end-to-end instead of –local bowtie2 setting. –local is used to avoid remaining adapter sequence at the 3’ ends of reads during mapping.
### Linux command

datadir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
ref="/rds/general/user/la420/home/hg19/index/genome"
cores=8

sample_name="..."

#mkdir -p ${datadir}/alignment/sam/bowtie2_summary
#mkdir -p ${datadir}/alignment/bam
#mkdir -p ${datadir}/alignment/bed
#mkdir -p ${datadir}/alignment/bedgraph

outdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/sam"

module load bowtie2/2.2.9

bowtie2 \
--local \ 
--very-sensitive \
--no-mixed \
--no-discordant \
--phred33 \
-I 10 \
-X 700 \
-p ${cores} \
-x ${ref} \
-1 ${datadir}/data_trimmed/${sample_name}_val_1.fq.gz \
-2 ${datadir}/data_trimmed/${sample_name}_val_2.fq.gz \
-S ${outdir}/${sample_name}_trimmed_bowtie2.sam &> ${outdir}/bowtie2_summary/${sample_name}_trimmed_bowtie2.txt

Remove duplicates

### Linux command

module load picard/2.6.0

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
picard_command="java -Xmx12G -XX:ParallelGCThreads=5 -XX:GCTimeLimit=50 -jar /apps/picard/2.6.0/picard.jar"
sample_name="..."

#mkdir -p $workdir/removeDuplicate
#mkdir -p $workdir/removeDuplicate/picard_summary

## sort by coordinate
$picard_command SortSam I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sam O=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam SORT_ORDER=coordinate

## mark duplicates
$picard_command MarkDuplicates I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.dupMarked.sam METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.dupMark.txt

## remove duplicates
$picard_command MarkDuplicates I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam REMOVE_DUPLICATES=true METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.rmDup.txt

Obtain fragment size distribution

### Linux command
module load samtools/1.3.1

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

#mkdir -p $workdir/alignment/sam/fragmentLen

## extract the 9th column from the alignment sam file which is the fragment length
samtools view -F 0x04 $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' >$workdir/alignment/sam/fragmentLen/${sample_name}_rmDup_fragmentLen.txt

Convert files

### Linux command

module load samtools/1.3.1
module load bedtools/2.25.0

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

## sort sam file by read name (rather than coordinate, as it is now)
samtools sort -n $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam -o $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam

## filter and keep the mapped read pairs
samtools view -bS -F 0x04 $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam >$workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam

## convert into bed file format
bedtools bamtobed -i $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam -bedpe >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed

## keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
awk '$1==$4 && $6-$2 < 1000 {print $0}' $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed

## only extract the fragment related columns
cut -f 1,2,6 $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed | sort -k1,1 -k2,2n -k3,3n  >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.fragments.bed

Downsampling

Can be done from fastqs or bam files

### Linux command

module load samtools/1.3.1

bamdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/bam"
sample_name="..."

read_proportion=".05" #this would be 5%
seed="..."

samtools view -bs ${seed}${read_proportion} $bamdir/{sample_name}_bowtie2.mapped.bam > $bamdir/{sample_name}_subsample.mapped.bam

Make (normalized) bedgraph

Optional as this does not affect peak calling, only bedgraph values.

*chrom sizes from http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/hg19.chrom.sizes

### Linux command

module load bedtools/2.25.0

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment"
chromSize="/rds/general/user/la420/home/hg19/other/hg19.chrom.sizes"

sample_name="..."

#total number of (paired-end) reads, or 2x the number of fragments
seqDepthDouble=`samtools view -F 0x04 $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sam | wc -l`
seqDepth=$((seqDepthDouble/2))

if [[ "$seqDepth" -gt "1" ]]; then

    scale_factor=`echo "1000000 / $seqDepth" | bc -l`
    echo "Scaling factor for $sample_name is: $scale_factor"
    bedtools genomecov -bg -scale $scale_factor -i $workdir/bed/${sample_name}_bowtie2_rmDup.fragments.bed -g $chromSize > $workdir/bedgraph/${sample_name}_norm_bowtie2_rmDup.fragments.bedgraph
    
fi

Peak calling (SEACR)

*No IgG control, so using “non”

### Linux command

module load anaconda3/personal
source activate seacr_env

seacr="bash $HOME/anaconda3/envs/seacr_env/bin/SEACR_1.3.sh"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."

$seacr $workdir/alignment/bedgraph/${sample_name}_bowtie2_rmDup.fragments.bedgraph 0.01 non stringent $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks

Visualization

Prepare bigwig file:

### Linux command

module load anaconda3/personal
source activate deepenv
module load samtools/1.3.1

bamCoverage="$HOME/anaconda3/envs/deepenv/bin/bamCoverage"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"

sample_name="Diagenode_S1"

#mkdir $workdir/alignment/bigwig

samtools sort -o $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam
samtools index $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam
$bamCoverage -b $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam -o $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw       

Heatmap over transcription units

hg19 genes from: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.ncbiRefSeq.gtf.gz

### Linux command

module load anaconda3/personal
source activate deepenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"

sample_1="..."
sample_2="..."

cores=8

#mkdir -p $workdir/heatmap

computeMatrix scale-regions -S $workdir/alignment/bigwig/${sample_1}_rmDup_raw.bw \
                               $workdir/alignment/bigwig/${sample_2}_rmDup_raw.bw \
                              -R $HOME/hg19/other/hg19.ncbiRefSeq.gtf.gz \
                              --beforeRegionStartLength 3000 \
                              --regionBodyLength 5000 \
                              --afterRegionStartLength 3000 \
                              --skipZeros \
                              -o $workdir/heatmap/matrix_gene.mat.gz \
                              -p $cores

plotHeatmap -m $workdir/heatmap/hg19_gene/matrix_gene.mat.gz -out $workdir/heatmap/heatmap_transcription_units.png --sortUsing sum

Heatmap over CUT&Tag peaks

### Linux command

module load anaconda3/personal
source activate deepenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"

sample_name="..."

cores=8

awk '{split($6, summit, ":"); split(summit[2], region, "-"); print summit[1]"\t"region[1]"\t"region[2]}' $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.stringent.bed >$workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed

computeMatrix reference-point -S $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw -R $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed --skipZeros -o $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz -p $cores -a 3000 -b 3000 --referencePoint center

plotHeatmap -m $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz -out $workdir/peakCalling/SEACR/${sample_name}_SEACR_heatmap.png --sortUsing sum --startLabel "Peak Start" --endLabel "Peak End" --xAxisLabel "" --regionsLabel "Peaks" --samplesLabel "${sample_name}"

Peak calling (MACS2)

### Linux command

module load anaconda3/personal
source activate macs2_env

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
out_dir="${workdir}/peakCalling"

#mkdir -p $outdir
#mkdir -p $outdir/MACS2

sample_name="..."

macs2 callpeak -t $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam -q 0.05 -n ${sample_name}_q0.05 -f BAMPE -g hs --keep-dup all --nomodel --outdir $out_dir/MACS2

Find motifs


module load anaconda3/personal
source activate homerenv

workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
peakdir="${workdir}/peakCalling"
outdir="${workdir}/motifs"
#mkdir -p $outdir

caller="..."
sample_name="..."
genome="..."

# need separate folder per sample
#mkdir -p $outdir/${sample_name}

findMotifsGenome.pl $peakdir/${caller}/${sample_name}_${caller}.bed $genome $out_dir -size given

Genome-wide correlation

### Linux command

#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=32:mem=124gb

module load anaconda3/personal
source activate bedtools_env

abdir="/rds/general/user/la420/home/CUTnTAG/antibodies"
corrdir="${abdir}/correlation"
bamdir="${abdir}/bam/sorted_bam"
beddir="${corrdir}/ENCODE_peak_files"
ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam/fixed"
old_ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam"
outdir="${corrdir}/all_marks"
mergedbeddir="${abdir}/all_peaks/merged"
hg19_dir="/rds/general/user/la420/home/hg19/other"

#mark="H3K27ac_ENCFF044JNJ"
#mark="H3K27me3_ENCFF126QYP"

bedtools multicov -bams \
$bamdir/Abcam-ab177178_rmDup.sorted.fixed2.bam \
$bamdir/Abcam-ab4729_rmDup.sorted.fixed2.bam \
$bamdir/ActiveMotif_rmDup.sorted.fixed2.bam \
$bamdir/Diagenode_100x_rmDup.sorted.fixed2.bam \
$bamdir/Diagenode_50x_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8383507_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8383508_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR11074238_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR11074239_rmDup.sorted.fixed2.bam \
$bamdir/ActiveMotif_H3K27ac_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8581604_CnR_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR907370_CnR_rmDup.sorted.fixed2.bam \
$old_ENCODE_bamdir/H3K27ac_ENCFF384ZZM_sorted.bam \
$old_ENCODE_bamdir/H3K27me3_ENCFF676ORH_sorted.bam \
$ENCODE_bamdir/H3K4me1_ENCFF196QGZ_sorted.fixed.bam \
$ENCODE_bamdir/H3K4me3_ENCFF915MJO_sorted.fixed.bam \
$ENCODE_bamdir/H3K9ac_ENCFF204DNG_sorted.fixed.bam \
$ENCODE_bamdir/H3K9me3_ENCFF805FLY_sorted.fixed.bam \
$ENCODE_bamdir/H3K36me3_ENCFF190AQC_sorted.fixed.bam \
$ENCODE_bamdir/DNase_ENCFF826DJP_sorted.fixed.bam \
-bed $hg19_dir/hg19_500bp_windows.bed > $outdir/hg19_500bp_overlaps.txt

Session Info

utils::sessionInfo()
## R Under development (unstable) (2022-03-03 r81847)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] EpiProcess_0.99.0 BiocStyle_2.23.1 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8          knitr_1.37          magrittr_2.0.2     
##  [4] rappdirs_0.3.3      lattice_0.20-45     R6_2.5.1           
##  [7] ragg_1.2.2          rlang_1.0.2         fastmap_1.1.0      
## [10] stringr_1.4.0       tools_4.2.0         grid_4.2.0         
## [13] data.table_1.14.2   xfun_0.30           png_0.1-7          
## [16] cli_3.2.0           jquerylib_0.1.4     systemfonts_1.0.4  
## [19] htmltools_0.5.2     yaml_2.3.5          digest_0.6.29      
## [22] rprojroot_2.0.2     pkgdown_2.0.2.9000  bookdown_0.24      
## [25] textshaping_0.3.6   Matrix_1.4-0        BiocManager_1.30.16
## [28] purrr_0.3.4         sass_0.4.0          fs_1.5.2           
## [31] memoise_2.0.1       cachem_1.0.6        evaluate_0.15      
## [34] rmarkdown_2.12      echoconda_0.99.5    stringi_1.7.6      
## [37] compiler_4.2.0      bslib_0.3.1         desc_1.4.1         
## [40] reticulate_1.24     jsonlite_1.8.0