vignettes/EpiProcess.Rmd
EpiProcess.Rmd
root="/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq"
print(root)
## [1] "/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq"
datadir=file.path(
root,"raw_data/H3K27ac_K562_cells/phase_1_05_jan_2022/X204SC21121860-Z01-F001/raw_data"
)
print(datadir)
## [1] "/Volumes/bms20/projects/neurogenomics-lab/live/Data/tip_seq/raw_data/H3K27ac_K562_cells/phase_1_05_jan_2022/X204SC21121860-Z01-F001/raw_data"
fastq_files <- list.files(datadir, pattern = ".fastq.gz|.fq.gz",
full.names = TRUE, recursive = TRUE)
echoconda::yaml_to_env("epiprocess")
## echoconda:: Conda already installed.
## Identified yaml file stored in echoconda.
## Yaml contents:
## name: epiprocess
## channels:
## - conda-forge
## - bioconda
## - nodefaults
## dependencies:
## # Python
## - python>=3.6.1
## - pandas>=0.25.0
## # Command line
## - fastqc
## - trim-galore
## - bowtie2
## - picard
## - samtools
## - bedtools
## - seacr
## - deeptools
## - macs2
## # R
## - r>=4.0.0
## - r-devtools
## - r-patchwork
## - r-gert
## - r-r.oo
## - r-data.table
## - r-ggplot2
## - r-tidyverse
## # R: Bioconductor
## - r-biocmanager
## - radian
## - pip
##
## echoconda:: Conda environment already exists: epiprocess
## Time difference of 0.3 secs
## [1] "epiprocess"
Get bad gzip error if use zipped files, so gunzip first.
### Linux command
sampledir="rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/raw_data/HKTCKBBXY/IGFQ001138_hu_2-3-2021_CutandTag/IGF117517"
outdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/data_trimmed"
#mkdir -p $outdir
sample_name="..."
## need cutadapt
module load anaconda3/personal
source activate cutadaptenv
/rds/general/user/la420/home/TrimGalore-0.6.6/trim_galore -o $outdir --gzip --basename $sample_name --paired $sampledir/${sample_name}_R1_001.fastq $sampledir/${sample_name}_R2_001.fastq
Index and other genome files obtained at https://emea.support.illumina.com/sequencing/sequencing_software/igenome.html
### Linux command
datadir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
ref="/rds/general/user/la420/home/hg19/index/genome"
cores=8
sample_name="..."
#mkdir -p ${datadir}/alignment/sam/bowtie2_summary
#mkdir -p ${datadir}/alignment/bam
#mkdir -p ${datadir}/alignment/bed
#mkdir -p ${datadir}/alignment/bedgraph
outdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/sam"
module load bowtie2/2.2.9
bowtie2 \
\
--local --very-sensitive \
\
--no-mixed \
--no-discordant \
--phred33 \
-I 10 \
-X 700 ${cores} \
-p ${ref} \
-x ${datadir}/data_trimmed/${sample_name}_val_1.fq.gz \
-1 ${datadir}/data_trimmed/${sample_name}_val_2.fq.gz \
-2 ${outdir}/${sample_name}_trimmed_bowtie2.sam &> ${outdir}/bowtie2_summary/${sample_name}_trimmed_bowtie2.txt -S
### Linux command
module load picard/2.6.0
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
picard_command="java -Xmx12G -XX:ParallelGCThreads=5 -XX:GCTimeLimit=50 -jar /apps/picard/2.6.0/picard.jar"
sample_name="..."
#mkdir -p $workdir/removeDuplicate
#mkdir -p $workdir/removeDuplicate/picard_summary
## sort by coordinate
$picard_command SortSam I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sam O=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam SORT_ORDER=coordinate
## mark duplicates
$picard_command MarkDuplicates I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.dupMarked.sam METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.dupMark.txt
## remove duplicates
$picard_command MarkDuplicates I=$workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sorted.sam O=$workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam REMOVE_DUPLICATES=true METRICS_FILE=$workdir/removeDuplicate/picard_summary/${sample_name}_picard.rmDup.txt
### Linux command
module load samtools/1.3.1
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."
#mkdir -p $workdir/alignment/sam/fragmentLen
## extract the 9th column from the alignment sam file which is the fragment length
samtools view -F 0x04 $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' >$workdir/alignment/sam/fragmentLen/${sample_name}_rmDup_fragmentLen.txt
### Linux command
module load samtools/1.3.1
module load bedtools/2.25.0
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."
## sort sam file by read name (rather than coordinate, as it is now)
samtools sort -n $workdir/removeDuplicate/${sample_name}_trimmed_bowtie2.sorted.rmDup.sam -o $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam
## filter and keep the mapped read pairs
samtools view -bS -F 0x04 $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.n_sorted.rmDup.sam >$workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam
## convert into bed file format
bedtools bamtobed -i $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam -bedpe >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed
## keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
awk '$1==$4 && $6-$2 < 1000 {print $0}' $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.bed >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed
## only extract the fragment related columns
cut -f 1,2,6 $workdir/alignment/bed/${sample_name}_bowtie2_rmDup.clean.bed | sort -k1,1 -k2,2n -k3,3n >$workdir/alignment/bed/${sample_name}_bowtie2_rmDup.fragments.bed
Can be done from fastqs or bam files
### Linux command
module load samtools/1.3.1
bamdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment/bam"
sample_name="..."
read_proportion=".05" #this would be 5%
seed="..."
samtools view -bs ${seed}${read_proportion} $bamdir/{sample_name}_bowtie2.mapped.bam > $bamdir/{sample_name}_subsample.mapped.bam
Optional as this does not affect peak calling, only bedgraph values.
*chrom sizes from http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/hg19.chrom.sizes
### Linux command
module load bedtools/2.25.0
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY/alignment"
chromSize="/rds/general/user/la420/home/hg19/other/hg19.chrom.sizes"
sample_name="..."
#total number of (paired-end) reads, or 2x the number of fragments
seqDepthDouble=`samtools view -F 0x04 $workdir/alignment/sam/${sample_name}_trimmed_bowtie2.sam | wc -l`
seqDepth=$((seqDepthDouble/2))
if [[ "$seqDepth" -gt "1" ]]; then
scale_factor=`echo "1000000 / $seqDepth" | bc -l`
echo "Scaling factor for $sample_name is: $scale_factor"
bedtools genomecov -bg -scale $scale_factor -i $workdir/bed/${sample_name}_bowtie2_rmDup.fragments.bed -g $chromSize > $workdir/bedgraph/${sample_name}_norm_bowtie2_rmDup.fragments.bedgraph
fi
*No IgG control, so using “non”
### Linux command
module load anaconda3/personal
source activate seacr_env
seacr="bash $HOME/anaconda3/envs/seacr_env/bin/SEACR_1.3.sh"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="..."
$seacr $workdir/alignment/bedgraph/${sample_name}_bowtie2_rmDup.fragments.bedgraph 0.01 non stringent $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks
Prepare bigwig file:
### Linux command
module load anaconda3/personal
source activate deepenv
module load samtools/1.3.1
bamCoverage="$HOME/anaconda3/envs/deepenv/bin/bamCoverage"
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
sample_name="Diagenode_S1"
#mkdir $workdir/alignment/bigwig
samtools sort -o $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam $workdir/alignment/bam/${sample_name}_bowtie2_rmDup.mapped.bam
samtools index $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam
$bamCoverage -b $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam -o $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw
hg19 genes from: https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.ncbiRefSeq.gtf.gz
### Linux command
module load anaconda3/personal
source activate deepenv
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"
sample_1="..."
sample_2="..."
cores=8
#mkdir -p $workdir/heatmap
computeMatrix scale-regions -S $workdir/alignment/bigwig/${sample_1}_rmDup_raw.bw \
$workdir/alignment/bigwig/${sample_2}_rmDup_raw.bw \
-R $HOME/hg19/other/hg19.ncbiRefSeq.gtf.gz \
--beforeRegionStartLength 3000 \
--regionBodyLength 5000 \
--afterRegionStartLength 3000 \
--skipZeros \
-o $workdir/heatmap/matrix_gene.mat.gz \
-p $cores
plotHeatmap -m $workdir/heatmap/hg19_gene/matrix_gene.mat.gz -out $workdir/heatmap/heatmap_transcription_units.png --sortUsing sum
### Linux command
module load anaconda3/personal
source activate deepenv
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
plotHeatmap="$HOME/anaconda3/envs/deepenv/bin/plotHeatmap"
computeMatrix="$HOME/anaconda3/envs/deepenv/bin/computeMatrix"
sample_name="..."
cores=8
awk '{split($6, summit, ":"); split(summit[2], region, "-"); print summit[1]"\t"region[1]"\t"region[2]}' $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.stringent.bed >$workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed
computeMatrix reference-point -S $workdir/alignment/bigwig/${sample_name}_rmDup_raw.bw -R $workdir/peakCalling/SEACR/${sample_name}_seacr_top0.01.peaks.summitRegion.bed --skipZeros -o $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz -p $cores -a 3000 -b 3000 --referencePoint center
plotHeatmap -m $workdir/peakCalling/SEACR/${sample_name}_SEACR.mat.gz -out $workdir/peakCalling/SEACR/${sample_name}_SEACR_heatmap.png --sortUsing sum --startLabel "Peak Start" --endLabel "Peak End" --xAxisLabel "" --regionsLabel "Peaks" --samplesLabel "${sample_name}"
### Linux command
module load anaconda3/personal
source activate macs2_env
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
out_dir="${workdir}/peakCalling"
#mkdir -p $outdir
#mkdir -p $outdir/MACS2
sample_name="..."
macs2 callpeak -t $workdir/alignment/bam/${sample_name}_rmDup.sorted.bam -q 0.05 -n ${sample_name}_q0.05 -f BAMPE -g hs --keep-dup all --nomodel --outdir $out_dir/MACS2
module load anaconda3/personal
source activate homerenv
workdir="/rds/general/project/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/HKTCKBBXY"
peakdir="${workdir}/peakCalling"
outdir="${workdir}/motifs"
#mkdir -p $outdir
caller="..."
sample_name="..."
genome="..."
# need separate folder per sample
#mkdir -p $outdir/${sample_name}
findMotifsGenome.pl $peakdir/${caller}/${sample_name}_${caller}.bed $genome $out_dir -size given
### Linux command
#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=32:mem=124gb
module load anaconda3/personal
source activate bedtools_env
abdir="/rds/general/user/la420/home/CUTnTAG/antibodies"
corrdir="${abdir}/correlation"
bamdir="${abdir}/bam/sorted_bam"
beddir="${corrdir}/ENCODE_peak_files"
ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam/fixed"
old_ENCODE_bamdir="${corrdir}/ENCODE_peak_files/bam/sorted_bam"
outdir="${corrdir}/all_marks"
mergedbeddir="${abdir}/all_peaks/merged"
hg19_dir="/rds/general/user/la420/home/hg19/other"
#mark="H3K27ac_ENCFF044JNJ"
#mark="H3K27me3_ENCFF126QYP"
bedtools multicov -bams \
$bamdir/Abcam-ab177178_rmDup.sorted.fixed2.bam \
$bamdir/Abcam-ab4729_rmDup.sorted.fixed2.bam \
$bamdir/ActiveMotif_rmDup.sorted.fixed2.bam \
$bamdir/Diagenode_100x_rmDup.sorted.fixed2.bam \
$bamdir/Diagenode_50x_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8383507_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8383508_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR11074238_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR11074239_rmDup.sorted.fixed2.bam \
$bamdir/ActiveMotif_H3K27ac_rmDup.sorted.fixed2.bam \
$bamdir/H3K27ac_SRR8581604_CnR_rmDup.sorted.fixed2.bam \
$bamdir/H3K27me3_SRR907370_CnR_rmDup.sorted.fixed2.bam \
$old_ENCODE_bamdir/H3K27ac_ENCFF384ZZM_sorted.bam \
$old_ENCODE_bamdir/H3K27me3_ENCFF676ORH_sorted.bam \
$ENCODE_bamdir/H3K4me1_ENCFF196QGZ_sorted.fixed.bam \
$ENCODE_bamdir/H3K4me3_ENCFF915MJO_sorted.fixed.bam \
$ENCODE_bamdir/H3K9ac_ENCFF204DNG_sorted.fixed.bam \
$ENCODE_bamdir/H3K9me3_ENCFF805FLY_sorted.fixed.bam \
$ENCODE_bamdir/H3K36me3_ENCFF190AQC_sorted.fixed.bam \
$ENCODE_bamdir/DNase_ENCFF826DJP_sorted.fixed.bam \
$hg19_dir/hg19_500bp_windows.bed > $outdir/hg19_500bp_overlaps.txt -bed
utils::sessionInfo()
## R Under development (unstable) (2022-03-03 r81847)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] EpiProcess_0.99.0 BiocStyle_2.23.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8 knitr_1.37 magrittr_2.0.2
## [4] rappdirs_0.3.3 lattice_0.20-45 R6_2.5.1
## [7] ragg_1.2.2 rlang_1.0.2 fastmap_1.1.0
## [10] stringr_1.4.0 tools_4.2.0 grid_4.2.0
## [13] data.table_1.14.2 xfun_0.30 png_0.1-7
## [16] cli_3.2.0 jquerylib_0.1.4 systemfonts_1.0.4
## [19] htmltools_0.5.2 yaml_2.3.5 digest_0.6.29
## [22] rprojroot_2.0.2 pkgdown_2.0.2.9000 bookdown_0.24
## [25] textshaping_0.3.6 Matrix_1.4-0 BiocManager_1.30.16
## [28] purrr_0.3.4 sass_0.4.0 fs_1.5.2
## [31] memoise_2.0.1 cachem_1.0.6 evaluate_0.15
## [34] rmarkdown_2.12 echoconda_0.99.5 stringi_1.7.6
## [37] compiler_4.2.0 bslib_0.3.1 desc_1.4.1
## [40] reticulate_1.24 jsonlite_1.8.0