#!/usr/bin/env nextflow /* * Nextflow script for comparing species reference annotations * to generate report use: -with-report file-name * export _JAVA_OPTIONS=-Djava.io.tmpdir=/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison/ROZ_NF */ println """\ DIRECTORIES AND INPUT FILES project directory: ${PROJ} scripts directory: ${SCRIPTS} input trand conda env path: ${TD_CONDA} input number of CPU to use: ${N_CPU} input reference genome FASTA 1: ${REF_FA1} input reference transcript FASTA 1: ${XCRPT_FA1} input reference annotation GTF 1: ${REF_GTF1} input reference genome FASTA 2: ${REF_FA2} input reference transcript FASTA 2: ${XCRPT_FA2} input reference annotation GTF 2: ${REF_GTF2} input reference name 1: ${NAME1} input reference name 2: ${NAME2} output directory: ${OUTD} """ .stripIndent() // These are the comparisons of transcripts mapped to reference (same and opposite - 4 total comparisons) comparison = Channel.of(["${NAME1}", "${NAME1}"], ["${NAME2}", "${NAME1}"], ["${NAME1}", "${NAME2}"], ["${NAME2}", "${NAME2}"]) // Map transcript sequences to the same and opposite reference genome process runMinimap2 { input: tuple env(XCRPT_NAME), env(REF_NAME) from comparison output: tuple env(OUT_NAME), env(MAPPED) into mapped_ch shell: ''' module purge module load minimap/2.17 if [[ ${XCRPT_NAME} == !{NAME1} ]]; then XCRPT_FA=!{XCRPT_FA1} else XCRPT_FA=!{XCRPT_FA2} fi if [[ ${REF_NAME} == !{NAME1} ]]; then REF_FA=!{REF_FA1} else REF_FA=!{REF_FA2} fi mkdir -p !{OUTD} OUT_NAME="${XCRPT_NAME}2${REF_NAME}" MAPPED=!{OUTD}/${OUT_NAME}_minimap2.sam minimap2 --secondary=no -u f -ax splice -C 5 ${REF_FA} ${XCRPT_FA} > ${MAPPED} ''' } // Convert sam alignment file to bam (-F 2048 selected only primary alignments) process sam2bam { input: tuple env(OUT_NAME), env(MAPPED) from mapped_ch output: tuple env(OUT_NAME), env(BAM) into bam_ch shell: ''' module purge module load samtools/1.10 BAM=!{OUTD}/${OUT_NAME}_minimap2.bam samtools view -b -F 2048 ${MAPPED} > ${BAM} ''' } // Convert bam to bed file of aligned transcripts // -split will splice the gapped alignment into exonic blocks with 1 exon per line process bam2bed { input: tuple env(OUT_NAME), env(BAM) from bam_ch output: tuple env(OUT_NAME), env(BED) into bed_ch shell: ''' module purge module load bedtools/2.29.2 BED=!{OUTD}/${OUT_NAME}_minimap2.bed bedtools bamtobed -split -i ${BAM} > ${BED} rm ${BAM} ''' } // Convert splice bed file into gtf file process bed2gtf { input: tuple env(OUT_NAME), env(BED) from bed_ch output: tuple env(OUT_NAME), env(GTF) into gtf_ch shell: ''' GTF=!{OUTD}/${OUT_NAME}_minimap2.gtf awk -v name=${OUT_NAME} -F "\t" '{{print $1"\t"name"\t""exon""\t"$2+1"\t"$3"\t"".""\t"$6"\t"".""\t""gene_id " "\\"" $4 "\\"" "; " "transcript_id " "\\"" $4 "\\"" ";"}}' $BED > $GTF rm ${BED} ''' } // Use GFFcompare to get associated reference gene_id values process runGFFcompare { input: tuple env(OUT_NAME), env(GTF) from gtf_ch output: tuple env(OUT_NAME), env(ANNOT_PREFIX) into gtfAnnot_ch shell: ''' module purge module load gffcompare/0.12.2 ANNOT_PREFIX=!{OUTD}/${OUT_NAME} if [[ ${OUT_NAME} =~ 2!{NAME1} ]]; then REF_GTF=!{REF_GTF1} else REF_GTF=!{REF_GTF2} fi gffcompare -r ${REF_GTF} -o ${ANNOT_PREFIX} ${GTF} ''' } // Correct mapped gtf to have associated reference gene_id from gffcompare process correctGeneID { input: tuple env(OUT_NAME), env(ANNOT_PREFIX) from gtfAnnot_ch output: tuple env(OUT_NAME), env(CORRECTED_GTF) into gtfCorrected_ch shell: ''' module purge module load python/3.8 CORRECTED_GTF=!{OUTD}/${OUT_NAME}_corrected_associated_gene.gtf ORIG_GTF=!{OUTD}/${OUT_NAME}_minimap2.gtf python !{SCRIPTS}/correct_gffcompare_GTF_gene_id_06avn.py \ -a ${ANNOT_PREFIX}.annotated.gtf \ -g ${ORIG_GTF} \ -o ${CORRECTED_GTF} \ --key !{OUTD}/${OUT_NAME}_gene_key.csv ''' }