#!/usr/bin/env nextflow

/*
*    Nextflow script for comparing species reference annotations
*    to generate report use:  -with-report file-name
*    export _JAVA_OPTIONS=-Djava.io.tmpdir=/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison/ROZ_NF
*/

println """\
    DIRECTORIES AND INPUT FILES
    project directory:			${PROJ}
    scripts directory:			${SCRIPTS}
    input trand conda env path:		${TD_CONDA}
    input number of CPU to use:		${N_CPU}
    input reference genome FASTA 1:	${REF_FA1}
    input reference transcript FASTA 1: ${XCRPT_FA1}
    input reference annotation GTF 1:	${REF_GTF1}
    input reference genome FASTA 2:     ${REF_FA2}
    input reference transcript FASTA 2: ${XCRPT_FA2}
    input reference annotation GTF 2:	${REF_GTF2}
    input reference name 1:		${NAME1}
    input reference name 2:		${NAME2}
    output directory:			${OUTD}
    """
    .stripIndent()

// These are the comparisons of transcripts mapped to reference (same and opposite - 4 total comparisons)
comparison = Channel.of(["${NAME1}", "${NAME1}"], ["${NAME2}", "${NAME1}"], ["${NAME1}", "${NAME2}"], ["${NAME2}", "${NAME2}"])


// Map transcript sequences to the same and opposite reference genome
process runMinimap2 {
    input:
    tuple env(XCRPT_NAME), env(REF_NAME) from comparison
    
    output:
    tuple env(OUT_NAME), env(MAPPED) into mapped_ch

    shell:
    '''

    module purge
    module load minimap/2.17

    if [[ ${XCRPT_NAME} == !{NAME1} ]]; then
        XCRPT_FA=!{XCRPT_FA1}
    else
        XCRPT_FA=!{XCRPT_FA2}
    fi

    if [[ ${REF_NAME} == !{NAME1} ]]; then
        REF_FA=!{REF_FA1}
    else
        REF_FA=!{REF_FA2}
    fi

    mkdir -p !{OUTD}

    OUT_NAME="${XCRPT_NAME}2${REF_NAME}"
    MAPPED=!{OUTD}/${OUT_NAME}_minimap2.sam

    minimap2 --secondary=no -u f -ax splice -C 5 ${REF_FA} ${XCRPT_FA} > ${MAPPED}

    '''
}


// Convert sam alignment file to bam (-F 2048 selected only primary alignments)
process sam2bam {
    input:
    tuple env(OUT_NAME), env(MAPPED) from mapped_ch

    output:
    tuple env(OUT_NAME), env(BAM) into bam_ch

    shell:
    '''

    module purge
    module load samtools/1.10

    BAM=!{OUTD}/${OUT_NAME}_minimap2.bam

    samtools view -b -F 2048 ${MAPPED} > ${BAM}

    '''
}


// Convert bam to bed file of aligned transcripts
//     -split will splice the gapped alignment into exonic blocks with 1 exon per line
process bam2bed {
    input:
    tuple env(OUT_NAME), env(BAM) from bam_ch

    output:
    tuple env(OUT_NAME), env(BED) into bed_ch

    shell:
    '''

    module purge
    module load bedtools/2.29.2

    BED=!{OUTD}/${OUT_NAME}_minimap2.bed

    bedtools bamtobed -split -i ${BAM} > ${BED}

    rm ${BAM}

    '''
}


// Convert splice bed file into gtf file
process bed2gtf {
    input:
    tuple env(OUT_NAME), env(BED) from bed_ch

    output:
    tuple env(OUT_NAME), env(GTF) into gtf_ch

    shell:
    '''

    GTF=!{OUTD}/${OUT_NAME}_minimap2.gtf

    awk -v name=${OUT_NAME} -F "\t" '{{print $1"\t"name"\t""exon""\t"$2+1"\t"$3"\t"".""\t"$6"\t"".""\t""gene_id " "\\"" $4 "\\"" "; " "transcript_id " "\\"" $4 "\\"" ";"}}' $BED > $GTF

    rm ${BED}

    '''
}


// Use GFFcompare to get associated reference gene_id values
process runGFFcompare {
    input:
    tuple env(OUT_NAME), env(GTF) from gtf_ch

    output:
    tuple env(OUT_NAME), env(ANNOT_PREFIX) into gtfAnnot_ch

    shell:
    '''

    module purge
    module load gffcompare/0.12.2

    ANNOT_PREFIX=!{OUTD}/${OUT_NAME}

    if [[ ${OUT_NAME} =~ 2!{NAME1} ]]; then
        REF_GTF=!{REF_GTF1}
    else
        REF_GTF=!{REF_GTF2}
    fi

    gffcompare -r ${REF_GTF} -o ${ANNOT_PREFIX} ${GTF}

    '''
}


// Correct mapped gtf to have associated reference gene_id from gffcompare
process correctGeneID {
    input:
    tuple env(OUT_NAME), env(ANNOT_PREFIX) from gtfAnnot_ch

    output:
    tuple env(OUT_NAME), env(CORRECTED_GTF) into gtfCorrected_ch

    shell:
    '''

    module purge
    module load python/3.8

    CORRECTED_GTF=!{OUTD}/${OUT_NAME}_corrected_associated_gene.gtf
    ORIG_GTF=!{OUTD}/${OUT_NAME}_minimap2.gtf

    python !{SCRIPTS}/correct_gffcompare_GTF_gene_id_06avn.py \
        -a ${ANNOT_PREFIX}.annotated.gtf \
        -g ${ORIG_GTF} \
        -o ${CORRECTED_GTF} \
        --key !{OUTD}/${OUT_NAME}_gene_key.csv

    '''
}