#!/usr/bin/env nextflow

/*
*    Nextflow script for getting reference transcript FASTA sequences.
*    To generate an execution report, run with:  -with-report file-name
*    To set the Java temporary directory, export this before running:
*    export _JAVA_OPTIONS=-Djava.io.tmpdir=/blue/mcintyre/share/transcript_distance/dros_analysis/ROZ_NF
*/

// Build the start-up banner listing every configured input and output path,
// strip the common leading indentation, then print it.
def runSummary = """\
    DIRECTORIES AND INPUT FILES
    input reference genome FASTA 1:	${REF_FA1}
    input reference annotation GTF 1:	${REF_GTF1}
    input reference genome FASTA 2:     ${REF_FA2}
    input reference annotation GTF 2:	${REF_GTF2}
    input gene list to exclude 1:	${EXCLUDE_GENE1}
    input gene list to exclude 2:	${EXCLUDE_GENE2}
    output transcript FASTA 1:		${XCRPT_FA1}
    output transcript FASTA 2:		${XCRPT_FA2}
    """.stripIndent()

println runSummary


// Possible reference genome sequences and annotations, one tuple per reference.
// Tuple layout: [ genome FASTA, annotation GTF, gene list to exclude, output transcript FASTA ]
ref_FA_GTF_excl_out = Channel.of(
    [ "${REF_FA1}", "${REF_GTF1}", "${EXCLUDE_GENE1}", "${XCRPT_FA1}" ],
    [ "${REF_FA2}", "${REF_GTF2}", "${EXCLUDE_GENE2}", "${XCRPT_FA2}" ]
)

// Get transcript FASTA sequences given reference FASTA and GTF.
//
// Each tuple taken from ref_FA_GTF_excl_out is bound to the environment variables
// FA, GTF, EXCLUDE_GENE and XCRPT_FA of the task script:
//   FA            reference genome FASTA passed to gffread -g
//   GTF           reference annotation GTF
//   EXCLUDE_GENE  gene list to exclude (may be empty or point to a missing/empty file)
//   XCRPT_FA      transcript FASTA written by gffread -w
//
// When EXCLUDE_GENE names a non-empty regular file, the GTF is first passed through
// subset_gtf.py with that list and gffread runs on the resulting GTF instead.
process getTranscriptFA {

    scratch true

    input:
    tuple env(FA), env(GTF), env(EXCLUDE_GENE), env(XCRPT_FA) from ref_FA_GTF_excl_out

    shell:
    '''

    module purge
    module load gffread/0.12.7
    module load python/3.8

    # Subset the GTF only when an exclusion list was supplied and it is a
    # regular file with content. The -s test replaces a line count compared
    # with a string operator: it also accepts a single entry that lacks a
    # trailing newline, and -f rejects directories.
    if [[ -n "${EXCLUDE_GENE}" && -f "${EXCLUDE_GENE}" && -s "${EXCLUDE_GENE}" ]]; then
        echo "Excluding genes in list provided from sequences extraction"
        python "!{SCRIPTS}/subset_gtf.py" \
            -g "${GTF}" \
            -t gene_id \
            -e "${EXCLUDE_GENE}" \
            -o "${TMPDIR}/subset_gene.gtf"
        # Point gffread at the subset annotation from here on.
        GTF="${TMPDIR}/subset_gene.gtf"
    fi

    # All paths are quoted so that spaces or glob characters do not split them.
    gffread --w-nocds -w "${XCRPT_FA}" -g "${FA}" "${GTF}"

    '''
}