#!/usr/bin/env nextflow

/*
 * Nextflow script for getting reference transcript FASTA sequences.
 *
 * To generate an execution report, run with: -with-report file-name
 *
 * To redirect the JVM temporary directory, export before launching, e.g.:
 *   export _JAVA_OPTIONS=-Djava.io.tmpdir=/blue/mcintyre/share/transcript_distance/dros_analysis/ROZ_NF
 */

// Print the input/output paths this run was launched with.
println """\
    DIRECTORIES AND INPUT FILES
    input reference genome FASTA 1:      ${REF_FA1}
    input reference annotation GTF 1:    ${REF_GTF1}
    input reference genome FASTA 2:      ${REF_FA2}
    input reference annotation GTF 2:    ${REF_GTF2}
    input gene list to exclude 1:        ${EXCLUDE_GENE1}
    input gene list to exclude 2:        ${EXCLUDE_GENE2}
    output transcript FASTA 1:           ${XCRPT_FA1}
    output transcript FASTA 2:           ${XCRPT_FA2}
    """
    .stripIndent()

// These are the possible reference genome sequences and annotations.
// One tuple per reference, in the order:
//   [ genome FASTA, annotation GTF, gene list to exclude, output transcript FASTA ]
ref_FA_GTF_excl_out = Channel.of(
    [ "${REF_FA1}", "${REF_GTF1}", "${EXCLUDE_GENE1}", "${XCRPT_FA1}" ],
    [ "${REF_FA2}", "${REF_GTF2}", "${EXCLUDE_GENE2}", "${XCRPT_FA2}" ]
)

// Get transcript FASTA sequences given reference FASTA and GTF.
//
// Runs once per tuple emitted by ref_FA_GTF_excl_out. Each tuple element is
// exposed to the task as an environment variable (FA, GTF, EXCLUDE_GENE,
// XCRPT_FA). When EXCLUDE_GENE names a non-empty file, the GTF is first passed
// through subset_gtf.py and the resulting GTF is used for extraction instead
// of the original one. gffread then writes the transcript FASTA to XCRPT_FA.
process getTranscriptFA {

    scratch true

    input:
    tuple env(FA), env(GTF), env(EXCLUDE_GENE), env(XCRPT_FA) from ref_FA_GTF_excl_out

    shell:
    '''
    module purge
    module load gffread/0.12.7
    module load python/3.8

    if [[ -n "${EXCLUDE_GENE}" ]]; then
        # -s is true only for an existing, non-empty file. Counting lines with
        # wc -l would report 0 for a single entry lacking a trailing newline
        # and the exclusion would be skipped without notice.
        if [[ -s "${EXCLUDE_GENE}" ]]; then
            echo "Excluding genes in list provided from sequences extraction"

            # Private temporary directory: both references may be processed at
            # the same time, so a fixed file name in a shared TMPDIR could be
            # overwritten by the other task. Removed when the task exits.
            subset_dir=$(mktemp -d)
            trap 'rm -rf -- "${subset_dir}"' EXIT

            # NOTE(review): subset_gtf.py is not visible here; the flags are
            # presumably -g input GTF, -t attribute to match, -e exclusion
            # list, -o output GTF. Confirm against the script.
            python !{SCRIPTS}/subset_gtf.py \
                -g "${GTF}" \
                -t gene_id \
                -e "${EXCLUDE_GENE}" \
                -o "${subset_dir}/subset_gene.gtf"

            # Extract sequences from the subset annotation from here on.
            GTF="${subset_dir}/subset_gene.gtf"
        else
            echo "WARNING: gene exclusion list ${EXCLUDE_GENE} is missing or empty; no genes excluded" >&2
        fi
    fi

    # -w writes the transcript FASTA; --w-nocds is passed alongside it as in
    # the original command line.
    gffread --w-nocds -w "${XCRPT_FA}" -g "${FA}" "${GTF}"
    '''
}