#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 14:39:24 2022

@author: adalena
"""

import pandas as pd
import numpy as np
import argparse


def getOptions():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=(
            "Create a file that classifies transcript pairs between two species."
        )
    )
    # Input data
    parser.add_argument(
        "-i",
        "--input-directory",
        dest="inDir",
        required=True,
        help=(
            "Input directory that contains the following: "
            "TranD_consol_[species1]_vs_[species2]_2_[species1] and "
            "TranD_consol_[species1]_vs_[species2]_2_[species2] directories, "
            "as well as union_[genome1]_2_union_[genome2]_map.csv."
        )
    )
    parser.add_argument(
        "-k",
        "--key-file",
        dest="inKey",
        required=True,
        help=(
            "Input UJC key/map file between the two sets of coordinates: "
            "union_[genome1]_2_union_[genome2]_map.csv."
        )
    )
    parser.add_argument(
        "-s1",
        "--species1",
        dest="inName1",
        required=True,
        help="Input species 1 name."
    )
    parser.add_argument(
        "-s2",
        "--species2",
        dest="inName2",
        required=True,
        help="Input species 2 name."
    )
    parser.add_argument(
        "-g1",
        "--genome1",
        dest="inG1",
        required=True,
        help="Input genome 1 name."
    )
    parser.add_argument(
        "-g2",
        "--genome2",
        dest="inG2",
        required=True,
        help="Input genome 2 name."
    )
    # Output data
    parser.add_argument(
        "-o",
        "--output-directory",
        dest="outDir",
        required=True,
        help="Output directory."
    )
    args = parser.parse_args()
    return args


#### Functions ####

def split_column_by_sep(df, col_name=None, sep=None, sort_list=None):
    # Split a column on a separator like '|' or ',' into one row per element,
    # keeping all other column values the same
    if col_name is None:
        col_name = "transcript_id"
    if sep is None:
        sep = "|"
    splitList = df[col_name].str.split(sep, expand=True).stack()
    splitList.index = splitList.index.droplevel(-1)
    tempDF = df.copy()
    del tempDF[col_name]
    splitDF = tempDF.join(splitList.rename(col_name))
    if sort_list is not None:
        splitDF = splitDF.sort_values(by=sort_list)
    del tempDF, splitList
    return splitDF


def get_internal_nt_diff(td_df):
    # Count the number of internal (alternative donor/acceptor or IR) nt differences in ERM pairs:
    # take the number of nt unique to T1/T2 in shared exon regions and subtract the nt that come
    # from a 5'/3' end length difference, so that only internal nt differences are counted

    # Get difference of TSS
    # Conditions:
    # 1. ERM, positive strand, 5' end difference, T1 has the longer 5' end
    # 2. ERM, positive strand, 5' end difference, T2 has the longer 5' end
    # 3. ERM, negative strand, 5' end difference, T1 has the longer 5' end
    # 4. ERM, negative strand, 5' end difference, T2 has the longer 5' end
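    # Note on the fragment_* columns parsed below (inferred from this parsing logic, not from
    # TranD documentation): each value appears to be a "|"-separated list of fragments written
    # as chrom:start:end:strand and ordered left to right by genomic coordinate, e.g. a
    # hypothetical "2L:110:200:+|2L:300:400:+". On the plus strand the 5' end is therefore the
    # first fragment; on the minus strand it is the last fragment, which is why the [-1]
    # element is used in the "-" strand conditions.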
    tssConditions = [
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1]),
    ]
    tssChoices = [
        td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
    ]
    td_df["num_ERM_nt_diff_TSS"] = np.select(tssConditions, tssChoices, np.nan)

    # Get difference of TTS
    # Conditions:
    # 1. ERM, positive strand, 3' end difference, T1 has the longer 3' end
    # 2. ERM, positive strand, 3' end difference, T2 has the longer 3' end
    # 3. ERM, negative strand, 3' end difference, T1 has the longer 3' end
    # 4. ERM, negative strand, 3' end difference, T2 has the longer 3' end
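    # The TTS logic below mirrors the TSS logic above with the ends swapped: on the plus strand
    # the 3' end is the last fragment, on the minus strand it is the first fragment.
    # np.select() keeps the first condition that matches.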
    ttsConditions = [
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2]),
    ]
    ttsChoices = [
        td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
    ]
    td_df["num_ERM_nt_diff_TTS"] = np.select(ttsConditions, ttsChoices, np.nan)

    # Get donor/acceptor/IR length difference
    td_df["num_ERM_nt_diff_internal"] = np.where(
        td_df["prop_ER_diff"] == 0,
        td_df["num_nt_diff"].astype(int)
        - td_df["num_ERM_nt_diff_TSS"].fillna(0)
        - td_df["num_ERM_nt_diff_TTS"].fillna(0),
        np.nan
    )
    return td_df
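
# Illustrative sketch only (not called anywhere in this script): a minimal example of
# get_internal_nt_diff() on one hand-made ERM pair, assuming the chrom:start:end:strand
# fragment format noted above with half-open coordinates (length = end - start). The helper
# name and the coordinates are hypothetical. T1 has 10 extra 5' nt and T2 has 20 extra 3' nt,
# so all 30 differing nt are end differences and the internal difference works out to 0.
def _example_internal_nt_diff():
    exampleDf = pd.DataFrame({
        "prop_ER_diff": [0],
        "flag_5_variation": [1],
        "flag_3_variation": [1],
        "fragment_shared": ["2L:110:200:+|2L:300:400:+"],
        "fragment_T1_only": ["2L:100:110:+"],
        "fragment_T2_only": ["2L:400:420:+"],
        "num_nt_diff": [30],
    })
    exampleDf = get_internal_nt_diff(exampleDf)
    # Expected: num_ERM_nt_diff_TSS = 10, num_ERM_nt_diff_TTS = 20, num_ERM_nt_diff_internal = 0
    return exampleDf[["num_ERM_nt_diff_TSS", "num_ERM_nt_diff_TTS", "num_ERM_nt_diff_internal"]]
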

def prep_coord_TO(ref, species1, species2, g1, g2, indir, keyFile):
    # Get TranD species1 vs species2 distance output
    # Get internal (alt. donor/acceptor and IR) nt difference
    species1v2Df = get_internal_nt_diff(
        pd.read_csv(
            "{}/TranD_consol_{}_vs_{}_2_{}/{}_vs_{}_pairwise_transcript_distance.csv".format(
                indir, species1, species2, ref, species1, species2
            ),
            low_memory=False
        )
    )
    if ref == species1:
        name = g1
    else:
        name = g2

    # Get reciprocal minimum matches
    species1v2RecipDf = species1v2Df[
        species1v2Df["flag_recip_min_match"] == 1
    ][
        [
            "gene_id",
            "transcript_1",
            "transcript_2",
            "flag_FSM",
            "flag_ERM_noIR_recip_min_match",
            "num_transcript_in_gene_{}".format(species1),
            "num_transcript_in_gene_{}".format(species2),
            "num_nt_shared",
            "num_nt_diff",
            "total_nt",
            "prop_nt_diff",
            "prop_nt_similar",
            "num_ERM_nt_diff_internal",
            "num_ER_T1_only",
            "num_ER_T2_only",
            "num_ER_shared"
        ]
    ].copy().rename(columns={
        "gene_id": "gene_id_" + name,
        "flag_FSM": "flag_FSM_" + name,
        "flag_ERM_noIR_recip_min_match": "flag_ERM_noIR_" + name,
        "num_transcript_in_gene_" + species1: "num_transcript_in_gene_{}_{}".format(species1, name),
        "num_transcript_in_gene_" + species2: "num_transcript_in_gene_{}_{}".format(species2, name),
        "num_nt_shared": "num_nt_shared_" + name,
        "num_nt_diff": "num_nt_diff_" + name,
        "total_nt": "total_nt_" + name,
        "prop_nt_diff": "prop_nt_diff_" + name,
        "prop_nt_similar": "prop_nt_similar_" + name,
        "num_ERM_nt_diff_internal": "num_ERM_nt_diff_internal_" + name
    })
    species1v2RecipDf["flag_RMP_" + name] = 1
    species1v2RecipDf[species1 + "_UJC_id_" + name] = species1v2RecipDf["transcript_1"].str[:-4]
    species1v2RecipDf[species2 + "_UJC_id_" + name] = species1v2RecipDf["transcript_2"].str[:-4]
    # species1v2RecipDf["num_ER_T1"] = species1v2RecipDf["num_ER_T1_only"] + species1v2RecipDf["num_ER_shared"]
    # species1v2RecipDf["num_ER_T2"] = species1v2RecipDf["num_ER_T2_only"] + species1v2RecipDf["num_ER_shared"]
    # xcrptNumCount = pd.crosstab(species1v2RecipDf["num_ER_T1"], species1v2RecipDf["num_ER_T2"])
    # species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"].value_counts()
    # species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"].value_counts()
    # xcrptNumCountGene = pd.crosstab(species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"], species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"])
    # monoexonDf = species1v2RecipDf[(species1v2RecipDf["num_ER_T1"]==1)&(species1v2RecipDf["num_ER_T2"]==1)]
    # monoXcrptNumCountGene = pd.crosstab(monoexonDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"], monoexonDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"])

    # Get union reference key file variables for the given reference
    unionDf = keyFile[~keyFile["gene_id_" + name].isna()][
        [c for c in keyFile.columns if name in c or "_transcript_id" in c]
    ].copy()
    unionDf["flag_in_" + species1 + "_only_annot"] = np.where(
        (~unionDf[species1 + "_transcript_id"].isna())
        & (unionDf[species2 + "_transcript_id"].isna()),
        1,
        0
    )
    unionDf["flag_in_" + species2 + "_only_annot"] = np.where(
        (~unionDf[species2 + "_transcript_id"].isna())
        & (unionDf[species1 + "_transcript_id"].isna()),
        1,
        0
    )
    unionDf["flag_in_both_annot"] = np.where(
        (~unionDf[species2 + "_transcript_id"].isna())
        & (~unionDf[species1 + "_transcript_id"].isna()),
        1,
        0
    )
    # Check that the flags all add up
    if len(unionDf) != (
        unionDf["flag_in_" + species1 + "_only_annot"].sum()
        + unionDf["flag_in_" + species2 + "_only_annot"].sum()
        + unionDf["flag_in_both_annot"].sum()
    ):
        print("WARNING: UJC found that does not fit into logical group.")
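    # The next step collapses the key file to one row per (gene, union UJC): transcript_id
    # values are joined into "|"-separated lists (e.g. two hypothetical species1 transcripts
    # tr_A and tr_B sharing a union UJC become "tr_A|tr_B"), UJC ids are reduced to their
    # unique values, and the flag_in_* columns are reduced with max()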
    # Make unique on union_UJC_id and pair of species1_UJC_id and species2_UJC_id
    # Make piped lists of species1 and species2 transcript_id values
    unionDf = unionDf.sort_values([species1 + "_transcript_id", species2 + "_transcript_id"])
    unionListDf = unionDf.fillna("").groupby(["gene_id_" + name, "union_UJC_id_" + name]).agg({
        species1 + "_transcript_id": lambda x: "|".join([element for element in x if element != ""]),
        species2 + "_transcript_id": lambda x: "|".join([element for element in x if element != ""]),
        species1 + "_UJC_id_" + name: lambda x: "|".join([element for element in x.unique() if element != ""]),
        species2 + "_UJC_id_" + name: lambda x: "|".join([element for element in x.unique() if element != ""]),
        "flag_in_" + species1 + "_only_annot": max,
        "flag_in_" + species2 + "_only_annot": max,
        "flag_in_both_annot": max
    }).reset_index()
    unionListDf[species1 + "_UJC_id_" + name] = np.where(
        unionListDf[species1 + "_UJC_id_" + name] == "",
        np.nan,
        unionListDf[species1 + "_UJC_id_" + name]
    )
    unionListDf[species2 + "_UJC_id_" + name] = np.where(
        unionListDf[species2 + "_UJC_id_" + name] == "",
        np.nan,
        unionListDf[species2 + "_UJC_id_" + name]
    )
    if len(unionListDf.fillna("")[unionListDf.fillna("")[species1 + "_UJC_id_" + name].str.contains(r"\|")]) > 0:
        print("WARNING: There is a union UJC that covers more than one {} UJC -\n{}".format(
            species1,
            unionListDf.fillna("")[unionListDf.fillna("")[species1 + "_UJC_id_" + name].str.contains(r"\|")].to_string(index=False)
        ))
    if len(unionListDf.fillna("")[unionListDf.fillna("")[species2 + "_UJC_id_" + name].str.contains(r"\|")]) > 0:
        print("WARNING: There is a union UJC that covers more than one {} UJC -\n{}".format(
            species2,
            unionListDf.fillna("")[unionListDf.fillna("")[species2 + "_UJC_id_" + name].str.contains(r"\|")].to_string(index=False)
        ))

    # Merge RMP with union_UJC_id found in both species
    unionRMP = pd.merge(
        unionListDf[unionListDf["flag_in_both_annot"] == 1],
        species1v2RecipDf.drop(columns=["transcript_1", "transcript_2"]),
        how="outer",
        on=["gene_id_" + name, species1 + "_UJC_id_" + name, species2 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMP["merge_check"].value_counts()["left_only"] > 0:
        print("WARNING: There are pairs of UJC_id that are not found in the RMP file:\n{}".format(
            unionRMP[unionRMP["merge_check"] == "left_only"].to_csv(index=False)
        ))
    pairUnionRMP = unionRMP[unionRMP["merge_check"] == "both"]
    noPairUnionRMP = unionRMP[unionRMP["merge_check"] == "right_only"][
        [c for c in species1v2RecipDf.columns if c not in ["transcript_1", "transcript_2"]]
    ]

    # Merge the RMP without a union UJC pair with the union UJC that are only in the species1
    # annotation, and then with those that are only in the species2 annotation
    unionRMPspecies1 = pd.merge(
        unionListDf[unionListDf["flag_in_" + species1 + "_only_annot"] == 1].drop(
            columns=[
                species2 + "_UJC_id_" + name,
                species2 + "_transcript_id",
                "flag_in_" + species2 + "_only_annot",
                "flag_in_both_annot"
            ]
        ),
        noPairUnionRMP,
        how="outer",
        on=["gene_id_" + name, species1 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMPspecies1["merge_check"].value_counts()["right_only"] > 0:
        print("WARNING: There are UJC_id for species1 in the RMP file that are not found in the UJC file:\n{}".format(
            unionRMPspecies1[unionRMPspecies1["merge_check"] == "right_only"].to_csv(index=False)
        ))
    # Split the species1-only merge into union UJC with an RMP partner ("both") and those
    # without ("left_only"); the "both" rows are then matched to the species2-only union UJC
    unionRMPspecies1Present = unionRMPspecies1[unionRMPspecies1["merge_check"] == "both"].drop(columns=["merge_check"])
    unionRMPspecies1No = unionRMPspecies1[unionRMPspecies1["merge_check"] == "left_only"].drop(columns=["merge_check"])
    unionRMPspecies2 = pd.merge(
        unionListDf[unionListDf["flag_in_" + species2 + "_only_annot"] == 1].drop(
            columns=[
                species1 + "_UJC_id_" + name,
                species1 + "_transcript_id",
                "flag_in_" + species1 + "_only_annot"
            ]
        ).rename(columns={"union_UJC_id_" + name: species2 + "_union_UJC_id_" + name}),
        unionRMPspecies1Present.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name}),
        how="outer",
        on=["gene_id_" + name, species2 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMPspecies2["merge_check"].value_counts()["right_only"] > 0:
        print("WARNING: There are UJC_id for species2 in the RMP file that are not found in the UJC file:\n{}".format(
            unionRMPspecies2[unionRMPspecies2["merge_check"] == "right_only"].to_csv(index=False)
        ))

    # Combine merged RMP union UJC pairs, merged individual species RMP UJC pairs,
    # and individual species unmerged UJC
    pairUnionRMP = pairUnionRMP.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name})
    pairUnionRMP[species2 + "_union_UJC_id_" + name] = pairUnionRMP[species1 + "_union_UJC_id_" + name]
    unionRMPspecies1No = unionRMPspecies1No.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name})
    unionRMP = pd.concat([pairUnionRMP, unionRMPspecies1No, unionRMPspecies2], ignore_index=True).drop(columns=["merge_check"])

    # Fill in na values where necessary
    unionRMP[[c for c in unionRMP.columns if "flag_" in c]] = unionRMP[[c for c in unionRMP.columns if "flag_" in c]].fillna(0)
    unionRMP["check_num_UJC_in_gene_{}_{}".format(species1, name)] = unionRMP.groupby("gene_id_" + name)[species1 + "_union_UJC_id_" + name].transform('count')
    unionRMP["check_num_UJC_in_gene_{}_{}".format(species2, name)] = unionRMP.groupby("gene_id_" + name)[species2 + "_union_UJC_id_" + name].transform('count')
    unionRMP["num_UJC_in_gene_{}_{}".format(species1, name)] = np.where(
        unionRMP["num_transcript_in_gene_{}_{}".format(species1, name)].isna(),
        unionRMP["check_num_UJC_in_gene_{}_{}".format(species1, name)],
        unionRMP["num_transcript_in_gene_{}_{}".format(species1, name)]
    )
    unionRMP["num_UJC_in_gene_{}_{}".format(species2, name)] = np.where(
        unionRMP["num_transcript_in_gene_{}_{}".format(species2, name)].isna(),
        unionRMP["check_num_UJC_in_gene_{}_{}".format(species2, name)],
        unionRMP["num_transcript_in_gene_{}_{}".format(species2, name)]
    )
    unionRMP = unionRMP.drop(columns=[
        "check_num_UJC_in_gene_{}_{}".format(species1, name),
        "num_transcript_in_gene_{}_{}".format(species1, name),
        "check_num_UJC_in_gene_{}_{}".format(species2, name),
        "num_transcript_in_gene_{}_{}".format(species2, name)
    ])

    # Add variable that is FSM, ERS, RMP (recip min that is not FSM/ERS), NRM (no reciprocal minimum match)
    compConditions = [
        unionRMP["flag_FSM_" + name] == 1,
        unionRMP["flag_ERM_noIR_" + name] == 1,
        unionRMP["flag_RMP_" + name] == 1,
        unionRMP["flag_RMP_" + name] == 0,
    ]
    compChoices = [
        "FSM",
        "ERS",
        "RMP",
        "NRM"
    ]
    unionRMP["comparison_type"] = np.select(compConditions, compChoices, "oops")
    return unionRMP
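
# Illustrative sketch only (not called anywhere in this script): how the combined pair
# classification built in main() below resolves the two per-coordinate classifications,
# assuming the same priority ranking (TO > FSM > ERS > RMP > NRM) and keeping the lower-ranked
# (less similar) call when the two coordinate systems disagree. The helper name and the
# genomeA/genomeB column names are hypothetical stand-ins for the comparison_type_[genome]
# columns, and the flag_putative_TO condition used in main() is omitted here for brevity.
def _example_combined_pair_classification():
    pairClassDict = {"TO": 4, "FSM": 3, "ERS": 2, "RMP": 1, "NRM": 0}
    demoDf = pd.DataFrame({
        "comparison_type_genomeA": ["FSM", "FSM", "ERS", np.nan],
        "comparison_type_genomeB": ["FSM", "ERS", "FSM", "RMP"],
    })
    rankA = demoDf["comparison_type_genomeA"].map(pairClassDict)
    rankB = demoDf["comparison_type_genomeB"].map(pairClassDict)
    demoDf["combine_pair_classification"] = np.select(
        [
            demoDf["comparison_type_genomeA"] == demoDf["comparison_type_genomeB"],
            rankA < rankB,
            rankA > rankB,
            demoDf["comparison_type_genomeA"].isna() | demoDf["comparison_type_genomeB"].isna(),
        ],
        [
            demoDf["comparison_type_genomeA"],
            demoDf["comparison_type_genomeA"],
            demoDf["comparison_type_genomeB"],
            "NRM",
        ],
        "oops"
    )
    # Expected classifications for the four rows: FSM, ERS, ERS, NRM
    return demoDf
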

def main():
    # Get species names
    species1 = args.inName1
    species2 = args.inName2
    g1 = args.inG1
    g2 = args.inG2
    # species1 = "mel"
    # species2 = "sim"
    # g1 = "mFB617"
    # g2 = "sFB202"

    # Set directories
    outdir = args.outDir
    # outdir = "/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/supp_files"
    # outdir = "/Volumes/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison"
    indir = args.inDir
    # indir = "/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/analysis_output/dros_mel_vs_sim_ref"
    # indir = "/Volumes/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison"

    # Get UJC key file map for UJC on both sets of coordinates
    keyFile = pd.read_csv(args.inKey)
    # keyFile = pd.read_csv("/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/supp_files/union_mFB617_2_union_sFB202_map.csv")

    ### Make putative transcript ortholog file
    # Get FSM/ERM lookup tables for each set of coordinates
    species1UnionRecipDf = prep_coord_TO(species1, species1, species2, g1, g2, indir, keyFile)
    species2UnionRecipDf = prep_coord_TO(species2, species1, species2, g1, g2, indir, keyFile)

    # Merge flags and variables from each set of coordinates
    bothCoordDf = pd.merge(
        species1UnionRecipDf,
        species2UnionRecipDf,
        how="outer",
        on=[species1 + "_transcript_id", species2 + "_transcript_id"],
        suffixes=["_" + g1, "_" + g2],
        indicator="merge_check",
        validate="1:1"
    )

    # Flag putative transcript orthologs that are at least a RMP and ERM with no IR
    # on both sets of coordinates, with < 15 internal nt difference on each
    bothCoordDf["flag_putative_TO"] = np.where(
        (bothCoordDf["flag_RMP_" + g1] + bothCoordDf["flag_RMP_" + g2] == 2)
        & (bothCoordDf["flag_ERM_noIR_" + g1] + bothCoordDf["flag_ERM_noIR_" + g2] == 2)
        & (bothCoordDf["num_ERM_nt_diff_internal_" + g1] < 15)
        & (bothCoordDf["num_ERM_nt_diff_internal_" + g2] < 15),
        1,
        0
    )

    # Get combined pair classification
    pairClassDict = {"TO": 4, "FSM": 3, "ERS": 2, "RMP": 1, "NRM": 0}
    pairClassConditions = [
        bothCoordDf["flag_putative_TO"] == 1,
        bothCoordDf["comparison_type_" + g1] == bothCoordDf["comparison_type_" + g2],
        bothCoordDf["comparison_type_" + g1].map(pairClassDict) < bothCoordDf["comparison_type_" + g2].map(pairClassDict),
        bothCoordDf["comparison_type_" + g1].map(pairClassDict) > bothCoordDf["comparison_type_" + g2].map(pairClassDict),
        (bothCoordDf["comparison_type_" + g1].isna()) | (bothCoordDf["comparison_type_" + g2].isna())
    ]
    pairClassChoices = [
        "TO",
        bothCoordDf["comparison_type_" + g1],
        bothCoordDf["comparison_type_" + g1],
        bothCoordDf["comparison_type_" + g2],
        "NRM"
    ]
    bothCoordDf["combine_pair_classification"] = np.select(pairClassConditions, pairClassChoices, "oops")
    # bothCoordDf["combine_pair_classification"].value_counts()
    # NRM    21604
    # TO     14761
    # RMP     2635
    # ERS      461

    # Set subclassification (kept in its own column so the pair classification above is not overwritten)
    pairSubClassConditions = [
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "FSM")
        & (bothCoordDf["comparison_type_" + g2] == "FSM"),
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "FSM")
        & (bothCoordDf["comparison_type_" + g2] == "ERS"),
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "ERS")
        & (bothCoordDf["comparison_type_" + g2] == "FSM"),
        (bothCoordDf["combine_pair_classification"] == "ERS"),
        (bothCoordDf["combine_pair_classification"] == "RMP"),
        (bothCoordDf["combine_pair_classification"] == "NRM"),
    ]
    pairSubClassChoices = [
        "TO FSM on both coordinates",
        "TO FSM on " + g1 + ", ERS on " + g2,
        "TO FSM on " + g2 + ", ERS on " + g1,
        "",
        "",
        ""
    ]
    bothCoordDf["combine_pair_subclassification"] = np.select(pairSubClassConditions, pairSubClassChoices, "oops")

    # Output TO file
    bothCoordDf.drop(columns=["merge_check"]).to_csv(
        "{}/{}_{}_{}_{}_transcript_map.csv".format(outdir, species1, g1, species2, g2),
        index=False
    )


if __name__ == '__main__':
    # Parse command line arguments
    args = getOptions()
    main()