#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 14:39:24 2022

@author: adalena
"""

import pandas as pd
import numpy as np
import argparse


def getOptions():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=(
            "Create a file that classifies transcript pairs between two species."
        )
    )
    # Input data
    parser.add_argument(
        "-i",
        "--input-directory",
        dest="inDir",
        required=True,
        help=(
            "Input directory that contains the following: "
            "TranD_consol_[species1]_vs_[species2]_2_[species1] and "
            "TranD_consol_[species1]_vs_[species2]_2_[species2] directories, "
            "as well as union_[genome1]_2_union_[genome2]_map.csv."
        )
    )
    parser.add_argument(
        "-k",
        "--key-file",
        dest="inKey",
        required=True,
        help=(
            "Input UJC key/map file between the two sets of coordinates: "
            "union_[genome1]_2_union_[genome2]_map.csv."
        )
    )
    parser.add_argument(
        "-s1",
        "--species1",
        dest="inName1",
        required=True,
        help="Input species 1 name."
    )
    parser.add_argument(
        "-s2",
        "--species2",
        dest="inName2",
        required=True,
        help="Input species 2 name."
    )
    parser.add_argument(
        "-g1",
        "--genome1",
        dest="inG1",
        required=True,
        help="Input genome 1 name."
    )
    parser.add_argument(
        "-g2",
        "--genome2",
        dest="inG2",
        required=True,
        help="Input genome 2 name."
    )
    # Output data
    parser.add_argument(
        "-o",
        "--output-directory",
        dest="outDir",
        required=True,
        help="Output directory."
    )
    args = parser.parse_args()
    return args


#### Functions ####

def split_column_by_sep(df, col_name=None, sep=None, sort_list=None):
    # Split a column on a separator like '|' or ',' into one row per element,
    # keeping all other column values the same
    if col_name is None:
        col_name = "transcript_id"
    if sep is None:
        sep = "|"
    splitList = df[col_name].str.split(sep, expand=True).stack()
    splitList.index = splitList.index.droplevel(-1)
    tempDF = df.copy()
    del tempDF[col_name]
    splitDF = tempDF.join(splitList.rename(col_name))
    if sort_list is not None:
        splitDF = splitDF.sort_values(by=sort_list)
    del tempDF, splitList
    return splitDF


def get_internal_nt_diff(td_df):
    # Count the number of internal (alternative donor/acceptor or IR) nt differences in ERM pairs:
    # take the number of nt unique to T1/T2 in shared exon regions and subtract the nt that come
    # from a 5'/3' end length difference, so that only internal nt differences are counted

    # Get difference of TSS
    # Conditions:
    # 1. ERM, positive strand, 5' end difference, T1 has the longer 5' end
    # 2. ERM, positive strand, 5' end difference, T2 has the longer 5' end
    # 3. ERM, negative strand, 5' end difference, T1 has the longer 5' end
    # 4. ERM, negative strand, 5' end difference, T2 has the longer 5' end
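    # Note on the fragment_* columns parsed below (inferred from this parsing logic, not from
    # TranD documentation): each value appears to be a "|"-separated list of fragments written
    # as chrom:start:end:strand and ordered left to right by genomic coordinate, e.g. a
    # hypothetical "2L:110:200:+|2L:300:400:+". On the plus strand the 5' end is therefore the
    # first fragment; on the minus strand it is the last fragment, which is why the [-1]
    # element is used in the "-" strand conditions.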
    tssConditions = [
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_5_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1]),
    ]
    tssChoices = [
        td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
    ]
    td_df["num_ERM_nt_diff_TSS"] = np.select(tssConditions, tssChoices, np.nan)

    # Get difference of TTS
    # Conditions:
    # 1. ERM, positive strand, 3' end difference, T1 has the longer 3' end
    # 2. ERM, positive strand, 3' end difference, T2 has the longer 3' end
    # 3. ERM, negative strand, 3' end difference, T1 has the longer 3' end
    # 4. ERM, negative strand, 3' end difference, T2 has the longer 3' end
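    # The TTS logic below mirrors the TSS logic above with the ends swapped: on the plus strand
    # the 3' end is the last fragment, on the minus strand it is the first fragment.
    # np.select() keeps the first condition that matches.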
    ttsConditions = [
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "+")
        & (td_df["fragment_shared"].str.split("|").str[-1].str.split(":").str[2]
           == td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2]),
        (td_df["prop_ER_diff"] == 0)
        & (td_df["flag_3_variation"] == 1)
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[3] == "-")
        & (td_df["fragment_shared"].str.split("|").str[0].str.split(":").str[1]
           == td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2]),
    ]
    ttsChoices = [
        td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[-1].str.split(":").str[1].astype(float),
        td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T1_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
        td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[2].astype(float)
        - td_df["fragment_T2_only"].str.split("|").str[0].str.split(":").str[1].astype(float),
    ]
    td_df["num_ERM_nt_diff_TTS"] = np.select(ttsConditions, ttsChoices, np.nan)

    # Get donor/acceptor/IR length difference
    td_df["num_ERM_nt_diff_internal"] = np.where(
        td_df["prop_ER_diff"] == 0,
        td_df["num_nt_diff"].astype(int)
        - td_df["num_ERM_nt_diff_TSS"].fillna(0)
        - td_df["num_ERM_nt_diff_TTS"].fillna(0),
        np.nan
    )
    return td_df
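
# Illustrative sketch only (not called anywhere in this script): a minimal example of
# get_internal_nt_diff() on one hand-made ERM pair, assuming the chrom:start:end:strand
# fragment format noted above with half-open coordinates (length = end - start). The helper
# name and the coordinates are hypothetical. T1 has 10 extra 5' nt and T2 has 20 extra 3' nt,
# so all 30 differing nt are end differences and the internal difference works out to 0.
def _example_internal_nt_diff():
    exampleDf = pd.DataFrame({
        "prop_ER_diff": [0],
        "flag_5_variation": [1],
        "flag_3_variation": [1],
        "fragment_shared": ["2L:110:200:+|2L:300:400:+"],
        "fragment_T1_only": ["2L:100:110:+"],
        "fragment_T2_only": ["2L:400:420:+"],
        "num_nt_diff": [30],
    })
    exampleDf = get_internal_nt_diff(exampleDf)
    # Expected: num_ERM_nt_diff_TSS = 10, num_ERM_nt_diff_TTS = 20, num_ERM_nt_diff_internal = 0
    return exampleDf[["num_ERM_nt_diff_TSS", "num_ERM_nt_diff_TTS", "num_ERM_nt_diff_internal"]]
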

def prep_coord_TO(ref, species1, species2, g1, g2, indir, keyFile):
    # Get TranD species1 vs species2 distance output
    # Get internal (alt. donor/acceptor and IR) nt difference
    species1v2Df = get_internal_nt_diff(
        pd.read_csv(
            "{}/TranD_consol_{}_vs_{}_2_{}/{}_vs_{}_pairwise_transcript_distance.csv".format(
                indir, species1, species2, ref, species1, species2
            ),
            low_memory=False
        )
    )
    if ref == species1:
        name = g1
    else:
        name = g2

    # Get reciprocal minimum matches
    species1v2RecipDf = species1v2Df[
        species1v2Df["flag_recip_min_match"] == 1
    ][
        [
            "gene_id",
            "transcript_1",
            "transcript_2",
            "flag_FSM",
            "flag_ERM_noIR_recip_min_match",
            "num_transcript_in_gene_{}".format(species1),
            "num_transcript_in_gene_{}".format(species2),
            "num_nt_shared",
            "num_nt_diff",
            "total_nt",
            "prop_nt_diff",
            "prop_nt_similar",
            "num_ERM_nt_diff_internal",
            "num_ER_T1_only",
            "num_ER_T2_only",
            "num_ER_shared"
        ]
    ].copy().rename(columns={
        "gene_id": "gene_id_" + name,
        "flag_FSM": "flag_FSM_" + name,
        "flag_ERM_noIR_recip_min_match": "flag_ERM_noIR_" + name,
        "num_transcript_in_gene_" + species1: "num_transcript_in_gene_{}_{}".format(species1, name),
        "num_transcript_in_gene_" + species2: "num_transcript_in_gene_{}_{}".format(species2, name),
        "num_nt_shared": "num_nt_shared_" + name,
        "num_nt_diff": "num_nt_diff_" + name,
        "total_nt": "total_nt_" + name,
        "prop_nt_diff": "prop_nt_diff_" + name,
        "prop_nt_similar": "prop_nt_similar_" + name,
        "num_ERM_nt_diff_internal": "num_ERM_nt_diff_internal_" + name
    })
    species1v2RecipDf["flag_RMP_" + name] = 1
    species1v2RecipDf[species1 + "_UJC_id_" + name] = species1v2RecipDf["transcript_1"].str[:-4]
    species1v2RecipDf[species2 + "_UJC_id_" + name] = species1v2RecipDf["transcript_2"].str[:-4]
    # species1v2RecipDf["num_ER_T1"] = species1v2RecipDf["num_ER_T1_only"] + species1v2RecipDf["num_ER_shared"]
    # species1v2RecipDf["num_ER_T2"] = species1v2RecipDf["num_ER_T2_only"] + species1v2RecipDf["num_ER_shared"]
    # xcrptNumCount = pd.crosstab(species1v2RecipDf["num_ER_T1"], species1v2RecipDf["num_ER_T2"])
    # species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"].value_counts()
    # species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"].value_counts()
    # xcrptNumCountGene = pd.crosstab(species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"], species1v2RecipDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"])
    # monoexonDf = species1v2RecipDf[(species1v2RecipDf["num_ER_T1"]==1)&(species1v2RecipDf["num_ER_T2"]==1)]
    # monoXcrptNumCountGene = pd.crosstab(monoexonDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_mel_mFB617"], monoexonDf[["gene_id_mFB617", "num_transcript_in_gene_mel_mFB617", "num_transcript_in_gene_sim_mFB617"]].drop_duplicates()["num_transcript_in_gene_sim_mFB617"])

    # Get union reference key file variables for the given reference
    unionDf = keyFile[~keyFile["gene_id_" + name].isna()][
        [c for c in keyFile.columns if name in c or "_transcript_id" in c]
    ].copy()
    unionDf["flag_in_" + species1 + "_only_annot"] = np.where(
        (~unionDf[species1 + "_transcript_id"].isna())
        & (unionDf[species2 + "_transcript_id"].isna()),
        1,
        0
    )
    unionDf["flag_in_" + species2 + "_only_annot"] = np.where(
        (~unionDf[species2 + "_transcript_id"].isna())
        & (unionDf[species1 + "_transcript_id"].isna()),
        1,
        0
    )
    unionDf["flag_in_both_annot"] = np.where(
        (~unionDf[species2 + "_transcript_id"].isna())
        & (~unionDf[species1 + "_transcript_id"].isna()),
        1,
        0
    )
    # Check that the flags all add up
    if len(unionDf) != (
        unionDf["flag_in_" + species1 + "_only_annot"].sum()
        + unionDf["flag_in_" + species2 + "_only_annot"].sum()
        + unionDf["flag_in_both_annot"].sum()
    ):
        print("WARNING: UJC found that does not fit into logical group.")
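    # The next step collapses the key file to one row per (gene, union UJC): transcript_id
    # values are joined into "|"-separated lists (e.g. two hypothetical species1 transcripts
    # tr_A and tr_B sharing a union UJC become "tr_A|tr_B"), UJC ids are reduced to their
    # unique values, and the flag_in_* columns are reduced with max()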
    # Make unique on union_UJC_id and pair of species1_UJC_id and species2_UJC_id
    # Make piped lists of species1 and species2 transcript_id values
    unionDf = unionDf.sort_values([species1 + "_transcript_id", species2 + "_transcript_id"])
    unionListDf = unionDf.fillna("").groupby(["gene_id_" + name, "union_UJC_id_" + name]).agg({
        species1 + "_transcript_id": lambda x: "|".join([element for element in x if element != ""]),
        species2 + "_transcript_id": lambda x: "|".join([element for element in x if element != ""]),
        species1 + "_UJC_id_" + name: lambda x: "|".join([element for element in x.unique() if element != ""]),
        species2 + "_UJC_id_" + name: lambda x: "|".join([element for element in x.unique() if element != ""]),
        "flag_in_" + species1 + "_only_annot": max,
        "flag_in_" + species2 + "_only_annot": max,
        "flag_in_both_annot": max
    }).reset_index()
    unionListDf[species1 + "_UJC_id_" + name] = np.where(
        unionListDf[species1 + "_UJC_id_" + name] == "",
        np.nan,
        unionListDf[species1 + "_UJC_id_" + name]
    )
    unionListDf[species2 + "_UJC_id_" + name] = np.where(
        unionListDf[species2 + "_UJC_id_" + name] == "",
        np.nan,
        unionListDf[species2 + "_UJC_id_" + name]
    )
    if len(unionListDf.fillna("")[unionListDf.fillna("")[species1 + "_UJC_id_" + name].str.contains(r"\|")]) > 0:
        print("WARNING: There is a union UJC that covers more than one {} UJC -\n{}".format(
            species1,
            unionListDf.fillna("")[unionListDf.fillna("")[species1 + "_UJC_id_" + name].str.contains(r"\|")].to_string(index=False)
        ))
    if len(unionListDf.fillna("")[unionListDf.fillna("")[species2 + "_UJC_id_" + name].str.contains(r"\|")]) > 0:
        print("WARNING: There is a union UJC that covers more than one {} UJC -\n{}".format(
            species2,
            unionListDf.fillna("")[unionListDf.fillna("")[species2 + "_UJC_id_" + name].str.contains(r"\|")].to_string(index=False)
        ))

    # Merge RMP with union_UJC_id found in both species
    unionRMP = pd.merge(
        unionListDf[unionListDf["flag_in_both_annot"] == 1],
        species1v2RecipDf.drop(columns=["transcript_1", "transcript_2"]),
        how="outer",
        on=["gene_id_" + name, species1 + "_UJC_id_" + name, species2 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMP["merge_check"].value_counts()["left_only"] > 0:
        print("WARNING: There are pairs of UJC_id that are not found in the RMP file:\n{}".format(
            unionRMP[unionRMP["merge_check"] == "left_only"].to_csv(index=False)
        ))
    pairUnionRMP = unionRMP[unionRMP["merge_check"] == "both"]
    noPairUnionRMP = unionRMP[unionRMP["merge_check"] == "right_only"][
        [c for c in species1v2RecipDf.columns if c not in ["transcript_1", "transcript_2"]]
    ]

    # Merge the RMP without a union UJC pair with the union UJC that are only in the species1
    # annotation, and then with those that are only in the species2 annotation
    unionRMPspecies1 = pd.merge(
        unionListDf[unionListDf["flag_in_" + species1 + "_only_annot"] == 1].drop(
            columns=[
                species2 + "_UJC_id_" + name,
                species2 + "_transcript_id",
                "flag_in_" + species2 + "_only_annot",
                "flag_in_both_annot"
            ]
        ),
        noPairUnionRMP,
        how="outer",
        on=["gene_id_" + name, species1 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMPspecies1["merge_check"].value_counts()["right_only"] > 0:
        print("WARNING: There are UJC_id for species1 in the RMP file that are not found in the UJC file:\n{}".format(
            unionRMPspecies1[unionRMPspecies1["merge_check"] == "right_only"].to_csv(index=False)
        ))
    # Split the species1-only merge into union UJC with an RMP partner ("both") and those
    # without ("left_only"); the "both" rows are then matched to the species2-only union UJC
    unionRMPspecies1Present = unionRMPspecies1[unionRMPspecies1["merge_check"] == "both"].drop(columns=["merge_check"])
    unionRMPspecies1No = unionRMPspecies1[unionRMPspecies1["merge_check"] == "left_only"].drop(columns=["merge_check"])
    unionRMPspecies2 = pd.merge(
        unionListDf[unionListDf["flag_in_" + species2 + "_only_annot"] == 1].drop(
            columns=[
                species1 + "_UJC_id_" + name,
                species1 + "_transcript_id",
                "flag_in_" + species1 + "_only_annot"
            ]
        ).rename(columns={"union_UJC_id_" + name: species2 + "_union_UJC_id_" + name}),
        unionRMPspecies1Present.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name}),
        how="outer",
        on=["gene_id_" + name, species2 + "_UJC_id_" + name],
        indicator="merge_check",
        validate="1:1"
    )
    if unionRMPspecies2["merge_check"].value_counts()["right_only"] > 0:
        print("WARNING: There are UJC_id for species2 in the RMP file that are not found in the UJC file:\n{}".format(
            unionRMPspecies2[unionRMPspecies2["merge_check"] == "right_only"].to_csv(index=False)
        ))

    # Combine merged RMP union UJC pairs, merged individual species RMP UJC pairs,
    # and individual species unmerged UJC
    pairUnionRMP = pairUnionRMP.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name})
    pairUnionRMP[species2 + "_union_UJC_id_" + name] = pairUnionRMP[species1 + "_union_UJC_id_" + name]
    unionRMPspecies1No = unionRMPspecies1No.rename(columns={"union_UJC_id_" + name: species1 + "_union_UJC_id_" + name})
    unionRMP = pd.concat([pairUnionRMP, unionRMPspecies1No, unionRMPspecies2], ignore_index=True).drop(columns=["merge_check"])

    # Fill in na values where necessary
    unionRMP[[c for c in unionRMP.columns if "flag_" in c]] = unionRMP[[c for c in unionRMP.columns if "flag_" in c]].fillna(0)
    unionRMP["check_num_UJC_in_gene_{}_{}".format(species1, name)] = unionRMP.groupby("gene_id_" + name)[species1 + "_union_UJC_id_" + name].transform('count')
    unionRMP["check_num_UJC_in_gene_{}_{}".format(species2, name)] = unionRMP.groupby("gene_id_" + name)[species2 + "_union_UJC_id_" + name].transform('count')
    unionRMP["num_UJC_in_gene_{}_{}".format(species1, name)] = np.where(
        unionRMP["num_transcript_in_gene_{}_{}".format(species1, name)].isna(),
        unionRMP["check_num_UJC_in_gene_{}_{}".format(species1, name)],
        unionRMP["num_transcript_in_gene_{}_{}".format(species1, name)]
    )
    unionRMP["num_UJC_in_gene_{}_{}".format(species2, name)] = np.where(
        unionRMP["num_transcript_in_gene_{}_{}".format(species2, name)].isna(),
        unionRMP["check_num_UJC_in_gene_{}_{}".format(species2, name)],
        unionRMP["num_transcript_in_gene_{}_{}".format(species2, name)]
    )
    unionRMP = unionRMP.drop(columns=[
        "check_num_UJC_in_gene_{}_{}".format(species1, name),
        "num_transcript_in_gene_{}_{}".format(species1, name),
        "check_num_UJC_in_gene_{}_{}".format(species2, name),
        "num_transcript_in_gene_{}_{}".format(species2, name)
    ])

    # Add variable that is FSM, ERS, RMP (recip min that is not FSM/ERS), NRM (no reciprocal minimum match)
    compConditions = [
        unionRMP["flag_FSM_" + name] == 1,
        unionRMP["flag_ERM_noIR_" + name] == 1,
        unionRMP["flag_RMP_" + name] == 1,
        unionRMP["flag_RMP_" + name] == 0,
    ]
    compChoices = [
        "FSM",
        "ERS",
        "RMP",
        "NRM"
    ]
    unionRMP["comparison_type"] = np.select(compConditions, compChoices, "oops")
    return unionRMP
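
# Illustrative sketch only (not called anywhere in this script): how the combined pair
# classification built in main() below resolves the two per-coordinate classifications,
# assuming the same priority ranking (TO > FSM > ERS > RMP > NRM) and keeping the lower-ranked
# (less similar) call when the two coordinate systems disagree. The helper name and the
# genomeA/genomeB column names are hypothetical stand-ins for the comparison_type_[genome]
# columns, and the flag_putative_TO condition used in main() is omitted here for brevity.
def _example_combined_pair_classification():
    pairClassDict = {"TO": 4, "FSM": 3, "ERS": 2, "RMP": 1, "NRM": 0}
    demoDf = pd.DataFrame({
        "comparison_type_genomeA": ["FSM", "FSM", "ERS", np.nan],
        "comparison_type_genomeB": ["FSM", "ERS", "FSM", "RMP"],
    })
    rankA = demoDf["comparison_type_genomeA"].map(pairClassDict)
    rankB = demoDf["comparison_type_genomeB"].map(pairClassDict)
    demoDf["combine_pair_classification"] = np.select(
        [
            demoDf["comparison_type_genomeA"] == demoDf["comparison_type_genomeB"],
            rankA < rankB,
            rankA > rankB,
            demoDf["comparison_type_genomeA"].isna() | demoDf["comparison_type_genomeB"].isna(),
        ],
        [
            demoDf["comparison_type_genomeA"],
            demoDf["comparison_type_genomeA"],
            demoDf["comparison_type_genomeB"],
            "NRM",
        ],
        "oops"
    )
    # Expected classifications for the four rows: FSM, ERS, ERS, NRM
    return demoDf
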

def main():
    # Get species names
    species1 = args.inName1
    species2 = args.inName2
    g1 = args.inG1
    g2 = args.inG2
    # species1 = "mel"
    # species2 = "sim"
    # g1 = "mFB617"
    # g2 = "sFB202"

    # Set directories
    outdir = args.outDir
    # outdir = "/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/supp_files"
    # outdir = "/Volumes/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison"
    indir = args.inDir
    # indir = "/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/analysis_output/dros_mel_vs_sim_ref"
    # indir = "/Volumes/blue/mcintyre/share/transcript_distance/dros_analysis/reference_comparison"

    # Get UJC key file map for UJC on both sets of coordinates
    keyFile = pd.read_csv(args.inKey)
    # keyFile = pd.read_csv("/Users/adalena/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/supp_files/union_mFB617_2_union_sFB202_map.csv")

    ### Make putative transcript ortholog file
    # Get FSM/ERM lookup tables for each set of coordinates
    species1UnionRecipDf = prep_coord_TO(species1, species1, species2, g1, g2, indir, keyFile)
    species2UnionRecipDf = prep_coord_TO(species2, species1, species2, g1, g2, indir, keyFile)

    # Merge flags and variables from each set of coordinates
    bothCoordDf = pd.merge(
        species1UnionRecipDf,
        species2UnionRecipDf,
        how="outer",
        on=[species1 + "_transcript_id", species2 + "_transcript_id"],
        suffixes=["_" + g1, "_" + g2],
        indicator="merge_check",
        validate="1:1"
    )

    # Flag putative transcript orthologs that are at least a RMP and ERM with no IR
    # on both sets of coordinates, with < 15 internal nt difference on each
    bothCoordDf["flag_putative_TO"] = np.where(
        (bothCoordDf["flag_RMP_" + g1] + bothCoordDf["flag_RMP_" + g2] == 2)
        & (bothCoordDf["flag_ERM_noIR_" + g1] + bothCoordDf["flag_ERM_noIR_" + g2] == 2)
        & (bothCoordDf["num_ERM_nt_diff_internal_" + g1] < 15)
        & (bothCoordDf["num_ERM_nt_diff_internal_" + g2] < 15),
        1,
        0
    )

    # Get combined pair classification
    pairClassDict = {"TO": 4, "FSM": 3, "ERS": 2, "RMP": 1, "NRM": 0}
    pairClassConditions = [
        bothCoordDf["flag_putative_TO"] == 1,
        bothCoordDf["comparison_type_" + g1] == bothCoordDf["comparison_type_" + g2],
        bothCoordDf["comparison_type_" + g1].map(pairClassDict) < bothCoordDf["comparison_type_" + g2].map(pairClassDict),
        bothCoordDf["comparison_type_" + g1].map(pairClassDict) > bothCoordDf["comparison_type_" + g2].map(pairClassDict),
        (bothCoordDf["comparison_type_" + g1].isna()) | (bothCoordDf["comparison_type_" + g2].isna())
    ]
    pairClassChoices = [
        "TO",
        bothCoordDf["comparison_type_" + g1],
        bothCoordDf["comparison_type_" + g1],
        bothCoordDf["comparison_type_" + g2],
        "NRM"
    ]
    bothCoordDf["combine_pair_classification"] = np.select(pairClassConditions, pairClassChoices, "oops")
    # bothCoordDf["combine_pair_classification"].value_counts()
    # NRM    21604
    # TO     14761
    # RMP     2635
    # ERS      461

    # Set subclassification (kept in its own column so the pair classification above is not overwritten)
    pairSubClassConditions = [
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "FSM")
        & (bothCoordDf["comparison_type_" + g2] == "FSM"),
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "FSM")
        & (bothCoordDf["comparison_type_" + g2] == "ERS"),
        (bothCoordDf["combine_pair_classification"] == "TO")
        & (bothCoordDf["comparison_type_" + g1] == "ERS")
        & (bothCoordDf["comparison_type_" + g2] == "FSM"),
        (bothCoordDf["combine_pair_classification"] == "ERS"),
        (bothCoordDf["combine_pair_classification"] == "RMP"),
        (bothCoordDf["combine_pair_classification"] == "NRM"),
    ]
    pairSubClassChoices = [
        "TO FSM on both coordinates",
        "TO FSM on " + g1 + ", ERS on " + g2,
        "TO FSM on " + g2 + ", ERS on " + g1,
        "",
        "",
        ""
    ]
    bothCoordDf["combine_pair_subclassification"] = np.select(pairSubClassConditions, pairSubClassChoices, "oops")

    # Output TO file
    bothCoordDf.drop(columns=["merge_check"]).to_csv(
        "{}/{}_{}_{}_{}_transcript_map.csv".format(outdir, species1, g1, species2, g2),
        index=False
    )


if __name__ == '__main__':
    # Parse command line arguments
    args = getOptions()
    main()