libname sim "!MCLAB/useful_dsim_data/flybase202/sas_data"; libname mel "!MCLAB/useful_dmel_data/flybase617/sas_data"; /* cc for mel on erg cc for sim on erg for melRef: merge pair class file with ERG xscript output on transcipt_1 to xscript_model_id -- check geneID should be same in both files merge in geneNames for simRef: same */ %macro both_refs (ref, Xnum) ; proc import datafile = "!MCLAB/Transcript_orthologs/reference_trand_output/dros_union_ref/mel2&ref._sim2&ref._pair_classification.csv" out = &ref._pair dbms = csv replace ; guessingrows = MAX ; run ; /* has transcript_1 and transcript_2 */ data &ref._pair_keep ; set &ref._pair ; where flag_RMP = 1 ; run ; proc import datafile = "!MCLAB/Transcript_orthologs/reference_trand_output/dros_union_ref/mel2&ref._vs_sim2&ref._union_xscript_output.csv" out = &ref._erg dbms = csv replace ; guessingrows = MAX ; run ; /* xscript_model_id = transcript_# */ data &ref.2_erg ; set &ref._erg ; rename transcript_id = transcript_&Xnum. ; rename gene_id = erg_gene_id ; keep transcript_id gene_id erg_id ; run; proc sort data = &ref._pair_keep ; by transcript_&Xnum. ; proc sort data = &ref.2_erg ; by transcript_&Xnum. ; run ; data &ref._pair_wERG ; merge &ref._pair_keep (in=in1) &ref.2_erg (in=in2) ; by transcript_&Xnum ; if in1 ; run ; data &ref.Ref_pair_wERG ; set &ref._pair_wERG ; drop erg_gene_id; newERG_id = compress(erg_id) ; drop erg_id ; rename newERG_id = ERG_id; run ; %mend ; %both_refs (mel, 1) ; %both_refs (sim, 2) ; /* gene names and symbol */ proc import datafile = "!MCLAB/useful_dmel_data/flybase617/flybase_files/gene_map_table_fb_2017_04.tsv" out = mel_anno dbms = tab replace ; getnames = no; datarow = 7 ; guessingrows = Max ; run; data mel_anno2 ; set mel_anno ; keep var2 var3 ; rename var2 = symbol ; rename var3 = gene_id ; run; proc import datafile = "!MCLAB/useful_dsim_data/flybase202/flybase_files/dsim-all-r2.02.gtf" out = sim_anno dbms = tab replace ; getnames = no; guessingrows = Max ; run; data sim_anno2 ; set sim_anno ; g = scan(var9, 1, ";") ; g2 = scan(g, 2, " ") ; gene_id = compress(g2, '"') ; s = scan(var9, 2, ";") ; s2 = scan(s, 2, " ") ; sym = compress(s2, '"') ; symbol = compress(sym, 'Dsim\') ; keep gene_id symbol; run ; proc sort data = sim_anno2 nodups ; by _all_ ; run; %macro symbols (ref) ; proc sort data = &ref.Ref_pair_wERG ; by gene_id ; proc sort data = &ref._anno2 ; by gene_id ; run ; data &ref._pair_ERG_symbol ; merge &ref._anno2 (in=in1) &ref.Ref_pair_wERG (in=in2) ; by gene_id ; if in2 ; run; data &ref._pair_ERG_symbol2 ; set &ref._pair_ERG_symbol ; where flag_RMP = 1 ; run ; proc sql noprint; select cats(name,'=',"&ref._",name) into :list separated by ' ' from dictionary.columns where libname = 'WORK' and memname = "&ref._PAIR_ERG_SYMBOL2" and upcase(name) not like "TRANSCRIPT_%" ; quit; proc datasets library = work nolist; modify &ref._pair_ERG_symbol2; rename &list; quit; proc sort data = &ref._pair_ERG_symbol2; by transcript_1 transcript_2 ; run ; %mend ; %symbols (MEL) ; /* 19065 obs */ %symbols (SIM) ; /* 20263 obs */ proc export data = mel_pair_ERG_symbol2 outfile = "/nfshome/ammorse/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/analysis_output/dros_melRef_pair_ERG_symbol.csv" dbms = csv replace ; run ; proc export data = sim_pair_ERG_symbol2 outfile = "/nfshome/ammorse/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/analysis_output/dros_simRef_pair_ERG_symbol.csv" dbms = csv replace ; run ; /* merge mel_pair_ERG_symbol and sim_pair_ERG_symbol by transcript_1 and transcript_2 */ data melRef_simRef other ; merge mel_pair_ERG_symbol2 (in=in1) sim_pair_ERG_symbol2 (in=in2) ; by transcript_1 transcript_2 ; if in1 and in2 then output melRef_simRef ; else output other ; run; /* output 3242 obs */ data melRef_simRef_map ; set melRef_simRef ; if mel_pair_classification = "FSM" or mel_pair_classification = "ERS_noIR_small" then flag_mel_t_hat = 1 ; else flag_mel_t_hat = 0; if sim_pair_classification = "FSM" or sim_pair_classification = "ERS_noIR_small" then flag_sim_t_hat = 1 ; else flag_sim_t_hat = 0 ; if flag_mel_t_hat = 1 and flag_sim_t_hat = 1 then flag_t_hat = 1 ; else flag_t_hat = 0; run; proc freq data = melRef_simRef_map ; tables flag_t_hat * flag_mel_t_hat * flag_sim_t_hat / out = aaa ; run; proc print data = aaa ; run; /* flag_ flag_ flag_ mel_t_ sim_t_ t_hat hat hat COUNT 0 0 0 1355 0 0 1 484 0 1 0 1012 1 1 1 391 */ proc export data = melRef_simRef_map outfile = "/nfshome/ammorse/mclab/SHARE/McIntyre_Lab/Transcript_orthologs/analysis_output/dros_melRef_simRef_union_map.csv" dbms = csv replace ; run ;