# Background Workflow
################
# IMPORT LINES #
################
from math import sqrt
import sys
sys.path.append('/mindstore/home33ext/kscopino/BIN/')
from statistics import mean, stdev
import os

########################
# cpptraj FILES NEEDED #
########################
# rmsd_x_NOTrestr_bkbone.dat
# nhb_AVE_169_all_175_all.dat
# nhb_AVE_168_all_175_all.dat
# nhb_AVE_169_all_176_all.dat
# nhb_AVE_168_all_176_all.dat
# nhb_AVE_167_all_176_all.dat
# nhb_AVE_168_all_177_all.dat
# nhb_AVE_167_all_177_all.dat
# nhb_AVE_105_all_177_all.dat
# nhb_AVE_167_all_178_all.dat
# nhb_AVE_105_all_178_all.dat
# nhb_AVE_114_all_178_all.dat
# nhb_AVE_105_all_179_all.dat
# nhb_AVE_114_all_179_all.dat
# nhb_AVE_368_all_179_all.dat
# nhb_AVE_114_all_180_all.dat
# nhb_AVE_368_all_180_all.dat

######################
# MAY NEED TO CHANGE #
######################
mut = 'GCC'
structure = '5JUP'
# ns_path
# datapath
# mutations1
# mutations2
# mutations3

### INPUT VARIABLES, below ###
# mut
# structure
# dir
# frames_per_ns
# frames_equil
# min_frames
# expt_nums
# freq_threshold

########################
# FUNCTION DEFINITIONS #
########################


def countlines(handle):
    '''Count and return the number of lines in an open file handle.'''
    n = 0
    for _line in handle:
        n += 1
    return n


def ns_check(expt, mut, min_frames):
    '''Return the number of frames in experiment *expt*'s RMSD trajectory file
    if cpptraj has been run and the trajectory is longer than *min_frames*;
    return 0 otherwise (fix: original returned an implicit None, which only
    compared sanely under Python 2, and ignored the *mut* argument).
    One line of the cpptraj output = one trajectory frame.'''
    ns_path = ('/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/' + str(mut) +
               '/NEUTRAL/NEUTRAL_' + str(expt) + '/DATA/rmsd_' + str(expt) +
               '_NOTrestr_bkbone.dat')
    # test to see if cpptraj has been run
    if not os.path.exists(ns_path):
        return 0
    handle = open(ns_path, 'r')
    num_lines = countlines(handle)
    handle.close()  # fix: original leaked the file handle
    if num_lines > min_frames:
        return num_lines
    return 0


def same_heavy_atoms(test_hb, hb_list):
    '''Report whether the donor/acceptor heavy-atom pair of *test_hb* appears
    more than once in *hb_list*, i.e. alternative H atoms bridge the same
    acceptor-donor pair.  Triplets are (donor, H, acceptor); entries that
    match on index 0 and index 2 differ only by H-atom identity.'''
    counter = 0
    for hb in hb_list:
        # same H-bond acceptor/donor appearing multiple times in the list of
        # all H-bonds (entries differ by H atom identity)
        if hb[0] == test_hb[0] and hb[2] == test_hb[2]:
            counter += 1
    if counter > 1:
        # multiple H atoms involved in same donor/acceptor pair
        return "** shared acceptor donor"
    else:
        # single H atom involved in donor/acceptor pair
        return "single acceptor donor"


def specialprint_sum(label, list_sums):
    '''Print a labeled, tab-delimited line of per-experiment sums followed by
    their mean and standard error.  Does nothing for an empty list.'''
    if list_sums == []:
        return
    # add label to printline
    printstring = str(label)
    # add each sum to the printline (fix: loop variable no longer shadows
    # the builtin `sum`)
    for value in list_sums:
        printstring += '\t' + str(round(value, 3))
    # calculate the mean and serror and print in the printline
    fraction_mean = round(mean(list_sums), 3)
    # fix: stdev() requires >= 2 data points; report 'na' for a single
    # experiment, matching the distance/angle convention used elsewhere
    if len(list_sums) > 1:
        fraction_serror = round(stdev(list_sums) / sqrt(len(list_sums)), 3)
    else:
        fraction_serror = 'na'
    print('')
    print(printstring + '\t\tmean\t' + str(fraction_mean) + '\tse\t' + str(fraction_serror))


def specialprint_sumofsums(label, list_list_sums):
    '''Element-wise sum of several per-experiment sum lists, printed as a
    labeled line with mean and standard error.  Silent for all-empty input.'''
    # add label to printline
    printstring = str(label)
    # fix: original aliased list_list_sums[0] and mutated the caller's list in
    # place, and dropped all data whenever the FIRST list happened to be empty.
    # Start from a copy of the first non-empty list instead.
    overall_sum = []
    for sum_list in list_list_sums:
        if len(sum_list) > 0:
            if overall_sum == []:
                overall_sum = list(sum_list)
            else:
                for i in range(len(overall_sum)):
                    overall_sum[i] += sum_list[i]
    if overall_sum != []:
        # add each sum of sums to the printline
        for value in overall_sum:
            printstring += '\t' + str(round(value, 3))
        # calculate the mean and serror and print in the printline
        fraction_mean = round(mean(overall_sum), 3)
        if len(overall_sum) > 1:
            fraction_serror = round(stdev(overall_sum) / sqrt(len(overall_sum)), 3)
        else:
            fraction_serror = 'na'
        print('')
        print(printstring + '\t\tmean\t' + str(fraction_mean) + '\tse\t' + str(fraction_serror))


def collate_avgdata(avgdata_file):
    '''Summarize cpptraj H-bond average data (*avgdata_file*) from multiple
    experiments: print a per-H-bond frequency table with means/standard
    errors, and return the list of per-experiment frequency sums over all
    printed H-bonds (empty list if none qualified).'''
    ###################
    # INPUT VARIABLES #
    ###################
    mut = 'GCC'
    structure = '5JUP'
    # fix: renamed from `dir`, which shadowed the builtin
    base_dir = '/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/GCC/'
    frames_per_ns = 100
    # data analyzed after 25 ns of equilibration
    frames_equil = frames_per_ns * 25.0
    # minimum of 5 ns after equilibration
    min_frames = frames_equil + (frames_per_ns * 5)
    # expt_nums: change according to range of experiments
    expt_nums = range(1, 31)
    # set to 0 for no threshold
    freq_threshold = 0.00

    ################
    # COLLECT DATA #
    ################
    os.chdir(base_dir)
    # {expt: {(donor, H, acceptor): [frames, fraction, dist, angle]}}
    filedata = {}
    # {used expt number: analyzed duration in ns}
    dict_used_expts = {}
    list_used_expts = []
    # frame counter which can be converted to ns for output
    total_analysed_frames = 0
    for experiment in expt_nums:
        # count frames in cpptraj output to make sure trajectory is long
        # enough (ns_check returns 0 when missing or too short)
        num_lines = ns_check(experiment, mut, min_frames)
        if num_lines > min_frames:
            # analyzed length in ns, equilibration excluded
            dict_used_expts[experiment] = round(float(num_lines - frames_equil) / frames_per_ns, 0)
            list_used_expts += [experiment]
            total_analysed_frames += num_lines - frames_equil
            datapath = base_dir + 'NEUTRAL/NEUTRAL_' + str(experiment) + '/DATA/' + avgdata_file
            handle = open(datapath, 'r')
            # ignore header line
            handle.readline()
            filedata[experiment] = {}
            # each data line: donor, H, acceptor, then frequency columns
            for line in handle:
                linelist = line.strip().split()
                filedata[experiment][(linelist[0], linelist[1], linelist[2])] = linelist[3:7]
            handle.close()  # fix: original leaked the file handle

    # every unique H-bond donor/acceptor triplet observed in any used experiment
    alldata = {}
    for expt in filedata:
        for hbkey in filedata[expt]:
            if hbkey not in alldata:
                alldata[hbkey] = []

    # begin header printlines (header1 keeps the original experiment
    # numbering; only header2, with renumbered experiments and ns, is printed)
    header1, header2 = '\t', '\t'
    new_expt_num = 1
    # H-bonds observed across all experiments
    hb_obs_all_expts = []
    for expt in list_used_expts:
        header1 += str(expt) + '\t'
        # new numbering with analyzed ns; [:-2] strips the '.0' of the float
        header2 += str(new_expt_num) + ' (' + str(dict_used_expts[expt])[:-2] + ')' + '\t'
        new_expt_num += 1
        # H-bonds observed in this experiment
        hb_observed = []
        for hb in filedata[expt]:
            hb_observed += [hb]
            if hb not in hb_obs_all_expts:
                hb_obs_all_expts += [hb]
        # align one entry per experiment under every triplet key
        for key in alldata:
            if key in hb_observed:
                alldata[key] += [[int(filedata[expt][key][0]),
                                  float(filedata[expt][key][1]),
                                  float(filedata[expt][key][2]),
                                  float(filedata[expt][key][3])]]
            else:
                # placeholder for experiments where this H-bond was not seen
                alldata[key] += [[0, 0.0, 'na', 'na']]

    # print substitution, translocation structure, file name, total analyzed ns
    print(mut + ' ' + structure + ' ' + avgdata_file[:-4] + ' ' +
          'total ns ' + str(int(1.0 * total_analysed_frames / frames_per_ns)) +
          ' ' + '\texpt (ns)')
    # print new experiment numbering with analyzed run length, plus column labels
    print(header2 + '\tfreq_ave (se)\tdist_avg (se)\tangle_ave (se)')

    listof_fraction_lst = []
    for h_bond in alldata:
        # fix: skip H-bonds between atoms of the same residue up front (first
        # 5 characters of the atom label encode the residue -- TODO confirm);
        # the original deferred this test and mean([]) would raise for them
        if h_bond[0][:5] == h_bond[2][:5]:
            continue
        # per-experiment H-bond frequency, distance, and angle
        fraction_lst, distance_lst, angle_lst = [], [], []
        printline = h_bond[0] + ', ' + h_bond[1] + ', ' + h_bond[2] + '\t'
        for expt_data in alldata[h_bond]:
            # frequency by experiment
            printline += str(round(expt_data[1], 3)) + '\t'
            fraction_lst += [expt_data[1]]
            # distances/angles only exist where the bond was observed
            if expt_data[2] != 'na':
                distance_lst += [expt_data[2]]
                angle_lst += [expt_data[3]]
        # mean and standard error for frequency across experiments
        fraction_mean = round(mean(fraction_lst), 3)
        if len(fraction_lst) > 1:
            fraction_serror = round(stdev(fraction_lst) / sqrt(len(fraction_lst)), 3)
        else:
            # fix: single-experiment runs crashed in stdev(); use the file's
            # existing 'na' convention instead
            fraction_serror = 'na'
        # mean and standard error for distance across experiments
        distance_mean = round(mean(distance_lst), 2)
        if len(distance_lst) > 1:
            distance_serror = round(stdev(distance_lst) / sqrt(len(distance_lst)), 2)
        else:
            distance_serror = 'na'
        # mean and standard error for angle across experiments
        angle_mean = round(mean(angle_lst), 1)
        if len(angle_lst) > 1:
            angle_serror = round(stdev(angle_lst) / sqrt(len(angle_lst)), 2)
        else:
            angle_serror = 'na'
        # combine means and standard errors into print strings
        fraction_ave_se = str(fraction_mean) + ' (' + str(fraction_serror) + ')\t'
        distance_ave_se = str(distance_mean) + ' (' + str(distance_serror) + ')\t'
        angle_ave_se = str(angle_mean) + ' (' + str(angle_serror) + ')'
        # print H-bonds above the threshold frequency
        if fraction_mean >= freq_threshold:
            print(printline + '\t' + fraction_ave_se + distance_ave_se + angle_ave_se +
                  '\t' + same_heavy_atoms(h_bond, hb_obs_all_expts))
            listof_fraction_lst += [fraction_lst]

    # fix: original indexed listof_fraction_lst[0] BEFORE checking emptiness,
    # an IndexError whenever no H-bond passed the threshold
    if listof_fraction_lst == []:
        return []
    # one running sum per experiment
    sum_fractions = [0.0] * len(listof_fraction_lst[0])
    # add up the frequencies for the different H-bonds, experiment by experiment
    for fract_ls in listof_fraction_lst:
        for i in range(len(sum_fractions)):
            sum_fractions[i] += fract_ls[i]
    return sum_fractions


##################
# PROGRAM OUTPUT #
##################
print('interaction surface \tANTICODON \t\t\tCAR')
print('interaction residue \tnt1 \tnt2 \twobble \tC1054 \tA1196 \tR146')
print('subsystem residue \t169 \t168 \t167 \t105 \t114 \t368\n')
print('mRNA codon \tA-SITE \t\t\t+1 CODON')
print('nucleotide position \t1 \t2 \t3 \t1 \t2 \t3')
print('subsystem residue \t175 \t176 \t177 \t178 \t179 \t180')

# ribosomal residues of interest
rRes = {'Anticod1': 169, 'AntiCod2': 168, 'AntiCod3': 167,
        'C1054': 105, 'A1196': 114, 'R146': 368}
# mRNA residues of interest (+1 codon positions map to 178/179/180)
mRNA = {'Cod1': 175, 'Cod2': 176, 'Cod3': 177,
        'G1': 178, 'C1': 178, 'A1': 178, 'U1': 178,
        'C2': 179, 'G2': 179, 'A2': 179, 'U2': 179,
        'U3': 180, 'A3': 180, 'C3': 180, 'G3': 180}
# ribosomal residues for indexing
rRes_to_analyze = ['Anticod1', 'AntiCod2', 'AntiCod3', 'C1054', 'A1196', 'R146']

# assigns N1/N2/N3 variables according to the mutation (first/second/third
# letter of the +1 codon).
# fix: each dict originally listed 'CGC' twice -- in mutations3 with
# CONFLICTING values ('C3' then 'G3'); the GCA/GCC/GCG vs CGA/CGC/CGG pattern
# shows the second 'CGC' was a typo for 'CGG'.
mutations1 = {'AAU': 'A1', 'ACU': 'A1', 'AGU': 'A1', 'AUU': 'A1',
              'CAU': 'C1', 'CCU': 'C1', 'CGU': 'C1', 'CUU': 'C1',
              'GAU': 'G1', 'GCU': 'G1', 'GGU': 'G1', 'GUU': 'G1',
              'UAU': 'U1', 'UCU': 'U1', 'UGU': 'U1', 'UUU': 'U1',
              'GCA': 'G1', 'GCC': 'G1', 'GCG': 'G1',
              'CGA': 'C1', 'CGC': 'C1', 'CGG': 'C1'}
N1 = mutations1[mut]
mutations2 = {'AAU': 'A2', 'ACU': 'C2', 'AGU': 'G2', 'AUU': 'U2',
              'CAU': 'A2', 'CCU': 'C2', 'CGU': 'G2', 'CUU': 'U2',
              'GAU': 'A2', 'GCU': 'C2', 'GGU': 'G2', 'GUU': 'U2',
              'UAU': 'A2', 'UCU': 'C2', 'UGU': 'G2', 'UUU': 'U2',
              'GCA': 'C2', 'GCC': 'C2', 'GCG': 'C2',
              'CGA': 'G2', 'CGC': 'G2', 'CGG': 'G2'}
N2 = mutations2[mut]
mutations3 = {'AAU': 'U3', 'ACU': 'U3', 'AGU': 'U3', 'AUU': 'U3',
              'CAU': 'U3', 'CCU': 'U3', 'CGU': 'U3', 'CUU': 'U3',
              'GAU': 'U3', 'GCU': 'U3', 'GGU': 'U3', 'GUU': 'U3',
              'UAU': 'U3', 'UCU': 'U3', 'UGU': 'U3', 'UUU': 'U3',
              'GCA': 'A3', 'GCC': 'C3', 'GCG': 'G3',
              'CGA': 'A3', 'CGC': 'C3', 'CGG': 'G3'}
N3 = mutations3[mut]
# mRNA residues for indexing
mRNA_to_analyze = ['Cod1', 'Cod2', 'Cod3', N1, N2, N3, '+2nt1', '+2nt2', '+2nt3']

# residue pairs to analyze, ribosomal residue then mRNA residue
# rRes_mRNA_to_analyze = [['AntiCod2','Cod2'],['AntiCod3','Cod3'],['C1054','G1'],['A1196',N2],['R146',N2]]

# Position 1 of the A site codon (no -1 registration possible at position 1)
print('\n\n*******************\n')
print('\n\nStandard Registration')
fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[0]]) + '_all_' +
         str(mRNA[mRNA_to_analyze[0]]) + '_all.dat')
label = 'nhb_' + str(rRes_to_analyze[0]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print(label)
collate_0 = collate_avgdata(fname)
specialprint_sum('fraction sum registration 0', collate_0)

print('\n\nCross Registration +1')
fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[1]]) + '_all_' +
         str(mRNA[mRNA_to_analyze[0]]) + '_all.dat')
label = 'nhb_' + str(rRes_to_analyze[1]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print(label)
collate_plus1 = collate_avgdata(fname)
specialprint_sum('fraction sum registration +1', collate_plus1)

# -1, Standard, and +1 Registration for position 2 of A-site codon
# through position 2 of the +1 codon
for i in range(1, 5):
    print('\n\n*******************\n')
    # print('\n' + mRNA_to_analyze[i])
    print('\n\nCross Registration -1')
    fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[i - 1]]) + '_all_' +
             str(mRNA[mRNA_to_analyze[i]]) + '_all.dat')
    label = 'nhb_' + str(rRes_to_analyze[i - 1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
    print(label)
    collate_min1 = collate_avgdata(fname)
    specialprint_sum('fraction sum registration -1', collate_min1)

    print('\n\nStandard Registration')
    fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[i]]) + '_all_' +
             str(mRNA[mRNA_to_analyze[i]]) + '_all.dat')
    label = 'nhb_' + str(rRes_to_analyze[i]) + '_' + str(mRNA_to_analyze[i]) + '\n'
    print(label)
    collate_0 = collate_avgdata(fname)
    specialprint_sum('fraction sum registration 0', collate_0)

    print('\n\nCross Registration +1')
    fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[i + 1]]) + '_all_' +
             str(mRNA[mRNA_to_analyze[i]]) + '_all.dat')
    label = 'nhb_' + str(rRes_to_analyze[i + 1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
    print(label)
    collate_plus1 = collate_avgdata(fname)
    specialprint_sum('fraction sum registration +1', collate_plus1)

    specialprint_sumofsums('fraction sum registrations -1,0,+1',
                           [collate_min1, collate_0, collate_plus1])

# position 3 of +1 codon (no +1 registration possible at the last position)
print('\n\n*******************\n')
print('\n\nCross Registration -1')
fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[4]]) + '_all_' +
         str(mRNA[mRNA_to_analyze[5]]) + '_all.dat')
label = 'nhb_' + str(rRes_to_analyze[4]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print(label)
collate_min1 = collate_avgdata(fname)
specialprint_sum('fraction sum registration -1', collate_min1)

print('\n\nStandard Registration')
fname = ('nhb_AVE_' + str(rRes[rRes_to_analyze[5]]) + '_all_' +
         str(mRNA[mRNA_to_analyze[5]]) + '_all.dat')
label = 'nhb_' + str(rRes_to_analyze[5]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print(label)
collate_0 = collate_avgdata(fname)
specialprint_sum('fraction sum registration 0', collate_0)

print('\n\n*******************\n')
# output to terminal can be written into a file: python make_fig6.py > output.txt
# output.txt can then be imported into Excel as a tab-delimited file