# Background Workflow (Python)
################
# IMPORT LINES #
################
from math import sqrt
import sys
sys.path.append('/mindstore/home33ext/kscopino/BIN/')
from statistics import mean,stdev
import os
########################
# cpptraj FILES NEEDED #
########################
# rmsd_x_NOTrestr_bkbone.dat
# nhb_AVE_169_all_175_all.dat
# nhb_AVE_168_all_175_all.dat
# nhb_AVE_169_all_176_all.dat
# nhb_AVE_168_all_176_all.dat
# nhb_AVE_167_all_176_all.dat
# nhb_AVE_168_all_177_all.dat
# nhb_AVE_167_all_177_all.dat
# nhb_AVE_105_all_177_all.dat
# nhb_AVE_167_all_178_all.dat
# nhb_AVE_105_all_178_all.dat
# nhb_AVE_114_all_178_all.dat
# nhb_AVE_105_all_179_all.dat
# nhb_AVE_114_all_179_all.dat
# nhb_AVE_368_all_179_all.dat
# nhb_AVE_114_all_180_all.dat
# nhb_AVE_368_all_180_all.dat
######################
# MAY NEED TO CHANGE #
######################
mut = 'GCC'
structure = '5JUP'
# ns_path
# datapath
# mutations1
# mutations2
# mutations3
### INPUT VARIABLES, below ###
# mut
# structure
# dir
# frames_per_ns
# frames_equil
# min_frames
# expt_nums
# freq_threshold
########################
# FUNCTION DEFINITIONS #
########################
def countlines(handle):
'''Count lines in a file'''
n = 0
for line in handle:
n += 1
return n
def ns_check(expt, mut, min_frames):
'''Return the number of lines in a file if the trajectory is long enough'''
import os
# test to see if cpptraj has been run
ns_path = '/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/GCC/NEUTRAL/NEUTRAL_' + str(expt) + '/DATA/rmsd_' + str(expt) + '_NOTrestr_bkbone.dat'
if os.path.exists(ns_path) == True:
handle = open(ns_path, 'r')
# number of lines = number of frames in trajectory
num_lines = countlines(handle)
if os.path.exists(ns_path) == True and num_lines > min_frames:
return num_lines
def same_heavy_atoms(test_hb, hb_list):
'''Check if alternative H for acceptor-donor pair'''
counter = 0
for hb in hb_list:
# if the same H-bond acceptor/donor appear multiple times in the list of all H-bonds (differ by H atom identity)
if hb[0] == test_hb[0] and hb[2] == test_hb[2]:
counter += 1
# multiple H atoms involved in same donor/acceptor pair
if counter > 1: return "** shared acceptor donor"
# single H atom involved in donor/acceptor pair
else: return "single acceptor donor"
def specialprint_sum(label, list_sums):
'''Print labeled line with list of sums'''
if list_sums == []: return
# add label to printline
printstring = str(label)
# add each sum to the printline
for sum in list_sums:
printstring += '\t'+str(round(sum,3))
# calculate the mean and serror and print in the printline
fraction_mean = round(mean(list_sums),3)
fraction_serror = round(stdev(list_sums)/sqrt(len(list_sums)),3)
print
print printstring+'\t\tmean\t'+str(fraction_mean)+'\tse\t'+str(fraction_serror)
def specialprint_sumofsums(label, list_list_sums):
'''Print labeled line with list of sums of sums'''
# add label to printline
printstring = str(label)
# initialize the list of sums of sums as the first list of sums
overall_sum = list_list_sums[0]
if len(list_list_sums) > 1:
# add each list of sums to the list of sums of sums
for sum_list in list_list_sums[1:]:
if len(sum_list) > 0:
for i in range(len(overall_sum)):
overall_sum[i] += sum_list[i]
# add each sum of sums to the printline
if overall_sum != []:
for sum in overall_sum:
printstring += '\t'+str(round(sum,3))
# calculate the mean and serror and print in the printline
fraction_mean = round(mean(overall_sum),3)
fraction_serror = round(stdev(overall_sum)/sqrt(len(overall_sum)),3)
print
print printstring+'\t\tmean\t'+str(fraction_mean)+'\tse\t'+str(fraction_serror)
def collate_avgdata(avgdata_file):
'''Summarize data from multiple experiments into a list of sums'''
###################
# INPUT VARIABLES #
###################
mut = 'GCC'
structure = '5JUP'
dir = '/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/GCC/'
frames_per_ns = 100
# data analyzed after x ns
frames_equil = frames_per_ns * 25.0
# minimum of 5ns after equilibration
min_frames = frames_equil + (frames_per_ns * 5)
#expt_nums = change according to range of experiments
expt_nums = range(1,31)
# set to 0 for no threshold
freq_threshold = 0.00
################
# COLLECT DATA #
################
os.chdir(dir)
# initialize dictionary of dictionaries for file data
filedata = {}
# initialize dictionary for used experiments and their duration
dict_used_expts = {}
# initialize list of used experiments
list_used_expts = []
# add frame counter which can be converted to ns for output
total_analysed_frames = 0
# iterate through list of experiment numbers
for experiment in expt_nums:
# count frames in cpptraj output to make sure trajectory is long enough
num_lines = ns_check(experiment, mut, min_frames)
# if long enough
if num_lines > min_frames:
# add expt number to dictionary of used experiments as key, length in ns as value
dict_used_expts[experiment] = round(float(num_lines - frames_equil)/frames_per_ns,0)
# add expt number to dictionary of used experiments
list_used_expts += [experiment]
# update analyzed frames, subtracting equilibration
total_analysed_frames += num_lines-frames_equil
datapath = dir + 'NEUTRAL/NEUTRAL_' + str(experiment) + '/DATA/' + avgdata_file
handle = open(datapath, 'r')
# ignore header line
handle.readline()
# set up keys for filedata dictionary as experiment number
filedata[experiment] = {}
# for each line in the datafile split by whitespace
for line in handle:
# each value in a line is added as a separate value in a list
linelist = line.strip().split()
# for the filedata dictionary, the experiment number is the key and the dictionary of H-bond donors and acceptors are the values
# for dictionary where a triplet of H-bond donors and acceptors are the keys, the frequency data are the values
filedata[experiment][(linelist[0],linelist[1],linelist[2])] = linelist[3:7]
# initialize dictionary for H-bond triplet keys with frequencies for each experiment as values
alldata = {}
# iterate through each used experiment, add each unique H-bond donor/acceptor triplet as a key to alldata
for expt in filedata:
for hbkey in filedata[expt]:
if hbkey not in alldata:
alldata[hbkey] = []
# begin header printline
header1, header2 = '\t','\t'
# renumber experiments for table
new_expt_num = 1
# initialize list of H-bonds observed across all experiments
hb_obs_all_expts = []
for expt in list_used_expts:
# add expts (original numbering) to header 1
header1 += str(expt)+'\t'
# add expts (new numbering) with ns to header 2
header2 += str(new_expt_num)+' ('+str(dict_used_expts[expt])[:-2]+')'+'\t'
new_expt_num += 1
# initialize list of observed H-bonds for a given experiment
hb_observed = []
# for each H-bond donor/acceptor triplet, add triplet to list of observed H-bonds
for hb in filedata[expt]:
hb_observed += [hb]
# if a unique triplet, add to the list of H-bonds observed across all experiments
if hb not in hb_obs_all_expts: hb_obs_all_expts += [hb]
# for each H-bond triplet key
for key in alldata:
# if observed in the experiment, add frequency information
if key in hb_observed:
alldata[key] += [[int(filedata[expt][key][0]),float(filedata[expt][key][1]),float(filedata[expt][key][2]),float(filedata[expt][key][3])]]
else:
alldata[key] += [[0, 0.0, 'na', 'na']]
# print substitution, translocation structure, file name, nanoseconds used in analysis
print mut, structure, avgdata_file[:-4], 'total ns '+str((int(1.0*total_analysed_frames/frames_per_ns))), '\texpt (ns)'
# print new experiment numbering with analyzed run length, plus column labels
print header2+'\tfreq_ave (se)\tdist_avg (se)\tangle_ave (se)'
# initialize list
listof_fraction_lst = []
# for each H-bond triplet in alldata
for h_bond in alldata:
# initialize lists for H-bond frequency, distance, and angle
fraction_lst, distance_lst, angle_lst = [], [], []
# add each H-bond donor/acceptor of the triplet to the printline
printline = h_bond[0]+', '+h_bond[1]+', '+h_bond[2]+'\t'
# for frequency info by experiment for a given H-bond triplet
for expt_data in alldata[h_bond]:
# make sure the H-bond isn't between atoms of the same residue
if (h_bond[0][:5] != h_bond[2][:5]):
# add frequency by experiment to printline
printline += str(round(expt_data[1],3))+'\t'
# make list of frequencies by experiment
fraction_lst += [expt_data[1]]
if expt_data[2] != 'na':
# make a list of distances by experiment
distance_lst += [expt_data[2]]
# make a list of angle measures by experiment
angle_lst += [expt_data[3]]
# calculate mean and standard error for H-bond triplet frequency across experiments
fraction_mean = round(mean(fraction_lst),3)
fraction_serror = round(stdev(fraction_lst)/sqrt(len(fraction_lst)),3)
# calculate mean and standard error for H-bond triplet distance across experiments
distance_mean = round(mean(distance_lst),2)
if len(distance_lst) > 1:
distance_serror = round(stdev(distance_lst)/sqrt(len(distance_lst)),2)
else: distance_serror = 'na'
#calculate mean and standard error for H-bond triplet angle across experiments
angle_mean = round(mean(angle_lst),1)
if len(angle_lst) > 1:
angle_serror = round(stdev(angle_lst)/sqrt(len(angle_lst)),2)
else: angle_serror = 'na'
# combine means and standard errors into print strings
fraction_ave_se = str(fraction_mean)+' ('+str(fraction_serror)+')\t'
distance_ave_se = str(distance_mean)+' ('+str(distance_serror)+')\t'
angle_ave_se = str(angle_mean)+' ('+str(angle_serror)+')'
# if H-bonding is above a threshold frequency and H-bonding is not between atoms of the same residue, print to terminal
if (fraction_mean >= freq_threshold) and (h_bond[0][:5] != h_bond[2][:5]):
# print each H-bond triplet line
print printline+'\t'+ fraction_ave_se + distance_ave_se + angle_ave_se + '\t'+same_heavy_atoms(h_bond,hb_obs_all_expts)
# update list of H-bond lists, each containing frequency by experiment
listof_fraction_lst += [fraction_lst]
# initialize list of H-bond sums
sum_fractions = []
# initialize 1 sum per experiment
for f in listof_fraction_lst[0]:
sum_fractions += [0.0]
# add up all the frequencies for different H-bonds by experiment
for fract_ls in listof_fraction_lst:
for i in range(len(sum_fractions)):
sum_fractions[i] += fract_ls[i]
if len(listof_fraction_lst) > 0:
return sum_fractions
else: return []
##################
# PROGRAM OUTPUT #
##################
print 'interaction surface \tANTICODON \t\t\tCAR'
print 'interaction residue \tnt1 \tnt2 \twobble \tC1054 \tA1196 \tR146'
print 'subsystem residue \t169 \t168 \t167 \t105 \t114 \t368\n'
print 'mRNA codon \tA-SITE \t\t\t+1 CODON'
print 'nucleotide position \t1 \t2 \t3 \t1 \t2 \t3'
print 'subsystem residue \t175 \t176 \t177 \t178 \t179 \t180'
# ribosomal residues of interest
rRes = {'Anticod1':169, 'AntiCod2':168,'AntiCod3':167,'C1054':105,'A1196':114,'R146':368}
# mRNA residues of interest
mRNA = {'Cod1':175,'Cod2':176,'Cod3':177,'G1':178,'C1':178,'A1':178,'U1':178,'C2':179,'G2':179,'A2':179,'U2':179,'U3':180,'A3':180,'C3':180,'G3':180}
# ribosomal residues for indexing
rRes_to_analyze = ['Anticod1', 'AntiCod2', 'AntiCod3', 'C1054', 'A1196', 'R146']
#assigns N1 and N2 variables according to the mutation
mutations1 = {'AAU':'A1','ACU':'A1','AGU':'A1','AUU':'A1','CAU':'C1','CCU':'C1','CGU':'C1','CUU':'C1','GAU':'G1','GCU':'G1','GGU':'G1','GUU':'G1','UAU':'U1','UCU':'U1','UGU':'U1','UUU':'U1','GCA':'G1','GCC':'G1','GCG':'G1','CGA':'C1','CGC':'C1','CGC':'C1'}
N1 = mutations1[mut]
mutations2 = {'AAU':'A2','ACU':'C2','AGU':'G2','AUU':'U2','CAU':'A2','CCU':'C2','CGU':'G2','CUU':'U2','GAU':'A2','GCU':'C2','GGU':'G2','GUU':'U2','UAU':'A2','UCU':'C2','UGU':'G2','UUU':'U2','GCA':'C2','GCC':'C2','GCG':'C2','CGA':'G2','CGC':'G2','CGC':'G2'}
N2 = mutations2[mut]
mutations3 = {'AAU':'U3','ACU':'U3','AGU':'U3','AUU':'U3','CAU':'U3','CCU':'U3','CGU':'U3','CUU':'U3','GAU':'U3','GCU':'U3','GGU':'U3','GUU':'U3','UAU':'U3','UCU':'U3','UGU':'U3','UUU':'U3','GCA':'A3','GCC':'C3','GCG':'G3','CGA':'A3','CGC':'C3','CGC':'G3'}
N3 = mutations3[mut]
# mRNA residues for indexing
mRNA_to_analyze = ['Cod1', 'Cod2','Cod3',N1,N2,N3,'+2nt1','+2nt2','+2nt3']
# residue pairs to analyze, ribosomal residue then mRNA residue
#rRes_mRNA_to_analyze = [['AntiCod2','Cod2'],['AntiCod3','Cod3'],['C1054','G1'],['A1196',N2],['R146',N2]]
# Position 1 of the A site codon
print '\n\n*******************\n'
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[0]]) + '_all_' + str(mRNA[mRNA_to_analyze[0]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[0]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\nCross Registration +1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[1]]) + '_all_' + str(mRNA[mRNA_to_analyze[0]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[1]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print label
collate_plus1 = collate_avgdata(file)
specialprint_sum('fraction sum registration +1',collate_plus1)
# -1, Standard, and +1 Registation for position 2 of A-site codon to position 2 of +1 codon
for i in range(1,5):
print '\n\n*******************\n'
#print '\n' + mRNA_to_analyze[i]
print '\n\nCross Registration -1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i-1]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i-1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_min1 = collate_avgdata(file)
specialprint_sum('fraction sum registration -1',collate_min1)
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\nCross Registration +1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i+1]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i+1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_plus1 = collate_avgdata(file)
specialprint_sum('fraction sum registration +1',collate_plus1)
specialprint_sumofsums('fraction sum registrations -1,0,+1',[collate_min1,collate_0,collate_plus1])
# position 3 of +1 codon
print '\n\n*******************\n'
#print '\n' + mRNA_to_analyze[i]
print '\n\nCross Registration -1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[4]]) + '_all_' + str(mRNA[mRNA_to_analyze[5]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[4]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print label
collate_min1 = collate_avgdata(file)
specialprint_sum('fraction sum registration -1',collate_min1)
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[5]]) + '_all_' + str(mRNA[mRNA_to_analyze[5]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[5]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\n*******************\n'
# output to terminal can be written into a file: python make_fig6.py > output.txt
# output.txt can then be imported into Excel as a tab-delimited file