# Background Workflow (Python)
################
# IMPORT LINES #
################
from math import sqrt
import sys
sys.path.append('/mindstore/home33ext/kscopino/BIN/')
from statistics import mean,stdev
import os
########################
# cpptraj FILES NEEDED #
########################
# rmsd_x_NOTrestr_bkbone.dat
# nhb_AVE_169_all_175_all.dat
# nhb_AVE_168_all_175_all.dat
# nhb_AVE_169_all_176_all.dat
# nhb_AVE_168_all_176_all.dat
# nhb_AVE_167_all_176_all.dat
# nhb_AVE_168_all_177_all.dat
# nhb_AVE_167_all_177_all.dat
# nhb_AVE_105_all_177_all.dat
# nhb_AVE_167_all_178_all.dat
# nhb_AVE_105_all_178_all.dat
# nhb_AVE_114_all_178_all.dat
# nhb_AVE_105_all_179_all.dat
# nhb_AVE_114_all_179_all.dat
# nhb_AVE_368_all_179_all.dat
# nhb_AVE_114_all_180_all.dat
# nhb_AVE_368_all_180_all.dat
######################
# MAY NEED TO CHANGE #
######################
mut = 'GCC'
structure = '5JUP'
# ns_path
# datapath
# mutations1
# mutations2
# mutations3
### INPUT VARIABLES, below ###
# mut
# structure
# dir
# frames_per_ns
# frames_equil
# min_frames
# expt_nums
# freq_threshold
########################
# FUNCTION DEFINITIONS #
########################
def countlines(handle):
'''Count lines in a file'''
n = 0
for line in handle:
n += 1
return n
def ns_check(expt, mut, min_frames):
'''Return the number of lines in a file if the trajectory is long enough'''
import os
# test to see if cpptraj has been run
ns_path = '/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/GCC/NEUTRAL/NEUTRAL_' + str(expt) + '/DATA/rmsd_' + str(expt) + '_NOTrestr_bkbone.dat'
if os.path.exists(ns_path) == True:
handle = open(ns_path, 'r')
# number of lines = number of frames in trajectory
num_lines = countlines(handle)
if os.path.exists(ns_path) == True and num_lines > min_frames:
return num_lines
def same_heavy_atoms(test_hb, hb_list):
'''Check if alternative H for acceptor-donor pair'''
counter = 0
for hb in hb_list:
# if the same H-bond acceptor/donor appear multiple times in the list of all H-bonds (differ by H atom identity)
if hb[0] == test_hb[0] and hb[2] == test_hb[2]:
counter += 1
# multiple H atoms involved in same donor/acceptor pair
if counter > 1: return "** shared acceptor donor"
# single H atom involved in donor/acceptor pair
else: return "single acceptor donor"
def specialprint_sum(label, list_sums):
'''Print labeled line with list of sums'''
if list_sums == []: return
# add label to printline
printstring = str(label)
# add each sum to the printline
for sum in list_sums:
printstring += '\t'+str(round(sum,3))
# calculate the mean and serror and print in the printline
fraction_mean = round(mean(list_sums),3)
fraction_serror = round(stdev(list_sums)/sqrt(len(list_sums)),3)
print
print printstring+'\t\tmean\t'+str(fraction_mean)+'\tse\t'+str(fraction_serror)
def specialprint_sumofsums(label, list_list_sums):
'''Print labeled line with list of sums of sums'''
# add label to printline
printstring = str(label)
# initialize the list of sums of sums as the first list of sums
overall_sum = list_list_sums[0]
if len(list_list_sums) > 1:
# add each list of sums to the list of sums of sums
for sum_list in list_list_sums[1:]:
if len(sum_list) > 0:
for i in range(len(overall_sum)):
overall_sum[i] += sum_list[i]
# add each sum of sums to the printline
if overall_sum != []:
for sum in overall_sum:
printstring += '\t'+str(round(sum,3))
# calculate the mean and serror and print in the printline
fraction_mean = round(mean(overall_sum),3)
fraction_serror = round(stdev(overall_sum)/sqrt(len(overall_sum)),3)
print
print printstring+'\t\tmean\t'+str(fraction_mean)+'\tse\t'+str(fraction_serror)
def collate_avgdata(avgdata_file):
'''Summarize data from multiple experiments into a list of sums'''
###################
# INPUT VARIABLES #
###################
mut = 'GCC'
structure = '5JUP'
dir = '/mindstore/home33ext/kscopino/COD1_GCN_SUBS/5JUP/GCC/'
frames_per_ns = 100
# data analyzed after x ns
frames_equil = frames_per_ns * 25.0
# minimum of 5ns after equilibration
min_frames = frames_equil + (frames_per_ns * 5)
#expt_nums = change according to range of experiments
expt_nums = range(1,31)
# set to 0 for no threshold
freq_threshold = 0.00
################
# COLLECT DATA #
################
os.chdir(dir)
# initialize dictionary of dictionaries for file data
filedata = {}
# initialize dictionary for used experiments and their duration
dict_used_expts = {}
# initialize list of used experiments
list_used_expts = []
# add frame counter which can be converted to ns for output
total_analysed_frames = 0
# iterate through list of experiment numbers
for experiment in expt_nums:
# count frames in cpptraj output to make sure trajectory is long enough
num_lines = ns_check(experiment, mut, min_frames)
# if long enough
if num_lines > min_frames:
# add expt number to dictionary of used experiments as key, length in ns as value
dict_used_expts[experiment] = round(float(num_lines - frames_equil)/frames_per_ns,0)
# add expt number to dictionary of used experiments
list_used_expts += [experiment]
# update analyzed frames, subtracting equilibration
total_analysed_frames += num_lines-frames_equil
datapath = dir + 'NEUTRAL/NEUTRAL_' + str(experiment) + '/DATA/' + avgdata_file
handle = open(datapath, 'r')
# ignore header line
handle.readline()
# set up keys for filedata dictionary as experiment number
filedata[experiment] = {}
# for each line in the datafile split by whitespace
for line in handle:
# each value in a line is added as a separate value in a list
linelist = line.strip().split()
# for the filedata dictionary, the experiment number is the key and the dictionary of H-bond donors and acceptors are the values
# for dictionary where a triplet of H-bond donors and acceptors are the keys, the frequency data are the values
filedata[experiment][(linelist[0],linelist[1],linelist[2])] = linelist[3:7]
# initialize dictionary for H-bond triplet keys with frequencies for each experiment as values
alldata = {}
# iterate through each used experiment, add each unique H-bond donor/acceptor triplet as a key to alldata
for expt in filedata:
for hbkey in filedata[expt]:
if hbkey not in alldata:
alldata[hbkey] = []
# begin header printline
header1, header2 = '\t','\t'
# renumber experiments for table
new_expt_num = 1
# initialize list of H-bonds observed across all experiments
hb_obs_all_expts = []
for expt in list_used_expts:
# add expts (original numbering) to header 1
header1 += str(expt)+'\t'
# add expts (new numbering) with ns to header 2
header2 += str(new_expt_num)+' ('+str(dict_used_expts[expt])[:-2]+')'+'\t'
new_expt_num += 1
# initialize list of observed H-bonds for a given experiment
hb_observed = []
# for each H-bond donor/acceptor triplet, add triplet to list of observed H-bonds
for hb in filedata[expt]:
hb_observed += [hb]
# if a unique triplet, add to the list of H-bonds observed across all experiments
if hb not in hb_obs_all_expts: hb_obs_all_expts += [hb]
# for each H-bond triplet key
for key in alldata:
# if observed in the experiment, add frequency information
if key in hb_observed:
alldata[key] += [[int(filedata[expt][key][0]),float(filedata[expt][key][1]),float(filedata[expt][key][2]),float(filedata[expt][key][3])]]
else:
alldata[key] += [[0, 0.0, 'na', 'na']]
# print substitution, translocation structure, file name, nanoseconds used in analysis
print mut, structure, avgdata_file[:-4], 'total ns '+str((int(1.0*total_analysed_frames/frames_per_ns))), '\texpt (ns)'
# print new experiment numbering with analyzed run length, plus column labels
print header2+'\tfreq_ave (se)\tdist_avg (se)\tangle_ave (se)'
# initialize list
listof_fraction_lst = []
# for each H-bond triplet in alldata
for h_bond in alldata:
# initialize lists for H-bond frequency, distance, and angle
fraction_lst, distance_lst, angle_lst = [], [], []
# add each H-bond donor/acceptor of the triplet to the printline
printline = h_bond[0]+', '+h_bond[1]+', '+h_bond[2]+'\t'
# for frequency info by experiment for a given H-bond triplet
for expt_data in alldata[h_bond]:
# make sure the H-bond isn't between atoms of the same residue
if (h_bond[0][:5] != h_bond[2][:5]):
# add frequency by experiment to printline
printline += str(round(expt_data[1],3))+'\t'
# make list of frequencies by experiment
fraction_lst += [expt_data[1]]
if expt_data[2] != 'na':
# make a list of distances by experiment
distance_lst += [expt_data[2]]
# make a list of angle measures by experiment
angle_lst += [expt_data[3]]
# calculate mean and standard error for H-bond triplet frequency across experiments
fraction_mean = round(mean(fraction_lst),3)
fraction_serror = round(stdev(fraction_lst)/sqrt(len(fraction_lst)),3)
# calculate mean and standard error for H-bond triplet distance across experiments
distance_mean = round(mean(distance_lst),2)
if len(distance_lst) > 1:
distance_serror = round(stdev(distance_lst)/sqrt(len(distance_lst)),2)
else: distance_serror = 'na'
#calculate mean and standard error for H-bond triplet angle across experiments
angle_mean = round(mean(angle_lst),1)
if len(angle_lst) > 1:
angle_serror = round(stdev(angle_lst)/sqrt(len(angle_lst)),2)
else: angle_serror = 'na'
# combine means and standard errors into print strings
fraction_ave_se = str(fraction_mean)+' ('+str(fraction_serror)+')\t'
distance_ave_se = str(distance_mean)+' ('+str(distance_serror)+')\t'
angle_ave_se = str(angle_mean)+' ('+str(angle_serror)+')'
# if H-bonding is above a threshold frequency and H-bonding is not between atoms of the same residue, print to terminal
if (fraction_mean >= freq_threshold) and (h_bond[0][:5] != h_bond[2][:5]):
# print each H-bond triplet line
print printline+'\t'+ fraction_ave_se + distance_ave_se + angle_ave_se + '\t'+same_heavy_atoms(h_bond,hb_obs_all_expts)
# update list of H-bond lists, each containing frequency by experiment
listof_fraction_lst += [fraction_lst]
# initialize list of H-bond sums
sum_fractions = []
# initialize 1 sum per experiment
for f in listof_fraction_lst[0]:
sum_fractions += [0.0]
# add up all the frequencies for different H-bonds by experiment
for fract_ls in listof_fraction_lst:
for i in range(len(sum_fractions)):
sum_fractions[i] += fract_ls[i]
if len(listof_fraction_lst) > 0:
return sum_fractions
else: return []
##################
# PROGRAM OUTPUT #
##################
print 'interaction surface \tANTICODON \t\t\tCAR'
print 'interaction residue \tnt1 \tnt2 \twobble \tC1054 \tA1196 \tR146'
print 'subsystem residue \t169 \t168 \t167 \t105 \t114 \t368\n'
print 'mRNA codon \tA-SITE \t\t\t+1 CODON'
print 'nucleotide position \t1 \t2 \t3 \t1 \t2 \t3'
print 'subsystem residue \t175 \t176 \t177 \t178 \t179 \t180'
# ribosomal residues of interest
rRes = {'Anticod1':169, 'AntiCod2':168,'AntiCod3':167,'C1054':105,'A1196':114,'R146':368}
# mRNA residues of interest
mRNA = {'Cod1':175,'Cod2':176,'Cod3':177,'G1':178,'C1':178,'A1':178,'U1':178,'C2':179,'G2':179,'A2':179,'U2':179,'U3':180,'A3':180,'C3':180,'G3':180}
# ribosomal residues for indexing
rRes_to_analyze = ['Anticod1', 'AntiCod2', 'AntiCod3', 'C1054', 'A1196', 'R146']
#assigns N1 and N2 variables according to the mutation
mutations1 = {'AAU':'A1','ACU':'A1','AGU':'A1','AUU':'A1','CAU':'C1','CCU':'C1','CGU':'C1','CUU':'C1','GAU':'G1','GCU':'G1','GGU':'G1','GUU':'G1','UAU':'U1','UCU':'U1','UGU':'U1','UUU':'U1','GCA':'G1','GCC':'G1','GCG':'G1','CGA':'C1','CGC':'C1','CGC':'C1'}
N1 = mutations1[mut]
mutations2 = {'AAU':'A2','ACU':'C2','AGU':'G2','AUU':'U2','CAU':'A2','CCU':'C2','CGU':'G2','CUU':'U2','GAU':'A2','GCU':'C2','GGU':'G2','GUU':'U2','UAU':'A2','UCU':'C2','UGU':'G2','UUU':'U2','GCA':'C2','GCC':'C2','GCG':'C2','CGA':'G2','CGC':'G2','CGC':'G2'}
N2 = mutations2[mut]
mutations3 = {'AAU':'U3','ACU':'U3','AGU':'U3','AUU':'U3','CAU':'U3','CCU':'U3','CGU':'U3','CUU':'U3','GAU':'U3','GCU':'U3','GGU':'U3','GUU':'U3','UAU':'U3','UCU':'U3','UGU':'U3','UUU':'U3','GCA':'A3','GCC':'C3','GCG':'G3','CGA':'A3','CGC':'C3','CGC':'G3'}
N3 = mutations3[mut]
# mRNA residues for indexing
mRNA_to_analyze = ['Cod1', 'Cod2','Cod3',N1,N2,N3,'+2nt1','+2nt2','+2nt3']
# residue pairs to analyze, ribosomal residue then mRNA residue
#rRes_mRNA_to_analyze = [['AntiCod2','Cod2'],['AntiCod3','Cod3'],['C1054','G1'],['A1196',N2],['R146',N2]]
# Position 1 of the A site codon
print '\n\n*******************\n'
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[0]]) + '_all_' + str(mRNA[mRNA_to_analyze[0]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[0]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\nCross Registration +1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[1]]) + '_all_' + str(mRNA[mRNA_to_analyze[0]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[1]) + '_' + str(mRNA_to_analyze[0]) + '\n'
print label
collate_plus1 = collate_avgdata(file)
specialprint_sum('fraction sum registration +1',collate_plus1)
# -1, Standard, and +1 Registation for position 2 of A-site codon to position 2 of +1 codon
for i in range(1,5):
print '\n\n*******************\n'
#print '\n' + mRNA_to_analyze[i]
print '\n\nCross Registration -1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i-1]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i-1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_min1 = collate_avgdata(file)
specialprint_sum('fraction sum registration -1',collate_min1)
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\nCross Registration +1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[i+1]]) + '_all_' + str(mRNA[mRNA_to_analyze[i]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[i+1]) + '_' + str(mRNA_to_analyze[i]) + '\n'
print label
collate_plus1 = collate_avgdata(file)
specialprint_sum('fraction sum registration +1',collate_plus1)
specialprint_sumofsums('fraction sum registrations -1,0,+1',[collate_min1,collate_0,collate_plus1])
# position 3 of +1 codon
print '\n\n*******************\n'
#print '\n' + mRNA_to_analyze[i]
print '\n\nCross Registration -1'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[4]]) + '_all_' + str(mRNA[mRNA_to_analyze[5]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[4]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print label
collate_min1 = collate_avgdata(file)
specialprint_sum('fraction sum registration -1',collate_min1)
print '\n\nStandard Registration'
file = 'nhb_AVE_' + str(rRes[rRes_to_analyze[5]]) + '_all_' + str(mRNA[mRNA_to_analyze[5]]) + '_all.dat'
label = 'nhb_' + str(rRes_to_analyze[5]) + '_' + str(mRNA_to_analyze[5]) + '\n'
print label
collate_0 = collate_avgdata(file)
specialprint_sum('fraction sum registration 0',collate_0)
print '\n\n*******************\n'
# output to terminal can be written into a file: python make_fig6.py > output.txt
# output.txt can then be imported into Excel as a tab-delimited file