"""
biometall.py
Software to predict feasible metal binding areas into proteins
Handles the primary functions
"""
import numpy as np
import sys
import time
import math
import os
import re
import multiprocessing
import psutil
import itertools
import copy
import urllib.request
from functools import partial
from ._version import get_versions
# BioMetAll modules
from .modules.data import DIST_PROBE_ALPHA, DIST_PROBE_BETA, ANGLE_PAB
from .modules.data import DIST_PROBE_OXYGEN, ANGLE_POC
from .modules.data import ALLOWED_FILE_TYPES
from .modules.grid import _chunk_size, _calculate_center_and_radius, _grid
from .modules.pdb import _parse_molecule, _print_pdb
from .modules.motif import _check_actual_motif, _check_possible_mutations
[docs]def run(inputfile, min_coordinators=3, min_sidechain=2,
residues='[ASP,HIS,GLU,CYS]', motif='', grid_step=1.0,
consider_backbone_residues='[]', cluster_cutoff=0.0, pdb=False,
propose_mutations_to='', custom_radius=None, custom_center=None,
cores_number=None, backbone_clashes_threshold=1.0,
sidechain_clashes_threshold=0.0, cmd_str="", **kwargs):
# Print header
versions = get_versions()
__version__ = versions['version']
str_header = "You are running BioMetAll {}".format(__version__)
print(str_header)
print("*" * len(str_header))
print("")
filename, file_extension = os.path.splitext(inputfile)
if motif:
motifs = list(map(str, motif.strip('[]').split(',')))
motifs_list = []
for mot in motifs:
motifs_list.append(list(map(str, mot.split('/'))))
motifs = motifs_list
motifs.sort(key=len)
else:
motifs = None
if propose_mutations_to:
mutated_motif = list(map(str,
propose_mutations_to.strip('[]').split(',')))
mutated_motif_list = []
for mot in mutated_motif:
mutated_motif_list.append(list(map(str, mot.split('/'))))
mutated_motif = mutated_motif_list
mutated_motif.sort(key=len)
else:
mutated_motif = None
#Set residues to consider as coordinating
if propose_mutations_to:
tot = mutated_motif_list + motifs_list
residues = list(set(x for l in tot for x in l))
elif motif:
tot = motifs_list
residues = list(set(x for l in tot for x in l))
else:
residues = list(map(str, residues.strip('[]').split(',')))
# Control of a correct usage of parameters
if propose_mutations_to and motif:
#min_coordinators should be at least len(motif) + len(mutated_motif)
if min_coordinators < (len(motifs) + len(mutated_motif)):
min_coordinators = len(motifs) + len(mutated_motif)
elif propose_mutations_to and not motif:
print("To propose mutations is necessary to set a base motif with --motif parameter.")
sys.exit()
elif motif:
#min_coordinators should be at least len(motif)
if min_coordinators < len(motifs):
min_coordinators = len(motifs)
print("min_coordinators has been set to {} due to the motif length".format(len(motifs)))
#Set residues to consider as coordinating in backbone oxygens
if consider_backbone_residues == '[]':
consider_backbone_residues = None
min_sidechain = min_coordinators
elif consider_backbone_residues == 'ALL':
consider_backbone_residues = 'ALL'
else:
consider_backbone_residues = list(map(str, consider_backbone_residues.strip('[]').split(',')))
t0 = time.time()
if not cores_number:
cores_number = psutil.cpu_count(logical=False)
pool = multiprocessing.Pool(cores_number)
#Parse inputfile to obtain atom coords
if file_extension in ALLOWED_FILE_TYPES:
with open(inputfile, "r") as f:
lines = f.read().splitlines()
elif file_extension == "":
try:
response = urllib.request.urlretrieve('http://files.rcsb.org/download/%s.pdb' % (filename), '%s.pdb' % (filename))
file_extension = ".pdb"
with open(filename + file_extension, "r") as f:
lines = f.read().splitlines()
except:
raise Exception("The input should be a valid PDB code or a file of a valid type: " + str(ALLOWED_FILE_TYPES))
else:
raise Exception("The input should be a valid PDB code or a file of a valid type: " + str(ALLOWED_FILE_TYPES))
centroid, radius, alphas, betas, carbons, nitrogens, oxygens, column_for_res, res_for_column, name_for_res, atoms_in_res, side_chains = _parse_molecule(lines, file_extension)
#Alpha-Beta and Oxygen-Carbon distances for further use
alpha_beta_distances = np.sqrt((np.square(betas-alphas).sum(axis=1)))
oxygen_carbon_distances = np.sqrt((np.square(carbons-oxygens).sum(axis=1)))
#Construct grid
if custom_center and custom_radius:
centroid = np.array(list(map(float, custom_center.strip('[]').split(','))))
radius = custom_radius
grid = _grid(centroid, radius, grid_step)
#Split grid in chunks (to ensure a good use of the memory/processors)
n = _chunk_size(len(grid), len(alphas), cores_number)
grids = [grid[i * n:(i + 1) * n] for i in range((len(grid) + n - 1) // n )]
coordination_chunks = pool.map(partial(_test_chunk, alphas=alphas,
betas=betas, carbons=carbons, nitrogens=nitrogens,
oxygens=oxygens, side_chains=side_chains,
alpha_beta_distances=alpha_beta_distances,
oxygen_carbon_distances=oxygen_carbon_distances,
name_for_res=name_for_res,
column_for_res=column_for_res,
res_for_column=res_for_column,
atoms_in_res=atoms_in_res,
consider_backbone_residues=consider_backbone_residues,
DIST_PROBE_ALPHA=DIST_PROBE_ALPHA,
DIST_PROBE_BETA=DIST_PROBE_BETA,
ANGLE_PAB=ANGLE_PAB,
bck_clashes=backbone_clashes_threshold,
sc_clashes=sidechain_clashes_threshold), grids)
centers, mutations = clustering(coordination_chunks, residues, motifs, min_coordinators, min_sidechain,
consider_backbone_residues, mutated_motif, cluster_cutoff, filename, name_for_res, res_for_column, atoms_in_res)
#Print results
sorted_data = sorted(centers, key=lambda x: x[2], reverse=True)
if motif:
file_name_addendum = "_" + motif.replace("[", "").replace("]", "").replace(",", "_").replace("/", "-")
else:
file_name_addendum = ""
if pdb:
pdb_filename = "probes_%s%s.pdb" %(os.path.basename(filename), file_name_addendum)
pdb_filename = os.path.join(os.path.dirname(inputfile), pdb_filename)
_print_pdb(sorted_data, pdb_filename)
mutations_width, radius_width, coord_width, probes_width, residues_width, pos_width = len('Proposed mutations'), len('Radius search'), len('Coordinates of center'), len('Num. probes'), len('Coordinating residues'), 1
lines=[]
for pos, one_center in enumerate(sorted_data,1):
residues = []
for res in one_center[0]:
if not "_BCK" in res:
residues.append(str(name_for_res[res]) + ":" + res)
else:
res_bck = res.split("_")[0]
residues.append(str(name_for_res[res_bck]) + ":" + res_bck + "_BCK")
continue
residues_str = ' '.join(res for res in residues)
coord_str = ' '.join([str(format(r, '.3f')) for r in one_center[1]])
pos_str, radius_str, probes_str = str(pos), str(format(one_center[3], '.3f')), str(one_center[2])
if propose_mutations_to:
mutations_str = ' '.join([str(res) + ":" + str(mut) for (res,mut) in mutations[one_center[0]]])
if len(mutations_str) > mutations_width:
mutations_width = len(mutations_str)
else:
mutations_str = ''
if len(pos_str) > pos_width:
pos_width = len(pos_str)
if len(coord_str) > coord_width:
coord_width = len(coord_str)
if len(probes_str) > probes_width:
probes_width = len(probes_str)
if len(residues_str) > residues_width:
residues_width = len(residues_str)
if len(radius_str) > radius_width:
radius_width = len(radius_str)
lines.append((pos_str, residues_str, coord_str, probes_str, radius_str, mutations_str))
print(' {:>{pos_width}} | {:^{residues_width}} | {:{coord_width}} | {:{probes_width}} | {:{radius_width}} | {:{mutations_width}}'.format(
'#', 'Coordinating residues', 'Coordinates of center', 'Num. probes', 'Radius search' , 'Proposed mutations', pos_width=pos_width, residues_width=residues_width, coord_width=coord_width,
probes_width=probes_width, radius_width=radius_width, mutations_width=mutations_width))
print('-{}-+-{}-+-{}-+-{}-+-{}-+-{}-'.format('-'*pos_width, '-'*residues_width, '-'*coord_width, '-'*probes_width, '-'*radius_width, '-'*mutations_width))
for line in lines:
print(' {:>{pos_width}} | {:<{residues_width}} | {:^{coord_width}} | {:>{probes_width}} | {:<{radius_width}} | {:<{mutations_width}} '.format(
line[0], line[1], line[2], line[3], line[4], line[5], pos_width=pos_width, residues_width=residues_width,
coord_width=coord_width, probes_width=probes_width, radius_width=radius_width, mutations_width=mutations_width))
text_filename = "results_biometall_%s%s.txt" %(os.path.basename(filename), file_name_addendum)
text_filename = os.path.join(os.path.dirname(inputfile), text_filename)
f = open(text_filename, "w")
str_header = "*****BioMetAll {}".format(__version__)
f.write(str_header)
f.write('\n')
f.write(cmd_str)
f.write('\n')
f.write(' {:>{pos_width}} | {:^{residues_width}} | {:{coord_width}} | {:{probes_width}} | {:{radius_width}} | {:{mutations_width}} \n'.format(
'#', 'Coordinating residues', 'Coordinates of center', 'Num. probes', 'Radius search' , 'Proposed mutations', pos_width=pos_width, residues_width=residues_width, coord_width=coord_width,
probes_width=probes_width, radius_width=radius_width, mutations_width=mutations_width))
f.write('-{}-+-{}-+-{}-+-{}-+-{}-+-{}-\n'.format('-'*pos_width, '-'*residues_width, '-'*coord_width, '-'*probes_width, '-'*radius_width, '-'*mutations_width ))
for line in lines:
f.write(' {:>{pos_width}} | {:<{residues_width}} | {:^{coord_width}} | {:>{probes_width}} | {:<{radius_width}} | {:<{mutations_width}} \n'.format(
line[0], line[1], line[2], line[3], line[4], line[5], pos_width=pos_width, residues_width=residues_width,
coord_width=coord_width, probes_width=probes_width, radius_width=radius_width, mutations_width=mutations_width))
f.write("*****Calculation took {0:.2f} seconds".format(time.time() - t0))
f.close()
print("{0:.2f} seconds".format(time.time() - t0))
def clustering(coordination_chunks, residues, motifs, min_coordinators, min_sidechain,
consider_backbone_residues, mutated_motif, cluster_cutoff, filename,
name_for_res, res_for_column, atoms_in_res):
dict_cluster = {}
dict_mutations = {}
centers = []
if motifs:
motif_possibilities = list(itertools.product(*motifs))
if mutated_motif:
mutated_motif_possibilities = list(itertools.product(*mutated_motif))
residues_of_mutated_motif = set(x for l in mutated_motif for x in l)
for probes, coordinations, discarded in coordination_chunks:
coordinators = {}
for possible_coord_name in residues: #Sidechain coordinations
for probe_idx, res_idx in coordinations[possible_coord_name][0,:,:]:
if not mutated_motif and (possible_coord_name != name_for_res[res_for_column[res_idx]]): #discarded due to different residue name
continue
if probe_idx in discarded: #residue is discarded for that probe
continue
elif ('CA' not in atoms_in_res[res_for_column[res_idx]]) or ('CB' not in atoms_in_res[res_for_column[res_idx]]):
continue
else: #coordination is valid for that probe and residue
if probe_idx not in list(coordinators):
coordinators[probe_idx] = {res_idx: [possible_coord_name]}
else:
if res_idx not in list(coordinators[probe_idx]):
coordinators[probe_idx][res_idx] = [possible_coord_name]
else:
coordinators[probe_idx][res_idx].append(possible_coord_name)
if consider_backbone_residues:
for probe_idx, res_idx in coordinations["BCK"][0,:,:]:
if probe_idx in discarded:
continue
if (name_for_res[res_for_column[res_idx]] not in consider_backbone_residues) and consider_backbone_residues != "ALL": #discarded due to not searched residue
continue
else: #coordination is valid for that probe and residue
if probe_idx not in list(coordinators):
coordinators[probe_idx] = {res_idx: [name_for_res[res_for_column[res_idx]] + "_BCK"]}
else:
if res_idx not in list(coordinators[probe_idx]):
coordinators[probe_idx][res_idx] = [name_for_res[res_for_column[res_idx]] + "_BCK"]
else:
coordinators[probe_idx][res_idx].append(name_for_res[res_for_column[res_idx]] + "_BCK")
for probe_idx in list(coordinators):
if motifs:
#A minimum motif should be accomplished without mutations (contained in motifs/motif_possibilities)
actual_motif_solutions = _check_actual_motif(motif_possibilities, coordinators[probe_idx], name_for_res, res_for_column)
if mutated_motif:
#The resting coordinators of each solution should be mutable to complete the propose_mutations_to motif
mutation_motif_solutions = {}
for actual_motif_solution in actual_motif_solutions:
resting_coordinators = copy.deepcopy(coordinators[probe_idx])
for r in actual_motif_solution:
del resting_coordinators[r]
mutation_motif_solutions = _check_possible_mutations(mutated_motif_possibilities, residues_of_mutated_motif, resting_coordinators)
if motifs and mutated_motif and actual_motif_solutions and mutation_motif_solutions:
#Searching for already present motifs plus proposing mutations
#already present motifs
for actual_motif_solution in actual_motif_solutions:
item = [probes[probe_idx], tuple(sorted([res_for_column[c] for c in actual_motif_solution]))]
if item[1] not in list(dict_cluster):
dict_cluster[item[1]] = [item[0]]
else:
dict_cluster[item[1]] += [item[0]]
#Mutation information
for m in list(mutation_motif_solutions):
if item[1] not in dict_mutations:
dict_mutations[item[1]] = {res_for_column[m]: [[el, 1] for el in mutation_motif_solutions[m]]}
else:
if res_for_column[m] not in dict_mutations[item[1]]:
dict_mutations[item[1]][res_for_column[m]] = [[el, 1] for el in mutation_motif_solutions[m]]
else:
for el1 in mutation_motif_solutions[m]:
if el1 not in [el2[0] for el2 in dict_mutations[item[1]][res_for_column[m]]]:
dict_mutations[item[1]][res_for_column[m]].append([el1, 1])
else:
for el2 in dict_mutations[item[1]][res_for_column[m]]:
if el1 == el2[0]:
el2[1] += 1
break
elif motifs and not mutated_motif:
#Searching for already present motifs
for actual_motif_solution in actual_motif_solutions:
item = [probes[probe_idx], tuple(sorted([res_for_column[c] for c in actual_motif_solution]))]
if item[1] not in list(dict_cluster):
dict_cluster[item[1]] = [item[0]]
else:
dict_cluster[item[1]] += [item[0]]
elif not motifs and not mutated_motif:
#Searching number of coordinators
sidechain_coord = []
bck_coord = []
for item in list(coordinators[probe_idx]):
actual_residue_name = name_for_res[res_for_column[item]]
if actual_residue_name in coordinators[probe_idx][item]:
sidechain_coord.append(res_for_column[item])
if actual_residue_name + "_BCK" in coordinators[probe_idx][item]:
bck_coord.append(str(res_for_column[item]) + "_BCK")
if ((len(sidechain_coord) + len(bck_coord)) >= min_coordinators) and (len(sidechain_coord) >= min_sidechain):
c = tuple(sorted(sidechain_coord + bck_coord))
for coord_environment in list(dict_cluster):
if set(c).issuperset(coord_environment):
dict_cluster[coord_environment] += [probes[probe_idx]]
coordination_possibilities = []
for i in range(min_coordinators, len(c)+1):
coordination_possibilities += list(itertools.combinations(c,i))
coordination_possibilities = [tuple(sorted(el)) for el in coordination_possibilities]
for el in coordination_possibilities:
sc_num = sum("BCK" not in L for L in el)
if (el not in dict_cluster) and sc_num >= min_sidechain:
dict_cluster[el] = [probes[probe_idx]]
#Order Mutation information
for coord_environment in list(dict_mutations):
for residue in list(dict_mutations[coord_environment]):
dict_mutations[coord_environment][residue].sort(key = lambda x: x[1], reverse=True) #order every residue by number of probes
for coord_environment in list(dict_mutations):
dict_mutations[coord_environment] = sorted(dict_mutations[coord_environment].items(), key=lambda item: item[1][0][1], reverse=True)
try:
max_probes = max([len(v) for v in dict_cluster.values()])
except:
print("None possible coordinating sites have been found. Try again with other parameters or check/change the input file.")
sys.exit()
for coord_residues,probes in dict_cluster.items():
if len(probes) >= max_probes*cluster_cutoff:
center, radius_search = _calculate_center_and_radius(probes)
centers.append((coord_residues, center, len(probes), radius_search, probes))
return centers, dict_mutations
def _test_chunk(grid, alphas, betas, carbons, nitrogens, oxygens, side_chains,
alpha_beta_distances, oxygen_carbon_distances,
name_for_res, column_for_res, res_for_column,
atoms_in_res, consider_backbone_residues,
DIST_PROBE_ALPHA, DIST_PROBE_BETA, ANGLE_PAB,
bck_clashes, sc_clashes):
alpha_distances = np.sqrt((np.square(grid[:,np.newaxis]-alphas).sum(axis=2)))
beta_distances = np.sqrt((np.square(grid[:,np.newaxis]-betas).sum(axis=2)))
carbon_distances = np.sqrt((np.square(grid[:,np.newaxis]-carbons).sum(axis=2)))
nitrogen_distances = np.sqrt((np.square(grid[:,np.newaxis]-nitrogens).sum(axis=2)))
oxygen_distances = np.sqrt((np.square(grid[:,np.newaxis]-oxygens).sum(axis=2)))
PAB_angles = np.arccos((np.square(alpha_distances) + np.square(alpha_beta_distances) - np.square(beta_distances)) / (2*alpha_distances*alpha_beta_distances))
POC_angles = np.arccos((np.square(oxygen_distances) + np.square(oxygen_carbon_distances) - np.square(carbon_distances)) / (2*oxygen_distances*oxygen_carbon_distances))
coords = {}
# include backbone coordinations
if consider_backbone_residues:
coords["BCK"] = np.dstack(np.where((DIST_PROBE_OXYGEN[0]<=oxygen_distances) & (oxygen_distances<=DIST_PROBE_OXYGEN[1]) &
(ANGLE_POC[0]<=POC_angles) & (POC_angles<=ANGLE_POC[1])))
# include sidechain coordinations (Alpha+Beta)
for res_name in list(DIST_PROBE_ALPHA):
coords[res_name] = np.dstack(np.where((DIST_PROBE_ALPHA[res_name][0]<=alpha_distances) & (alpha_distances<=DIST_PROBE_ALPHA[res_name][1]) &
(DIST_PROBE_BETA[res_name][0]<=beta_distances) & (beta_distances<=DIST_PROBE_BETA[res_name][1]) &
(ANGLE_PAB[res_name][0]<=PAB_angles) & (PAB_angles<=ANGLE_PAB[res_name][1])))
# If there is a clash (distance < bck_clashes) with a backbone atom,
# no coordination is possible for that probe
discarded = set(np.where((oxygen_distances<bck_clashes) |
(carbon_distances<bck_clashes) |
(nitrogen_distances<bck_clashes) |
(alpha_distances<bck_clashes))[0])
if sc_clashes > 0:
for sc_atom in side_chains:
sidechain_distances = np.sqrt((np.square(grid[:,np.newaxis]-sc_atom).sum(axis=2)))
discarded = discarded.union(set(np.where(sidechain_distances<sc_clashes)[0]))
return [grid, coords, discarded]