Commit d8d41ee0 authored by Martino Bertoni's avatar Martino Bertoni 🌋
Browse files

fixed module export, unittest ok, added multipred

parent 17e91ef8
Showing with 363 additions and 211 deletions
+363 -211
[pytest]
norecursedirs = venv* .*
addopts =
-r fEsxXw
-vvv
--doctest-modules
--ignore setup.py
--cov-report=term-missing
--cov-report=xml
--cov-report=html
--cov=packagename
import os
import shutil
import tempfile
import tensorflow as tf
import tensorflow_hub as hub
def export_smilespred(smilespred_path, destination,
                      tmp_path=None, clear_tmp=True):
    """Export our Keras Smiles predictor to the TF-hub module format.

    The Keras model is first written as a TensorFlow SavedModel into
    `tmp_path`, which is then converted and compressed by
    `export_savedmodel`.

    Args:
        smilespred_path(str): Path passed to `Smilespred` to load the model.
        destination(str): Path of the compressed module file to create.
        tmp_path(str): Directory receiving the intermediate SavedModel. A
            fresh temporary directory is created when None.
        clear_tmp(bool): Whether to remove `tmp_path` when done (also on
            failure, so that no temporary directory is leaked).
    """
    # heavy / optional dependencies are imported only when exporting
    from keras import backend as K
    from chemicalchecker.tool.smilespred import Smilespred

    if tmp_path is None:
        tmp_path = tempfile.mkdtemp()
    try:
        # save to savedmodel format, in a dedicated graph so that nothing
        # from the caller's default graph ends up in the export
        with tf.Graph().as_default():
            smilespred = Smilespred(smilespred_path)
            smilespred.build_model(load=True)
            model = smilespred.model
            signature = \
                tf.saved_model.signature_def_utils.predict_signature_def(
                    inputs={'default': model.input},
                    outputs={'default': model.output})
            builder = tf.saved_model.builder.SavedModelBuilder(tmp_path)
            builder.add_meta_graph_and_variables(
                sess=K.get_session(),
                tags=['serve'],
                signature_def_map={'serving_default': signature})
            builder.save()
        # now export savedmodel to module
        export_savedmodel(tmp_path, destination)
    finally:
        # clean temporary folder; ignore_errors so that a failure before the
        # directory was written does not mask the original exception
        if clear_tmp:
            shutil.rmtree(tmp_path, ignore_errors=True)
def export_savedmodel(savedmodel_path, destination,
                      tmp_path=None, clear_tmp=True):
    """Export Tensorflow SavedModel to the TF-hub module format.

    Args:
        savedmodel_path(str): Directory of the SavedModel to convert.
        destination(str): Path of the gzip-compressed tar archive to create.
        tmp_path(str): Directory receiving the uncompressed hub module. A
            fresh temporary directory is created when None.
        clear_tmp(bool): Whether to remove `tmp_path` when done (also on
            failure).

    Raises:
        subprocess.CalledProcessError: If the `tar` command fails.
    """
    import subprocess

    if tmp_path is None:
        tmp_path = tempfile.mkdtemp()
    try:
        # save to hub module format
        print('SAVE export_savedmodel')
        with tf.Graph().as_default():
            spec = hub.create_module_spec_from_saved_model(savedmodel_path)
            module = hub.Module(spec, tags=['serve'])
            with tf.Session() as sess:
                sess.run(tf.tables_initializer())
                sess.run(tf.global_variables_initializer())
                module.export(tmp_path, sess)
        print('DONE export_savedmodel')
        # compress the exported files to destination; an argument list (no
        # shell) keeps paths with spaces or metacharacters intact, and
        # check_call surfaces a failing tar instead of silently ignoring it
        subprocess.check_call(
            ['tar', '-cz', '-f', destination, '--owner=0', '--group=0',
             '-C', tmp_path, '.'])
    finally:
        # clean temporary folder
        if clear_tmp:
            shutil.rmtree(tmp_path, ignore_errors=True)
def export_batch(cc, destination_dir, datasets=None):
    """Export the Smiles predictor of several CC datasets as TF-hub modules.

    Args:
        cc: Object providing `datasets_exemplary()` and `signature()`.
        destination_dir(str): Directory where one '<XX>.tar.gz' archive per
            dataset is written, '<XX>' being the first two characters of the
            dataset code.
        datasets(list): Dataset codes to export; when None the list returned
            by `cc.datasets_exemplary()` is used.
    """
    selected = cc.datasets_exemplary() if datasets is None else datasets
    for dataset in selected:
        sign3 = cc.signature(dataset, 'sign3')
        predictor_dir = os.path.join(sign3.model_path, 'smiles_final')
        archive_name = dataset[:2] + '.tar.gz'
        export_smilespred(predictor_dir,
                          os.path.join(destination_dir, archive_name))
# using the module
import os
import h5py
import shutil
import tempfile
import itertools
import numpy as np
from tqdm import tqdm
import tensorflow as tf
......@@ -18,125 +17,92 @@ except ImportError:
class Signaturizer():
    """Class loading TF-hub module and performing predictions."""

    def __init__(self, model_name, verbose=True,
                 base_url="https://dynbench3d.irbbarcelona.org/.well-known/acme-challenge/"):
        """Initialize the Signaturizer.

        Args:
            model_name(str or list): The model to load. Possible values:
                - the model name (the bioactivity space (e.g. "B1") )
                - the model path (the directory containing 'saved_model.pb')
                - a list of models names or paths (e.g. ["B1", "B2", "E5"])
                - 'ALL' to get the stacked bioactivity signatures
            verbose(bool): Whether to show a progress bar while predicting.
            base_url(str): The ChemicalChecker getModel API URL.
        """
        self.verbose = verbose
        if isinstance(model_name, (list, tuple)):
            models = list(model_name)
        elif model_name == 'ALL':
            # itertools.product yields ('A', '1') tuples: join them into the
            # "A1" ... "E5" names expected by os.path.isdir and the URL
            models = [''.join(space)
                      for space in itertools.product("ABCDE", "12345")]
        else:
            models = [model_name]
        # load modules
        self.modules = list()
        self.graph = tf.Graph()
        with self.graph.as_default():
            for model in models:
                if os.path.isdir(model):
                    spec = hub.create_module_spec_from_saved_model(model)
                    module = hub.Module(spec, tags=['serve'])
                else:
                    module = hub.Module(base_url + model, tags=['serve'])
                self.modules.append(module)

    @staticmethod
    def _smiles_to_sign0(mol_smiles):
        """Return the 2048-bit Morgan fingerprint (radius 2) as float32."""
        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            raise Exception("Cannot get molecule from smiles.")
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        bits = [fp.GetBit(pos) for pos in range(fp.GetNumBits())]
        return np.array(bits).astype(np.float32)

    def predict(self, smiles, destination=None, chunk_size=1000):
        """Predict signatures for given SMILES.

        Args:
            smiles(list): List of SMILES strings.
            destination(str): File path where to save predictions.
            chunk_size(int): Perform prediction on chunks of this size.
        Returns:
            results: `SignaturizerResult` class. Each loaded module fills
                128 consecutive columns; rows of SMILES that could not be
                parsed are filled with NaN.
        """
        with self.graph.as_default():
            # apply every module once to a placeholder, so that the graph
            # does not grow with each processed chunk
            sign0_input = tf.placeholder(tf.float32, shape=(None, 2048))
            outputs = [module(sign0_input, signature='serving_default')
                       for module in self.modules]
            with tf.Session() as sess:
                sess.run(tf.tables_initializer())
                sess.run(tf.global_variables_initializer())
                # Prepare result object
                features = len(self.modules) * 128
                results = SignaturizerResult(len(smiles), destination,
                                             features)
                # predict by chunk; the progress bar is shown when verbose
                all_chunks = range(0, len(smiles), chunk_size)
                for start in tqdm(all_chunks, disable=not self.verbose):
                    chunk = slice(start, start + chunk_size)
                    sign0s = list()
                    failed = list()
                    for idx, mol_smiles in enumerate(smiles[chunk]):
                        try:
                            calc_s0 = self._smiles_to_sign0(mol_smiles)
                        except Exception as err:
                            # in case of failure save idx to fill NaNs
                            print("SKIPPING %s: %s" %
                                  (mol_smiles, str(err)))
                            failed.append(idx)
                            calc_s0 = np.full((2048, ), np.nan,
                                              dtype=np.float32)
                        sign0s.append(calc_s0)
                    # stack input fingerprints and run predictors
                    sign0s = np.vstack(sign0s)
                    for mdl_idx, output in enumerate(outputs):
                        preds = sess.run(
                            output, feed_dict={sign0_input: sign0s})
                        # add NaN where SMILES conversion failed
                        if failed:
                            preds[np.array(failed)] = np.full((128, ),
                                                              np.nan)
                        # save chunk to the columns of this module
                        mdl_cols = slice(mdl_idx * 128, (mdl_idx + 1) * 128)
                        results.signature[chunk, mdl_cols] = preds
        results.close()
        return results
......@@ -144,40 +110,37 @@ class Signaturizer():
class SignaturizerResult():
"""Class storing result of the prediction.
Results are stored in the following numpy vectors:
signatures: 128 float32 defining the moleule signature.
stddev_norm: standard deviation of the signature.
intensity_norm: intensity of the consensus.
confidence: signature confidence.
Results are stored in the following numpy vector:
signatures: 128 float32 defining the molecule signature.
If a destination is specified the result are saved in an H5 file with
the same vector available as H5 datasets.
If a destination is specified the result are saved in an HDF5 file with
the same vector available as HDF5 datasets.
"""
def __init__(self, size, destination):
def __init__(self, size, destination, features=128):
"""Initialize the result containers.
Args:
size(int): The number of molecules being signaturized.
destination(str): Path to H5 file where prediction results will
destination(str): Path to HDF5 file where prediction results will
be saved.
"""
self.dst = destination
if self.dst is None:
# simply numpy arrays
# simple numpy arrays
self.h5 = None
self.signature = np.zeros((size, 128), dtype=np.float32)
self.signature = np.zeros((size, features), dtype=np.float32)
else:
# check if the file exists already
if os.path.isfile(self.dst):
print('H5 file %s exists, opening in read-only.' % self.dst)
print('HDF5 file %s exists, opening in read-only.' % self.dst)
# this avoid overwriting by mistake
self.h5 = h5py.File(self.dst, 'r')
else:
# create the datasets
self.h5 = h5py.File(self.dst, 'w')
self.h5.create_dataset(
'signature', (size, 128), dtype=np.float32)
'signature', (size, features), dtype=np.float32)
# expose the datasets
self.signature = self.h5['signature']
......@@ -189,29 +152,3 @@ class SignaturizerResult():
self.h5 = h5py.File(self.dst, 'r')
# expose the datasets
self.signature = self.h5['signature']
# UNIT TEST
# Ad-hoc consistency script: predicts signatures for two SMILES with the
# ChemicalChecker 'sign3' signature, exports the predictor through both
# Signaturizer export helpers and asserts that all predictions agree.
from chemicalchecker import ChemicalChecker
from chemicalchecker.core.signature_data import DataSignature
test_smiles = ['CCC', 'C']
cc = ChemicalChecker()
s3 = cc.signature('B1.001', 'sign3')
s3.predict_from_smiles(test_smiles, './tmp.h5')
# NOTE(review): the prediction above is written to './tmp.h5' while the
# reference is read from './tmp_pred1.h5' - confirm which path is intended.
pred1 = DataSignature('./tmp_pred1.h5')
# NOTE(review): `a` is not used by any of the following statements.
a = Signaturizer('/tmp/moduledir/', compressed=False, local=True)
module_destination = './tmp_dest'
# Keras predictor -> SavedModel (kept in './conv_k2tf') -> compressed module.
# NOTE(review): this reads `s3.module_path`; other code in this file reads
# `s3.model_path` for the same 'smiles_final' folder - confirm the attribute.
Signaturizer._export_smilespred_as_module(
os.path.join(s3.module_path, 'smiles_final'),
module_destination, tmp_path='./conv_k2tf')
module2 = Signaturizer('./conv_k2tf', compressed=False, local=True)
pred2 = module2.predict(test_smiles)
# NOTE(review): the SavedModel exporter is handed the same 'smiles_final'
# path as the Keras exporter above - presumably it should receive a
# SavedModel directory; verify.
Signaturizer._export_savedmodel_as_module(
os.path.join(s3.module_path, 'smiles_final'),
module_destination, tmp_path='./conv_tf2hub')
module3 = Signaturizer('./conv_tf2hub', compressed=False, local=True)
pred3 = module3.predict(test_smiles)
# NOTE(review): these compare a DataSignature with the objects returned by
# `predict` using `==`, not the underlying arrays - confirm this is meaningful.
assert(pred1 == pred2)
assert(pred1 == pred3)
class Version(object):
    """Read-only holder of the package version.

    The value is exposed as the `number` attribute; any later attempt to set
    or delete an attribute raises `TypeError`.
    """

    def __init__(self, num):
        # go through the parent implementation: our own __setattr__ refuses
        # every assignment
        parent = super(Version, self)
        parent.__setattr__('number', num)

    def __setattr__(self, *args):
        message = "can't modify immutable instance"
        raise TypeError(message)

    # deleting an attribute is refused in exactly the same way
    __delattr__ = __setattr__
File added

\ No newline at end of file
File added
File added
File added

\ No newline at end of file
File added
File added
File added
......@@ -11,3 +11,56 @@ def skip_if_import_exception(function):
except ImportError as err:
pytest.skip(str(err))
return wrapper
def start_http_server(redirect=None):
    """Start a background HTTP server and return the port it listens on.

    The server binds an IPv6 socket on an OS-assigned port and runs in a
    daemon thread. Without `redirect` it uses the standard library's simple
    request handler; with `redirect` every GET request is answered with a
    301 response pointing to that location.
    """
    import socket
    import sys
    import threading

    # Pick the server / handler base classes matching the Python version.
    # pylint:disable=g-import-not-at-top
    if sys.version_info[0] == 2:
        import BaseHTTPServer
        import SimpleHTTPServer
        server_base = BaseHTTPServer.HTTPServer
        plain_handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    else:
        import http.server
        import socketserver
        server_base = socketserver.TCPServer
        plain_handler = http.server.SimpleHTTPRequestHandler
    # pylint:disable=g-import-not-at-top

    class IPv6Server(server_base):
        address_family = socket.AF_INET6

    class RedirectHandler(plain_handler):

        def do_GET(self):
            self.send_response(301)
            self.send_header("Location", redirect)
            self.end_headers()

    handler = RedirectHandler if redirect else plain_handler
    server = IPv6Server(("", 0), handler)
    # the second item of the bound address is the port
    server_port = server.server_address[1]

    worker = threading.Thread(target=server.serve_forever)
    worker.daemon = True
    worker.start()
    return server_port
import os
import time
import shutil
import unittest
from .helper import skip_if_import_exception, start_http_server
from signaturizer.exporter import export_smilespred, export_savedmodel
from signaturizer import Signaturizer
class TestSignaturizer(unittest.TestCase):
    """Check that exported TF-hub modules reproduce the original predictor."""

    def setUp(self):
        # path for test data
        test_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_dir = os.path.join(test_dir, 'data')
        self.tmp_dir = os.path.join(test_dir, 'tmp')
        if os.path.exists(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.mkdir(self.tmp_dir)
        # move into the temporary directory before starting the HTTP server,
        # the archives exported there are then fetched by file name
        self.cwd = os.getcwd()
        os.chdir(self.tmp_dir)
        self.server_port = start_http_server()
        self.test_smiles = [
            # Erlotinib
            'COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C)OCCOC',
            # Diphenhydramine
            'CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2'
        ]

    def tearDown(self):
        os.chdir(self.cwd)
        if os.path.exists(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)

    def _assert_same_prediction(self, pred_ref, model, **kwargs):
        """Load `model` in a Signaturizer and compare to the reference."""
        module = Signaturizer(model, **kwargs)
        res = module.predict(self.test_smiles)
        pred = res.signature[:]
        self.assertEqual(pred_ref.tolist(), pred.tolist())

    @skip_if_import_exception
    def test_export_consistency(self):
        """Compare the exported module to the original SMILES predictor.

        N.B. This test is working only with a valid CC instance available.
        """
        from chemicalchecker import ChemicalChecker
        from chemicalchecker.core.signature_data import DataSignature

        # load CC instance and smiles prediction model
        cc = ChemicalChecker()
        s3 = cc.signature('B1.001', 'sign3')
        tmp_pred_ref = os.path.join(self.tmp_dir, 'tmp.h5')
        s3.predict_from_smiles(self.test_smiles, tmp_pred_ref)
        pred_ref = DataSignature(tmp_pred_ref)[:]
        base_url = "http://localhost:%d/" % (self.server_port)

        # export smilespred
        module_file = 'dest_smilespred.tar.gz'
        module_destination = os.path.join(self.tmp_dir, module_file)
        tmp_path_smilespred = os.path.join(self.tmp_dir, 'export_smilespred')
        export_smilespred(
            os.path.join(s3.model_path, 'smiles_final'),
            module_destination, tmp_path=tmp_path_smilespred, clear_tmp=False)
        # test intermediate step
        self._assert_same_prediction(pred_ref, tmp_path_smilespred)
        # test final step
        self._assert_same_prediction(pred_ref, module_file, base_url=base_url)

        # export savedmodel
        module_file = 'dest_savedmodel.tar.gz'
        module_destination = os.path.join(self.tmp_dir, module_file)
        tmp_path_savedmodel = os.path.join(self.tmp_dir, 'export_savedmodel')
        export_savedmodel(
            tmp_path_smilespred, module_destination,
            tmp_path=tmp_path_savedmodel, clear_tmp=False)
        # test intermediate step
        self._assert_same_prediction(pred_ref, tmp_path_savedmodel)
        # test final step: fetch the archive written by export_savedmodel
        # (not the smilespred one exported earlier)
        self._assert_same_prediction(pred_ref, module_file, base_url=base_url)
import os
import math
import pickle
import shutil
import unittest
from helper import skip_if_import_exception
from signaturizer import Signaturizer
......@@ -13,51 +14,60 @@ class TestSignaturizer(unittest.TestCase):
test_dir = os.path.dirname(os.path.realpath(__file__))
self.data_dir = os.path.join(test_dir, 'data')
self.tmp_dir = os.path.join(test_dir, 'tmp')
os.environ["CC_CONFIG"] = os.path.join(self.data_dir, 'config.json')
if os.path.exists(self.tmp_dir):
shutil.rmtree(self.tmp_dir)
os.mkdir(self.tmp_dir)
self.test_smiles = [
# Erlotinib
'COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C)OCCOC',
# Diphenhydramine
'CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2'
]
def tearDown(self):
    """Remove the temporary test directory when it is still present."""
    scratch_dir = self.tmp_dir
    if not os.path.exists(scratch_dir):
        return
    shutil.rmtree(scratch_dir)
def test_predict(self):
    """Check predictions in memory, saved to file and for invalid SMILES."""
    # load reference predictions (fixture shipped with the tests); the
    # context manager closes the file handle once the pickle is read
    ref_file = os.path.join(self.data_dir, 'pred.pkl')
    with open(ref_file, 'rb') as ref_fh:
        pred_ref = pickle.load(ref_fh)
    # load module and predict
    module_dir = os.path.join(self.data_dir, 'B1')
    module = Signaturizer(module_dir)
    res = module.predict(self.test_smiles)
    self.assertEqual(pred_ref.tolist(), res.signature.tolist())
    # test saving to file
    destination = os.path.join(self.tmp_dir, 'pred.h5')
    res = module.predict(self.test_smiles, destination)
    self.assertTrue(os.path.isfile(destination))
    self.assertEqual(pred_ref.tolist(), res.signature[:].tolist())
    # test prediction of invalid SMILES: only the row of the molecule that
    # cannot be parsed ('C&') is expected to be NaN
    res = module.predict(['C', 'C&', 'C'])
    for row, expect_nan in enumerate((False, True, False)):
        for comp in res.signature[row]:
            self.assertEqual(math.isnan(comp), expect_nan)
def test_predict_multi(self):
module_dirs = list()
A1_path = os.path.join(self.data_dir, 'A1')
B1_path = os.path.join(self.data_dir, 'B1')
module_dirs.append(A1_path)
module_dirs.append(B1_path)
module_A1B1 = Signaturizer(module_dirs)
res_A1B1 = module_A1B1.predict(self.test_smiles)
self.assertEqual(res_A1B1.signature.shape[0], 2)
self.assertEqual(res_A1B1.signature.shape[1], 128 * 2)
@skip_if_import_exception
def test_export_consistency(self):
from chemicalchecker import ChemicalChecker
from chemicalchecker.core.signature_data import DataSignature
test_smiles = ['CCC', 'C']
cc = ChemicalChecker()
s3 = cc.signature('B1.001', 'sign3')
tmp_pred_ref = os.path.join(self.tmp_dir, 'tmp.h5')
s3.predict_from_smiles(test_smiles, tmp_pred_ref)
pred_ref = DataSignature(tmp_pred_ref)
# export smilespred
module_destination = os.path.join(self.tmp_dir, 'dest_smilespred')
tmp_path = os.path.join(self.tmp_dir, 'export_smilespred')
Signaturizer._export_smilespred_as_module(
os.path.join(s3.module_path, 'smiles_final'),
module_destination, tmp_path=tmp_path)
# test intermediate step
module = Signaturizer(tmp_path, compressed=False, local=True)
pred = module.predict(test_smiles)
self.assertEqual(pred_ref, pred)
# test final step
module = Signaturizer(module_destination, compressed=True, local=True)
pred = module.predict(test_smiles)
self.assertEqual(pred_ref, pred)
# export savedmodel
module_destination = os.path.join(self.tmp_dir, 'dest_savedmodel')
tmp_path = os.path.join(self.tmp_dir, 'export_savedmodel')
Signaturizer._export_savedmodel_as_module(
os.path.join(s3.module_path, 'smiles_final'),
module_destination, tmp_path=tmp_path)
# test intermediate step
module = Signaturizer(tmp_path, compressed=False, local=True)
pred = module.predict(test_smiles)
self.assertEqual(pred_ref, pred)
# test final step
module = Signaturizer(module_destination, compressed=True, local=True)
pred = module.predict(test_smiles)
self.assertEqual(pred_ref, pred)
module_A1 = Signaturizer(A1_path)
res_A1 = module_A1.predict(self.test_smiles)
self.assertEqual(res_A1B1.signature[:, :128].tolist(),
res_A1.signature.tolist())
module_B1 = Signaturizer(B1_path)
res_B1 = module_B1.predict(self.test_smiles)
self.assertEqual(res_A1B1.signature[:, 128:].tolist(),
res_B1.signature.tolist())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment