Commit 67b3b92c authored by Martino Bertoni's avatar Martino Bertoni 🌋
Browse files

initial commit

parent 3fc3a406
# 2019-08-29
* Added local module support
# 2019-08-29
* Added H5 support
# 2019-08-23
* Initial commit
# Contributing
## Adding features or fixing bugs
* Fork the repo
* Check out a feature or bug branch
* Add your changes
* Update README when needed
* Submit a pull request to upstream repo
* Add description of your changes
* Ensure tests are passing
* Ensure branch is mergeable
## Testing
* Please make sure tests pass with `./script/test`
# Signaturizer
Generate Chemical Checker signatures from molecule SMILES strings.
# Install
pip install signaturizer
# Example
from signaturizer import Signaturizer
# load the bioactivity space predictor
sign = Signaturizer('A1')
# prepare a list of SMILES strings
smiles = ['C', 'CCC']
# run prediction
results = sign.predict(smiles)
# or save results as H5 file
results = sign.predict(smiles, 'destination.h5')
\ No newline at end of file
norecursedirs = venv* .*
addopts =
-r fEsxXw
#!/usr/bin/env bash
# Test app
pip install -r requirements.txt
# vim: ft=sh:
import setuptools
from setuptools import find_packages
from signaturizer.version import Version
description='Generate Chemical Checker signatures from molecules SMILES.',
author='Martino Bertoni',
license='MIT License',
keywords='signaturizer package',
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
from .signaturizer import Signaturizer, SignaturizerResult
\ No newline at end of file
# using the module
import os
import h5py
import shutil
import tempfile
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
from rdkit import Chem
from rdkit.Chem import AllChem
except ImportError:
raise ImportError("requires RDKit " +
class Signaturizer():
"""Class loading TF-hub module and performing predictions."""
def __init__(self, model_name, verbose=True, compressed=True,
"""Initialize the Signaturizer.
model_name(str): The model name, i.e. the bioactivity space of
interest (e.g. "A1")
cc_url(str): The ChemicalChecker getModel API URL.
# Model url
model_url = cc_url + model_name
if compressed:
model_url += '.tar.gz'
self.verbose = verbose
# load Module locally
self.module = hub.Module(model_url, tags=['serve'])
def _export_model_as_module(saved_model_path, model_destination):
"""Export tensorflow SavedModel to the TF-hub module format."""
# Create ModuleSpec
spec = hub.create_module_spec_from_saved_model(
saved_model_path, drop_collections=['saved_model_train_op'])
# Initialize Graph and export session to temporary folder
tmp_path = tempfile.mkdtemp()
with tf.Graph().as_default():
module = hub.Module(spec, tags=['serve'])
with tf.Session() as session:
module.export(tmp_path, session)
# compress the exported files
os.system("tar -cz -f %s --owner=0 --group=0 -C %s ." %
(model_destination, tmp_path))
# clean temporary folder
def predict(self, smiles, destination=None, chunk_size=1000):
"""Predict signatures for given SMILES.
smiles(list): A list of SMILES strings.
chunk_size(int): Perform prediction on chunks of this size.
destination(str): Path to H5 file where prediction results will
be saved.
results: `SignaturizerResult` class.
# Init TF session
with tf.Session() as session:
# Init Graph variables
# Prepare result object
results = SignaturizerResult(len(smiles), destination)
# predict by chunk
all_chunks = range(0, len(smiles), chunk_size)
for i in tqdm(all_chunks, disable=self.verbose):
chunk = slice(i, i + chunk_size)
sign0s = list()
failed = list()
for idx, mol_smiles in enumerate(smiles[chunk]):
# read SMILES as molecules
mol = Chem.MolFromSmiles(mol_smiles)
if mol is None:
raise Exception("Cannot get molecule from smiles.")
info = {}
fp = AllChem.GetMorganFingerprintAsBitVect(
mol, 2, nBits=2048, bitInfo=info)
bin_s0 = [fp.GetBit(i) for i in range(fp.GetNumBits())]
calc_s0 = np.array(bin_s0).astype(np.float32)
except Exception as err:
# in case of failure save idx to later append NaNs
print("SKIPPING %s: %s", mol_smiles, str(err))
calc_s0 = np.full((2048, ), np.nan)
# stack input fingerprints and run predictor
sign0s = np.vstack(sign0s)
pred = self.module(sign0s, signature='predict', as_dict=True)
preds =['predictions']
# add NaN where SMILES conversion failed
if failed:
preds[np.array(failed)] = np.full((131, ), np.nan)
# save chunk to results dictionary
results.signature[chunk] = preds[:, :128]
results.stddev_norm[chunk] = preds[:, 128]
results.intensity_norm[chunk] = preds[:, 129]
results.confidence[chunk] = preds[:, 130]
return results
class SignaturizerResult():
"""Class storing result of the prediction.
Results are stored in the following numpy vectors:
signatures: 128 float32 defining the molecule signature.
stddev_norm: standard deviation of the signature.
intensity_norm: intensity of the consensus.
confidence: signature confidence.
If a destination is specified the results are saved in an H5 file with
the same vectors available as H5 datasets.
def __init__(self, size, destination):
"""Initialize the result containers.
size(int): The number of molecules being signaturized.
destination(str): Path to H5 file where prediction results will
be saved.
self.dst = destination
if self.dst is None:
# simply numpy arrays
self.h5 = None
self.signature = np.zeros((size, 128), dtype=np.float32)
self.stddev_norm = np.zeros((size, ), dtype=np.float32)
self.intensity_norm = np.zeros((size, ), dtype=np.float32)
self.confidence = np.zeros((size, ), dtype=np.float32)
# check if the file exists already
if os.path.isfile(self.dst):
print('H5 file %s exists, opening in read-only.' % self.dst)
# this avoids overwriting by mistake
self.h5 = h5py.File(self.dst, 'r')
# create the datasets
self.h5 = h5py.File(self.dst, 'w')
'signature', (size, 128), dtype=np.float32)
'stddev_norm', (size, ), dtype=np.float32)
'intensity_norm', (size, ), dtype=np.float32)
'confidence', (size, ), dtype=np.float32)
# expose the datasets
self.signature = self.h5['signature']
self.stddev_norm = self.h5['stddev_norm']
self.intensity_norm = self.h5['intensity_norm']
self.confidence = self.h5['confidence']
def close(self):
if self.h5 is None:
# leave it open for reading
self.h5 = h5py.File(self.dst, 'r')
# expose the datasets
self.signature = self.h5['signature']
self.stddev_norm = self.h5['stddev_norm']
self.intensity_norm = self.h5['intensity_norm']
self.confidence = self.h5['confidence']
class Version(object):
    """Immutable holder for the package version.

    The value passed to the constructor is stored in the ``number``
    attribute. Any later attempt to set or delete an attribute on the
    instance raises ``TypeError``.
    """

    def __init__(self, num):
        # The class-level ``__setattr__`` rejects every assignment, so the
        # initial value has to be written through the parent implementation.
        super(Version, self).__setattr__('number', num)

    def __setattr__(self, *_ignored):
        raise TypeError("can't modify immutable instance")

    # Deleting an attribute is refused in exactly the same way as setting one.
    __delattr__ = __setattr__
import pytest
from tests.tests_helper import *
import unittest
from tests.helpers import *
__all__ = [
from .my_helper import MyHelper
__all__ = [
from datetime import datetime
class MyHelper():
def days_ago(cls, d):
return ( - d).days
import tests.helpers
def list_has(value, lst):
    """Return whether any element of ``lst`` compares equal to ``value``.

    Args:
        value: The item to look for.
        lst: An iterable of candidate items.

    Returns:
        bool: True when at least one element equals ``value``; False
        otherwise, including when ``lst`` is empty.
    """
    # any() stops at the first match instead of scanning the remaining items,
    # and keeps the same ``==`` comparison as the explicit loop it replaces.
    return any(val == value for val in lst)
from datetime import date
from tests import *
from tests.helpers import *
class TestExample(unittest.TestCase):
def test_helper(self):
self.assertEqual(MyHelper.days_ago(, 0)
def test_other_helper(self):
assert tests_helper.list_has(3, [1, 2, 3])
def test_pass(self):
self.assertEqual(True, True)
def test_should_fail(self):
self.assertEqual(False, True)
from packagename.version import Version
from tests import *
from tests.helpers import *
class TestVersion(unittest.TestCase):
    """Checks for the ``Version`` value object."""

    def test_set_version(self):
        # The constructor argument is exposed through ``number``.
        version = Version("1.0.0")
        self.assertEqual(version.number, "1.0.0")

    def test_version_immutable(self):
        # Re-assigning ``number`` after construction must raise TypeError.
        version = Version("1.0.0")
        with pytest.raises(TypeError):
            version.number = "1.1.0"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment