Commit 47fced57 authored by nsoler

progress on predicting sign1 for A spaces

parent 54b94867
Pipeline #2380 failed with stages in 2 minutes and 3 seconds
......@@ -59,9 +59,11 @@ from chemicalchecker.database import Dataset, Molecule
from chemicalchecker.database.database import test_connection
from chemicalchecker.util import logged, Config
from chemicalchecker.util.decorator import cached_property
from chemicalchecker.util.models import import_models # import models for predicting sign1, sign2
from chemicalchecker.util.models import import_models, import_sign0_features # import models for predicting sign1, sign2
CURRENT_VERSION = '2020_01'
@logged
class ChemicalChecker():
"""ChemicalChecker class."""
......@@ -468,7 +470,7 @@ class ChemicalChecker():
from chemicalchecker.core.diagnostics import Diagnosis
return Diagnosis(self, sign, **kwargs)
def import_models_for_prediction(self,sign,version='2020_01'):
def import_models_for_prediction(self, sign, version=CURRENT_VERSION):
"""
Copy the model files stored in chemicalchecker.util.models into sign.model_path
so that the predict functions can be used.
......@@ -479,6 +481,17 @@ class ChemicalChecker():
"""
import_models(sign, version=version)
def import_features_sign0(self, sign, version=CURRENT_VERSION):
"""
Import the sign0 features (available for spaces A1 to A5 under util/models in the code).
returns a list of features
returns: a list of sign0 features
"""
self.__log.info("--> Importing saved features from "+sign.cctype+", "+sign.dataset+".")
return import_sign0_features(sign, version=version)
def import_h5(self):
"""Recovers h5 files from a given custom directory.
......
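A minimal usage sketch of the new import_features_sign0 helper; the package-root import, the CC root directory and the dataset code below are placeholders, not taken from this commit:

# Hypothetical usage: recover the sign0 feature names saved under
# util/models for an A space signature.
from chemicalchecker import ChemicalChecker  # assumed package-root import

cc = ChemicalChecker('/tmp/cc_local')  # placeholder CC root directory
s0 = cc.get_signature('sign0', 'full', 'A1.001')
saved_features = cc.import_features_sign0(s0)
print(len(saved_features), "features stored for A1.001")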
......@@ -192,10 +192,10 @@ class sign0(BaseSignature, DataSignature):
self.__log.debug("Processing features")
features = self.process_features(features, X.shape[1])
self.__log.debug("Only keeping idxs of relevance")
self.__log.debug("keys is {}".format(keys))
self.__log.debug("keys_raw is {}".format(keys_raw))
self.__log.debug("idxs is {}".format(idxs))
print("idxs is {}".format(idxs))
#self.__log.debug("keys is {}".format(keys))
#self.__log.debug("keys_raw is {}".format(keys_raw))
#self.__log.debug("idxs is {}".format(idxs))
X = X[idxs]
self.__log.debug("Setting input type")
input_type = "matrix"
......
......@@ -252,7 +252,8 @@ class sign1(BaseSignature, DataSignature):
tmp_path = os.path.join(self.model_path, tag)
#try:
cc = ChemicalChecker(tmp_path)
s1 = cc.signature(self.dataset, "sign1")
#s1 = cc.signature(self.dataset, "sign1")
s1 = cc.get_signature(self.cctype, self.molset, self.dataset) # Nico, experiment
self.copy_sign0_to_sign1(sign0, s1, just_data=True)
self.__log.debug("Reading pipeline")
fn = self.pipeline_file()
......@@ -262,7 +263,9 @@ class sign1(BaseSignature, DataSignature):
self.__log.debug("Scaling if necessary")
if not pipeline["sparse"] and pipeline["scale"]:
mod = self.load_model("scale")
mod.model_path = self.model_path
mod.predict(s1)
self.__log.debug("Transformation")
if pipeline["metric_learning"]:
if pipeline["semisupervised"]:
......@@ -279,8 +282,6 @@ class sign1(BaseSignature, DataSignature):
mod = None
if mod is not None:
mod.model_path = self.model_path # avoid taking the info from pickle in case it is copied
# print("\n SHERLOCK mod.features",len(mod.features), mod.features)
# print("\n SHERLOCK s1.features", s1.dataset,len(s1.features), s1.features)
mod.predict(s1)
self.__log.debug("Prediction done!")
......
from .import_models import import_models
\ No newline at end of file
from .import_models import import_models, import_sign0_features
\ No newline at end of file
......@@ -52,6 +52,13 @@ def import_models(sign_object ,version='2020_01'):
os.symlink(target, symlink)
def import_sign0_features(sign_object, version='2020_01'):
"""
Open the stored sign0 h5 feature file and return the list of features.
Arguments:
sign_object : signature 0 object
version (str): version of the cc_repo stored in package/chemicalchecker/util/models
"""
cctype = sign_object.cctype
dataset = sign_object.dataset
......
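The body of import_sign0_features is truncated above; a rough sketch of the kind of h5 lookup it performs, with an assumed directory layout, file name and dataset key, could look like this:

# Illustrative only: the path layout and the 'features' key below are
# assumptions, not the code from the truncated body.
import os
import h5py

def _load_saved_sign0_features(models_dir, dataset, version='2020_01'):
    path = os.path.join(models_dir, version, dataset, 'features_sign0.h5')  # assumed layout
    with h5py.File(path, 'r') as hf:
        feats = hf['features'][:]
    return [f.decode() if isinstance(f, bytes) else f for f in feats]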
......@@ -2,9 +2,12 @@
# Uses the data calculator class to get all A spaces preprocessed data
# Inspired by the cc pipeline
import os, json
import numpy as np
import h5py
import collections
from chemicalchecker.util.parser import DataCalculator
from chemicalchecker.core.signature_data import DataSignature
from chemicalchecker.core.preprocess import Preprocess
from chemicalchecker.util.parser import fetch_features_A
from chemicalchecker.util.parser import Converter
......@@ -175,7 +178,6 @@ class Aspaces_prop_calculator(object):
def create_h5(self):
print("Retrieving InChI strings from the list of input InChIkeys")
for i, (inchikey, inchi) in enumerate(self.dict_inchikey_inchi.items()):
......@@ -256,20 +258,76 @@ class Aspaces_prop_calculator(object):
# dict space: path to raw file
return outputfiles
# The next three functions are adapted from the Sanitizer
def chunker(self, n):
size = 2000
for i in range(0, n, size):
yield slice(i, i + size)
def rewrite_matrix_h5(self, data, mask, axis=1, name='V'):
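# Keep only the masked columns (axis=1) or rows (axis=0) of hf[name]: the
# filtered chunks are written to a temporary dataset, which then replaces
# the original so the file is rewritten in place.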
name_tmp = "%s_tmp" % name
with h5py.File(data, "a") as hf:
n = hf[name].shape[0]
create = True
for chunk in self.chunker(n):
if axis == 1:
M_tmp = hf[name][chunk][:, mask]
else:
mask_ = mask[chunk]
M_tmp = hf[name][chunk][mask_]
if create:
hf.create_dataset(name_tmp, data=M_tmp,
maxshape=(None, M_tmp.shape[1]))
create = False
else:
hf[name_tmp].resize(
(hf[name_tmp].shape[0] + M_tmp.shape[0]), axis=0)
hf[name_tmp][-M_tmp.shape[0]:] = M_tmp
del hf[name]
hf[name] = hf[name_tmp]
del hf[name_tmp]
def rewrite_str_array_h5(self, data, mask, name="features"):
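# Apply the same boolean mask to a string dataset (e.g. 'features'),
# rebuilding it with the DataSignature string dtype.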
name_tmp = "%s_tmp" % name
with h5py.File(data, "a") as hf:
array_tmp = hf[name][:][mask]
hf.create_dataset(name_tmp, data=np.array(
array_tmp, DataSignature.string_dtype()))
del hf[name]
hf[name] = hf[name_tmp]
del hf[name_tmp]
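# A toy example of how the two rewrite helpers combine on a sign0-style h5
# file (the 'V'/'features' layout mirrors the datasets used above; the
# values and the file path are made up, and `calc` stands for an already
# constructed Aspaces_prop_calculator):
#
#   import numpy as np
#   import h5py
#
#   with h5py.File('/tmp/toy_sign0.h5', 'w') as hf:
#       hf.create_dataset('V', data=np.arange(12).reshape(3, 4))
#       hf.create_dataset('features', data=np.array(['f1', 'f2', 'f3', 'f4'], dtype='S'))
#
#   keep = np.array([True, False, True, False])            # boolean column mask
#   calc.rewrite_matrix_h5('/tmp/toy_sign0.h5', keep)       # 'V' keeps columns f1 and f3
#   calc.rewrite_str_array_h5('/tmp/toy_sign0.h5', keep)    # 'features' keeps f1 and f3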
def createSign0(self, dict_of_Aspaces_h5, sanitize=False):
"""
Create sign0 from all raw A spaces h5 files created with create_h5_from_inchikeys_inchi
Here we take in a dict of 5 paths to raw data (A1 to A5) and return a cc instance that contains
sign0 for these 5 spaces.
Column filtering: we 'sanitize' (remove features) according to what was done in the CC_repo
for spaces A1 to A5.
"""
# Now creating sign0 for each of the input raw files
for space, fp in dict_of_Aspaces_h5.items():
print("\nCalculating sign0 for space", space)
sign0 = self.cc.get_signature('sign0', 'full',space+'.001')
if not sign0.is_fit():
sign0.fit(data_file=fp,do_triplets=False, overwrite=True,sanitize=sanitize)
else:
print("Sign0 for space", space+'.001', "already fit, nothing to do")
for molset in ('full', 'reference'):
print("\nCalculating sign0", molset, "for space", space)
sign0 = self.cc.get_signature('sign0', molset, space+'.001')
features_from_fit = self.cc.import_features_sign0(sign0)
if not sign0.available():
sign0.fit(data_file=fp, do_triplets=False, overwrite=True, sanitize=sanitize)
else:
print("Sign0", molset, "for space", space+'.001', "already fit, nothing to do")
# Now remove the required features (columns) from the sign0 h5 file
mask = np.isin(sign0.features, features_from_fit)
self.rewrite_matrix_h5(sign0.data_path, mask)
self.rewrite_str_array_h5(sign0.data_path, mask)
# Then we can use this cc instance to predict sign1
return self.cc
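The direction of np.isin matters for the column filter above: the mask is True for the features stored in the sign0 h5 that also appear in the list recovered from the original fit, so only those columns survive. A quick illustration:

import numpy as np

stored = np.array(['f1', 'f2', 'f3', 'f4'])  # sign0.features in the h5 file
from_fit = ['f1', 'f3']                      # features_from_fit
mask = np.isin(stored, from_fit)
# mask -> array([ True, False,  True, False]): columns f1 and f3 are kept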
......@@ -290,14 +348,22 @@ class Aspaces_prop_calculator(object):
for space in self.Aspaces:
assert space+'.001' in dictSpaces, "Sign0 for space {} not fit!!".format(space)
sign0= self.cc.get_signature('sign0', 'full',space+'.001') # already fitted
sign1 = self.cc.get_signature('sign1', 'full',space+'.001') # will get converted to reference by the next fct
sign1.clear()
self.cc.import_models_for_prediction(sign1) # Import model for this space
models_imported = False
for molset in ('full',):
sign0 = self.cc.get_signature('sign0', molset, space+'.001') # already fitted
sign1 = self.cc.get_signature('sign1', molset, space+'.001') # will get converted to reference by the next fct
sign1.clear()
# import models from the fit that took place in CC_repo 2020_XX
if not models_imported:
self.cc.import_models_for_prediction(sign1) # Import the model for this space
models_imported = True
destination = sign1.data_path
if not os.path.exists(destination):
print("\nPredicting sign1", molset, "for space", space, "to", destination)
sign1.predict(sign0, destination=destination)
print("\nPredicting sign1 for space",space)
sign1.predict(sign0)
return self.cc
def predictSign2(self):
......
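Putting the pieces together, a rough driver for the A spaces could look like the sketch below; the constructor argument, the way the raw-file dict is obtained and the predictSign1 call signature are assumptions, not shown in this commit:

# Hypothetical end-to-end driver; every name flagged as assumed below is a guess.
inchikey_inchi = {'RYYVLZVUVIJVGH-UHFFFAOYSA-N': 'InChI=1S/C8H10N4O2/...'}  # caffeine, InChI truncated
calc = Aspaces_prop_calculator(inchikey_inchi)      # assumed constructor argument
raw_files = calc.create_h5()                        # assumed to return {space: raw_h5_path}
cc = calc.createSign0(raw_files, sanitize=False)    # fit sign0 and filter columns to the CC_repo features
cc = calc.predictSign1()                            # assumed call; predicts sign1 with the imported models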