Commit 470638c0 authored by Martino Bertoni's avatar Martino Bertoni 🌋
Browse files

preprocess function call was missing in case of no input in sign0.fit, added...

The preprocess function call was missing in sign0.fit when no input was given; added the call and relocated the preprocess functions to the Preprocess class as classmethods.
parent 0a1a16a2
......@@ -52,7 +52,6 @@ from glob import glob
from .molkit import Mol
from .data import DataFactory
from .preprocess import Preprocess
from .signature_data import DataSignature
from chemicalchecker.database import Dataset, Molecule
......@@ -406,60 +405,6 @@ class ChemicalChecker():
return None
return DataSignature(data.data_path)
def preprocess(self, sign):
    """Give the path of the preprocessed raw data, generating it if absent.

    A Preprocess object is built from the signature's `signature_path`
    and `dataset`; its `fit` method is only invoked when `is_fit`
    reports that nothing has been preprocessed yet.

    Args:
        sign: signature object (e.g. obtained from cc.get_signature)
    Returns:
        datafile(str): The `data_path` of the Preprocess object,
            ex: os.path.join(self.raw_path, "preprocess.h5")
    """
    preprocessor = Preprocess(sign.signature_path, sign.dataset)
    # Guard clause: nothing to compute when the data is already there.
    if preprocessor.is_fit():
        self.__log.info("Found {}".format(preprocessor.data_path))
        return preprocessor.data_path
    self.__log.info(
        "No preprocessed file found, calling the preprocessing script")
    preprocessor.fit()
    return preprocessor.data_path
def preprocess_predict(self, sign, input_file, destination):
    """Run the preprocessing script 'predict' on a file of raw data.

    The input file is expected to be formatted correctly for the space of
    interest. Both paths are made absolute before being used.

    Args:
        sign: signature object (e.g. obtained from cc.get_signature)
        input_file(str): path to the h5 file containing the data on which
            to apply 'predict'
        destination(str): Path to a .h5 file where the predicted signature
            will be saved. '.h5' is appended when the path does not
            already end with that extension (case-insensitive).
    Returns:
        datafile(str): The absolute path of the h5 file passed to the
            'predict' step as its destination.
    Raises:
        Exception: if `input_file` does not exist.
    """
    input_file = os.path.abspath(input_file)
    destination = os.path.abspath(destination)
    # Checking the provided paths
    if not os.path.exists(input_file):
        raise Exception("Error, {} does not exist!".format(input_file))
    # Test the full '.h5' suffix, dot included: checking only the last two
    # characters would wrongly accept names such as 'graph5'.
    if not destination.lower().endswith('.h5'):
        destination += '.h5'
    prepro = Preprocess(sign.signature_path, sign.dataset)
    prepro.predict(input_file, destination)
    return destination
def signature(self, dataset, cctype):
return self.get_signature(cctype=cctype, molset="full",
......
......@@ -161,6 +161,64 @@ class Preprocess():
result.append(dict(zip(keys, values)))
return result
@classmethod
def preprocess(cls, sign):
    """Give the path of the preprocessed raw data, generating it if absent.

    An instance of this class is built from the signature's
    `signature_path` and `dataset`; its `fit` method is only invoked when
    `is_fit` reports that nothing has been preprocessed yet.

    Args:
        sign: signature object (e.g. obtained from cc.get_signature)
    Returns:
        datafile(str): The `data_path` of the instance that was built.
            ex:
            os.path.join(self.raw_path, "preprocess.h5")
    """
    preprocessor = cls(sign.signature_path, sign.dataset)
    # Guard clause: nothing to compute when the data is already there.
    if preprocessor.is_fit():
        cls.__log.info("Found {}".format(preprocessor.data_path))
        return preprocessor.data_path
    cls.__log.info(
        "No preprocessed file found, calling the preprocessing script")
    preprocessor.fit()
    return preprocessor.data_path
@classmethod
def preprocess_predict(cls, sign, input_file, destination):
    """Run the preprocessing script 'predict' on a file of raw data.

    The input file is expected to be formatted correctly for the space of
    interest. Both paths are made absolute before being used.

    Args:
        sign: signature object (e.g. obtained from cc.get_signature)
        input_file(str): path to the h5 file containing the data on which
            to apply 'predict'
        destination(str): Path to a .h5 file where the predicted signature
            will be saved. '.h5' is appended when the path does not
            already end with that extension (case-insensitive).
    Returns:
        datafile(str): The absolute path of the h5 file passed to the
            'predict' step as its destination.
    Raises:
        Exception: if `input_file` does not exist.
    """
    input_file = os.path.abspath(input_file)
    destination = os.path.abspath(destination)
    # Checking the provided paths
    if not os.path.exists(input_file):
        raise Exception("Error, {} does not exist!".format(input_file))
    # Test the full '.h5' suffix, dot included: checking only the last two
    # characters would wrongly accept names such as 'graph5'.
    if not destination.lower().endswith('.h5'):
        destination += '.h5'
    prepro = cls(sign.signature_path, sign.dataset)
    prepro.predict(input_file, destination)
    return destination
@staticmethod
def get_parser():
description = 'Run preprocess script.'
......
......@@ -13,6 +13,7 @@ import numpy as np
from .signature_data import DataSignature
from .signature_base import BaseSignature
from .preprocess import Preprocess
from chemicalchecker.util import logged
from chemicalchecker.util.sanitize import Sanitizer
......@@ -305,6 +306,10 @@ class sign0(BaseSignature, DataSignature):
BaseSignature.fit(self, **params)
self.clear()
self.update_status("Getting data")
if pairs is None and X is None and data_file is None:
self.__log.debug("Runnning preprocess")
data_file = Preprocess.preprocess(self)
self.__log.debug("data_file is {}".format(data_file))
res = self.get_data(pairs=pairs, X=X, keys=keys, features=features,
data_file=data_file, key_type=key_type,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment