Commit 5ea49042 authored by nsoler
Browse files

added restrict_to_universe fct to sign0

parent bac25bda
Pipeline #2137 passed with stages
in 4 minutes and 1 second
......@@ -184,6 +184,7 @@ class ChemicalChecker():
'essential'.
"""
universe = set()
dataset_accepted=[]
for ds in Dataset.get():
if not ds.derived:
print(ds, "not derived")
......@@ -192,10 +193,12 @@ class ChemicalChecker():
print(ds, "not essential")
continue
s0 = self.get_signature('sign0', 'full', ds.code)
dataset_accepted.append(ds.code)
try:
universe.update(s0.unique_keys)
except Exception as ex:
self.__log.warning(str(ex))
print("datasets accepted:", dataset_accepted)
return sorted(list(universe))
@staticmethod
......
......@@ -452,3 +452,30 @@ class sign0(BaseSignature, DataSignature):
features, DataSignature.string_dtype()))
hf.create_dataset("keys_raw", data=np.array(
keys_raw, DataSignature.string_dtype()))
def restrict_to_universe(self):
    """Write a copy of this signature restricted to the universe.

    Nico : 17/09/2020
    - Restricts the keys in the corresponding h5 file to the ones contained
      in the universe, defined as the union of all molecules from the
      bioactivity spaces (B and after).
    - Applicable when the signature belongs to one of the A spaces.

    The filtered file is written as ``sign0_univ.h5`` in the same directory
    as ``self.data_path``; the original file is left untouched.

    Returns:
        None. When ``get_vectors`` reports no match (``None`` keys), no
        file is created.
    """
    cc = self.get_cc()
    universe = cc.universe  # list of inchikeys belonging to the universe

    # Keep only the universe keys that are actually present in this
    # signature (get_vectors intersects the request with our own keys).
    inchk_univ, _ = self.get_vectors(keys=universe)
    if inchk_univ is None:
        # get_vectors can return (None, None) -- presumably when nothing
        # matched; there is nothing meaningful to write in that case.
        print("No key of this signature belongs to the universe, "
              "no filtered file created")
        return

    # Boolean mask over this signature's keys selecting the rows to keep.
    mask = np.isin(self.keys, list(inchk_univ))

    filtered_h5 = os.path.join(
        os.path.dirname(self.data_path), 'sign0_univ.h5')
    print("Creating", filtered_h5)
    # The copy is made from this signature itself; the previous code used
    # an undefined name `s0` here and failed with a NameError.
    self.make_filtered_copy(filtered_h5, mask)
    # After that check that your file is ok and move it to sign0.h5
def restrict_to_universe_hpc(self, *args, **kwargs):
    """Run `restrict_to_universe` through ``self.func_hpc``.

    Every positional and keyword argument is forwarded unchanged to
    ``self.func_hpc`` after the method name, and whatever ``func_hpc``
    returns is handed back to the caller.
    """
    method_name = "restrict_to_universe"
    return self.func_hpc(method_name, *args, **kwargs)
......@@ -339,7 +339,7 @@ class DataSignature(object):
else:
return hf[h5_dataset_name][mask, :]
def get_vectors(self, keys, include_nan=False, dataset_name='V'):
def get_vectors(self, keys, include_nan=False, dataset_name='V', output_missing=False):
"""Get vectors for a list of keys, sorted by default.
Args:
......@@ -350,12 +350,11 @@ class DataSignature(object):
dataset_name(str): return any dataset in the h5 which is organized
by sorted keys.
"""
self.__log.debug("Fetching %s rows from dataset %s" %
(len(keys), dataset_name))
self.__log.debug("Fetching %s rows from dataset %s" %(len(keys), dataset_name))
valid_keys = list(self.unique_keys & set(keys))
idxs = np.argwhere(
np.isin(list(self.keys), list(valid_keys), assume_unique=True))
idxs = np.argwhere(np.isin(list(self.keys), list(valid_keys), assume_unique=True))
inks, signs = list(), list()
with h5py.File(self.data_path, 'r') as hf:
dset = hf[dataset_name]
dset_shape = dset.shape
......@@ -380,6 +379,8 @@ class DataSignature(object):
return None, None
inks, signs = np.stack(inks), np.vstack(signs)
sort_idx = np.argsort(inks)
if output_missing:
return inks[sort_idx], signs[sort_idx], missed_inks
return inks[sort_idx], signs[sort_idx]
def get_vectors_lite(self, keys, chunk_size=2000, chunk_above=10000):
......
......@@ -8,7 +8,7 @@ cc= ChemicalChecker(cc_repo)
# Get the union of molecules for exemplary B spaces and above
universe = cc.universe
spaces_to_filter = ['A1.001', 'A2.001', 'A3.001', 'A4.001', 'A5.001', 'B4.002', 'D2.002']
spaces_to_filter = ['A1.001', 'A2.001', 'A3.001', 'A4.001', 'A5.001']
for space in spaces_to_filter:
......
......@@ -2,7 +2,7 @@
"PATH": {
"CC_ROOT": "'/aloy/web_checker/package_cc/2020_01'",
"CC_DATA": "'/aloy/scratch/sbnb-adm/CC/download/'",
"CC_TMP": "'/aloy/scratch/sbnb-adm/CC/tmp_jobs/'",
"CC_TMP": "'/aloy/scratch/nsoler/CC/tmp_jobs/'",
"CC_REPO": "'/opt/chemical_checker'",
"SINGULARITY_IMAGE": "'/aloy/home/nsoler/CC_related/IMAGE/cc.simg'",
"validation_path": "'/aloy/web_checker/package_cc/validation_sets/'"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment