Commit a77b6e64 authored by nsoler's avatar nsoler
Browse files

export_feature function for sign0

parent dd506a3c
Pipeline #2377 failed with stages
in 1 minute and 59 seconds
......@@ -510,3 +510,8 @@ class sign0(BaseSignature, DataSignature):
# After that check that your file is ok and move it to sign0.h5
self.__log.debug("Done")
def export_features(self, destination=None):
    """Export this signature's feature names to an HDF5 file.

    Writes a single ``features`` dataset (string dtype) into the file
    ``features_sign0_<dataset>.h5`` inside *destination*.

    Args:
        destination (str, optional): output directory. Defaults to
            ``self.model_path``. NOTE: the original signature used
            ``destination=self.model_path`` as the default, which raises
            NameError at class-definition time (``self`` is not in scope
            in a default expression); a ``None`` sentinel is the correct
            idiom.
    """
    if destination is None:
        destination = self.model_path
    features = self.features
    out_fn = os.path.join(destination, "features_sign0_" + self.dataset + ".h5")
    with h5py.File(out_fn, 'w') as hf_out:
        hf_out.create_dataset(
            "features", data=np.array(features, DataSignature.string_dtype()))
......@@ -118,11 +118,14 @@ class sign1(BaseSignature, DataSignature):
return fn
def load_model(self, name):
    """Unpickle and return the model called *name*.

    The pickle is looked up as ``<name>.pkl`` inside the model path of
    the ``reference`` molset of this signature.
    """
    model_dir = self.get_molset("reference").model_path
    fn = os.path.join(model_dir, "%s.pkl" % name)
    with open(fn, "rb") as fh:
        mod = pickle.load(fh)
        self.__log.debug("\n----> Loading model:" + fn)
    return mod
def delete_tmp(self, s1):
......@@ -275,10 +278,11 @@ class sign1(BaseSignature, DataSignature):
else:
mod = None
if mod is not None:
mod.model_path = self.model_path # avoid taking the info from pickle
print(" \n SHERLOCK s1.model_path",s1.model_path)
print(" \n SHERLOCK mod.model_path",mod.model_path)
mod.model_path = self.model_path # avoid taking the info from pickle in case it is copied
# print("\n SHERLOCK mod.features",len(mod.features), mod.features)
# print("\n SHERLOCK s1.features", s1.dataset,len(s1.features), s1.features)
mod.predict(s1)
self.__log.debug("Prediction done!")
if destination is None:
self.__log.debug("Returning a V, keys dictionary")
......
......@@ -710,3 +710,8 @@ class DataSignature(object):
beg_idx, end_idx = beg_idx + batch_size, end_idx + batch_size
return _generator_fn
def export_features(self, destination='.'):
    """Dump this signature's feature names into ``features_sign.h5``.

    Args:
        destination (str): directory where the HDF5 file is written
            (default: current directory).
    """
    out_path = os.path.join(destination, "features_sign.h5")
    feats = np.array(self.features, DataSignature.string_dtype())
    with h5py.File(out_path, 'w') as hf_out:
        hf_out.create_dataset("features", data=feats)
......@@ -187,17 +187,21 @@ class Lsi(BaseTransform):
plain_corpus = os.path.join(tmp_dir, self.name + ".plain.txt")
tfidf_corpus = os.path.join(tmp_dir, self.name + ".tfidf.mm")
with open(plain_corpus, "w") as f:
# Read the provided sign1 by chunks of n signatures
for chunk in sign1.chunker():
vs = sign1[chunk].astype(np.int)
ks = sign1.keys[chunk]
vs = sign1[chunk].astype(np.int) # take a chunk of n signatures
ks = sign1.keys[chunk] # together with their keys
for i in range(0, len(ks)):
row = vs[i]
mask = np.where(row > 0)
val = ",".join([",".join([self.features[x]] * row[x])
for x in mask[0]])
row = vs[i] # signature i
mask = np.where(row > 0) # indices of where positive values are (the array is in fact mask[0])
# print("\n\n SHERLOCK size mask0",mask[0].shape)
# print("SHERLOCK size features",len(self.features))
# print("SHERLOCK size row",len(row))
val = ",".join([",".join([self.features[x]] * row[x]) for x in mask[0]])
f.write("%s %s\n" % (ks[i], val))
# load dictionary
print("\n SHERLOCK self.model_path",self.model_path)
dictionary = corpora.Dictionary.load(
os.path.join(self.model_path, self.name + ".dict.pkl"))
# corpus
......
# NS (08 Feb 2021): export the features from sign0, so that a custom dataset
# can be 'sanitized' (i.e. columns removed) identically to the cc-rep,
# for the A spaces.
from chemicalchecker.core.chemcheck import ChemicalChecker

repo = "/aloy/web_checker/package_cc/2020_01/"
outDir = "/aloy/scratch/nsoler/CC_related/EXPORT_SIGN/sign0"

cc = ChemicalChecker(repo)
for space in "ABCDE":
    for num in range(1, 6):
        ds = "{}{}.001".format(space, num)
        sign0tmp = cc.get_signature('sign0', 'full', ds)
        sign0tmp.export_features(outDir)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment