Commit ad239590 authored by mlocatelli's avatar mlocatelli

Modifying dataset creation for large datasets

parent 00d69c50
Pipeline #2516 passed with stages
in 12 minutes and 12 seconds
......@@ -284,7 +284,7 @@ class sign0(BaseSignature, DataSignature):
def fit(self, cc_root=None, pairs=None, X=None, keys=None, features=None,
data_file=None, key_type="inchikey", agg_method="average",
do_triplets=False, max_features=50000, chunk_size=10000,
do_triplets=False, max_features=20000, chunk_size=10000,
sanitize=True, **params):
"""Process the input data.
......
......@@ -71,7 +71,9 @@ class sign1(BaseSignature, DataSignature):
"date",
data=np.array([datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S")], DataSignature.string_dtype()))
hf.create_dataset("V", data=s0[:])
hf.create_dataset("V", s0.shape, dtype=s0.data_type)
for i in range(0, s0.shape[0]):
hf["V"][i] = s0[i][:]
hf.create_dataset("keys", data=np.array(
s0.keys, DataSignature.string_dtype()))
if is_basesig:
......@@ -100,7 +102,10 @@ class sign1(BaseSignature, DataSignature):
if "V_tmp" in hf.keys():
self.__log.debug("Deleting V_tmp")
del hf["V_tmp"]
hf.create_dataset("V_tmp", data=hf["V"][:])
hf.create_dataset("V_tmp", hf["V"].shape, dtype=hf["V"].dtype)
for i in range(0, hf["V"].shape[0]):
hf["V_tmp"][i] = hf["V"][i][:]
def was_sparse(self, max_keys=1000, zero_prop=0.5):
"""Guess if the matrix was sparse"""
......
......@@ -203,8 +203,6 @@ class RNDuplicates():
hf.create_dataset("V", (len(self.final_ids), dh5["V"].shape[1]), dtype=self.data_type)
for count, i in enumerate(self.final_ids):
hf["V"][count] = dh5["V"][i]
# V = np.array(
# [dh5["V"][i] for i in self.final_ids], dtype=self.data_type)
else:
V = np.array(
self.data[np.array(self.final_ids)], dtype=self.data_type)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment