Commit 9b876ecc authored by Martino Bertoni's avatar Martino Bertoni 🌋
Browse files

accessing signature data via the getitem function (square brackets e.g....

Accessing signature data via the getitem function (square brackets, e.g. sign[:]) now leaves the file open (for performance reasons), so it is important to close the file explicitly with sign.close_hdf5(), or to use the sign.get_h5_dataset() method instead. If the file is not closed correctly, successive writes to it will give errors.
parent 16c97a8d
Pipeline #2687 failed with stages
in 77 minutes and 53 seconds
......@@ -586,8 +586,9 @@ class DataSignature(object):
idx = bisect_left(self.keys, key)
return idx
def open_hdf5(self):
self.hdf5 = h5py.File(self.data_path, 'r', rdcc_nbytes=100*1024**2)
def open_hdf5(self, mode='r'):
    """Open the signature's HDF5 data file and keep the handle on ``self.hdf5``.

    Any previously opened handle is closed first (via ``close_hdf5``), so
    repeated calls do not leak open file handles. The file is opened with a
    100 MiB raw-data chunk cache (``rdcc_nbytes``) to speed up repeated reads.

    :param mode: h5py file mode; default ``'r'`` (read-only). Presumably a
        writable mode such as ``'r+'`` is passed when updating the file —
        confirm against callers.
    """
    self.close_hdf5()
    self.hdf5 = h5py.File(self.data_path, mode, rdcc_nbytes=100*1024**2)
def close_hdf5(self):
if hasattr(self, 'hdf5'):
......@@ -746,7 +747,7 @@ class DataSignature(object):
np.random.seed(seed)
if n >= len(self.keys):
self.__log.debug("Full dataset sampled (n=%d)" % len(self.keys))
V = self[:]
V = self.get_h5_dataset('V')
keys = self.keys
else:
self.__log.debug("Subsampling dataset (n=%d)" % n)
......@@ -802,8 +803,7 @@ class DataSignature(object):
"""Map signature throught mappings."""
if "mappings" not in self.info_h5:
raise Exception("Data file has no mappings.")
with h5py.File(self.data_path, 'r') as hf:
mappings_raw = hf['mappings'][:]
mappings_raw = self.get_h5_dataset('mappings')
mappings = dict(mappings_raw)
# avoid trivial mappings (where key==value)
to_map = list(set(mappings.keys()) - set(mappings.values()))
......@@ -825,6 +825,7 @@ class DataSignature(object):
for dst_key in sorted(to_map):
dst_keys.append(dst_key)
dst_vectors.append(self[mappings[dst_key]])
self.close_hdf5()
# to numpy arrays
dst_keys = np.array(dst_keys)
matrix = np.vstack(dst_vectors)
......
......@@ -51,7 +51,7 @@ class LinkPrediction():
if limit_nodes:
all_nodes_set = all_nodes_set & limit_nodes
all_nodes = list(all_nodes_set)
matrix = self.sign2[:]
matrix = self.sign2.get_h5_dataset('V')
if len(all_nodes) < 100:
raise Exception(
"Insufficient nodes for validation: %s" % len(all_nodes))
......
......@@ -968,7 +968,7 @@ class Plot():
keys = np.random.choice(sign.keys, max_samples, replace=False)
matrix = sign.get_vectors(keys)[1]
else:
matrix = sign[:]
matrix = sign.get_h5_dataset('V')
df = pd.DataFrame(matrix).melt()
coord = self.dataset_code
......
......@@ -36,6 +36,7 @@ class BaseTransform(object):
self.categorical = self.is_categorical()
self.tmp = tmp
def reindex_triplets(self, sign1, keys):
fn = os.path.join(sign1.model_path, "triplets.h5")
if not os.path.exists(fn):
......@@ -139,9 +140,9 @@ class BaseTransform(object):
self.__log.debug("Considering all reference data")
keys = self.sign_ref.keys
if self.tmp:
V = self.sign_ref.get_h5_dataset("V_tmp")[:]
V = self.sign_ref.get_h5_dataset("V_tmp")
else:
V = self.sign_ref[:]
V = self.sign_ref.get_h5_dataset("V")
else:
self.__log.debug(
"Subsampling data (ensuring coverage of at least one feature)")
......@@ -191,10 +192,12 @@ class BaseTransform(object):
def is_categorical(self, n=1000):
self.__log.debug("Checking continuous or categorical")
V = self.sign_ref[:n]
self.sign_ref.close_hdf5()
is_cat = np.all(V == V.astype(np.int))
if not is_cat:
return False
V = self.sign[:n]
self.sign.close_hdf5()
is_cat = np.all(V == V.astype(np.int))
if is_cat:
return True
......
......@@ -179,7 +179,6 @@ class Lsi(BaseTransform):
self.__log.warning(
'Repeating LSI with: variance_explained: %.2f num_topics: %s',
self.variance_explained, str(self.num_topics))
self.predict(self.sign_ref)
self.predict(self.sign)
self.save()
......@@ -203,6 +202,7 @@ class Lsi(BaseTransform):
mask = np.argwhere(row > 0).ravel()
val = ",".join(self.features[mask])
f.write("%s %s\n" % (ks[i], val))
sign1.close_hdf5()
# load dictionary
dictionary = corpora.Dictionary.load(
os.path.join(self.model_path, self.name + ".dict.pkl"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment