Commit 4a12bde6 authored by Martino Bertoni 🌋

bugfix: after fetching reference nearest neighbors we must map to the full redundant space, otherwise we miss obvious relationships in the explore page
parent e7c60a9f
Pipeline #2623 passed with stages in 74 minutes and 35 seconds
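
For context, the fix hinges on the signature's 'mappings' table. Assuming, as the diff suggests, that it pairs each inchikey of the redundant 'full' space (column 0) with its non-redundant 'reference' representative (column 1), neighbors fetched in reference space can be expanded back to every full-space molecule they stand for. A minimal sketch of that expansion with a toy mappings array (the inchikeys are made up):

import numpy as np

# toy mappings: column 0 = 'full' inchikey, column 1 = its 'reference' representative
mappings = np.array([
    ["FULL-A", "REF-1"],
    ["FULL-B", "REF-1"],  # FULL-B collapses onto the same reference molecule
    ["FULL-C", "REF-2"],
])

# nearest neighbors fetched in the non-redundant 'reference' space
ref_neighbors = np.array(["REF-1"])

# expand back to the redundant 'full' space, as the fixed code does with np.isin
full_mask = np.isin(mappings[:, 1], ref_neighbors)
print(mappings[full_mask, 0].tolist())  # ['FULL-A', 'FULL-B']: FULL-B is no longer missed
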
@@ -71,6 +71,7 @@ metric_prd = None
 map_coords_obs = collections.defaultdict(list)
 dataset_pairs = {}
 for ds in Dataset.get(exemplary=True):
+    print(ds)
     dataset_pairs[ds.coordinate] = ds.dataset_code
     if metric_obs is None:
         neig1 = cc.get_signature("neig1", "reference", ds.dataset_code)
@@ -84,17 +85,23 @@ for ds in Dataset.get(exemplary=True):
         map_coords_obs[ik] += [ds.coordinate]
 print('took', time.time() - t0)
-# get relevant background distances
+# get relevant background distances and mappings
 print('get relevant background distances')
 t0 = time.time()
 bg_vals = dict()
 bg_vals['obs'] = dict()
 bg_vals['prd'] = dict()
+signatures = dict()
+signatures['obs'] = dict()
+signatures['prd'] = dict()
 for coord in dataset_pairs.keys():
+    print(coord)
     sign1 = cc.get_signature("sign1", "reference", dataset_pairs[coord])
     bg_vals['obs'][coord] = sign1.background_distances(metric_obs)["distance"]
+    signatures['obs'][coord] = sign1
     sign3 = cc.get_signature("sign3", "reference", dataset_pairs[coord])
     bg_vals['prd'][coord] = sign3.background_distances(metric_prd)["distance"]
+    signatures['prd'][coord] = sign3
 print('took', time.time() - t0)
 # for both observed (sign1) and predicted (sign3) get significant neighbors
@@ -108,6 +115,7 @@ neig_cctype = {
     'prd': 'neig3',
 }
 for dataset in keys:
+    print(dataset)
     coord, type_data = dataset.split("_")
     dist_cutoffs = bg_vals[type_data][coord]
     neig = cc.get_signature(
@@ -117,13 +125,36 @@ for dataset in keys:
     _, nn_inks = neig.get_vectors(
         inchikeys, include_nan=True, dataset_name='indices')
     # mask to keep only neighbors below cutoff
-    mask = nn_dist <= dist_cutoffs[cutoff_idx]
+    masks = nn_dist <= dist_cutoffs[cutoff_idx]
     # get binned data according to distance cutoffs
     dist_bin = np.digitize(nn_dist, dist_cutoffs)
-    # get actual neighbors inchikeys and distance bins
-    inks = [v[m].tolist() for v, m in zip(nn_inks, mask)]
-    dbins = [v[m].tolist() for v, m in zip(dist_bin, mask)]
-    ds_inks_bin[dataset] = (inks, dbins)
+    # get close neighbors inchikeys and distance bins and apply mapping
+    mappings = signatures[type_data][coord].get_h5_dataset('mappings')
+    all_inks = list()
+    all_dbins = list()
+    # couldn't find a way to avoid iterating on molecules
+    for ref_nn_ink, ref_dbin, mask in zip(nn_inks, dist_bin, masks):
+        # apply distance cutoff
+        ref_nn_ink = ref_nn_ink[mask]
+        ref_dbin = ref_dbin[mask]
+        # iterate on bins to aggregate mappings
+        full_inks = list()
+        full_dbins = list()
+        for dbin in np.unique(ref_dbin):
+            # get inks in the bin
+            ink_dbin = ref_nn_ink[ref_dbin == dbin]
+            # get idxs based on the redundant 'reference' column
+            full_idxs = np.isin(mappings[:, 1], ink_dbin)
+            # get the corresponding 'full' inks
+            full_nn_ink = mappings[:, 0][full_idxs]
+            # append to molecule lists
+            full_inks.extend(full_nn_ink)
+            full_dbins.extend([dbin] * len(full_nn_ink))
+        all_inks.append(full_inks)
+        all_dbins.append(full_dbins)
+    # keep neighbors and bins for later
+    ds_inks_bin[dataset] = (all_inks, all_dbins)
 print('took', time.time() - t0)
 # read inchikey to pubmed names mapping
@@ -154,7 +185,8 @@ for index, inchikey in enumerate(inchikeys):
         if len(inks) == 0:
             empty_spaces.append(dataset)
             continue
-        all_neig.update(inks)
+        # iterate on each neighbor and expand to full set
+        all_neig.update(set(inks))
         dbins = ds_inks_bin[dataset][1][index]
         neig_ds[dataset] = dict(zip(inks, dbins))
     for ds in empty_spaces:
...
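
As the comment in the loop admits, the expansion iterates over molecules and bins, issuing one np.isin scan of the mappings table per distance bin. This commit does not change that, but if it ever became a bottleneck, one hypothetical alternative is to precompute a reference-to-full lookup once per dataset; build_ref_to_full and expand_to_full below are made-up names for illustration, not part of the existing code:

import collections

def build_ref_to_full(mappings):
    # one pass over the mappings table: reference inchikey -> list of 'full' inchikeys
    ref_to_full = collections.defaultdict(list)
    for full_ink, ref_ink in mappings:
        ref_to_full[ref_ink].append(full_ink)
    return ref_to_full

def expand_to_full(ref_nn_ink, ref_dbin, ref_to_full):
    # expand reference-space neighbors (and their distance bins) to the full space
    full_inks, full_dbins = [], []
    for ink, dbin in zip(ref_nn_ink, ref_dbin):
        mapped = ref_to_full.get(ink, [])
        full_inks.extend(mapped)
        full_dbins.extend([dbin] * len(mapped))
    return full_inks, full_dbins

The dictionary is built in a single pass over mappings, so each neighbor expansion becomes a constant-time lookup instead of a scan of the whole table.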