Commit 5b4813b3 authored by Martina Locatelli's avatar Martina Locatelli 🍁
Browse files

D1 space pipeline integration

parent 33c07f22
Pipeline #2486 failed with stages
in 46 minutes and 6 seconds
# Imports
import h5py
import numpy as np
import sys
import os
import collections
import pickle
# Functions
# Cell lines of the LINCS "core" panel; connectivity is restricted to signatures
# measured in these lines (set literal instead of set([...]) for idiom/speed).
core_cells = {'A375', 'A549', 'HA1E', 'HCC515',
              'HEPG2', 'MCF7', 'PC3', 'VCAP', 'HT29'}
only_touchstone = True  # compute connectivity only against Touchstone signatures
def es_score(idxs, ref_expr, min_idxs, p=1):
    """Weighted Kolmogorov-Smirnov-like enrichment score (GSEA-style).

    idxs: indices (into ref_expr) of the query genes found in the signature
    ref_expr: expression levels of all genes in the reference signature
    min_idxs: minimum number of matches required to compute a score
    p: weighting exponent applied to |expression| of the hits (default 1)

    Returns the signed maximum deviation of the running hit/miss statistic,
    or 0. when fewer than min_idxs genes match.
    """
    if len(idxs) < min_idxs:
        # not enough genes matching the up/down-regulated list (10 minimum)
        return 0.
    N = len(ref_expr)  # number of genes in the expression profile
    Nh = len(idxs)     # number of matches found
    # Every non-matching position contributes a constant "miss" penalty.
    # NOTE(review): assumes Nh < N, otherwise this divides by zero — confirm
    # callers never pass a query covering the whole profile.
    norm = 1. / (N - Nh)
    miss = np.full(N, norm)
    miss[idxs] = 0.  # matching positions contribute no miss penalty
    # Matching positions are weighted by |expression|**p, normalized to sum 1.
    hit_nums = np.zeros(N)
    hit_nums[idxs] = np.abs(ref_expr[idxs]) ** p
    P_hit = np.cumsum(hit_nums / np.sum(hit_nums))
    P_miss = np.cumsum(miss)
    ES = P_hit - P_miss  # running enrichment statistic
    # Return the deviation with the maximum absolute value, keeping its sign.
    return ES[np.argmax(np.abs(ES))]
def connectivity_score(up, dw, signature_file, signatures_dir, min_idxs):
    """Connectivity of an (up, dw) gene query against one stored signature.

    Loads the reference expression profile from the signature h5 file, scores
    the up- and down-regulated gene sets separately, and combines them: scores
    with the same sign cancel to 0, otherwise their half-difference is returned.
    """
    path = "%s/%s" % (signatures_dir, signature_file)
    with h5py.File(path, "r") as hf:
        expr = hf["expr"][:]  # e.g. [5.027, 4.738, 4.497, ...]
        gene = hf["gene"][:]  # e.g. ['AGR2', 'RBKS', ..., 'SRGN']
    matches_up = np.where(np.in1d(gene, up))[0]
    matches_dw = np.where(np.in1d(gene, dw))[0]
    es_up = es_score(matches_up, expr, min_idxs)
    es_dw = es_score(matches_dw, expr, min_idxs)
    if np.sign(es_up) != np.sign(es_dw):
        return (es_up - es_dw) / 2
    return 0
def signature_info(mini_sig_info_file):
    """Parse mini_sig_info_file.tsv into {sig_id: (pert_id, treatment, cell_line)}."""
    info = {}
    with open(mini_sig_info_file, "r") as handle:
        for raw in handle:
            fields = raw.rstrip("\n").split("\t")
            info[fields[0]] = (fields[1], fields[2], fields[3])
    return info
# Main
def main(SIG, up, dw, mini_sig_info_file, signatures_dir, connectivity_dir, touch, min_idxs, output_h5):
    """Compute connectivity of one query signature against all stored signatures.

    SIG: differential gene expression signature id (string)
    up: set of up-regulated genes in this signature (set of strings)
    dw: set of down-regulated genes in this signature (set of strings)
    mini_sig_info_file: path to mini_sig_info_file.tsv with per-signature info
    signatures_dir: directory containing the signature h5 files
    connectivity_dir: directory where signatures.tsv is written
    touch: set of perturbagen ids belonging to the Touchstone dataset
    min_idxs: minimum number of gene matches (10 in the update)
    output_h5: name of the output h5 connectivity file

    Writes "es" (connectivity) and "nes" (normalized connectivity) datasets,
    scaled by 1000 and stored as int16, ordered by sorted signature id.
    """
    # BUGFIX: tqdm was called below but never imported (NameError at runtime);
    # make the progress bar optional with an identity fallback.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable
    sig_info = signature_info(mini_sig_info_file)  # {sig_id: (pert_id, treatment, cell_line)}
    CTp = collections.defaultdict(list)  # positive scores per (treatment, cell_line)
    CTm = collections.defaultdict(list)  # negative scores per (treatment, cell_line)
    R = []  # (sig_id, connectivity score) for every compared signature
    # Match the up/down-regulated query genes against every stored signature.
    for f in tqdm(os.listdir(signatures_dir)):
        if ".h5" not in f:
            continue
        sig = f.split("/")[-1].split(".h5")[0]  # file name without extension
        sinfo = sig_info[sig]
        # BUGFIX: this condition used the unrelated global `sigs` (the task's
        # input dict); the flag that governs the Touchstone restriction is
        # `only_touchstone`.
        if only_touchstone:
            if sinfo[0] not in touch or sinfo[2] not in core_cells:
                continue  # not in Touchstone or not a core cell line
        cs = connectivity_score(up, dw, f, signatures_dir, min_idxs)
        R += [(sig, cs)]
        if cs > 0:
            CTp[(sinfo[1], sinfo[2])] += [cs]
        elif cs < 0:
            CTm[(sinfo[1], sinfo[2])] += [cs]
    # Median of positive / negative connectivity scores per (treatment, cell_line).
    CTp = dict((k, np.median(v)) for k, v in CTp.items())
    CTm = dict((k, np.median(v)) for k, v in CTm.items())
    # S: (sig_id, connectivity score, normalized connectivity score)
    S = []
    for sig, cs in R:
        sinfo = sig_info[sig]
        if cs > 0:
            # Normalize by the median positive score of this (treatment, cell_line).
            ncs = cs / CTp[(sinfo[1], sinfo[2])]
        elif cs < 0:
            ncs = cs / -CTm[(sinfo[1], sinfo[2])]
        else:
            ncs = 0.
        S += [(sig, cs, ncs)]
    S = sorted(S, key=lambda tup: tup[0])  # sort by sig_id
    # Write the signature ids once; every per-query h5 refers to this ordering.
    if not os.path.exists("%s/signatures.tsv" % connectivity_dir):
        with open("%s/signatures.tsv" % connectivity_dir, "w") as f:
            for s in S:
                f.write("%s\n" % s[0])
    # Store scores scaled by 1000 as int16 to keep the files small.
    with h5py.File(output_h5, "w") as hf:
        es = np.array([s[1] * 1000 for s in S]).astype(np.int16)
        nes = np.array([s[2] * 1000 for s in S]).astype(np.int16)
        hf.create_dataset("es", data=es)
        hf.create_dataset("nes", data=nes)
if __name__ == '__main__':
    task_id = sys.argv[1]
    filename = sys.argv[2]            # pickle: {task_id: {sig_id: {'file': path_to_h5}}}
    mini_sig_info_file = sys.argv[3]
    signatures_dir = sys.argv[4]
    connectivity_dir = sys.argv[5]
    pert_info = sys.argv[6]           # e.g. GSE92742_Broad_LINCS_pert_info.txt
    sig_info = sys.argv[7]            # e.g. GSE92742_Broad_LINCS_sig_info.txt
    min_idxs = int(sys.argv[8])       # was 10 in our case
    inputs = pickle.load(open(filename, 'rb'))
    sigs = inputs[task_id]  # {sig_id: {'file': path}} for this particular task id
    ### Obtain Touchstone perturbagens (trt_oe names changed, so remap below) ###
    touch = set()
    tou_oe = set()
    with open(pert_info, "r") as f:
        f.readline()  # skip header
        for l in f:
            l = l.rstrip("\n").split("\t")
            trt = l[2]
            # trt_oe Touchstone members are collected separately: their ids
            # must be remapped through sig_info below.
            if (trt == "trt_oe") and (l[3] == '1'):
                tou_oe.add(l[0])
            # keep only compound / shRNA-consensus treatment records
            if trt not in ["trt_cp", "trt_sh.cgs"]:
                continue
            if l[3] == '0':  # not a Touchstone member
                continue
            touch.add(l[0])  # perturbagen ids belonging to the Touchstone dataset
    ### Obtain the new ids of trt_oe Touchstone perturbagens ###
    with open(sig_info, "r") as f:
        f.readline()  # skip header
        for l in f:
            l = l.rstrip("\n").split("\t")
            if l[1] in tou_oe:
                touch.add(l[0].split(':')[1])
    for k, v in sigs.items():  # k = sig_id, v = {'file': path_to_signature.h5}
        output_h5 = os.path.join(connectivity_dir, k + '.h5')
        if os.path.exists(output_h5):
            print("{} already exists, skipping".format(output_h5))
            continue
        if "up" in v:  # up/down-regulated genes already selected (not our case)
            main(k, v["up"], v["down"], mini_sig_info_file, signatures_dir,
                 connectivity_dir, touch, min_idxs, output_h5)
            continue
        # Select up/down-regulated genes from the gene expression profile.
        with h5py.File(v["file"], "r") as hf:
            expr = hf["expr"][:]  # e.g. [5.027, 4.738, 4.497, ...]
            gene = hf["gene"][:]  # e.g. ['AGR2', 'RBKS', ..., 'SRGN']
        # (gene, expr) records sorted by decreasing expression level.
        # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented replacement.
        R = np.array(sorted(zip(gene, expr), key=lambda tup: -tup[1]),
                     dtype=np.dtype([('gene', '|S300'), ('expr', float)]))
        up = R[:250]   # the first 250 genes are considered up-regulated
        dw = R[-250:]  # the last 250 genes are considered down-regulated
        # Keep only genes whose expression exceeds 2 in absolute value.
        up = set(up['gene'][up['expr'] > 2])
        dw = set(dw['gene'][dw['expr'] < -2])
        # Decode the h5 byte strings into Python 3 strings.
        up = {s.decode() for s in up}
        dw = {s.decode() for s in dw}
        # Compare this signature's up/down gene lists against all others.
        main(k, up, dw, mini_sig_info_file, signatures_dir,
             connectivity_dir, touch, min_idxs, output_h5)
# Imports
import h5py
import numpy as np
import sys
import os
import collections
import pickle
from chemicalchecker.database import Molrepo
# Functions
def read_l1000(connectivitydir, mini_sig_info_file, cp_info_file=None):
    """Map inchikeys to the signature ids present in the connectivity dir.

    connectivitydir: directory with one <sig_id>.h5 connectivity file each
    mini_sig_info_file: tsv of (sig_id, pert_id, pert_type, cell_id, is_touchstone)
    cp_info_file: path to the LINCS compound-info mapping file; defaults to the
        module-level LINCS_2020_cp_info for backward compatibility (the
        original read that global directly, which broke reuse/testing).

    Returns ({inchikey: [sig_id, ...]}, {sig_id: pert_id}).
    """
    ######## use when new mapping on molrepo ########
    # molrepos = Molrepo.get_by_molrepo_name("lincs")
    # pertid_inchikey = {}
    # inchikey_inchi = {}
    # for molrepo in molrepos:
    #     if not molrepo.inchikey:
    #         continue
    #     pertid_inchikey[molrepo.src_id] = molrepo.inchikey
    #     inchikey_inchi[molrepo.inchikey] = molrepo.inchi
    ######## In the meantime, map via the file passed as argument ########
    if cp_info_file is None:
        cp_info_file = LINCS_2020_cp_info
    pertid_inchikey = {}
    inchikey_inchi = {}
    with open(cp_info_file, "r") as f:
        f.readline()  # skip header
        for l in f:
            l = l.rstrip("\n").split("\t")
            if l[-1] == '':  # perturbagen without an inchikey
                continue
            pertid_inchikey[l[1]] = l[-1]  # pert_id -> inchikey
            inchikey_inchi[l[-1]] = l[5]   # inchikey -> SMILES
    # Read signature data.
    touchstones = set()
    siginfo = {}
    with open(mini_sig_info_file, "r") as f:
        for l in f:
            l = l.rstrip("\n").split("\t")
            if int(l[4]) == 1:  # pert_id belongs to the Touchstone dataset
                touchstones.update([l[1]])
            siginfo[l[0]] = l[1]  # sig_id -> pert_id
    # List all individual h5 connectivity matrices and group them by inchikey.
    inchikey_sigid = collections.defaultdict(list)
    for r in os.listdir(connectivitydir):
        if ".h5" not in r:
            continue
        sig_id = r.split(".h5")[0]   # sig_id = h5 file name without extension
        pert_id = siginfo[sig_id]
        if pert_id in pertid_inchikey:  # only perturbagens with an inchikey
            inchikey_sigid[pertid_inchikey[pert_id]] += [sig_id]
    return inchikey_sigid, siginfo
def read_l1000_predict(connectivitydir, mini_sig_info_file):
    """Map pert_ids to signature ids for prediction-mode connectivity files.

    Prediction-mode files are named "<sig>---<pert_id>.h5".
    Returns ({pert_id: [sig, ...]}, {sig_id: pert_id}).
    """
    # Read signature data.
    touchstones = set()
    siginfo = {}
    with open(mini_sig_info_file, "r") as handle:
        for raw in handle:
            fields = raw.rstrip("\n").split("\t")
            if int(fields[4]) == 1:
                touchstones.update([fields[1]])
            siginfo[fields[0]] = fields[1]
    # Group the h5 connectivity files by perturbagen id.
    pertid_sigid = collections.defaultdict(list)
    for entry in os.listdir(connectivitydir):
        if ".h5" not in entry:
            continue
        stem = entry.split(".h5")[0]
        sig, pert_id = stem.split("---")
        pertid_sigid[pert_id] += [sig]
    return pertid_sigid, siginfo
def get_summary(v):
    """Collapse a list of scores into one representative value.

    Computes the 66th and 33rd percentiles and returns whichever has the
    larger absolute value, keeping its sign (for an all-negative list the
    33rd percentile is effectively its 66th in magnitude).
    """
    hi = np.percentile(v, 66)
    lo = np.percentile(v, 33)
    return hi if np.abs(hi) > np.abs(lo) else lo
# Main
if __name__ == '__main__':
    task_id = sys.argv[1]             # key into the pickled inputs dict
    filename = sys.argv[2]            # pickle: {task_id: array of inchikeys}
    mini_sig_info_file = sys.argv[3]  # tsv: sig_id, pert_id, pert_type, cell_id, is_touchstone
    LINCS_2020_cp_info = sys.argv[4]  # LINCS compound info (read by read_l1000 as a global)
    connectivitydir = sys.argv[5]     # directory of per-signature connectivity h5 files
    agg_matrices = sys.argv[6]        # output directory for aggregated matrices
    method = sys.argv[7]              # "fit" or "predict"
    inputs = pickle.load(open(filename, 'rb'))
    iks = inputs[task_id]  # chunk of inchikeys (U27 strings) assigned to this task
    # Example input (task id 998):
    # '998': array(['RWDQACDHOIJVRL-OXXUSNJHSA-N', 'WZQJIBXYFKEECF-WCZJQEMASA-N',
    #               ...], dtype='<U27')
    if method == "fit":
        # {inchikey: [sig_id]}, {sig_id: pert_id}
        inchikey_sigid, siginfo = read_l1000(connectivitydir, mini_sig_info_file)
    else:
        inchikey_sigid, siginfo = read_l1000_predict(connectivitydir, mini_sig_info_file)
    # signatures.tsv lists the Touchstone signature ids in the column order
    # used by every per-signature "nes" vector.
    with open("%s/signatures.tsv" % connectivitydir, "r") as f:
        signatures = [l.rstrip("\n") for l in f]
    cols = sorted(set(siginfo[s] for s in signatures))  # sorted unique pert_ids
    cols_d = dict((cols[i], i) for i in range(len(cols)))  # {pert_id: column index}
    for ik in iks:  # each perturbagen (inchikey) in this task's chunk
        v = inchikey_sigid[ik]  # signature ids belonging to this perturbagen
        neses = collections.defaultdict(list)
        for sigid in v:
            # NOTE(review): rebinds the `filename` argv variable; harmless here
            # because the pickle was already loaded above.
            filename = sigid
            if method == "predict":
                filename = sigid + "---" + ik
            # Load the normalized connectivity scores for this signature.
            with h5py.File("%s/%s.h5" % (connectivitydir, filename), "r") as hf:
                nes = hf["nes"][:]
            # Group scores by the Touchstone perturbagen each column refers to
            # (several signatures can refer to the same perturbagen).
            for i in range(len(signatures)):
                neses[(sigid, siginfo[signatures[i]])] += [nes[i]]
        # Collapse each (sigid, pert_id) group into a single summary value
        # (66th/33rd percentile, whichever is larger in absolute value).
        neses = dict((x, get_summary(y)) for x, y in neses.items())
        rows = sorted(set([k[0] for k in neses.keys()]))  # sorted sig_ids of this ik
        rows_d = dict((rows[i], i) for i in range(len(rows)))  # {sig_id: row index}
        # Matrix: this perturbagen's signatures x all Touchstone perturbagens.
        # NOTE(review): the float summary is truncated by the int16 assignment
        # below — presumably intentional since "nes" is already scaled x1000.
        X = np.zeros((len(rows), len(cols))).astype(np.int16)
        for x, y in neses.items():
            i = rows_d[x[0]]
            j = cols_d[x[1]]
            X[i, j] = y
        if len(rows) == 0:  # no signatures for this perturbagen
            continue
        # Write the aggregated matrix for this inchikey.
        with h5py.File("%s/%s.h5" % (agg_matrices, ik), "w") as hf:
            hf.create_dataset("X", data=X)
            hf.create_dataset("rows", data=np.array(rows, dtype=h5py.special_dtype(vlen=str)))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment