Commit d7f06c7c authored by mlocatelli 🍁
Browse files

Fixes to D1 preprocess; small adaptation of create_database.py script

parent 3e6029f5
Pipeline #2510 passed with stages
in 62 minutes and 26 seconds
......@@ -200,16 +200,21 @@ class RNDuplicates():
hf.create_dataset("keys", data=np.array(keys, DataSignature.string_dtype()))
if self.data is None:
dh5 = h5py.File(self.data_file, 'r')
V = np.array(
[dh5["V"][i] for i in self.final_ids], dtype=self.data_type)
hf.create_dataset("V", (len(self.final_ids), dh5["V"].shape[1]), dtype=self.data_type)
for count, i in enumerate(self.final_ids):
hf["V"][count] = dh5["V"][i]
# V = np.array(
# [dh5["V"][i] for i in self.final_ids], dtype=self.data_type)
else:
V = np.array(
self.data[np.array(self.final_ids)], dtype=self.data_type)
hf.create_dataset("V", data=V)
hf.create_dataset("shape", data=V.shape)
hf.create_dataset("V", data=V)
hf.create_dataset("shape", hf["V"].shape)
hf.create_dataset("mappings",
data=np.array(list_maps,
DataSignature.string_dtype()))
self.__log.info("Writing mappings to " + dirpath)
with open(os.path.join(dirpath, "mappings"), 'wb') as fh:
pickle.dump(self.mappings, fh)
......@@ -348,8 +348,11 @@ def main(args):
inchikey_raw.append((k[0], k[1] + "(" + str(v) + ")"))
with h5py.File(args.output_file, "w") as hf:
# getting strings instead of bytes from the h5 file
hf.create_dataset("pairs", data=np.array(inchikey_raw, DataSignature.string_dtype()))
# getting strings instead of bytes from the h5 file
hf.create_dataset("pairs", data=DataSignature.h5_str(inchikey_raw))
# they keep being bytes...
# with h5py.File(args.output_file, "r") as hf:
# main._log.info(hf['pairs'][:10])
if __name__ == '__main__':
......
......@@ -90,7 +90,7 @@ def main(SIG, up, dw, mini_sig_info_file, signatures_dir, connectivity_dir, touc
CTm = collections.defaultdict(list) # it creates it and puts an empty list as the default value
R = []
for f in tqdm(os.listdir(signatures_dir)): # Going through all h5 files of gene expression data to match our list of up/down-regulated genes
for f in os.listdir(signatures_dir): # Going through all h5 files of gene expression data to match our list of up/down-regulated genes
if ".h5" not in f:
continue
#print("match against-->", f)
......
......@@ -16,7 +16,7 @@ from chemicalchecker.core.preprocess import Preprocess
from chemicalchecker.core.signature_data import DataSignature
# Variables
TEST = True # once the 'signatures' dir exists with h5 inside, you can copy a few of them to 'signatures_test' and check if it's working
TEST = False # once the 'signatures' dir exists with h5 inside, you can copy a few of them to 'signatures_test' and check if it's working
CHUNK_SIZE=10 # number of tasks per single job sent to sge
dataset_code = os.path.dirname(os.path.abspath(__file__))[-6:] #NS D1.001
features_file = "features.h5"
......@@ -145,7 +145,7 @@ def parse_level(mini_sig_info_file, map_files, signaturesdir):
cids = []
mini_sig_info_file = 'mini_sig_info_file.tsv'
# mini_sig_info_file = 'mini_sig_info_file.tsv' --> we already have this variable as argument of the method
with open(mini_sig_info_file, "w") as f:
for k, v in sigs.items(): #(pert_id, cell_id) --> (sig_id, phase) fo the tuple with biggest tas
x = sig_info[v] # (pert_id, pert_type, cell_id, istouchstone, 1)
......@@ -160,12 +160,13 @@ def parse_level(mini_sig_info_file, map_files, signaturesdir):
gtcx_sh = os.path.join(map_files["level5_beta_trt_sh_n238351x12328"], "level5_beta_trt_sh_n238351x12328.gctx")
gtcx_oe = os.path.join(map_files["level5_beta_trt_oe_n34171x12328"], "level5_beta_trt_oe_n34171x12328.gctx")
genes_i = [genes[r[0]] for r in parse.parse(gtcx_cp, cid=[[x[0] for x in cids if x[1] == 'trt_cp'][0]]).data_df.iterrows()]
genes_ii = [genes[r[0]] for r in parse.parse(gtcx_sh, cid=[[x[0] for x in cids if x[1] == 'trt_sh.cgs'][0]]).data_df.iterrows()] # Just to make sure.
genes_iii = [genes[r[0]] for r in parse.parse(gtcx_oe, cid=[[x[0] for x in cids if x[1] == 'trt_oe'][0]]).data_df.iterrows()] # Just to make sure.
for cid in tqdm(cids): # for each sign id
main._log.info("Parsing GCTX files for each sign id")
for cid in cids: # for each sign id
main._log.info("Sign id: {}".format(cid[0]))
if cid[1] == 'trt_cp':
expr = np.array(parse.parse(gtcx_cp, cid=[cid[0]]).data_df).ravel()
genes = genes_i
......@@ -181,7 +182,7 @@ def parse_level(mini_sig_info_file, map_files, signaturesdir):
R = zip(genes, expr)
R = sorted(R, key=lambda tup: -tup[1])
with h5py.File(signaturesdir + "%s.h5" % cid[0], "w") as hf:
with h5py.File(os.path.join(signaturesdir, "%s.h5" % cid[0]), "w") as hf:
hf.create_dataset("expr", data=[float(r[1]) for r in R])
hf.create_dataset("gene", data=DataSignature.h5_str([r[0] for r in R]))
......@@ -481,9 +482,9 @@ def main(args):
WD = os.path.dirname(os.path.realpath(__file__)) # directory from which run.py is launched
connectivity_script = WD + "/connectivity_2020.py" # scripts called by run.py in the same directory
connectivity_script = WD + "/connectivity.py" # scripts called by run.py in the same directory
ikmatrices_script = WD + "/do_agg_matrices_2020.py"
ikmatrices_script = WD + "/do_agg_matrices.py"
readyfile = "conn.ready"
......
......@@ -11,93 +11,97 @@ MolrepoDatasourceFile="molrepo_has_datasource.csv"
current_dir = os.path.dirname(os.path.abspath(__file__))
def _run_psql(sql, tuples_only=False):
    """Run a single SQL command through ``psql`` with the configured login.

    The command is passed as an argument list and the password through the
    child's ``PGPASSWORD`` environment variable, so nothing is interpolated
    into a shell string.

    Args:
        sql: SQL text handed to ``psql -c``.
        tuples_only: when True, add ``-t`` and capture stdout so the caller
            can inspect the returned rows; otherwise psql writes straight to
            the terminal.

    Returns:
        The ``subprocess.CompletedProcess`` of the psql call (``stdout`` is
        text when ``tuples_only`` is True, ``None`` otherwise).
    """
    import subprocess  # kept local so the helper brings its own dependency

    db = Config().DB
    child_env = dict(os.environ, PGPASSWORD=str(db.password))
    command = ["psql", "-h", str(db.host), "-U", str(db.user)]
    command += ["-tc" if tuples_only else "-c", sql]
    return subprocess.run(
        command,
        env=child_env,
        stdout=subprocess.PIPE if tuples_only else None,
        universal_newlines=True)


def _create_database():
    """Create the configured database, refusing to touch an existing one.

    Raises:
        Exception: if the database already exists, if the existence check
            itself fails, or if ``CREATE DATABASE`` fails.
    """
    database = Config().DB.database
    check = _run_psql(
        "SELECT 1 FROM pg_database WHERE datname = '{}';".format(database),
        tuples_only=True)
    if check.returncode != 0:
        # A failed lookup must not be mistaken for "database is absent".
        raise Exception(
            "Could not check whether database '{}' exists".format(database))
    if "1" in check.stdout:
        raise Exception("Database '{}' already exists".format(database))
    created = _run_psql("CREATE DATABASE {}".format(database))
    if created.returncode != 0:
        # Previously the result was ignored and success was reported anyway.
        raise Exception("Creation of database '{}' failed".format(database))
    print("DB CREATED---->{}".format(database))


def _recreate_table(model, table, csv_file=None, requires=()):
    """Drop ``table`` if present, create it again and optionally populate it.

    Args:
        model: table class exposing ``_table_exists``, ``_drop_table``,
            ``_create_table`` and (when ``csv_file`` is given) ``from_csv``.
        table: table name used in the progress messages.
        csv_file: file name, relative to ``current_dir``, loaded with
            ``model.from_csv``; ``None`` leaves the table empty.
        requires: ``(name, model)`` pairs of tables that must already exist
            before this one is created.

    Raises:
        Exception: if any table listed in ``requires`` does not exist.
    """
    database = Config().DB.database
    if model._table_exists():
        model._drop_table()
        print("'{}' table already exists in database '{}' \n Dropped it".format(
            table, database))
    if requires:
        # Query every dependency before deciding, so the error reports all.
        status = [(name, dep._table_exists()) for name, dep in requires]
        if not all(exists for _, exists in status):
            raise Exception(
                "It is not possible to create '{}' because either {} table "
                "doesn't exist: {}".format(
                    table,
                    " or ".join("'{}'".format(name) for name, _ in status),
                    " - ".join("{} {}".format(name, exists)
                               for name, exists in status)))
    print("Creating table '{}' in database '{}'".format(table, database))
    model._create_table()
    if csv_file is not None:
        print("Populating '{}' table with data".format(table))
        model.from_csv(os.path.join(current_dir, csv_file))
        print("TABLE CREATED---->{}".format(table))


def create_db_dataset():
    """Create the configured database and all of its base tables.

    The database must not exist yet. Tables are created in dependency order:
    the plain tables first, then the association tables that need them.

    Raises:
        Exception: if the database already exists or cannot be created, or
            if an association table is requested before the tables it
            depends on exist.
    """
    _create_database()

    _recreate_table(Datasource, "datasource", DatasourceFile)
    # 'dataset' used to be reported as dropped without being dropped.
    _recreate_table(Dataset, "dataset", DatasetFile)
    _recreate_table(
        DatasetHasDatasource, "dataset_has_datasource", DatasetDatasourceFile,
        requires=[("dataset", Dataset), ("datasource", Datasource)])

    _recreate_table(Molrepo, "molrepo", MolrepoFile)
    _recreate_table(
        MolrepoHasDatasource, "molrepo_has_datasource", MolrepoDatasourceFile,
        requires=[("molrepo", Molrepo), ("datasource", Datasource)])

    # molecule and molrepo_has_molecule tables are created and left empty
    # until pipeline step 'molrepo'
    _recreate_table(Molecule, "molecule")
    _recreate_table(
        MolrepoHasMolecule, "molrepo_has_molecule",
        requires=[("molrepo", Molrepo), ("molecule", Molecule)])


if __name__ == '__main__':
    create_db_dataset()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment