Commit 1319e67d authored by Oriol Guitart's avatar Oriol Guitart
Browse files

First commit to create checker pipeline

parent 64b6104f
.project
.pydevproject
#!/usr/bin/python
#
# Downloads every external resource of the checker pipeline:
#   1. the files listed in checkerconfig.downloads (fetched with wget and,
#      when the output name looks like an archive, unpacked in place),
#   2. one KEGG mol file per drug listed in the KEGG ATC file,
#   3. the ChEMBL PostgreSQL dump, loaded through a submitted cluster job.
# The script exits with status 1 as soon as any step fails.
#
import os,sys,string,commands
sys.path.append(os.path.join(sys.path[0],"../../src/utils"))
from checkerUtils import logSystem, execAndCheck
sys.path.append(os.path.join(sys.path[0],"../"))
import checkerconfig

checkercfg = checkerconfig.checkerConf( )

print("Entering download files")

logsFiledir = checkercfg.getDirectory( "logs" )
log = logSystem(sys.stdout)
downloadsdir = checkercfg.getDirectory( "downloads" )

# Make sure both working directories exist before anything is written.
for directory in (logsFiledir, downloadsdir):
    if not os.path.exists(directory):
        os.makedirs(directory)

os.chdir(downloadsdir)

# (file-name suffix, unpack command, step name used in the error message).
# Order matters: "tar.gz" has to be tried before the plain ".gz" rule.
EXTRACTORS = [
    (".zip", "unzip -o ", "zip"),
    (".tgz", "tar -xzf ", "tgz"),
    ("tar.gz", "tar -xzf ", "tar.gz"),
    (".gz", "gunzip -kf ", "gunzip"),
]

# Historical note kept from the original script:
# Changed servers to mirror ftp.ebi.ac.uk (Roger)
for entry in checkerconfig.downloads:
    # Each entry is indexed as: 0 = URL, 1 = user, 2 = password, 3 = output name.
    url, user, password, target = entry[0], entry[1], entry[2], entry[3]

    # A named output that is already on disk is not downloaded again.
    if target != '' and os.path.exists(os.path.join(downloadsdir, target)):
        continue

    download = 'wget '
    if user != '':
        download += ( '--user ' + user + ' --password ' + password)
    if target != '':
        download += ( ' -O ' + target)

    # NOTE(review): this writes the password to the log when credentials are set.
    log.info( " %s %s" % (download,url))
    out = commands.getstatusoutput(download + ' "' + url + '"')
    if out[0] != 0:
        log.error( "Step get " + target + " in downloadFiles.py failed with message: " + out[1])
        sys.exit(1)

    # The original code also tested the *user name* for archive suffixes, which
    # can never be meaningful; only the local output name decides the unpacking.
    # NOTE(review): entries without an output name are therefore never unpacked;
    # presumably the URL was meant to be tested as well -- confirm.
    if target == '':
        continue
    for suffix, command, step in EXTRACTORS:
        if target.endswith(suffix):
            out = commands.getstatusoutput(command + target)
            if out[0] != 0:
                log.error( "Step " + step + " in " + target + " failed with message: " + out[1])
                sys.exit(1)
            break

# Collect the drug identifiers: second whitespace-separated field of every
# line of the KEGG ATC file that starts with "F".
drugs = set()
with open(os.path.join(downloadsdir,checkerconfig.kegg_atcs_download), "r") as f:
    for l in f:
        if l.startswith("F"):
            drugs.add(l.split()[1])
drugs = sorted(drugs)

kegg_mol_dir = os.path.join(downloadsdir,checkerconfig.kegg_mol_folder_download)
if not os.path.exists(kegg_mol_dir):
    os.makedirs(kegg_mol_dir)

# One mol file per drug, fetched from the KEGG REST service.
for drug in drugs:
    out = commands.getstatusoutput("wget -O " + os.path.join(kegg_mol_dir,drug + ".mol") + " http://rest.kegg.jp/get/" + drug + "/mol" )
    if out[0] != 0:
        log.error( "Step wget for mol " + drug + " failed with message: " + out[1])
        sys.exit(1)

# Load the ChEMBL dump into PostgreSQL through a submitted job.
logFilename = os.path.join(logsFiledir,"loadChemblinDB.log")

# NOTE(review): the commands are separated by newlines inside a single shell
# command line; confirm that setupSingleJob.py receives them as intended.
job2run = "dropdb --if-exists -h aloy-dbsrv chembl\n"
job2run += "createdb -h aloy-dbsrv chembl\n"
job2run += "psql -h aloy-dbsrv -d chembl -f " + downloadsdir + "/chembl_*/chembl_*_postgresql/*.dmp"

# And we start it
cmdStr = os.path.join(sys.path[0],"../../src/utils/")+ "setupSingleJob.py -x -N db-chembl " + job2run

# Then I move to the directory where I want the output generated
wrapperCmd = "cd "+downloadsdir+"; "+cmdStr + "; " +checkerconfig.SUBMITJOB + " job-db-chembl.sh > " + logFilename

# MASTERNODE lives in checkerconfig; the bare name used before raised NameError.
if checkerconfig.MASTERNODE != "":
    wrapperCmd = "ssh "+checkerconfig.MASTERNODE+" '%s'" % wrapperCmd

ret = execAndCheck(wrapperCmd,log)

# Then we check the log file: the job succeeded only if it reports exit code 0.
taskOK = False
with open( logFilename ) as logFile:
    for line in logFile:
        if "exited with exit code 0" in line:
            taskOK = True
            break

if not taskOK:
    log.error( "Step load ChEMBL failed: no successful exit code found in " + logFilename)
    sys.exit(1)
\ No newline at end of file
# Do the essential molrepos (DrugBank, ChEMBL, etc.)
# Imports
import sys, os
sys.path.append(os.path.join(sys.path[0],"../../src/utils"))
import htstandardizer as hts
import Psql
import csv
import pybel
import urllib2
import checkerconfig
downloadsdir = ""
moldir = ""
# Filenames are the same as the function names (e.g. drugbank() -> drugbank.tsv)
# Molrepo functions (ordered alphabetically)
def bindingdb():
    """Write bindingdb.tsv from the BindingDB download.

    Reads the tab-separated file named by checkerconfig.bindingdb_download
    (inside downloadsdir), locates the "Ligand InChI Key" and "Ligand SMILES"
    columns from the header row and writes one row per distinct ligand key to
    <moldir>/bindingdb.tsv as: id, smiles, inchikey, inchi.

    Rows with an empty SMILES are skipped. The result of hts.apply is indexed
    as (inchikey, inchi); a falsy result leaves both columns empty.
    """
    src = os.path.join(downloadsdir,checkerconfig.bindingdb_download)
    dst = os.path.join(moldir,"bindingdb.tsv")
    # Context managers guarantee both files are flushed and closed, which the
    # previous version never did.
    with open(src, "r") as f, open(dst, "w") as g:
        header = next(f).rstrip("\n").split("\t")
        bdlig_idx = header.index("Ligand InChI Key")
        smiles_idx = header.index("Ligand SMILES")
        dones = set()
        for l in f:
            l = l.rstrip("\n").split("\t")
            Id = l[bdlig_idx]
            smi = l[smiles_idx]
            # Only the first row seen for each ligand key is kept.
            if Id in dones or not smi:
                continue
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            g.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
            dones.add(Id)
def chebi():
    """Write chebi.tsv from the ChEBI SDF download.

    Every readable molecule of the SDF file named by
    checkerconfig.chebi_lite_download becomes one row of <moldir>/chebi.tsv:
    'ChEBI ID' property, RDKit SMILES, inchikey, inchi. The last two columns
    are empty when hts.apply returns a falsy value.
    """
    import rdkit.Chem as Chem
    sdf_path = os.path.join(downloadsdir,checkerconfig.chebi_lite_download)
    print(sdf_path)
    with open(os.path.join(moldir,"chebi.tsv"), "w") as out:
        for entry in Chem.SDMolSupplier(sdf_path):
            # Falsy entries from the supplier are skipped.
            if not entry:
                continue
            Id = entry.GetPropsAsDict()['ChEBI ID']
            smi = Chem.MolToSmiles(entry)
            standardized = hts.apply(smi)
            if standardized:
                inchikey, inchi = standardized[0], standardized[1]
            else:
                inchikey, inchi = "", ""
            out.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def chembl():
    """Write chembl.tsv from the ChEMBL database.

    Selects (chembl_id, canonical_smiles) for every molecule that has a
    non-null canonical SMILES and writes one row per result to
    <moldir>/chembl.tsv as: id, smiles, inchikey, inchi. The last two columns
    are empty when hts.apply returns a falsy value.
    """
    query = "SELECT md.chembl_id, cs.canonical_smiles FROM molecule_dictionary md, compound_structures cs WHERE md.molregno = cs.molregno AND cs.canonical_smiles IS NOT NULL"
    with open(moldir+"/chembl.tsv", "w") as f:
        con = Psql.connect(checkerconfig.chembl)
        try:
            con.set_isolation_level(0)
            cur = con.cursor()
            cur.execute(query)
            for r in cur:
                Id = r[0]
                smi = r[1]
                mol = hts.apply(smi)
                if mol:
                    inchikey, inchi = mol[0], mol[1]
                else:
                    inchikey, inchi = "", ""
                f.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
        finally:
            # The connection used to be left open.
            # assumes Psql.connect returns a DB-API style connection -- TODO confirm
            con.close()
def ctd():
    """Write ctd.tsv from the CTD molecules download.

    The input (checkerconfig.ctd_molecules_download inside downloadsdir) is
    read as tab-separated rows of at least two fields: id and SMILES. Each row
    is written to <moldir>/ctd.tsv as: id, smiles, inchikey, inchi, with the
    last two columns empty when hts.apply returns a falsy value.
    """
    src = os.path.join(downloadsdir,checkerconfig.ctd_molecules_download)
    # 'with' closes both files even when a row makes the loop raise.
    with open(src, "r") as f, open(moldir + "/ctd.tsv", "w") as g:
        for l in csv.reader(f, delimiter = "\t"):
            # Rows with fewer than two fields are skipped.
            if len(l) < 2:
                continue
            Id = l[0]
            smi = l[1]
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            g.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def drugbank():
    """Write drugbank.tsv from the DrugBank XML download.

    For every child of the XML root, takes the text of the drugbank-id element
    whose "primary" attribute is "true" and the value of the calculated
    property whose kind is "SMILES". Drugs lacking either are skipped. Rows of
    <moldir>/drugbank.tsv are: id, smiles, inchikey, inchi, the last two being
    empty when hts.apply returns a falsy value.
    """
    import xml.etree.ElementTree as ET
    ns = "{http://www.drugbank.ca}"
    root = ET.parse(os.path.join(downloadsdir,checkerconfig.drugbank_download)).getroot()

    with open(moldir+"/drugbank.tsv", "w") as out:
        for drug in root:
            # Drugbank ID: when several ids are flagged primary, the last wins.
            Id = None
            for id_node in drug.findall(ns + "drugbank-id"):
                if id_node.attrib.get("primary") == "true":
                    Id = id_node.text
            if not Id:
                continue

            # SMILES: likewise the last matching calculated property wins.
            smi = None
            for props in drug.findall(ns + "calculated-properties"):
                for prop in props:
                    if prop.find(ns + "kind").text == "SMILES":
                        smi = prop.find(ns + "value").text
            if not smi:
                continue

            standardized = hts.apply(smi)
            if standardized:
                inchikey, inchi = standardized[0], standardized[1]
            else:
                inchikey, inchi = "", ""
            out.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def kegg():
    """Write kegg.tsv from the downloaded KEGG mol files.

    Every file of the checkerconfig.kegg_mol_folder_download directory whose
    name contains ".mol" is read with pybel; the file name up to the first dot
    is the id. Rows of <moldir>/kegg.tsv are: id, smiles, inchikey, inchi, the
    last two being empty when hts.apply returns a falsy value.
    """
    mol_dir = os.path.join(downloadsdir,checkerconfig.kegg_mol_folder_download)
    with open(moldir + "/kegg.tsv", "w") as f:
        # Sorted so that the output does not depend on directory listing order.
        for l in sorted(os.listdir(mol_dir)):
            # Filter before parsing: unrelated files used to be fed to the
            # mol reader before this check was reached.
            if ".mol" not in l:
                continue
            # Reset per file: a file yielding no molecule used to inherit the
            # SMILES of the previous file (or hit a NameError on the first one).
            smi = None
            for m in pybel.readfile("mol", os.path.join(mol_dir, l)):
                # If a file holds several molecules, the last one is kept.
                smi = m.write("smi").rstrip("\n")
            if not smi:
                continue
            Id = l.split(".")[0]
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            f.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def lincs():
    """Write lincs.tsv from the two LINCS perturbagen info downloads.

    Collects (column 0, SMILES) pairs from both tab-separated files, where the
    SMILES sits in column 1 of the GSE92742 file and column 6 of the GSE70138
    file. Header rows are skipped, as are SMILES that are empty or "-666".
    The distinct pairs are written in sorted order to <moldir>/lincs.tsv as:
    id, smiles, inchikey, inchi (last two empty when hts.apply is falsy).
    """
    pairs = set()

    def collect(filename, smiles_col):
        # Add every usable (id, smiles) pair of one info file to `pairs`.
        with open(os.path.join(downloadsdir,filename), "r") as handle:
            next(handle)
            for row in csv.reader(handle, delimiter = "\t"):
                smiles = row[smiles_col]
                if not smiles or smiles == "-666":
                    continue
                pairs.add((row[0], smiles))

    collect(checkerconfig.lincs_GSE92742_pert_info_download, 1)
    collect(checkerconfig.lincs_GSE70138_pert_info_download, 6)

    with open(moldir+"/lincs.tsv", "w") as out:
        for Id, smi in sorted(pairs):
            standardized = hts.apply(smi)
            if standardized:
                inchikey, inchi = standardized[0], standardized[1]
            else:
                inchikey, inchi = "", ""
            out.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def mosaic():
    """Write mosaic.tsv from the MOSAIC SDF download.

    Each molecule read by pybel from checkerconfig.mosaic_all_collections_download
    is written out in "can" format, whose output is split on the tab into
    SMILES and id. Rows of <moldir>/mosaic.tsv are: id, smiles, inchikey,
    inchi, the last two being empty when hts.apply returns a falsy value.
    """
    with open(moldir+"/mosaic.tsv", "w") as out:
        sdf_path = os.path.join(downloadsdir,checkerconfig.mosaic_all_collections_download)
        for record in pybel.readfile("sdf", sdf_path):
            if not record:
                continue
            can_line = record.write("can").rstrip("\n")
            smi, Id = can_line.split("\t")
            standardized = hts.apply(smi)
            if standardized:
                inchikey, inchi = standardized[0], standardized[1]
            else:
                inchikey, inchi = "", ""
            out.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def morphlincs():
    """Write morphlincs.tsv from the MorphLINCS molecules download.

    The input (checkerconfig.morphlincs_molecules_download) is tab-separated
    with a header row; column 0 is used as id and column 7 as SMILES. Rows
    with an empty SMILES are skipped. Rows of <moldir>/morphlincs.tsv are:
    id, smiles, inchikey, inchi (last two empty when hts.apply is falsy).
    """
    src = os.path.join(downloadsdir,checkerconfig.morphlincs_molecules_download)
    # The input is opened first so that a missing download no longer truncates
    # an existing output file; 'with' closes both files on any exit path.
    with open(src, "r") as g, open(moldir+"/morphlincs.tsv", "w") as f:
        next(g)
        for l in csv.reader(g, delimiter = "\t"):
            Id = l[0]
            smi = l[7]
            # Single emptiness check (the original tested the same value twice).
            if not smi:
                continue
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            f.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def nci60():
    """Write nci60.tsv from the NCI60 download.

    The input (checkerconfig.nci60_download) is comma-separated with a header
    row; column 0 is used as id and column 5 as SMILES. Rows with an empty
    SMILES are skipped. Rows of <moldir>/nci60.tsv are: id, smiles, inchikey,
    inchi (last two empty when hts.apply returns a falsy value).
    """
    src = os.path.join(downloadsdir,checkerconfig.nci60_download)
    # 'with' closes both files even when a row makes the loop raise.
    with open(src, "r") as f, open(moldir+"/nci60.tsv", "w") as g:
        next(f)
        for l in csv.reader(f):
            Id, smi = l[0], l[5]
            if not smi:
                continue
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            g.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def pdb():
    """Write pdb.tsv from the PDB components SMILES download.

    The input (checkerconfig.pdb_components_smiles_download) is read as
    tab-separated lines of at least two fields: SMILES, then ligand id. Rows
    of <moldir>/pdb.tsv are: ligand id, smiles, inchikey, inchi; the last two
    are empty when hts.apply returns a falsy value.
    """
    src = os.path.join(downloadsdir,checkerconfig.pdb_components_smiles_download)
    # 'with' closes both files even when a line makes the loop raise. The two
    # lookup dicts the original filled were never read and have been dropped.
    with open(src, "r") as f, open(moldir+"/pdb.tsv", "w") as g:
        for l in f:
            l = l.rstrip("\n").split("\t")
            # Lines with fewer than two fields are skipped.
            if len(l) < 2:
                continue
            smi, lig_id = l[0], l[1]
            mol = hts.apply(smi)
            if not mol:
                g.write("%s\t%s\t\t\n" % (lig_id, smi))
                continue
            g.write("%s\t%s\t%s\t%s\n" % (lig_id, smi, mol[0], mol[1]))
def sider():
    """Write sider.tsv from the SIDER and STITCH downloads.

    The ids are the second tab-separated field of every line of
    checkerconfig.sider_download. Their SMILES come from the last column of
    the STITCH molecules file (tab-separated, header skipped, keyed on column
    0). Rows of <moldir>/sider.tsv are: id, smiles, inchikey, inchi (last two
    empty when hts.apply returns a falsy value).
    """
    S = set()
    with open(os.path.join(downloadsdir,checkerconfig.sider_download), "r") as f:
        for l in f:
            S.add(l.split("\t")[1])

    stitch = {}
    with open(os.path.join(downloadsdir,checkerconfig.stitch_molecules_download), "r") as f:
        next(f)
        for r in csv.reader(f, delimiter = "\t"):
            if r[0] not in S:
                continue
            stitch[r[0]] = r[-1]

    with open(moldir + "/sider.tsv", "w") as f:
        # Sorted for a reproducible output, like the other set-based molrepos.
        for Id in sorted(S):
            # An id missing from the STITCH file used to abort with KeyError;
            # it is now skipped exactly like an id with an empty SMILES.
            smi = stitch.get(Id)
            if not smi:
                continue
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            f.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
def smpdb():
    """Write smpdb.tsv from the SMPDB metabolites download.

    The input (checkerconfig.smpdb_metabolites_download) is comma-separated
    with a header row; column 5 is used as id and column 12 as SMILES. Rows
    with an empty SMILES are skipped and duplicate pairs collapsed. The pairs
    are written in sorted order to <moldir>/smpdb.tsv as: id, smiles,
    inchikey, inchi (last two empty when hts.apply returns a falsy value).
    """
    S = set()
    # 'with' closes the input even when a row makes the loop raise.
    with open(os.path.join(downloadsdir,checkerconfig.smpdb_metabolites_download), "r") as f:
        next(f)
        for r in csv.reader(f):
            if not r[12]:
                continue
            S.add((r[5], r[12]))

    with open(moldir+"/smpdb.tsv", "w") as g:
        for Id, smi in sorted(S):
            mol = hts.apply(smi)
            if mol:
                inchikey, inchi = mol[0], mol[1]
            else:
                inchikey, inchi = "", ""
            g.write("%s\t%s\t%s\t%s\n" % (Id, smi, inchikey, inchi))
if __name__ == '__main__':
    # Command line: <molrepo name | all> <config file>.
    # The functions above read the module-level downloadsdir and moldir, so
    # both are assigned here, at module scope, before any of them is called.

    # Molrepo builders, in the order they run when "all" is requested.
    molrepos = [
        ("bindingdb", bindingdb),
        ("chebi", chebi),
        ("chembl", chembl),
        ("ctd", ctd),
        ("drugbank", drugbank),
        ("kegg", kegg),
        ("lincs", lincs),
        ("morphlincs", morphlincs),
        ("mosaic", mosaic),
        ("nci60", nci60),
        ("pdb", pdb),
        ("sider", sider),
        ("smpdb", smpdb),
    ]

    if len(sys.argv) != 3:
        # The exit status is unchanged (1), but the reason is now reported.
        sys.stderr.write("Usage: %s <molrepo|all> <config_ini>\n" % sys.argv[0])
        sys.exit(1)

    args = dict()
    args["do"] = sys.argv[1]

    # An unknown name used to be accepted and silently did nothing.
    if args["do"] != "all" and args["do"] not in [name for name, _ in molrepos]:
        sys.stderr.write("Unknown molrepo: %s\n" % args["do"])
        sys.exit(1)

    configFilename = sys.argv[2]
    checkercfg = checkerconfig.checkerConf( configFilename)

    downloadsdir = checkercfg.getDirectory( "downloads" )
    moldir = checkercfg.getDirectory( "molRepo" )

    for name, build in molrepos:
        if args["do"] == name or args["do"] == "all":
            build()
#!/usr/bin/env python
#
# Runs all the tasks of this step
#
# Imports
import os
import sys
import subprocess
sys.path.append(os.path.join(sys.path[0],"../../src/utils"))
from checkerUtils import logSystem, execAndCheck
import checkerconfig
# Constants
# Tasks of this step, in execution order. Each entry is a 3-tuple:
#   [0] title written to the log when the task starts,
#   [1] script file name (optionally followed by an argument), joined to the
#       directory of this script to build the command to run,
#   [2] tag used to build the name of the task's ".ready" file; a task whose
#       ready file already exists is skipped.
tasks = [
( 'Dowload all files', 'downloadFiles.py', 'download_files' ),
( 'Mol Repos BindingDB', 'essential_molrepos.py bindingdb', 'mol_repos_bindingdb' ),
( 'Mol Repos Chebi', 'essential_molrepos.py chebi', 'mol_repos_chebi' ),
( 'Mol Repos Chembl', 'essential_molrepos.py chembl', 'mol_repos_chembl' ),
( 'Mol Repos CTD', 'essential_molrepos.py ctd', 'mol_repos_ctd' ),
( 'Mol Repos Drugbank', 'essential_molrepos.py drugbank', 'mol_repos_drugbank' ),
( 'Mol Repos Kegg', 'essential_molrepos.py kegg', 'mol_repos_kegg' ),
( 'Mol Repos Lincs', 'essential_molrepos.py lincs', 'mol_repos_lincs' ),
( 'Mol Repos Morphlincs', 'essential_molrepos.py morphlincs', 'mol_repos_morphlincs' ),
( 'Mol Repos Mosaic', 'essential_molrepos.py mosaic', 'mol_repos_mosaic' ),
( 'Mol Repos NCI60', 'essential_molrepos.py nci60', 'mol_repos_nci60' ),
( 'Mol Repos PDB', 'essential_molrepos.py pdb', 'mol_repos_pdb' ),
( 'Mol Repos Sider', 'essential_molrepos.py sider', 'mol_repos_sider' ),
( 'Mol Repos Smpdb', 'essential_molrepos.py smpdb', 'mol_repos_smpdb' )
]
# Functions
def usage(progName):
    """Print the one-line command-line synopsis for progName to stdout."""
    synopsis = "Usage: " + progName + " <config_ini>"
    print(synopsis)
def main():
# Check arguments
checkercfg = checkerconfig.checkerConf( )
readyFiledir = checkercfg.getDirectory( "ready", configUpdate.UNIPROTKB_SUBDIR )
log = logSystem(sys.stdout)
log.debug(os.getcwd())
dirName = os.path.abspath(sys.argv[0]).split("/")[-2]
bOk = True
for i in range(0,len(tasks)):
log.info("====>>>> "+tasks[i][0]+" <<<<====")
readyFilename = os.path.join(readyFiledir,dirName+"_"+tasks[i][2]+".ready")
if os.path.exists(readyFilename):
log.info( "Ready file for task %s does exist. Skipping this task..." % tasks[i][2] )
continue
# Then I execute the current task
try:
scriptName = os.path.join(sys.path[0],tasks[i][1])
p = subprocess.Popen( [scriptName,configFilename], stderr=subprocess.STDOUT )
(pid,retcode) = os.waitpid(p.pid, 0)
if retcode != 0: