"""Container for static parsing methods.

Each parsing function here is iterating on a raw input file.
Each input line is a molecule which is loaded, standardised and converted
to InChI and InChIKeys.
The raw features are yielded in chunks as dictionaries.
These methods are used to populate the :mod:`~chemicalchecker.database.molrepo`
database table.
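
Example (a minimal usage sketch; the file path is hypothetical and the
actual loading into the database is handled elsewhere)::

    parse = Parser.parse_fn("bindingdb")
    for chunk in parse({"bindingdb": "BindingDB_All.tsv"}, "bindingdb"):
        pass  # each chunk is a list of molecule dictionaries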
"""
import os
import csv
import pandas as pd
import xml.etree.ElementTree as ET

from .converter import Converter
from chemicalchecker.util import logged
from chemicalchecker.util import psql


@logged
class Parser():
    """Collection of static parsing methods, one per molecule repository."""

    @staticmethod
    def parse_fn(function):
        """Return the parsing function with the given name."""
        try:
            return getattr(Parser, function)
        except Exception as ex:
            Parser.__log.error("Cannot find parsing function %s", function)
            raise ex

    @staticmethod
    def bindingdb(map_paths, molrepo_name, chunks=1000):
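        """Parse a BindingDB TSV dump (ligand SMILES and InChIKey columns)."""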
        converter = Converter()

        file_path = map_paths[molrepo_name]
        fh = open(file_path, "r")
        # skip header
        header = fh.readline()
        header_rows = 1
        header = header.rstrip("\n").split("\t")
        # get indexes
        bdlig_idx = header.index("Ligand InChI Key")
        smiles_idx = header.index("Ligand SMILES")
        done = set()
        chunk = list()
        for idx, line in enumerate(fh):
            idx = idx + header_rows
            line = line.rstrip("\n").split("\t")
            src_id = line[bdlig_idx]
            smiles = line[smiles_idx]
            # skip repeated entries
            if src_id in done:
                # Parser.__log.debug("skipping line %s: repeated.", idx)
                continue
            done.add(src_id)
            if not smiles:
                # Parser.__log.debug("skipping line %s: missing smiles.", idx)
                continue
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def chebi(map_paths, molrepo_name, chunks=1000):
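        """Parse the ChEBI lite SDF file."""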
        try:
            import rdkit.Chem as Chem
        except ImportError:
            raise ImportError("requires rdkit " +
                              "https://www.rdkit.org/")
        converter = Converter()

        file_path = map_paths["chebi_lite"]
        suppl = Chem.SDMolSupplier(file_path)
        chunk = list()
        for idx, line in enumerate(suppl):
            if not line:
                continue
            src_id = line.GetPropsAsDict()['ChEBI ID']
            smiles = Chem.MolToSmiles(line, isomericSmiles=True)
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def ctd(map_paths, molrepo_name, chunks=1000):
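        """Parse CTD chemicals-diseases entries with DirectEvidence."""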
        converter = Converter()

        file_path = map_paths["CTD_chemicals_diseases"]
        fh = open(file_path, "r")
        done = set()
        chunk = list()
        for idx, line in enumerate(fh):
            # skip header
            if line.startswith("#"):
                continue
            line = line.rstrip("\n").split("\t")
            # skip those without DirectEvidence
            if not line[5]:
                continue
            chemicalname = line[0]
            chemicalid = line[1]
            src_id = chemicalid
            # skip repeated entries
            if src_id in done:
                # Parser.__log.debug("skipping line %s: repeated.", idx)
                continue
            done.add(src_id)
            # try to convert the CTD ID to SMILES
            smiles = None
            try:
                smiles = converter.ctd_to_smiles(chemicalid)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
            # if that didn't work, we can still try with the chemical name
            if not smiles:
                try:
                    smiles = converter.chemical_name_to_smiles(chemicalname)
                except Exception as ex:
                    Parser.__log.warning("line %s: %s", idx, str(ex))
                    continue
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def chembl(map_paths, molrepo_name, chunks=1000):
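        """Query the ChEMBL database for IDs and canonical SMILES."""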
        converter = Converter()
        # no file to parse here, but querying the chembl database
        query = "SELECT md.chembl_id, cs.canonical_smiles " +\
            "FROM molecule_dictionary md, compound_structures cs " +\
            "WHERE md.molregno = cs.molregno " +\
            "AND cs.canonical_smiles IS NOT NULL"
        cur = psql.qstring_cur(query, molrepo_name)
        chunk = list()
        for idx, row in enumerate(cur):
            src_id = row[0]
            smiles = row[1]
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def drugbank(map_paths, molrepo_name, chunks=1000):
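        """Parse the DrugBank XML dump."""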
        converter = Converter()

        file_path = map_paths[molrepo_name]
        # parse XML
        prefix = "{http://www.drugbank.ca}"
        tree = ET.parse(file_path)
        root = tree.getroot()
        chunk = list()
        for idx, drug in enumerate(root):
            # Drugbank ID
            src_id = None
            for child in drug.findall(prefix + "drugbank-id"):
                if "primary" in child.attrib:
                    if child.attrib["primary"] == "true":
                        src_id = child.text
            if not src_id:
                Parser.__log.warning("line %s: %s", idx, "no drugbank-id")
                continue
            # SMILES
            smiles = None
            for props in drug.findall(prefix + "calculated-properties"):
                for prop in props:
                    if prop.find(prefix + "kind").text == "SMILES":
                        smiles = prop.find(prefix + "value").text
            if not smiles:
                Parser.__log.warning("line %s: %s", idx, "no SMILES")
                continue
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def kegg(map_paths, molrepo_name, chunks=1000):
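        """Parse KEGG BRITE entries, fetching molfiles via KEGG REST."""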
        try:
            import pybel
        except ImportError:
            raise ImportError("requires pybel " +
                              "http://openbabel.org")
        try:
            import wget
        except ImportError:
            raise ImportError("requires wget " +
                              "http://bitbucket.org/techtonik/python-wget/src")
        converter = Converter()

        file_path = map_paths["kegg_br"]
        fh = open(file_path, "r")
        # KEGG molecules will be downloaded to the following directory
        kegg_download = os.path.join(os.path.dirname(file_path), 'mols')
        if not os.path.isdir(kegg_download):
            os.mkdir(kegg_download)
        done = set()
        chunk = list()
        for idx, line in enumerate(fh):
            if not line.startswith("F"):
                continue
            src_id = line.split()[1]
            # skip repeated entries
            if src_id in done:
                # Parser.__log.debug("skipping line %s: repeated.", idx)
                continue
            done.add(src_id)
            # download mol if not available
            mol_path = os.path.join(kegg_download, '%s.mol' % src_id)
            if not os.path.isfile(mol_path):
                url = "http://rest.kegg.jp/get/" + src_id + "/mol"
                try:
                    wget.download(url, mol_path)
                except Exception:
                    Parser.__log.error('Cannot download: %s', url)
                    continue
            mol = pybel.readfile("mol", mol_path)
            for m in mol:
                smiles = m.write("smi").rstrip("\n").rstrip("\t")
                if not smiles:
                    Parser.__log.warning("line %s: %s", idx, "no SMILES")
                # the following is always the same
                try:
                    inchikey, inchi = converter.smiles_to_inchi(smiles)
                except Exception as ex:
                    Parser.__log.warning("line %s: %s", idx, str(ex))
                    inchikey, inchi = None, None
                id_text = molrepo_name + "_" + src_id
                if inchikey is not None:
                    id_text += ("_" + inchikey)
                result = {
                    "id": id_text,
                    "molrepo_name": molrepo_name,
                    "src_id": src_id,
                    "smiles": smiles,
                    "inchikey": inchikey,
                    "inchi": inchi
                }
                chunk.append(result)
            if len(chunk) >= chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def lincs(map_paths, molrepo_name, chunks=1000):
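        """Parse LINCS compound info (pert_id and canonical SMILES)."""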
        converter = Converter()

        file_path = map_paths["compoundinfo_beta"]
        df = pd.read_csv(file_path, sep='\t')
        df = df[['pert_id', 'canonical_smiles', 'inchi_key']]
        df = df[df['canonical_smiles'] != 'restricted']
        df = df.dropna(subset=['canonical_smiles'])
        df = df.sort_values('pert_id')
        df = df.drop_duplicates(subset=['canonical_smiles'])
        df = df.reset_index(drop=True)

        chunk = list()
        for idx, line in df.iterrows():
            src_id = line['pert_id']
            smiles = line['canonical_smiles']
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def mosaic(map_paths, molrepo_name, chunks=1000):
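        """Parse the MOSAIC all-collections SDF file."""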
        try:
            import pybel
        except ImportError:
            raise ImportError("requires pybel " +
                              "http://openbabel.org")
        converter = Converter()
        # FIXME find source (hint:/aloy/home/mduran/myscripts/mosaic/D/D3/data)
        # eventually add All_collection to local
        # check input size

        file_path = map_paths["mosaic_all_collections"]
        chunk = list()
        for mol in pybel.readfile("sdf", file_path):
            if not mol:
                continue
            smi, src_id = mol.write("can").rstrip("\n").split("\t")
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("Mosaic ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def morphlincs(map_paths, molrepo_name, chunks=1000):
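        """Parse the Morphlincs small-molecule metadata file."""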
        converter = Converter()

        file_path = os.path.join(
            map_paths["morphlincs_LDS-1195"], "LDS-1195/Metadata/Small_Molecule_Metadata.txt")
        g = open(file_path, "r")
        g.readline()
        chunk = list()
        for l in csv.reader(g, delimiter="\t"):
            if not l[6]:
                continue
            src_id = l[8]
            smi = l[6]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("Morphlincs ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def nci60(map_paths, molrepo_name, chunks=1000):
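        """Parse the DTP NCI-60 Z-score spreadsheet."""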
        converter = Converter()

        file_path = os.path.join(
            map_paths["DTP_NCI60_ZSCORE"], "output/DTP_NCI60_ZSCORE.xlsx")
        Parser.__log.info("Converting Zscore xls file to csv")
        data_xls = pd.read_excel(file_path, index_col=0)
        csv_path = os.path.splitext(file_path)[0] + ".csv"
        data_xls.to_csv(csv_path, encoding='utf-8')
        f = open(csv_path, "r")
        f.readline()
        chunk = list()
        for l in csv.reader(f):
            src_id, smi = l[0], l[5]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("NCI60 ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def pdb(map_paths, molrepo_name, chunks=1000):
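        """Parse the PDB chemical components file."""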
        converter = Converter()

        file_path = map_paths["pdb_components"]
        chunk = list()
        f = open(file_path, "r")
        for l in f:
            l = l.rstrip("\n").split("\t")
            if len(l) < 2:
                continue
            src_id = l[1]
            smi = l[0]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("PDB ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def sider(map_paths, molrepo_name, chunks=1000):
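        """Parse SIDER side effects joined with STITCH chemical SMILES."""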
        converter = Converter()

        sider_file = ""
        stitch_file = ""
        chunk = list()
        for file in map_paths.values():
            if "meddra_all_se" in file:
                sider_file = file
                continue
            if "chemicals" in file:
                stitch_file = file

        if sider_file == "" or stitch_file == "":
            raise Exception("Missing expected input files")

        with open(sider_file, "r") as f:
            S = set()
            for l in f:
                l = l.split("\t")
                S.update([l[1]])

        with open(stitch_file, "r") as f:
            stitch = {}
            f.readline()
            for r in csv.reader(f, delimiter="\t"):
                if r[0] not in S:
                    continue
                stitch[r[0]] = r[-1]

        for s in list(S):
            src_id = s
            smi = stitch[s]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("SIDER ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def smpdb(map_paths, molrepo_name, chunks=1000):
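        """Parse the SMPDB structure SDF files."""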
        try:
            import pybel
        except ImportError:
            raise ImportError("requires pybel " +
                              "http://openbabel.org")
        converter = Converter()

        file_path = os.path.join(
            map_paths["smpdb_structures"], "smpdb_structures")
        S = set()
        L = os.listdir(file_path)
        chunk = list()
        for l in L:
            for mol in pybel.readfile("sdf", file_path + "/" + l):
                if not mol:
                    continue
                smi, Id = mol.write("can").rstrip("\n").split("\t")
                S.update([(Id, smi)])

        for s in sorted(S):
            src_id = s[0]
            smi = s[1]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("SMPDB ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def biur_real(map_paths, molrepo_name, chunks=1000):
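        """Parse the biur_real SDF file."""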
        try:
            import rdkit.Chem as Chem
        except ImportError:
            raise ImportError("requires rdkit " +
                              "https://www.rdkit.org/")
        converter = Converter()

        file_path = map_paths[molrepo_name]
        chunk = list()
        suppl = Chem.SDMolSupplier(file_path)
        for mol in suppl:
            if not mol:
                continue
            src_id = mol.GetProp("_Name")
            smi = Chem.MolToSmiles(mol, isomericSmiles=True)
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("biur_real ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def biur_virtual(map_paths, molrepo_name, chunks=1000):
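        """Parse the biur_virtual SDF files (VIRTUAL_BIUR_POR_MW)."""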
        try:
            import rdkit.Chem as Chem
        except ImportError:
            raise ImportError("requires rdkit " +
                              "https://www.rdkit.org/")
        converter = Converter()

        file_path = os.path.join(
            str(map_paths[molrepo_name]), "VIRTUAL_BIUR_POR_MW")
        chunk = list()
        sdf_files = [f for f in os.listdir(file_path) if f[-4:] == ".sdf"]
        for sdf_file in sdf_files:
            suppl = Chem.SDMolSupplier(file_path + "/" + sdf_file)
            for mol in suppl:
                if not mol:
                    continue
                src_id = mol.GetProp("_Name")
                smi = Chem.MolToSmiles(mol, isomericSmiles=True)

                try:
                    inchikey, inchi = converter.smiles_to_inchi(smi)
                except Exception as ex:
                    Parser.__log.warning(
                        "biur_virtual ID %s: %s", src_id, str(ex))
                    inchikey, inchi = None, None
                id_text = molrepo_name + "_" + src_id
                if inchikey is not None:
                    id_text += ("_" + inchikey)

                result = {
                    "id": id_text,
                    "molrepo_name": molrepo_name,
                    "src_id": src_id,
                    "smiles": smi,
                    "inchikey": inchikey,
                    "inchi": inchi
                }
                chunk.append(result)
                if len(chunk) == chunks:
                    yield chunk
                    chunk = list()
        yield chunk

    @staticmethod
    def cmaup(map_paths, molrepo_name, chunks=1000):
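        """Parse the CMAUP file (tab-separated, SMILES in the last column)."""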
        converter = Converter()

        file_path = map_paths[molrepo_name]
        chunk = list()
        f = open(file_path, "r")
        for l in f:
            l = l.rstrip("\n").split("\t")
            if len(l) < 2:
                continue
            src_id = l[0]
            smi = l[-1]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("CMAUP ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def repohub(map_paths, molrepo_name, chunks=1000):
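        """Parse the RepoHub file (multiple IDs and SMILES per line)."""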
        converter = Converter()

        file_path = map_paths[molrepo_name]
        chunk = list()
        f = open(file_path, "r")
        for l in f:
            l = l.rstrip("\n").split("\t")
            if len(l) < 2:
                continue
            src_ids = l[7].split(", ")
            smis = l[8].split(", ")
            for (src_id, smi) in zip(src_ids, smis):
                try:
                    inchikey, inchi = converter.smiles_to_inchi(smi)
                except Exception as ex:
                    Parser.__log.warning("RepoHub ID %s: %s", src_id, str(ex))
                    inchikey, inchi = None, None
                id_text = molrepo_name + "_" + src_id
                if inchikey is not None:
                    id_text += ("_" + inchikey)
                result = {
                    "id": id_text,
                    "molrepo_name": molrepo_name,
                    "src_id": src_id,
                    "smiles": smi,
                    "inchikey": inchikey,
                    "inchi": inchi
                }
                chunk.append(result)
                if len(chunk) == chunks:
                    yield chunk
                    chunk = list()
        yield chunk

    @staticmethod
    def hmdb(map_paths, molrepo_name, chunks=1000):
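        """Parse the HMDB metabolites XML dump."""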
        from lxml import etree as ET
        converter = Converter()
        # Functions

        def fast_iter(context, func):
            for event, elem in context:
                yield func(elem)
                elem.clear()
                for ancestor in elem.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]
            del context

        def process_elem(elem):
            src_id = elem.find(ns + "accession")
            smiles = elem.find(ns + "smiles")
            if src_id is None or smiles is None:
                return None, None
            return src_id.text, smiles.text

        file_path = map_paths["hmdb_metabolites"]
        ns = "{http://www.hmdb.ca}"
        chunk = list()
        idx = 0
        # parse XML
        context = ET.iterparse(file_path, events=(
            "end", ), tag=ns + "metabolite")
        for src_id, smiles in fast_iter(context, process_elem):
            if src_id is None or smiles is None:
                continue
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            idx += 1
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def pharmacodb(file_path, molrepo_name, chunks=1000):
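        """Query PharmacoDB drug annotations, falling back to PubChem."""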
        from pubchempy import Compound
        converter = Converter()
        # no file to parse here, but querying the pharmacodb database
        query = "SELECT drug_id, smiles, pubchem " +\
            "FROM drug_annots WHERE smiles IS NOT NULL or pubchem IS NOT NULL"
        cur = psql.qstring_cur(query, molrepo_name)
        chunk = list()
        for idx, row in enumerate(cur):
            src_id = "pharmacodb_%d" % row[0]
            smiles = row[1]
            pubchem = row[2]
            if (smiles is None or smiles == "-666") and pubchem is not None:
                try:
                    smiles = Compound.from_cid(pubchem).isomeric_smiles
                except Exception:
                    continue
            if smiles is None or smiles == "-666":
                continue
            # the following is always the same
            try:
                inchikey, inchi = converter.smiles_to_inchi(smiles)
            except Exception as ex:
                Parser.__log.warning("line %s: %s", idx, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smiles,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def touchstone(map_paths, molrepo_name, chunks=1000):
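        """Parse the LINCS pert_info file, keeping Touchstone compounds."""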
        converter = Converter()
        file_path = map_paths["GSE92742_Broad_LINCS_pert_info"]
        chunk = list()
        f = open(file_path, "r")
        reader = csv.reader(f, delimiter="\t")
        header = next(reader)
        istouch_idx = header.index("is_touchstone")
        pertid_idx = header.index("pert_id")
        pertype_idx = header.index("pert_type")
        smiles_idx = header.index("canonical_smiles")
        for r in reader:
            if r[istouch_idx] != "1":
                continue
            if r[pertype_idx] != "trt_cp":
                continue
            src_id = r[pertid_idx]
            smi = r[smiles_idx]
            if smi == "-666":
                continue
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("Touchstone ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk

    @staticmethod
    def zinc(map_paths, molrepo_name, chunks=1000):
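        """Parse a ZINC SMILES file."""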
        converter = Converter()
        file_path = map_paths[molrepo_name]
        f = open(file_path, "r")
        delimiter = '\t'
        index_smi = 0
        index_id = 1
        min_items = 2
        if molrepo_name == 'tool':
            delimiter = ' '
            index_smi = 0
            index_id = 2
            min_items = 3
            f.readline()

        chunk = list()

        for l in f:
            l = l.rstrip("\n").split(delimiter)
            if len(l) < min_items:
                continue
            src_id = l[index_id]
            smi = l[index_smi]
            try:
                inchikey, inchi = converter.smiles_to_inchi(smi)
            except Exception as ex:
                Parser.__log.warning("ZINC ID %s: %s", src_id, str(ex))
                inchikey, inchi = None, None
            id_text = molrepo_name + "_" + src_id
            if inchikey is not None:
                id_text += ("_" + inchikey)
            result = {
                "id": id_text,
                "molrepo_name": molrepo_name,
                "src_id": src_id,
                "smiles": smi,
                "inchikey": inchikey,
                "inchi": inchi
            }
            chunk.append(result)
            if len(chunk) == chunks:
                yield chunk
                chunk = list()
        yield chunk