Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Packages
chemical_checker
Commits
411cec5d
Commit
411cec5d
authored
Feb 11, 2021
by
nsoler
Browse files
admin cc scripts
parent
e584e519
Changes
2
Hide whitespace changes
Inline
Side-by-side
package/scripts/calculate_Aspaces_raw.py
deleted
100644 → 0
View file @
e584e519
# Nico: Jan 2021
# Uses the data calculator class to get all A spaces preprocessed data
# Inspired from the cc pipeline
import
os
import
collections
from
chemicalchecker.util.parser
import
DataCalculator
from
chemicalchecker.core.preprocess
import
Preprocess
from
chemicalchecker.util.parser.features_A_spaces
import
fetch_features_A
# type of data, space_name, format size
# Note A3 contains both caffold and framework each encoded on 1024 bits, so we have 2048 total ex: f647,c92
# A4: supposed to be 166 groups but 152 keys are present in raw 2020_01 preprocess.h5
Aspaces
=
(
'A1'
,
'A2'
,
'A3'
,
'A4'
,
'A5'
)
def
calculate_data_fn
(
space
,
dict_inchikey_inchi
):
"""
Launch the (chemistry) data calculation for one type of space
Returns: a list containing the molecular properties in dense format
ex: [{'inchikey': 'ASXBYYWOLISCLQ-UHFFFAOYSA-N', 'raw': ..raw_string}, {}...]
Arguments:
- space (str): either A1, A2, A3, A4, A5, A5
- dict_inchikey_inchi (dict): mapping of the molecules to calculate properties from
"""
data_calculators
=
{
'A1'
:
'morgan_fp_r2_2048'
,
'A1'
:
'e3fp_3conf_1024'
,
'A1'
:
'murcko_1024_cframe_1024'
,
'A1'
:
'maccs_keys_166'
,
'A1'
:
'general_physchem_properties'
,
#'chembl_target_predictions_v23_10um':'A5',
}
type_data
=
data_calculators
.
get
(
space
.
upper
(),
None
)
assert
type_data
is
not
None
,
"Space "
+
space
+
" is not part of the CC A spaces!"
print
(
"Calculating data for "
+
type_data
)
parse_fn
=
DataCalculator
.
calc_fn
(
type_data
)
for
i
,
chunk
in
enumerate
(
parse_fn
(
dict_inchikey_inchi
)):
print
(
"CHUNK"
,
i
)
print
(
chunk
)
if
len
(
chunk
)
==
0
:
continue
else
:
return
chunk
return
None
def
calculate_mol_properties
(
dict_inchikey_inchi
):
"""
Calls calculate_data_fn for all spaces
Returns: a dict containing the molecular properties in dense format for all spaces
Arguments:
- space (str): either A1, A2, A3, A4, A5, A5
- dict_inchikey_inchi (dict): mapping of the molecules to calculate properties from
"""
result
=
dict
()
for
space
in
Aspaces
:
result
[
space
]
=
calculate_data_fn
(
space
,
dict_inchikey_inchi
=
inchikey2inchi_dict
)
# dictionary {'A1': [{'inchikey': 'ASXBYYWOLISCLQ-UHFFFAOYSA-N', 'raw': ..raw_string}, {}...]}
return
result
# Trying to save the result of A1 (from A1)
def
create_h5_from_inchikeys_inchi
(
dict_inchikey_inchi
,
outDir
=
'tmp'
):
"""
Create the preprocessed h5 data files
for spaces A1 to A5 for the molecules specified in the input inchikey_inchi dictionary
Here we cannot use the predict method from Preprocess.save_output since it requires connection to the database
whereas we work locally.
"""
if
not
os
.
path
.
exists
(
outDir
):
try
:
os
.
makedirs
(
outDir
)
except
Exception
as
e
:
print
(
"ERROR: cannot create output directory for "
)
print
(
e
)
return
outputfiles
=
{
space
:
os
.
path
.
join
(
outDir
,
space
+
'_outsideUniv.h5'
)
for
space
in
Aspaces
}
method
=
'fit'
model_path
=
'tmp'
# Dummy tmp folder
# Compute the raw properties
all_properties
=
calculate_mol_properties
(
dict_inchikey_inchi
)
all_features
=
fetch_features_A
()
# Here we need to preprocess the dense format according to which space we have
for
space
in
Aspaces
:
# Dense chemical properties for all spaces
raw_prop
=
all_properties
.
get
(
space
,
None
)
features
=
all_features
.
get
(
space
,
None
)
feature_int
=
False
if
space
in
(
'A3'
,
'A5'
)
else
True
discrete
=
False
if
space
==
'A5'
else
True
if
raw_prop
is
None
or
features
is
None
:
print
(
"WARNING, space"
,
space
,
"properties or features is None, space skipped"
)
continue
# format the dense chemical properties in tuples (inchikey, raw_str)
ACTS
=
[(
dic
[
'inchikey'
],
dic
[
'raw'
])
for
dic
in
result
[
'A1'
]]
if
space
in
(
'A1'
,
'A2'
,
'A3'
):
# Copied from the end of A1 preprocess script
RAW
=
collections
.
defaultdict
(
list
)
for
k
in
ACTS
:
if
features
is
None
:
vals
=
[
str
(
t
)
for
t
in
k
[
1
].
split
(
","
)]
else
:
vals
=
[
str
(
t
)
for
t
in
k
[
1
].
split
(
","
)
if
str
(
t
)
in
features
]
RAW
[
str
(
k
[
0
])]
=
vals
elif
space
==
'A4'
:
for
k
in
ACTS
:
if
k
[
1
]
==
''
or
k
[
1
]
is
None
or
k
[
0
]
is
None
:
continue
if
features
is
None
:
vals
=
[
str
(
t
)
for
t
in
k
[
1
].
split
(
","
)]
else
:
vals
=
[
str
(
t
)
for
t
in
k
[
1
].
split
(
","
)
if
str
(
t
)
in
features
]
RAW
[
str
(
k
[
0
])]
=
vals
elif
space
==
'A5'
:
sigs
=
collections
.
defaultdict
(
list
)
words
=
[]
first
=
True
for
k
in
ACTS
:
if
k
[
1
]
is
None
:
continue
data
=
k
[
1
].
split
(
","
)
vals
=
[]
for
d
in
data
:
ele
=
d
.
split
(
"("
)
if
first
:
words
.
append
(
str
(
ele
[
0
]))
vals
.
append
(
float
(
ele
[
1
][:
-
1
]))
sigs
[
str
(
k
[
0
])]
=
vals
first
=
False
RAW
=
sigs
# NS: Preprocess.save_output will then convert the dense format into binary data
Preprocess
.
save_output
(
outputfiles
[
space
],
RAW
,
method
,
model_path
,
discrete
,
features
,
features_int
=
feature_int
)
\ No newline at end of file
package/scripts/get_repo_version.py
0 → 100644
View file @
411cec5d
# Nico 11 Feb 2021
# Put here the current CC signature repo version that can be then fetched by scripts
import
os
CURRENT
=
"/aloy/web_checker/current"
def
cc_repo_version
():
try
:
return
os
.
path
.
abspath
(
os
.
path
.
realpath
(
CURRENT
))
except
Exception
as
e
:
print
(
"WARNING: the cc signature repo version cannot be retrieved from"
,
CURRENT
)
print
(
e
)
return
None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment