# Chemical Checker Repository
The **Chemical Checker (CC)** is a resource of small molecule signatures. In the CC, compounds are described from multiple viewpoints, spanning every aspect of the drug discovery pipeline, from chemical properties to clinical outcomes.
......@@ -42,17 +42,29 @@ All the dependencies for the CC will be bundled within a singularity image gener
Generating such an image requires roughly 20 minutes:
1. [Install Singularity](https://www.sylabs.io/guides/2.6/user-guide/installation.html)
1. [Install Singularity](https://sylabs.io/guides/3.8/admin-guide/admin_quickstart.html#installation-from-source)
VER=2.5.1
wget https://github.com/sylabs/singularity/releases/download/$VER/singularity-$VER.tar.gz
tar xvf singularity-$VER.tar.gz
cd singularity-$VER
./configure --prefix=/usr/local --sysconfdir=/etc
make
sudo make install
$ sudo apt-get update && sudo apt-get install -y \
build-essential \
uuid-dev \
libgpgme-dev \
squashfs-tools \
libseccomp-dev \
wget \
pkg-config \
git \
cryptsetup-bin \
golang-go
$ export VERSION=3.8.0 && # adjust this as necessary \
wget https://github.com/sylabs/singularity/releases/download/v${VERSION}/singularity-ce-${VERSION}.tar.gz && \
tar -xzf singularity-ce-${VERSION}.tar.gz && \
cd singularity-ce-${VERSION}
$ ./mconfig && \
make -C ./builddir && \
sudo make -C ./builddir install
> In case of errors during this step, check Singularity [prerequisites](https://www.sylabs.io/guides/2.6/user-guide/installation.html#before-you-begin)!
2. Clone this repository to your code folder:
......@@ -73,6 +85,9 @@ source ~/.bashrc
chemcheck
```
_**N.B.** If you are using another shell (e.g. zsh), just copy the `chemcheck` alias from your `.bashrc` to your `.zshrc`._
## Running a custom Chemical Checker
If you are contributing code to the CC, you can run the Singularity image specifying your local development branch:
......@@ -89,6 +104,14 @@ The CC relies on one config file containing the information for the current enviro
chemcheck -c /path/to/your/cc_config.json
```
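Inside the container, the same configuration is exposed through `chemicalchecker.util.Config`. A minimal, hypothetical sketch follows; only the argument-less constructor and the `VERBOSITY.level` attribute are taken from the package's `__init__.py`, any other keys are assumptions:
```python
from chemicalchecker.util import Config

# Config() reads the config file for the current environment
cfg = Config()

# attribute-style access, as used in chemicalchecker/__init__.py
print(cfg.VERBOSITY.level)
```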
## Running with an alternative image
You might want to use a previously compiled or downloaded image:
```bash
chemcheck -i /path/to/your/cc_image.simg
```
## Usage
We make it trivial to either start a Jupyter Notebook within the image or to run a shell:
......
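Once inside the container started with `chemcheck`, the `chemicalchecker` package can be used directly. Below is a minimal, hypothetical sketch; the constructor call and the `get_signature` parameters (`'sign1'`, `'reference'`, `'A1.001'`) are illustrative assumptions, not guaranteed defaults:
```python
from chemicalchecker import ChemicalChecker

# instantiate the CC; constructor arguments (if any) are omitted here
cc = ChemicalChecker()

# fetch a signature object; cctype, molset and dataset code are examples
sign1 = cc.get_signature('sign1', 'reference', 'A1.001')
print(sign1.data_path)  # path to the signature H5 file
```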
......@@ -7,11 +7,20 @@ FROM conda/miniconda3-centos7:latest
RUN yum install -y gcc g++ gcc-c++ cmake gfortran gcc-gfortran make wget squashfs-tools
RUN yum install -y libarchive-devel
RUN yum groupinstall 'Development Tools' -y
RUN export SINGVER=2.5.1
RUN wget https://github.com/sylabs/singularity/releases/download/2.5.1/singularity-2.5.1.tar.gz
RUN tar xvf singularity-2.5.1.tar.gz
WORKDIR "./singularity-2.5.1"
RUN ./configure --prefix=/usr/local
RUN make
RUN make install
RUN pip install -U setuptools twine
\ No newline at end of file
RUN export SINGVER=3.6.4 && \
wget https://github.com/hpcng/singularity/releases/download/v${SINGVER}/singularity-${SINGVER}.tar.gz && \
tar -xvf singularity-${SINGVER}.tar.gz && \
rm singularity-${SINGVER}.tar.gz
RUN export GOVER=1.13 && \
wget https://go.dev/dl/go$GOVER.linux-amd64.tar.gz && \
tar -C /usr/local -xzvf go$GOVER.linux-amd64.tar.gz && \
rm go$GOVER.linux-amd64.tar.gz
RUN echo 'export GOPATH=${HOME}/go' >> ~/.bashrc && \
echo 'export PATH=/usr/local/go/bin:${PATH}:${GOPATH}/bin' >> ~/.bashrc && \
source ~/.bashrc && \
cd singularity && \
./mconfig && \
make -C ./builddir && \
make -C ./builddir install
WORKDIR singularity
RUN pip install -U setuptools twine
......@@ -64,10 +64,14 @@ From: centos:8
conda install -y -c pytorch faiss-cpu # facebook trick for sparsity
conda install -y -c conda-forge hdbscan # clustering
conda install -y -c efelix fpsim2 # fast compound similarity searches (used in TargetMate)
conda install -c anaconda ipython
conda install -y -c conda-forge jupyterlab
conda install -c anaconda protobuf
conda install -c conda-forge munch
# utility packages
pip install ipython # interactive python
pip install jupyter # jupyter notebooks
#pip install ipython # interactive python
#pip install jupyter # jupyter notebooks
pip install six # py2/py3 compatibility
pip install pytest # unit-testing
pip install mock # mocking for unit-testing
......@@ -103,7 +107,7 @@ From: centos:8
pip install theano # Optimize evaluate math expressions
pip install h5py # HDF5 via python
pip install fancyimpute # matrix completion and imputation algorithms
pip install protobuf==3.6.1 # Google serialization library
#pip install protobuf==3.6.1 # Google serialization library
#pip install intbitset # sort unsigned integers
# graph packages
......@@ -134,13 +138,13 @@ From: centos:8
DEVPI_HOST=gitlabsbnb.irbbarcelona.org
DEVPI_PORT=3141
alias sbnb_pip='pip install --index http://$DEVPI_HOST:$DEVPI_PORT/root/dev/ --trusted-host $DEVPI_HOST'
sbnb_pip pqkmeans==1.0.4 # slightly modified pqkmeans
sbnb_pip pdbe-api # PDBe REST API
sbnb_pip pride-api # PRIDE REST API
sbnb_pip reactome-api # Reactome REST API
sbnb_pip sbnb-dsysmap # access the sbnb-dsysmap database
sbnb_pip sbnb-uniprotkb # interface to the local UniprotKB database
sbnb_pip sbnb-util # utility code in the SBNB lab
#sbnb_pip pqkmeans==1.0.4 # slightly modified pqkmeans
#sbnb_pip pdbe-api # PDBe REST API
#sbnb_pip pride-api # PRIDE REST API
#sbnb_pip reactome-api # Reactome REST API
#sbnb_pip sbnb-dsysmap # access the sbnb-dsysmap database
#sbnb_pip sbnb-uniprotkb # interface to the local UniprotKB database
#sbnb_pip sbnb-util # utility code in the SBNB lab
# chemical beauty QED
curl -LO http://silicos-it.be.s3-website-eu-west-1.amazonaws.com/_downloads/qed-1.0.1.tar.gz
......
BootStrap: docker
From: centos:8
Bootstrap: docker
From: ubuntu:20.04
%environment
# PATHS
export PATH=/opt/miniconda3/bin:$PATH
source activate py37
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate py37
%files
#add node2vec pre-compiled binaries
container/singularity/binaries/node2vec /opt
%post
# bind paths
......@@ -15,33 +20,29 @@ From: centos:8
mkdir -p /aloy/web_checker
mkdir -p /slgpfs
# update yum
yum update -y
# update apt
apt update -y
# basic packages (~2 min)
yum install -y gcc \
gcc-c++ \
gcc-gfortran \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
apt install -y build-essential \
gcc \
graphviz \
cmake \
make \
git \
wget \
curl \
which \
vim \
bzip2 \
bzip2-devel \
libbz2-dev \
file \
libXrender \
libXext \
postgresql-server \
postgresql-contrib \
epel-release
libxrender-dev \
libxext-dev \
postgresql \
postgresql-contrib
# MS compatible font for plotting (~3 min.)
yum install -y xorg-x11-font-utils fontconfig libmspack
rpm -i http://download-ib01.fedoraproject.org/pub/epel/7/x86_64/Packages/c/cabextract-1.5-1.el7.x86_64.rpm
rpm -i https://downloads.sourceforge.net/project/mscorefonts2/rpms/msttcore-fonts-installer-2.6-1.noarch.rpm
DEBIAN_FRONTEND=noninteractive apt install -y ttf-mscorefonts-installer
# conda
mkdir -p /opt/miniconda3
......@@ -53,21 +54,27 @@ From: centos:8
# create and activate conda environment
conda update conda -y
. /opt/miniconda3/etc/profile.d/conda.sh
conda create --name=py37 python=3.7 -y
source activate py37
conda activate py37
# conda-specific packages (~1 min)
conda install -y -c rdkit rdkit # Open-Source Cheminformatics Software
conda install -y -c openbabel openbabel # chemical toolbox
conda install -y mkl-service # change the number of CPUs MKL is using
conda install -y anaconda-client # client that provides an interface to Anaconda Cloud
conda install -y -c pytorch faiss-cpu # facebook trick for sparsity
conda install -y -c conda-forge hdbscan # clustering
conda install -y -c efelix fpsim2 # fast compound similarity searches (used in TargetMate)
conda install -y numpy==1.19.2 # we fix numpy version because of TF
conda install -y -c conda-forge rdkit # Open-Source Cheminformatics Software
conda install -y -c openbabel openbabel # chemical toolbox
conda install -y mkl-service # change the number of CPUs MKL is using
conda install -y anaconda-client # client that provides an interface to Anaconda Cloud
conda install -y -c conda-forge hdbscan # clustering
conda install -y -c efelix fpsim2 # fast compound similarity searches (used in TargetMate)
conda install -y -c conda-forge jupyterlab # Jupyter notebooks
conda install -y -c anaconda ipython">7.19" # interactive python
conda install -y -c anaconda cython # C extensions
conda install -y -c pytorch faiss-cpu # efficient similarity search and clustering
alias test_faiss='python -c "import faiss, numpy
faiss.Kmeans(10, 20).train(numpy.random.rand(1000, 10).astype(numpy.float32))"'
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
# utility packages (~2 min)
pip install ipython # interactive python
pip install jupyter # jupyter notebooks
pip install six # py2/py3 compatibility
pip install pytest # unit-testing
pip install mock # mocking for unit-testing
......@@ -80,85 +87,66 @@ From: centos:8
pip install patool # unzipping
pip install wget # download library
pip install tqdm # handy progress-bar
#pip install apache-airflow # Install airflow for the pipeline
pip install munch # dictionary that supports attribute-style access
# raw data packages
pip install lxml # xml parser
#pip install xlrd # Extract data from Excel spreadsheets
pip install xlrd # Extract data from Excel spreadsheets
pip install cmapPy # interacting with .gctx and .gct files, and other Connectivity Map resources
pip install csvsort # Sort csv
# chemistry packages
#pip install e3fp # 3D molecular fingerprints (py3.7 compatibility fix at the bottom)
pip install e3fp # 3D molecular fingerprints
pip install pubchempy # Pubchem rest api
pip install standardiser # standardising molecules
pip install chembl_webresource_client # Chembl API
# ML packages
pip install scikit-learn # entry level ML in python
pip install gensim # topic modelling word2vec
pip install tensorflow==1.14.0 # neural network library
pip install adanet==0.5.0 # automl for NN
pip install keras==2.3.1 # NN API
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
pip install tensorflow # neural network library
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
pip install tensorflow-hub # repository of trained machine learning models
pip install adanet # automl for NN
pip install keras # NN API
# numerical packages
pip install cython # C extensions
pip install numpy # best numerical library ever
pip install pandas # handles table-like data structures
pip install openpyxl # open xlsx
pip install scipy # scientific python
pip install theano # Optimize evaluate math expressions
pip install h5py # HDF5 via python
pip install fancyimpute # matrix completion and imputation algorithms
pip install protobuf==3.6.1 # Google serialization library
#pip install intbitset # sort unsigned integers
pip install fancyimpute==0.5.4 # matrix completion and imputation algorithms
pip install protobuf # Google serialization library
pip install statsmodels # many different statistical models and tests
# graph packages
pip install networkx # graph data structures and algos
pip install snap-stanford # Python interface for SNAP
# visualization packages
pip install matplotlib==3.2.1 # chart library :)
pip install matplotlib # chart library :)
pip install seaborn # prettier graph built on top of matplotlib
pip install datashader # plot huge 2D datasets (projections)
pip install statannot # annotate boxplot with significance ***
pip install matplotlib_venn # Venn diagrams
# Dimensionality reduction
pip install MulticoreTSNE # tSNE algo
pip install umap-learn # UMAP algo
# Chembl API (Nico)
pip install chembl_webresource_client
# install local SBNB devpi packages
DEVPI_HOST=gitlabsbnb.irbbarcelona.org
DEVPI_PORT=3141
alias sbnb_pip='pip install --index http://$DEVPI_HOST:$DEVPI_PORT/root/dev/ --trusted-host $DEVPI_HOST'
sbnb_pip pqkmeans==1.0.4 # slightly modified pqkmeans
sbnb_pip pdbe-api # PDBe REST API
sbnb_pip pride-api # PRIDE REST API
sbnb_pip reactome-api # Reactome REST API
sbnb_pip sbnb-dsysmap # access the sbnb-dsysmap database
sbnb_pip sbnb-uniprotkb # interface to the local UniprotKB database
sbnb_pip sbnb-util # utility code in the SBNB lab
# chemical beauty QED
curl -LO http://silicos-it.be.s3-website-eu-west-1.amazonaws.com/_downloads/qed-1.0.1.tar.gz
mv qed-1.0.1.tar.gz /opt
cd /opt
tar -xvf qed-1.0.1.tar.gz
cd qed-1.0.1
2to3 silicos_it/descriptors/qed.py -w # very minor py3 compatibility issues (prints)
python setup.py install
# fix until e3fp is compatible with python 3.7
cd /opt
git clone https://github.com/keiserlab/e3fp.git
cd e3fp/
cython e3fp/fingerprint/metrics/_fast.pyx # this is the fix
python setup.py build_ext --inplace
python setup.py install
# additional TargetMate packages
pip install tpot # Auto ML tool with Genetic Programming
pip install mlflow # Platform for the ML lifecycle
pip install xgboost # Gradient Boosting framework
pip install hyperopt # Hyperparameters optimization
pip install shap # SHapley Additive exPlanations for ML models
# APSW stands for Another Python SQLite Wrapper
pip install https://github.com/rogerbinns/apsw/releases/download/3.24.0-r1/apsw-3.24.0-r1.zip --global-option=fetch --global-option=--version --global-option=3.24.0 --global-option=--all --global-option=build --global-option=--enable-all-extensions
# integration of D1
pip install git+git://github.com/Maayanlab/geode.git # Python implementation of the R package GeoDE
#add pre-compiled binaries
%files
container/singularity/binaries/node2vec /opt
# Clone the chemical checker package (used as fallback)
cd /opt
git clone http://gitlabsbnb.irbbarcelona.org/packages/chemical_checker.git
BootStrap: docker
From: centos:8
%environment
# PATHS
export PATH=/opt/miniconda3/bin:$PATH
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate py37
%files
#add node2vec pre-compiled binaries
container/singularity/binaries/node2vec /opt
%post
# bind paths
mkdir -p /aloy
mkdir -p /aloy/home
mkdir -p /aloy/data
mkdir -p /aloy/scratch
mkdir -p /aloy/web_checker
mkdir -p /slgpfs
# update yum
yum update -y
# basic packages (~2 min)
yum install -y gcc \
gcc-c++ \
gcc-gfortran \
graphviz \
cmake \
make \
git \
wget \
curl \
which \
vim \
bzip2 \
bzip2-devel \
file \
libXrender \
libXext \
postgresql-server \
postgresql-contrib \
epel-release \
graphviz
# MS compatible font for plotting (~3 min.)
yum install -y xorg-x11-font-utils fontconfig libmspack
rpm -i http://download-ib01.fedoraproject.org/pub/epel/7/x86_64/Packages/c/cabextract-1.9-7.el7.x86_64.rpm
rpm -i https://downloads.sourceforge.net/project/mscorefonts2/rpms/msttcore-fonts-installer-2.6-1.noarch.rpm
# conda
mkdir -p /opt/miniconda3
cd /opt/miniconda3
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -p /opt/miniconda3 -b -f
rm Miniconda3-latest-Linux-x86_64.sh
export PATH=/opt/miniconda3/bin:$PATH
# create and activate conda environment
conda update conda -y
conda create --name=py37 python=3.7 -y
source activate py37
# conda-specific packages (~1 min)
conda install -y numpy==1.19.2 # we fix numpy version because of TF
conda install -y -c conda-forge rdkit # Open-Source Cheminformatics Software
conda install -y -c openbabel openbabel # chemical toolbox
conda install -y mkl-service # change the number of CPUs MKL is using
conda install -y anaconda-client # client that provides an interface to Anaconda Cloud
conda install -y -c conda-forge hdbscan # clustering
conda install -y -c efelix fpsim2 # fast compound similarity searches (used in TargetMate)
conda install -y -c conda-forge jupyterlab # Jupyter notebooks
conda install -y -c anaconda ipython">7.19" # interactive python
conda install -y -c anaconda cython # C extensions
conda install -y -c pytorch faiss-cpu # efficient similarity search and clustering
conda install -y -c conda-forge chembl_structure_pipeline # standardizing molecules the ChEMBL way
conda install -y -c tmap tmap # visualization library for large, high-dimensional data sets
alias test_faiss='python -c "import faiss, numpy
faiss.Kmeans(10, 20).train(numpy.random.rand(1000, 10).astype(numpy.float32))"'
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
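# (note) test_faiss runs a tiny k-means (20 centroids on 1000 random
# 10-dimensional vectors) purely as a smoke test that faiss imports and trains;
# a failure only prints an error message and does not abort the build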
# utility packages (~2 min)
pip install six # py2/py3 compatibility
pip install pytest # unit-testing
pip install mock # mocking for unit-testing
pip install autologging # logging handler
pip install paramiko # ssh client
pip install psycopg2-binary # PostgreSQL driver
pip install sqlalchemy # ORM SQL
pip install sphinx # to generate docs
pip install sphinx_rtd_theme # docs theme.
pip install patool # unzipping
pip install wget # download library
pip install tqdm # handy progress-bar
pip install munch # dictionary that supports attribute-style access
# raw data packages
pip install lxml # xml parser
pip install xlrd # Extract data from Excel spreadsheets
pip install cmapPy # interacting with .gctx and .gct files, and other Connectivity Map resources
pip install csvsort # Sort csv
# chemistry packages
pip install e3fp # 3D molecular fingerprints
pip install pubchempy # Pubchem rest api
pip install standardiser # standardising molecules (used only in TargetMate)
pip install chembl_webresource_client # Chembl API
# ML packages
pip install scikit-learn # entry level ML in python
pip install gensim # topic modelling word2vec
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
pip install tensorflow==2.5 # neural network library
if ! test_faiss; then printf -- "\033[31m ERROR: faiss is failing! \033[0m\n"; else printf -- '\033[32m SUCCESS: faiss is working correctly. \033[0m\n'; fi
pip install tensorflow-hub # repository of trained machine learning models
pip install adanet # automl for NN
#pip install keras # NN API
# numerical packages
pip install numpy # best numerical library ever
pip install pandas # handles table-like data structures
pip install openpyxl # open xlsx
pip install scipy # scientific python
pip install theano # Optimize evaluate math expressions
pip install protobuf # Google serialization library
pip install statsmodels # many different statistical models and tests
# graph packages
pip install networkx # graph data structures and algos
pip install snap-stanford # Python interface for SNAP
# visualization packages
pip install matplotlib # chart library :)
pip install seaborn # prettier graph built on top of matplotlib
pip install datashader # plot huge 2D datasets (projections)
pip install statannot # annotate boxplot with significance ***
pip install matplotlib_venn # Venn diagrams
pip install pydot # python graphviz interface
# Dimensionality reduction
pip install cmake
pip install MulticoreTSNE # tSNE algo
pip install umap-learn # UMAP algo
# additional TargetMate packages
pip install tpot # Auto ML tool with Genetic Programming
pip install mlflow # Platform for the ML lifecycle
pip install xgboost # Gradient Boosting framework
pip install hyperopt # Hyperparameters optimization
pip install shap # SHapley Additive exPlanations for ML models
pip install pyrfr # Python interface to RFR, an extensible C++ library for random forests
pip install auto-sklearn # Automated machine learning toolkit
pip install signaturizer # ChemicalChecker Signaturizer
# integration of D1
pip install git+git://github.com/Maayanlab/geode.git # Python implementation of the R package GeoDE
# Clone the chemical checker package (used as fallback)
cd /opt
git clone http://gitlabsbnb.irbbarcelona.org/packages/chemical_checker.git
......@@ -4,7 +4,8 @@ From: centos
%environment
# PATHS
export PATH=/opt/miniconda3/bin:$PATH
source activate py37
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate py37
%post
# bind paths
......@@ -65,6 +66,7 @@ From: centos
pip install csvsort
pip install seaborn
pip install tqdm
pip install munch
# unittest utils
pip install pytest
......
......@@ -39,7 +39,7 @@ lint: ## check style with flake8
flake8 chemicalchecker tests
test: ## run tests quickly with the default Python
pytest -s
pytest tests/ -s
docs: ## generate Sphinx HTML documentation, including API docs
rm -fr docs/_build/
......
......@@ -24,10 +24,12 @@
* The :mod:`~chemicalchecker.util` module pulls together
general utilities.
"""
import pandas
from .core import ChemicalChecker
from .util import Config
#ChemicalChecker.set_verbosity(level=Config().VERBOSITY.level)
__author__ = """SBNB"""
__email__ = 'sbnb@irbbarcelona.org'
__version__ = '1.0.0'
__version__ = '1.0.1'
......@@ -30,6 +30,7 @@ class DataFactory():
from .sign1 import sign1
from .sign2 import sign2
from .sign3 import sign3
from .sign4 import sign4
from .clus import clus
from .neig import neig # nearest neighbour class
......@@ -60,6 +61,7 @@ class DataFactory():
from .sign1 import sign1
from .sign2 import sign2
from .sign3 import sign3
from .sign4 import sign4
from .signature_data import DataSignature
from .clus import clus
......
......@@ -6,8 +6,10 @@ import os
import h5py
import datetime
import numpy as np
from tqdm import tqdm
from numpy import linalg as LA
from bisect import bisect_left
from scipy.spatial.distance import euclidean, cosine
from .signature_base import BaseSignature
from .signature_data import DataSignature
......@@ -66,12 +68,19 @@ class neig(BaseSignature, DataSignature):
raise ImportError("requires faiss " +
"https://github.com/facebookresearch/faiss")
# signature specific checks
if self.molset != "reference":
self.__log.debug("Fit will be done with the reference neig1")
self = self.get_molset("reference")
if sign1 is None:
sign1 = self.get_sign(
'sign' + self.cctype[-1]).get_molset("reference")
if sign1.molset != "reference":
raise Exception(
"Fit should be done with the reference sign1")
self.__log.debug("Fit will be done with the reference sign1")
sign1 = self.get_sign(
'sign' + self.cctype[-1]).get_molset("reference")
if not sign1.is_fit():
raise Exception("sign1 is not fitted.")
faiss.omp_set_num_threads(self.cpu)
......@@ -249,7 +258,7 @@ class neig(BaseSignature, DataSignature):
return predictions
def get_vectors(self, keys, include_nan=False, dataset_name='V'):
def get_vectors(self, keys, include_nan=False, dataset_name='indices'):
"""Get vectors for a list of keys, sorted by default.
Args:
......@@ -363,6 +372,23 @@ class neig(BaseSignature, DataSignature):
return predictions
def check_distances(self, n_sign=5, n_neig=10):
sign = self.get_sign('sign' + self.cctype[-1])
dist_fn = eval(self.metric)
for ink1 in tqdm(sign.keys[:n_sign], desc='Checking distances'):
s1 = sign[ink1]
nn = self[ink1]
inks = nn['keys'][:n_neig]
dists = nn['distances'][:n_neig]
for ink2, dist in zip(inks, dists):
ink2 = ink2.decode()
s2 = sign[ink2]
comp_d = dist_fn(s1, s2)
if not np.allclose(dist, comp_d, atol=1e-05):
self.__log.error('%s %s %.6f %.6f', ink1, ink2, dist,
comp_d)
np.testing.assert_allclose(dist, comp_d, atol=1e-05)
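# Hypothetical usage note (names are illustrative, not from the source): given a
# fitted neighbour signature, e.g. `neig1 = cc.get_signature('neig1', 'reference', 'A1.001')`,
# calling `neig1.check_distances()` recomputes the first neighbours' distances with
# the scipy function named by `self.metric` and asserts they match the values
# stored by faiss within an absolute tolerance of 1e-05.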
@staticmethod
def jaccard_similarity(n1, n2):
"""Compute Jaccard similarity.
......
......@@ -16,9 +16,9 @@ integers
continuous: "0.515,1.690,0.996" which is an array of floats
"""
import os
import imp
import h5py
import argparse
import importlib
import numpy as np
from .signature_data import DataSignature
......@@ -34,21 +34,23 @@ features_file = "features.h5"
class Preprocess():
"""Preprocess class."""
def __init__(self, signature_path, dataset, **params):
def __init__(self, signature_path, dataset, *args, **kwargs):
"""Initialize a Preprocess instance.
This class handles calling the external run.py for each dataset and
provide shared methods.
Args:
signature_path(str): the path to the signature directory.
"""
# Calling init on the base class to trigger file existence checks
self.__log.debug('signature path is: %s', signature_path)
# TODO check if kwargs are needed (D1)
self.__log.info('Preprocess signature: %s', signature_path)
self.raw_path = os.path.join(signature_path, "raw")
self.raw_model_path = os.path.join(signature_path, "raw", "models")
if not os.path.isdir(self.raw_path):
Preprocess.__log.info(
"Initializing new raw in: %s" % self.raw_path)
Preprocess.__log.debug(
"Initializing raw path: %s" % self.raw_path)
original_umask = os.umask(0)
os.makedirs(self.raw_path, 0o775)
os.umask(original_umask)
......@@ -58,9 +60,8 @@ class Preprocess():
os.makedirs(self.raw_model_path, 0o775)
os.umask(original_umask)
# NS what is returned by cc.preprocess(sign) after prepro.fit()
# where preprocess data will be saved if fit is called
self.data_path = os.path.join(self.raw_path, "preprocess.h5")
self.__log.debug('data_path: %s', self.data_path)
dir_path = os.path.dirname(os.path.realpath(__file__))
......@@ -72,10 +73,7 @@ class Preprocess():
"run.py")
if not os.path.isfile(self.preprocess_script):
self.__log.warning(
"Pre-process script not found! %s", self.preprocess_script)
for param, value in params.items():
self.__log.debug('parameter %s : %s', param, value)
"Preprocess script not found! %s", self.preprocess_script)
def is_fit(self):
if os.path.exists(self.data_path):
......@@ -83,8 +81,7 @@ class Preprocess():
else:
return False
def call_preprocess(self, output, method, infile=None, entry=None):
def call_preprocess(self, output, method, infile=None, entry=None): #params = {}
"""Call the external pre-process script."""
# create argument list
arglist = ["-o", output, "-mp", self.raw_model_path, "-m", method]
......@@ -93,45 +90,50 @@ class Preprocess():
if entry:
arglist.extend(['-ep', entry])
# import and run the run.py
preprocess = imp.load_source('main', self.preprocess_script)
loader = importlib.machinery.SourceFileLoader('preprocess_script',
self.preprocess_script)
preprocess = loader.load_module()
# self.__log.debug('ARGS: %s' % str(arglist))
preprocess.main(arglist)
def fit(self):
"""Call the external preprocess script to generate h5 data.
"""Call the external preprocess script to generate H5 data.
The preprocess script is invoked with the `fit` argument, which means
features are extracted from datasources and saved.
features are extracted from the datasource and saved.
"""
# check that preprocess script is available and call it
self.__log.debug('Calling pre-process script %s',
self.preprocess_script)
self.__log.info('Calling preprocess script: %s',
self.preprocess_script)
if not os.path.isfile(self.preprocess_script):
raise Exception("Pre-process script not found! %s",
raise Exception("Preprocess script not found! %s",
self.preprocess_script)
self.call_preprocess(self.data_path, "fit")
self.call_preprocess(self.data_path, "fit", None, None) #self.params
def predict(self, input_data_file, destination):
"""Call the external preprocess script to generate h5 data."""
def predict(self, input_data_file, destination, entry_point):
"""Call the external preprocess script to generate H5 data."""
"""
Args:
input_data_file(str): Path to the file with the raw to generate
the signature 0.
destination(str): Path to a .h5 file where the predicted signature
destination(str): Path to a H5 file where the predicted signature
will be saved.
entry_point(str): Entry point of the input data for the
signaturization process. It depends on the type of data passed
at the input_data_file.
"""
# check that preprocess script is available and call it
self.__log.debug('Calling pre-process script %s',self.preprocess_script)
self.__log.info('Calling preprocess script: %s',
self.preprocess_script)
if not os.path.isfile(self.preprocess_script):
raise Exception("Pre-process script not found! %s", self.preprocess_script)
raise Exception("Pre-process script not found! %s",
self.preprocess_script)
#self.call_preprocess(destination, "predict", input_data_file, self.entry_point) # NS: self.entry_point not defined anywhere
self.call_preprocess(destination, "predict", input_data_file)
self.call_preprocess(destination, "predict", infile=input_data_file,
entry=entry_point)
def to_features(self, signatures):
"""Convert signature to explicit feature names.
......@@ -161,20 +163,80 @@ class Preprocess():
result.append(dict(zip(keys, values)))
return result
@classmethod
def preprocess(cls, sign, **params):
"""Return the file with the raw data preprocessed.
Args:
sign: signature object (e.g. obtained from cc.get_signature)
params: specific parameters for a given preprocess script
Returns:
datafile(str): The name of the file where the data is saved.
ex:
os.path.join(self.raw_path, "preprocess.h5")
"""
prepro = cls(sign.signature_path, sign.dataset, **params)
if not prepro.is_fit():
cls.__log.info(
"No raw data file found, calling the preprocessing script")
prepro.fit()
else:
cls.__log.info("Found {}".format(prepro.data_path))
return prepro.data_path
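# Hypothetical usage (illustrative names): for a sign0 object obtained via
# `sign0 = cc.get_signature('sign0', 'full', 'A1.001')`, calling
# `Preprocess.preprocess(sign0)` runs the dataset's run.py with the `fit`
# method if no preprocess.h5 exists yet, and returns the path to that file.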
@classmethod
def preprocess_predict(cls, sign, input_file, destination, entry_point):
"""Runs the preprocessing script 'predict'.
Run on an input file of raw data formatted correctly for the space of
interest
Args:
sign: signature object ( e.g. obtained from cc.get_signature)
input_file(str): path to the H5 file containing the data on which
to apply 'predict'
destination(str): Path to a H5 file where the predicted signature
will be saved.
entry_point(str): Entry point of the input data for the
signaturization process. It depends on the type of data passed
at the input_data_file.
Returns:
datafile(str): The H5 file containing the predicted data after
preprocess
"""
input_file = os.path.abspath(input_file)
destination = os.path.abspath(destination)
# Checking the provided paths
if not os.path.exists(input_file):
raise Exception("Error, {} does not exist!".format(input_file))
prepro = cls(sign.signature_path, sign.dataset)
prepro.predict(input_file, destination, entry_point)
return destination
@staticmethod
def get_parser():
description = 'Run preprocess script.'
parser = argparse.ArgumentParser(description=description)
parser.add_argument('-i', '--input_file', type=str,
required=False, default='.', help='Input file only for predict method')
required=False, default='.',
help='Input file only for predict method')
parser.add_argument('-o', '--output_file', type=str,
required=False, default='.', help='Output file')
parser.add_argument('-m', '--method', type=str,
required=False, default='fit', help='Method: fit or predict')
required=False, default='fit',
help='Method: fit or predict')
parser.add_argument('-mp', '--models_path', type=str,
required=False, default='', help='The models path')
required=False, default='',
help='The models path')
parser.add_argument('-ep', '--entry_point', type=str,
required=False, default=None, help='The predict entry point')
required=False, default=None,
help='The predict entry point')
return parser
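# Example of the argument list these flags produce, as built by
# call_preprocess (paths and the entry-point value are hypothetical):
#   ['-o', 'preprocess.h5', '-mp', './models', '-m', 'predict',
#    '-i', 'input.h5', '-ep', 'smiles']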
@staticmethod
......@@ -190,19 +252,44 @@ class Preprocess():
return map_files
@staticmethod
def save_output(output_file, inchikey_raw, method, models_path, discrete, features, features_int=False, chunk=2000):
def save_output(output_file, inchikey_raw, method, models_path, discrete,
features, features_int=False, chunk=2000):
"""Save raw data produced by the preprocess script as matrix.
The result of preprocess scripts is usually in compact format (e.g.
binary data only lists features with value 1) since data might be
sparse and memory intensive to handle. This method converts it to a
signature-like (explicit, extended) format. The produced H5 will
contain 3 datasets:
* 'keys': identifier (usually inchikey),
* 'features': features names,
* 'X': the data matrix
Args:
output_file(str): Path to output H5 file.
inchikey_raw(dict): inchikey -> list of values (dense format).
method(str): Same as used in the preprocess script.
models_path(str): Path to signature models directory.
discrete(bool): True if data is binary/discrete, False for
continuous data.
features(list): List of feature names from original sign0,
None when method is 'fit'.
features_int(bool): Features have no names; we can use integers as
feature names.
chunk(int): Chunk size for loading data.
"""
keys = []
if discrete:
# check if categorical
categ = False
for k, v in inchikey_raw.items():
if len(v) > 0:
if isinstance(v[0], tuple):
categ = True
break
# words are all possible features
words = set()
for k in sorted(inchikey_raw.keys()):
keys.append(str(k))
......@@ -212,11 +299,14 @@ class Preprocess():
else:
words.update(inchikey_raw[k])
# if we have features available ('predict' method) check overlap
if features is not None:
orderwords = features
Preprocess.__log.info("Predict entries have a total of %s features," +
" %s overlap with trainset and will be considered.",
len(words), len(set(features) & words))
Preprocess.__log.debug(
"Predict entries have a total of %s features,"
" %s overlap with trainset and will be considered.",
len(words), len(set(features) & words))
# otherwise deduce features from data provided and sort them
else:
orderwords = list(words)
del words
......@@ -225,6 +315,9 @@ class Preprocess():
else:
orderwords.sort()
# prepare output file
Preprocess.__log.info("Output file will be of shape: "
"%s" % [len(keys), len(orderwords)])
with h5py.File(output_file, "w") as hf:
hf.create_dataset("keys", data=np.array(
keys, DataSignature.string_dtype()))
......@@ -233,11 +326,12 @@ class Preprocess():
hf.create_dataset("features", data=np.array(
orderwords, DataSignature.string_dtype()))
# write data in H5
raws = np.zeros((chunk, len(orderwords)), dtype=np.int8)
wordspos = {k: v for v, k in enumerate(orderwords)}
index = 0
for i, k in enumerate(keys):
# prepare chunk
shared_features = set(inchikey_raw[k]) & set(orderwords)
if len(shared_features) == 0:
Preprocess.__log.warn(
......@@ -249,6 +343,7 @@ class Preprocess():
raws[index][wordspos[word]] = 1
index += 1
# when chunk is complete or molecules are over, write to file
if index == chunk or i == len(keys) - 1:
end = i + 1
if index != chunk:
......@@ -259,26 +354,25 @@ class Preprocess():
raws = np.zeros((chunk, len(orderwords)), dtype=np.int8)
index = 0
if method == "fit":
with h5py.File(os.path.join(models_path, features_file), "w") as hf:
hf.create_dataset("features", data=np.array(
orderwords, DataSignature.string_dtype()))
saving_features = orderwords
# continuous
else:
# get molecules inchikeys
for k in inchikey_raw.keys():
keys.append(str(k))
keys = np.array(keys)
inds = keys.argsort()
data = []
# sorted data
for i in inds:
data.append(inchikey_raw[keys[i]])
# define features if not available
if features is None:
features = [str(i) for i in range(1, len(data[0]) + 1)]
# save data
with h5py.File(output_file, "w") as hf:
hf.create_dataset("keys", data=np.array(
keys[inds], DataSignature.string_dtype()))
......@@ -286,10 +380,14 @@ class Preprocess():
hf.create_dataset("features", data=np.array(
features, DataSignature.string_dtype()))
if method == "fit":
with h5py.File(os.path.join(models_path, features_file), "w") as hf:
hf.create_dataset("features", data=np.array(
features, DataSignature.string_dtype()))
saving_features = features
# if fitting, we also save the features
if method == "fit":
fn = os.path.join(models_path, features_file)
with h5py.File(fn, "w") as hf:
hf.create_dataset("features", data=np.array(saving_features, DataSignature.string_dtype()))
def to_feature_string(self, signatures, string_func):
"""Convert signature to a string with feature names.
......@@ -315,7 +413,7 @@ class Preprocess():
@staticmethod
def _feat_value_only(res_dict):
"""Suited for continuos spaces."""
"""Suited for continuous spaces."""
strings = list()
for k in sorted(res_dict.keys()):
strings.append("%.3f" % res_dict[k])
......@@ -410,7 +508,8 @@ class Preprocess():
# new_only_keys = self.unique_keys - old_keys
# shared_keys = self.unique_keys & old_keys
# frac_present = len(shared_keys) / float(len(old_keys))
# self.__log.info("Among %s OLD molecules %.2f%% are still present:",
# self.__log.info(
# "Among %s OLD molecules %.2f%% are still present:",
# len(old_keys),
# 100 * frac_present)
# self.__log.info("Old keys: %s", len(old_keys))
......@@ -433,7 +532,8 @@ class Preprocess():
# 'new_sign': None
# }
# to_sample = min(len(shared_keys), to_sample)
# sample = np.random.choice(list(shared_keys), to_sample, replace=False)
# sample = np.random.choice(list(shared_keys), to_sample,
# replace=False)
# res = psql.qstring(
# "SELECT inchikey,raw FROM %s WHERE inchikey = ANY('{%s}');" %
# (table_name, ','.join(sample)), old_dbname)
......@@ -458,7 +558,8 @@ class Preprocess():
# most_diff['new_sign'] = feat_new
# total += len(feat_old)
# frac_equal = not_changed / float(to_sample)
# self.__log.info("Among %s shared sampled signatures %.2f%% are equal:",
# self.__log.info(
# "Among %s shared sampled signatures %.2f%% are equal:",
# to_sample, 100 * frac_equal)
# self.__log.info("Equal: %s Changed: %s", not_changed, changed)
# if changed == 0:
......
......@@ -45,6 +45,7 @@ class proj(BaseSignature, DataSignature):
self.__log.debug('data_path: %s', self.data_path)
self.projector = eval(proj_type)(signature_path, dataset, **kwargs)
self.proj_type = proj_type
self.stats_path = self.projector.stats_path
self.model_path = self.projector.model_path
......@@ -62,7 +63,8 @@ class proj(BaseSignature, DataSignature):
pred_proj = DataSignature(destination)
with h5py.File(signature.data_path, "r") as src, \
h5py.File(destination, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array(
['PCA preprocess'], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
......@@ -88,7 +90,8 @@ class proj(BaseSignature, DataSignature):
pred_proj = DataSignature(destination)
with h5py.File(signature.data_path, "r") as src, \
h5py.File(destination, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array(
['PCA preprocess'], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
......@@ -118,7 +121,11 @@ class proj(BaseSignature, DataSignature):
# also predict for full if available
sign_full = self.get_sign('sign' + self.cctype[-1]).get_molset("full")
if os.path.isfile(sign_full.data_path):
self.predict(sign_full, self.get_molset("full").data_path)
self_full = self.get_molset("full")
self_full = proj(self_full.signature_path,
self_full.dataset, proj_type=self.proj_type)
#self.predict(sign_full, self_full.data_path)
self.map(self_full.data_path)
def predict(self, signature, destination, *args, **kwargs):
"""Predict projection for new data."""
......
......@@ -5,12 +5,14 @@ import datetime
import numpy as np
from tqdm import tqdm
from time import time
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LinearRegression
from chemicalchecker.core.signature_base import BaseSignature
from chemicalchecker.core.signature_data import DataSignature
from chemicalchecker.util import logged
from chemicalchecker.util.plot import Plot
@logged
......@@ -25,11 +27,6 @@ class TSNE(BaseSignature, DataSignature):
dataset(object): The dataset object with all info related.
"""
# Calling init on the base class to trigger file existence checks
try:
from MulticoreTSNE import MulticoreTSNE
except ImportError:
raise ImportError("requires MulticoreTSNE " +
"http://github.com/DmitryUlyanov/Multicore-TSNE")
BaseSignature.__init__(
self, signature_path, dataset, **params)
self.__log.debug('signature path is: %s', signature_path)
......@@ -50,35 +47,69 @@ class TSNE(BaseSignature, DataSignature):
DataSignature.__init__(self, self.data_path)
self.__log.debug('data_path: %s', self.data_path)
self.name = "_".join([str(self.dataset), "proj", self.proj_name])
# if already fitted load the model and projections
self.algo_path = os.path.join(self.model_path, 'algo.pkl')
if self.is_fit():
self.algo = pickle.load(open(self.algo_path, 'rb'))
else:
self.algo = MulticoreTSNE(n_components=2, **params)
self.oos_mdl_path = os.path.join(self.model_path, 'oos.pkl')
def fit(self, signature, validations=True, chunk_size=100):
def fit(self, signature, validations=True, chunk_size=5000,
oos_predictor=False, proj_params={}, pre_pca=True):
"""Fit to signature data."""
try:
from MulticoreTSNE import MulticoreTSNE
except ImportError:
raise ImportError("requires MulticoreTSNE " +
"http://github.com/DmitryUlyanov/Multicore-TSNE")
projector = MulticoreTSNE(n_components=2, **proj_params)
# perform fit
self.__log.info("Projecting with %s..." % self.__class__.__name__)
for k, v in proj_params.items():
self.__log.info(' %s %s', k, v)
self.__log.info("Input shape: %s" % str(signature.info_h5['V']))
t_start = time()
with h5py.File(signature.data_path, "r") as src:
data = src["V"][:]
proj_data = self.algo.fit_transform(data)
# pre PCA
if pre_pca:
# find n_components to get 0.9 explained variance
ipca = IncrementalPCA(n_components=signature.shape[1])
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'fit expl_var'):
chunk = slice(i, i + chunk_size)
ipca.partial_fit(src["V"][chunk])
nr_comp = np.argmax(ipca.explained_variance_ratio_.cumsum() > 0.9)
# fit pca
ipca = IncrementalPCA(n_components=nr_comp)
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'fit'):
chunk = slice(i, i + chunk_size)
ipca.partial_fit(src["V"][chunk])
# transform
proj_data = list()
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'transform'):
chunk = slice(i, i + chunk_size)
proj_data.append(ipca.transform(src["V"][chunk]))
data = np.vstack(proj_data)
else:
# read data
with h5py.File(signature.data_path, "r") as src:
data = src["V"][:]
# do projection
self.__log.info("Final input shape: %s" % str(data.shape))
proj_data = projector.fit_transform(data)
if oos_predictor:
# t-SNE does not predict new samples, so we train a linear model
self.algo = LinearRegression()
self.algo.fit(data, proj_data)
mdl = LinearRegression()
mdl.fit(data, proj_data)
pickle.dump(mdl, open(self.oos_mdl_path, 'wb'))
t_end = time()
t_delta = datetime.timedelta(seconds=t_end - t_start)
self.__log.info("Projecting took %s" % t_delta)
# save model
pickle.dump(self.algo, open(self.algo_path, 'wb'))
# save h5
sdtype = DataSignature.string_dtype()
with h5py.File(signature.data_path, "r") as src, \
h5py.File(self.data_path, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array([self.name], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dst.create_dataset("date", data=np.array([date_str], sdtype))
......@@ -90,6 +121,9 @@ class TSNE(BaseSignature, DataSignature):
for i in tqdm(range(0, src_len, chunk_size), 'write'):
chunk = slice(i, i + chunk_size)
dst['V'][chunk] = proj_data[chunk]
# make plot
plot = Plot(self.dataset, self.stats_path)
xlim, ylim = plot.projection_plot(proj_data, bw=0.1, levels=10)
# run validation
if validations:
self.validate()
......@@ -97,11 +131,15 @@ class TSNE(BaseSignature, DataSignature):
def predict(self, signature, destination, chunk_size=100):
"""Predict new projections."""
if not os.path.isfile(self.oos_mdl_path):
raise Exception('Out-of-sample predictor was not trained.')
mdl = pickle.load(open(self.oos_mdl_path, 'rb'))
# create destination file
sdtype = DataSignature.string_dtype()
with h5py.File(signature.data_path, "r") as src, \
h5py.File(destination, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array([self.name], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dst.create_dataset("date", data=np.array([date_str], sdtype))
......@@ -112,4 +150,4 @@ class TSNE(BaseSignature, DataSignature):
dst.create_dataset("V", (src_len, 2), dtype=np.float32)
for i in tqdm(range(0, src_len, chunk_size), 'transform'):
chunk = slice(i, i + chunk_size)
dst['V'][chunk] = self.algo.predict(src['V'][chunk])
dst['V'][chunk] = mdl.predict(src['V'][chunk])
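Both projector classes (t-SNE above and UMAP below) now optionally reduce the input signatures with incremental PCA, keeping the number of components that reaches roughly 0.9 explained variance, before running the projection. A standalone sketch of that pre-PCA step is given below; the helper name, array shapes and chunk size are illustrative assumptions, not code from the repository:
```python
import numpy as np
from sklearn.decomposition import IncrementalPCA

def pre_pca(V, chunk_size=5000, target_var=0.9):
    """Chunk-wise PCA keeping roughly target_var of the explained variance."""
    # first pass: fit with all components to read the cumulative variance curve
    # (each chunk must contain at least V.shape[1] samples)
    ipca = IncrementalPCA(n_components=V.shape[1])
    for i in range(0, len(V), chunk_size):
        ipca.partial_fit(V[i:i + chunk_size])
    cumvar = ipca.explained_variance_ratio_.cumsum()
    nr_comp = max(int(np.argmax(cumvar > target_var)), 1)
    # second pass: refit keeping only the selected number of components
    ipca = IncrementalPCA(n_components=nr_comp)
    for i in range(0, len(V), chunk_size):
        ipca.partial_fit(V[i:i + chunk_size])
    # transform chunk-wise and stack the reduced matrix
    return np.vstack([ipca.transform(V[i:i + chunk_size])
                      for i in range(0, len(V), chunk_size)])

# illustrative call on random data standing in for a signature matrix
reduced = pre_pca(np.random.rand(10000, 128).astype(np.float32))
```
Fitting and transforming in chunks keeps memory bounded, which matters because the full signature matrices are read from H5 in slices.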
import os
import h5py
import joblib
import pickle
import datetime
import numpy as np
from tqdm import tqdm
from time import time
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LinearRegression
from chemicalchecker.core.signature_base import BaseSignature
from chemicalchecker.core.signature_data import DataSignature
from chemicalchecker.util.plot import Plot
from chemicalchecker.util import logged
from chemicalchecker.util.plot import Plot
@logged
......@@ -24,11 +25,6 @@ class UMAP(BaseSignature, DataSignature):
signature_path(str): the path to the signature directory.
dataset(object): The dataset object with all info related.
"""
try:
import umap
except ImportError:
raise ImportError("requires umap " +
"https://umap-learn.readthedocs.io/en/latest/")
# Calling init on the base class to trigger file existence checks
BaseSignature.__init__(
self, signature_path, dataset, **params)
......@@ -50,31 +46,69 @@ class UMAP(BaseSignature, DataSignature):
DataSignature.__init__(self, self.data_path)
self.__log.debug('data_path: %s', self.data_path)
self.name = "_".join([str(self.dataset), "proj", self.proj_name])
# get default parameters
self.params = dict(metric='cosine', init='random')
self.params.update(params)
# if already fitted load the model and projections
self.algo_path = os.path.join(self.model_path, 'algo.pkl')
self.algo = umap.UMAP(n_components=2, **self.params)
self.oos_mdl_path = os.path.join(self.model_path, 'oos.pkl')
def fit(self, signature, validations=True, chunk_size=100):
def fit(self, signature, validations=True, chunk_size=5000,
oos_predictor=False, proj_params={}, pre_pca=True):
"""Fit to signature data."""
try:
import umap
except ImportError:
raise ImportError("requires umap " +
"https://umap-learn.readthedocs.io/en/latest/")
# perform fit
projector = umap.UMAP(n_components=2, **proj_params)
self.__log.info("Projecting with %s..." % self.__class__.__name__)
for k, v in proj_params.items():
self.__log.info(' %s %s', k, v)
self.__log.info("Input shape: %s" % str(signature.info_h5['V']))
t_start = time()
with h5py.File(signature.data_path, "r") as src:
proj_data = self.algo.fit_transform(src["V"][:])
# pre PCA
if pre_pca:
# find n_components to get 0.9 explained variance
ipca = IncrementalPCA(n_components=signature.shape[1])
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'fit expl_var'):
chunk = slice(i, i + chunk_size)
ipca.partial_fit(src["V"][chunk])
nr_comp = np.argmax(ipca.explained_variance_ratio_.cumsum() > 0.9)
# fit pca
ipca = IncrementalPCA(n_components=nr_comp)
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'fit'):
chunk = slice(i, i + chunk_size)
ipca.partial_fit(src["V"][chunk])
# transform
proj_data = list()
with h5py.File(signature.data_path, "r") as src:
src_len = src["V"].shape[0]
for i in tqdm(range(0, src_len, chunk_size), 'transform'):
chunk = slice(i, i + chunk_size)
proj_data.append(ipca.transform(src["V"][chunk]))
data = np.vstack(proj_data)
else:
# read data
with h5py.File(signature.data_path, "r") as src:
data = src["V"][:]
# do projection
self.__log.info("Final input shape: %s" % str(data.shape))
proj_data = projector.fit_transform(data)
if oos_predictor:
# train a linear model for out-of-sample prediction (same approach as t-SNE)
mdl = LinearRegression()
mdl.fit(data, proj_data)
pickle.dump(mdl, open(self.oos_mdl_path, 'wb'))
t_end = time()
t_delta = datetime.timedelta(seconds=t_end - t_start)
self.__log.info("Projecting took %s" % t_delta)
# save model
joblib.dump(self.algo, open(self.algo_path, 'wb'))
# save h5
sdtype = DataSignature.string_dtype()
with h5py.File(signature.data_path, "r") as src, \
h5py.File(self.data_path, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array([self.name], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dst.create_dataset("date", data=np.array([date_str], sdtype))
......@@ -86,23 +120,25 @@ class UMAP(BaseSignature, DataSignature):
for i in tqdm(range(0, src_len, chunk_size), 'write'):
chunk = slice(i, i + chunk_size)
dst['V'][chunk] = proj_data[chunk]
# make plot
plot = Plot(self.dataset, self.stats_path)
xlim, ylim = plot.projection_plot(proj_data, bw=0.1, levels=10)
# run validation
if validations:
self.validate()
self.mark_ready()
def predict(self, signature, destination, chunk_size=100, plot=False, plot_kws=None):
def predict(self, signature, destination, chunk_size=100):
"""Predict new projections."""
# load pickled projector
try:
self.algo = joblib.load(open(self.algo_path))
except Exception as ex:
self.__log.warning("Cannot load projector: %s" % str(ex))
if not os.path.isfile(self.oos_mdl_path):
raise Exception('Out-of-sample predictor was not trained.')
mdl = pickle.load(open(self.oos_mdl_path, 'rb'))
# create destination file
sdtype = DataSignature.string_dtype()
with h5py.File(signature.data_path, "r") as src, \
h5py.File(destination, "w") as dst:
dst.create_dataset("keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset(
"keys", data=src['keys'][:], dtype=sdtype)
dst.create_dataset("name", data=np.array([self.name], sdtype))
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dst.