Merge pull request #11 from rvhonorato/pre-release

Version update
gdocking · Jun 10, 2021 · 7f9362c · 7f9362c
2 parents 0212698 + b209fb3
commit 7f9362c
Show file tree

Hide file tree

Showing 93 changed files with 149,167 additions and 5,398 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,6 @@ benchmark/*/
 examples/dev
 tests/test_data/setup.toml
 .coverage
+.vscode/*
+tests/dummy_f
+tests/test_data/clustering/*.contacts
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "benchmark_data"]
+	path = benchmark_data
+	url = https://github.com/rvhonorato/gdock-benchmark.git
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,11 @@
 language: python
 
 python:
+  - "3.6"
+  - "3.7"
   - "3.8"
+  - "3.9"
+
 install:
   - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
   - bash miniconda.sh -b -p $HOME/miniconda
@@ -29,9 +33,18 @@ before_script:
   - git clone https://github.com/haddocking/fcc.git
   - cd fcc
   - git checkout python3
+  - cd src
+  - make
+  - cd $HOME/src
+  - git clone https://github.com/haddocking/pdb-tools
+  - cd $HOME/src
+  - git clone https://github.com/haddocking/haddock-tools
+  - cd haddock-tools
+  - g++ -O2 -o contact-chainID contact-chainID.cpp
   - cd $HOME/src
-  - git clone https://github.com/bjornwallner/DockQ/
-  - cd DockQ
+  - wget http://www.bioinf.org.uk/software/profit/235216/profit.tar.gz
+  - tar zxvf profit.tar.gz
+  - cd ProFit_V3.3/src
   - make
   - cd $TRAVIS_BUILD_DIR
   - sed -i s"|/Users/rodrigo/repos/gdock|$HOME|g" etc/gdock.ini

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,5 +1,10 @@
-If you would like to contribute, the following steps should be done first:
+If you would like to contribute, here's a few suggestions:
 
--   <ins>Code optimization</ins>
--   <ins>Add flexibility to the molecules</ins>
--   <ins>Optimize the many Genetic Algorithm parameters</ins>
+-   <ins>Add flexibility to the docking</ins>, using normal-modes? molecular dynamics?
+-   <ins>Optimize the scoring function</ins>, add weights? change equation? change energy calculation?
+-   <ins>Improve usability</ins>, add `setup.py`
+-   <ins>Add input check</ins>, make sure that the input structures are ok
+
+* * *
+
+:octopus:
diff --git a/INSTALLATION.md b/INSTALLATION.md
@@ -2,43 +2,28 @@
 
 Download the latest relase or clone a stable branch of the repository.
 
-### Python Dependencies
+### Python environment
 
 An `environment.yml` file is provided and Anaconda is recomended.
 
 ```bash
-$ cd gdock
 $ conda env create -f environment.yml
 $ conda activate gdock
 (gdock) $
 ```
 
 ### Third-party Dependencies
 
-**gdock** uses [DComplex](https://sparks-lab.org/Publications_files/zhou061.pdf) as the scoring function, [FCC](https://github.com/haddocking/FCC) as the clustering engine and [DockQ](https://github.com/bjornwallner/DockQ) to calculate CAPRI metrics.
-
--   DComplex
-    -   Download from [this link](http://servers.sparks-lab.org/downloads/dcomplex2.tar.gz)
-    -   For it to work as intended in **gdock** you need to change line 115 and 148 of `dcomplex.c` and provide the full path to the DComplex installation,   then compile normally.
-        ```c++
-        // L28
-        #define MAXA		54000  //total atom number in one protein.
-        // L115
-        read_charge_file("/Users/rodrigo/software/dcomplex_single_file/charge_inp.dat");
-        // L148
-        fp=(FILE *)fopen("/Users/rodrigo/software/dcomplex_single_file/fort.21_alla","r"); //monomic 1.61
-        ```
-
--   FCC
-    -   Clone the repository from [this link](https://github.com/haddocking/FCC)
-    -   Follow the install instructions
-    -   Set the default branch to `python3`
-        ```bash
-        ~/repos/fcc $ git checkout python3
-        ```
-
--   DockQ
-    -   Download it from [this link](https://github.com/bjornwallner/DockQ)
-    -   Follow the install instructions
-
-Edit `etc/gdock.ini` with the correct paths
+`gdock` uses [DComplex](https://sparks-lab.org/Publications_files/zhou061.pdf) as the scoring function, [FCC](https://github.com/haddocking/FCC) as the clustering engine and [PROFIT](http://www.bioinf.org.uk/software/profit) to calculate CAPRI metrics.
+
+
+**After cloning the repository and setting up the python environment with anaconda**, you can use the very rough install script provided to install the third-party dependencies and configure `gdock.ini`:
+
+Example:
+```bash
+$ bash install.sh /home/rodrigo/software/gdock
+```
+
+* * *
+
+:octopus:
diff --git a/README.md b/README.md
@@ -1,77 +1,96 @@
-[![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/rvhonorato/gdock?color=red)](https://github.com/rvhonorato/gdock/releases/tag/v1.0.0)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4467795.svg)](https://doi.org/10.5281/zenodo.4467795)
-[![Build Status](https://travis-ci.com/rvhonorato/gdock.svg?branch=master)](https://travis-ci.com/rvhonorato/gdock)
-[![Codacy Badge](https://app.codacy.com/project/badge/Coverage/a794c83bedbc4e50b4bb6a0ed73ba3d0)](https://www.codacy.com/gh/rvhonorato/gdock/dashboard?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=rvhonorato/gdock&amp;utm_campaign=Badge_Coverage)
-[![Codacy Badge](https://app.codacy.com/project/badge/Grade/a794c83bedbc4e50b4bb6a0ed73ba3d0)](https://www.codacy.com/gh/rvhonorato/gdock/dashboard?utm_source=github.com&utm_medium=referral&utm_content=rvhonorato/gdock&utm_campaign=Badge_Grade) 
+[![Version](https://img.shields.io/badge/version-v1.1.0-red)](https://github.com/rvhonorato/gdock/releases/tag/v1.1.0)
+[![License: 0BSD](https://img.shields.io/badge/license-0BSD-informational)](https://opensource.org/licenses/0BSD)
+[![Build Status](https://travis-ci.com/rvhonorato/gdock.svg?branch=development)](https://travis-ci.com/rvhonorato/gdock)
+[![Codacy Badge](https://app.codacy.com/project/badge/Coverage/a794c83bedbc4e50b4bb6a0ed73ba3d0)](https://www.codacy.com/gh/rvhonorato/gdock/dashboard?utm_source=github.com&utm_medium=referral&utm_content=rvhonorato/gdock&utm_campaign=Badge_Coverage)
 
 <img src="imgs/gdock_logo.png" width="500">
 
 _Genetic Algorithm applied to Protein-Protein Docking_
 
+## tl:dr
 
-## Disclaimer
-
-I am by no means an expert in genetic algorithms, and do not aspire this software to compete with any of the tens of dozens similar software out there. It was idealized as an exercise in programming, to test a few concepts and as a quarentine project.
+-   [Installation](INSTALLATION.md)
+-   [How-to-use](example/README.md)
 
-I am also aware that there is still very much to be done and improved, so far this is a one-man show. Check the [CONTRIBUTING.md](CONTRIBUTING.md) if you would like to help/criticize (:
+## Disclaimer
 
-## Concept
+Despite having some knowledge in Genetic Algorithms I am not an expert. If you are an expert any suggestions or critics are welcome. (:
 
-In the project we apply [Genetic Algorithm](https://en.wikipedia.org/wiki/Genetic_algorithm) to protein-protein docking. Each individual is represented by its cartesian coordinates and euler angles, and the fitness of each conformation is given by a scoring function.
 
-The input of each simulation is two [PDB files](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/introduction), one receptor and one ligand; plus a list of residues that are likely to play a part in the interaction. There is (and probably will never be) support for _ab initio_ experiments.
+## Methods
 
-Initial placement of the molecule is done in a way that each of the interface composed by residues that are likely to be in the interface are facing each other. From here on, the receptor is fixed in space and all geometric opertions are be done to the ligand. Each individual (=conformation) is generated by randomly assigning three floats that will describe a region in space (around the geometric center of the initial ligand positioning) and three more floats that will describe its rotation. 
+In `gdock` we apply a [Genetic Algorithm](https://en.wikipedia.org/wiki/Genetic_algorithm) to Protein-Protein docking. Here, each individual is represented by its cartesian coordinates and euler angles, and the fitness of each conformation is given by the ratio of satisfied restraints are satisfied.
 
-The fitness of each one of these individual is evaluated (=scored) and over each generation, individuals have a chance of mutation, where one of the six descriptors is randomly changed and also a chance of crossover, where individuals exchange descriptors.
+The input of each simulation is two [PDB files](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/introduction), plus a list of residues that are likely to play a part in the interaction. There are many ways one can obtain such residues, from theorethical predictions to _in vitro_ experiments. `gdock` does not support _ab initio_ docking.
 
-## Installation
+During spatial sampling, the receptor is fixed in space and all geometric opertions are be done to the ligand. Each individual (=conformation) is generated by randomly assigning three floats that will describe a region in space around the geometric center of the initial ligand positioning (in a range of ± 5Å) and three more floats that will describe its rotation (0-360). 
 
-Check [INSTALLATION.md](INSTALLATION.md)
+The fitness of each one of these individual is evaluated as a ratio of how many of the inputted restraints are in contact. Over each generation, individuals have a chance of mutation, where one of the six descriptors is randomly changed and also a chance of crossover, where individuals exchange descriptors.
 
-## How-to-use
+The generations are stopped after it "converges" (mean variation > 0.1 over 3 generations) and the binding energy of _all_ conformations are calculated with `DComplex`. The conformations are ranked according to the `gdock_score` which is given simply by: 
 
-```bash
-$ conda activate gdock
-(gdock) $ python gdock/gdock.py run.toml
+```python
+gdock_score = energy / restraint_satisfaction
 ```
 
-Check the [`examples/`](examples/) folder
+Only the top 1000 structures are clustered using according to their [frequency of common contacts](https://github.com/haddocking/FCC) with `cutoff=0.6`. 
+
+Once the simulation is done, the run directory is cleaned to preserve disk space and a data-processing friendly file is written in `analysis/gdock.dat`
 
 ## Benchmark
 
-The accuracy of a docking software can be measured by its ability to reconstruct experimentally determined structures. This can be done using the Protein-Protein Docking Benchmark v5 ([10.1016/j.jmb.2015.07.016](https://www-sciencedirect-com.proxy.library.uu.nl/science/article/pii/S0022283615004180)). To setup the benchmark, clone the [BM5-clean](https://github.com/haddocking/BM5-clean) repository and use the `*-benchmark.py` scripts in `tools/`; by default it will use the true-interface as restraints and use the unbound structures as input and the bound conformation as the native model.
+The accuracy of a docking software can be measured by its ability to reconstruct experimentally determined structures. The dataset most used for such benchmarking is the Protein-Protein Docking Benchmark v5 (BM5) ([10.1016/j.jmb.2015.07.016](https://www-sciencedirect-com.proxy.library.uu.nl/science/article/pii/S0022283615004180)). 
+
+A `gdock` run was done for each of the complexes of BM5, using the _unbound_ forms of the ligand and the receptor as input, and calculating the interface root mean square (i-RMSD) variation of the resulting complexes against the _bound_ form. The restraints used in the benchmark are the _true-interface_, defined as the aminoacids that are observed in the _bound_ interface. These restraints can be interpreted as the perfect scenario, thus ideal to measure `gdock` perfomance.
+
+According to the [_Critical Assessment of PRediction of Interactions_](https://www.ebi.ac.uk/pdbe/complex-pred/capri/) (CAPRI) parameters, comlexes can be categorized as `High`, `Medium` or `Acceptable` according to its (i-RMSD):
 
-The resulting complexes are analyzed according to the [_Critical Assessment of PRediction of Interactions_](https://www.ebi.ac.uk/pdbe/complex-pred/capri/) (CAPRI) parameters, specifically the interface root mean square deviation (i-RMSD). This metric is used to evaluate the success rate of _gdock_ in different subsets of generated solutions. 
+    i-RMSD <= 1 - High
+    i-RMSD <= 2 - Medium
+    i-RMSD <= 4 - Acceptable
 
--   `i-RMSD <= 1` == High
--   `i-RMSD <= 2` == Medium
--   `i-RMSD <= 4` == Acceptable
+### Success rate
 
-Single-structure analysis; for each target in the benchmark, we could rank the generated complexes according to its fitness and, for example, analyse only the _top 5_ conformations. If _at least one_ of these conformations has `i-RMSD <= 2`, this target's success will be considered `Medium`. 
+ **Single-structue**: For each target in the benchmark, we could rank the generated complexes according to its score and, for example, analyse only the _top 5_ conformations. If _at least one_ of these conformations has `i-RMSD <= 2`, this target's success will be considered `Medium`. 
 
-Cluster-based; here the clusters are sorted by fitness and we consider the top5 models of the top 1, 3, 5 or 10 clusters. Same logic as the single-structure analysis is applied.
+ **Cluster-based**: Once the clusters are sorted by fitness and we consider, for example, the _top5_ models of the top 1, 3, 5 or 10 clusters. Same logic as the single-structure analysis is applied.
 
-The results for v1.0.0 are as follows:
+### Results
 
-<img src="imgs/v1.0.0_BM5.png">
+The results for `v1.1.0` are as follows, the % shows the increase/decrease in relation the previous version.
+
+<img src="imgs/v1-1-0_bm5.png" width="475">
 
 _Note: Some conformations were excluded since dcomplex could not be executed on them._
 
-### But what does this means?
+### Discussion
+
+Unfortunately, no `High` ranking conformations were obtained and there was a decrease in success rate for top 1, 5 and 10. Even tho the exact reasons for this behaviour are not certain, there are a few possibilities: 1) the scoring function is not perfoming well at positioning the "best" conformations in the top rankings, 2) the range of spatial exploration might be too large or 3) the lack of flexibility during docking is affecting the success rate.
 
-In layman's terms it means that the perfomance is currently very bad. 
+A notable increase (300%!) in success rates is observed for the top 200, 400 and 1000 both in `Medium` and `Acceptable` ranking, when comparing the benchmark results with the previous version. This comes mostly due to the inclusion of the restraint satisfaction as a score term however these results indicate that the scoring function can still be improved.
 
-For you to get `Acceptable` conformations would you need to check 1000 structures, and only for 60 cases. The low perfomance is also observed in the cluster based analysis. The expected scenario is that as many cases as possible are in the `High` category and within the smallest subset. 
+As a better scientists said before me:
 
-In summary: the opposite of what is happening here.
+> Scoring is the holy grail of docking!
 
-## Future work
+_Note: All relevant benchmarking data are kept in a [different repository](http://github.com/rvhonorato/gdock-benchmark)._
+
+## Collaboration / Development
+
+If you would like to help or improve your python coding skills, `gdock` might be a good project to work on!
+
+The code is licensed under a 0-clause license ([0BSD](LICENSE)), which pretty much means you can do whatever you want with it; copy, edit, deploy, distribute or even sell. :nerd_face:
+
+As for my vision to the project, there's a small [TODO list](CONTRIBUTING.md).
+
+## Installation
+
+Check [INSTALLATION.md](INSTALLATION.md)
+
+## How-to-use
 
-There are many reasons for such bad perfomance, to address it the following work would be needed (and not limited to):
+Check the provided [example](example/README.md)
 
--   <ins>Add flexibility</ins>; currently it is doing rigid-body docking
--   <ins>Optimize the many Genetic Algorithm parameters</ins>; currently its using the ones I choose based on gut feeling (:
--   <ins>Code optimization</ins>; the simulation time could (hopefully) be decreased with some smart code optimizations
+* * *
 
 :octopus:
diff --git a/benchmark/benchmark_v1.0.0.dat.xz b/benchmark/benchmark_v1.0.0.dat.xz
diff --git a/benchmark_data b/benchmark_data
diff --git a/environment.yml b/environment.yml
@@ -13,6 +13,6 @@ dependencies:
   - pip:
     - deap==1.3.1
     - codecov==2.1.10
-    - pdb-tools==2.2.2
+    - mgzip==0.2.1
     - coverage==5.3
 prefix: /Users/rodrigo/software/anaconda3/envs/gdock
diff --git a/etc/gdock.ini b/etc/gdock.ini
@@ -2,7 +2,9 @@
 ;placeholder
 
 [third_party]
-dockq_exe    = /Users/rodrigo/repos/gdock/src/DockQ/DockQ.py
-fcc_path     = /Users/rodrigo/repos/gdock/src/fcc
+profit_exe        = /Users/rodrigo/repos/gdock/src/ProFit_V3.3/src/profit
+fcc_path          = /Users/rodrigo/repos/gdock/src/fcc
+pdbtools_path     = /Users/rodrigo/repos/gdock/src/pdb-tools
+haddocktools_path = /Users/rodrigo/repos/gdock/src/haddock-tools
 ; dcomplex this needs to be "specially compiled", change the fort21 path and the charges
-dcomplex_exe = /Users/rodrigo/repos/gdock/src/dcomplex_single_file/dcomplex
+dcomplex_exe = /Users/rodrigo/repos/gdock/src/dcomplex_single_file/dcomplex
diff --git a/etc/genetic_algorithm_params.toml b/etc/genetic_algorithm_params.toml
@@ -1,18 +1,18 @@
 [general]
-population_size       = 100
+population_size           = 5000
 max_number_of_generations = 50
 
-crossover_probability = 0.2
-mutation_probability  = 0.6
+crossover_probability = 0.4
+mutation_probability  = 0.8
 
 # eta - Crowding degree of the mutation.
 #   A high eta will produce a mutant resembling its parent,
 #   while a small eta will produce a solution much more different.
-eta = 0.6
+eta = 0.1
 
 # Independent probability for each attribute to be exchanged.
 indpb = 0.2
 
 [parameters]
 random_seed        = 42
-convergence_cutoff = 3
+convergence_cutoff = 3
diff --git a/example/1a2k_complex_bound.izone b/example/1a2k_complex_bound.izone
@@ -0,0 +1,25 @@
+zone A.68-A.72
+zone A.84-A.89
+zone A.95-A.99
+zone A.122-A.124
+zone A.222-A.222
+zone A.226-A.226
+zone A.241-A.244
+zone A.246-A.246
+zone A.255-A.264
+zone A.288-A.300
+zone A.319-A.324
+zone B.12-B.12
+zone B.17-B.21
+zone B.23-B.24
+zone B.28-B.28
+zone B.37-B.37
+zone B.39-B.46
+zone B.64-B.71
+zone B.73-B.81
+zone B.94-B.94
+zone B.96-B.97
+zone B.100-B.100
+zone B.103-B.104
+zone B.106-B.107
+zone B.110-B.110