v2 Release (#7)

V2 Release - citeable release
whitead · Dec 12, 2022 · 1ed79a6 · 1ed79a6
1 parent 01dd230
commit 1ed79a6
Show file tree

Hide file tree

Showing 8 changed files with 64 additions and 289 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "White"
+  given-names: "Andrew D"
+  orcid: "https://orcid.org/0000-0002-6647-3965"
+title: "molbloom: quick assessment of compound purchasability with bloom filters"
+version: 2.0.0
+doi: 10.5281/zenodo.7426402
+date-released: 2022-12-01
+url: "https://github.com/whitead/molbloom"
diff --git a/Changelog.md b/Changelog.md
@@ -0,0 +1,19 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+
+This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [2.0.0] - 2022-12-13
+
+### Added
+
+- Docstrings for custom filter building
+- Referred to Small World unofficial API in README
+- Citation
+
+### Removed
+
+- Removed `buy_similar`, since it duplicates Small World API functionality
diff --git a/README.md b/README.md
@@ -21,17 +21,22 @@ There are other available catalogs - see options with `molbloom.catalogs()`. Mos
 
 ## Querying Small World
 
-To find similar purchasable molecules,
+Just because `buy` returns `True` doesn't mean you can buy the molecule -- you should follow up with a real query at [ZINC](https://zinc.docking.org/), or you can use the search feature in `SmallWorld` to find similar purchasable molecules.
+
 ```py
-buy_similar('CCCO')
+from smallworld_api import SmallWorld
+sw = SmallWorld()
+
+aspirin = 'O=C(C)Oc1ccccc1C(=O)O'
+results = sw.search(aspirin, dist=5, db=sw.REAL_dataset)
 ```
-this will query [ZINC Small World](https://sw.docking.org/) defaulting to the *Enamine REAL-22Q1-4.5B* database and return a list of hits and their similarities to the query via few different measures.
+This will query [ZINC Small World](https://sw.docking.org/).
 
 ## Custom Filter
 
 Do you have your own list of SMILES? There are two ways to build a filter -- you can use a C tool that is very fast (1M / s) if your SMILES are in a file and already canonical. Or you can use the Python API to programmatically build a filter and canonicalize as you go. See below.
 
-Once built:
+Once your custom filter is built:
 
 ```py
 from molbloom import BloomFilter
@@ -47,7 +52,7 @@ You can build your own filter using the code in the `tool/` directory.
 ```sh
 cd tool
 make
-./molbloom-bloom <MB of filter> <filter name> <approx number of compounds> <input file 1> <input file 2> ...
+./molbloom-bloom <MB of final filter> <filter name> <approx number of compounds> <input file 1> <input file 2> ...
 ```
 
 where each input file has SMILES on each line in the first column and is already canonicalized. The higher the MB, the lower the rate of false positives. If you want to choose the false positive rate rather than the size, you can use the equation:
@@ -60,13 +65,15 @@ where $M$ is the size in bits, $N$ is the number of compounds, and $\epsilon$ is
 
 ### Build with Python
 
+You can also build a filter using Python as follows:
+
 ```py
 from molbloom import CustomFilter, canon
 bf = CustomFilter(100, 1000, 'myfilter')
 bf.add('CCCO')
-# canonicalize one
+# canonicalize one record
 s = canon("CCCOC")
 bf.add(s)
-# save it
+# finalize filter into a file
 bf.save('test.bloom')
 ```
diff --git a/molbloom/__init__.py b/molbloom/__init__.py
@@ -8,7 +8,7 @@
 _filters = {"zinc20": None, "zinc-instock": None, "zinc-instock-mini": None}
 _descriptions = {
     "zinc20": "All ZINC20 (1,006,651,037 mols) from Oct 2021. FPR of 0.003. Requires download",
-    "instock": "ZINC20 instock (9,227,726 mols). FPR of 0.0003. Requires download",
+    "zinc-instock": "ZINC20 instock (9,227,726 mols). FPR of 0.0003. Requires download",
     "zinc-instock-mini": "ZINC20 instock (9,227,726 mols). FPR of 0.07. Included in package",
 }
 # just put in .cache
@@ -81,246 +81,3 @@ def buy(smiles, catalog="zinc-instock", canonicalize=False):
     if canonicalize:
         smiles = canon(smiles)
     return smiles in _filters[catalog]
-
-
-@dataclass
-class SmallWorldHit:
-    """Small World Similarity Search Hit Data"""
-
-    #: Hit smiles
-    smiles: str
-    #: Hit compound id
-    compound_id: str
-    #: Hit graph edit distance to query
-    dist: int
-    #: Hit extended connectivity fingerprint (radius = 2) to query
-    ecfp4: float
-    #: Hit daylight fingerprint distance
-    daylight: float
-    #: Hit Maximum Common Edge Subgraph
-    mces: int
-
-
-def buy_similar(
-    smiles,
-    db="REAL-Database-22Q1.smi.anon",
-    small_world_args={
-        "dist": 4,
-        "sdist": 12,
-        "tdn": 6,
-        "tup": 6,
-        "rdn": 6,
-        "rup": 2,
-        "ldn": 2,
-        "lup": 2,
-        "maj": 6,
-        "min": 6,
-        "sub": 6,
-        "scores": "Atom%20Alignment,ECFP4,Daylight",
-    },
-    n_retries=8,
-    verbose=False,
-):
-
-    import urllib
-    import urllib.parse
-    import urllib.request
-    import time
-
-    try:
-        sw_server_path = "https://sw.docking.org/search/"
-        args = (("smi", smiles), ("db", db)) + tuple(small_world_args.items())
-        query_url = f"{sw_server_path}submit?{urllib.parse.urlencode(args)}"
-    except:
-        raise Exception(
-            f"Failed to construct sw.docking.org query url for smiles '{smiles}'"
-        )
-
-    if verbose:
-        print(f"Querying ZINC Small World with url: {query_url}")
-
-    hlid = None
-    for attempt_i in range(n_retries):
-        if verbose:
-            print(f"Query attempt {attempt_i + 1} / {n_retries}")
-        lines = None
-        http_status = None
-        try:
-            with urllib.request.urlopen(query_url) as response:
-                http_status = response.status
-                if http_status == 200:
-                    lines = [line.decode("utf-8")[:-1] for line in response.readlines()]
-        except urllib.error.HTTPError as e:
-            if e.getcode() == 400:
-                print(
-                    f"ERROR: Failed to query https://sw.docking.org with smiles '{smiles}'"
-                )
-                print(f"ERROR: Query URL: {query_url}")
-                print(f"ERROR: {e}")
-                raise e
-
-            if verbose:
-                print(
-                    f"ERROR: Failed to query https://sw.docking.org with smiles '{smiles}'"
-                )
-                print(f"ERROR: Query URL: {query_url}")
-                print(f"ERROR: {e}")
-
-            time.sleep(2)
-            continue
-
-        if lines is None:
-            if verbose:
-                print(
-                    f"ERROR: Failed to query sw.docking.org, HTTPS status: {http_status}"
-                )
-            time.sleep(2)
-            continue
-
-        sw_status = None
-        try:
-            for line in lines:
-                if line == "":
-                    continue
-                line = line.replace("data:{", "").replace("}\n", "")
-                line = line.split(",")
-
-                for key_value in line:
-                    if '"status":' in key_value:
-                        sw_status = key_value.replace('"status":', "").replace('"', "")
-
-                    if "hlid" in key_value:
-                        hlid = key_value.replace('"hlid":', "")
-        except Exception as e:
-            if verbose:
-                print(f"ERROR: Failed to parse query response with error\n{e}")
-            time.sleep(2)
-            continue
-
-        if sw_status is None:
-            response_str = "\n".join(lines)
-            if verbose:
-                print(f"ERROR: Unexpected result from SmallWorld:\n{response_str}")
-            time.sleep(2)
-            continue
-        elif sw_status == "FIRST":
-            if verbose:
-                print(f"Got first hit, but didn't finish... retrying")
-            time.sleep(2)
-            continue
-        elif sw_status == "Ground Control to Major Tom" or sw_status == "MORE":
-            if verbose:
-                print("Still proccessing results... retrying")
-        elif sw_status == "MISS":
-            if verbose:
-                print(f"ERROR: No hits found for smiles {smiles}")
-            time.sleep(2)
-            continue
-        elif sw_status != "END":
-            if verbose:
-                print(f"ERROR Unexpected status from SmallWorld '{sw_status}'")
-            time.sleep(2)
-            continue
-
-        try:
-            hlid = int(hlid)
-        except:
-            if verboes:
-                print(
-                    f"ERROR: Expected small world query id to be an integer, instead it was {hlid}"
-                )
-            time.sleep(2)
-            continue
-
-        # stop got the hlid stop retrying
-        break
-
-    # the query should give back an hlid which we can use to get the results below
-    if not isinstance(hlid, int):
-        if verbose:
-            print(
-                f"Failed to get the result with {n_retries}. Consider trying with more retries or checking the query smiles '{smiles}' on https://sw.docking.org/search"
-            )
-        return []
-
-    results_args = (
-        "&".join(
-            [
-                f"hlid={hlid}",
-                "order[0][column]=0",
-                "columns[0][name]=alignment",
-                "order[0][dir]=asc",
-                "columns[1][name]=dist",
-                "columns[1][search][value]=0-12",
-                "columns[2][name]=ecfp4",
-                "columns[3][name]=daylight",
-                "columns[5][name]=mces",
-            ]
-        )
-        .replace("[", "%5B")
-        .replace("]", "%5D")
-    )
-    results_url = f"{sw_server_path}export?{results_args}"
-
-    if verbose:
-        print(f"Getting results from ZINC Small World with url: {results_url}")
-
-    http_status = None
-    hits = []
-    for attempt_i in range(n_retries):
-        if verbose:
-            print(f"Retrieve results attempt {attempt_i + 1} / {n_retries}")
-        try:
-            with urllib.request.urlopen(results_url) as response:
-                if response is None:
-                    time.sleep(2)
-                    continue
-                http_status = response.status
-                if http_status == 200:
-                    next(response)
-                    for line in response.readlines():
-                        line = line.decode("utf-8")[:-1].split("\t")
-                        smiles, compound_id = line[0].split(" ")
-                        hits.append(
-                            SmallWorldHit(
-                                smiles=smiles,
-                                compound_id=compound_id,
-                                dist=int(line[1]),
-                                ecfp4=float(line[2]),
-                                daylight=float(line[3]),
-                                #
-                                mces=int(line[5]),
-                            )
-                        )
-        except urllib.error.HTTPError as e:
-            if e.getcode() == 400:
-                print(
-                    f"ERROR: Failed to retrieve results from https://sw.docking.org with smiles '{smiles}'"
-                )
-                print(f"ERROR: Results url {results_url}")
-                print(f"ERROR: {e}")
-                raise e
-
-            if verbose:
-                print(
-                    f"ERROR: Failed to retrieve results from https://sw.docking.org with smiles '{smiles}'"
-                )
-                print(f"ERROR: Results url {results_url}")
-                print(f"ERROR: {e}")
-
-            time.sleep(2)
-            continue
-
-        except Exception as e:
-            if verbose:
-                print(f"ERROR: Unable to parse results with error\n{e}")
-            time.sleep(2)
-            continue
-
-        # read the hits without error
-        break
-
-    if verbose:
-        print(f"retrieved {len(hits)} hits.")
-
-    return hits
diff --git a/molbloom/bloom.pyx b/molbloom/bloom.pyx
@@ -4,6 +4,8 @@ cimport cbloom
 
 
 cdef class BloomFilter:
+    '''A read-only bloom filter -- use this if you want to load a filter from disk
+    '''
     cdef cbloom.bloom_t * _c_bloom
 
     def __cinit__(self, str filename):
@@ -23,6 +25,17 @@ cdef class BloomFilter:
         return cbloom.bloom_check(self._c_bloom, bsmiles) == 1
 
 cdef class CustomFilter:
+    '''An editable bloom filter -- use this if you want to modify and save
+
+    Parameters
+    ----------
+    size : int
+        The size of the filter in bits.
+    n : int
+        The total number of elements that will be in the filter (estimated).
+    name : str
+        The name of the filter.
+    '''
     cdef cbloom.bloom_t * _c_bloom
 
     def __cinit__(self, int size, int n, str name):

diff --git a/molbloom/version.py b/molbloom/version.py
@@ -1 +1 @@
-__version__ = "1.0.0"
+__version__ = "2.0.0"