update docstrings, changelog and off by 1 when loading pandas/polars …

…objects
BiocPy · Jan 12, 2025 · c6afcf8 · c6afcf8
1 parent 0ba4ad7
commit c6afcf8
Show file tree

Hide file tree

Showing 8 changed files with 46 additions and 31 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## Version 0.6.0
+
+A rewrite of the package to use the new and improved IRanges package (>= 0.4.2).
+
+- More consistent results across all methods compared to R/Bioconductor implementations.
+- Similar to IRanges, search and overlap operations may return a hits-like object.
+- Nearest returns slightly different matches, but since `select="arbitrary"`, it's probably OK.
+- More robust testing of the combination of inputs and parameter choices.
+- Updated the tests and docstrings.
+
 ## Version 0.5.2
 
 - Restrict IRanges to the last compatible version before the migration.

diff --git a/README.md b/README.md
@@ -6,8 +6,6 @@
 
 GenomicRanges provides container classes designed to represent genomic locations and support genomic analysis. It is similar to Bioconductor's [GenomicRanges](https://bioconductor.org/packages/release/bioc/html/GenomicRanges.html).
 
-**Note: V0.4.0 is a complete overhaul of the package, as such the constructor to GenomicRanges has changed. Please refer the documentation for updated usage of the classes and the methods.**
-
 To get started, install the package from [PyPI](https://pypi.org/project/genomicranges/)
 
 ```shell
@@ -102,11 +100,11 @@ print(gr)
     GenomicRanges with 5 ranges and 5 metadata columns
         seqnames    ranges           strand     score                  GC
            <str> <IRanges> <ndarray[int64]>   <range>              <list>
-    [0]     chr1 101 - 112                * |       0  0.2593301003406461
-    [1]     chr2 102 - 123                - |       1  0.7207993213776644
-    [2]     chr3 103 - 128                * |       2 0.23391468067222065
-    [3]     chr2 104 - 134                + |       3  0.7671026589720187
-    [4]     chr3 105 - 110                - |       4 0.03355777784472458
+    [0]     chr1 101 - 111                * |       0  0.2593301003406461
+    [1]     chr2 102 - 122                - |       1  0.7207993213776644
+    [2]     chr3 103 - 127                * |       2 0.23391468067222065
+    [3]     chr2 104 - 133                + |       3  0.7671026589720187
+    [4]     chr3 105 - 109                - |       4 0.03355777784472458
     ------
     seqinfo(3 sequences): chr1 chr2 chr3
 
@@ -138,11 +136,11 @@ print(gr)
     GenomicRanges with 5 ranges and 5 metadata columns
       seqnames    ranges           strand    score                  GC
          <str> <IRanges> <ndarray[int64]>   <list>              <list>
-    0     chr1 101 - 112                * |      0  0.4862658925128007
-    1     chr2 102 - 103                - |      1 0.27948386889389953
-    2     chr1 103 - 128                * |      2  0.5162697718607901
-    3     chr3 104 - 134                + |      3  0.5979843806415466
-    4     chr2 109 - 111                - |      4 0.04740781186083798
+    0     chr1 101 - 111                * |      0  0.4862658925128007
+    1     chr2 102 - 102                - |      1 0.27948386889389953
+    2     chr1 103 - 127                * |      2  0.5162697718607901
+    3     chr3 104 - 133                + |      3  0.5979843806415466
+    4     chr2 109 - 110                - |      4 0.04740781186083798
     ------
     seqinfo(3 sequences): chr1 chr2 chr3
 

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -8,7 +8,7 @@ kernelspec:
 
 `GenomicRanges` is a Python package designed to handle genomic locations and facilitate genomic analysis. It is similar to Bioconductor's [GenomicRanges](https://bioconductor.org/packages/release/bioc/html/GenomicRanges.html) and uses the [IRanges](https://github.com/BiocPy/IRanges) package under the hood to manage and provide interval-based arithmetic operations.
 
-An `IRanges` holds a **start** position and a **width**, and is typically used to represent coordinates along a genomic sequence. The interpretation of the **start** position depends on the application; for sequences, the **start** is usually a 1-based position, but other use cases may allow zero or even negative values, e.g., circular genomes. `IRanges` uses [nested containment lists](https://github.com/pyranges/ncls) under the hood to perform fast overlap and search-based operations.
+An `IRanges` holds a **start** position and a **width**, and is typically used to represent coordinates along a genomic sequence. The interpretation of the **start** position depends on the application; for sequences, the **start** is usually a 1-based position, but other use cases may allow zero or even negative values, e.g., circular genomes. Ends are considered inclusive. `IRanges` uses [nested containment lists](https://github.com/pyranges/ncls) under the hood to perform fast overlap and search-based operations.
 
 The package provides a `GenomicRanges` class to specify multiple genomic elements, typically where genes start and end. Genes are themselves made of many subregions, such as exons, and a `GenomicRangesList` enables the representation of this nested structure.
 

diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py
@@ -1078,7 +1078,7 @@ def from_pandas(cls, input) -> "GenomicRanges":
             width = input["widths"].tolist()
         else:
             drops.append("ends")
-            width = input["ends"] - input["starts"]
+            width = input["ends"] - input["starts"] + 1
 
         if "seqnames" not in input.columns:
             raise ValueError("'input' must contain column 'seqnames'.")
@@ -1158,7 +1158,7 @@ def from_polars(cls, input) -> "GenomicRanges":
             width = input["widths"].to_list()
         else:
             drops.append("ends")
-            width = input["ends"] - input["starts"]
+            width = input["ends"] - input["starts"] + 1
 
         if "seqnames" not in input.columns:
             raise ValueError("'input' must contain column 'seqnames'.")
@@ -2491,7 +2491,7 @@ def precede(
                 res_idx = _sub_subset._ranges.precede(query=_query_subset._ranges, select=select)
 
                 if select == "first":
-                    matches = res_idx != None
+                    matches = res_idx != None  # noqa: E711
                     not_none = res_idx[matches]
 
                     if len(not_none) > 0:
@@ -2578,7 +2578,7 @@ def follow(
                 res_idx = _sub_subset._ranges.follow(query=_query_subset._ranges, select=select)
 
                 if select == "last":
-                    matches = res_idx != None
+                    matches = res_idx != None  # noqa: E711
                     not_none = res_idx[matches]
 
                     if len(not_none) > 0:

diff --git a/src/genomicranges/io/gtf.py b/src/genomicranges/io/gtf.py
@@ -100,7 +100,7 @@ def parse_gtf(
     rows = Parallel(n_jobs=-2)(delayed(_parse_all_attribute)(row) for _, row in df.iterrows())
     gtf = DataFrame.from_records(rows)
     gtf.drop(["group"], axis=1)
-
+    gtf["ends"] = gtf["ends"] - 1
     return gtf
 
 

diff --git a/tests/test_SeqInfo.py b/tests/test_SeqInfo.py
@@ -1,7 +1,9 @@
-from genomicranges.SeqInfo import SeqInfo, merge_SeqInfo
 from random import random
-import pytest
+
 import numpy as np
+import pytest
+
+from genomicranges.SeqInfo import SeqInfo, merge_SeqInfo
 
 __author__ = "jkanche"
 __copyright__ = "jkanche"
@@ -78,7 +80,7 @@ def test_create_seqInfo_numpy_masked():
 def test_create_empty():
     si = SeqInfo.empty()
 
-    with pytest.raises(ValueError) as ex:
+    with pytest.raises(ValueError):
         si.set_seqnames([None, "chrB", "chrC"])
 
 

diff --git a/tests/test_gr_basic.py b/tests/test_gr_basic.py
@@ -1,9 +1,11 @@
+from random import random
+
+import biocutils as ut
 import pandas as pd
-from genomicranges import GenomicRanges
 from biocframe import BiocFrame
-import biocutils as ut
 from iranges import IRanges
-from random import random
+
+from genomicranges import GenomicRanges
 
 __author__ = "jkanche"
 __copyright__ = "jkanche"
@@ -80,6 +82,8 @@ def test_gr_empty_subset():
     assert len(gre) == 0
 
     subset = gre[0:10]
+    assert subset is not None
+    assert len(subset) == 0
 
 
 def test_export():

diff --git a/tests/test_grl_methods.py b/tests/test_grl_methods.py
@@ -1,9 +1,10 @@
 import pytest
-from genomicranges import GenomicRanges, GenomicRangesList
-from biocutils import combine_sequences
 from biocframe import BiocFrame
+from biocutils import combine_sequences
 from iranges import IRanges
 
+from genomicranges import GenomicRanges, GenomicRangesList
+
 __author__ = "jkanche"
 __copyright__ = "jkanche"
 __license__ = "MIT"
@@ -26,13 +27,13 @@
 def test_is_empty_False():
     grl = GenomicRangesList(ranges=[a, b], names=["a", "b"])
 
-    assert grl.is_empty() == False
+    assert grl.is_empty() is False
 
 
 def test_is_empty_slice():
     grl = GenomicRangesList(ranges=[a, b], names=["a", "b"])
 
-    assert grl.is_empty() == False
+    assert grl.is_empty() is False
 
     sgrl = grl[0:1]
     assert sgrl is not None
@@ -43,7 +44,7 @@ def test_is_empty_slice():
 def test_slice_by_name():
     grl = GenomicRangesList(ranges=[a, b], names=["a", "b"])
 
-    assert grl.is_empty() == False
+    assert grl.is_empty() is False
 
     sgrl = grl[["a"]]
     assert sgrl is not None
@@ -54,7 +55,7 @@ def test_slice_by_name():
 def test_slice_by_bool():
     grl = GenomicRangesList(ranges=[a, b], names=["a", "b"])
 
-    assert grl.is_empty() == False
+    assert grl.is_empty() is False
 
     sgrl = grl[[True, False]]
     assert sgrl is not None
@@ -68,7 +69,7 @@ def test_slice_by_bool():
 def test_is_empty_True():
     grl = GenomicRangesList(GenomicRanges.empty(), range_lengths=[0])
 
-    assert grl.is_empty() == True
+    assert grl.is_empty() is True
     assert len(grl) == 1