Skip to content

Commit

Permalink
Use dbstat to show pages info
Browse files Browse the repository at this point in the history
Fixes #542
  • Loading branch information
rogerbinns committed Nov 24, 2024
1 parent 7ad57f9 commit c4c2998
Show file tree
Hide file tree
Showing 8 changed files with 468 additions and 0 deletions.
194 changes: 194 additions & 0 deletions apsw/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,200 @@ def __exit__(self, *_) -> None:
}


@dataclasses.dataclass
class PageUsage:
"""Returned by :func:`analyze_pages`"""

page_size: int
"Size of pages in bytes. All pages in the database are the same size."
pages_used: int
"Pages with content"
sequential_pages: int
"How many pages were sequential in the database file"
data_stored: int
"Bytes of SQL content stored"
cells: int
"""Cells are what is `stored <https://www.sqlite.org/fileformat.html#b_tree_pages>`__
including sizing information, pointers to overflow etc"""
max_payload: int
"Largest cell size"
tables: list[str]
"Names of tables providing these statistics"
indices: list[str]
"Names of indices providing these statistics"


@dataclasses.dataclass
class DatabasePageUsage(PageUsage):
"""Returned by :func:`analyze_pages` when asking about the database as a whole"""

pages_total: int
"Number of pages in the database"
pages_freelist: int
"How many pages are unused, for example if data got deleted"
max_page_count: int
"Limit on the `number of pages <https://www.sqlite.org/pragma.html#pragma_max_page_count>`__"


def _analyze_pages_for_name(con: apsw.Connection, schema: str, name: str, usage: PageUsage):
qschema = '"' + schema.replace('"', '""') + '"'

for pages_used, ncell, payload, mx_payload in con.execute(
"""SELECT pageno, ncell, payload, mx_payload
FROM dbstat(?, 1) WHERE name=?
""",
(schema, name),
):
usage.pages_used += pages_used
usage.data_stored += payload
usage.cells += ncell
usage.max_payload = max(usage.max_payload, mx_payload)
t = con.execute(f"select type from {qschema}.sqlite_schema where name=?", (name,)).get
if t == "index":
usage.indices.append(name)
usage.indices.sort()
else:
usage.tables.append(name)
usage.tables.sort()

# by definition the first page is sequential but won't match next, so fake it
sequential = 1
next = None
for (pageno,) in con.execute("select pageno from dbstat(?) WHERE name=? ORDER BY path", (schema, name)):
sequential += pageno == next
next = pageno + 1
usage.sequential_pages += sequential


def analyze_pages(con: apsw.Connection, scope: int, schema: str = "main") -> DatabasePageUsage | dict[str, PageUsage]:
"""Summarizes page usage for the database
The `dbstat <https://www.sqlite.org/dbstat.html>`__ virtual table
is used to gather statistics.
See :download:`example output <../examples/analyze_pages.txt>`.
:param con: Connection to use
:param scope:
.. list-table::
:widths: auto
:header-rows: 1
* - Value
- Scope
- Returns
* - ``0``
- The database as a whole
- :class:`DatabasePageUsage`
* - ``1``
- Tables and their indices are grouped together. Virtual tables
like FTS5 have multiple backing tables which are grouped.
- A :class:`dict` where the key is the name of the
table, and a corresponding :class:`PageUsage` as the
value. The :attr:`PageUsage.tables` and
:attr:`PageUsage.indices` fields tell you which ones
were included.
* - ``2``
- Each table and index separately.
- :class:`dict` of each name and a corresponding
:class:`PageUsage` where one of the
:attr:`PageUsage.tables` and :attr:`PageUsage.indices`
fields will have the name.
.. note::
dbstat is present in PyPI builds, and many platform SQLite
distributions. You can use `pragma module_list
<https://www.sqlite.org/pragma.html#pragma_module_list>`__ to
check. If the table is not present then calling this function
will give :class:`apsw.SQLError` with message ``no such table:
dbstat``.
"""

qschema = '"' + schema.replace('"', '""') + '"'

if scope == 0:
total_usage = DatabasePageUsage(
page_size=con.pragma("page_size", schema=schema),
pages_total=con.pragma("page_count", schema=schema),
pages_freelist=con.pragma("freelist_count", schema=schema),
max_page_count=con.pragma("max_page_count", schema=schema),
pages_used=0,
data_stored=0,
sequential_pages=0,
tables=[],
indices=[],
cells=0,
max_payload=0,
)

_analyze_pages_for_name(con, schema, "sqlite_schema", total_usage)
for (name,) in con.execute(f"select name from {qschema}.sqlite_schema where rootpage!=0"):
_analyze_pages_for_name(con, schema, name, total_usage)

return total_usage

res = {}

grouping: dict[str, list[str]] = {}

if scope == 2:
grouping["sqlite_schema"] = ["sqlite_schema"]
for (name,) in con.execute(f"select name from {qschema}.sqlite_schema where rootpage!=0"):
grouping[name] = [name]
elif scope == 1:
grouping["sqlite"] = ["sqlite_schema"]
is_virtual_table: set[str] = set()
for type, name, tbl_name, rootpage in con.execute(
# the order by tbl_name is so we get eg fts base table
# name before the shadow tables. type desc is so that 'table'
# comes before index
f"""select type, name, tbl_name, rootpage
from {qschema}.sqlite_schema
where type in ('table', 'index')
order by tbl_name, type desc"""
):
if type == "index":
# indexes always know their table-
grouping[tbl_name].append(name)
continue
if name.startswith("sqlite_"):
grouping["sqlite"].append(name)
continue
if rootpage == 0:
grouping[name] = []
is_virtual_table.add(name)
continue
# shadow table? we assume an underscore separator searching longest names first
for n in sorted(grouping, key=lambda x: (len(x), x)):
if n in is_virtual_table and name.startswith(n + "_"):
grouping[n].append(name)
break
else:
grouping[name] = [name] if rootpage else []
else:
raise ValueError(f"Unknown {scope=}")

for group, names in sorted(grouping.items()):
usage = PageUsage(
page_size=con.pragma("page_size", schema=schema),
pages_used=0,
data_stored=0,
sequential_pages=0,
tables=[],
indices=[],
cells=0,
max_payload=0,
)

for name in names:
_analyze_pages_for_name(con, schema, name, usage)
res[group] = usage

return res


def format_query_table(
db: apsw.Connection,
query: str,
Expand Down
73 changes: 73 additions & 0 deletions apsw/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import inspect
import io
import json
import math
import os
import re
import shlex
Expand Down Expand Up @@ -2434,6 +2435,78 @@ def command_output(self, cmd):
finally:
self._out_colour()

def _command_pages_write_value(self, usage: apsw.ext.PageUsage | apsw.ext.DatabasePageUsage, name: str, width: int):
self.write(self.stdout, " " * (width - len(name)))
self.write(self.stdout, name + ": ")
value = getattr(usage, name)


def storage(v):
if not v:
return "0"
power = math.floor(math.log(v, 1024))
suffix = ["B", "KB", "MB", "GB", "TB", "PB", "EB"][int(power)]
if suffix == "B":
return f"{v}B"
return f"{v / 1024**power:.1f}".rstrip(".0") + suffix

if name in {"tables", "indices"}:
self.write_value(len(value))
self.write(self.stdout, "\n")
for name in value:
self.write(self.stdout, (width * " ") + " ")
self.write(self.stdout, self.colour.colour_value(name, name))
self.write(self.stdout, "\n")
return
elif name in {"page_size", "data_stored", "max_payload"}:
self.write(self.stdout, self.colour.colour_value(value, storage(value)))
elif name in {"pages_used", "pages_total", "max_page_count"}:
self.write(self.stdout, self.colour.colour_value(value, f"{value:,} ({storage(value*usage.page_size)})"))
elif name in {"sequential_pages"}:
self.write(self.stdout, self.colour.colour_value(value, f"{value:,} ({value/max(usage.pages_used, 1):.0%})"))
elif name in {"cells", "pages_freelist"}:
self.write(self.stdout, self.colour.colour_value(value, f"{value:,}"))
else:
self.write_value(value)

self.write(self.stdout, "\n")

def command_pages(self, cmd):
"""pages SCOPE: Shows page usage summary in human units
SCOPE is a number 0, 1, or 2.
0 - shows the database as a whole. 1 - groups by each table,
including its indices. 2 - shows each table and index
separately.
"""
scope = 0
try:
if len(cmd) != 1:
raise ValueError()
scope = int(cmd[0])
if scope not in {0, 1, 2}:
raise ValueError()
except ValueError:
raise self.Error("Expected a single parameter of 0, 1, or 2 for the scope")

width = max(
len(f.name) for f in dataclasses.fields(apsw.ext.PageUsage) + dataclasses.fields(apsw.ext.DatabasePageUsage)
)

res = apsw.ext.analyze_pages(self.db, scope)
if scope == 0:
for f in dataclasses.fields(res):
self._command_pages_write_value(res, f.name, width)
return

for name, pages in res.items():
self.write(self.stdout, self.colour.colour_value(name, name))
self.write(self.stdout, "\n")
for f in dataclasses.fields(pages):
self._command_pages_write_value(pages, f.name, width)
self.write(self.stdout, "\n")

def command_parameter(self, cmd):
"""parameter CMD ...: Maintain named bindings you can use in your queries.
Expand Down
40 changes: 40 additions & 0 deletions apsw/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10531,6 +10531,46 @@ def messy(arg1, arg2, arg3=lambda x: 1 / 0):
):
self.assertRaises(apsw.SQLError, self.db.execute, query)

def testExtAnalyzePages(self) -> None:
"analyze pages"
if "dbstat" not in self.db.pragma("module_list"):
return

self.assertRaises(ValueError, apsw.ext.analyze_pages, self.db, -1)
self.assertRaises(ValueError, apsw.ext.analyze_pages, self.db, 3)
self.assertRaises(ValueError, apsw.ext.analyze_pages, self.db, "hello")

# shadow tables
if "fts5" in self.db.pragma("module_list"):
self.db.execute('create virtual table "fts5_root" using fts5(a,b,c)')
if "rtree" in self.db.pragma("module_list"):
self.db.execute('create virtual table "rtree_root" using rtree(a,b,c,d,e)')
if "geopoly" in self.db.pragma("module_list"):
self.db.execute('create virtual table "geopoly_root" using geopoly(a,b,c,d,e)')
self.db.execute("""
create table one(x,y);
create index xxx on one(x);
attach '' as [db"2];
create table [db"2].two(x);
create index [db"2].two_index on two(x);
""")

for group, usage in apsw.ext.analyze_pages(self.db, 1).items():
if "root" in group:
self.assertTrue(group.endswith("_root"))

# check schema is obeyed
to_check = (
[apsw.ext.analyze_pages(self.db, 0, 'db"2')]
+ list(apsw.ext.analyze_pages(self.db, 1, 'db"2').values())
+ list(apsw.ext.analyze_pages(self.db, 2, 'db"2').values())
)
for tc in to_check:
for name in tc.tables + tc.indices:
if not name.startswith("sqlite_"):
self.assertIn("two", name)
self.assertNotIn("root", name)

def testExtQueryInfo(self) -> None:
"apsw.ext.query_info"
qd = apsw.ext.query_info(self.db, "select 3; a syntax error")
Expand Down
5 changes: 5 additions & 0 deletions doc/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ A command line tool ``apsw`` is defined which invokes the :doc:`shell
<https://docs.astral.sh/uv/guides/tools/>`__ without having to
explicitly install APSW.

Added :func:`apsw.ext.analyze_pages` which uses `dbstat
<https://www.sqlite.org/dbstat.html>`__ to provide useful information
about the pages making up the database, and fragmentation. The shell
:ref:`.pages command <shell-cmd-pages>` shows it in a pretty form.

3.47.0.0
========

Expand Down
14 changes: 14 additions & 0 deletions doc/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@ from SQL. You can use ``.help`` to see a list of all commands and
``.output`` ``FILENAME``
Send output to ``FILENAME`` (or stdout)

``.pages`` ``SCOPE``
Shows page usage summary in human units

``.parameter`` ``CMD`` ``...``
Maintain named bindings you can use in your queries.

Expand Down Expand Up @@ -643,6 +646,17 @@ when the shell was started. The file is opened using the current encoding
(change with ``encoding`` command).


.pages SCOPE
------------

Shows page usage summary in human units

``SCOPE`` is a number 0, 1, or 2.

0 - shows the database as a whole. 1 - groups by each table, including its
indices. 2 - shows each table and index separately.


.parameter CMD ...
------------------

Expand Down
7 changes: 7 additions & 0 deletions doc/ext.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ into a virtual table source.
:meth:`generate_series` and :meth:`generate_series_sqlite` provide
`generate_series <https://sqlite.org/series.html>`__.

Database storage usage
----------------------

Use :func:`analyze_pages` to find out how much storage is in use, and
how fragmented it is. See :download:`example output
<../examples/analyze_pages.txt>`.

Accessing result rows by column name
------------------------------------

Expand Down
Loading

0 comments on commit c4c2998

Please sign in to comment.