From 902ef2be86b96bcec2a8fc18adceb2e4d6c15e3c Mon Sep 17 00:00:00 2001 From: Caleb Grant <48339519+geocoug@users.noreply.github.com> Date: Fri, 19 Jul 2024 22:15:28 -0700 Subject: [PATCH] Refactor (#2) * start refactoring * continue refactoring, + docs * update readme Signed-off-by: Caleb Grant * update docs, lint and format Signed-off-by: Caleb Grant --------- Signed-off-by: Caleb Grant --- .github/workflows/ci-cd.yml | 25 + .gitignore | 1 + Makefile | 21 +- README.md | 313 +-- docs/conf.py | 60 + docs/examples.rst | 314 +++ docs/index.rst | 136 ++ docs/pg_upsert.rst | 12 + pg_upsert/__init__.py | 13 +- pg_upsert/_version.py | 7 + pg_upsert/pg_upsert.py | 3845 ++++++++++++++++++----------------- pyproject.toml | 12 +- requirements.txt | 41 + tests/data.sql | 6 +- tests/test_pg_upsert.py | 86 +- 15 files changed, 2695 insertions(+), 2197 deletions(-) create mode 100644 docs/conf.py create mode 100644 docs/examples.rst create mode 100644 docs/index.rst create mode 100644 docs/pg_upsert.rst create mode 100644 pg_upsert/_version.py diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml index 83dfaa5..524d378 100644 --- a/.github/workflows/ci-cd.yml +++ b/.github/workflows/ci-cd.yml @@ -76,6 +76,31 @@ jobs: platforms: linux/amd64,linux/arm64 + docs-build-and-deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install sphinx sphinx-book-theme + - name: Build Sphinx documentation + run: | + cd docs + make html + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./docs/_build/html + + pypi-publish: name: PyPI Publish runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 5514699..f17aeee 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ dist/ *.egg-info build/ +docs/_build/ diff --git a/Makefile b/Makefile index 71954b3..487767a 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,13 @@ PYTHON = $(BIN)/python PIP = $(BIN)/pip TEST = pytest +# Sphinx documentation +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SPHINXAPIDOC ?= sphinx-apidoc +SOURCEDIR = docs +BUILDDIR = docs/_build + # Self documenting commands .DEFAULT_GOAL := help .PHONY: help @@ -61,12 +68,22 @@ lint: $(VENV)/bin/activate ## Run pre-commit hooks test: $(VENV)/bin/activate ## Run unit tests $(PYTHON) -m $(TEST) -build: $(VENV)/bin/activate ## Generate distrubition packages +build-dist: $(VENV)/bin/activate ## Generate distrubition packages $(PYTHON) -m build +build-docs: ## Generate documentation + @printf "Building documentation\n" + @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) + publish: $(VENV)/bin/activate ## Publish to PyPI $(MAKE) lint $(MAKE) test - $(MAKE) build + $(MAKE) build-dist $(PYTHON) -m twine upload --repository pypi dist/* $(MAKE) clean + +build: $(VENV)/bin/activate ## Build the project + $(MAKE) lint + $(MAKE) test + $(MAKE) build-dist + $(MAKE) build-docs diff --git a/README.md b/README.md index 615a5ac..b86f42e 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![ci/cd](https://github.com/geocoug/pg_upsert/actions/workflows/ci-cd.yml/badge.svg)](https://github.com/geocoug/pg_upsert/actions/workflows/ci-cd.yml) [![PyPI Latest Release](https://img.shields.io/pypi/v/pg_upsert.svg)](https://pypi.org/project/pg_upsert/) 
[![PyPI Downloads](https://img.shields.io/pypi/dm/pg_upsert.svg?label=pypi%20downloads)](https://pypi.org/project/pg_upsert/) +[![Python Version Support](https://img.shields.io/pypi/pyversions/pg_upsert.svg)](https://pypi.org/project/pg_upsert/) **pg_upsert** is a Python package that provides a method to *interactively* update and insert (upsert) rows of a base table or base tables from the staging table(s) of the same name. The package is designed to work exclusively with PostgreSQL databases. @@ -12,26 +13,6 @@ The program will perform initial table checks in the form of not-null, primary k This project was created using inspiration from [ExecSQL](https://execsql.readthedocs.io/en/latest/index.html) and the example script [`pg_upsert.sql`](https://osdn.net/projects/execsql-upsert/). The goal of this project is to provide a Python implementation of `pg_upsert.sql` without the need for ExecSQL. -## Installation - -1. Create a virtual environment - - ```sh - python -m venv .venv - ``` - -2. Activate the virtual environment - - ```sh - source .venv/bin/activate - ``` - -3. Install the package - - ```sh - pip install pg_upsert - ``` - ## Usage ### CLI @@ -65,302 +46,12 @@ options: method to use for upsert ``` -### Python - -```py -import logging -from pathlib import Path - -from pg_upsert import upsert - - -logfile = Path("pg_upsert.log") -if logfile.exists(): - logfile.unlink() - -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[ - logging.FileHandler(logfile), - logging.StreamHandler(), - ], -) - -upsert( - host="localhost", - database="", - user="postgres", - # passwd=, # if not provided, will prompt for password - tables=[], - stg_schema="staging", - base_schema="public", - upsert_method="upsert", # "upsert" | "update" | "insert", default: "upsert" - commit=False, # optional, default=False - interactive=True, # optional, default=False - exclude_cols=[], # optional - exclude_null_check_columns=[], # optional -) -``` - ### Docker ```sh -docker run --rm -v $(pwd):/app ghcr.io/geocoug/pg_upsert [-h] [-q] [-d] [-l LOGFILE] [-e EXCLUDE_COLUMNS] [-n NULL_COLUMNS] [-c] [-i] [-m METHOD] HOST DATABASE USER STAGING_SCHEMA BASE_SCHEMA TABLE [TABLE ...] +docker pull ghcr.io/geocoug/pg_upsert:latest ``` -## Example - -This example will demonstrate how to use `pg_upsert` to upsert data from staging tables to base tables. - -1. Initialize a PostgreSQL database called `dev` with the following schema and data. - - ```sql - -- Create base tables. 
- drop table if exists public.genres cascade; - create table public.genres ( - genre varchar(100) primary key, - description varchar not null - ); - - drop table if exists public.books cascade; - create table public.books ( - book_id varchar(100) primary key, - book_title varchar(200) not null, - genre varchar(100) not null, - notes text, - foreign key (genre) references genres(genre) - ); - - drop table if exists public.authors cascade; - create table public.authors ( - author_id varchar(100) primary key, - first_name varchar(100) not null, - last_name varchar(100) not null, - -- Check that the first and last name are not the same - constraint chk_authors check (first_name <> last_name), - -- Check that first_name only contains letters - constraint chk_authors_first_name check (first_name ~ '^[a-zA-Z]+$'), - -- Check that last_name only contains letters - constraint chk_authors_last_name check (last_name ~ '^[a-zA-Z]+$') - ); - - drop table if exists public.book_authors cascade; - create table public.book_authors ( - book_id varchar(100) not null, - author_id varchar(100) not null, - foreign key (author_id) references authors(author_id), - foreign key (book_id) references books(book_id), - constraint pk_book_authors primary key (book_id, author_id) - ); - - -- Create staging tables that mimic base tables. - -- Note: staging tables have the same columns as base tables but no PK, FK, or NOT NULL constraints. - create schema if not exists staging; - - drop table if exists staging.genres cascade; - create table staging.genres ( - genre varchar(100), - description varchar - ); - - drop table if exists staging.books cascade; - create table staging.books ( - book_id varchar(100), - book_title varchar(200), - genre varchar(100), - notes text - ); - - drop table if exists staging.authors cascade; - create table staging.authors ( - author_id varchar(100), - first_name varchar(100), - last_name varchar(100) - ); - - drop table if exists staging.book_authors cascade; - create table staging.book_authors ( - book_id varchar(100), - author_id varchar(100) - ); - - -- Insert data into staging tables. - insert into staging.genres (genre, description) values - ('Fiction', 'Literary works that are imaginary, not based on real events or people'), - ('Non-Fiction', 'Literary works based on real events, people, and facts'); - - insert into staging.authors (author_id, first_name, last_name) values - ('JDoe', 'John', 'Doe'), - ('JSmith', 'Jane', 'Smith'), - ('JTrent', 'Joe', 'Trent'); - - insert into staging.books (book_id, book_title, genre, notes) values - ('B001', 'The Great Novel', 'Fiction', 'An epic tale of love and loss'), - ('B002', 'Not Another Great Novel', 'Non-Fiction', 'A comprehensive guide to writing a great novel'); - - insert into staging.book_authors (book_id, author_id) values - ('B001', 'JDoe'), - ('B001', 'JTrent'), - ('B002', 'JSmith'); - ``` - -2. Create a Python script called `upsert_data.py` that calls `pg_upsert` to upsert data from staging tables to base tables. 
- - ```py - import logging - from pathlib import Path - - from pg_upsert import upsert - - logfile = Path("pg_upsert.log") - if logfile.exists(): - logfile.unlink() - - logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[ - logging.FileHandler(logfile), - logging.StreamHandler(), - ], - ) - - upsert( - host="localhost", - database="dev", - user="docker", # Change this - tables=["books", "authors", "genres", "book_authors"], - stg_schema="staging", - base_schema="public", - upsert_method="upsert", - commit=True, - interactive=False, - exclude_cols=[], - exclude_null_check_columns=[], - ) - ``` - -3. Run the script: `python upsert_data.py` - - ```txt - The script pg_upsert.py wants the password for PostgresDB(host=localhost, database=dev, user=docker): - Upserting to public from staging - Tables selected for upsert: - books - authors - genres - book_authors - - ===Non-NULL checks=== - Conducting non-null QA checks on table staging.books - Conducting non-null QA checks on table staging.authors - Conducting non-null QA checks on table staging.genres - Conducting non-null QA checks on table staging.book_authors - - ===Primary Key checks=== - Conducting primary key QA checks on table staging.books - Conducting primary key QA checks on table staging.authors - Conducting primary key QA checks on table staging.genres - Conducting primary key QA checks on table staging.book_authors - - ===Foreign Key checks=== - Conducting foreign key QA checks on table staging.books - Conducting foreign key QA checks on table staging.authors - Conducting foreign key QA checks on table staging.genres - Conducting foreign key QA checks on table staging.book_authors - - ===Check Constraint checks=== - Conducting check constraint QA checks on table staging.books - Conducting check constraint QA checks on table staging.authors - Conducting check constraint QA checks on table staging.genres - Conducting check constraint QA checks on table staging.book_authors - - ===QA checks passed. Starting upsert=== - Performing upsert on table public.genres - Adding data to public.genres - 2 rows inserted - Performing upsert on table public.authors - Adding data to public.authors - 3 rows inserted - Performing upsert on table public.books - Adding data to public.books - 2 rows inserted - Performing upsert on table public.book_authors - Adding data to public.book_authors - 3 rows inserted - - Changes committed - ``` - -4. Modify a row in the staging table. - - ```sql - update staging.books set book_title = 'The Great Novel 2' where book_id = 'B001'; - ``` - -5. Run the script again, but this time set `interactive=True` in the `upsert` function call in `upsert_data.py`. - - The script will display GUI dialogs during the upsert process to show which rows will be added and which rows will be updated. The user can chose to confirm, skip, or cancel the upsert process at any time. The script will not commit any changes to the database until all of the upserts have been completed successfully. - - ![Screenshot](https://raw.githubusercontent.com/geocoug/pg_upsert/main/screenshot.png) - -6. Let's test some of the QA checks. Modify the `staging.books` table to include a row with a missing value in the `book_title` and `Mystery` value in the `genre` column. The `book_title` column is a non-null column, and the `genre` column is a foreign key column. Let's also modify the `staging.authors` table by adding `JDoe` again as the `author_id` but this time we will set both the `first_name` and `last_name` to `Doe1`. 
This should trigger a primary key error and check constraint errors. - - ```sql - insert into staging.books (book_id, book_title, genre, notes) - values ('B003', null, 'Mystery', 'A book with no name!'); - - insert into staging.authors (author_id, first_name, last_name) - values ('JDoe', 'Doe1', 'Doe1'); - ``` - - Run the script again: `python upsert_data.py` - - ```txt - The script pg_upsert.py wants the password for PostgresDB(host=localhost, database=dev, user=docker): - Upserting to public from staging - Tables selected for upsert: - books - authors - genres - book_authors - - ===Non-NULL checks=== - Conducting non-null QA checks on table staging.books - Column book_title has 1 null values - Conducting non-null QA checks on table staging.authors - Conducting non-null QA checks on table staging.genres - Conducting non-null QA checks on table staging.book_authors - - ===Primary Key checks=== - Conducting primary key QA checks on table staging.books - Conducting primary key QA checks on table staging.authors - Duplicate key error in columns author_id - Conducting primary key QA checks on table staging.genres - Conducting primary key QA checks on table staging.book_authors - - ===Foreign Key checks=== - Conducting foreign key QA checks on table staging.books - Foreign key error referencing genres - Conducting foreign key QA checks on table staging.authors - Conducting foreign key QA checks on table staging.genres - Conducting foreign key QA checks on table staging.book_authors - - ===Check Constraint checks=== - Conducting check constraint QA checks on table staging.books - Conducting check constraint QA checks on table staging.authors - Check constraint chk_authors has 1 failing rows - Check constraint chk_authors_first_name has 1 failing rows - Check constraint chk_authors_last_name has 1 failing rows - Conducting check constraint QA checks on table staging.genres - Conducting check constraint QA checks on table staging.book_authors - - QA checks failed. Aborting upsert. - ``` - - The script failed to upsert data because there are non-null and foreign key checks that failed on the `staging.books` table, and primary key and check constraint that failed on the `staging.authors` table. The interactive GUI will display all values in the `books.genres` column that fail the foreign key check. No GUI dialogs are displayed for non-null checks, because there are no values to display. Similarly, if there is a primary key check that fails (like in the `staging.authors` table), a GUI dialog will be displayed with the primary keys in the table that are failing. No GUI dialogs are displayed for check constraint checks. - ## Contributing 1. Fork the repository diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..682cc96 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,60 @@ +# Configuration file for the Sphinx documentation builder. + +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) + +import pg_upsert + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = pg_upsert.__title__ +title = pg_upsert.__title__ +copyright = f"2024, {pg_upsert.__author__}" +author = pg_upsert.__author__ + +# The short X.Y version. +version = pg_upsert.__version__ +# The full version, including alpha/beta/rc tags. 
+release = pg_upsert.__version__ + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.doctest", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx_copybutton", +] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +add_module_names = True + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "sphinx_rtd_theme" +html_context = { + "display_github": True, # Integrate GitHub + "github_user": "geocoug", # Username + "github_repo": "pg_upsert", # Repo name + "github_version": "main", # Version + "conf_py_path": "/docs/", # Path in the checkout to the docs root +} +html_static_path = ["_static"] +html_sidebars = { + "**": ["examples.hsml"], +} +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +html_show_sphinx = False diff --git a/docs/examples.rst b/docs/examples.rst new file mode 100644 index 0000000..25f8252 --- /dev/null +++ b/docs/examples.rst @@ -0,0 +1,314 @@ +.. _examples: + +Examples +======== + +Detailed example +---------------- + +This example will demonstrate how to use `pg_upsert` to upsert data from staging tables to base tables. + +1. Initialize a PostgreSQL database called `dev` with the following schema and data. + + .. code-block:: sql + + -- Create base tables. + drop table if exists public.genres cascade; + create table public.genres ( + genre varchar(100) primary key, + description varchar not null + ); + + drop table if exists public.books cascade; + create table public.books ( + book_id varchar(100) primary key, + book_title varchar(200) not null, + genre varchar(100) not null, + notes text, + foreign key (genre) references genres(genre) + ); + + drop table if exists public.authors cascade; + create table public.authors ( + author_id varchar(100) primary key, + first_name varchar(100) not null, + last_name varchar(100) not null, + -- Check that the first and last name are not the same + constraint chk_authors check (first_name <> last_name), + -- Check that first_name only contains letters + constraint chk_authors_first_name check (first_name ~ '^[a-zA-Z]+$'), + -- Check that last_name only contains letters + constraint chk_authors_last_name check (last_name ~ '^[a-zA-Z]+$') + ); + + drop table if exists public.book_authors cascade; + create table public.book_authors ( + book_id varchar(100) not null, + author_id varchar(100) not null, + foreign key (author_id) references authors(author_id), + foreign key (book_id) references books(book_id), + constraint pk_book_authors primary key (book_id, author_id) + ); + + -- Create staging tables that mimic base tables. + -- Note: staging tables have the same columns as base tables but no PK, FK, or NOT NULL constraints. 
+ create schema if not exists staging; + + drop table if exists staging.genres cascade; + create table staging.genres ( + genre varchar(100), + description varchar + ); + + drop table if exists staging.books cascade; + create table staging.books ( + book_id varchar(100), + book_title varchar(200), + genre varchar(100), + notes text + ); + + drop table if exists staging.authors cascade; + create table staging.authors ( + author_id varchar(100), + first_name varchar(100), + last_name varchar(100) + ); + + drop table if exists staging.book_authors cascade; + create table staging.book_authors ( + book_id varchar(100), + author_id varchar(100) + ); + + -- Insert data into staging tables. + insert into staging.genres (genre, description) values + ('Fiction', 'Literary works that are imaginary, not based on real events or people'), + ('Non-Fiction', 'Literary works based on real events, people, and facts'); + + insert into staging.authors (author_id, first_name, last_name) values + ('JDoe', 'John', 'Doe'), + ('JSmith', 'Jane', 'Smith'), + ('JTrent', 'Joe', 'Trent'); + + insert into staging.books (book_id, book_title, genre, notes) values + ('B001', 'The Great Novel', 'Fiction', 'An epic tale of love and loss'), + ('B002', 'Not Another Great Novel', 'Non-Fiction', 'A comprehensive guide to writing a great novel'); + + insert into staging.book_authors (book_id, author_id) values + ('B001', 'JDoe'), + ('B001', 'JTrent'), + ('B002', 'JSmith'); + +2. Create a Python script called `upsert_data.py` that calls `pg_upsert` to upsert data from staging tables to base tables. + + .. code-block:: python + + import logging + + from pg_upsert import PgUpsert + + logger = logging.getLogger("pg_upsert") + logger.setLevel(logging.INFO) + logger.addHandler(logging.StreamHandler()) + + PgUpsert( + host="localhost", + port=5432, + database="dev", + user="username", + tables=("genres", "books", "authors", "book_authors"), + stg_schema="staging", + base_schema="public", + do_commit=True, + upsert_method="upsert", + interactive=False, + ).run() + +3. Run the script: `python upsert_data.py` + + .. code-block:: text + + The script pg_upsert.py wants the password for PostgresDB(host=localhost, database=dev, user=docker): + Upserting to public from staging + Tables selected for upsert: + books + authors + genres + book_authors + + ===Non-NULL checks=== + Conducting non-null QA checks on table staging.books + Conducting non-null QA checks on table staging.authors + Conducting non-null QA checks on table staging.genres + Conducting non-null QA checks on table staging.book_authors + + ===Primary Key checks=== + Conducting primary key QA checks on table staging.books + Conducting primary key QA checks on table staging.authors + Conducting primary key QA checks on table staging.genres + Conducting primary key QA checks on table staging.book_authors + + ===Foreign Key checks=== + Conducting foreign key QA checks on table staging.books + Conducting foreign key QA checks on table staging.authors + Conducting foreign key QA checks on table staging.genres + Conducting foreign key QA checks on table staging.book_authors + + ===Check Constraint checks=== + Conducting check constraint QA checks on table staging.books + Conducting check constraint QA checks on table staging.authors + Conducting check constraint QA checks on table staging.genres + Conducting check constraint QA checks on table staging.book_authors + + ===QA checks passed. 
Starting upsert=== + Performing upsert on table public.genres + Adding data to public.genres + 2 rows inserted + Performing upsert on table public.authors + Adding data to public.authors + 3 rows inserted + Performing upsert on table public.books + Adding data to public.books + 2 rows inserted + Performing upsert on table public.book_authors + Adding data to public.book_authors + 3 rows inserted + + Changes committed + +4. Modify a row in the staging table. + + .. code-block:: sql + + update staging.books set book_title = 'The Great Novel 2' where book_id = 'B001'; + +5. Run the script again, but this time set `interactive=True` in the `upsert` function call in `upsert_data.py`. + + The script will display GUI dialogs during the upsert process to show which rows will be added and which rows will be updated. The user can chose to confirm, skip, or cancel the upsert process at any time. The script will not commit any changes to the database until all of the upserts have been completed successfully. + + .. image:: https://raw.githubusercontent.com/geocoug/pg_upsert/main/screenshot.png + +6. Let's test some of the QA checks. Modify the `staging.books` table to include a row with a missing value in the `book_title` and `Mystery` value in the `genre` column. The `book_title` column is a non-null column, and the `genre` column is a foreign key column. Let's also modify the `staging.authors` table by adding `JDoe` again as the `author_id` but this time we will set both the `first_name` and `last_name` to `Doe1`. This should trigger a primary key error and check constraint errors. + + .. code-block:: sql + + insert into staging.books (book_id, book_title, genre, notes) + values ('B003', null, 'Mystery', 'A book with no name!'); + + insert into staging.authors (author_id, first_name, last_name) + values ('JDoe', 'Doe1', 'Doe1'); + + Run the script again: `python upsert_data.py` + + .. code-block:: text + + The script pg_upsert.py wants the password for PostgresDB(host=localhost, database=dev, user=docker): + Upserting to public from staging + Tables selected for upsert: + books + authors + genres + book_authors + + ===Non-NULL checks=== + Conducting non-null QA checks on table staging.books + Column book_title has 1 null values + Conducting non-null QA checks on table staging.authors + Conducting non-null QA checks on table staging.genres + Conducting non-null QA checks on table staging.book_authors + + ===Primary Key checks=== + Conducting primary key QA checks on table staging.books + Conducting primary key QA checks on table staging.authors + Duplicate key error in columns author_id + Conducting primary key QA checks on table staging.genres + Conducting primary key QA checks on table staging.book_authors + + ===Foreign Key checks=== + Conducting foreign key QA checks on table staging.books + Foreign key error referencing genres + Conducting foreign key QA checks on table staging.authors + Conducting foreign key QA checks on table staging.genres + Conducting foreign key QA checks on table staging.book_authors + + ===Check Constraint checks=== + Conducting check constraint QA checks on table staging.books + Conducting check constraint QA checks on table staging.authors + Check constraint chk_authors has 1 failing rows + Check constraint chk_authors_first_name has 1 failing rows + Check constraint chk_authors_last_name has 1 failing rows + Conducting check constraint QA checks on table staging.genres + Conducting check constraint QA checks on table staging.book_authors + + QA checks failed. 
Aborting upsert. + + The script failed to upsert data because there are non-null and foreign key checks that failed on the `staging.books` table, and primary key and check constraint that failed on the `staging.authors` table. The interactive GUI will display all values in the `books.genres` column that fail the foreign key check. No GUI dialogs are displayed for non-null checks, because there are no values to display. Similarly, if there is a primary key check that fails (like in the `staging.authors` table), a GUI dialog will be displayed with the primary keys in the table that are failing. No GUI dialogs are displayed for check constraint checks. + + +QA and upsert +------------- + +.. code-block:: python + + upsert.run() + + +QA checks only +-------------- + +Run all not-null, primary key, foreign key, and check constraint QA checks on all tables. + +.. code-block:: python + + upsert.qa_all() + + +Upsert only +----------- + +Run upsert procedures on all tables and commit changes. Changes will not be committed if `do_commit=False`. + +.. code-block:: python + + upsert.upsert_all().commit() + + +Run upsert on one table +----------------------- + +Run upsert procedures on one table and commit changes. Changes will not be committed if `do_commit=False`. + +.. code-block:: python + + upsert.upsert_one(table="authors").commit() + + +Run a specific set of QA checks on one table +-------------------------------------------- + +Run a specific set of QA checks on one table. The following QA checks are available: null checks, primary key checks, foreign key checks, and check constraint checks. + +.. code-block:: python + + # Null checks + upsert.qa_one_null("authors") + # Primary key checks + upsert.qa_one_pk("authors") + # Foreign key checks + upsert.qa_one_fk("authors") + # Check constraint checks + upsert.qa_one_ck("authors") + + +Modify control table +-------------------- + +Modify the control table on a table-by-table basis. The control table is initialized when the class is instantiated. Modifying the control table allows you to make fine-grained changes to the upsert process including excluding columns from the upsert process, toggling interactivity for a specific table, and excluding columns from not-null QA checks. + +.. code-block:: python + + upsert.db.execute( + f"update {upsert.control_table} set exclude_cols = 'first_name,last_name', interactive=true where table_name = 'authors';" + ) + upsert.upsert_one(table="authors").commit() diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..6662c28 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,136 @@ +.. pg_upsert documentation master file, created by + sphinx-quickstart on Thu Jul 18 11:16:22 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +pg_upsert documentation +======================= + +.. image:: https://github.com/geocoug/pg_upsert/actions/workflows/ci-cd.yml/badge.svg + :target: https://pypi.org/project/pg_upsert/ + :alt: CI/CD Badge + +.. image:: https://img.shields.io/pypi/v/pg_upsert.svg + :target: https://pypi.org/project/pg_upsert/ + :alt: PyPI Latest Release Badge + +.. image:: https://img.shields.io/pypi/dm/pg_upsert.svg?label=pypi%20downloads + :target: https://pypi.org/project/pg_upsert/ + :alt: pg_upsert Downloads Per Month Badge + +.. 
image:: https://img.shields.io/pypi/pyversions/pg_upsert.svg + :target: https://pypi.org/project/pg_upsert/ + :alt: Python Version Support Badge + +**pg_upsert** is a Python package that runs not-NULL, Primary Key, Foreign Key, and Check Constraint checks on PostgreSQL staging tables then updates and inserts (upsert) data from staging tables to base tables. + +Looking for examples? Check out the `examples `_ page. + + +Installation +------------ + +You can install **pg_upsert** via pip from PyPI: + +.. code-block:: bash + + python -m venv .venv \ + && source .venv/bin/activate \ + && pip install pg_upsert + +There is also a Docker image available on the GitHub Container Registry: + +.. code-block:: bash + + docker pull ghcr.io/geocoug/pg_upsert:latest + + +Module Contents +--------------- + +.. toctree:: + :maxdepth: 4 + + pg_upsert + examples + +Usage +----- + +Python +^^^^^^ + +Below is a simple example of how to use the `PgUpsert` class to upsert data from staging tables to base tables. For a more detailed example, check out the `examples `_ page. + +.. code-block:: python + + import logging + + from pg_upsert import PgUpsert + + logger = logging.getLogger("pg_upsert") + logger.setLevel(logging.INFO) + logger.addHandler(logging.StreamHandler()) + + PgUpsert( + host="localhost", + port=5432, + database="dev", + user="username", + tables=("genres", "books", "authors", "book_authors"), + stg_schema="staging", + base_schema="public", + do_commit=True, + upsert_method="upsert", + interactive=False, + ).run() + + +CLI +^^^ + +`pg_upsert` has a command-line interface that can be used to perform all the functionality of the `PgUpsert` class. The CLI can be accessed by running `pg_upsert` in the terminal. + +.. code-block:: bash + + usage: pg_upsert.py [-h] [-q] [-d] [-l LOGFILE] [-e EXCLUDE_COLUMNS] [-n NULL_COLUMNS] [-c] [-i] [-m METHOD] HOST DATABASE USER STAGING_SCHEMA BASE_SCHEMA TABLE [TABLE ...] + + Update and insert (upsert) data from staging tables to base tables. + + positional arguments: + HOST database host + DATABASE database name + USER database user + STAGING_SCHEMA staging schema name + BASE_SCHEMA base schema name + TABLE table name(s) + + options: + -h, --help show this help message and exit + -q, --quiet suppress all console output + -d, --debug display debug output + -l LOGFILE, --log LOGFILE + write log to LOGFILE + -e EXCLUDE_COLUMNS, --exclude EXCLUDE_COLUMNS + comma-separated list of columns to exclude from null checks + -n NULL_COLUMNS, --null NULL_COLUMNS + comma-separated list of columns to exclude from null checks + -c, --commit commit changes to database + -i, --interactive display interactive GUI of important table information + -m METHOD, --method METHOD + method to use for upsert + + +Docker +^^^^^^ + +There is a Docker image available on the GitHub Container Registry that can be used to run `pg_upsert`: +.. code-block:: bash + + docker pull ghcr.io/geocoug/pg_upsert:latest + + +Credits +------- + +This project was created using inspiration from `execsql `_ and the example script `pg_upsert.sql `_. The goal of this project is to provide a Python implementation of `pg_upsert.sql` without the need for ExecSQL. diff --git a/docs/pg_upsert.rst b/docs/pg_upsert.rst new file mode 100644 index 0000000..d0830ae --- /dev/null +++ b/docs/pg_upsert.rst @@ -0,0 +1,12 @@ +.. _pg_upsert: + +pg\_upsert +========== + +All of pg_upsert's functionality can be accessed by the :class:`PgUpsert` object, which +includes all the methods and attributes mentioned in the sections below. 
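+
+For orientation, the short sketch below shows how the QA and upsert methods documented on
+the :ref:`examples` page hang off a single ``PgUpsert`` instance. The connection values are
+placeholders; adjust them for your own database.
+
+.. code-block:: python
+
+    from pg_upsert import PgUpsert
+
+    upsert = PgUpsert(
+        host="localhost",
+        port=5432,
+        database="dev",
+        user="username",
+        tables=("authors",),
+        stg_schema="staging",
+        base_schema="public",
+        do_commit=False,
+        upsert_method="upsert",
+        interactive=False,
+    )
+
+    # Run only the QA checks (not-null, primary key, foreign key, check constraint).
+    upsert.qa_all()
+
+    # Upsert a single table; changes are rolled back while do_commit is False.
+    upsert.upsert_one(table="authors").commit()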
+ +.. automodule:: pg_upsert + :members: + :undoc-members: + :show-inheritance: diff --git a/pg_upsert/__init__.py b/pg_upsert/__init__.py index 1365b43..9984284 100644 --- a/pg_upsert/__init__.py +++ b/pg_upsert/__init__.py @@ -1,3 +1,12 @@ -from .pg_upsert import upsert +from ._version import ( + __author__, + __author_email__, + __description__, + __license__, + __title__, + __url__, + __version__, +) +from .pg_upsert import PgUpsert -__all__ = ["upsert"] +__all__ = ["PgUpsert"] diff --git a/pg_upsert/_version.py b/pg_upsert/_version.py new file mode 100644 index 0000000..7f36af0 --- /dev/null +++ b/pg_upsert/_version.py @@ -0,0 +1,7 @@ +__title__ = "pg_upsert" +__author__ = "Caleb Grant" +__url__ = "https://github.com/geocoug/pg_upsert" +__author_email__ = "grantcaleb22@gmail.com" +__license__ = "GNU GPLv3" +__version__ = "1.1.4" +__description__ = "Run not-NULL, Primary Key, Foreign Key, and Check Constraint checks on staging tables then update and insert (upsert) data from staging tables to base tables." # noqa: E501 diff --git a/pg_upsert/pg_upsert.py b/pg_upsert/pg_upsert.py index 4c0e6ef..2973d50 100644 --- a/pg_upsert/pg_upsert.py +++ b/pg_upsert/pg_upsert.py @@ -13,29 +13,18 @@ from datetime import datetime from pathlib import Path -import polars as pl import psycopg2 from psycopg2.extras import DictCursor from psycopg2.sql import SQL, Composable, Identifier, Literal from tabulate import tabulate -__version__ = "1.1.4" - -description_long = """ -Check data in a staging table or set of staging tables, then update and insert (upsert) -rows of a base table or base tables from the staging table(s) of the same name. -Initial table checks include not-null, primary key, and foreign key checks. -If any of these checks fail, the program will exit with an error message. -If all checks pass, the program will display the number of rows to be inserted -and updated, and ask for confirmation before proceeding. If the user confirms, the -program will perform the upserts and display the number of rows inserted and updated. -If the user does not confirm, the program will exit without performing any upserts. -""" - -description_short = ( - "Update and insert (upsert) data from staging tables to base tables." 
-) +from ._version import __description__, __version__ +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + handlers=[logging.NullHandler()], +) logger = logging.getLogger(__name__) @@ -47,33 +36,50 @@ def __init__( host: str, database: str, user: str, - **kwargs, + port: int = 5432, + passwd: None | str = None, ) -> None: self.host = host + self.port = port self.database = database self.user = user - if ("passwd" in kwargs and kwargs["passwd"] is not None) or ( - "password" in kwargs and kwargs["password"] is not None - ): - self.passwd = kwargs["passwd"] + if passwd is not None: + self.passwd = passwd else: self.passwd = self.get_password() - self.port = 5432 self.in_transaction = False self.encoding = "UTF8" self.conn = None + if not self.valid_connection(): + raise psycopg2.Error(f"Error connecting to {self!s}") def __repr__(self: PostgresDB) -> str: - return f"{self.__class__.__name__}(host={self.host}, database={self.database}, user={self.user})" # noqa: E501 + return ( + f"{self.__class__.__name__}(host={self.host}, port={self.port}, database={self.database}, user={self.user})" + ) def __del__(self: PostgresDB) -> None: """Delete the instance.""" self.close() def get_password(self): - return getpass.getpass( - f"The script {Path(__file__).name} wants the password for {self!s}: ", - ) + try: + return getpass.getpass( + f"The script {Path(__file__).name} wants the password for {self!s}: ", + ) + except (KeyboardInterrupt, EOFError) as err: + raise err + + def valid_connection(self: PostgresDB) -> bool: + """Test the database connection.""" + logger.debug(f"Testing connection to {self!s}") + try: + self.open_db() + return True + except psycopg2.Error: + return False + finally: + self.close() def open_db(self: PostgresDB) -> None: """Open a database connection.""" @@ -126,11 +132,14 @@ def execute(self: PostgresDB, sql: str | Composable, params=None): try: curs = self.cursor() if isinstance(sql, Composable): + logger.debug(f"\n{sql.as_string(curs)}") curs.execute(sql) else: if params is None: + logger.debug(f"\n{sql}") curs.execute(sql.encode(self.encoding)) else: + logger.debug(f"\nSQL:\n{sql}\nParameters:\n{params}") curs.execute(sql.encode(self.encoding), params) except Exception: self.rollback() @@ -150,14 +159,7 @@ def dict_row(): row = curs.fetchone() if row: if self.encoding: - r = [ - ( - c.decode(self.encoding, "backslashreplace") - if isinstance(c, bytes) - else c - ) - for c in row - ] + r = [(c.decode(self.encoding, "backslashreplace") if isinstance(c, bytes) else c) for c in row] else: r = row return dict(zip(headers, r, strict=True)) @@ -165,16 +167,6 @@ def dict_row(): return (iter(dict_row, None), headers, curs.rowcount) - def dataframe( - self: PostgresDB, - sql: str | Composable, - params=None, - **kwargs, - ) -> pl.DataFrame: - """Return query results as a Polars dataframe object.""" - data, cols, rowcount = self.rowdict(sql, params) - return pl.DataFrame(data, infer_schema_length=rowcount, **kwargs) - class CompareUI: def __init__( @@ -527,1759 +519,2130 @@ def click(self: ClickSet, *args): self.ui_obj.win.destroy() -def treeview_table( - parent: ttk.Frame, - rowset: list | tuple, - column_headers: list | tuple, - select_mode="none", -): - """Creates a TreeView table containing the specified data, with scrollbars and - status bar in an enclosing frame. - This does not grid the table frame in its parent widget. Returns a tuple - of 0: the frame containing the table, and 1: the table widget itself. 
+class PgUpsert: """ - nrows = range(len(rowset)) - ncols = range(len(column_headers)) - hdrwidths = [len(column_headers[j]) for j in ncols] - if len(rowset) > 0: - datawidthtbl = [ - [ - len( - ( - rowset[i][j] - if isinstance(rowset[i][j], str) - else str(rowset[i][j]) - ), - ) - for i in nrows - ] - for j in ncols - ] - datawidths = [max(cwidths) for cwidths in datawidthtbl] - else: - datawidths = hdrwidths - colwidths = [max(hdrwidths[i], datawidths[i]) for i in ncols] - # Set the font. - ff = tkfont.nametofont("TkFixedFont") - tblstyle = ttk.Style() - tblstyle.configure("tblstyle", font=ff) - charpixels = int(1.3 * ff.measure("0")) - tableframe = ttk.Frame(master=parent, padding="3 3 3 3") - statusframe = ttk.Frame(master=tableframe) - # Create and configure the Treeview table widget - tv_widget = ttk.Treeview( - tableframe, - columns=column_headers, - selectmode=select_mode, - show="headings", - ) - tv_widget.configure()["style"] = tblstyle - ysb = ttk.Scrollbar(tableframe, orient="vertical", command=tv_widget.yview) - xsb = ttk.Scrollbar(tableframe, orient="horizontal", command=tv_widget.xview) - tv_widget.configure(yscrollcommand=ysb.set, xscrollcommand=xsb.set) - # Status bar - statusbar = ttk.Label( - statusframe, - text=" %d rows" % len(rowset), - relief=tk.RIDGE, - anchor=tk.W, - ) - tableframe.statuslabel = statusbar - # Fill the Treeview table widget with data - set_tv_headers(tv_widget, column_headers, colwidths, charpixels) - fill_tv_table(tv_widget, rowset, statusbar) - # Place the table - tv_widget.grid(column=0, row=0, sticky=tk.NSEW) - ysb.grid(column=1, row=0, sticky=tk.NS) - xsb.grid(column=0, row=1, sticky=tk.EW) - statusframe.grid(column=0, row=3, sticky=tk.EW) - tableframe.columnconfigure(0, weight=1) - tableframe.rowconfigure(0, weight=1) - # Place the status bar - statusbar.pack(side=tk.BOTTOM, fill=tk.X) - # Allow resizing of the table - tableframe.columnconfigure(0, weight=1) - tableframe.rowconfigure(0, weight=1) - # - return tableframe, tv_widget - - -def set_tv_headers( - tvtable: ttk.Treeview, - column_headers: list, - colwidths: list, - charpixels: int, -): - """Set the headers and column widths for a Treeview table widget.""" - pixwidths = [charpixels * col for col in colwidths] - for i in range(len(column_headers)): - hdr = column_headers[i] - tvtable.column(hdr, width=pixwidths[i]) - tvtable.heading( - hdr, - text=hdr, - command=lambda _col=hdr: treeview_sort_column(tvtable, _col, False), + Perform one or all of the following operations on a set of PostgreSQL tables: + + - Perform QA checks on data in a staging table or set of staging tables. QA checks include not-null, primary key, foreign key, and check constraint checks. + - Perform updates and inserts (upserts) on a base table or set of base tables from the staging table(s) of the same name. + + PgUpsert utilizes temporary tables and views inside the PostgreSQL database to dynamically + generate SQL for QA checks and upserts. All temporary objects are initialized with the `ups_` prefix. + + The upsert process is transactional. If any part of the process fails, the transaction will be rolled back. + Committing changes to the database is optional and can be controlled with the `do_commit` flag. + + To avoid SQL injection, all SQL statements are generated using the `psycopg2.sql`_ module. + + :param host: Name of the PostgreSQL host. + :type host: str + :param database: Name of the PostgreSQL database. + :type database: str + :param user: Name of the PostgreSQL user. 
This user must have the necessary permissions to + connect to the database, query the information_schema, create temporary objects, + select from the staging tables, and update and insert into the base tables. + No checking is done to verify these permissions. + :type user: str + :param port: PostgreSQL database port, defaults to 5432. + :type port: int, optional + :param passwd: Password for the PostgreSQL user. If None, the user will be prompted to enter + the password. Defaults to None. + :type passwd: None or str, optional + :param tables: List of table names to perform QA checks on and upsert. Defaults to (). + :type tables: list or tuple or None, optional + :param stg_schema: Name of the staging schema where tables are located which will be used for + QA checks and upserts. Tables in the staging schema must have the same name + as the tables in the base schema that they will be upserted to. Defaults to None. + :type stg_schema: str or None, optional + :param base_schema: Name of the base schema where tables are located which will be updated or + inserted into. Defaults to None. + :type base_schema: str or None, optional + :param do_commit: If True, changes will be committed to the database once the upsert process + is complete. If False, changes will be rolled back. Defaults to False. + :type do_commit: bool, optional + :param interactive: If True, the user will be prompted with multiple dialogs to confirm various + steps during the upsert process. If False, the upsert process will run + without user intervention. Defaults to False. + :type interactive: bool, optional + :param upsert_method: The method to use for upserting data. Must be one of "upsert", "update", + or "insert". Defaults to "upsert". + :type upsert_method: str, optional + :param exclude_cols: List of column names to exclude from the upsert process. These columns will + not be updated or inserted to, however, they will still be checked during + the QA process. + :type exclude_cols: list or tuple or None, optional + :param exclude_null_check_cols: List of column names to exclude from the not-null check during + the QA process. Defaults to (). + :type exclude_null_check_cols: list or tuple or None, optional + :param control_table: Name of the temporary control table that will be used to track changes + during the upsert process. Defaults to "ups_control". + :type control_table: str, optional + + :example: + + .. code-block:: python + + from pg_upsert import PgUpsert + + PgUpsert( + host="localhost", + port=5432, + database="postgres", + user="username", + tables=("genres", "books", "authors", "book_authors"), + stg_schema="staging", + base_schema="public", + do_commit=False, + upsert_method="upsert", + interactive=False, + exclude_cols=("rev_user", "rev_time", "created_at", "updated_at"), + exclude_null_check_cols=("rev_user", "rev_time", "created_at", "updated_at", "alias"), ) + .. _psycopg2.sql: https://www.psycopg.org/docs/sql.html + """ # noqa: E501 -def treeview_sort_column(tv: ttk.Treeview, col: str, reverse: bool): - """Sort a column in a Treeview table widget. 
- - From https://stackoverflow.com/questions/1966929/tk-treeview-column-sort#1967793 - """ - colvals = [(tv.set(k, col), k) for k in tv.get_children()] - colvals.sort(reverse=reverse) - # Rearrange items in sorted positions - for index, (_val, k) in enumerate(colvals): - tv.move(k, "", index) - # Reverse sort next time - tv.heading(col, command=lambda: treeview_sort_column(tv, col, not reverse)) - - -def fill_tv_table(tvtable: ttk.Treeview, rowset: list | tuple, status_label=None): - """Fill a Treeview table widget with data.""" - for i, row in enumerate(rowset): - enc_row = [c if c is not None else "" for c in row] - tvtable.insert(parent="", index="end", iid=str(i), values=enc_row) - if status_label is not None: - status_label.config(text=" %d rows" % len(rowset)) - + def __init__( + self, + host: str, + database: str, + user: str, + port: int = 5432, + passwd: None | str = None, + tables: list | tuple | None = (), + stg_schema: str | None = None, + base_schema: str | None = None, + do_commit: bool = False, + interactive: bool = False, + upsert_method: str = "upsert", + exclude_cols: list | tuple | None = (), + exclude_null_check_cols: list | tuple | None = (), + control_table: str = "ups_control", + ): + if upsert_method not in self._upsert_methods(): + raise ValueError( + f"Invalid upsert method: {upsert_method}. Must be one of {self._upsert_methods()}", + ) + if not base_schema or not stg_schema: + if not base_schema and not stg_schema: + raise ValueError("No base or staging schema specified") + if not base_schema: + raise ValueError("No base schema specified") + if not stg_schema: + raise ValueError("No staging schema specified") + if not tables: + raise ValueError("No tables specified") + if stg_schema == base_schema: + raise ValueError( + f"Staging and base schemas must be different. Got {stg_schema} for both.", + ) + self.db = PostgresDB( + host=host, + port=port, + database=database, + user=user, + passwd=passwd, + ) + logger.debug(f"Connected to {self.db!s}") + self.tables = tables + self.stg_schema = stg_schema + self.base_schema = base_schema + self.do_commit = do_commit + self.interactive = interactive + self.upsert_method = upsert_method + self.exclude_cols = exclude_cols + self.exclude_null_check_cols = exclude_null_check_cols + self.control_table = control_table + self.qa_passed = False + self._validate_schemas() + for table in self.tables: + self._validate_table(table) + self._init_ups_control() + + @staticmethod + def _upsert_methods() -> tuple[str, str, str]: + """Return a tuple of valid upsert methods.""" + return ("upsert", "update", "insert") + + def __repr__(self): + return f"{self.__class__.__name__}(db={self.db!r}, tables={self.tables}, stg_schema={self.stg_schema}, base_schema={self.base_schema}, do_commit={self.do_commit}, interactive={self.interactive}, upsert_method={self.upsert_method}, exclude_cols={self.exclude_cols}, exclude_null_check_cols={self.exclude_null_check_cols})" # noqa: E501 + + def _show(self, sql: str | Composable) -> None | str: + """Display the results of a query in a table format. If the interactive flag is set, + the results will be displayed in a Tkinter window. 
Otherwise, the results will be + displayed in the console using the tabulate module.""" + rows, headers, rowcount = self.db.rowdict(sql) + if rowcount == 0: + logger.info("No results found") + return None + return f"{tabulate(rows, headers='keys', tablefmt='github', showindex=False)}" -def validate_schemas(base_schema: str, stg_schema: str): - """Validate the base and staging schemas.""" - sql = SQL( - """ - drop table if exists ups_ctrl_invl_schema cascade; - select - string_agg(schemas.schema_name - || ' (' - || schema_type - || ')', '; ' order by schema_type - ) as schema_string - into temporary table ups_ctrl_invl_schema - from - ( - select - {base_schema} as schema_name, - 'base' as schema_type - union + def _validate_schemas(self: PgUpsert) -> None: + """Validate that the base and staging schemas exist.""" + logger.debug(f"Validating schemas {self.base_schema} and {self.stg_schema}") + sql = SQL( + """ select + string_agg(schemas.schema_name + || ' (' + || schema_type + || ')', '; ' order by schema_type + ) as schema_string + from + ( + select + {base_schema} as schema_name, + 'base' as schema_type + union + select - {stg_schema} as schema_name, - 'staging' as schema_type - ) as schemas - left join information_schema.schemata as iss - on schemas.schema_name=iss.schema_name - where - iss.schema_name is null - having count(*)>0; - - """, - ).format( - base_schema=Literal(base_schema), - stg_schema=Literal(stg_schema), - ) - if db.execute(sql).rowcount > 0: - errors.append( - "Invalid schema(s) specified: {}".format( - db.dataframe( - SQL( - "select schema_string from ups_ctrl_invl_schema", - ), - )["schema_string"][0], - ), + {stg_schema} as schema_name, + 'staging' as schema_type + ) as schemas + left join information_schema.schemata as iss + on schemas.schema_name=iss.schema_name + where + iss.schema_name is null + having count(*)>0; + """, + ).format( + base_schema=Literal(self.base_schema), + stg_schema=Literal(self.stg_schema), ) - error_handler(errors) + if self.db.execute(sql).rowcount > 0: + raise ValueError( + f"Invalid schema(s): {next(iter(self.db.rowdict(sql)[0]))['schema_string']}", + ) + def _validate_table(self, table: str) -> None: + """Utility script to validate one table in both base and staging schema. -def validate_table(base_schema: str, stg_schema: str, table: str): - """Utility script to validate one table in both base and staging schema. + Halts script processing if any either of the schemas are non-existent, + or if either of the tables are not present within those schemas pass. - Halts script processing if any either of the schemas are non-existent, - or if either of the tables are not present within those schemas pass. - """ - validate_schemas(base_schema, stg_schema) - sql = SQL( + :param table: The table to validate. + :type table: str """ - drop table if exists ups_invl_table cascade; - select string_agg( - tt.schema_name || '.' 
|| tt.table_name || ' (' || tt.schema_type || ')', - '; ' - order by tt.schema_name, - tt.table_name - ) as schema_table into temporary table ups_invl_table - from ( - select {base_schema} as schema_name, - 'base' as schema_type, - {table} as table_name - union - select {stg_schema} as schema_name, - 'staging' as schema_type, - {table} as table_name - ) as tt - left join information_schema.tables as iss - on tt.schema_name = iss.table_schema - and tt.table_name = iss.table_name - where iss.table_name is null - having count(*) > 0; - """, - ).format( - base_schema=Literal(base_schema), - stg_schema=Literal(stg_schema), - table=Literal(table), - ) - if db.execute(sql).rowcount > 0: - errors.append( - "Invalid table(s) specified: {}".format( - db.dataframe(SQL("select schema_table from ups_invl_table"))[ - "schema_table" - ][0], - ), + logger.debug( + f"Validating table {table} exists in {self.base_schema} and {self.stg_schema} schemas", + ) + sql = SQL( + """ + select string_agg( + tt.schema_name || '.' || tt.table_name || ' (' || tt.schema_type || ')', + '; ' + order by tt.schema_name, + tt.table_name + ) as schema_table + from ( + select {base_schema} as schema_name, + 'base' as schema_type, + {table} as table_name + union + select {stg_schema} as schema_name, + 'staging' as schema_type, + {table} as table_name + ) as tt + left join information_schema.tables as iss + on tt.schema_name = iss.table_schema + and tt.table_name = iss.table_name + where iss.table_name is null + having count(*) > 0; + """, + ).format( + base_schema=Literal(self.base_schema), + stg_schema=Literal(self.stg_schema), + table=Literal(table), ) + if self.db.execute(sql).rowcount > 0: + raise ValueError( + f"Invalid table(s): {next(iter(self.db.rowdict(sql)[0]))['schema_table']}", + ) - error_handler(errors) + def _validate_control(self: PgUpsert) -> None: + """Validate contents of control table against base and staging schema. + :objects created: -def validate_control(base_schema: str, stg_schema: str, control_table: str): - """Validate contents of control table against base and staging schema.""" - validate_schemas(base_schema, stg_schema) - sql = SQL( + - `ups_validate_control`: Temporary table containing the results of the validation. + - `ups_ctrl_invl_table`: Temporary table containing the names of invalid tables. """ - drop table if exists ups_validate_control cascade; - select cast({base_schema} as text) as base_schema, - cast({stg_schema} as text) as staging_schema, - table_name, - False as base_exists, - False as staging_exists into temporary table ups_validate_control - from {control_table}; - - update ups_validate_control as vc - set base_exists = True - from information_schema.tables as bt - where vc.base_schema = bt.table_schema - and vc.table_name = bt.table_name - and bt.table_type = cast('BASE TABLE' as text); - update ups_validate_control as vc - set staging_exists = True - from information_schema.tables as st - where vc.staging_schema = st.table_schema - and vc.table_name = st.table_name - and st.table_type = cast('BASE TABLE' as text); - drop table if exists ups_ctrl_invl_table cascade; - select string_agg( - schema_table, - '; ' - order by it.schema_table - ) as schema_table into temporary table ups_ctrl_invl_table - from ( - select base_schema || '.' || table_name as schema_table - from ups_validate_control - where not base_exists - union - select staging_schema || '.' 
|| table_name as schema_table - from ups_validate_control - where not staging_exists - ) as it - having count(*) > 0; - """, - ).format( - base_schema=Literal(base_schema), - stg_schema=Literal(stg_schema), - control_table=Identifier(control_table), - ) - if db.execute(sql).rowcount > 0: - error_handler( - [ - "Invalid table(s) specified: {}".format( - db.dataframe("select schema_table from ups_ctrl_invl_table")[ - "schema_table" - ][0], + logger.debug("Validating control table") + self._validate_schemas() + # Check if the control table exists + if ( + self.db.execute( + SQL( + """ + select 1 + from information_schema.tables + where table_name = {control_table} + """, + ).format( + base_schema=Literal(self.base_schema), + control_table=Literal(self.control_table), ), - ], - ) - - -def staged_to_load(control_table: str, tables): - """Creates a table having the structure that is used to drive - the upsert operation on multiple staging tables. - """ - sql = SQL( - """ - drop table if exists {control_table} cascade; - create temporary table {control_table} ( - table_name text not null unique, - exclude_cols text, - exclude_null_checks text, - interactive boolean not null default false, - null_errors text, - pk_errors text, - fk_errors text, - ck_errors text, - rows_updated integer, - rows_inserted integer - ); - insert into {control_table} - (table_name) - select - trim(unnest(string_to_array({tables}, ','))); - """, - ).format( - control_table=Identifier(control_table), - tables=Literal(",".join(tables)), - ) - db.execute(sql) - - -def load_staging(base_schema: str, stg_schema: str, control_table: str): - """Performs QA checks for nulls in non-null columns, for duplicated - primary key values, and for invalid foreign keys in a set of staging - tables to be loaded into base tables. If there are failures in the - QA checks, loading is not attempted. If the loading step is carried - out, it is done within a transaction. - - The "null_errors", "pk_errors", and "fk_errors" columns of the - control table will be updated to identify any errors that occur, - so that this information is available to the caller. - - The "rows_updated" and "rows_inserted" columns of the control table - will be updated with counts of the number of rows affected by the - upsert operation for each table. - - When the upsert operation updates the base table, all columns of the - base table that are also in the staging table are updated. The - update operation does not test to see if column contents are different, - and so does not update only those values that are different. - """ - # Clear the columns of return values from the control table, - # in case this control table has been used previously. 
- db.execute( - SQL( + ).rowcount + == 0 + ): + self._init_ups_control() + sql = SQL( """ - update {control_table} - set null_errors = null, - pk_errors = null, - fk_errors = null, - ck_errors = null, - rows_updated = null, - rows_inserted = null; + drop table if exists ups_validate_control cascade; + select cast({base_schema} as text) as base_schema, + cast({stg_schema} as text) as staging_schema, + table_name, + False as base_exists, + False as staging_exists into temporary table ups_validate_control + from {control_table}; + + update ups_validate_control as vc + set base_exists = True + from information_schema.tables as bt + where vc.base_schema = bt.table_schema + and vc.table_name = bt.table_name + and bt.table_type = cast('BASE TABLE' as text); + update ups_validate_control as vc + set staging_exists = True + from information_schema.tables as st + where vc.staging_schema = st.table_schema + and vc.table_name = st.table_name + and st.table_type = cast('BASE TABLE' as text); + drop table if exists ups_ctrl_invl_table cascade; + select string_agg( + schema_table, + '; ' + order by it.schema_table + ) as schema_table into temporary table ups_ctrl_invl_table + from ( + select base_schema || '.' || table_name as schema_table + from ups_validate_control + where not base_exists + union + select staging_schema || '.' || table_name as schema_table + from ups_validate_control + where not staging_exists + ) as it + having count(*) > 0; """, - ).format(control_table=Identifier(control_table)), - ) - qa_all(base_schema, stg_schema, control_table) + ).format( + base_schema=Literal(self.base_schema), + stg_schema=Literal(self.stg_schema), + control_table=Identifier(self.control_table), + ) + if self.db.execute(sql).rowcount > 0: + logger.error("Invalid table(s) specified:") + rows, headers, rowcount = self.db.rowdict( + SQL("select schema_table from ups_ctrl_invl_table"), + ) + for row in rows: + logger.error(f" {row['schema_table']}") + def _init_ups_control(self: PgUpsert) -> None: + """Creates a table having the structure that is used to drive + the upsert operation on multiple staging tables. -def qa_all(base_schema: str, stg_schema: str, control_table: str): - """Conducts null, primary key, foreign key, and check constraint - checks on multiple staging tables containing new or revised data - for staging tables, using the NULLQA_ONE, PKQA_ONE, FKQA_ONE, - and CKQA_ONE functions. - """ - # Create a list of the selected tables with a loop control flag. - db.execute( - SQL( - """ - drop table if exists ups_proctables cascade; - select tl.table_name, - tl.exclude_null_checks, - tl.interactive, - False::boolean as processed into temporary table ups_proctables - from {control_table} as tl; - """, - ).format(control_table=Identifier(control_table)), - ) - # Create a view returning a single unprocessed table, in order. - db.execute( - SQL( + :objects created: + + - `ups_control`: Temporary table containing the control data. 
+ """ + logger.debug("Initializing upsert control table") + sql = SQL( """ - drop view if exists ups_toprocess cascade; - create temporary view ups_toprocess as - select - table_name, - exclude_null_checks, - interactive - from ups_proctables - where not processed - limit 1; - """, - ), - ) - interactive = db.dataframe("select interactive from ups_toprocess;")["interactive"][ - 0 - ] - # Null checks - logger.info("") - qa_check = "Non-NULL" - logger.info(f"==={qa_check} checks===") - start_time = datetime.now() - qa_all_nullloop(base_schema, stg_schema, control_table, interactive) - logger.debug(f"{qa_check} checks completed in {ellapsed_time(start_time)}") - logger.info("") - - # Reset the loop control flag. - db.execute("update ups_proctables set processed = False;") - - qa_check = "Primary Key" - logger.info(f"==={qa_check} checks===") - start_time = datetime.now() - qa_all_pkloop(base_schema, stg_schema, control_table, interactive) - logger.debug(f"{qa_check} checks completed in {ellapsed_time(start_time)}") - logger.info("") - - # Reset the loop control flag. - db.execute("update ups_proctables set processed = False;") - - qa_check = "Foreign Key" - logger.info(f"==={qa_check} checks===") - start_time = datetime.now() - qa_all_fkloop(base_schema, stg_schema, control_table, interactive) - logger.debug(f"{qa_check} checks completed in {ellapsed_time(start_time)}") - logger.info("") - - # Reset the loop control flag. - db.execute("update ups_proctables set processed = False;") - - qa_check = "Check Constraint" - logger.info(f"==={qa_check} checks===") - start_time = datetime.now() - qa_all_ckloop(base_schema, stg_schema, control_table, interactive) - logger.debug(f"{qa_check} checks completed in {ellapsed_time(start_time)}") - logger.info("") - - -def qa_all_nullloop( - base_schema: str, - stg_schema: str, - control_table: str, - interactive: bool, -): - null_errors = [] - while True: - df = db.dataframe(SQL("select * from ups_toprocess;")) - if df.is_empty(): - break - null_qa_one( - base_schema, - stg_schema, - table=df["table_name"][0], - errors=null_errors, - exclude_null_checks=df["exclude_null_checks"][0], - interactive=interactive, + drop table if exists {control_table} cascade; + create temporary table {control_table} ( + table_name text not null unique, + exclude_cols text, + exclude_null_checks text, + interactive boolean not null default false, + null_errors text, + pk_errors text, + fk_errors text, + ck_errors text, + rows_updated integer, + rows_inserted integer + ); + insert into {control_table} + (table_name) + select + trim(unnest(string_to_array({tables}, ','))); + """, + ).format( + control_table=Identifier(self.control_table), + tables=Literal(",".join(self.tables)), ) - err_df = db.dataframe("select * from ups_null_error_list;") - if not err_df.is_empty(): - db.execute( + self.db.execute(sql) + # Update the control table with the list of columns to exclude from being updated or inserted to. 
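`_init_ups_control` loads one control row per staging table by comma-joining `self.tables` in Python and letting PostgreSQL split the string back apart. A rough pure-Python analogue of the `trim(unnest(string_to_array(...)))` step, with hypothetical table names:

```py
# Rough analogue of: insert into {control_table} (table_name)
#   select trim(unnest(string_to_array({tables}, ',')));
tables = ["genres", "authors", "books"]      # hypothetical table list
joined = ",".join(tables)                    # the value bound as {tables}
control_rows = [name.strip() for name in joined.split(",")]
print(control_rows)                          # ['genres', 'authors', 'books']
```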
+ if self.exclude_cols and len(self.exclude_cols) > 0: + self.db.execute( SQL( """ - update {control_table} - set null_errors = {null_errors} - where table_name = {table}; + update {control_table} + set exclude_cols = {exclude_cols}; """, ).format( - control_table=Identifier(control_table), - null_errors=Literal(err_df["null_errors"][0]), - table=Literal(df["table_name"][0]), + control_table=Identifier(self.control_table), + exclude_cols=Literal(",".join(self.exclude_cols)), ), ) - - db.execute( - SQL( - """update ups_proctables set processed = True - where table_name = {table_name};""", - ).format(table_name=Literal(df["table_name"][0])), + # Update the control table with the list of columns to exclude from null checks. + if self.exclude_null_check_cols and len(self.exclude_null_check_cols) > 0: + self.db.execute( + SQL( + """ + update {control_table} + set exclude_null_checks = {exclude_null_check_cols}; + """, + ).format( + control_table=Identifier(self.control_table), + exclude_null_check_cols=Literal( + ",".join(self.exclude_null_check_cols), + ), + ), + ) + if self.interactive: + self.db.execute( + SQL( + """ + update {control_table} + set interactive = {interactive}; + """, + ).format( + control_table=Identifier(self.control_table), + interactive=Literal(self.interactive), + ), + ) + debug_sql = SQL("select * from {control_table}").format( + control_table=Identifier(self.control_table), + ) + logger.debug( + f"Control table after being initialized:\n{self._show(debug_sql)}", ) + def qa_all(self: PgUpsert) -> PgUpsert: + """Performs QA checks for nulls in non-null columns, for duplicated + primary key values, for invalid foreign keys, and invalid check constraints + in a set of staging tables to be loaded into base tables. + If there are failures in the QA checks, loading is not attempted. + If the loading step is carried out, it is done within a transaction. -def null_qa_one( - base_schema: str, - stg_schema: str, - table: str, - errors: list, - exclude_null_checks: str, - interactive: bool, -): - logger.info(f"Conducting non-null QA checks on table {stg_schema}.{table}") - validate_table(base_schema, stg_schema, table) - # Create a table listing the columns of the base table that must - # be non-null and that do not have a default expression. - # Include a column for the number of rows with nulls in the staging table. - # Include a 'processed' column for loop control. - db.execute( - SQL( - """ - drop table if exists ups_nonnull_cols cascade; - select column_name, - 0::integer as null_rows, - False as processed - into temporary table ups_nonnull_cols - from information_schema.columns - where table_schema = {base_schema} - and table_name = {table} - and is_nullable = 'NO' - and column_default is null and column_name not in ({exclude_null_checks}); - """, - ).format( - base_schema=Literal(base_schema), - table=Literal(table), - exclude_null_checks=( - SQL(",").join(Literal(col) for col in exclude_null_checks.split(",")) - if exclude_null_checks - else Literal("") - ), - ), - ) + The "null_errors", "pk_errors", "fk_errors", "ck_errors" columns of the + control table will be updated to identify any errors that occur, + so that this information is available to the caller. - # Process all non-nullable columns. 
- while True: - df = db.dataframe( - """ - select column_name - from ups_nonnull_cols - where not processed - limit 1; + The "rows_updated" and "rows_inserted" columns of the control table + will be updated with counts of the number of rows affected by the + upsert operation for each table. + + When the upsert operation updates the base table, all columns of the + base table that are also in the staging table are updated. The + update operation does not test to see if column contents are different, + and so does not update only those values that are different. + + This method runs :class:`PgUpsert` methods in the following order: + + 1. :meth:`PgUpsert.qa_all_null` + 2. :meth:`PgUpsert.qa_all_pk` + 3. :meth:`PgUpsert.qa_all_fk` + 4. :meth:`PgUpsert.qa_all_ck` + + :objects created: + + - `ups_proctables`: Temporary table containing the list of tables to process. + - `ups_toprocess`: Temporary view returning a single unprocessed table. + """ + self._validate_control() + # Clear the columns of return values from the control table, + # in case this control table has been used previously. + self.db.execute( + SQL( + """ + update {control_table} + set null_errors = null, + pk_errors = null, + fk_errors = null, + ck_errors = null, + rows_updated = null, + rows_inserted = null; """, + ).format(control_table=Identifier(self.control_table)), ) - if df.is_empty(): - break - db.execute( + # Create a list of the selected tables with a loop control flag. + self.db.execute( SQL( """ - create or replace temporary view ups_qa_nonnull_col as - select nrows - from ( - select count(*) as nrows - from {stg_schema}.{table} - where {column_name} is null - ) as nullcount - where nrows > 0 - limit 1; + drop table if exists ups_proctables cascade; + select + table_name, + exclude_null_checks, + interactive, + False::boolean as processed + into temporary table ups_proctables + from {control_table}; """, - ).format( - stg_schema=Identifier(stg_schema), - table=Identifier(table), - column_name=Identifier(df["column_name"][0]), - ), + ).format(control_table=Identifier(self.control_table)), ) - null_df = db.dataframe("select * from ups_qa_nonnull_col;") - if not null_df.is_empty(): - logger.warning( - f" Column {df['column_name'][0]} has {null_df['nrows'][0]} null values", # noqa: E501 - ) - db.execute( - SQL( - """ - update ups_nonnull_cols - set null_rows = ( - select nrows - from ups_qa_nonnull_col - limit 1 - ) - where column_name = {column_name}; - """, - ).format(column_name=Literal(df["column_name"][0])), - ) - db.execute( + # Create a view returning a single unprocessed table, in order. + self.db.execute( SQL( """ - update ups_nonnull_cols - set processed = True - where column_name = {column_name}; + drop view if exists ups_toprocess cascade; + create temporary view ups_toprocess as + select + table_name, + exclude_null_checks, + interactive + from ups_proctables + where not processed + limit 1; """, - ).format(column_name=Literal(df["column_name"][0])), + ), ) - # Update the control table with the number of rows with nulls in the staging table. 
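The QA driver relies on a small work-queue pattern: the `ups_toprocess` view always yields the next row of `ups_proctables` whose `processed` flag is still false, and each check loop flips that flag when it finishes a table. The same pattern in plain Python, with hypothetical table names standing in for the temporary tables:

```py
# Work-queue pattern used by the qa_all_* loops, sketched without a database.
pending = [
    {"table_name": "genres", "processed": False},
    {"table_name": "books", "processed": False},
]

def next_unprocessed():
    # Stand-in for "select * from ups_toprocess" (one unprocessed row at a time).
    return next((row for row in pending if not row["processed"]), None)

while (row := next_unprocessed()) is not None:
    print(f"checking {row['table_name']}")   # stand-in for qa_one_null(...)
    row["processed"] = True                  # stand-in for the UPDATE on ups_proctables
```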
- db.execute( - """ - create or replace temporary view ups_null_error_list as - select string_agg(column_name || ' (' || null_rows || ')', ', ') as null_errors - from ups_nonnull_cols - where coalesce(null_rows, 0) > 0; - """, - ) - + qa_funcs = { + "Non-NULL": self.qa_all_null, + "Primary Key": self.qa_all_pk, + "Foreign Key": self.qa_all_fk, + "Check Constraint": self.qa_all_ck, + } + + for qa_check, qa_func in qa_funcs.items(): + logger.info(f"==={qa_check} checks===") + start_time = datetime.now() + qa_func() + logger.debug(f"{qa_check} checks completed in {ellapsed_time(start_time)}") + logger.debug(f"Control table after {qa_check} checks:") + ctrl = SQL("select * from {control_table};").format( + control_table=Identifier(self.control_table), + ) + if not self.interactive: + logger.debug(f"\n{self._show(ctrl)}") + # Reset the loop control flag in the control table. + self.db.execute(SQL("update ups_proctables set processed = False;")) -def qa_all_pkloop( - base_schema: str, - stg_schema: str, - control_table: str, - interactive: bool, -): - while True: - df = db.dataframe(SQL("select * from ups_toprocess;")) - if df.is_empty(): - break - pk_errors = pk_qa_one( - base_schema, - stg_schema, - table=df["table_name"][0], - interactive=interactive, + # Check for errors + rows, headers, rowcount = self.db.rowdict( + SQL( + """select * from {control_table} + where coalesce(null_errors, pk_errors, fk_errors, ck_errors) is not null; + """, + ).format( + control_table=Identifier(self.control_table), + ), ) - if pk_errors: - db.execute( + if rowcount > 0: + ctrl = SQL("select * from {control_table};").format( + control_table=Identifier(self.control_table), + ) + logger.debug("QA checks failed") + logger.debug(f"\n{self._show(ctrl)}") + logger.debug("") + if self.interactive: + btn, return_value = TableUI( + "QA Errors", + "QA checks failed. Below is a summary of the errors:", + [ + ("Continue", 0, ""), + ("Cancel", 1, ""), + ], + headers, + [[row[header] for header in headers] for row in rows], + ).activate() + else: + logger.error("===QA checks failed. Below is a summary of the errors===") + logger.error(self._show(ctrl)) + return self + self.qa_passed = True + return self + + def qa_all_null(self: PgUpsert) -> PgUpsert: + """Performs null checks for non-null columns in selected staging tables.""" + while True: + rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_toprocess;"), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + self.qa_one_null(table=rows["table_name"]) + # Set the 'processed' column to True in the control table. + self.db.execute( SQL( """ - update {control_table} - set pk_errors = {pk_errors} - where table_name = {table}; + update ups_proctables + set processed = True + where table_name = {table_name}; """, - ).format( - control_table=Identifier(control_table), - pk_errors=Literal(pk_errors[0]), - table=Literal(df["table_name"][0]), - ), + ).format(table_name=Literal(rows["table_name"])), ) + return self - db.execute( - SQL( - """update ups_proctables set processed = True where table_name = {table_name};""", # noqa: E501 - ).format(table_name=Literal(df["table_name"][0])), - ) + def qa_one_null(self: PgUpsert, table: str) -> PgUpsert: + """Performs null checks for non-null columns in a single staging table. + :param table: The name of the staging table to check for null values. 
+ :type table: str -def pk_qa_one(base_schema: str, stg_schema: str, table: str, interactive: bool): - pk_errors = [] - logger.info(f"Conducting primary key QA checks on table {stg_schema}.{table}") - validate_table(base_schema, stg_schema, table) - # Create a table of primary key columns on this table - db.execute( - SQL( - """ - drop table if exists ups_primary_key_columns cascade; - select k.constraint_name, k.column_name, k.ordinal_position - into temporary table ups_primary_key_columns - from information_schema.table_constraints as tc - inner join information_schema.key_column_usage as k - on tc.constraint_type = 'PRIMARY KEY' - and tc.constraint_name = k.constraint_name - and tc.constraint_catalog = k.constraint_catalog - and tc.constraint_schema = k.constraint_schema - and tc.table_schema = k.table_schema - and tc.table_name = k.table_name - and tc.constraint_name = k.constraint_name - where - k.table_name = {table} - and k.table_schema = {base_schema} - order by k.ordinal_position - ; - """, - ).format(table=Literal(table), base_schema=Literal(base_schema)), - ) - df = db.dataframe("select * from ups_primary_key_columns;") - if df.is_empty(): - return None - logger.debug(f" Checking constraint {df['constraint_name'][0]}") - # Get a comma-delimited list of primary key columns to build SQL selection - # for duplicate keys - pkcol_df = db.dataframe( + :objects created: + + - `ups_nonnull_cols`: Temporary table containing the non-null columns of the base table. + - `ups_qa_nonnull_col`: Temporary view containing the number of rows with nulls in the staging table. + - `ups_null_error_list`: Temporary view containing the list of null errors. """ - select - string_agg(column_name, ', ' order by ordinal_position) as pkcollist - from ups_primary_key_columns - ; - """, - ) - pkcollist = pkcol_df["pkcollist"][0] - db.execute( - SQL( - """ - drop view if exists ups_pk_check cascade; - create temporary view ups_pk_check as - select {pkcollist}, count(*) as nrows - from {stg_schema}.{table} as s - group by {pkcollist} - having count(*) > 1; - """, - ).format( - pkcollist=SQL(pkcollist), - stg_schema=Identifier(stg_schema), - table=Identifier(table), - ), - ) - pk_check = db.dataframe("select * from ups_pk_check;") - if not pk_check.is_empty(): - logger.warning(f" Duplicate key error in columns {pkcollist}") - err_df = db.dataframe( - """ - select count(*) as errcnt, sum(nrows) as total_rows - from ups_pk_check; - """, - ) - pk_errors.append( - f"{err_df['errcnt'][0]} duplicated keys ({int(err_df['total_rows'][0])} rows) in table {stg_schema}.{table}", # noqa: E501 + logger.info( + f"Conducting not-null QA checks on table {self.stg_schema}.{table}", ) - logger.debug("") - logger.debug( - tabulate( - pk_check.iter_rows(), - headers=pk_check.columns, - tablefmt="pipe", - showindex=False, - colalign=["left"] * len(pk_check.columns), + self._validate_table(table) + # Create a table listing the columns of the base table that must + # be non-null and that do not have a default expression. + # Include a column for the number of rows with nulls in the staging table. + # Include a 'processed' column for loop control. 
+ self.db.execute( + SQL( + """ + drop table if exists ups_nonnull_cols cascade; + select column_name, + 0::integer as null_rows, + False as processed + into temporary table ups_nonnull_cols + from information_schema.columns + where table_schema = {base_schema} + and table_name = {table} + and is_nullable = 'NO' + and column_default is null + and column_name not in ({exclude_null_check_cols}); + """, + ).format( + base_schema=Literal(self.base_schema), + table=Literal(table), + exclude_null_check_cols=( + SQL(",").join(Literal(col) for col in self.exclude_null_check_cols) + if self.exclude_null_check_cols + else Literal("") + ), ), ) - logger.debug("") - if interactive: - btn, return_value = TableUI( - "Duplicate key error", - f"{err_df['errcnt'][0]} duplicated keys ({int(err_df['total_rows'][0])} rows) in table {stg_schema}.{table}", # noqa: E501 - [ - ("Continue", 0, ""), - ("Cancel", 1, ""), - ], - pk_check.columns, - list(pk_check.iter_rows()), - ).activate() - if btn != 0: - error_handler(["Script canceled by user."]) - - return pk_errors - - -def qa_all_fkloop( - base_schema: str, - stg_schema: str, - control_table: str, - interactive: bool, -): - while True: - df = db.dataframe(SQL("select * from ups_toprocess;")) - if df.is_empty(): - break - fk_errors = fk_qa_one( - base_schema, - stg_schema, - table=df["table_name"][0], - interactive=interactive, + # Process all non-nullable columns. + while True: + rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_nonnull_cols where not processed limit 1;"), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + logger.debug(f" Checking column {rows['column_name']} for nulls") + self.db.execute( + SQL( + """ + create or replace temporary view ups_qa_nonnull_col as + select nrows + from ( + select count(*) as nrows + from {stg_schema}.{table} + where {column_name} is null + ) as nullcount + where nrows > 0 + limit 1; + """, + ).format( + stg_schema=Identifier(self.stg_schema), + table=Identifier(table), + column_name=Identifier(rows["column_name"]), + ), + ) + # Get the number of rows with nulls in the staging table. + null_rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_qa_nonnull_col;"), + ) + if rowcount > 0: + null_rows = next(iter(null_rows)) + logger.warning( + f" Column {rows['column_name']} has {null_rows['nrows']} null values", + ) + # Set the number of rows with nulls in the control table. + self.db.execute( + SQL( + """ + update ups_nonnull_cols + set null_rows = ( + select nrows + from ups_qa_nonnull_col + limit 1 + ) + where column_name = {column_name}; + """, + ).format(column_name=Literal(rows["column_name"])), + ) + # Set the 'processed' column to True in the control table. + self.db.execute( + SQL( + """ + update ups_nonnull_cols + set processed = True + where column_name = {column_name}; + """, + ).format(column_name=Literal(rows["column_name"])), + ) + # Update the control table with the number of rows with nulls in the staging table. + self.db.execute( + """ + create or replace temporary view ups_null_error_list as + select string_agg(column_name || ' (' || null_rows || ')', ', ') as null_errors + from ups_nonnull_cols + where coalesce(null_rows, 0) > 0; + """, + ) + # Query the ups_null_error_list control table for the null errors. 
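When nulls are found, `ups_null_error_list` aggregates them into a single text value of the form `column_name (null_rows)`, comma-separated, which is then written to the `null_errors` column of the control table. A small illustration of that formatting with hypothetical counts:

```py
# Same formatting as the string_agg in ups_null_error_list.
null_counts = {"title": 2, "author_id": 1}   # hypothetical failing columns
null_errors = ", ".join(f"{col} ({n})" for col, n in null_counts.items())
print(null_errors)                           # title (2), author_id (1)
```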
+ err_rows, err_headers, err_rowcount = self.db.rowdict( + SQL("select * from ups_null_error_list;"), ) - if fk_errors: - db.execute( + if err_rowcount > 0: + self.db.execute( SQL( """ update {control_table} - set fk_errors = {fk_errors} - where table_name = {table}; + set null_errors = {null_errors} + where table_name = {table_name}; """, ).format( - control_table=Identifier(control_table), - fk_errors=Literal(fk_errors), - table=Literal(df["table_name"][0]), + control_table=Identifier(self.control_table), + null_errors=Literal(next(iter(err_rows))["null_errors"]), + table_name=Literal(table), ), ) + return self - db.execute( - SQL( - """update ups_proctables set processed = True where table_name = {table_name};""", # noqa: E501 - ).format(table_name=Literal(df["table_name"][0])), - ) + def qa_all_pk(self: PgUpsert) -> PgUpsert: + """Performs primary key checks for duplicated primary key values in selected staging tables.""" + while True: + rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_toprocess;"), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + self.qa_one_pk(table=rows["table_name"]) + # Set the 'processed' column to True in the control table. + self.db.execute( + SQL( + """ + update ups_proctables + set processed = True + where table_name = {table_name}; + """, + ).format(table_name=Literal(rows["table_name"])), + ) + return self + def qa_one_pk(self: PgUpsert, table: str) -> PgUpsert: + """Performs primary key checks for duplicated primary key values in a single staging table. -def fk_qa_one(base_schema: str, stg_schema: str, table: str, interactive: bool): - logger.info(f"Conducting foreign key QA checks on table {stg_schema}.{table}") - # Create a table of *all* foreign key dependencies in this database. - # Only create it once because it may slow the QA process down. - if ( - db.execute( - SQL( - """select * from information_schema.tables - where table_name = {ups_foreign_key_columns};""", - ).format(ups_foreign_key_columns=Literal("ups_foreign_key_columns")), - ).rowcount - == 0 - ): - db.execute( - SQL( - """ - select - fkinf.constraint_name, - fkinf.table_schema, - fkinf.table_name, - att1.attname as column_name, - fkinf.uq_schema, - cls.relname as uq_table, - att2.attname as uq_column - into - temporary table {ups_foreign_key_columns} - from - (select - ns1.nspname as table_schema, - cls.relname as table_name, - unnest(cons.conkey) as uq_table_id, - unnest(cons.confkey) as table_id, - cons.conname as constraint_name, - ns2.nspname as uq_schema, - cons.confrelid, - cons.conrelid - from - pg_constraint as cons - inner join pg_class as cls on cls.oid = cons.conrelid - inner join pg_namespace ns1 on ns1.oid = cls.relnamespace - inner join pg_namespace ns2 on ns2.oid = cons.connamespace - where - cons.contype = 'f' - ) as fkinf - inner join pg_attribute att1 on - att1.attrelid = fkinf.conrelid and att1.attnum = fkinf.uq_table_id - inner join pg_attribute att2 on - att2.attrelid = fkinf.confrelid and att2.attnum = fkinf.table_id - inner join pg_class cls on cls.oid = fkinf.confrelid; - """, - ).format(ups_foreign_key_columns=Identifier("ups_foreign_key_columns")), - ) + :param table: The name of the staging table to check for duplicate primary key values. + :type table: str - # Create a temporary table of just the foreign key relationships for the base - # table corresponding to the staging table to check. 
- db.execute( - SQL( - """ - drop table if exists ups_sel_fks cascade; - select - constraint_name, table_schema, table_name, - column_name, uq_schema, uq_table, uq_column - into - temporary table ups_sel_fks - from - ups_foreign_key_columns - where - table_schema = {base_schema} - and table_name = {table}; - """, - ).format(base_schema=Literal(base_schema), table=Literal(table)), - ) - # Create a temporary table of all unique constraint names for - # this table, with an integer column to be populated with the - # number of rows failing the foreign key check, and a 'processed' - # flag to control looping. - db.execute( - SQL( - """ - drop table if exists ups_fk_constraints cascade; - select distinct - constraint_name, table_schema, table_name, - 0::integer as fkerror_values, - False as processed - into temporary table ups_fk_constraints - from ups_sel_fks; - """, - ), - ) - while True: - # Create a view to select one constraint to process. - df = db.dataframe( - SQL( - """ - select constraint_name, table_schema, table_name - from ups_fk_constraints - where not processed - limit 1; - """, - ), - ) - if df.is_empty(): - break - logger.debug( - f" Checking constraint {df['constraint_name'][0]} for table {table}", + :objects created: + + - `ups_primary_key_columns`: Temporary table containing the primary key columns of the base table. + - `ups_pk_check`: Temporary view containing the duplicate primary key values. + """ + pk_errors = [] + logger.info( + f"Conducting primary key QA checks on table {self.stg_schema}.{table}", ) - db.execute( + self._validate_table(table) + # Create a table listing the primary key columns of the base table. + self.db.execute( SQL( """ - drop table if exists ups_one_fk cascade; - select column_name, uq_schema, uq_table, uq_column - into temporary table ups_one_fk - from ups_sel_fks + drop table if exists ups_primary_key_columns cascade; + select k.constraint_name, k.column_name, k.ordinal_position + into temporary table ups_primary_key_columns + from information_schema.table_constraints as tc + inner join information_schema.key_column_usage as k + on tc.constraint_type = 'PRIMARY KEY' + and tc.constraint_name = k.constraint_name + and tc.constraint_catalog = k.constraint_catalog + and tc.constraint_schema = k.constraint_schema + and tc.table_schema = k.table_schema + and tc.table_name = k.table_name + and tc.constraint_name = k.constraint_name where - constraint_name = {constraint_name} - and table_schema = {table_schema} - and table_name = {table_name}; - """, - ).format( - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), - ), + k.table_name = {table} + and k.table_schema = {base_schema} + order by k.ordinal_position + ; + """, + ).format(table=Literal(table), base_schema=Literal(self.base_schema)), + ) + rows, headers, rowcount = self.db.rowdict( + "select * from ups_primary_key_columns;", ) - const_df = db.dataframe("select * from ups_one_fk;") - # Create join expressions from staging table (s) to unique table (u) - # and to staging table equivalent to unique table (su) (though we - # don't know yet if the latter exists). Also create a 'where' - # condition to ensure that all columns being matched are non-null. - # Also create a comma-separated list of the columns being checked. 
- fk_df = db.dataframe( + if rowcount == 0: + logger.info("Table has no primary key") + return None + # rows = next(iter(rows)) + rows = list(rows) + logger.debug(f" Checking constraint {rows[0]['constraint_name']}") + # Get a comma-delimited list of primary key columns to build SQL selection + # for duplicate keys, ordered by ordinal position. + pk_cols = SQL(",").join(Identifier(row["column_name"]) for row in rows) + self.db.execute( SQL( """ - select - string_agg('s.' || column_name || ' = u.' || uq_column, ' and ') as u_join, - string_agg('s.' || column_name || ' = su.' || uq_column, ' and ') as su_join, - string_agg('s.' || column_name || ' is not null', ' and ') as s_not_null, - string_agg('s.' || column_name, ', ') as s_checked - from - (select * from ups_one_fk) as fkcols; - """, - ), - ) - # Determine whether a staging-table equivalent of the unique table exists. - su_exists = False - if ( - db.execute( - SQL( - """select * from information_schema.tables - where table_name = {table} and table_schema = {stg_schema};""", - ).format( - table=Literal(const_df["uq_table"][0]), - stg_schema=Literal(stg_schema), - ), - ).rowcount - > 0 - ): - su_exists = True - - # Construct a query to test for missing unique values for fk columns. - query = SQL( - """ - drop view if exists ups_fk_check cascade; - create or replace temporary view ups_fk_check as - select {s_checked}, count(*) as nrows + drop view if exists ups_pk_check cascade; + create temporary view ups_pk_check as + select {pkcollist}, count(*) as nrows from {stg_schema}.{table} as s - left join {uq_schema}.{uq_table} as u on {u_join} + group by {pkcollist} + having count(*) > 1; """, - ).format( - s_checked=SQL(fk_df["s_checked"][0]), - stg_schema=Identifier(stg_schema), - table=Identifier(table), - uq_schema=Identifier(const_df["uq_schema"][0]), - uq_table=Identifier(const_df["uq_table"][0]), - u_join=SQL(fk_df["u_join"][0]), - ) - if su_exists: - query += SQL( - """ left join {stg_schema}.{uq_table} as su on {su_join}""", ).format( - stg_schema=Identifier(stg_schema), - uq_table=Identifier(const_df["uq_table"][0]), - su_join=SQL(fk_df["su_join"][0]), - ) - query += SQL(" where u.{uq_column} is null").format( - uq_column=Identifier(const_df["uq_column"][0]), + pkcollist=pk_cols, + stg_schema=Identifier(self.stg_schema), + table=Identifier(table), + ), ) - if su_exists: - query += SQL(" and su.{uq_column} is null").format( - uq_column=Identifier(const_df["uq_column"][0]), - ) - query += SQL( - """ and {s_not_null} - group by {s_checked};""", - ).format( - s_not_null=SQL(fk_df["s_not_null"][0]), - s_checked=SQL(fk_df["s_checked"][0]), + pk_errs, pk_headers, pk_rowcount = self.db.rowdict( + "select * from ups_pk_check;", ) - - db.execute(query) - fk_check_df = db.dataframe("select * from ups_fk_check;") - - if not fk_check_df.is_empty(): + if pk_rowcount > 0: logger.warning( - f" Foreign key error referencing {const_df['uq_table'][0]}", + f" Duplicate key error in columns {pk_cols.as_string(self.db.cursor())}", ) - logger.debug("") - logger.debug( - tabulate( - fk_check_df.iter_rows(), - headers=fk_check_df.columns, - tablefmt="pipe", - showindex=False, - colalign=["left"] * len(fk_check_df.columns), + pk_errs = list(pk_errs) + tot_errs, tot_headers, tot_rowcount = self.db.rowdict( + SQL( + "select count(*) as errcount, sum(nrows) as total_rows from ups_pk_check;", ), ) + tot_errs = next(iter(tot_errs)) + err_msg = f"{tot_errs['errcount']} duplicate keys ({tot_errs['total_rows']} rows) in table {self.stg_schema}.{table}" # noqa: E501 + 
pk_errors.append(err_msg) logger.debug("") - if interactive: + err_sql = SQL("select * from ups_pk_check;") + logger.debug(f"\n{self._show(err_sql)}") + logger.debug("") + if self.interactive: btn, return_value = TableUI( - "Foreign Key Error", - f"Foreign key error referencing {const_df['uq_table'][0]}", + "Duplicate key error", + err_msg, [ ("Continue", 0, ""), ("Cancel", 1, ""), ], - fk_check_df.columns, - list(fk_check_df.iter_rows()), + pk_headers, + [[row[header] for header in pk_headers] for row in pk_errs], ).activate() if btn != 0: - error_handler(["Script canceled by user."]) - - db.execute( + logger.warning("Script cancelled by user") + sys.exit(0) + if len(pk_errors) > 0: + self.db.execute( SQL( """ - update ups_fk_constraints - set fkerror_values = {fkerror_count} - where constraint_name = {constraint_name} - and table_schema = {table_schema} - and table_name = {table_name}; + update {control_table} + set pk_errors = {pk_errors} + where table_name = {table_name}; """, ).format( - fkerror_count=Literal(fk_check_df["nrows"][0]), - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), + control_table=Identifier(self.control_table), + pk_errors=Literal(",".join(pk_errors)), + table_name=Literal(table), ), ) - db.execute( - SQL( - """ - update ups_fk_constraints - set processed = True - where - constraint_name = {constraint_name} - and table_schema = {table_schema} - and table_name = {table_name}; - """, - ).format( - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), - ), - ) + return self - err_df = db.dataframe( - SQL( - """ - select string_agg( - constraint_name || ' (' || fkerror_values || ')', ', ' - ) as fk_errors - from ups_fk_constraints - where coalesce(fkerror_values, 0) > 0; - """, - ), - ) - return err_df["fk_errors"][0] - - -def qa_all_ckloop( - base_schema: str, - stg_schema: str, - control_table: str, - interactive: bool, -): - ck_errors = [] - while True: - df = db.dataframe(SQL("select * from ups_toprocess;")) - if df.is_empty(): - break - ck_qa_one( - base_schema, - stg_schema, - table=df["table_name"][0], - errors=ck_errors, - interactive=interactive, - ) - err_df = db.dataframe( - "select * from ups_ck_error_list where ck_errors is not null", - ) - if not err_df.is_empty(): - db.execute( + def qa_all_fk(self: PgUpsert) -> PgUpsert: + """Performs foreign key checks for invalid foreign key values in selected staging tables.""" + while True: + rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_toprocess;"), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + self.qa_one_fk(table=rows["table_name"]) + # Set the 'processed' column to True in the control table. 
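The duplicate-key test in `qa_one_pk` above groups the staging table by its full primary key and keeps only groups with more than one row. For a hypothetical composite key, the composed `ups_pk_check` view boils down to:

```py
# Shape of the ups_pk_check view for a hypothetical staging table
# "staging.editions" with primary key (book_id, volume).
stg_schema, table = "staging", "editions"
pk_cols = "book_id, volume"
ups_pk_check = f"""
create temporary view ups_pk_check as
select {pk_cols}, count(*) as nrows
from {stg_schema}.{table} as s
group by {pk_cols}
having count(*) > 1;
"""
print(ups_pk_check)
```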
+ self.db.execute( SQL( """ - update {control_table} - set ck_errors = {ck_errors} - where table_name = {table}; + update ups_proctables + set processed = True + where table_name = {table_name}; """, - ).format( - control_table=Identifier(control_table), - ck_errors=Literal(err_df["ck_errors"][0]), - table=Literal(df["table_name"][0]), - ), + ).format(table_name=Literal(rows["table_name"])), ) + return self - db.execute( - SQL( - """update ups_proctables set processed = True - where table_name = {table_name};""", - ).format(table_name=Literal(df["table_name"][0])), - ) - + def qa_one_fk(self: PgUpsert, table: str) -> PgUpsert: + """Performs foreign key checks for invalid foreign key values in a single staging table. -def ck_qa_one( - base_schema: str, - stg_schema: str, - table: str, - errors: list, - interactive: bool, -): - logger.info(f"Conducting check constraint QA checks on table {stg_schema}.{table}") - validate_table(base_schema, stg_schema, table) - # Create a table of *all* check constraints in this database. - # Because this may be an expensive operation (in terms of time), the - # table is not re-created if it already exists. "Already exists" - # means that a table with the expected name exists. No check is - # done to ensure that this table has the correct structure. The - # goal is to create the table of all check constraints only once to - # minimize the time required if QA checks are to be run on multiple - # staging tables. - if ( - db.execute( - SQL( - """select * from information_schema.tables - where table_name = {ups_check_constraints};""", - ).format(ups_check_constraints=Literal("ups_check_constraints")), - ).rowcount - == 0 - ): - db.execute( - SQL( - """ - drop table if exists ups_check_constraints cascade; - select - nspname as table_schema, - cast(conrelid::regclass as text) as table_name, - conname as constraint_name, - pg_get_constraintdef(pg_constraint.oid) AS consrc - into temporary table ups_check_constraints - from pg_constraint - inner join pg_class on pg_constraint.conrelid = pg_class.oid - inner join pg_namespace on pg_class.relnamespace=pg_namespace.oid - where contype = 'c' and nspname = {base_schema}; - """, - ).format(base_schema=Literal(base_schema)), - ) + :param table: The name of the staging table to check for invalid foreign key values. + :type table: str - # Create a temporary table of just the check constraints for the base - # table corresponding to the staging table to check. Include a - # column for the number of rows failing the check constraint, and a - # 'processed' flag to control looping. - db.execute( - SQL( - """ - drop table if exists ups_sel_cks cascade; - select - constraint_name, table_schema, table_name, consrc, - 0::integer as ckerror_values, - False as processed - into temporary table ups_sel_cks - from ups_check_constraints - where - table_schema = {base_schema} - and table_name = {table}; - """, - ).format(base_schema=Literal(base_schema), table=Literal(table)), - ) + :objects created: - # Process all check constraints. - while True: - df = db.dataframe( - """ - select constraint_name, table_schema, table_name, consrc - from ups_sel_cks - where not processed - limit 1; - """, + - `ups_foreign_key_columns`: Temporary table containing the foreign key columns of the base table. + - `ups_sel_fks`: Temporary table containing the foreign key relationships for the base table. + - `ups_fk_constraints`: Temporary table containing the unique constraint names for the table. 
+ - `ups_one_fk`: Temporary table containing the foreign key relationships for the base table. + - `ups_fk_check`: Temporary view containing the invalid foreign key values. + """ + logger.info( + f"Conducting foreign key QA checks on table {self.stg_schema}.{table}", ) - if df.is_empty(): - break - logger.debug(f" Checking constraint {df['constraint_name'][0]}") - # Create a df with the check constraint sql and remove the 'CHECK' keyword - check_df = db.dataframe( + self._validate_table(table) + # Create a table of *all* foreign key dependencies in this database. + # Only create it once because it may slow the QA process down. + if ( + self.db.execute( + SQL( + """select * from information_schema.tables + where table_name = {ups_foreign_key_columns};""", + ).format(ups_foreign_key_columns=Literal("ups_foreign_key_columns")), + ).rowcount + == 0 + ): + self.db.execute( + SQL( + """ + select + fkinf.constraint_name, + fkinf.table_schema, + fkinf.table_name, + att1.attname as column_name, + fkinf.uq_schema, + cls.relname as uq_table, + att2.attname as uq_column + into + temporary table {ups_foreign_key_columns} + from + (select + ns1.nspname as table_schema, + cls.relname as table_name, + unnest(cons.conkey) as uq_table_id, + unnest(cons.confkey) as table_id, + cons.conname as constraint_name, + ns2.nspname as uq_schema, + cons.confrelid, + cons.conrelid + from + pg_constraint as cons + inner join pg_class as cls on cls.oid = cons.conrelid + inner join pg_namespace ns1 on ns1.oid = cls.relnamespace + inner join pg_namespace ns2 on ns2.oid = cons.connamespace + where + cons.contype = 'f' + ) as fkinf + inner join pg_attribute att1 on + att1.attrelid = fkinf.conrelid and att1.attnum = fkinf.uq_table_id + inner join pg_attribute att2 on + att2.attrelid = fkinf.confrelid and att2.attnum = fkinf.table_id + inner join pg_class cls on cls.oid = fkinf.confrelid; + """, + ).format(ups_foreign_key_columns=Identifier("ups_foreign_key_columns")), + ) + # Create a temporary table of just the foreign key relationships for the base + # table corresponding to the staging table to check. + self.db.execute( SQL( """ + drop table if exists ups_sel_fks cascade; select - regexp_replace(consrc, '^CHECK\\s*\\((.*)\\)$', '\\1') as check_sql - from ups_sel_cks + constraint_name, table_schema, table_name, + column_name, uq_schema, uq_table, uq_column + into + temporary table ups_sel_fks + from + ups_foreign_key_columns where - constraint_name = {constraint_name} - and table_schema = {table_schema} - and table_name = {table_name}; + table_schema = {base_schema} + and table_name = {table}; """, - ).format( - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), - ), + ).format(base_schema=Literal(self.base_schema), table=Literal(table)), ) - # Run the check_sql - db.execute( + # Create a temporary table of all unique constraint names for + # this table, with an integer column to be populated with the + # number of rows failing the foreign key check, and a 'processed' + # flag to control looping. 
+ self.db.execute( SQL( """ - create or replace temporary view ups_ck_check_check as - select count(*) from {stg_schema}.{table} - where not ({check_sql}) - """, - ).format( - stg_schema=Identifier(stg_schema), - table=Identifier(table), - check_sql=SQL(check_df["check_sql"][0]), + drop table if exists ups_fk_constraints cascade; + select distinct + constraint_name, table_schema, table_name, + 0::integer as fkerror_values, + False as processed + into temporary table ups_fk_constraints + from ups_sel_fks; + """, ), ) - - ck_check = db.dataframe("select * from ups_ck_check_check;") - if ck_check["count"][0] > 0: - logger.warning( - f" Check constraint {df['constraint_name'][0]} has {ck_check['count'][0]} failing rows", # noqa: E501 + while True: + # Create a view to select one constraint to process. + rows, headers, rowcount = self.db.rowdict( + SQL( + """select constraint_name, table_schema, table_name + from ups_fk_constraints where not processed limit 1;""", + ), ) - - db.execute( + if rowcount == 0: + break + rows = next(iter(rows)) + logger.debug(f" Checking constraint {rows['constraint_name']}") + self.db.execute( SQL( """ - update ups_sel_cks - set ckerror_values = {ckerror_count} + drop table if exists ups_one_fk cascade; + select column_name, uq_schema, uq_table, uq_column + into temporary table ups_one_fk + from ups_sel_fks where constraint_name = {constraint_name} and table_schema = {table_schema} and table_name = {table_name}; + """, + ).format( + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), + ), + ) + const_rows, const_headers, const_rowcount = self.db.rowdict( + "select * from ups_one_fk;", + ) + if const_rowcount == 0: + logger.debug(" No foreign key columns found") + break + const_rows = next(iter(const_rows)) + # Create join expressions from staging table (s) to unique table (u) + # and to staging table equivalent to unique table (su) (though we + # don't know yet if the latter exists). Also create a 'where' + # condition to ensure that all columns being matched are non-null. + # Also create a comma-separated list of the columns being checked. + fk_rows, fk_headers, fk_rowcount = self.db.rowdict( + SQL( + """ + select + string_agg('s.' || column_name || ' = u.' || uq_column, ' and ') as u_join, + string_agg('s.' || column_name || ' = su.' || uq_column, ' and ') as su_join, + string_agg('s.' || column_name || ' is not null', ' and ') as s_not_null, + string_agg('s.' || column_name, ', ') as s_checked + from + (select * from ups_one_fk) as fkcols; + """, + ), + ) + fk_rows = next(iter(fk_rows)) + # Determine whether a staging-table equivalent of the unique table exists. + su_exists = False + if ( + self.db.execute( + SQL( + """select * from information_schema.tables + where table_name = {table} and table_schema = {stg_schema};""", + ).format( + table=Literal(const_rows["uq_table"]), + stg_schema=Literal(self.stg_schema), + ), + ).rowcount + > 0 + ): + su_exists = True + # Construct a query to test for missing unique values for fk columns. 
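The `string_agg` expressions above collapse the constraint's columns into four text fragments: join conditions to the referenced table (`u_join`), to its staging twin (`su_join`), a not-null guard (`s_not_null`), and the checked column list (`s_checked`), which the `ups_fk_check` query composed next stitches together. What they produce for a hypothetical composite foreign key:

```py
# Hypothetical composite FK (book_id, volume) referencing editions(book_id, volume);
# each pair is (column_name, uq_column) as selected into ups_one_fk.
fk_cols = [("book_id", "book_id"), ("volume", "volume")]
u_join = " and ".join(f"s.{c} = u.{u}" for c, u in fk_cols)
su_join = " and ".join(f"s.{c} = su.{u}" for c, u in fk_cols)
s_not_null = " and ".join(f"s.{c} is not null" for c, _ in fk_cols)
s_checked = ", ".join(f"s.{c}" for c, _ in fk_cols)
print(u_join)       # s.book_id = u.book_id and s.volume = u.volume
print(s_not_null)   # s.book_id is not null and s.volume is not null
print(s_checked)    # s.book_id, s.volume
```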
+ query = SQL( + """ + drop view if exists ups_fk_check cascade; + create or replace temporary view ups_fk_check as + select {s_checked}, count(*) as nrows + from {stg_schema}.{table} as s + left join {uq_schema}.{uq_table} as u on {u_join} + """, + ).format( + s_checked=SQL(fk_rows["s_checked"]), + stg_schema=Identifier(self.stg_schema), + table=Identifier(table), + uq_schema=Identifier(const_rows["uq_schema"]), + uq_table=Identifier(const_rows["uq_table"]), + u_join=SQL(fk_rows["u_join"]), + ) + if su_exists: + query += SQL( + """ left join {stg_schema}.{uq_table} as su on {su_join}""", + ).format( + stg_schema=Identifier(self.stg_schema), + uq_table=Identifier(const_rows["uq_table"]), + su_join=SQL(fk_rows["su_join"]), + ) + query += SQL(" where u.{uq_column} is null").format( + uq_column=Identifier(const_rows["uq_column"]), + ) + if su_exists: + query += SQL(" and su.{uq_column} is null").format( + uq_column=Identifier(const_rows["uq_column"]), + ) + query += SQL( + """ and {s_not_null} + group by {s_checked};""", + ).format( + s_not_null=SQL(fk_rows["s_not_null"]), + s_checked=SQL(fk_rows["s_checked"]), + ) + self.db.execute(query) + check_sql = SQL("select * from ups_fk_check;") + fk_check_rows, fk_check_headers, fk_check_rowcount = self.db.rowdict( + check_sql, + ) + if fk_check_rowcount > 0: + fk_check_rows = next(iter(fk_check_rows)) + logger.warning( + f" Foreign key error referencing {const_rows['uq_schema']}.{const_rows['uq_table']}", + ) + logger.debug("") + logger.debug(f"\n{self._show(check_sql)}") + logger.debug("") + if self.interactive: + btn, return_value = TableUI( + "Foreign key error", + f"Foreign key error referencing {const_rows['uq_schema']}.{const_rows['uq_table']}", + [ + ("Continue", 0, ""), + ("Cancel", 1, ""), + ], + fk_check_headers, + [[row[header] for header in fk_check_headers] for row in [fk_check_rows]], + ).activate() + if btn != 0: + logger.warning("Script cancelled by user") + sys.exit(0) + + self.db.execute( + SQL( + """ + update ups_fk_constraints + set fkerror_values = {fkerror_count} + where constraint_name = {constraint_name} + and table_schema = {table_schema} + and table_name = {table_name}; """, + ).format( + fkerror_count=Literal(fk_check_rows["nrows"]), + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), + ), + ) + self.db.execute( + SQL( + """ + update ups_fk_constraints + set processed = True + where + constraint_name = {constraint_name} + and table_schema = {table_schema} + and table_name = {table_name}; + """, ).format( - ckerror_count=Literal(ck_check["count"][0]), - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), ), ) - - db.execute( + err_rows, err_headers, err_rowcount = self.db.rowdict( SQL( """ - update ups_sel_cks - set processed = True - where - constraint_name = {constraint_name} - and table_schema = {table_schema} - and table_name = {table_name}; + select string_agg( + constraint_name || ' (' || fkerror_values || ')', ', ' + ) as fk_errors + from ups_fk_constraints + where coalesce(fkerror_values, 0) > 0; """, - ).format( - constraint_name=Literal(df["constraint_name"][0]), - table_schema=Literal(df["table_schema"][0]), - table_name=Literal(df["table_name"][0]), ), ) + if err_rowcount > 0: + err_rows = 
list(err_rows) + # If any 'fk_errors' key is not None in the list of dictionaries, + # update the control table with the list of foreign key errors. + if any(err["fk_errors"] for err in err_rows): + self.db.execute( + SQL( + """ + update {control_table} + set fk_errors = {fk_errors} + where table_name = {table_name}; + """, + ).format( + control_table=Identifier(self.control_table), + fk_errors=Literal( + ",".join( + [err["fk_errors"] for err in err_rows if err["fk_errors"]], + ), + ), + table_name=Literal(table), + ), + ) + return self - # Update the control table with the number of rows failing the check constraint. - db.execute( - SQL( - """ - create or replace temporary view ups_ck_error_list as - select string_agg( - constraint_name || ' (' || ckerror_values || ')', ', ' - ) as ck_errors - from ups_sel_cks - where coalesce(ckerror_values, 0) > 0; - """, - ), - ) + def qa_all_ck(self: PgUpsert) -> PgUpsert: + """Performs check constraint checks for invalid check constraint values in selected staging tables. + :objects created: -def upsert_all( - base_schema: str, - stg_schema: str, - control_table: str, - upsert_method: str, -): - validate_control(base_schema, stg_schema, control_table) + - `ups_check_constraints`: Temporary table containing the check constraints of the base table. + - `ups_sel_cks`: Temporary table containing the check constraints for the base table. + - `ups_ck_check_check`: Temporary view containing the check constraint values. + - `ups_ck_error_list`: Temporary table containing the list of check constraint errors. + """ + while True: + rows, headers, rowcount = self.db.rowdict( + SQL("select * from ups_toprocess;"), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + self.qa_one_ck(table=rows["table_name"]) + err_rows, err_headers, err_rowcount = self.db.rowdict( + "select * from ups_ck_error_list;", + ) + if err_rowcount > 0: + self.db.execute( + SQL( + """ + update {control_table} + set ck_errors = {ck_errors} + where table_name = {table_name}; + """, + ).format( + control_table=Identifier(self.control_table), + ck_errors=Literal(next(iter(err_rows))["ck_errors"]), + table_name=Literal(rows["table_name"]), + ), + ) + # Set the 'processed' column to True in the control table. + self.db.execute( + SQL( + """ + update ups_proctables + set processed = True + where table_name = {table_name}; + """, + ).format(table_name=Literal(rows["table_name"])), + ) + return self - # Get a table of all dependencies for the base schema. - db.execute( - SQL( - """ - drop table if exists ups_dependencies cascade; - create temporary table ups_dependencies as - select - tc.table_name as child, - tu.table_name as parent - from - information_schema.table_constraints as tc - inner join information_schema.constraint_table_usage as tu - on tu.constraint_name = tc.constraint_name - where - tc.constraint_type = 'FOREIGN KEY' - and tc.table_name <> tu.table_name - and tc.table_schema = {base_schema}; - """, - ).format(base_schema=Literal(base_schema)), - ) + def qa_one_ck(self: PgUpsert, table: str) -> PgUpsert: + """Performs check constraint checks for invalid check constraint values in a single staging table. - # Create a list of tables in the base schema ordered by dependency. 
- db.execute( - SQL( - """ - drop table if exists ups_ordered_tables cascade; - with recursive dep_depth as ( - select - dep.child as first_child, - dep.child, - dep.parent, - 1 as lvl - from - ups_dependencies as dep - union all - select - dd.first_child, - dep.child, - dep.parent, - dd.lvl + 1 as lvl - from - dep_depth as dd - inner join ups_dependencies as dep on dep.parent = dd.child - and dep.child <> dd.parent - and not (dep.parent = dd.first_child and dd.lvl > 2) - ) - select - table_name, - table_order - into - temporary table ups_ordered_tables - from ( - select - dd.parent as table_name, - max(lvl) as table_order - from - dep_depth as dd - group by - table_name - union - select - dd.child as table_name, - max(lvl) + 1 as level - from - dep_depth as dd - left join ups_dependencies as dp on dp.parent = dd.child - where - dp.parent is null - group by - dd.child - union - select distinct - t.table_name, - 0 as level - from - information_schema.tables as t - left join ups_dependencies as p on t.table_name=p.parent - left join ups_dependencies as c on t.table_name=c.child - where - t.table_schema = {base_schema} - and t.table_type = 'BASE TABLE' - and p.parent is null - and c.child is null - ) as all_levels; - """, - ).format(base_schema=Literal(base_schema)), - ) + :param table: The name of the staging table to check for invalid check constraint values. + :type table: str - # Create a list of the selected tables with ordering information. - db.execute( - SQL( - """ - drop table if exists ups_proctables cascade; - select - ot.table_order, - tl.table_name, - tl.exclude_cols, - tl.interactive, - tl.rows_updated, - tl.rows_inserted, - False::boolean as processed - into - temporary table ups_proctables - from - {control_table} as tl - inner join ups_ordered_tables as ot on ot.table_name = tl.table_name - ; - """, - ).format(control_table=Identifier(control_table)), - ) + :objects created: - while True: - # Create a view returning a single unprocessed table, in order. - proc_df = db.dataframe( + - `ups_sel_cks`: Temporary table containing the check constraints for the base table. + - `ups_ck_check_check`: Temporary view containing the check constraint values. + - `ups_ck_error_list`: Temporary table containing the list of check constraint errors. + """ + logger.info( + f"Conducting check constraint QA checks on table {self.stg_schema}.{table}", + ) + # Create a table of *all* check constraints in this database. + # Because this may be an expensive operation (in terms of time), the + # table is not re-created if it already exists. "Already exists" + # means that a table with the expected name exists. No check is + # done to ensure that this table has the correct structure. The + # goal is to create the table of all check constraints only once to + # minimize the time required if QA checks are to be run on multiple + # staging tables. 
+ if ( + self.db.execute( + SQL( + """select * from information_schema.tables + where table_name = {ups_check_constraints};""", + ).format(ups_check_constraints=Literal("ups_check_constraints")), + ).rowcount + == 0 + ): + self.db.execute( + SQL( + """ + drop table if exists ups_check_constraints cascade; + select + nspname as table_schema, + cast(conrelid::regclass as text) as table_name, + conname as constraint_name, + pg_get_constraintdef(pg_constraint.oid) AS consrc + into temporary table ups_check_constraints + from pg_constraint + inner join pg_class on pg_constraint.conrelid = pg_class.oid + inner join pg_namespace on pg_class.relnamespace=pg_namespace.oid + where contype = 'c' and nspname = {base_schema}; + """, + ).format(base_schema=Literal(self.base_schema)), + ) + + # Create a temporary table of just the check constraints for the base + # table corresponding to the staging table to check. Include a + # column for the number of rows failing the check constraint, and a + # 'processed' flag to control looping. + self.db.execute( SQL( """ + drop table if exists ups_sel_cks cascade; select - table_name, exclude_cols, interactive, - rows_updated, rows_inserted - from ups_proctables - where not processed - order by table_order - limit 1; + constraint_name, table_schema, table_name, consrc, + 0::integer as ckerror_values, + False as processed + into temporary table ups_sel_cks + from ups_check_constraints + where + table_schema = {base_schema} + and table_name = {table}; """, - ), - ) - if proc_df.is_empty(): - break - - rows_updated, rows_inserted = upsert_one( - base_schema, - stg_schema, - upsert_method, - proc_df["table_name"][0], - proc_df["exclude_cols"][0].split(",") if proc_df["exclude_cols"][0] else [], - proc_df["interactive"][0], + ).format(base_schema=Literal(self.base_schema), table=Literal(table)), ) - db.execute( + # Process all check constraints. + while True: + rows, headers, rowcount = self.db.rowdict( + SQL( + """select constraint_name, table_schema, table_name, consrc + from ups_sel_cks where not processed limit 1;""", + ), + ) + if rowcount == 0: + break + rows = next(iter(rows)) + logger.debug(f" Checking constraint {rows['constraint_name']}") + # Remove the 'CHECK' keyword from the constraint definition. 
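The comment above is implemented just below with `regexp_replace`: `pg_get_constraintdef()` returns the full `CHECK (...)` clause, and only the inner predicate is kept so it can be negated against the staging table. The same transformation in Python, on a hypothetical constraint definition:

```py
# Equivalent of regexp_replace(consrc, '^CHECK\s*\((.*)\)$', '\1') for a
# hypothetical constraint definition returned by pg_get_constraintdef().
import re

consrc = "CHECK ((price > 0))"
check_sql = re.sub(r"^CHECK\s*\((.*)\)$", r"\1", consrc)
print(check_sql)   # (price > 0)
# The failing-row count is then taken from "... where not ((price > 0))".
```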
+ const_rows, const_headers, const_rowcount = self.db.rowdict( + SQL( + """ + select + regexp_replace(consrc, '^CHECK\\s*\\((.*)\\)$', '\\1') as check_sql + from ups_sel_cks + where + constraint_name = {constraint_name} + and table_schema = {table_schema} + and table_name = {table_name}; + """, + ).format( + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), + ), + ) + const_rows = next(iter(const_rows)) + # Run the check_sql + self.db.execute( + SQL( + """ + create or replace temporary view ups_ck_check_check as + select count(*) from {stg_schema}.{table} + where not ({check_sql}) + """, + ).format( + stg_schema=Identifier(self.stg_schema), + table=Identifier(table), + check_sql=SQL(const_rows["check_sql"]), + ), + ) + + ck_check_rows, ck_check_headers, ck_check_rowcount = self.db.rowdict( + "select * from ups_ck_check_check where count > 0;", + ) + if ck_check_rowcount > 0: + ck_check_rows = next(iter(ck_check_rows)) + logger.warning( + f" Check constraint {rows['constraint_name']} has {ck_check_rowcount} failing rows", + ) + self.db.execute( + SQL( + """ + update ups_sel_cks + set ckerror_values = {ckerror_count} + where + constraint_name = {constraint_name} + and table_schema = {table_schema} + and table_name = {table_name}; + """, + ).format( + ckerror_count=Literal(ck_check_rows["count"]), + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), + ), + ) + self.db.execute( + SQL( + """ + update ups_sel_cks + set processed = True + where + constraint_name = {constraint_name} + and table_schema = {table_schema} + and table_name = {table_name}; + """, + ).format( + constraint_name=Literal(rows["constraint_name"]), + table_schema=Literal(rows["table_schema"]), + table_name=Literal(rows["table_name"]), + ), + ) + + # Update the control table with the number of rows failing the check constraint. + self.db.execute( SQL( """ - update ups_proctables - set rows_updated = {rows_updated}, - rows_inserted = {rows_inserted} - where table_name = {table_name}; + create or replace temporary view ups_ck_error_list as + select string_agg( + constraint_name || ' (' || ckerror_values || ')', ', ' + ) as ck_errors + from ups_sel_cks + where coalesce(ckerror_values, 0) > 0; """, - ).format( - rows_updated=Literal(rows_updated), - rows_inserted=Literal(rows_inserted), - table_name=Literal(proc_df["table_name"][0]), ), ) + return self - db.execute( + def upsert_all(self: PgUpsert) -> PgUpsert: + """Performs upsert operations on all selected tables in the base schema. + + :objects created: + + - `ups_dependencies`: Temporary table containing the dependencies of the base schema. + - `ups_ordered_tables`: Temporary table containing the selected tables ordered by dependency. + - `ups_proctables`: Temporary table containing the selected tables with ordering information. + """ + self._validate_control() + if not self.qa_passed: + logger.warning( + "QA checks have not been run or have failed. Continuing anyway.", + ) + logger.info(f"===Starting upsert procedures (COMMIT={self.do_commit})===") + # Get a table of all dependencies for the base schema. + self.db.execute( SQL( """ - update ups_proctables - set processed = True - where table_name = {table_name}; - """, - ).format(table_name=Literal(proc_df["table_name"][0])), - ) - - # Move the update/insert counts back into the control table. 
- db.execute( - SQL( - """ - update {control_table} as ct - set - rows_updated = pt.rows_updated, - rows_inserted = pt.rows_inserted + drop table if exists ups_dependencies cascade; + create temporary table ups_dependencies as + select + tc.table_name as child, + tu.table_name as parent from - ups_proctables as pt + information_schema.table_constraints as tc + inner join information_schema.constraint_table_usage as tu + on tu.constraint_name = tc.constraint_name where - pt.table_name = ct.table_name; + tc.constraint_type = 'FOREIGN KEY' + and tc.table_name <> tu.table_name + and tc.table_schema = {base_schema}; """, - ).format(control_table=Identifier(control_table)), - ) + ).format(base_schema=Literal(self.base_schema)), + ) + # Create a list of tables in the base schema ordered by dependency. + self.db.execute( + SQL( + """ + drop table if exists ups_ordered_tables cascade; + with recursive dep_depth as ( + select + dep.child as first_child, + dep.child, + dep.parent, + 1 as lvl + from + ups_dependencies as dep + union all + select + dd.first_child, + dep.child, + dep.parent, + dd.lvl + 1 as lvl + from + dep_depth as dd + inner join ups_dependencies as dep on dep.parent = dd.child + and dep.child <> dd.parent + and not (dep.parent = dd.first_child and dd.lvl > 2) + ) + select + table_name, + table_order + into + temporary table ups_ordered_tables + from ( + select + dd.parent as table_name, + max(lvl) as table_order + from + dep_depth as dd + group by + table_name + union + select + dd.child as table_name, + max(lvl) + 1 as level + from + dep_depth as dd + left join ups_dependencies as dp on dp.parent = dd.child + where + dp.parent is null + group by + dd.child + union + select distinct + t.table_name, + 0 as level + from + information_schema.tables as t + left join ups_dependencies as p on t.table_name=p.parent + left join ups_dependencies as c on t.table_name=c.child + where + t.table_schema = {base_schema} + and t.table_type = 'BASE TABLE' + and p.parent is null + and c.child is null + ) as all_levels; + """, + ).format(base_schema=Literal(self.base_schema)), + ) + # Create a list of the selected tables with ordering information. + self.db.execute( + SQL( + """ + drop table if exists ups_proctables cascade; + select + ot.table_order, + tl.table_name, + tl.exclude_cols, + tl.interactive, + False::boolean as processed + into + temporary table ups_proctables + from + {control_table} as tl + inner join ups_ordered_tables as ot on ot.table_name = tl.table_name + ; + """, + ).format(control_table=Identifier(self.control_table)), + ) + while True: + # Create a view returning a single unprocessed table, in order. + proc_rows, proc_headers, proc_rowcount = self.db.rowdict( + SQL( + """ + select + table_name, exclude_cols, interactive + from ups_proctables + where not processed + order by table_order + limit 1; + """, + ), + ) + if proc_rowcount == 0: + break + proc_rows = next(iter(proc_rows)) + self.upsert_one(proc_rows["table_name"]) + self.db.execute( + SQL( + """ + update ups_proctables + set processed = True + where table_name = {table_name}; + """, + ).format(table_name=Literal(proc_rows["table_name"])), + ) + return self + def upsert_one(self: PgUpsert, table: str) -> PgUpsert: + """Performs an upsert operation on a single table. -def upsert_one( - base_schema: str, - stg_schema: str, - upsert_method: str, - table: str, - exclude_cols: list[str], - interactive: bool = False, -): - rows_updated = 0 - rows_inserted = 0 + :param table: The name of the table to upsert. 
+ :type table: str - logger.info(f"Performing upsert on table {base_schema}.{table}") - validate_table(base_schema, stg_schema, table) + :objects created: - # Populate a (temporary) table with the names of the columns - # in the base table that are to be updated from the staging table. - # Include only those columns from staging table that are also in base table. - # db.execute( - query = SQL( + - `ups_cols`: Temporary table containing the columns to be updated. + - `ups_pks`: Temporary table containing the primary key columns. + - `ups_fk_check`: Temporary view containing the foreign key check. + - `ups_toprocess`: Temporary table containing the tables to be processed. """ - drop table if exists ups_cols cascade; - select s.column_name - into temporary table ups_cols - from information_schema.columns as s - inner join information_schema.columns as b on s.column_name=b.column_name - where - s.table_schema = {stg_schema} - and s.table_name = {table} - and b.table_schema = {base_schema} - and b.table_name = {table} - """, - ).format( - stg_schema=Literal(stg_schema), - table=Literal(table), - base_schema=Literal(base_schema), - ) - if exclude_cols: - query += SQL( - """ - and s.column_name not in ({exclude_cols}) + rows_updated = 0 + rows_inserted = 0 + logger.info(f"Performing upsert on table {self.base_schema}.{table}") + self._validate_table(table) + + spec_rows, spec_headers, spec_rowcount = self.db.rowdict( + SQL( + """ + select table_name, exclude_cols, interactive + from {control_table} + where table_name = {table}; """, - ).format( - exclude_cols=SQL(",").join(Literal(col) for col in exclude_cols), + ).format( + control_table=Identifier(self.control_table), + table=Literal(table), + ), ) - query += SQL(" order by s.ordinal_position;") - db.execute(query) - - # Populate a (temporary) table with the names of the primary key - # columns of the base table. - db.execute( - SQL( - """ - drop table if exists ups_pks cascade; - select k.column_name - into temporary table ups_pks - from information_schema.table_constraints as tc - inner join information_schema.key_column_usage as k - on tc.constraint_type = 'PRIMARY KEY' - and tc.constraint_name = k.constraint_name - and tc.constraint_catalog = k.constraint_catalog - and tc.constraint_schema = k.constraint_schema - and tc.table_schema = k.table_schema - and tc.table_name = k.table_name - and tc.constraint_name = k.constraint_name - where - k.table_name = {table} - and k.table_schema = {base_schema} - order by k.ordinal_position; - """, - ).format(table=Literal(table), base_schema=Literal(base_schema)), - ) - - # Get all base table columns that are to be updated into a comma-delimited list. - all_col_list = db.dataframe( - SQL( - """ - select string_agg(column_name, ', ') as cols from ups_cols;""", - ), - )["cols"][0] - - # Get all base table columns that are to be updated into a - # comma-delimited list with a "b." prefix. - base_col_list = db.dataframe( - SQL( - """ - select string_agg('b.' || column_name, ', ') as cols - from ups_cols;""", - ), - )["cols"][0] - - # Get all staging table column names for columns that are to be updated - # into a comma-delimited list with an "s." prefix. - stg_col_list = db.dataframe( - SQL( - """ - select string_agg('s.' || column_name, ', ') as cols - from ups_cols;""", - ), - )["cols"][0] - - # Get the primary key columns in a comma-delimited list. 
- pk_col_list = db.dataframe( - SQL( - """ - select string_agg(column_name, ', ') as cols - from ups_pks;""", - ), - )["cols"][0] - - # Create a join expression for key columns of the base (b) and - # staging (s) tables. - join_expr = db.dataframe( - SQL( - """ - select - string_agg('b.' || column_name || ' = s.' || column_name, ' and ') as expr - from - ups_pks; - """, - ), - )["expr"][0] - - # Create a FROM clause for an inner join between base and staging - # tables on the primary key column(s). - from_clause = SQL( - """FROM {base_schema}.{table} as b - INNER JOIN {stg_schema}.{table} as s ON {join_expr}""", - ).format( - base_schema=Identifier(base_schema), - table=Identifier(table), - stg_schema=Identifier(stg_schema), - join_expr=SQL(join_expr), - ) - - # Create SELECT queries to pull all columns with matching keys from both - # base and staging tables. - db.execute( - SQL( + if spec_rowcount == 0: + logger.warning(f"Table {table} not found in control table") + return self + spec_rows = next(iter(spec_rows)) + # Populate a (temporary) table with the names of the columns + # in the base table that are to be updated from the staging table. + # Include only those columns from staging table that are also in base table. + query = SQL( """ - drop view if exists ups_basematches cascade; - create temporary view ups_basematches as select {base_col_list} {from_clause}; - - drop view if exists ups_stgmatches cascade; - create temporary view ups_stgmatches as select {stg_col_list} {from_clause}; - """, + drop table if exists ups_cols cascade; + select s.column_name + into temporary table ups_cols + from information_schema.columns as s + inner join information_schema.columns as b on s.column_name=b.column_name + where + s.table_schema = {stg_schema} + and s.table_name = {table} + and b.table_schema = {base_schema} + and b.table_name = {table} + """, ).format( - base_col_list=SQL(base_col_list), - stg_col_list=SQL(stg_col_list), - from_clause=from_clause, - ), - ) - # Get non-key columns to be updated - db.execute( - SQL( - """ - drop view if exists ups_nk cascade; - create temporary view ups_nk as - select column_name from ups_cols - except - select column_name from ups_pks; - """, - ), - ) - # Prompt user to examine matching data and commit, don't commit, or quit. - - do_updates = False - update_stmt = None - # if not stg_df.is_empty() and not nk_df.is_empty(): - if upsert_method in ("upsert", "update"): - stg_curs = db.execute("select * from ups_stgmatches;") - stg_cols = [col.name for col in stg_curs.description] - stg_rowcount = stg_curs.rowcount - stg_data = stg_curs.fetchall() - nk_curs = db.execute("select * from ups_nk;") - # nk_cols = [col.name for col in nk_curs.description] - nk_rowcount = nk_curs.rowcount - # nk_data = nk_curs.fetchall() - if stg_rowcount > 0 and nk_rowcount > 0: - base_curs = db.execute("select * from ups_basematches;") - base_cols = [col.name for col in base_curs.description] - # base_rowcount = base_curs.rowcount - base_data = base_curs.fetchall() - - if interactive: - btn, return_value = CompareUI( - "Compare Tables", - f"Do you want to make these changes? 
For table {table}, new data are shown in the top table; existing data are shown in the bottom table.", # noqa: E501 - [ - ("Continue", 0, ""), - ("Skip", 1, ""), - ("Cancel", 2, ""), - ], - stg_cols, - stg_data, - base_cols, - base_data, - pk_col_list.split(", "), - sidebyside=False, - ).activate() - else: - btn = 0 - if btn == 2: - error_handler(["Upsert cancelled"]) - if btn == 0: - do_updates = True - # Create an assignment expression to update non-key columns of the - # base table (un-aliased) from columns of the staging table (as s). - ups_expr = db.dataframe( - SQL( + stg_schema=Literal(self.stg_schema), + table=Literal(table), + base_schema=Literal(self.base_schema), + ) + if spec_rows["exclude_cols"]: + query += SQL( + """ + and s.column_name not in ({exclude_cols}) + """, + ).format( + exclude_cols=SQL(",").join( + Literal(col) for col in spec_rows["exclude_cols"] if spec_rows["exclude_cols"] + ), + ) + query += SQL(" order by s.ordinal_position;") + self.db.execute(query) + # Populate a (temporary) table with the names of the primary key + # columns of the base table. + self.db.execute( + SQL( + """ + drop table if exists ups_pks cascade; + select k.column_name + into temporary table ups_pks + from information_schema.table_constraints as tc + inner join information_schema.key_column_usage as k + on tc.constraint_type = 'PRIMARY KEY' + and tc.constraint_name = k.constraint_name + and tc.constraint_catalog = k.constraint_catalog + and tc.constraint_schema = k.constraint_schema + and tc.table_schema = k.table_schema + and tc.table_name = k.table_name + and tc.constraint_name = k.constraint_name + where + k.table_name = {table} + and k.table_schema = {base_schema} + order by k.ordinal_position; + """, + ).format(table=Literal(table), base_schema=Literal(self.base_schema)), + ) + # Get all base table columns that are to be updated into a comma-delimited list. + all_col_list = self.db.execute( + SQL( + """ + select string_agg(column_name, ', ') as cols from ups_cols;""", + ), + ).fetchone() + if not all_col_list: + logger.warning("No columns found in base table") + return self + all_col_list = next(iter(all_col_list)) + # Get all base table columns that are to be updated into a + # comma-delimited list with a "b." prefix. + base_col_list = self.db.execute( + SQL( + """ + select string_agg('b.' || column_name, ', ') as cols + from ups_cols;""", + ), + ).fetchone() + if not base_col_list: + logger.warning("No columns found in base table") + return self + base_col_list = next(iter(base_col_list)) + # Get all staging table column names for columns that are to be updated + # into a comma-delimited list with an "s." prefix. + stg_col_list = self.db.execute( + SQL( + """ + select string_agg('s.' || column_name, ', ') as cols + from ups_cols;""", + ), + ).fetchone() + if not stg_col_list: + logger.warning("No columns found in staging table") + return self + stg_col_list = next(iter(stg_col_list)) + # Get the primary key columns in a comma-delimited list. + pk_col_list = self.db.execute( + SQL( + """ + select string_agg(column_name, ', ') as cols + from ups_pks;""", + ), + ).fetchone() + if not pk_col_list: + logger.warning("Base table has no primary key") + return self + pk_col_list = next(iter(pk_col_list)) + # Create a join expression for key columns of the base (b) and + # staging (s) tables. + join_expr = self.db.execute( + SQL( + """ + select + string_agg('b.' || column_name || ' = s.' 
|| column_name, ' and ') as expr + from + ups_pks; + """, + ), + ).fetchone() + if not join_expr: + logger.warning("Base table has no primary key") + return self + # Create a FROM clause for an inner join between base and staging + # tables on the primary key column(s). + from_clause = SQL( + """FROM {base_schema}.{table} as b + INNER JOIN {stg_schema}.{table} as s ON {join_expr}""", + ).format( + base_schema=Identifier(self.base_schema), + table=Identifier(table), + stg_schema=Identifier(self.stg_schema), + join_expr=SQL(join_expr[0]), + ) + # Create SELECT queries to pull all columns with matching keys from both + # base and staging tables. + self.db.execute( + SQL( + """ + drop view if exists ups_basematches cascade; + create temporary view ups_basematches as select {base_col_list} {from_clause}; + + drop view if exists ups_stgmatches cascade; + create temporary view ups_stgmatches as select {stg_col_list} {from_clause}; + """, + ).format( + base_col_list=SQL(base_col_list), + stg_col_list=SQL(stg_col_list), + from_clause=from_clause, + ), + ) + # Get non-key columns to be updated + self.db.execute( + SQL( + """ + drop view if exists ups_nk cascade; + create temporary view ups_nk as + select column_name from ups_cols + except + select column_name from ups_pks; + """, + ), + ) + do_updates = False + update_stmt = None + # Prepare updates + if self.upsert_method in ("upsert", "update"): + stg_curs = self.db.execute("select * from ups_stgmatches;") + if stg_curs.rowcount == 0: + logger.debug( + " No rows in staging table matching primary key in base table", + ) + stg_cols = [col.name for col in stg_curs.description] + stg_rowcount = stg_curs.rowcount + stg_data = stg_curs.fetchall() + nk_curs = self.db.execute("select * from ups_nk;") + nk_rowcount = nk_curs.rowcount + if stg_rowcount > 0 and nk_rowcount > 0: + base_curs = self.db.execute("select * from ups_basematches;") + if base_curs.rowcount == 0: + logger.debug( + " No rows in base table matching primary key in staging table", + ) + return self + base_cols = [col.name for col in base_curs.description] + base_data = base_curs.fetchall() + if spec_rows["interactive"]: + btn, return_value = CompareUI( + "Compare Tables", + f"Do you want to make these changes? For table {table}, new data are shown in the top table; existing data are shown in the bottom table.", # noqa: E501 + [ + ("Continue", 0, ""), + ("Skip", 1, ""), + ("Cancel", 2, ""), + ], + stg_cols, + stg_data, + base_cols, + base_data, + pk_col_list.split(", "), + sidebyside=False, + ).activate() + else: + btn = 0 + if btn == 2: + logger.warning("Script cancelled by user") + sys.exit(0) + if btn == 0: + do_updates = True + # Create an assignment expression to update non-key columns of the + # base table (un-aliased) from columns of the staging table (as s). + ups_expr = self.db.execute( + SQL( + """ + select string_agg( + column_name || ' = s.' || column_name, ', ' + ) as col + from ups_nk; + """, + ), + ).fetchone() + if not ups_expr: + logger.warning("Unexpected error in upsert_one") + return self + # Create an UPDATE statement to update the base table with + # non-key columns from the staging table. + # No semicolon terminating generated SQL. + update_stmt = SQL( """ - select string_agg( - column_name || ' = s.' 
|| column_name, ', ' - ) as col - from ups_nk; + UPDATE {base_schema}.{table} as b + SET {ups_expr} + FROM {stg_schema}.{table} as s WHERE {join_expr} """, - ), - )["col"][0] - # Create an UPDATE statement to update the base table with - # non-key columns from the staging table. - # No semicolon terminating generated SQL. - update_stmt = SQL( + ).format( + base_schema=Identifier(self.base_schema), + table=Identifier(table), + stg_schema=Identifier(self.stg_schema), + ups_expr=SQL(ups_expr[0]), + join_expr=SQL(join_expr[0]), + ) + else: + logger.info(" No rows to update") + + # Prepare the inserts. + do_inserts = False + insert_stmt = None + if self.upsert_method in ("upsert", "insert"): + # Create a select statement to find all rows of the staging table + # that are not in the base table. + self.db.execute( + SQL( """ - UPDATE {base_schema}.{table} as b - SET {ups_expr} - FROM {stg_schema}.{table} as s WHERE {join_expr} + drop view if exists ups_newrows cascade; + create temporary view ups_newrows as with newpks as ( + select {pk_col_list} + from {stg_schema}.{table} + except + select {pk_col_list} + from {base_schema}.{table} + ) + select s.* + from {stg_schema}.{table} as s + inner join newpks using ({pk_col_list}); """, ).format( - base_schema=Identifier(base_schema), + stg_schema=Identifier(self.stg_schema), table=Identifier(table), - stg_schema=Identifier(stg_schema), - ups_expr=SQL(ups_expr), - join_expr=SQL(join_expr), - ) - else: - logger.debug(" No data to update") - - do_inserts = False - insert_stmt = None - if upsert_method in ("upsert", "insert"): - # Create a select statement to find all rows of the staging table - # that are not in the base table. - db.execute( + pk_col_list=SQL(pk_col_list), + base_schema=Identifier(self.base_schema), + ), + ) + # Prompt user to examine new data and continue or quit. + new_curs = self.db.execute("select * from ups_newrows;") + new_cols = [col.name for col in new_curs.description] + new_rowcount = new_curs.rowcount + new_data = new_curs.fetchall() + if new_rowcount > 0: + if spec_rows["interactive"]: + btn, return_value = TableUI( + "New Data", + f"Do you want to add these new data to the {self.base_schema}.{table} table?", + [ + ("Continue", 0, ""), + ("Skip", 1, ""), + ("Cancel", 2, ""), + ], + new_cols, + new_data, + ).activate() + else: + btn = 0 + if btn == 2: + logger.warning("Script cancelled by user") + sys.exit(0) + if btn == 0: + do_inserts = True + # Create an insert statement. No semicolon terminating generated SQL. + insert_stmt = SQL( + """ + INSERT INTO {base_schema}.{table} ({all_col_list}) + SELECT {all_col_list} FROM ups_newrows + """, + ).format( + base_schema=Identifier(self.base_schema), + table=Identifier(table), + all_col_list=SQL(all_col_list), + ) + else: + logger.info(" No new data to insert") + # Run the update and insert statements. 
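+        # Illustrative shape of the generated statements for a hypothetical
+        # "books" table with primary key "book_id" (the actual SQL is composed
+        # per table from the templates above):
+        #   UPDATE public.books as b SET book_title = s.book_title, ...
+        #     FROM staging.books as s WHERE b.book_id = s.book_id
+        #   INSERT INTO public.books (book_id, book_title, ...)
+        #     SELECT book_id, book_title, ... FROM ups_newrows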
+ if do_updates and update_stmt and self.upsert_method in ("upsert", "update"): + logger.info(f" Updating {self.base_schema}.{table}") + logger.debug(f" UPDATE statement for {self.base_schema}.{table}") + logger.debug(f"{update_stmt.as_string(self.db.cursor())}") + self.db.execute(update_stmt) + rows_updated = stg_rowcount + logger.info(f" {rows_updated} rows updated") + if do_inserts and insert_stmt and self.upsert_method in ("upsert", "insert"): + logger.info(f" Adding data to {self.base_schema}.{table}") + logger.debug(f" INSERT statement for {self.base_schema}.{table}") + logger.debug(f"{insert_stmt.as_string(self.db.cursor())}") + self.db.execute(insert_stmt) + rows_inserted = new_rowcount + logger.info(f" {rows_inserted} rows inserted") + # Move the update/insert counts into the control table. + self.db.execute( SQL( """ - drop view if exists ups_newrows cascade; - create temporary view ups_newrows as with newpks as ( - select {pk_col_list} - from {stg_schema}.{table} - except - select {pk_col_list} - from {base_schema}.{table} - ) - select s.* - from {stg_schema}.{table} as s - inner join newpks using ({pk_col_list}); + update {control_table} + set + rows_updated = {rows_updated}, + rows_inserted = {rows_inserted} + where + table_name = {table_name}; """, ).format( - stg_schema=Identifier(stg_schema), - table=Identifier(table), - pk_col_list=SQL(pk_col_list), - base_schema=Identifier(base_schema), + control_table=Identifier(self.control_table), + rows_updated=Literal(rows_updated), + rows_inserted=Literal(rows_inserted), + table_name=Literal(table), ), ) - # Prompt user to examine new data and continue or quit. - new_curs = db.execute("select * from ups_newrows;") - new_cols = [col.name for col in new_curs.description] - new_rowcount = new_curs.rowcount - new_data = new_curs.fetchall() - - # if not new_df.is_empty(): - if new_rowcount > 0: - if interactive: - btn, return_value = TableUI( - "New Data", - f"Do you want to add these new data to the {base_schema}.{table} table?", # noqa: E501 - [ - ("Continue", 0, ""), - ("Skip", 1, ""), - ("Cancel", 2, ""), - ], - # new_df.columns, - new_cols, - # list(new_df.iter_rows()), - new_data, - ).activate() + return self + + def run(self: PgUpsert) -> PgUpsert: + """Run all QA checks and upsert operations. + + This method runs :class:`PgUpsert` methods in the following order: + + 1. :meth:`PgUpsert.qa_all` + 2. :meth:`PgUpsert.upsert_all` + 3. :meth:`PgUpsert.commit` + """ + start_time = datetime.now() + logger.info(f"Upserting to {self.base_schema} from {self.stg_schema}") + if self.interactive: + logger.debug("Tables selected for upsert:") + for table in self.tables: + logger.debug(f" {table}") + btn, return_value = TableUI( + "Upsert Tables", + "Tables selected for upsert", + [ + ("Continue", 0, ""), + ("Cancel", 1, ""), + ], + ["Table"], + [[table] for table in self.tables], + ).activate() + if btn != 0: + logger.info("Upsert cancelled") + return self + else: + logger.info("Tables selected for upsert:") + for table in self.tables: + logger.info(f" {table}") + self._init_ups_control() + self.qa_all() + if self.qa_passed: + self.upsert_all() + self.commit() + logger.debug(f"Upsert completed in {ellapsed_time(start_time)}") + return self + + def commit(self: PgUpsert) -> PgUpsert: + """Commits the transaction to the database and show a summary of changes. + + Changes are committed if the following criteria are met: + + - The `do_commit` flag is set to `True`. + - All QA checks have passed (i.e., the `qa_passed` flag is set to `True`). 
Note that no checking is done to ensure that QA checks have been run. + - The summary of changes shows that rows have been updated or inserted. + - If the `interactive` flag is set to `True` and the `do_commit` flag is is set to `False`, the user is prompted to commit the changes and the user selects "Continue". + """ # noqa: E501 + self._validate_control() + final_ctrl_sql = SQL("select * from {control_table}").format( + control_table=Identifier(self.control_table), + ) + final_ctrl_rows, final_ctrl_headers, final_ctrl_rowcount = self.db.rowdict( + final_ctrl_sql, + ) + if self.interactive: + btn, return_value = TableUI( + "Upsert Summary", + "Below is a summary of changes. Do you want to commit these changes? ", + [ + ("Continue", 0, ""), + ("Cancel", 1, ""), + ], + final_ctrl_headers, + [[row[header] for header in final_ctrl_headers] for row in final_ctrl_rows], + ).activate() + else: + btn = 0 + logger.info("") + logger.info("Summary of changes:") + logger.info(self._show(final_ctrl_sql)) + + logger.info("") + + if btn == 0: + upsert_rows, upsert_headers, upsert_rowcount = self.db.rowdict( + SQL( + "select * from {control_table} where rows_updated > 0 or rows_inserted > 0", + ).format(control_table=Identifier(self.control_table)), + ) + if upsert_rowcount == 0: + logger.info("No changes to commit") + self.db.rollback() else: - btn = 0 - if btn == 2: - error_handler(["Upsert cancelled"]) - if btn == 0: - do_inserts = True - # Create an insert statement. No semicolon terminating generated SQL. - insert_stmt = SQL( - """ - INSERT INTO {base_schema}.{table} ({all_col_list}) - SELECT {all_col_list} FROM ups_newrows - """, - ).format( - base_schema=Identifier(base_schema), - table=Identifier(table), - all_col_list=SQL(all_col_list), - ) + if self.do_commit: + self.db.commit() + logger.info("Changes committed") + else: + logger.info( + "The do_commit flag is set to FALSE, rolling back changes.", + ) + self.db.rollback() else: - logger.debug(" No new data to insert") - - # Run the update and insert statements. - if do_updates and update_stmt and upsert_method in ("upsert", "update"): - logger.info(f" Updating {base_schema}.{table}") - logger.debug(f" UPDATE statement for {base_schema}.{table}") - logger.debug(f"{update_stmt.as_string(db.conn)}") - db.execute(update_stmt) - rows_updated = stg_rowcount - logger.info(f" {rows_updated} rows updated") - if do_inserts and insert_stmt and upsert_method in ("upsert", "insert"): - logger.info(f" Adding data to {base_schema}.{table}") - logger.debug(f" INSERT statement for {base_schema}.{table}") - logger.debug(f"{insert_stmt.as_string(db.conn)}") - db.execute(insert_stmt) - rows_inserted = new_rowcount - logger.info(f" {rows_inserted} rows inserted") - return rows_updated, rows_inserted - - -def error_handler(errors: list[str]): - """Log errors and exit.""" - for error in errors: - logger.error(error) - if errors: - db.rollback() - sys.exit(1) + logger.info("Rolling back changes") + self.db.rollback() + self.db.close() + return self + + +def treeview_table( + parent: ttk.Frame, + rowset: list | tuple, + column_headers: list | tuple, + select_mode="none", +): + """Creates a TreeView table containing the specified data, with scrollbars and + status bar in an enclosing frame. + This does not grid the table frame in its parent widget. Returns a tuple + of 0: the frame containing the table, and 1: the table widget itself. 
+ """ + nrows = range(len(rowset)) + ncols = range(len(column_headers)) + hdrwidths = [len(column_headers[j]) for j in ncols] + if len(rowset) > 0: + datawidthtbl = [ + [ + len( + (rowset[i][j] if isinstance(rowset[i][j], str) else str(rowset[i][j])), + ) + for i in nrows + ] + for j in ncols + ] + datawidths = [max(cwidths) for cwidths in datawidthtbl] + else: + datawidths = hdrwidths + colwidths = [max(hdrwidths[i], datawidths[i]) for i in ncols] + # Set the font. + ff = tkfont.nametofont("TkFixedFont") + tblstyle = ttk.Style() + tblstyle.configure("tblstyle", font=ff) + charpixels = int(1.3 * ff.measure("0")) + tableframe = ttk.Frame(master=parent, padding="3 3 3 3") + statusframe = ttk.Frame(master=tableframe) + # Create and configure the Treeview table widget + tv_widget = ttk.Treeview( + tableframe, + columns=column_headers, + selectmode=select_mode, + show="headings", + ) + tv_widget.configure()["style"] = tblstyle + ysb = ttk.Scrollbar(tableframe, orient="vertical", command=tv_widget.yview) + xsb = ttk.Scrollbar(tableframe, orient="horizontal", command=tv_widget.xview) + tv_widget.configure(yscrollcommand=ysb.set, xscrollcommand=xsb.set) + # Status bar + statusbar = ttk.Label( + statusframe, + text=" %d rows" % len(rowset), + relief=tk.RIDGE, + anchor=tk.W, + ) + tableframe.statuslabel = statusbar + # Fill the Treeview table widget with data + set_tv_headers(tv_widget, column_headers, colwidths, charpixels) + fill_tv_table(tv_widget, rowset, statusbar) + # Place the table + tv_widget.grid(column=0, row=0, sticky=tk.NSEW) + ysb.grid(column=1, row=0, sticky=tk.NS) + xsb.grid(column=0, row=1, sticky=tk.EW) + statusframe.grid(column=0, row=3, sticky=tk.EW) + tableframe.columnconfigure(0, weight=1) + tableframe.rowconfigure(0, weight=1) + # Place the status bar + statusbar.pack(side=tk.BOTTOM, fill=tk.X) + # Allow resizing of the table + tableframe.columnconfigure(0, weight=1) + tableframe.rowconfigure(0, weight=1) + # + return tableframe, tv_widget + + +def set_tv_headers( + tvtable: ttk.Treeview, + column_headers: list, + colwidths: list, + charpixels: int, +): + """Set the headers and column widths for a Treeview table widget.""" + pixwidths = [charpixels * col for col in colwidths] + for i in range(len(column_headers)): + hdr = column_headers[i] + tvtable.column(hdr, width=pixwidths[i]) + tvtable.heading( + hdr, + text=hdr, + command=lambda _col=hdr: treeview_sort_column(tvtable, _col, False), + ) + + +def treeview_sort_column(tv: ttk.Treeview, col: str, reverse: bool): + """Sort a column in a Treeview table widget. 
+ + From https://stackoverflow.com/questions/1966929/tk-treeview-column-sort#1967793 + """ + colvals = [(tv.set(k, col), k) for k in tv.get_children()] + colvals.sort(reverse=reverse) + # Rearrange items in sorted positions + for index, (_val, k) in enumerate(colvals): + tv.move(k, "", index) + # Reverse sort next time + tv.heading(col, command=lambda: treeview_sort_column(tv, col, not reverse)) + + +def fill_tv_table(tvtable: ttk.Treeview, rowset: list | tuple, status_label=None): + """Fill a Treeview table widget with data.""" + for i, row in enumerate(rowset): + enc_row = [c if c is not None else "" for c in row] + tvtable.insert(parent="", index="end", iid=str(i), values=enc_row) + if status_label is not None: + status_label.config(text=" %d rows" % len(rowset)) def ellapsed_time(start_time: datetime): @@ -2289,14 +2652,13 @@ def ellapsed_time(start_time: datetime): return f"{round((datetime.now() - start_time).total_seconds(), 3)} seconds" if dt < 3600: return f"{int(dt // 60)} minutes, {round(dt % 60, 3)} seconds" - return f"{int(dt // 3600)} hours, {int((dt % 3600)) // 60} minutes, {round(dt % 60, 3)} seconds" # noqa: E501 UP034 + return f"{int(dt // 3600)} hours, {int((dt % 3600)) // 60} minutes, {round(dt % 60, 3)} seconds" # noqa: UP034 def clparser() -> argparse.ArgumentParser: """Command line interface for the upsert function.""" parser = argparse.ArgumentParser( - description=description_short, - epilog=description_long, + description=__description__, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( @@ -2319,6 +2681,7 @@ def clparser() -> argparse.ArgumentParser: parser.add_argument( "-l", "--log", + type=Path, metavar="LOGFILE", help="write log to LOGFILE", ) @@ -2336,7 +2699,7 @@ def clparser() -> argparse.ArgumentParser: ) parser.add_argument( "-c", - "--commit", + "--do-commit", action="store_true", help="commit changes to database", ) @@ -2348,8 +2711,9 @@ def clparser() -> argparse.ArgumentParser: ) parser.add_argument( "-m", - "--method", - metavar="METHOD", + "--upsert-method", + metavar="UPSERT_METHOD", + default="upsert", choices=["upsert", "update", "insert"], help="method to use for upsert", ) @@ -2358,6 +2722,12 @@ def clparser() -> argparse.ArgumentParser: metavar="HOST", help="database host", ) + parser.add_argument( + "port", + metavar="PORT", + type=int, + help="database port", + ) parser.add_argument( "database", metavar="DATABASE", @@ -2387,228 +2757,37 @@ def clparser() -> argparse.ArgumentParser: return parser -def upsert( - host: str, - database: str, - user: str, - tables: list[str], - stg_schema: str, - base_schema: str, - upsert_method: str = "upsert", - commit: bool = False, - interactive: bool = False, - exclude_cols: list[str] | None = None, - exclude_null_check_columns: list[str] | None = None, - **kwargs, -): - """Upsert staging tables to base tables.""" - if exclude_null_check_columns is None: - exclude_null_check_columns = [] - if exclude_cols is None: - exclude_cols = [] - global db - global errors - global control_table - global timer - - errors = [] - control_table = "ups_control" - timer = datetime.now() - logger.debug(f"Starting upsert at {timer.strftime('%Y-%m-%d %H:%M:%S')}") - - db = PostgresDB( - host=host, - database=database, - user=user, - passwd=kwargs.get("passwd", None), - ) - logger.debug(f"Connected to {db}") - - validate_schemas(base_schema, stg_schema) - for table in tables: - validate_table(base_schema, stg_schema, table) - - logger.info(f"Upserting to {base_schema} from {stg_schema}") - if 
interactive: - btn, return_value = TableUI( - "Upsert Tables", - "Tables selected for upsert", - [ - ("Continue", 0, ""), - ("Cancel", 1, ""), - ], - ["Table"], - [[table] for table in tables], - ).activate() - if btn != 0: - error_handler(["Script canceled by user."]) - else: - logger.info("Tables selected for upsert:") - for table in tables: - logger.info(f" {table}") - - # Initialize the control table - logger.debug("Initializing control table") - staged_to_load(control_table, tables) - - # Update the control table with the list of columns to exclude from null checks - if exclude_cols: - db.execute( - SQL( - """ - update {control_table} - set exclude_cols = {exclude_cols}; - """, - ).format( - control_table=Identifier(control_table), - exclude_cols=Literal(",".join(exclude_cols)), - ), - ) - if exclude_null_check_columns: - db.execute( - SQL( - """ - update {control_table} - set exclude_null_checks = {exclude_null_check_columns}; - """, - ).format( - control_table=Identifier(control_table), - exclude_null_check_columns=Literal( - ",".join(exclude_null_check_columns), - ), - ), - ) - if interactive: - db.execute( - SQL( - """ - update {control_table} - set interactive = {interactive}; - """, - ).format( - control_table=Identifier(control_table), - interactive=Literal(interactive), - ), - ) - - # Run not-null, primary key, and foreign key QA checks on the staging tables - load_staging(base_schema, stg_schema, control_table) - - ctrl_df = db.dataframe( - SQL( - """ - select * from {control_table} - where - null_errors is not null - or pk_errors is not null - or fk_errors is not null - or ck_errors is not null; - """, - ).format(control_table=Identifier(control_table)), - ) - - qa_pass = False - # if errors in control table - if not ctrl_df.is_empty(): - logger.debug("QA Errors:") - logger.debug( - tabulate( - ctrl_df.iter_rows(), - headers=ctrl_df.columns, - tablefmt="pipe", - showindex=False, - colalign=["left"] * len(ctrl_df.columns), - ), - ) - logger.debug("") - if interactive: - btn, return_value = TableUI( - "QA Errors", - "Below is a summary of errors.", - [ - ("Continue", 0, ""), - ("Cancel", 1, ""), - ], - ctrl_df.columns, - list(ctrl_df.iter_rows()), - ).activate() - error_handler(["QA checks failed. Aborting upsert."]) - else: - qa_pass = True - logger.info("===QA checks passed. Starting upsert===") - - if qa_pass: - upsert_all(base_schema, stg_schema, control_table, upsert_method) - - final_ctrl_df = db.dataframe( - SQL("select * from {control_table};").format( - control_table=Identifier(control_table), - ), - ) - - if interactive: - btn, return_value = TableUI( - "Upsert Summary", - "Below is a summary of changes. Do you want to commit these changes? 
", - [ - ("Continue", 0, ""), - ("Cancel", 1, ""), - ], - final_ctrl_df.columns, - list(final_ctrl_df.iter_rows()), - ).activate() - else: - btn = 0 - - logger.info("") - - if btn == 0: - if final_ctrl_df.filter( - (pl.col("rows_updated") > 0) | (pl.col("rows_inserted") > 0), - ).is_empty(): - logger.info("No changes to commit") - db.rollback() - else: - if commit: - logger.info("Changes committed") - db.commit() - else: - logger.info( - f"Commit set to {str(commit).upper()}, rolling back changes", - ) - db.rollback() - else: - logger.info("Rolling back changes") - db.rollback() - - logger.debug(f"Upsert completed in {ellapsed_time(timer)}") - - -def main() -> None: +def cli() -> None: """Main command line entrypoint for the upsert function.""" args = clparser().parse_args() - logging.basicConfig( - level=logging.INFO if not args.debug else logging.DEBUG, - format="%(message)s", - handlers=[ - logging.StreamHandler() if not args.quiet else logging.NullHandler(), - logging.FileHandler(Path(args.log)) if args.log else logging.NullHandler(), - ], - ) - upsert( + if args.log and args.log.exists(): + args.log.unlink() + if not args.quiet: + logger.addHandler(logging.StreamHandler()) + if args.log: + logger.addHandler(logging.FileHandler(args.log)) + if args.debug: + logger.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "%(asctime)s - %(levelname)s - %(lineno)d - %(message)s", + ) + for handler in logger.handlers: + handler.setFormatter(formatter) + PgUpsert( host=args.host, + port=args.port, database=args.database, user=args.user, tables=args.tables, stg_schema=args.stg_schema, base_schema=args.base_schema, - commit=args.commit, - upsert_method=args.method, + do_commit=args.do_commit, + upsert_method=args.upsert_method, interactive=args.interactive, exclude_cols=args.exclude.split(",") if args.exclude else None, - exclude_null_check_columns=args.null.split(",") if args.null else None, - ) + exclude_null_check_cols=args.null.split(",") if args.null else None, + ).run() if __name__ == "__main__": - main() + cli() diff --git a/pyproject.toml b/pyproject.toml index 6aa2dd4..2bfe459 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)", ] -dependencies = [ - "polars >= 1.0.1", - "psycopg2-binary >= 2.9.9", - "tabulate >= 0.9.0", -] +dependencies = ["psycopg2-binary >= 2.9.9", "tabulate >= 0.9.0"] keywords = ["postgresql", "postgres", "dbms", "etl", "upsert", "database"] [project.scripts] @@ -66,7 +62,7 @@ exclude = [ "*cache*", ] # The line length to use when enforcing long-lines violations (like E501). -line-length = 88 +line-length = 120 # Assume Python 3.11. target-version = "py311" # Whether to automatically exclude files that are ignored by .ignore, .gitignore, .git/info/exclude, and global gitignore files. @@ -96,7 +92,7 @@ select = [ "PD", "RUF", ] -ignore = ["PD901", "S101"] +ignore = ["PD901", "S101", "F401"] [tool.ruff.format] # Like Black, use double quotes for strings. 
@@ -115,4 +111,4 @@ commit_args = "--no-verify" tag = true [[tool.bumpversion.files]] -filename = "pg_upsert/pg_upsert.py" +filename = "pg_upsert/__init__.py" diff --git a/requirements.txt b/requirements.txt index ac49147..786c598 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,38 @@ +accessible-pygments==0.0.5 +alabaster==0.7.16 annotated-types==0.7.0 +Babel==2.15.0 +backports.tarfile==1.2.0 +beautifulsoup4==4.12.3 bracex==2.4 build==1.2.1 bump-my-version==0.21.1 +certifi==2024.7.4 cfgv==3.4.0 +charset-normalizer==3.3.2 click==8.1.7 coverage==7.6.0 distlib==0.3.8 +docutils==0.20.1 filelock==3.14.0 identify==2.5.36 +idna==3.7 +imagesize==1.4.1 +importlib_metadata==8.0.0 iniconfig==2.0.0 +jaraco.classes==3.4.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +Jinja2==3.1.4 +keyring==25.2.1 markdown-it-py==3.0.0 +MarkupSafe==2.1.5 mdurl==0.1.2 +more-itertools==10.3.0 +nh3==0.2.18 nodeenv==1.8.0 packaging==24.1 +pkginfo==1.10.0 platformdirs==4.2.2 pluggy==1.5.0 polars==1.2.0 @@ -22,6 +42,7 @@ psycopg2-binary==2.9.9 pydantic==2.7.3 pydantic-settings==2.3.1 pydantic_core==2.18.4 +pydata-sphinx-theme==0.15.4 Pygments==2.18.0 pyproject_hooks==1.1.0 pytest==8.2.2 @@ -29,12 +50,32 @@ pytest-cov==5.0.0 python-dotenv==1.0.1 PyYAML==6.0.1 questionary==2.0.1 +readme_renderer==44.0 +requests==2.32.3 +requests-toolbelt==1.0.0 +rfc3986==2.0.0 rich==13.7.1 rich-click==1.8.3 ruff==0.5.2 +snowballstemmer==2.2.0 +soupsieve==2.5 +Sphinx==7.4.6 +sphinx-book-theme==1.1.3 +sphinx-copybutton==0.5.2 +sphinx-rtd-theme==2.0.0 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-serializinghtml==1.1.10 tabulate==0.9.0 tomlkit==0.12.5 +twine==5.1.1 typing_extensions==4.12.2 +urllib3==2.2.2 virtualenv==20.26.2 wcmatch==8.5.2 wcwidth==0.2.13 +zipp==3.19.2 diff --git a/tests/data.sql b/tests/data.sql index 3b5825f..8643381 100644 --- a/tests/data.sql +++ b/tests/data.sql @@ -10,7 +10,7 @@ create table public.books ( book_title varchar(200) not null, genre varchar(100) not null, notes text, - foreign key (genre) references genres(genre) + foreign key (genre) references public.genres(genre) ); drop table if exists public.authors cascade; @@ -27,8 +27,8 @@ drop table if exists public.book_authors cascade; create table public.book_authors ( book_id varchar(100) not null, author_id varchar(100) not null, - foreign key (author_id) references authors(author_id), - foreign key (book_id) references books(book_id), + foreign key (author_id) references public.authors(author_id), + foreign key (book_id) references public.books(book_id), constraint pk_book_authors primary key (book_id, author_id) ); diff --git a/tests/test_pg_upsert.py b/tests/test_pg_upsert.py index 4131f7d..fd55096 100644 --- a/tests/test_pg_upsert.py +++ b/tests/test_pg_upsert.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv from psycopg2.sql import SQL, Identifier, Literal -from pg_upsert.pg_upsert import PostgresDB +from pg_upsert.pg_upsert import PgUpsert, PostgresDB load_dotenv() @@ -48,6 +48,25 @@ def db(global_variables): db.close() +@pytest.fixture(scope="session") +def ups(global_variables): + """Return a PgUpsert object.""" + obj = PgUpsert( + host=global_variables["POSTGRES_HOST"], + database=global_variables["POSTGRES_DB"], + user=global_variables["POSTGRES_USER"], + passwd=global_variables["POSTGRES_PASSWORD"], + tables=("genres", "books", "authors", "book_authors"), + stg_schema="staging", + 
base_schema="public", + do_commit=False, + interactive=False, + upsert_method="upsert", + ) + yield obj + obj.db.close() + + def test_db_connection(db): """Test the database connection is successful, then close it.""" assert db.conn is None @@ -109,44 +128,35 @@ def test_db_rowdict_params(db): assert rows[0]["two"] == 2 -def test_db_dataframe(db): - """Test the dataframe function.""" - df = db.dataframe("SELECT 1 as one, 2 as two") - assert df.shape == (1, 2) - assert df["one"][0] == 1 - assert df["two"][0] == 2 +def test_pgupsert_init(global_variables, ups): + assert ups.tables == ("genres", "books", "authors", "book_authors") + assert ups.stg_schema == "staging" + assert ups.base_schema == "public" + assert ups.do_commit is False + assert ups.interactive is False + assert ups.upsert_method == "upsert" + assert ups.control_table == "ups_control" + assert ups.exclude_cols == () + assert ups.exclude_null_check_cols == () -def test_db_dataframe_params(db): - """Test the dataframe function with parameters.""" - df = db.dataframe( - SQL("SELECT {one} as one, {two} as two").format( - one=Literal(1), - two=Literal(2), +def test_pgupsert_control_table_init(global_variables, ups): + # Test that the control table was initialized + cur = ups.db.execute( + SQL( + "select table_name from information_schema.tables where table_name={table}", + ).format( + table=Literal(ups.control_table), + ), + ) + assert cur.rowcount == 1 + assert cur.fetchone()[0] == ups.control_table + # Test that the control table has the correct columns + cur = ups.db.execute( + SQL( + "select column_name from information_schema.columns where table_name={table}", + ).format( + table=Literal(ups.control_table), ), ) - assert df.shape == (1, 2) - assert df["one"][0] == 1 - assert df["two"][0] == 2 - - -# def test_upsert_no_commit(global_variables, db): -# # Run the upsert function. The function should raise a SystemExit error that -# # qa checks failed. -# with pytest.raises(SystemExit) as exc_info: -# upsert( -# host=global_variables["POSTGRES_HOST"], -# database=global_variables["POSTGRES_DB"], -# user=global_variables["POSTGRES_USER"], -# passwd=global_variables["POSTGRES_PASSWORD"], -# tables=["genres", "authors", "books", "book_authors"], -# stg_schema="staging", -# base_schema="public", -# upsert_method="upsert", -# commit=False, -# interactive=False, -# exclude_cols=[], -# exclude_null_check_colls=[], -# ) -# assert exc_info.type is SystemExit -# assert exc_info.value.code == 1 + assert cur.rowcount == 10
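
For reference, a minimal usage sketch of the refactored `PgUpsert` interface exercised by the tests above. The constructor arguments mirror the test fixture; the connection values are placeholders, and `run()` chains `qa_all()`, `upsert_all()`, and `commit()` as described in its docstring.

```py
from pg_upsert.pg_upsert import PgUpsert

# Placeholder connection values; tables and schemas mirror the test fixture.
PgUpsert(
    host="localhost",
    database="dev",
    user="postgres",
    passwd="postgres",        # placeholder
    tables=("genres", "books", "authors", "book_authors"),
    stg_schema="staging",
    base_schema="public",
    do_commit=False,          # roll back at the end; set True to persist changes
    interactive=False,        # skip the Tk comparison dialogs
    upsert_method="upsert",   # or "update" / "insert"
).run()                       # QA checks, then upserts, then commit/rollback
```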