NYCPlanning · damonmcc · Feb 9, 2023 · Feb 8, 2023 · Feb 8, 2023 · Feb 8, 2023
diff --git a/README.md b/README.md
@@ -1,9 +1,19 @@
 # Data Engineering Quality Control and Assurance Application
-This web application displays charts and tables to assess the consistency, quality and completeness of a particular build of one of data engineering's data products.
-It's written in Python using the [streamlit](https://streamlit.io/) framework. 
 
-Best practice to run the app locally is to use the devcontainer 
+This web application displays charts and tables to assess the consistency, quality and completeness of a particular build of one of data engineering's data products.
 
 The deployed app is at https://edm-data-engineering.nycplanningdigital.com/?page=Home
 
+It's written in Python using the [streamlit](https://streamlit.io/) framework.
+
 The code that produces the data this application assesses can be found at https://github.com/NYCPlanning/
+
+## Dev
+
+The recommended way to run the app locally is in the dev container (ideally via VS Code).
+
+1. From a dev container terminal, run `./entrypoint.sh`
+
+2. If in VS Code, a popup should appear with an option to navigate to the site in a browser
+
+3. If the browser shows an `Access to localhost was denied` error, try navigating to `127.0.0.1:5000` rather than `localhost:5000`
diff --git a/example.env b/example.env
@@ -0,0 +1,4 @@
+AWS_S3_ENDPOINT=
+AWS_SECRET_ACCESS_KEY=
+AWS_ACCESS_KEY_ID=
+AWS_S3_BUCKET=
diff --git a/src/cpdb/helpers.py b/src/cpdb/helpers.py
@@ -31,7 +31,7 @@
 
 
 def get_geometries(branch, table) -> dict:
-    client = digital_ocean_client()
+    client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
 
     gdf = client.shapefile_from_DO(
         shapefile_zip=f"db-cpdb/{branch}/latest/output/{table}.shp.zip"
@@ -49,7 +49,7 @@ def get_data(branch, previous_version) -> dict:
         "geometries": ["cpdb_dcpattributes_pts", "cpdb_dcpattributes_poly"],
     }
 
-    client = digital_ocean_client()
+    client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
 
     for t in tables["analysis"]:
         rv[t] = client.csv_from_DO(url=construct_url(branch, t, sub_folder="analysis/"))
@@ -81,7 +81,6 @@ def get_diff_dataframe(df: pd.DataFrame, df_pre: pd.DataFrame):
 
 
 def get_map_percent_diff(df: pd.DataFrame, df_pre: pd.DataFrame, keys: dict):
-
     diff = pd.DataFrame(
         columns=["percent_mapped", "pre_percent_mapped", "diff_percent_mapped"],
         index=df.index,
@@ -98,7 +97,6 @@ def get_map_percent_diff(df: pd.DataFrame, df_pre: pd.DataFrame, keys: dict):
 def sort_base_on_option(
     df: pd.DataFrame, subcategory, view_type, map_option, ascending=True
 ):
-
     df_sort = df.sort_values(
         by=VIZKEY[subcategory][view_type]["values"][map_option], ascending=ascending
     )
@@ -111,7 +109,3 @@ def construct_url(branch, table, build="latest", sub_folder=""):
         f"https://edm-publishing.nyc3.digitaloceanspaces.com/db-cpdb/{branch}"
         f"/{build}/output/{sub_folder}{table}.csv"
     )
-
-
-def digital_ocean_client():
-    return DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
diff --git a/src/devdb/helpers.py b/src/devdb/helpers.py
@@ -174,7 +174,7 @@ def get_data(branch):
     rv = {}
     url = f"https://edm-publishing.nyc3.digitaloceanspaces.com/db-developments/{branch}/latest/output"
 
-    client = digital_ocean_client()
+    client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
 
     rv["qaqc_app"] = client.csv_from_DO(
         url=f"{url}/qaqc_app.csv",
@@ -191,7 +191,3 @@ def get_data(branch):
     rv["qaqc_quarter_check"] = client.csv_from_DO(url=f"{url}/qaqc_quarter_check.csv")
 
     return rv
-
-
-def digital_ocean_client():
-    return DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
diff --git a/src/digital_ocean_client.py b/src/digital_ocean_client.py
@@ -39,14 +39,20 @@ def s3_resource(self):
             endpoint_url=os.getenv("AWS_S3_ENDPOINT"),
         )
 
-    def get_all_folders_in_repo(self):
+    def get_all_folder_names_in_repo_folder(self):
         all_folders = set()
 
         for obj in self.repo:
             all_folders.add(obj._key.split("/")[1])
 
         return all_folders
 
+    def get_all_filenames_in_folder(self, folder_path: str):
+        filenames = set()
+        for object in self.bucket.objects.filter(Prefix=f"{folder_path}/"):
+            filenames.add(object.key.split("/")[-1])
+        return filenames
+
     def unzip_csv(self, csv_filename, zipfile):
         try:
             with zipfile.open(csv_filename) as csv:

diff --git a/src/pluto/components/changes_report.py b/src/pluto/components/changes_report.py
@@ -0,0 +1,143 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+from st_aggrid import AgGrid
+from src.constants import COLOR_SCHEME
+from abc import ABC
+
+
+class ChangesReport:
+    def __init__(self, data) -> None:
+        self.applied_changes = data["pluto_changes_applied"]
+        self.not_applied_changes = data["pluto_changes_not_applied"]
+        self.version_dropdown = np.flip(
+            np.sort(data["pluto_changes_applied"].version.dropna().unique())
+        )
+
+    def __call__(self):
+        st.header("Manual Changes")
+
+        st.markdown(
+            """
+            PLUTO is created using the best available data from a number of city agencies. To further
+            improve data quality, the Department of City Planning (DCP) applies changes to selected field
+            values.
+
+            Each change to a field is labeled for a version of PLUTO.
+
+            For programmatic changes, this is version in which the programmatic change was
+            implemented. For research and user reported changes, this is the version in which the BBL
+            change was added to PLUTO_input_research.csv.
+
+            For more information about the structure of the pluto changes report,
+            see the [Pluto Changelog Readme](https://www1.nyc.gov/assets/planning/download/pdf/data-maps/open-data/pluto_change_file_readme.pdf?r=22v1).
+
+            NOTE: This report is based on the files
+            `pluto_changes_applied.csv`/`pluto_changes_not_applied.csv`
+            (or legacy files `pluto_corrections_applied.csv`/`pluto_corrections_not_applied.csv`)
+            """
+        )
+
+        if self.applied_changes is None or self.not_applied_changes is None:
+            st.info(
+                "There are no available changes reports for this branch. This is likely due to a problem on the backend with the files on Digital Ocean."
+            )
+            return
+
+        version = st.sidebar.selectbox(
+            "Filter the changes to fields by the PLUTO Version in which they were first introduced",
+            self.version_dropdown,
+        )
+
+        AppliedChangesSection(self.applied_changes, version)()
+        NotAppliedChangesSection(self.not_applied_changes, version)()
+
+        st.info(
+            """
+            See [here](https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page) for a full accounting of the changes made for the latest version
+            in the PLUTO change file.
+            """
+        )
+
+
+class ChangesSection(ABC):
+    def __init__(self, changes, version) -> None:
+        super().__init__()
+        self.changes = self.filter_by_version(changes, version)
+        self.version_text = self.version_text(version)
+
+    def filter_by_version(self, df, version):
+        if version == "All":
+            return df
+        else:
+            return df.loc[df["version"] == version]
+
+    def version_text(self, version):
+        return "All Versions" if version == "All" else f"Version {version}"
+
+    def display_changes_figures(self, df, title):
+        figure = self.generate_graph(self.field_change_counts(df), title)
+        st.plotly_chart(figure)
+
+        self.display_changes_df(df, title)
+
+    def generate_graph(self, changes, title):
+        return px.bar(
+            changes,
+            x="field",
+            y="size",
+            text="size",
+            title=title,
+            labels={"size": "Count of Records", "field": "Altered Field"},
+            color_discrete_sequence=COLOR_SCHEME,
+        )
+
+    def field_change_counts(self, df):
+        return df.groupby(["field"]).size().to_frame("size").reset_index()
+
+    def display_changes_df(self, changes, title):
+        changes = changes.sort_values(
+            by=["version", "reason", "bbl"], ascending=[False, True, True]
+        )
+
+        AgGrid(data=changes, key=f"display_changes_df_{title}")
+
+
+class AppliedChangesSection(ChangesSection):
+    def __call__(self):
+        st.subheader("Manual Changes Applied", anchor="changes-applied")
+
+        if self.changes.empty:
+            st.info(f"No Changes introduced in {self.version_text} were applied.")
+        else:
+            title_text = (
+                f"Applied Manual Changes introduced in {self.version_text} by Field"
+            )
+            self.display_changes_figures(self.changes, title_text)
+        st.markdown(
+            """
+            For each record in the PLUTO Changes table, PLUTO attempts to change a record to the New Value column by matching on the BBL and the 
+            Old Value column. The graph and table below outline the records in the pluto changes table that were successfully applied to PLUTO.
+            """
+        )
+
+
+class NotAppliedChangesSection(ChangesSection):
+    def __call__(self):
+        st.subheader("Manual Changes Not Applied", anchor="changes-not-applied")
+        st.markdown(
+            """ 
+            For each record in the PLUTO Changes table, PLUTO attempts to change a record by matching on the BBL and the 
+            Old Value column. As the underlying datasources change and improve, PLUTO records may no longer match the old value 
+            specified in the pluto changes table. The graph and table below outline the records in the pluto changes table that failed to be applied for this reason.
+            """
+        )
+
+        if self.changes.empty:
+            st.info(f"All Changes introduced in {self.version_text} were applied.")
+        else:
+            title_text = (
+                f"Manual Changes not Applied introduced in {self.version_text} by Field"
+            )
+            self.display_changes_figures(self.changes, title_text)
diff --git a/src/pluto/components/corrections_report.py b/src/pluto/components/corrections_report.py