Skip to content
This repository has been archived by the owner on Aug 8, 2023. It is now read-only.

fix use of pluto change files #252

Merged
merged 20 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
# Data Engineering Quality Control and Assurance Application
This web application displays charts and tables to assess the consistency, quality and completeness of a particular build of one of data engineering's data products.
It's written in Python using the [streamlit](https://streamlit.io/) framework.

Best practice to run the app locally is to use the devcontainer
This web application displays charts and tables to assess the consistency, quality and completeness of a particular build of one of data engineering's data products.

The deployed app is at https://edm-data-engineering.nycplanningdigital.com/?page=Home

It's written in Python using the [streamlit](https://streamlit.io/) framework.

The code that produces the data this application assesses can be found at https://github.com/NYCPlanning/

## Dev

The recommended way to run the app locally is to use the dev container (ideally via VS Code)

1. From a dev container terminal, run `./entrypoint.sh`

2. If in VS Code, a popup should appear with an option to navigate to the site in a browser

3. If an error of `Access to localhost was denied` appears in the browser, try navigating to `127.0.0.1:5000` rather than `localhost:5000`
4 changes: 4 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
AWS_S3_ENDPOINT=
AWS_SECRET_ACCESS_KEY=
AWS_ACCESS_KEY_ID=
AWS_S3_BUCKET=
10 changes: 2 additions & 8 deletions src/cpdb/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@


def get_geometries(branch, table) -> dict:
client = digital_ocean_client()
client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)

gdf = client.shapefile_from_DO(
shapefile_zip=f"db-cpdb/{branch}/latest/output/{table}.shp.zip"
Expand All @@ -49,7 +49,7 @@ def get_data(branch, previous_version) -> dict:
"geometries": ["cpdb_dcpattributes_pts", "cpdb_dcpattributes_poly"],
}

client = digital_ocean_client()
client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)

for t in tables["analysis"]:
rv[t] = client.csv_from_DO(url=construct_url(branch, t, sub_folder="analysis/"))
Expand Down Expand Up @@ -81,7 +81,6 @@ def get_diff_dataframe(df: pd.DataFrame, df_pre: pd.DataFrame):


def get_map_percent_diff(df: pd.DataFrame, df_pre: pd.DataFrame, keys: dict):

diff = pd.DataFrame(
columns=["percent_mapped", "pre_percent_mapped", "diff_percent_mapped"],
index=df.index,
Expand All @@ -98,7 +97,6 @@ def get_map_percent_diff(df: pd.DataFrame, df_pre: pd.DataFrame, keys: dict):
def sort_base_on_option(
df: pd.DataFrame, subcategory, view_type, map_option, ascending=True
):

df_sort = df.sort_values(
by=VIZKEY[subcategory][view_type]["values"][map_option], ascending=ascending
)
Expand All @@ -111,7 +109,3 @@ def construct_url(branch, table, build="latest", sub_folder=""):
f"https://edm-publishing.nyc3.digitaloceanspaces.com/db-cpdb/{branch}"
f"/{build}/output/{sub_folder}{table}.csv"
)


def digital_ocean_client():
    """Build a ``DigitalOceanClient`` from the module-level bucket and repo names."""
    client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
    return client
6 changes: 1 addition & 5 deletions src/devdb/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def get_data(branch):
rv = {}
url = f"https://edm-publishing.nyc3.digitaloceanspaces.com/db-developments/{branch}/latest/output"

client = digital_ocean_client()
client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)

rv["qaqc_app"] = client.csv_from_DO(
url=f"{url}/qaqc_app.csv",
Expand All @@ -191,7 +191,3 @@ def get_data(branch):
rv["qaqc_quarter_check"] = client.csv_from_DO(url=f"{url}/qaqc_quarter_check.csv")

return rv


def digital_ocean_client():
    """Build a ``DigitalOceanClient`` from the module-level bucket and repo names."""
    client = DigitalOceanClient(bucket_name=BUCKET_NAME, repo_name=REPO_NAME)
    return client
8 changes: 7 additions & 1 deletion src/digital_ocean_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,20 @@ def s3_resource(self):
endpoint_url=os.getenv("AWS_S3_ENDPOINT"),
)

def get_all_folders_in_repo(self):
def get_all_folder_names_in_repo_folder(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function names are much more detailed...NOICE

all_folders = set()

for obj in self.repo:
all_folders.add(obj._key.split("/")[1])

return all_folders

def get_all_filenames_in_folder(self, folder_path: str):
    """Return the set of file names found under ``folder_path`` in the bucket.

    Objects are listed with the prefix ``f"{folder_path}/"`` and each key is
    reduced to its final ``/``-separated segment, so duplicates collapse.
    """
    prefix = f"{folder_path}/"
    return {
        entry.key.split("/")[-1]
        for entry in self.bucket.objects.filter(Prefix=prefix)
    }

def unzip_csv(self, csv_filename, zipfile):
try:
with zipfile.open(csv_filename) as csv:
Expand Down
143 changes: 143 additions & 0 deletions src/pluto/components/changes_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from st_aggrid import AgGrid
from src.constants import COLOR_SCHEME
from abc import ABC


class ChangesReport:
    """Streamlit component rendering the PLUTO "Manual Changes" report.

    ``data`` must provide the keys ``"pluto_changes_applied"`` and
    ``"pluto_changes_not_applied"``. Each value is either a DataFrame with a
    ``version`` column or ``None`` when the file could not be loaded.
    """

    def __init__(self, data) -> None:
        self.applied_changes = data["pluto_changes_applied"]
        self.not_applied_changes = data["pluto_changes_not_applied"]

        if self.applied_changes is None:
            # The applied-changes file is missing. Construction must not fail
            # here, otherwise __call__ can never show its explanatory message.
            self.version_dropdown = np.array([])
        else:
            # Distinct, non-null versions ordered from highest to lowest.
            self.version_dropdown = np.flip(
                np.sort(self.applied_changes.version.dropna().unique())
            )

    def __call__(self):
        """Render the report header, the version filter and both sections."""
        st.header("Manual Changes")

        st.markdown(
            """
PLUTO is created using the best available data from a number of city agencies. To further
improve data quality, the Department of City Planning (DCP) applies changes to selected field
values.

Each change to a field is labeled for a version of PLUTO.

For programmatic changes, this is version in which the programmatic change was
implemented. For research and user reported changes, this is the version in which the BBL
change was added to PLUTO_input_research.csv.

For more information about the structure of the pluto changes report,
see the [Pluto Changelog Readme](https://www1.nyc.gov/assets/planning/download/pdf/data-maps/open-data/pluto_change_file_readme.pdf?r=22v1).

NOTE: This report is based on the files
`pluto_changes_applied.csv`/`pluto_changes_not_applied.csv`
(or legacy files `pluto_corrections_applied.csv`/`pluto_corrections_not_applied.csv`)
"""
        )

        # Without both files there is nothing to filter or chart.
        if self.applied_changes is None or self.not_applied_changes is None:
            st.info(
                "There are no available changes reports for this branch. This is likely due to a problem on the backend with the files on Digital Ocean."
            )
            return

        version = st.sidebar.selectbox(
            "Filter the changes to fields by the PLUTO Version in which they were first introduced",
            self.version_dropdown,
        )

        AppliedChangesSection(self.applied_changes, version)()
        NotAppliedChangesSection(self.not_applied_changes, version)()

        st.info(
            """
See [here](https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page) for a full accounting of the changes made for the latest version
in the PLUTO change file.
"""
        )


class ChangesSection(ABC):
    """Common behaviour for one section (applied / not applied) of the report.

    On construction the incoming changes are narrowed to the requested
    version and a human-readable version label is stored.
    """

    def __init__(self, changes, version) -> None:
        super().__init__()
        self.changes = self.filter_by_version(changes, version)
        # After this assignment the instance attribute ``version_text`` holds
        # the label string and hides the method of the same name.
        self.version_text = self.version_text(version)

    def filter_by_version(self, df, version):
        """Return rows of ``df`` for ``version``, or ``df`` itself for "All"."""
        if version != "All":
            return df.loc[df["version"] == version]
        return df

    def version_text(self, version):
        """Return the display label for ``version``."""
        if version == "All":
            return "All Versions"
        return f"Version {version}"

    def display_changes_figures(self, df, title):
        """Draw the per-field bar chart followed by the changes grid."""
        counts = self.field_change_counts(df)
        st.plotly_chart(self.generate_graph(counts, title))

        self.display_changes_df(df, title)

    def generate_graph(self, changes, title):
        """Build a bar chart of record counts per altered field."""
        axis_labels = {"size": "Count of Records", "field": "Altered Field"}
        return px.bar(
            changes,
            x="field",
            y="size",
            text="size",
            title=title,
            labels=axis_labels,
            color_discrete_sequence=COLOR_SCHEME,
        )

    def field_change_counts(self, df):
        """Return a frame with columns ``field`` and ``size`` (rows per field)."""
        per_field = df.groupby(["field"]).size()
        return per_field.to_frame("size").reset_index()

    def display_changes_df(self, changes, title):
        """Show ``changes`` in a grid, newest version first, then reason and bbl."""
        ordered = changes.sort_values(
            by=["version", "reason", "bbl"], ascending=[False, True, True]
        )

        AgGrid(data=ordered, key=f"display_changes_df_{title}")


class AppliedChangesSection(ChangesSection):
    """Report section for the changes that were successfully applied."""

    def __call__(self):
        """Render the subheader, the explanation, then the figures or a notice."""
        st.subheader("Manual Changes Applied", anchor="changes-applied")
        # The explanation refers to "the graph and table below", so it has to
        # be emitted before the figures (same order as the not-applied section).
        st.markdown(
            """
For each record in the PLUTO Changes table, PLUTO attempts to change a record to the New Value column by matching on the BBL and the
Old Value column. The graph and table below outline the records in the pluto changes table that were successfully applied to PLUTO.
"""
        )

        if self.changes.empty:
            st.info(f"No Changes introduced in {self.version_text} were applied.")
        else:
            title_text = (
                f"Applied Manual Changes introduced in {self.version_text} by Field"
            )
            self.display_changes_figures(self.changes, title_text)


class NotAppliedChangesSection(ChangesSection):
    """Report section for the changes that could not be applied."""

    def __call__(self):
        """Render the subheader and explanation, then the figures or a notice."""
        st.subheader("Manual Changes Not Applied", anchor="changes-not-applied")
        st.markdown(
            """
For each record in the PLUTO Changes table, PLUTO attempts to change a record by matching on the BBL and the
Old Value column. As the underlying datasources change and improve, PLUTO records may no longer match the old value
specified in the pluto changes table. The graph and table below outline the records in the pluto changes table that failed to be applied for this reason.
"""
        )

        if not self.changes.empty:
            chart_title = (
                f"Manual Changes not Applied introduced in {self.version_text} by Field"
            )
            self.display_changes_figures(self.changes, chart_title)
            return

        # Nothing failed to apply for the selected version.
        st.info(f"All Changes introduced in {self.version_text} were applied.")
137 changes: 0 additions & 137 deletions src/pluto/components/corrections_report.py

This file was deleted.

Loading