diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0a9ab50 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +.github/ +docs/ +.DS_Store +.gitignore +.project +.pydevproject +CHANGELOG.md +CODE_OF_CONDUCT.md +CONTRIBUTING.md +docker-compose.test.yml +LICENSE +Makefile +PULL_REQUEST_TEMPLATE.md +README.md diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..36bf3ae --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,5 @@ +# Default code owner + +* @Senzing/senzing-community + +/.github/ @Senzing/senzing-devsecops diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..78367af --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,17 @@ +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "daily" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/linters/.checkov.yaml b/.github/linters/.checkov.yaml new file mode 100644 index 0000000..e2d7c03 --- /dev/null +++ b/.github/linters/.checkov.yaml @@ -0,0 +1,2 @@ +quiet: true +skip-check: CKV_DOCKER_7 diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json new file mode 100644 index 0000000..8665357 --- /dev/null +++ b/.github/linters/.jscpd.json @@ -0,0 +1,3 @@ +{ + "threshold": 32 +} \ No newline at end of file diff --git a/.github/senzing-corporate-contributor-license-agreement.pdf b/.github/senzing-corporate-contributor-license-agreement.pdf new file mode 100644 index 0000000..dac5edb Binary files /dev/null and b/.github/senzing-corporate-contributor-license-agreement.pdf differ diff --git a/.github/senzing-individual-contributor-license-agreement.pdf 
b/.github/senzing-individual-contributor-license-agreement.pdf new file mode 100644 index 0000000..12ea6bd Binary files /dev/null and b/.github/senzing-individual-contributor-license-agreement.pdf differ diff --git a/.github/workflows/add-labels-standardized.yaml b/.github/workflows/add-labels-standardized.yaml new file mode 100644 index 0000000..01aa8a1 --- /dev/null +++ b/.github/workflows/add-labels-standardized.yaml @@ -0,0 +1,17 @@ +name: add labels standardized + +on: + issues: + types: + - opened + - reopened + +permissions: + issues: write + +jobs: + add-issue-labels: + secrets: + ORG_MEMBERSHIP_TOKEN: ${{ secrets.ORG_MEMBERSHIP_TOKEN }} + SENZING_MEMBERS: ${{ secrets.SENZING_MEMBERS }} + uses: senzing-factory/build-resources/.github/workflows/add-labels-to-issue.yaml@v2 diff --git a/.github/workflows/add-to-project-senzing-dependabot.yaml b/.github/workflows/add-to-project-senzing-dependabot.yaml new file mode 100644 index 0000000..8097fb4 --- /dev/null +++ b/.github/workflows/add-to-project-senzing-dependabot.yaml @@ -0,0 +1,16 @@ +name: add to project senzing github organization dependabot + +on: + pull_request: + branches: [main] + +permissions: + repository-projects: write + +jobs: + add-to-project-dependabot: + secrets: + SENZING_GITHUB_PROJECT_RW_TOKEN: ${{ secrets.SENZING_GITHUB_PROJECT_RW_TOKEN }} + uses: senzing-factory/build-resources/.github/workflows/add-to-project-dependabot.yaml@v2 + with: + project: ${{ vars.SENZING_GITHUB_ORGANIZATION_PROJECT }} diff --git a/.github/workflows/add-to-project-senzing.yaml b/.github/workflows/add-to-project-senzing.yaml new file mode 100644 index 0000000..0b015c9 --- /dev/null +++ b/.github/workflows/add-to-project-senzing.yaml @@ -0,0 +1,20 @@ +name: add to project senzing github organization + +on: + issues: + types: + - opened + - reopened + +permissions: + repository-projects: write + +jobs: + add-to-project: + secrets: + SENZING_GITHUB_PROJECT_RW_TOKEN: ${{ secrets.SENZING_GITHUB_PROJECT_RW_TOKEN }} + 
uses: senzing-factory/build-resources/.github/workflows/add-to-project.yaml@v2 + with: + classic: false + project-number: ${{ vars.SENZING_GITHUB_ORGANIZATION_PROJECT }} + org: ${{ vars.SENZING_GITHUB_ACCOUNT_NAME }} diff --git a/.github/workflows/lint-workflows.yaml b/.github/workflows/lint-workflows.yaml new file mode 100644 index 0000000..c471330 --- /dev/null +++ b/.github/workflows/lint-workflows.yaml @@ -0,0 +1,17 @@ +name: lint workflows + +on: + push: + branches-ignore: [main] + pull_request: + branches: [main] + +permissions: + contents: read + packages: read + pull-requests: read + statuses: write + +jobs: + lint-workflows: + uses: senzing-factory/build-resources/.github/workflows/lint-workflows.yaml@v2 diff --git a/.github/workflows/move-pr-to-done-dependabot.yaml b/.github/workflows/move-pr-to-done-dependabot.yaml new file mode 100644 index 0000000..72b1f5f --- /dev/null +++ b/.github/workflows/move-pr-to-done-dependabot.yaml @@ -0,0 +1,17 @@ +name: move pr to done dependabot + +on: + pull_request: + branches: [main] + types: [closed] + +permissions: + repository-projects: write + +jobs: + move-pr-to-done-dependabot: + secrets: + SENZING_GITHUB_PROJECT_RW_TOKEN: ${{ secrets.SENZING_GITHUB_PROJECT_RW_TOKEN }} + uses: senzing-factory/build-resources/.github/workflows/move-pr-to-done-dependabot.yaml@v2 + with: + project: ${{ vars.SENZING_GITHUB_ORGANIZATION_PROJECT }} diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml new file mode 100644 index 0000000..d56aab9 --- /dev/null +++ b/.github/workflows/pylint.yaml @@ -0,0 +1,31 @@ +name: pylint + +on: [push] + +permissions: + contents: read + +jobs: + pylint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: install dependencies + run: | + python -m 
pip install --upgrade pip + pip install pylint + + - name: analysing the code with pylint + run: | + # shellcheck disable=SC2046 + pylint $(git ls-files '*.py') diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/.project b/.project new file mode 100644 index 0000000..6e2475c --- /dev/null +++ b/.project @@ -0,0 +1,4 @@ + + + code-snippets-v4 + diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..eaca2ae --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,16 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog], [markdownlint], +and this project adheres to [Semantic Versioning]. + +## [1.0.0] - 2024-11-28 + +### Added to 1.0.0 + +- Initial + +[Keep a Changelog]: https://keepachangelog.com/en/1.0.0/ +[markdownlint]: https://dlaa.me/markdownlint/ +[Semantic Versioning]: https://semver.org/spec/v2.0.0.html diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f6f9f91 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at support@senzing.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d4b0a76 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,124 @@ +# Contributing + +Welcome to the project! + +We encourage contribution in a manner consistent with the [Code of Conduct](CODE_OF_CONDUCT.md). +The following will guide you through the process. + +There are a number of ways you can contribute: + +1. [Asking questions](#questions) +1. [Requesting features](#feature-requests) +1. [Reporting bugs](#bug-reporting) +1. [Contributing code or documentation](#contributing-code-or-documentation) + +## License Agreements + +If your contribution modifies the git repository, the following agreements must be established. + +*Note:* License agreements are only needed for adding, modifying, and deleting artifacts kept within the repository. +In simple terms, license agreements are needed before pull requests can be accepted. +A license agreement is not needed for submitting feature request, bug reporting, or other project management. 
+ +### Individual Contributor License Agreement + +In order to contribute to this repository, an +[Individual Contributor License Agreement (ICLA)](.github/senzing-individual-contributor-license-agreement.pdf) +must be completed, submitted and accepted. + +### Corporate Contributor License Agreement + +If the contribution to this repository is on behalf of a company, a +[Corporate Contributor License Agreement (CCLA)](.github/senzing-corporate-contributor-license-agreement.pdf) +must also be completed, submitted and accepted. + +### Project License Agreement + +The license agreement for this repository is stated in the +[LICENSE](LICENSE) file. + +## Questions + +Please do not use the GitHub issue tracker to submit questions. + +TODO: Instead, use ??? + +1. ??? Slack ??? +1. ??? stackoverflow.com ??? + +## Feature Requests + +All feature requests are "GitHub issues". +To request a feature, create a +[GitHub issue](https://help.github.com/articles/creating-an-issue/) +in this repository. + +When creating an issue, there will be a choice to create a "Bug report" or a "Feature request". +Choose "Feature request". + +## Bug Reporting + +All bug reports are "GitHub issues". +Before reporting on a bug, check to see if it has +[already been reported](https://github.com/search?q=+is%3Aissue+user%3Asenzing). +To report a bug, create a +[GitHub issue](https://help.github.com/articles/creating-an-issue/) +in this repository. + +When creating an issue, there will be a choice to create a "Bug report" or a "Feature request". +Choose "Bug report". + +## Contributing code or documentation + +To contribute code or documentation to the repository, you must have +[License Agreements](#license-agreements) in place. +This needs to be complete before a [Pull Request](#pull-requests) can be accepted. + +### Setting up a development environment + +#### Set Environment variables + +These variables may be modified, but do not need to be modified. 
+The variables are used throughout the installation procedure. + +```console +export GIT_ACCOUNT=senzing +export GIT_REPOSITORY=code-snippets-v4 +``` + +Synthesize environment variables. + +```console +export GIT_ACCOUNT_DIR=~/${GIT_ACCOUNT}.git +export GIT_REPOSITORY_DIR="${GIT_ACCOUNT_DIR}/${GIT_REPOSITORY}" +export GIT_REPOSITORY_URL="git@github.com:${GIT_ACCOUNT}/${GIT_REPOSITORY}.git" +``` + +#### Clone repository + +Get repository. + +```console +mkdir --parents ${GIT_ACCOUNT_DIR} +cd ${GIT_ACCOUNT_DIR} +git clone ${GIT_REPOSITORY_URL} +cd ${GIT_REPOSITORY_DIR} +``` + +### Coding conventions + +TODO: + +### Testing + +TODO: + +### Pull Requests + +Code in the main branch is modified via GitHub pull request. +Follow GitHub's +[Creating a pull request from a branch](https://help.github.com/articles/creating-a-pull-request/) +or +[Creating a pull request from a fork](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) instructions. + +Accepting pull requests will be at the discretion of Senzing, Inc. and the repository owner(s). diff --git a/README.md b/README.md new file mode 100644 index 0000000..5ada3ee --- /dev/null +++ b/README.md @@ -0,0 +1,203 @@ +# code-snippets-v4 + +## Overview + +Succinct examples of how you might use the Senzing APIs for operational tasks. + +## Contents + +1. [Legend](#legend) +1. [Warning](#warning) +1. [Senzing Engine Configuration](#senzing-engine-configuration) +1. [Senzing APIs Bare Metal Usage](#senzing-apis-bare-metal-usage) + 1. [Configuration](#configuration) + 2. [Usage](#usage) +1. [Docker Usage](#docker-usage) + 1. [Configuration](#configuration-1) + 2. [Usage](#usage-1) +1. [Items of Note](#items-of-note) + 1. [With Info](#with-info) + 2. [Parallel Processing](#parallel-processing) + 3. [Scalability](#scalability) + 4. [Randomize Input Data](#randomize-input-data) + 5. [Purging Senzing Repository Between Examples](#purging-senzing-repository-between-examples) + 6. 
[Input Load File Sizes](#input-load-file-sizes) + +### Legend + +1. :thinking: - A "thinker" icon means that a little extra thinking may be required. + Perhaps there are some choices to be made. + Perhaps it's an optional step. +1. :pencil2: - A "pencil" icon means that the instructions may need modification before performing. +1. :warning: - A "warning" icon means that something tricky is happening, so pay attention. + +## Warning + +:warning::warning::warning: **Only run the code snippets against a test Senzing database instance.** Running the snippets adds and deletes data, and some snippets purge the entire database of currently ingested data. It is recommended to create a separate test Senzing project if you are using a bare metal Senzing install or, if using Docker, a separate Senzing database to use only with the snippets. If you are getting started and are unsure, please contact [Senzing Support](https://senzing.zendesk.com/hc/en-us/requests/new). :warning::warning::warning: + +## Senzing Engine Configuration + +A JSON configuration string is used by the snippets to specify initialization parameters to the Senzing engine: + +```json +{ + "PIPELINE": { + "SUPPORTPATH": "/home/senzing/mysenzproj1/data", + "CONFIGPATH": "/home/senzing/mysenzproj1/etc", + "RESOURCEPATH": "/home/senzing/mysenzproj1/resources" + }, + "SQL": { + "CONNECTION": "postgresql://user:password@host:5432:g2" + } +} +``` + +The JSON configuration string is set via the environment variable `SENZING_ENGINE_CONFIGURATION_JSON`. + +## Senzing APIs Bare Metal Usage + +You may already have installed the Senzing APIs and created a Senzing project by following the [Quickstart Guide](https://senzing.zendesk.com/hc/en-us/articles/115002408867-Quickstart-Guide). If not, and you would like to install the Senzing APIs directly on a machine, follow the steps in the [Quickstart Guide](https://senzing.zendesk.com/hc/en-us/articles/115002408867-Quickstart-Guide). 
Be sure to review the API [Quickstart Roadmap](https://senzing.zendesk.com/hc/en-us/articles/115001579954-API-Quickstart-Roadmap), especially the [System Requirements](https://senzing.zendesk.com/hc/en-us/articles/115010259947). + +### Configuration + +When using a bare metal install, the initialization parameters used by the Senzing Python utilities are maintained within `<project_path>/etc/G2Module.ini`. + +🤔 To convert an existing Senzing project G2Module.ini file to a JSON string, use one of the following methods: + +- [G2ModuleIniToJson.py](Python/Tasks/Initialization/) + + - Modify the path to your project's G2Module.ini file. + +- [jc](https://github.com/kellyjonbrazil/jc) + - ```console + cat <project_path>/etc/G2Module.ini | jc --ini + ``` +- Python one liner + - ```python + python3 -c $'import configparser; ini_file_name = "<project_path>/etc/G2Module.ini";engine_config_json = {};cfgp = configparser.ConfigParser();cfgp.optionxform = str;cfgp.read(ini_file_name)\nfor section in cfgp.sections(): engine_config_json[section] = dict(cfgp.items(section))\nprint(engine_config_json)' + ``` +- [SenzingGo.py](https://github.com/Senzing/senzinggo) + - ```console + <project_path>/python/SenzingGo.py --iniToJson + ``` + +:pencil2: `<project_path>` in the above example should point to your project. + +### Usage + +1. Clone this repository +2. Export the engine configuration obtained for your project from [Configuration](#configuration), e.g., + +```console + export SENZING_ENGINE_CONFIGURATION_JSON='{"PIPELINE": {"SUPPORTPATH": "<project_path>/data", "CONFIGPATH": "<project_path>/etc", "RESOURCEPATH": "<project_path>/resources"}, "SQL": {"CONNECTION": "postgresql://user:password@host:5432:g2"}}' +``` + +3. Source the Senzing project setupEnv file + +```console +source <project_path>/setupEnv +``` + +4. Run code snippets + +:pencil2: `<project_path>` in the above examples should point to your project. + +## Docker Usage + +The included Dockerfile leverages the [Senzing API runtime](https://github.com/Senzing/senzingapi-runtime) image to provide an environment to run the code snippets. 
+ +### Configuration + +When used with a container, the JSON configuration is relative to the paths within the container. The JSON configuration should look like: + +```json +{ + "PIPELINE": { + "CONFIGPATH": "/etc/opt/senzing", + "RESOURCEPATH": "/opt/senzing/g2/resources", + "SUPPORTPATH": "/opt/senzing/data" + }, + "SQL": { + "CONNECTION": "postgresql://senzing:password@myhost:5432:g2" + } +} +``` + +✏️ You only need to modify the `CONNECTION` string to point to your Senzing database. + +### Usage + +1. Clone this repository +2. Export the engine configuration environment variable + +```console + export SENZING_ENGINE_CONFIGURATION_JSON='{"PIPELINE": {"CONFIGPATH": "/etc/opt/senzing", "RESOURCEPATH": "/opt/senzing/g2/resources", "SUPPORTPATH": "/opt/senzing/data"}, "SQL": {"CONNECTION": "postgresql://user:password@host:5432:g2"}}' +``` + +3. Build the Docker image + +```console +cd code-snippets-v4 +docker build --tag senzing/code-snippets-v4 . +``` + +4. Run a container + +```console +docker run \ + --env SENZING_ENGINE_CONFIGURATION_JSON \ + --interactive \ + --tty \ + --rm \ + senzing/code-snippets-v4 +``` + +✏️ You only need to modify the `CONNECTION` string to point to your Senzing database. + +## Items of Note + +### With Info + +A feature of Senzing is the capability to pass changes from data manipulation API calls to downstream systems for analysis, consolidation and replication. Any API that can change the outcome of entity resolution has a "WithInfo" version of the API. For example, addRecord and addRecordWithInfo. The "WithInfo" version of the API returns a response message detailing any entities that were affected by the API. In the following example (from addRecordWithInfo) a single entity with the ID 7903 was affected. 
+ +```json +{ + "DATA_SOURCE": "TEST", + "RECORD_ID": "10945", + "AFFECTED_ENTITIES": [ + { + "ENTITY_ID": 7903, + "LENS_CODE": "DEFAULT" + } + ], + "INTERESTING_ENTITIES": [] +} +``` + +The AFFECTED_ENTITIES array contains a list of all entity IDs affected. Separate processes can query the affected entities and synchronize changes and information to downstream systems. For additional information see [Real-time replication and analytics](https://senzing.zendesk.com/hc/en-us/articles/4417768234131--Advanced-Real-time-replication-and-analytics). + +### Parallel Processing + +Many of the example tasks demonstrate concurrent execution with threads. The entity resolution process involves IO operations, so the use of concurrent processes and threads when calling the Senzing APIs provides scalability and performance. If using multiple processes, each process should have its own instance of a Senzing engine, for example G2Engine. Each engine object can support multiple threads. + +### Scalability + +Many of the examples demonstrate using multiple threads to utilize the resources available on the machine. When loading data into Senzing and looking to increase the load rate, loading (and other tasks) can be horizontally scaled by utilizing additional machines. + +If a single very large load file and 3 machines were available for performing data load, the file can be split into 3 with each machine running the sample code or your own application. Horizontal scaling such as this does require the Senzing database to have the capacity to accept the additional workload and not become the bottleneck. + +### Randomize Input Data + +When providing your own input file(s) to the snippets or your own applications and processing data manipulation tasks (adding, deleting, replacing), it is important to randomize the file(s) or other input methods when running multiple threads. 
If source records that pertain to the same entity are clustered together, multiple processes or threads could all be trying to work on the same entity concurrently. This causes contention and overhead resulting in slower performance. To prevent this contention always randomize input data. + +You may be able to randomize your input files during ETL and mapping the source data to the [Senzing Entity Specification](https://senzing.zendesk.com/hc/en-us/articles/231925448-Generic-Entity-Specification). Otherwise utilities such as [shuf](https://man7.org/linux/man-pages/man1/shuf.1.html) or [terashuf](https://github.com/alexandres/terashuf) for large files can be used. + +### Purging Senzing Repository Between Examples + +When trying out different examples you may notice consecutive tasks complete much faster than an initial run. For example, running a loading task for the first time without the data in the system will be representative of load rate. If the same example is subsequently run again without purging the system it will complete much faster. This is because Senzing knows the records already exist in the system and it skips them. + +To run the same example again and see representative performance, first [purge](Python/Tasks/Initialization/PurgeRepository.py) the Senzing repository of the loaded data. Some examples don't require purging between running them, an example would be the deleting examples that require data to be ingested first. See the usage notes for each task category for an overview of how to use the snippets. + +### Input Load File Sizes + +There are different sized load files within the [Data](Resources/Data/) path that can be used to decrease or increase the volume of data loaded depending on the specification of your hardware. The files are named loadx.json, where the x specifies the number of records in the file.