From 4e683786d866dd908e14ebb238efa2547efcc8d6 Mon Sep 17 00:00:00 2001 From: John Turner Date: Fri, 6 Sep 2024 12:25:36 -0400 Subject: [PATCH] reworking to remove scripts from this repo and move to NIGMS-Sandbox --- .github/workflows/check-links.yaml | 31 +---- .github/workflows/check_links.py | 168 --------------------------- .github/workflows/lint.py | 53 --------- .github/workflows/notebook-lint.yaml | 37 +----- .github/workflows/requirements.txt | 1 - 5 files changed, 9 insertions(+), 281 deletions(-) delete mode 100644 .github/workflows/check_links.py delete mode 100644 .github/workflows/lint.py delete mode 100644 .github/workflows/requirements.txt diff --git a/.github/workflows/check-links.yaml b/.github/workflows/check-links.yaml index 20ba4b8..166bb1c 100644 --- a/.github/workflows/check-links.yaml +++ b/.github/workflows/check-links.yaml @@ -1,34 +1,13 @@ name: 'Check Links' on: - workflow_call: - inputs: - directory: - required: false - type: string - repo_link_ignore_list: - required: true - type: string - secrets: - PAT: - required: false push: pull_request: + workflow_dispatch: + jobs: link_check: name: 'Link Check' - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Link Check - run: | - echo ${{jobs.link_check.uses}} - - - name: Link Check - run: | - python3 ${{github.action_path}}/check_links.py - env: - LINK_IGNORE_LIST: https://www.sciencedirect.com,https://portlandpress.com - PAT: ${{ secrets.PAT }} \ No newline at end of file + uses: NIGMS/NIGMS-Sandbox/.github/workflows/check-links.yaml@main + with: + repo_link_ignore_list: "" \ No newline at end of file diff --git a/.github/workflows/check_links.py b/.github/workflows/check_links.py deleted file mode 100644 index 967fe7f..0000000 --- a/.github/workflows/check_links.py +++ /dev/null @@ -1,168 +0,0 @@ -import http.client -import urllib.request, urllib.error -import os -import sys -import re - - - -# set some default variables -remove_characters = ['**', '\\n'] - -# text that tends to be at the end of the url that we need truncate everything past them -end_characters = [')',",","'",'`',"\"",'','',"\\",">","]"] - -big_regex = re.compile('|'.join(map(re.escape, remove_characters))) - -# if there are any URLs to ignore add here -link_ignore_list = [] -link_ignore_list_env = os.getenv("LINK_IGNORE_LIST") -if link_ignore_list_env and len(link_ignore_list_env) > 0: - link_ignore_list = link_ignore_list_env.split(',') - -# Add any repo specific ignores -link_ignore_list_env_2 = os.getenv("inputs.repo_link_ignore_list") -if link_ignore_list_env_2 and len(link_ignore_list_env_2) > 0: - link_ignore_list.extend(link_ignore_list_env.split(',')) - -print_valid = os.getenv("print_valid_links") is not None - -# If we are given a directory then use it, otherwise assume path is current directory -path = "." -if len(sys.argv) >1 and os.path.exists(sys.argv[1]): - path = sys.argv[1] - -# directory environment overrides the system arguments and default. -directory_env = os.getenv("inputs.directory") -if directory_env and len(directory_env) > 0: - path = directory_env - -pat_env = os.getenv("INPUT_PAT") -if directory_env and len(directory_env) > 0: - path = directory_env - -# list which stores all links to check -links_to_check = [] -link_file_map = {} -# Get the response code of the url to see if it exists -def getResponseCode(url): - content = None - try: - req = urllib.request.Request(url, - headers={'User-Agent': 'Mozilla/5.0'}) - conn = urllib.request.urlopen(req) - # Only get HTML if we have a potential anchor link - if "#" in url and "pdf" not in url: - content = conn.read().decode("utf-8") - except urllib.error.HTTPError as e: - return [e.code, content] - except urllib.error.URLError as e: - return [404, content] - except http.client.InvalidURL: - return [200, content] - return [conn.getcode(), content] - -def clean_link(link): - if link.endswith("."): - link = link[:link.rfind(".")] - if link.endswith("'"): - link = link[:link.rfind("'")] - if link.endswith("\""): - link = link[:link.rfind("\"")] - link_stripped = big_regex.sub("", link.strip()) - for end_c in end_characters: - end_index = link_stripped.find(end_c) - if end_index != -1: - link_stripped = link_stripped[:end_index] - return link_stripped - -def add_link(loc,link): - # this is a command being ran so difficult to validate in this script, skip it - if '$(uname' in link: - return False - - # get just from the http portion if there was more in from of the string we grabbed - link = link[link.find("http"):] - - # if there is a period at the end, truncate to that period. Other periods may be valid - # strip various characters that may be in the string - link_stripped = clean_link(link) - while link_stripped != link: - link = link_stripped - link_stripped = clean_link(link) - - # add link to be checked - links_to_check.append(link_stripped) - - # store where the link is so we can fix it - link_file_map[link_stripped] = loc -def check_link(link): - # try and get the url, if its 404 or 500 then its invalid, let us know and trigger the error flag - code = getResponseCode(link) - loc =link_file_map[link] - if code[0] in [404, 403, 500]: - - # If the link failed, but we are ignoring it then just mention that - for ignored_link in link_ignore_list: - if ignored_link in link: - print( - loc + ", " + link + ", Ignored") - return False - - # print(file+" Code:"+str(code[0])+" Line "+str(line_num)+"("+str(char)+"):"+item_stripped) - print( - loc + ", " + link + ", Failed") - return True - - # check for missing anchors - elif "#" in link and \ - code[1] is not None \ - and 'href=\"' + link[link.find("#"):] + '\"' not in \ - code[1]: - print( - loc + ", " + link + ", Failed - Anchor") - # print(file + " Missing Anchor Line " + str( - # line_num) + "(" + str( - # char) + "):" + item_stripped) - elif print_valid: - print( - loc + ", " + link + ", Valid") - return True - - -if __name__ == "__main__": - err = 0 - print("Directory is "+path) - # Loop through all files in path - for root, dirs, files in os.walk(path): - for file in files: - # only read file that match template ( txt, md or python notebook) - if file.endswith(".md") or file.endswith(".txt") or file.endswith( - ".ipynb"): - - # get content and separate into lines and then separate by spaces - raw_content = open(os.path.join(root, file), "r").read() - content = raw_content.split("\n") - content = [x.split(" ") for x in content] - loc = os.path.join(root, file) - # have an incrementer for line number later export - for line in content: - for item in line: - - if "https://" in item or "http://" in item: - if "](" in item: - add_link(loc,item[item.find("]"):]) - # if we get any error then add it - if item[item.find("("):] == item[item.find("]"):]: - continue - add_link(loc,item[item.find("("):]) - else: - add_link(loc,item) - - for link in set(links_to_check): - # if we get any error then add to err variable - err = check_link(link) + err - # if the error is > 1 then set it to 1 to error as 1 - if err > 1: - err = 1 - exit(err) diff --git a/.github/workflows/lint.py b/.github/workflows/lint.py deleted file mode 100644 index 5808266..0000000 --- a/.github/workflows/lint.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import shutil -import nbformat -from nbformat.v4 import new_notebook - -def clean_notebook(file_path): - with open(file_path, 'r', encoding='utf-8') as f: - notebook = nbformat.read(f, as_version=4) - - # Clean cells - for cell in notebook.cells: - if 'outputs' in cell: - cell['outputs'] = [] - if 'execution_count' in cell: - cell['execution_count'] = None - if 'metadata' in cell: - cell['metadata'] = {} - - # Clean notebook metadata - if 'metadata' in notebook: - notebook['metadata'] = {} - - with open(file_path, 'w', encoding='utf-8') as f: - nbformat.write(notebook, f) - -def delete_checkpoints_dirs(root_dir): - # Walk through the directory tree - for dirpath, dirnames, filenames in os.walk(root_dir): - for dirname in dirnames: - # Check if the directory name is 'checkpoints' - if dirname == '.ipynb_checkpoints': - # Construct the full path to the directory - dir_to_delete = os.path.join(dirpath, dirname) - # Delete the directory - shutil.rmtree(dir_to_delete) - print(f'Deleted {dir_to_delete}') - print('Consider adding .ipynb_checkpoints to your .gitignore file!') - - -if __name__ == "__main__": - # Change this to the directory containing your notebooks - notebook_dir = '../../' - - for root, dirs, files in os.walk(notebook_dir): - for file in files: - if file.endswith('.ipynb'): - file_path = os.path.join(root, file) - clean_notebook(file_path) - print(f'Cleaned {file_path}') - - # Delete all 'checkpoints' directories - delete_checkpoints_dirs(notebook_dir) - diff --git a/.github/workflows/notebook-lint.yaml b/.github/workflows/notebook-lint.yaml index 4af3fd9..2038a72 100644 --- a/.github/workflows/notebook-lint.yaml +++ b/.github/workflows/notebook-lint.yaml @@ -1,11 +1,7 @@ name: 'Lint Notebook' on: - workflow_call: - inputs: - directory: - required: false - type: string push: + workflow_dispatch: permissions: contents: write id-token: write @@ -13,31 +9,6 @@ permissions: jobs: lint: name: 'Linting' - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install requirements.txt - run: | - python3 -m pip install --upgrade pip - pip3 install nbformat - - - name: Notebook Linting - working-directory: .github/workflows - run: | - python3 lint.py - - - name: Commit changes - uses: EndBug/add-and-commit@v9 - with: - author_name: github-action - author_email: cbiit-github-action@github.com - message: 'Github Action: Refresh stats' + uses: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main + with: + directory: . diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt deleted file mode 100644 index d0537a8..0000000 --- a/.github/workflows/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -nbformat==5.10.4