diff --git a/.github/workflows/update_contributors.yaml b/.github/workflows/update_contributors.yaml new file mode 100644 index 0000000..7cdc904 --- /dev/null +++ b/.github/workflows/update_contributors.yaml @@ -0,0 +1,14 @@ +on: + push: + branches: + - main + +jobs: + contrib-readme-job: + runs-on: ubuntu-latest + name: A job to automate contrib in readme + steps: + - name: Contribute List + uses: akhilmhdh/contributors-readme-action@v2.3.6 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0f5f2ec --- /dev/null +++ b/Makefile @@ -0,0 +1,3 @@ +lint: + black . + ruff check . --fix \ No newline at end of file diff --git a/README.md b/README.md index 8af2ab1..4574871 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,8 @@ First install the project using the installation instructions in docs/source/get ``` hist --help ``` + +## Contributors + + + diff --git a/pyproject.toml b/pyproject.toml index e0611e0..6035600 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ dependencies = [ dev = [ "black", # code auto-formatting "coverage==7.3.2", # testing - "isort", # code auto-formatting "mypy", # linting "pytest", # testing "ruff==0.0.292", # linting @@ -59,30 +58,36 @@ package-dir = {"" = "src"} [project.scripts] hist = "interviewkit.cli:app" - -# Isort -# ----- - -[tool.isort] -line_length = 99 -profile = "black" -default_section = "THIRDPARTY" -lines_after_imports = 2 - - -# Mypy +# Ruff # ---- -[tool.mypy] -files = "." 
+[tool.ruff]
+select = [
+    "E",  # pycodestyle
+    "F",  # pyflakes
+    "I",  # isort
+]
+ignore = [
+    "E501",  # line too long - black takes care of this for us
+]
 
-# Use strict defaults
-strict = true
-warn_unreachable = true
-warn_no_return = true
+[tool.ruff.per-file-ignores]
+# Allow unused imports in __init__ files, as these are convenience imports
+"**/__init__.py" = [ "F401" ]
+
+[tool.ruff.isort]
+lines-after-imports = 2
+section-order = [
+    "future",
+    "standard-library",
+    "third-party",
+    "first-party",
+    "project",
+    "local-folder",
+]
 
-[[tool.mypy.overrides]]
-# Don't require test functions to include types
-module = "tests.*"
-allow_untyped_defs = true
-disable_error_code = "attr-defined"
+[tool.ruff.isort.sections]
+"project" = [
+    "src",
+    "tests",
+]
diff --git a/src/interviewkit/cli.py b/src/interviewkit/cli.py
index 64102d5..d510d31 100644
--- a/src/interviewkit/cli.py
+++ b/src/interviewkit/cli.py
@@ -1,11 +1,11 @@
 import sys
-import typer
-
 from pathlib import Path
-from typing_extensions import Annotated
 
+import typer
+from questions import generate_questions_from_transcript
 from slicer import audio_slicing
 from transcript import transcribe_from_paths
+from typing_extensions import Annotated
 
 __version__ = '0.0.1'
 
@@ -41,6 +41,29 @@ def slice(
     """Slices an audio file into smaller audio files."""
     audio_slicing(source, start, duration)
 
+@app.command()
+def generate_questions(
+    source: Annotated[
+        Path,
+        typer.Argument(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+            resolve_path=True,
+            help="Source transcript file",
+        ),
+    ],
+    target: Path,
+):
+    """Generates questions from a transcript.
+
+    Reads the transcript text from SOURCE and writes the generated questions
+    to TARGET.
+    """
+    questions = generate_questions_from_transcript(source.read_text())
+    target.write_text(questions)
+
 
 @app.command()
 def transcribe(
diff --git a/src/interviewkit/interview.py b/src/interviewkit/interview.py
index bdab42e..f84a994 100644
--- a/src/interviewkit/interview.py
+++ b/src/interviewkit/interview.py
@@ -1,6 +1,6 @@
-from random import choice
-from interviewee import Interviewee
 from enum 
import StrEnum, auto
+
+from interviewee import Interviewee
 from transcript import Transcript
 
 
diff --git a/src/interviewkit/questions.py b/src/interviewkit/questions.py
new file mode 100644
index 0000000..743f458
--- /dev/null
+++ b/src/interviewkit/questions.py
@@ -0,0 +1,74 @@
+"""Generate interview questions from a transcript with a Clarifai-hosted model."""
+
+import os
+
+from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel
+from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
+from clarifai_grpc.grpc.api.status import status_code_pb2
+
+
+# Personal access token, read from the CLARIFAI_PAT environment variable so
+# the secret is never committed to the repository.
+CLARIFAI_PAT = os.environ.get('CLARIFAI_PAT', '')
+# Specify the correct user_id/app_id pairings
+# Since you're making inferences outside your app's scope
+CLARIFAI_USER_ID = 'meta'
+CLARIFAI_APP_ID = 'Llama-2'
+# Change these to whatever model and text URL you want to use
+CLARIFAI_MODEL_ID = 'llama2-70b-chat'
+CLARIFAI_MODEL_VERSION_ID = 'acba9c1995f8462390d7cb77d482810b'
+
+
+def generate_questions_from_transcript(transcript: str) -> str:
+    """Send a transcript to the configured Clarifai model and return its reply.
+
+    Args:
+        transcript: Transcript text passed to the model as its raw text input.
+
+    Returns:
+        The raw text of the first output in the model response.
+
+    Raises:
+        RuntimeError: If ``CLARIFAI_PAT`` is empty, the response status is not
+            ``SUCCESS``, or the response contains no outputs.
+    """
+    if not CLARIFAI_PAT:
+        raise RuntimeError(
+            "CLARIFAI_PAT is not set; export it before generating questions"
+        )
+
+    channel = ClarifaiChannel.get_grpc_channel()
+    stub = service_pb2_grpc.V2Stub(channel)
+
+    metadata = (('authorization', 'Key ' + CLARIFAI_PAT),)
+    user_app_id = resources_pb2.UserAppIDSet(
+        user_id=CLARIFAI_USER_ID, app_id=CLARIFAI_APP_ID
+    )
+
+    response = stub.PostModelOutputs(
+        service_pb2.PostModelOutputsRequest(
+            user_app_id=user_app_id,
+            model_id=CLARIFAI_MODEL_ID,
+            version_id=CLARIFAI_MODEL_VERSION_ID,
+            inputs=[
+                resources_pb2.Input(
+                    data=resources_pb2.Data(
+                        text=resources_pb2.Text(raw=transcript)
+                    )
+                )
+            ],
+        ),
+        metadata=metadata,
+    )
+
+    if response.status.code != status_code_pb2.SUCCESS:
+        print(response.status)
+        status = response.status.description
+        raise RuntimeError(f"Post model outputs failed, status: {status}")
+
+    # Check before indexing so an empty output list gives a clear error
+    # instead of an IndexError.
+    if not response.outputs:
+        raise RuntimeError("Post model outputs returned no outputs")
+
+    return response.outputs[0].data.text.raw
diff --git a/src/interviewkit/slicer.py b/src/interviewkit/slicer.py
index 15273b4..785747e 100644
--- 
a/src/interviewkit/slicer.py +++ b/src/interviewkit/slicer.py @@ -27,9 +27,10 @@ python interviewkit/slicer.py data/Martine+Barrat_FINAL.mp3 80:30 90:40 """ +import shutil import sys from pathlib import Path -import shutil + try: import pydub diff --git a/src/interviewkit/transcript.py b/src/interviewkit/transcript.py index 423e331..bfc1aaf 100644 --- a/src/interviewkit/transcript.py +++ b/src/interviewkit/transcript.py @@ -1,9 +1,6 @@ from pathlib import Path from rich.console import Console import sys -from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel -from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc -from clarifai_grpc.grpc.api.status import status_code_pb2 try: import whisper @@ -20,53 +17,6 @@ class Transcript(BaseModel): """The Transcript entity represents the transcript of an interview.""" content: str -# # Securely get your credentials -# PAT = os.getenv('CLARIFAI_PAT') -# USER_ID = os.getenv('CLARIFAI_USER_ID') -# APP_ID = os.getenv('CLARIFAI_APP_ID') -# MODEL_ID = os.getenv('CLARIFAI_MODEL_ID') -# MODEL_VERSION_ID = os.getenv('CLARIFAI_MODEL_VERSION_ID') -PAT = '' -# Specify the correct user_id/app_id pairings -# Since you're making inferences outside your app's scope -USER_ID = 'meta' -APP_ID = 'Llama-2' -# Change these to whatever model and text URL you want to use -MODEL_ID = 'llama2-70b-chat' -MODEL_VERSION_ID = 'acba9c1995f8462390d7cb77d482810b' - -def generate_questions(transcript_chunk): - channel = ClarifaiChannel.get_grpc_channel() - stub = service_pb2_grpc.V2Stub(channel) - - metadata = (('authorization', 'Key ' + PAT),) - userDataObject = resources_pb2.UserAppIDSet(user_id=USER_ID, app_id=APP_ID) - - post_model_outputs_response = stub.PostModelOutputs( - service_pb2.PostModelOutputsRequest( - user_app_id=userDataObject, - model_id=MODEL_ID, - version_id=MODEL_VERSION_ID, - inputs=[ - resources_pb2.Input( - data=resources_pb2.Data( - text=resources_pb2.Text( - raw=transcript_chunk - ) - ) - ) - ] - 
), - metadata=metadata - ) - - if post_model_outputs_response.status.code != status_code_pb2.SUCCESS: - print(post_model_outputs_response.status) - raise Exception(f"Post model outputs failed, status: {post_model_outputs_response.status.description}") - - output = post_model_outputs_response.outputs[0] - return output.data.text.raw - def transcribe_from_paths(source: Path, target: Path) -> None: console.print("Loading whisper base model...") model = whisper.load_model("base") @@ -89,20 +39,6 @@ def transcribe_from_paths(source: Path, target: Path) -> None: console.print("Transcript saved to:") console.print(f" [green bold]{target / source.name}.txt[/green bold]") - # Generate questions from the transcript - transcript_chunk = result['text'] # Assuming 'result' contains the transcribed text - # Debug: Print type and value of transcript_chunk - print(f"Type of transcript_chunk: {type(transcript_chunk)}") - print(f"Value of transcript_chunk: {transcript_chunk}") - - # Ensure transcript_chunk is a string - if not isinstance(transcript_chunk, str): - print("Warning: transcript_chunk is not a string. 
Trying to convert...") - transcript_chunk = str(transcript_chunk) - - questions = generate_questions(transcript_chunk) - console.print("Generated Questions:\n", questions) - if __name__ == "__main__": source = Path(sys.argv[1]) target = Path(sys.argv[2]) diff --git a/src/interviewkit/transcript_using_m5.py b/src/interviewkit/transcript_using_m5.py index b34288c..8771c5b 100644 --- a/src/interviewkit/transcript_using_m5.py +++ b/src/interviewkit/transcript_using_m5.py @@ -1,7 +1,9 @@ +import sys from pathlib import Path + from rich.console import Console -import sys -from transformers import T5Tokenizer, T5ForConditionalGeneration +from transformers import T5ForConditionalGeneration, T5Tokenizer + try: import whisper @@ -9,8 +11,9 @@ print("Please install Whisper: pip install openai-whisper") exit(1) -from whisper.utils import get_writer from pydantic import BaseModel +from whisper.utils import get_writer + console = Console()