From dd18a11cab8bfcd2c74233bb9b2fb6a0823ca5f0 Mon Sep 17 00:00:00 2001
From: chenxl
Date: Mon, 29 Jul 2024 11:51:28 +0000
Subject: [PATCH] [feature] support for pypi install

---
 README.md      |  44 ++++++------
 install.sh     |  14 +---
 pyproject.toml |  60 ++++++++++++++++
 setup.py       | 191 +++++++++++++++++++++++++------------------------
 4 files changed, 185 insertions(+), 124 deletions(-)

diff --git a/README.md b/README.md
index 87e0a4e..d34dffe 100644
--- a/README.md
+++ b/README.md
@@ -74,24 +74,37 @@ Some preparation:
   conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
   ```
- Download source code:
+- Make sure that PyTorch, packaging and ninja are installed
+  ```
+  pip install torch packaging ninja
+  ```
+
+

Installation

+You can install using PyPI:
+
+```
+pip install ktransformers --no-build-isolation
+```
+
+Or download source code and compile:
+ - init source code
   ```sh
   git clone https://github.com/kvcache-ai/ktransformers.git
   cd ktransformers
   git submodule init
   git submodule update
   ```
+ - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
+ - Compile and install
+   ```
+   bash install.sh
+   ```
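A quick sanity check after either install path is to query the installed package metadata. This is only an illustrative sketch, not part of this patch, and it assumes the install finished without errors:

```python
# Post-install sanity check (illustrative, not part of this patch).
# Prints the ktransformers version string recorded in the installed package metadata.
from importlib.metadata import PackageNotFoundError, version

try:
    print("ktransformers", version("ktransformers"))
except PackageNotFoundError:
    print("ktransformers is not installed in this environment")
```

If you need to force a from-source build even when a matching prebuilt wheel exists, setup.py in this patch checks the `KTRANSFORMERS_FORCE_BUILD` environment variable, so `KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation` rebuilds locally.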

Local Chat

We provide a simple command-line local chat Python script that you can run for testing.

> Note that this is a very simple test tool that only supports one round of chat without any memory of the previous input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.

-

Install

-
-```sh
-bash install.sh
-```

Run Example

@@ -109,11 +122,11 @@ wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepS
 cd .. # Move to repo's root dir
 # Start local chat
-python ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
-# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
 ```
@@ -154,7 +167,7 @@ wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2
 cd ..
-python ktransformers/local_chat.py --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
+python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
@@ -172,11 +185,11 @@ wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/De
 cd ..
-python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
-# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
 ```
 | model name | weights download link |
@@ -193,15 +206,6 @@ python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-062
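If wget cannot reach huggingface.co (the “OSError: We couldn't connect” case noted above), the GGUF files can also be fetched from Python. This is an illustrative alternative, not part of the patch, and it assumes the `huggingface_hub` package is installed:

```python
# Alternative GGUF download (illustrative sketch, not part of this patch).
# Assumes `pip install huggingface_hub`; the target directory matches the
# --gguf_path used in the local_chat commands above.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="mzwing/DeepSeek-V2-Lite-Chat-GGUF",  # example repo from the README
    local_dir="DeepSeek-V2-Lite-Chat-GGUF",
    allow_patterns=["*.gguf"],                    # fetch only the GGUF weight files
)
```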

RESTful API and Web UI

-

Install

- -[Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```pip install .``` - -Install ktransformers with source. -``` -pip install -r requirements-local_chat.txt -pip install . --no-build-isolation -``` Start without website: diff --git a/install.sh b/install.sh index d8cceef..fa5ba18 100644 --- a/install.sh +++ b/install.sh @@ -10,16 +10,6 @@ rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info echo "Installing python dependencies from requirements.txt" pip install -r requirements-local_chat.txt -echo "Installing ktransformers cpuinfer" -mkdir -p ktransformers/ktransformers_ext/build -cd ktransformers/ktransformers_ext/build -cmake .. -cmake --build . --config Release - -echo "Installing ktransformers gpu kernel, this may take for a while, please wait" -sleep 3 - -cd ../cuda -python setup.py install -cd ../../.. +echo "Installing ktransformers" +pip install . --no-build-isolation echo "Installation completed successfully" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3378ef0..0bbef99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,3 +6,63 @@ requires = [ "packaging" ] build-backend = "setuptools.build_meta" + +[project] + +name = "ktransformers" + +dynamic = ["version"] + +dependencies = [ + "torch >= 2.3.0", + "transformers == 4.43.2", + "fastapi >= 0.111.0", + "langchain >= 0.2.0", + "blessed >= 1.20.0", + "accelerate >= 0.31.0", + "sentencepiece >= 0.1.97", + "setuptools", + "ninja", + "wheel", + "colorlog", + "build", + "fire" +] + +requires-python = ">=3.11" + +authors = [ + {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"} +] + +maintainers = [ + {name = "james0zan", email = "zhang.mingxing@outlook.com"}, + {name = "awake", email = "awake@approaching.ai"}, + {name = "unicorn chan", email = "nl@approaching.ai"} +] + +description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies." + +readme = "README.md" +license = {file = "LICENSE"} + +keywords = ["ktransformers", "llm"] + +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] + +[project.urls] +Homepage = "https://kvcache.ai" +Repository = "https://github.com/kvcache-ai/ktransformers.git" +Issues = "https://github.com/kvcache-ai/ktransformers/issues" + + +[project.scripts] +ktransformers = "ktransformers.server.main:main" + +[tool.setuptools.packages.find] +where = ["./", ] +include = ["ktransformers"] \ No newline at end of file diff --git a/setup.py b/setup.py index 5219f7b..38ee098 100644 --- a/setup.py +++ b/setup.py @@ -3,44 +3,54 @@ ''' Description : Author : chenxl -Date : 2024-07-12 07:25:42 +Date : 2024-07-27 16:15:27 Version : 1.0.0 LastEditors : chenxl -LastEditTime : 2024-07-27 04:31:03 +LastEditTime : 2024-07-29 09:40:24 +Adapted from: +https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py +Copyright (c) 2023, Tri Dao. +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
''' + import os -import shutil import sys import re import ast import subprocess import platform -import io +import urllib.request +import urllib.error from pathlib import Path from packaging.version import parse import torch.version from wheel.bdist_wheel import bdist_wheel as _bdist_wheel from setuptools import setup, Extension -import torch from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME -ROOT_DIR = os.path.dirname(__file__) + class VersionInfo: THIS_DIR = os.path.dirname(os.path.abspath(__file__)) PACKAGE_NAME = "ktransformers" + BASE_WHEEL_URL:str = ( + "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}" + ) + FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE" + def get_cuda_bare_metal_version(self, cuda_dir): - raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) output = raw_output.split() release_idx = output.index("release") + 1 bare_metal_version = parse(output[release_idx].split(",")[0]) cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}" return cuda_version - + def get_cuda_version_of_torch(self,): torch_cuda_version = parse(torch.version.cuda) cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" return cuda_version - + def get_platform(self,): """ Returns the platform name as used in wheel filenames. @@ -49,13 +59,13 @@ def get_platform(self,): return f'linux_{platform.uname().machine}' else: raise ValueError("Unsupported platform: {}".format(sys.platform)) - + def get_cpu_instruct(self,): if sys.platform.startswith("linux"): - with open('/proc/cpuinfo', 'r') as cpu_f: + with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f: cpuinfo = cpu_f.read() - - flags_line = [line for line in cpuinfo.split('\n') if line.startswith('flags')][0] + flags_line = [line for line in cpuinfo.split( + '\n') if line.startswith('flags')][0] flags = flags_line.split(':')[1].strip().split(' ') for flag in flags: if 'avx512' in flag: @@ -63,38 +73,70 @@ def get_cpu_instruct(self,): for flag in flags: if 'avx2' in flag: return 'avx2' - raise ValueError("Unsupported cpu Instructions: {}".format(flags_line)) - + raise ValueError( + "Unsupported cpu Instructions: {}".format(flags_line)) + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + def get_torch_version(self,): torch_version_raw = parse(torch.__version__) torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}" return torch_version - def get_package_version(self,): - version_file = os.path.join(Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py") + def get_flash_version(self,): + version_file = os.path.join( + Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py") with open(version_file, "r", encoding="utf-8") as f: - version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) - public_version = ast.literal_eval(version_match.group(1)) - package_version = f"{str(public_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}" + version_match = re.search( + r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + flash_version = ast.literal_eval(version_match.group(1)) + return flash_version + + def get_package_version(self, full_version=False): + flash_version = self.get_flash_version() + package_version = 
f"{str(flash_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}" + if full_version: + return package_version + if not VersionInfo.FORCE_BUILD: + return str(flash_version) return package_version - + class BuildWheelsCommand(_bdist_wheel): def get_wheel_name(self,): version_info = VersionInfo() + package_version = version_info.get_package_version(full_version=True) + flash_version = version_info.get_flash_version() python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" - wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{version_info.get_package_version()}-{python_version}-{python_version}-{version_info.get_platform()}.whl" - return wheel_filename - - + wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{package_version}-{python_version}-{python_version}-{version_info.get_platform()}.whl" + wheel_url = VersionInfo.BASE_WHEEL_URL.format(tag_name=f"v{flash_version}", wheel_filename=wheel_filename) + return wheel_filename, wheel_url + + def run(self): - super().run() - impl_tag, abi_tag, plat_tag = self.get_tag() - archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" - wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") - wheel_name_with_platform = os.path.join(self.dist_dir, self.get_wheel_name()) - os.rename(wheel_path, wheel_name_with_platform) - + if VersionInfo.FORCE_BUILD: + super().run() + wheel_filename, wheel_url = self.get_wheel_name() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + os.rename(wheel_filename, wheel_path) + except (urllib.error.HTTPError, urllib.error.URLError): + print("Precompiled wheel not found. 
Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + # Convert distutils Windows platform specifiers to CMake -A arguments PLAT_TO_CMAKE = { @@ -104,22 +146,17 @@ def run(self): "win-arm64": "ARM64", } -class CopyExtension(Extension): - def __init__(self, name: str, sourcedir: str = "", copy_file_source="") -> None: - super().__init__(name, sources=[]) - self.sourcedir = os.fspath(Path(sourcedir).resolve()) - self.source_file = copy_file_source + class CMakeExtension(Extension): def __init__(self, name: str, sourcedir: str = "") -> None: super().__init__(name, sources=[]) - self.sourcedir = os.fspath(Path(sourcedir).resolve() / "ktransformers/ktransformers_ext") + self.sourcedir = os.fspath( + Path(sourcedir).resolve() / "ktransformers" / "ktransformers_ext") + + class CMakeBuild(BuildExtension): + def build_extension(self, ext) -> None: - if isinstance(ext, CopyExtension): - ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name) - extdir = ext_fullpath.parent.resolve() - shutil.copy(ext.source_file, extdir) - return if not isinstance(ext, CMakeExtension): super().build_extension(ext) return @@ -129,7 +166,8 @@ def build_extension(self, ext) -> None: # Using this requires trailing slash for auto-detection & inclusion of # auxiliary "native" libs - debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + debug = int(os.environ.get("DEBUG", 0) + ) if self.debug is None else self.debug cfg = "Debug" if debug else "Release" # CMake lets you override the generator - we need to check this. @@ -146,10 +184,12 @@ def build_extension(self, ext) -> None: ] build_args = [] if "CMAKE_ARGS" in os.environ: - cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + cmake_args += [ + item for item in os.environ["CMAKE_ARGS"].split(" ") if item] # In this example, we pass in the version to C++. You might not need to. 
- cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"] + cmake_args += [ + f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"] if self.compiler.compiler_type != "msvc": if not cmake_generator or cmake_generator == "Ninja": try: @@ -165,7 +205,8 @@ def build_extension(self, ext) -> None: else: # Single config generators are handled "normally" - single_config = any(x in cmake_generator for x in {"NMake", "Ninja"}) + single_config = any( + x in cmake_generator for x in {"NMake", "Ninja"}) # CMake allows an arch-in-generator style for backward compatibility contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) @@ -183,7 +224,8 @@ def build_extension(self, ext) -> None: # Cross-compile support for macOS - respect ARCHFLAGS if set archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) if archs: - cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] + cmake_args += [ + "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: if hasattr(self, "parallel") and self.parallel: @@ -199,51 +241,16 @@ def build_extension(self, ext) -> None: ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True ) -def read_readme() -> str: - p = os.path.join(ROOT_DIR, "README.md") - if os.path.isfile(p): - return io.open(p, "r", encoding="utf-8").read() - else: - return "" setup( - name="ktransformers", version=VersionInfo().get_package_version(), - author="KVCache.ai", - license="Apache 2.0", - description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies.", - long_description=read_readme(), - long_description_content_type="text/markdown", - cmdclass={"build_ext": CMakeBuild}, - install_requires = [ - "torch >= 2.3.0", - "transformers == 4.43.2", - "fastapi >= 0.111.0", - "langchain >= 0.2.0", - "blessed >= 1.20.0", - "accelerate >= 0.31.0", - "sentencepiece >= 0.1.97", - "setuptools", - "ninja", - "wheel", - "colorlog", - "build", - "packaging", - "fire" - ], - python_requires=">=3.10", - entry_points={ - "console_scripts": [ - "ktransformers=ktransformers.server.main:main", - ], - }, - packages=["ktransformers"], - include_package_data=True, + cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, ext_modules=[ - CUDAExtension('KTransformersOps', [ - 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', - 'ktransformers/ktransformers_ext/cuda/binding.cpp', - 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu', - ]), - CMakeExtension("cpuinfer_ext")] -) \ No newline at end of file + CMakeExtension("cpuinfer_ext"), + CUDAExtension('KTransformersOps', [ + 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', + 'ktransformers/ktransformers_ext/cuda/binding.cpp', + 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu' + ]) + ] +)
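For readers skimming the setup.py changes above: the release-wheel lookup performed by `BuildWheelsCommand` boils down to formatting a version- and platform-specific filename plus download URL, then falling back to a source build if the download fails. The snippet below reproduces only that naming scheme; the concrete values (0.1.0, cu121, torch23, avx2, cp311, linux_x86_64) are placeholders for illustration, not values taken from this patch:

```python
# Sketch of the wheel naming/URL scheme used by BuildWheelsCommand (placeholder values).
BASE_WHEEL_URL = "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}"

package_name = "ktransformers"
base_version = "0.1.0"                              # setup.py reads this from ktransformers/__init__.py
local_version = f"{base_version}+cu121torch23avx2"  # +cu{CUDA}torch{torch version}{CPU instruction set}
python_tag = "cp311"                                # cp{major}{minor} of the running interpreter
platform_tag = "linux_x86_64"                       # from platform.uname().machine on Linux

wheel_filename = f"{package_name}-{local_version}-{python_tag}-{python_tag}-{platform_tag}.whl"
wheel_url = BASE_WHEEL_URL.format(tag_name=f"v{base_version}", wheel_filename=wheel_filename)

print(wheel_filename)  # ktransformers-0.1.0+cu121torch23avx2-cp311-cp311-linux_x86_64.whl
print(wheel_url)       # if this URL is not found, setup.py builds from source instead
```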