diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b1f21b3 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,74 @@ +name: QuantLLM CI/CD + +on: + push: + branches: [ main ] + tags: + - 'v*' + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev,test,gguf] + pip install pytest pytest-cov black isort + + - name: Check code formatting + run: | + black . --check + isort . --check-only + + - name: Run tests + run: | + pytest tests/ --cov=quantllm --cov-report=xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + fail_ci_if_error: true + + publish: + needs: test + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build package + run: python -m build + + - name: Publish to PyPI + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + twine check dist/* + twine upload dist/* \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..aac3c20 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,42 @@ +name: Documentation + +on: + push: + branches: [ main ] + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + pull_request: + branches: [ main ] + paths: + - 'docs/**' + +jobs: + docs: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[docs] + pip install sphinx sphinx-rtd-theme + + - name: Build documentation + run: | + cd docs + make html + + - name: Deploy to GitHub Pages + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./docs/_build/html \ No newline at end of file diff --git a/README.md b/README.md index bf635d5..927e5ca 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,72 @@ -# 🧠 QuantLLM: Lightweight Library for Quantized LLM Fine-Tuning and Deployment +# 🧠 QuantLLM: Efficient GGUF Model Quantization and Deployment [![PyPI Downloads](https://static.pepy.tech/badge/quantllm)](https://pepy.tech/projects/quantllm) PyPI - Version - ## 📌 Overview -**QuantLLM** is a Python library designed for developers, researchers, and teams who want to fine-tune and deploy large language models (LLMs) **efficiently** using **4-bit and 8-bit quantization** techniques. 
It provides a modular and flexible framework for: - -- **Loading and quantizing models** with advanced configurations -- **LoRA / QLoRA-based fine-tuning** with customizable parameters -- **Dataset management** with preprocessing and splitting -- **Training and evaluation** with comprehensive metrics -- **Model checkpointing** and versioning -- **Hugging Face Hub integration** for model sharing +**QuantLLM** is a Python library designed for efficient model quantization using the GGUF (GGML Universal Format) method. It provides a robust framework for converting and deploying large language models with minimal memory footprint and optimal performance. Key capabilities include: -The goal of QuantLLM is to **democratize LLM training**, especially in low-resource environments, while keeping the workflow intuitive, modular, and production-ready. +- **Memory-efficient GGUF quantization** with multiple precision options (2-bit to 8-bit) +- **Chunk-based processing** for handling large models +- **Comprehensive benchmarking** tools +- **Detailed progress tracking** with memory statistics +- **Easy model export** and deployment ## 🎯 Key Features | Feature | Description | |----------------------------------|-------------| -| ✅ Quantized Model Loading | Load HuggingFace models with various quantization techniques (including AWQ, GPTQ, GGUF) in 4-bit or 8-bit precision, featuring customizable settings. | -| ✅ Advanced Dataset Management | Load, preprocess, and split datasets with flexible configurations | -| ✅ LoRA / QLoRA Fine-Tuning | Memory-efficient fine-tuning with customizable LoRA parameters | -| ✅ Comprehensive Training | Advanced training loop with mixed precision, gradient accumulation, and early stopping | -| ✅ Model Evaluation | Flexible evaluation with custom metrics and batch processing | -| ✅ Checkpoint Management | Save, resume, and manage training checkpoints with versioning | -| ✅ Hub Integration | Push models and checkpoints to Hugging Face Hub with authentication | -| ✅ Configuration Management | YAML/JSON config support for reproducible experiments | -| ✅ Logging and Monitoring | Comprehensive logging and Weights & Biases integration | +| ✅ Multiple GGUF Types | Support for various GGUF quantization types (Q2_K to Q8_0) with different precision-size tradeoffs | +| ✅ Memory Optimization | Chunk-based processing and CPU offloading for efficient handling of large models | +| ✅ Progress Tracking | Detailed layer-wise progress with memory statistics and ETA | +| ✅ Benchmarking Tools | Comprehensive benchmarking suite for performance evaluation | +| ✅ Hardware Optimization | Automatic device selection and memory management | +| ✅ Easy Deployment | Simple conversion to GGUF format for deployment | +| ✅ Flexible Configuration | Customizable quantization parameters and processing options | ## 🚀 Getting Started ### Installation +Basic installation: ```bash pip install quantllm ``` +With GGUF support (recommended): +```bash +pip install quantllm[gguf] +``` + +### Quick Example + +```python +from quantllm import QuantLLM +from transformers import AutoTokenizer + +# Load tokenizer and prepare data +model_name = "facebook/opt-125m" +tokenizer = AutoTokenizer.from_pretrained(model_name) +calibration_text = ["Example text for calibration."] * 10 +calibration_data = tokenizer(calibration_text, return_tensors="pt", padding=True)["input_ids"] + +# Quantize model +quantized_model, benchmark_results = QuantLLM.quantize_from_pretrained( + model_name_or_path=model_name, + bits=4, # Quantization bits 
(2-8) + group_size=32, # Group size for quantization + quant_type="Q4_K_M", # GGUF quantization type + calibration_data=calibration_data, + benchmark=True, # Run benchmarks + benchmark_input_shape=(1, 32) +) + +# Save and convert to GGUF +QuantLLM.save_quantized_model(model=quantized_model, output_path="quantized_model") +QuantLLM.convert_to_gguf(model=quantized_model, output_path="model.gguf") +``` + For detailed usage examples and API documentation, please refer to our: - 📚 [Official Documentation](https://quantllm.readthedocs.io/) - 🎓 [Tutorials](https://quantllm.readthedocs.io/tutorials/) @@ -48,39 +76,41 @@ For detailed usage examples and API documentation, please refer to our: ### Minimum Requirements - **CPU**: 4+ cores -- **RAM**: 16GB -- **Storage**: 20GB free space -- **Python**: 3.8+ +- **RAM**: 16GB+ +- **Storage**: 10GB+ free space +- **Python**: 3.10+ -### Recommended Requirements +### Recommended for Large Models +- **CPU**: 8+ cores +- **RAM**: 32GB+ - **GPU**: NVIDIA GPU with 8GB+ VRAM -- **RAM**: 32GB -- **Storage**: 50GB+ SSD - **CUDA**: 11.7+ +- **Storage**: 20GB+ free space + +### GGUF Quantization Types -### Resource Usage Guidelines -| Model Size | 4-bit (GPU RAM) | 8-bit (GPU RAM) | CPU RAM (min) | -|------------|----------------|-----------------|---------------| -| 3B params | ~6GB | ~9GB | 16GB | -| 7B params | ~12GB | ~18GB | 32GB | -| 13B params | ~20GB | ~32GB | 64GB | -| 70B params | ~90GB | ~140GB | 256GB | +| Type | Bits | Description | Use Case | +|---------|------|-----------------------|-----------------------------| +| Q2_K | 2 | Extreme compression | Size-critical deployment | +| Q3_K_S | 3 | Small size | Limited storage | +| Q4_K_M | 4 | Balanced quality | General use | +| Q5_K_M | 5 | Higher quality | Quality-sensitive tasks | +| Q8_0 | 8 | Best quality | Accuracy-critical tasks | ## 🔄 Version Compatibility | QuantLLM | Python | PyTorch | Transformers | CUDA | |----------|--------|----------|--------------|-------| -| latest | ≥3.10 | ≥2.0.0 | ≥4.30.0 | ≥11.7 | +| 1.2.0 | ≥3.10 | ≥2.0.0 | ≥4.30.0 | ≥11.7 | ## 🗺 Roadmap -- [ ] Multi-GPU training support -- [ ] AutoML for hyperparameter tuning -- [ ] Integration of additional advanced quantization algorithms and techniques. 
-- [ ] Custom model architecture support -- [ ] Enhanced logging and visualization -- [ ] Model compression techniques -- [ ] Deployment optimizations +- [ ] Support for more GGUF model architectures +- [ ] Enhanced benchmarking capabilities +- [ ] Multi-GPU processing support +- [ ] Advanced memory optimization techniques +- [ ] Integration with more deployment platforms +- [ ] Custom quantization kernels ## 🤝 Contributing @@ -92,14 +122,12 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## 🙏 Acknowledgments -- [HuggingFace](https://huggingface.co/) for their amazing Transformers library -- [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) for quantization -- [PEFT](https://github.com/huggingface/peft) for parameter-efficient fine-tuning -- [Weights & Biases](https://wandb.ai/) for experiment tracking +- [llama.cpp](https://github.com/ggerganov/llama.cpp) for GGUF format +- [HuggingFace](https://huggingface.co/) for Transformers library +- [CTransformers](https://github.com/marella/ctransformers) for GGUF support ## 📫 Contact & Support -- GitHub Issues: [Create an issue](https://github.com/yourusername/QuantLLM/issues) +- GitHub Issues: [Create an issue](https://github.com/codewithdark-git/QuantLLM/issues) - Documentation: [Read the docs](https://quantllm.readthedocs.io/) -- Discord: [Join our community](https://discord.gg/quantllm) -- Email: support@quantllm.ai +- Email: codewithdark90@gmail.com diff --git a/docs/api_reference/model.rst b/docs/api_reference/model.rst deleted file mode 100644 index fb1f168..0000000 --- a/docs/api_reference/model.rst +++ /dev/null @@ -1,76 +0,0 @@ -Model API -========= - -Model ------ - -.. automodule:: quantllm.model.model - :members: - :undoc-members: - :show-inheritance: - -Model Configuration ------------------ - -.. automodule:: quantllm.model.lora_config - :members: - :undoc-members: - :show-inheritance: - -Example Usage ------------- - -Basic Usage -~~~~~~~~~~ - -.. code-block:: python - - from quantllm import Model, ModelConfig - - # Configure model - config = ModelConfig( - model_name="facebook/opt-125m", - load_in_4bit=True - ) - - # Load model - model = Model(config) - model_instance = model.get_model() - -With LoRA -~~~~~~~~ - -.. code-block:: python - - config = ModelConfig( - model_name="facebook/opt-125m", - load_in_4bit=True, - use_lora=True - ) - model = Model(config) - -CPU Offloading -~~~~~~~~~~~~ - -.. code-block:: python - - config = ModelConfig( - model_name="facebook/opt-125m", - cpu_offload=True - ) - model = Model(config) - -Advanced Configuration -~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - config = ModelConfig( - model_name="facebook/opt-125m", - load_in_4bit=True, - use_lora=True, - gradient_checkpointing=True, - bf16=True, - trust_remote_code=True - ) - model = Model(config) \ No newline at end of file diff --git a/docs/api_reference/quantization.rst b/docs/api_reference/quantization.rst index 0041f00..3e4ad2e 100644 --- a/docs/api_reference/quantization.rst +++ b/docs/api_reference/quantization.rst @@ -1,17 +1,17 @@ -# QuantLLM: Advanced Model Quantization +# QuantLLM: GGUF Model Quantization =================================== 💫 Introduction ------------ -QuantLLM is a powerful library that provides state-of-the-art quantization methods to compress large language models while maintaining their performance. Supporting multiple quantization methods (AWQ, GPTQ, GGUF), it enables efficient model deployment in production environments. 
+QuantLLM provides efficient model quantization using the GGUF (GGML Universal Format) method, enabling memory-efficient deployment of large language models. The library focuses on providing robust quantization with comprehensive progress tracking and benchmarking capabilities. 🚀 Getting Started --------------- -QuantLLM offers multiple quantization methods, each optimized for different use cases. The high-level `QuantLLM` API provides a simple interface to quantize models while the low-level API gives you fine-grained control over the quantization process. +QuantLLM offers both high-level and low-level APIs for GGUF quantization. The high-level `QuantLLM` API provides a simple interface, while the low-level `GGUFQuantizer` gives you fine-grained control over the quantization process. Key Features: -- Multiple quantization methods (AWQ, GPTQ, GGUF) -- Memory-efficient processing +- Multiple GGUF quantization types (Q2_K to Q8_0) +- Memory-efficient chunk-based processing - Hardware-specific optimizations - Comprehensive metrics and logging - Easy model export and deployment @@ -26,18 +26,17 @@ Complete Example from transformers import AutoTokenizer import time - # 1. Model and Method Selection + # 1. Model Selection model_name = "facebook/opt-125m" # Any Hugging Face model - method = "awq" # Choose: 'awq', 'gptq', or 'gguf' - # 2. Configure Quantization + # 2. Configure GGUF Quantization quant_config = { "bits": 4, # Quantization bits (2-8) - "group_size": 128, # Size of quantization groups - "zero_point": True, # Zero-point quantization (AWQ) - "version": "v2", # AWQ algorithm version - "scale_dtype": "fp32", # Scale factor data type - "batch_size": 4 # Processing batch size + "group_size": 32, # Size of quantization groups + "quant_type": "Q4_K_M", # GGUF quantization type + "use_packed": True, # Use packed format + "cpu_offload": False, # CPU offloading for large models + "chunk_size": 1000 # Chunk size for memory efficiency } # 3. Prepare Calibration Data @@ -63,16 +62,19 @@ Complete Example # 4. Model Quantization with Error Handling try: - print("Starting quantization process...") + print("Starting GGUF quantization process...") start_time = time.time() # Perform quantization - quantized_model, tokenizer = QuantLLM.quantize_from_pretrained( - model_name=model_name, - method=method, - quant_config_dict=quant_config, + quantized_model, benchmark_results = QuantLLM.quantize_from_pretrained( + model_name_or_path=model_name, + bits=4, + group_size=32, + quant_type="Q4_K_M", calibration_data=inputs["input_ids"], - calibration_steps=50, + benchmark=True, + benchmark_input_shape=(1, 32), + benchmark_steps=50, device="cuda" if torch.cuda.is_available() else "cpu" ) @@ -93,262 +95,138 @@ Complete Example result = tokenizer.decode(outputs[0], skip_special_tokens=True) print(f"Test Output: {result}") - # 6. Save Quantized Model (Optional) - save_path = "./quantized_model" - quantized_model.save_pretrained(save_path) - tokenizer.save_pretrained(save_path) - print(f"Model saved to {save_path}") + # 6. 
Save and Convert to GGUF + QuantLLM.save_quantized_model( + model=quantized_model, + output_path="./quantized_model", + save_tokenizer=True + ) + + QuantLLM.convert_to_gguf( + model=quantized_model, + output_path="model.gguf" + ) + print("Model saved and converted to GGUF format") except Exception as e: print(f"Error during quantization: {str(e)}") raise - # Define quantization configuration - quant_config = { - "bits": 4, - "group_size": 128, - "zero_point": True, # AWQ specific - "awq_version": "v2" # AWQ specific (maps to 'version') - } - - # Prepare dummy calibration data (replace with your actual data) - # For demonstration, creating random data. - # The shape and content should be representative of your model's input. - # Tokenizer is usually needed to prepare real calibration data. - # For this example, let's assume calibration data is a tensor. - # If the model needs input_ids, it should be shaped like (num_samples, seq_len) - # If the model's first layer takes features directly, it might be (num_samples, feature_dim) - # The factory passes this data to the specific quantizer. - # BaseQuantizer's prepare_calibration_data expects a torch.Tensor. - - # For opt-125m, tokenizer.model_max_length is 2048, hidden_size is 768. - # A simple approach for dummy calibration data: - num_calibration_samples = 10 - sequence_length = 32 # A shorter sequence length for dummy data - # Assuming calibration data is a tensor of input features for simplicity here. - # In a real scenario, this would be tokenized input_ids. - # The DummyModel in tests uses calibration_data as direct input if last dim matches. - # Actual models need tokenized input_ids. The factory itself doesn't tokenize. - # The user must provide calibration_data in the format expected by the model. - # For now, we'll create a simple tensor. - # For a real LLM, this should be tokenized sequences. - # Let's create dummy input_ids as calibration data - dummy_input_ids = torch.randint(0, 30000, (num_calibration_samples, sequence_length)) - - - try: - quantized_model, tokenizer = QuantizerFactory.quantize_from_pretrained( - model_name_or_path=model_name, - method=method, - quant_config_dict=quant_config, - calibration_data=dummy_input_ids, # Pass input_ids directly - calibration_steps=50, # For AWQ - device="cpu" # Specify 'cuda' for GPU - ) - print(f"Model {model_name} quantized with {method} successfully.") - print(f"Quantized model: {quantized_model}") - print(f"Tokenizer: {tokenizer}") - - # You can now use the quantized_model and tokenizer for inference - # For example: - # if tokenizer: - # inputs = tokenizer("Hello, world!", return_tensors="pt").to(quantized_model.device) - # with torch.no_grad(): - # outputs = quantized_model(**inputs) - # print("Inference output:", outputs) - - except Exception as e: - print(f"An error occurred during quantization: {e}") - Main Parameters of `quantize_from_pretrained` --------------------------------------------- - ``model_name_or_path (str)``: Hugging Face model ID (e.g., "facebook/opt-125m") or a local path to a pretrained model. -- ``method (str)``: The quantization method to use. Supported values are ``'awq'``, ``'gptq'``, or ``'gguf'``. -- ``quant_config_dict (Optional[Dict[str, Any]])``: A dictionary containing parameters for the chosen quantization method. - - **Common Keys (can be used for most methods, defaults may apply):** - - `bits (int)`: Number of bits for quantization (e.g., 4, 8). Default: 4. - - `group_size (int)`: Size of quantization groups. Default: 128. 
- - `batch_size (int)`: Batch size used internally by the quantizer during its initialization/calibration steps. Default: 4. - - **AWQ Specific Keys:** - - `zero_point (bool)`: Enable/disable zero-point for activations. Default: True. - - `awq_version (str)`: AWQ algorithm version (e.g., "v1", "v2"). Default: "v2". (Maps to `version` in `AWQQuantizer`). - - `scale_dtype (str)`: Data type for scales (e.g., "fp32", "bf16"). Default: "fp32". (Passed to `AWQQuantizer`). - - `enable_mnn_kernel (bool)`: Enable MNN kernel optimizations, if applicable. Default: False. (Passed to `AWQQuantizer`). - - Note: `batch_size` from the common keys is used by AWQ for its calibration processing. - - **GPTQ Specific Keys:** - - `actorder (bool)`: Enable activation-order quantization. Default: True. - - `percdamp (float)`: Dampening percentage for Hessian update. Default: 0.01. - - `sym (bool)`: Use symmetric quantization for weights. Default: True. - - **GGUF Specific Keys:** - - `use_packed (bool)`: Enable weight packing for GGUF. Default: True. - - `cpu_offload (bool)`: Offload quantized layers to CPU. Default: False. - - `desc_act (bool)`: Describe activations in GGUF metadata. Default: False. - - `desc_ten (bool)`: Describe tensors in GGUF metadata. Default: False. - - `legacy_format (bool)`: Use legacy GGUF format. Default: False. - Refer to the docstring of `QuantizerFactory.quantize_from_pretrained` and individual quantizer classes for more details on all available parameters. -- ``calibration_data (Optional[Any])``: Data required for quantization, typically a `torch.Tensor`. The format depends on the model's input requirements (e.g., tokenized `input_ids`). This data is passed to the underlying quantizer. -- ``calibration_steps (Optional[int])``: Number of calibration steps. This is particularly relevant for methods like AWQ that use it in their `quantize()` method. Default: 100. -- ``device (Optional[str])``: The device to run the quantization on (e.g., "cpu", "cuda", "cuda:0"). If `None`, the default device selection logic within the quantizers (usually prioritizing CUDA if available) will be used. - -The method returns a tuple containing the quantized ``PreTrainedModel`` and its associated tokenizer (if loadable). - -For a full example demonstrating quantization and pushing to the Hugging Face Hub, see the script in ``examples/01_quantize_and_push_to_hub.py``. - -Advanced: Direct Quantizer Usage -================================ - -While `QuantizerFactory` is recommended for ease of use, you can also use the individual quantizer classes directly for more fine-grained control or custom workflows. - -Common Parameters for Direct Initialization -------------------------------------------- - -All quantizers share a common set of parameters in their `__init__` method, inherited from `BaseQuantizer`: - -- ``model_or_model_name_or_path (Union[str, PreTrainedModel])``: A Hugging Face model ID, a local path to a model, or an already loaded `PreTrainedModel` instance. -- ``bits (int)``: Number of quantization bits (e.g., 2-8). -- ``device (Optional[Union[str, torch.device]])``: Specifies the primary computation device ('cpu' or 'cuda') for the quantizer and the prepared model. - -Individual Quantizer Details ----------------------------- - -Below are details specific to each quantization method when used directly. - -### 1. GPTQ (`GPTQQuantizer`) - -GPTQ offers Hessian-based quantization with activation ordering for high accuracy. - -.. automodule:: quantllm.quant.gptq - :noindex: - -.. 
autoclass:: quantllm.quant.gptq.GPTQQuantizer - :members: __init__, quantize - :show-inheritance: - :inherited-members: - :undoc-members: - -**Key `__init__` Parameters for `GPTQQuantizer`:** -- ``group_size (int)``: Size of quantization groups. -- ``actorder (bool)``: Enables activation ordering. -- ``sym (bool)``: Use symmetric quantization for weights. -- ``percdamp (float)``: Dampening for Hessian update. -- ``use_triton (bool)``: Note: Custom GPTQ Triton kernels are not yet fully integrated for core quantization steps. - -**Usage Example (Direct):** - -.. code-block:: python - - from quantllm.quant import GPTQQuantizer - # Assuming 'model' is a loaded PreTrainedModel instance - # and 'calibration_data' is prepared - - quantizer = GPTQQuantizer( - model_or_model_name_or_path=model, # Can also be model name/path - bits=4, - group_size=128, - actorder=True - ) - quantized_model = quantizer.quantize(calibration_data=calibration_data) - -### 2. AWQ (`AWQQuantizer`) - -AWQ adapts quantization based on activation patterns. - -.. automodule:: quantllm.quant.awq - :noindex: - -.. autoclass:: quantllm.quant.awq.AWQQuantizer - :members: __init__, quantize - :show-inheritance: - :inherited-members: - :undoc-members: - -**Inference with AWQ Quantized Models:** Models quantized using `AWQQuantizer` (or via the high-level API with the 'awq' method) are returned as standard Hugging Face `PreTrainedModel` instances. The quantization is handled transparently by the custom `QuantizedLinear` layers. Therefore, inference can be performed using the usual methods like `.generate()` or by directly calling the model, with no special steps required for AWQ-quantized layers. - -**Key `__init__` Parameters for `AWQQuantizer`:** -- ``group_size (int)``: Size of the quantization group. Default: 128. -- ``zero_point (bool)``: Whether to use zero-point quantization for activations. Default: True. -- ``version (str)``: AWQ algorithm version (e.g., "v1", "v2"). Default: "v2". -- ``scale_dtype (str)``: Data type for scales (e.g., "fp32", "bf16"). Default: "fp32". -- ``enable_mnn_kernel (bool)``: Whether to enable MNN kernel optimizations, if applicable. Default: False. -- ``batch_size (int)``: Batch size for calibration data processing during the `quantize` method. Default: 2. - -**Usage Example (Direct):** - -.. code-block:: python - - from quantllm.quant import AWQQuantizer - # Assuming 'model' is a loaded PreTrainedModel instance - # and 'calibration_data' is prepared - - quantizer = AWQQuantizer( - model_or_model_name_or_path=model, - bits=4, - group_size=128, - zero_point=True - ) - quantized_model = quantizer.quantize( - calibration_data=calibration_data, - calibration_steps=100 # AWQ's quantize method takes this - ) - -### 3. GGUF (`GGUFQuantizer`) - -GGUF provides an efficient format with CTransformers integration. It can also offload quantized layers to CPU. - -.. automodule:: quantllm.quant.gguf - :noindex: - -.. autoclass:: quantllm.quant.gguf.GGUFQuantizer - :members: __init__, quantize, convert_to_gguf - :show-inheritance: - :inherited-members: - :undoc-members: - -**Key `__init__` Parameters for `GGUFQuantizer`:** -- ``group_size (int)``: Group size. -- ``use_packed (bool)``: Enable weight packing. -- ``cpu_offload (bool)``: If True, quantized layers are placed on CPU. -- ``desc_act (bool)``, ``desc_ten (bool)``, ``legacy_format (bool)``: GGUF format-specific flags. - -**Usage Example (Direct):** +- ``bits (int)``: Number of bits for quantization (2-8). Default: 4. 
+- ``group_size (int)``: Size of quantization groups. Default: 32. +- ``quant_type (str)``: GGUF quantization type (e.g., "Q4_K_M"). Optional. +- ``use_packed (bool)``: Enable weight packing. Default: True. +- ``cpu_offload (bool)``: Offload layers to CPU for memory efficiency. Default: False. +- ``chunk_size (int)``: Size of processing chunks. Default: 1000. +- ``calibration_data (torch.Tensor)``: Input IDs for calibration. +- ``benchmark (bool)``: Whether to run benchmarks. Default: False. +- ``benchmark_input_shape (tuple)``: Shape for benchmark inputs. +- ``benchmark_steps (int)``: Number of benchmark steps. +- ``device (str)``: Device for quantization ("cpu" or "cuda"). + +GGUF Quantization Types +---------------------- + +============ ================ ==================== +Bits Types Description +============ ================ ==================== +2-bit Q2_K Extreme compression +3-bit Q3_K_S Small size +3-bit Q3_K_M Medium accuracy +3-bit Q3_K_L Better accuracy +4-bit Q4_K_S Standard quality +4-bit Q4_K_M Better quality +5-bit Q5_K_S High quality +5-bit Q5_K_M Higher quality +6-bit Q6_K Very high quality +8-bit Q8_0 Best quality +============ ================ ==================== + +Direct GGUFQuantizer Usage +========================= + +For more fine-grained control, you can use the `GGUFQuantizer` class directly: .. code-block:: python from quantllm.quant import GGUFQuantizer - # Assuming 'model' is a loaded PreTrainedModel instance + # Initialize quantizer quantizer = GGUFQuantizer( - model_or_model_name_or_path=model, + model_name="facebook/opt-125m", bits=4, group_size=32, + quant_type="Q4_K_M", use_packed=True, - cpu_offload=False + cpu_offload=False, + chunk_size=1000, + device="cuda" if torch.cuda.is_available() else "cpu" ) - # Calibration data is optional for GGUF's quantize method but can be beneficial - quantized_model = quantizer.quantize(calibration_data=calibration_data) - # Export to GGUF format - quantizer.convert_to_gguf("model-q4.gguf") - -Choosing the Right Method ------------------------- - -- **GPTQ**: Best for highest accuracy with slightly slower quantization. The GPTQ method in QuantLLM involves computing Hessian matrix information. This information is primarily used for activation-based weight reordering when `actorder=True`. Users should note that the detailed iterative weight updates using the full Hessian inverse, as found in some canonical GPTQ literature, may not be fully implemented in the current layer quantization step. The system logs warnings if the Hessian is computed but not fully utilized in this manner. -- **AWQ**: Best balance of speed and accuracy, good for general use. -- **GGUF**: Best for deployment and inference with CTransformers. - -Resource Requirements ------------------- - -+-------------+------------+-------------+------------+ -| Method | Memory | Speed | Accuracy | -+=============+============+=============+============+ -| GPTQ | High | Slow | Highest | -+-------------+------------+-------------+------------+ -| AWQ | Medium | Fast | High | -+-------------+------------+-------------+------------+ -| GGUF | Low | Very Fast | Good | -+-------------+------------+-------------+------------+ - -For detailed examples of using GGUF quantization, check out the examples in the `examples/` directory or refer to the interactive tutorial in `testing.ipynb`. 
+ # Quantize model + quantized_model = quantizer.quantize(calibration_data=calibration_data) + + # Convert to GGUF format + quantizer.convert_to_gguf("model.gguf") + +Memory-Efficient Processing +------------------------- + +For large models, QuantLLM provides several memory optimization features: + +1. **Chunk-based Processing** + + .. code-block:: python + + quantizer = GGUFQuantizer( + model_name="large-model", + chunk_size=500, # Process in smaller chunks + cpu_offload=True # Offload to CPU when needed + ) + +2. **Progress Tracking** + + The quantization process provides detailed progress information: + - Layer-wise quantization progress + - Memory usage statistics + - Estimated time remaining + - Layer shape information + +3. **Benchmarking** + + .. code-block:: python + + from quantllm.utils.benchmark import QuantizationBenchmark + + benchmark = QuantizationBenchmark( + model=model, + calibration_data=calibration_data, + input_shape=(1, 32), + num_inference_steps=100 + ) + results = benchmark.run_all_benchmarks() + benchmark.print_report() + +Best Practices +------------- + +1. **Memory Management** + - Use `cpu_offload=True` for models larger than 70% of GPU memory + - Adjust `chunk_size` based on available memory + - Monitor memory usage with benchmarking tools + +2. **Quantization Type Selection** + - Use Q4_K_M for general use cases + - Use Q2_K for extreme compression needs + - Use Q8_0 for quality-critical applications + +3. **Performance Optimization** + - Run benchmarks to find optimal settings + - Use appropriate batch sizes + - Enable progress tracking for monitoring + +For detailed examples, check out the `examples/` directory or refer to the getting started guide. diff --git a/docs/conf.py b/docs/conf.py index 98cdfd9..8725a2b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,9 +4,9 @@ project = 'QuantLLM' copyright = '2025, QuantLLM Team' -author = 'QuantLLM Team' -version = '1.0.0' -release = '1.0.0' +author = 'Dark Coder' +version = '1.2.0' +release = '1.2.0' # RTD configurations on_rtd = os.environ.get('READTHEDOCS') == 'True' diff --git a/docs/getting_started.rst b/docs/getting_started.rst index ce74e9c..8705345 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,153 +1,228 @@ Getting Started =============== +Introduction +----------- + +QuantLLM is a powerful library for quantizing and deploying large language models with a focus on memory efficiency and performance. The library now supports GGUF format, advanced progress tracking, and comprehensive benchmarking tools. + +Installation +----------- + +Install the base package: + +.. code-block:: bash + + pip install quantllm + +For GGUF support, install with extras: + +.. code-block:: bash + + pip install quantllm[gguf] + Quick Start ---------- -QuantLLM is designed to make working with large language models more accessible and efficient. Here's a complete example showcasing its key features: +Here's a complete example showcasing GGUF quantization and benchmarking: .. code-block:: python - from quantllm import ( - Model, ModelConfig, - LoadDataset, DatasetConfig, - FineTuningTrainer, TrainingConfig, - TrainingLogger + from quantllm import QuantLLM + from quantllm.quant import GGUFQuantizer + from transformers import AutoTokenizer + + # 1. 
Load tokenizer and prepare calibration data + model_name = "facebook/opt-125m" + tokenizer = AutoTokenizer.from_pretrained(model_name) + calibration_text = ["This is an example text for calibration."] * 10 + calibration_data = tokenizer(calibration_text, return_tensors="pt", padding=True)["input_ids"] + + # 2. Quantize using high-level API + quantized_model, benchmark_results = QuantLLM.quantize_from_pretrained( + model_name_or_path=model_name, + bits=4, # Quantization bits (2-8) + group_size=32, # Group size for quantization + quant_type="Q4_K_M", # GGUF quantization type + calibration_data=calibration_data, + benchmark=True, # Run benchmarks + benchmark_input_shape=(1, 32), + benchmark_steps=50, + cpu_offload=False, # Set to True for large models + chunk_size=1000 # Process in chunks for memory efficiency ) - # Initialize logger for rich progress tracking - logger = TrainingLogger() # This will display the ASCII art logo! - - # 1. Load and configure model with best practices - model_config = ModelConfig( - model_name="facebook/opt-125m", - load_in_4bit=True, # Enable memory-efficient 4-bit quantization - use_lora=True, # Enable parameter-efficient fine-tuning - gradient_checkpointing=True # Reduce memory usage during training - ) - model = Model(model_config).get_model() - - # 2. Load and prepare dataset with automatic preprocessing - dataset = LoadDataset().load_hf_dataset("imdb") - dataset_config = DatasetConfig( - text_column="text", - label_column="label", - max_length=512 - ) - - # 3. Configure training with optimized defaults - training_config = TrainingConfig( - learning_rate=2e-4, - num_epochs=3, - batch_size=8, - gradient_accumulation_steps=4, # For larger effective batch sizes - warmup_ratio=0.1, # Gradual learning rate warmup - evaluation_strategy="steps", # Regular evaluation during training - eval_steps=100 + # 3. Save the quantized model + QuantLLM.save_quantized_model( + model=quantized_model, + output_path="quantized_model", + save_tokenizer=True ) - # 4. Initialize trainer with progress tracking - trainer = FineTuningTrainer( - model=model, - training_config=training_config, - logger=logger # Enable rich progress tracking + # 4. Convert to GGUF format + QuantLLM.convert_to_gguf( + model=quantized_model, + output_path="model.gguf" ) - - # 5. 
Start training with automatic hardware optimization - trainer.train() Core Features ------------ -* **Advanced Quantization** - * 4-bit and 8-bit quantization for up to 75% memory reduction - * Automatic format selection based on your hardware - * Zero-shot quantization with minimal accuracy loss +Advanced GGUF Quantization +~~~~~~~~~~~~~~~~~~~~~~~ -* **Efficient Fine-tuning** - * LoRA support for parameter-efficient training - * Gradient checkpointing for reduced memory usage - * Automatic mixed precision training +The library supports various GGUF quantization types: -* **Hardware Optimization** - * Automatic hardware detection (CUDA, MPS, CPU) - * Optimal settings for your specific GPU - * CPU offloading for large models +* **2-bit Quantization** + * Q2_K: Best for extreme compression + * Suitable for smaller models or when size is critical -* **Rich Progress Tracking** - * Beautiful terminal-based progress display - * Detailed training metrics and logs - * Integration with WandB and TensorBoard +* **4-bit Quantization** + * Q4_K_S: Standard 4-bit quantization + * Q4_K_M: 4-bit quantization with improved accuracy + * Best balance of size and quality -* **Production Ready** - * Simple export to ONNX and TorchScript - * Quantized model deployment - * GPU and CPU inference optimization +* **8-bit Quantization** + * Q8_0: High-precision 8-bit quantization + * Best for quality-critical applications -Key Concepts ------------ +Memory-Efficient Processing +~~~~~~~~~~~~~~~~~~~~~~~ + +* Chunk-based quantization for large models +* Automatic device management +* CPU offloading support +* Progress tracking with memory statistics + +Detailed Examples +--------------- -Model Configuration -~~~~~~~~~~~~~~~~~ +1. Direct GGUF Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~ -The ModelConfig class helps configure model loading: +For more control over the quantization process: .. code-block:: python - config = ModelConfig( + from quantllm.quant import GGUFQuantizer + import torch + + # Initialize quantizer with detailed configuration + quantizer = GGUFQuantizer( model_name="facebook/opt-125m", - load_in_4bit=True, # Enable 4-bit quantization - use_lora=True, # Enable LoRA - cpu_offload=True # Enable CPU offloading + bits=4, + group_size=32, + quant_type="Q4_K_M", + use_packed=True, + desc_act=False, + desc_ten=False, + legacy_format=False, + batch_size=4, + device="cuda" if torch.cuda.is_available() else "cpu", + cpu_offload=False, + gradient_checkpointing=False, + chunk_size=1000 ) -Dataset Handling -~~~~~~~~~~~~~~ + # Quantize the model + quantized_model = quantizer.quantize(calibration_data=calibration_data) + + # Convert to GGUF format with progress tracking + quantizer.convert_to_gguf("model.gguf") -Load and preprocess datasets easily: +2. Comprehensive Benchmarking +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Evaluate quantization performance: .. 
code-block:: python - dataset_config = DatasetConfig( - dataset_name="imdb", - text_column="text", - label_column="label", - max_length=512 + from quantllm.utils.benchmark import QuantizationBenchmark + + # Initialize benchmark + benchmark = QuantizationBenchmark( + model=model, + calibration_data=calibration_data, + input_shape=(1, 32), + num_inference_steps=100, + device="cuda", + num_warmup_steps=10 ) -Training Configuration -~~~~~~~~~~~~~~~~~~~ + # Run benchmarks and get detailed metrics + results = benchmark.run_all_benchmarks() + + # Print detailed report + benchmark.print_report() + + # Optional: Generate visualization + benchmark.plot_comparison("benchmark_results.png") + +3. Memory-Efficient Processing +~~~~~~~~~~~~~~~~~~~~~~~~~~ -Configure training parameters: +For large models with memory constraints: .. code-block:: python - training_config = TrainingConfig( - learning_rate=2e-4, - num_epochs=3, - batch_size=8, - gradient_accumulation_steps=4 + # Configure for memory efficiency + quantizer = GGUFQuantizer( + model_name="facebook/opt-1.3b", # Larger model + bits=4, + group_size=32, + cpu_offload=True, # Enable CPU offloading + chunk_size=500, # Smaller chunks for memory efficiency + gradient_checkpointing=True ) -Progress Tracking -~~~~~~~~~~~~~~ + # Process in chunks with progress display + quantized_model = quantizer.quantize(calibration_data) + +Supported GGUF Types +------------------ + +============ ================ ==================== +Bits Types Description +============ ================ ==================== +2-bit Q2_K Extreme compression +3-bit Q3_K_S Small size +3-bit Q3_K_M Medium accuracy +3-bit Q3_K_L Better accuracy +4-bit Q4_K_S Standard quality +4-bit Q4_K_M Better quality +5-bit Q5_K_S High quality +5-bit Q5_K_M Higher quality +6-bit Q6_K Very high quality +8-bit Q8_0 Best quality +============ ================ ==================== + +Best Practices +------------ -Monitor training progress: +1. **Memory Management** + * Use `cpu_offload=True` for models larger than 70% of GPU memory + * Adjust `chunk_size` based on available memory + * Enable `gradient_checkpointing` for large models -.. code-block:: python +2. **Quantization Selection** + * Use Q4_K_M for general use cases + * Use Q2_K for extreme compression needs + * Use Q8_0 for quality-critical applications - from quantllm import TrainingLogger +3. **Performance Optimization** + * Run benchmarks to find optimal settings + * Use appropriate batch sizes + * Monitor memory usage with built-in tools - logger = TrainingLogger() - trainer = FineTuningTrainer( - model=model, - logger=logger - ) +4. 
**Progress Tracking** + * Use the built-in progress bars + * Monitor layer-wise quantization + * Track memory usage during processing Next Steps --------- -* Check out our :doc:`tutorials/index` for detailed examples -* Read the :doc:`api_reference/index` for complete API documentation +* Check out our :doc:`tutorials/index` for more examples +* Read the :doc:`api_reference/index` for API details * See :doc:`advanced_usage/index` for advanced features -* Visit :doc:`deployment` for deployment options \ No newline at end of file +* Visit :doc:`deployment` for deployment guides \ No newline at end of file diff --git a/docs/installation.rst b/docs/installation.rst index 8e4a777..317a339 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -4,75 +4,152 @@ Installation Guide Requirements ----------- -Before installing QuantLLM, ensure your system meets these requirements: +QuantLLM requires Python 3.10 or later. The following are the core dependencies: -* Python >= 3.8 -* PyTorch >= 2.0 -* CUDA >= 11.7 (for GPU support) -* 16GB RAM (minimum) -* 8GB VRAM (recommended for GPU training) +* PyTorch >= 2.0.0 +* Transformers >= 4.30.0 +* CUDA Toolkit (optional, but recommended for GPU support) -Basic Installation ----------------- +Installation Methods +------------------ + +1. From PyPI (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~ -You can install QuantLLM using pip: +Basic installation: .. code-block:: bash pip install quantllm +With GGUF support (recommended for deployment): + +.. code-block:: bash + + pip install quantllm[gguf] + +With development tools: + +.. code-block:: bash + + pip install quantllm[dev] + +2. From Source +~~~~~~~~~~~~ + +.. code-block:: bash + + git clone https://github.com/codewithdark-git/DiffusionLM.git + cd DiffusionLM + pip install -e . + For development installation: .. code-block:: bash - git clone https://github.com/codewithdark-git/QuantLLM.git - cd QuantLLM - pip install -e ".[dev]" + pip install -e .[dev,gguf] + +Hardware Requirements +------------------ -GPU Support +Minimum Requirements: +~~~~~~~~~~~~~~~~~~ + +* CPU: 4+ cores +* RAM: 16GB+ +* Storage: 10GB+ free space +* Python: 3.10+ + +Recommended for Large Models: +~~~~~~~~~~~~~~~~~~~~~~~~~ + +* CPU: 8+ cores +* RAM: 32GB+ +* GPU: NVIDIA GPU with 8GB+ VRAM +* CUDA: 11.7 or later +* Storage: 20GB+ free space + +GGUF Support ---------- -For GPU acceleration, install with CUDA support: +GGUF (GGML Universal Format) support requires additional dependencies: + +* llama-cpp-python >= 0.2.0 +* ctransformers >= 0.2.0 (optional) + +These are automatically installed with: .. code-block:: bash - pip install quantllm[gpu] + pip install quantllm[gguf] -This will install additional dependencies like: +Verify Installation +---------------- -* bitsandbytes -* accelerate -* Flash Attention 2 (where supported) +You can verify your installation by running: -Apple Silicon (M1/M2) --------------------- +.. code-block:: python -For Apple Silicon Macs: + import quantllm + from quantllm.quant import GGUFQuantizer + + # Check GGUF support + print(f"GGUF Support: {GGUFQuantizer.CT_AVAILABLE}") + + # Check CUDA availability + import torch + print(f"CUDA Available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"CUDA Version: {torch.version.cuda}") + print(f"GPU Device: {torch.cuda.get_device_name(0)}") + +Common Issues +----------- + +1. CUDA Compatibility +~~~~~~~~~~~~~~~~~~ + +If you encounter CUDA errors: .. 
code-block:: bash - pip install quantllm[mps] + # Install PyTorch with specific CUDA version + pip install torch --index-url https://download.pytorch.org/whl/cu118 -CPU-Only --------- +2. Memory Issues +~~~~~~~~~~~~~ -For CPU-only installations: +For large models, enable memory optimization: -.. code-block:: bash +.. code-block:: python - pip install quantllm[cpu] + quantizer = GGUFQuantizer( + model_name="large-model", + cpu_offload=True, + chunk_size=500, + gradient_checkpointing=True + ) -Optional Dependencies -------------------- +3. GGUF Conversion Issues +~~~~~~~~~~~~~~~~~~~~~~ -Weights & Biases integration: +If GGUF conversion fails: -.. code-block:: bash +1. Ensure llama-cpp-python is installed: + + .. code-block:: bash - pip install quantllm[wandb] + pip install llama-cpp-python --upgrade -Full installation with all features: +2. Check system compatibility: + + .. code-block:: bash -.. code-block:: bash + python -c "from ctransformers import AutoModelForCausalLM; print('GGUF support available')" + +Next Steps +--------- - pip install quantllm[all] +* Read the :doc:`getting_started` guide +* Check out :doc:`tutorials/index` +* See :doc:`advanced_usage/index` for advanced features diff --git a/docs/requirements.txt b/docs/requirements.txt index 9bb75c5..b23d31e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,4 +6,5 @@ myst-parser>=0.18.1 # Only include minimal package dependencies for docs building torch>=2.0.0 transformers>=4.30.0 -tqdm>=4.65.0 \ No newline at end of file +tqdm>=4.65.0 +llama-cpp-python>=0.2.0 \ No newline at end of file diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst deleted file mode 100644 index d694c5a..0000000 --- a/docs/tutorials/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -Tutorials -========= - -This section contains step-by-step tutorials to help you get the most out of QuantLLM. - -.. toctree:: - :maxdepth: 2 - - quick_start - text_classification - language_modeling - custom_dataset - distributed_training - deployment \ No newline at end of file diff --git a/docs/tutorials/quick_start.rst b/docs/tutorials/quick_start.rst deleted file mode 100644 index b76f3c9..0000000 --- a/docs/tutorials/quick_start.rst +++ /dev/null @@ -1,160 +0,0 @@ -Quick Start Tutorial -================== - -This tutorial will walk you through fine-tuning a small language model on the IMDB dataset using QuantLLM. - -Setup ------ - -First, install QuantLLM: - -.. code-block:: bash - - pip install quantllm[gpu] # For GPU support - # or - pip install quantllm[cpu] # For CPU-only - -Basic Example ------------- - -Here's a complete example that demonstrates the core features of QuantLLM: - -.. code-block:: python - - from quantllm import ( - Model, ModelConfig, - LoadDataset, DatasetConfig, - DatasetPreprocessor, DatasetSplitter, - FineTuningTrainer, TrainingConfig, - TrainingLogger - ) - - # Initialize logger - logger = TrainingLogger() - - # 1. Configure and load model - logger.log_info("Loading model...") - model_config = ModelConfig( - model_name="facebook/opt-125m", # Small model for demonstration - load_in_4bit=True, # Enable 4-bit quantization - use_lora=True # Enable LoRA for efficient fine-tuning - ) - model = Model(model_config) - - # 2. 
Load and prepare dataset - logger.log_info("Preparing dataset...") - dataset = LoadDataset().load_hf_dataset("imdb") - - # Split dataset - splitter = DatasetSplitter() - train_dataset, val_dataset, test_dataset = splitter.train_val_test_split( - dataset["train"], # Use train split from IMDB - train_size=0.8, - val_size=0.1, - test_size=0.1 - ) - - # Preprocess datasets - tokenizer = model.get_tokenizer() - preprocessor = DatasetPreprocessor(tokenizer) - train_processed, val_processed, test_processed = preprocessor.tokenize_dataset( - train_dataset=train_dataset, - val_dataset=val_dataset, - test_dataset=test_dataset, - max_length=512, - text_column="text" - ) - - # 3. Configure training - training_config = TrainingConfig( - learning_rate=2e-4, - num_epochs=3, - batch_size=8, - gradient_accumulation_steps=4, - warmup_steps=100, - logging_steps=50 - ) - - # 4. Initialize trainer - trainer = FineTuningTrainer( - model=model.get_model(), - training_config=training_config, - train_dataloader=train_processed, - eval_dataloader=val_processed, - logger=logger - ) - - # 5. Train model - trainer.train() - - # 6. Evaluate on test set - logger.log_info("Evaluating model...") - test_metrics = trainer.evaluate(test_processed) - logger.log_info(f"Test metrics: {test_metrics}") - -Step-by-Step Explanation ----------------------- - -1. Model Configuration -~~~~~~~~~~~~~~~~~~ - -The ModelConfig class sets up model loading options: - -- ``model_name``: Which model to load from HuggingFace -- ``load_in_4bit``: Enable 4-bit quantization for memory efficiency -- ``use_lora``: Enable LoRA for parameter-efficient fine-tuning - -2. Dataset Preparation -~~~~~~~~~~~~~~~~~~~ - -We use three main classes for dataset handling: - -- ``LoadDataset``: Loads datasets from HuggingFace or local files -- ``DatasetSplitter``: Creates train/validation/test splits -- ``DatasetPreprocessor``: Handles tokenization and preprocessing - -3. Training Configuration -~~~~~~~~~~~~~~~~~~~~~~ - -TrainingConfig controls the training process: - -- ``learning_rate``: How fast the model learns -- ``num_epochs``: How many times to process the dataset -- ``batch_size``: Samples processed at once -- ``gradient_accumulation_steps``: Accumulate gradients for larger effective batch size - -4. Training -~~~~~~~~~ - -The FineTuningTrainer handles the training loop: - -- Manages model updates -- Tracks progress -- Handles checkpointing -- Provides evaluation - -Monitoring Progress ------------------ - -The TrainingLogger provides rich progress information: - -.. 
code-block:: python - - logger = TrainingLogger() - logger.log_info("Starting training...") # Basic logging - logger.log_metrics({"loss": 0.5}) # Track metrics - logger.log_success("Training complete!") # Success messages - -Next Steps ---------- - -- Try with different models from HuggingFace -- Experiment with training parameters -- Use your own dataset -- Enable advanced features like gradient checkpointing - -Check out other tutorials for more advanced usage: - -- :doc:`text_classification` for detailed text classification -- :doc:`custom_dataset` for using your own data -- :doc:`distributed_training` for multi-GPU training \ No newline at end of file diff --git a/docs/tutorials/text_classification.rst b/docs/tutorials/text_classification.rst deleted file mode 100644 index 86407d9..0000000 --- a/docs/tutorials/text_classification.rst +++ /dev/null @@ -1,265 +0,0 @@ -Text Classification Tutorial -======================== - -This tutorial demonstrates how to fine-tune a model for text classification tasks using QuantLLM. - -Task Overview ------------ - -We'll fine-tune a model to classify movie reviews as positive or negative using the IMDB dataset. - -Prerequisites ------------ - -- QuantLLM installed with GPU support -- Basic understanding of transformers -- At least 8GB GPU VRAM (or 16GB RAM for CPU) - -Complete Implementation --------------------- - -.. code-block:: python - - import torch - from quantllm import ( - Model, ModelConfig, - LoadDataset, DatasetConfig, - DatasetPreprocessor, DatasetSplitter, - FineTuningTrainer, TrainingConfig, - TrainingLogger, CheckpointManager - ) - - class TextClassificationTrainer: - def __init__(self): - self.logger = TrainingLogger() - self.setup_model() - self.setup_data() - self.setup_training() - - def setup_model(self): - """Configure and load the model.""" - self.logger.log_info("Setting up model...") - - # Use smaller model with LoRA for efficiency - self.model_config = ModelConfig( - model_name="facebook/opt-350m", - load_in_4bit=True, - use_lora=True, - gradient_checkpointing=True - ) - - self.model = Model(self.model_config) - self.tokenizer = self.model.get_tokenizer() - - def setup_data(self): - """Load and prepare the IMDB dataset.""" - self.logger.log_info("Preparing dataset...") - - # Load dataset - dataset = LoadDataset().load_hf_dataset("imdb") - - # Split dataset - splitter = DatasetSplitter() - self.train_dataset, self.val_dataset, self.test_dataset = splitter.train_val_test_split( - dataset["train"], - train_size=0.8, - val_size=0.1, - test_size=0.1 - ) - - # Preprocess datasets - preprocessor = DatasetPreprocessor(self.tokenizer) - self.train_processed, self.val_processed, self.test_processed = preprocessor.tokenize_dataset( - train_dataset=self.train_dataset, - val_dataset=self.val_dataset, - test_dataset=self.test_dataset, - max_length=512, - text_column="text", - label_column="label" - ) - - def setup_training(self): - """Configure training parameters.""" - self.logger.log_info("Configuring training...") - - # Set up checkpoint management - self.checkpoint_manager = CheckpointManager( - checkpoint_dir="./checkpoints", - save_total_limit=3 - ) - - # Configure training - self.training_config = TrainingConfig( - learning_rate=2e-4, - num_epochs=3, - batch_size=8, - gradient_accumulation_steps=4, - warmup_steps=100, - eval_steps=500, - save_steps=1000, - logging_steps=50 - ) - - # Initialize trainer - self.trainer = FineTuningTrainer( - model=self.model.get_model(), - training_config=self.training_config, - 
train_dataloader=self.train_processed, - eval_dataloader=self.val_processed, - logger=self.logger, - checkpoint_manager=self.checkpoint_manager - ) - - def train(self): - """Run the training process.""" - self.logger.log_info("Starting training...") - self.trainer.train() - - # Evaluate on test set - self.logger.log_info("Evaluating on test set...") - test_metrics = self.trainer.evaluate(self.test_processed) - self.logger.log_info(f"Test metrics: {test_metrics}") - - def predict(self, text: str) -> float: - """Make a prediction on new text.""" - # Preprocess input - inputs = self.tokenizer( - text, - padding="max_length", - truncation=True, - max_length=512, - return_tensors="pt" - ) - - # Get prediction - with torch.no_grad(): - outputs = self.model.get_model()(**inputs) - logits = outputs.logits - prediction = torch.sigmoid(logits)[0].item() - - return prediction - - # Usage example - def main(): - # Initialize trainer - classifier = TextClassificationTrainer() - - # Train model - classifier.train() - - # Make predictions - test_text = "This movie was absolutely fantastic! The acting was superb." - prediction = classifier.predict(test_text) - print(f"Prediction (positive): {prediction:.2%}") - - if __name__ == "__main__": - main() - -Step-by-Step Explanation ----------------------- - -1. Model Setup -~~~~~~~~~~~~ - -We use a medium-sized model with optimizations: - -- 4-bit quantization for memory efficiency -- LoRA for parameter-efficient fine-tuning -- Gradient checkpointing for larger batch sizes - -2. Dataset Preparation -~~~~~~~~~~~~~~~~~~~ - -The dataset preparation pipeline: - -1. Load IMDB dataset -2. Split into train/val/test -3. Preprocess and tokenize -4. Create dataloaders - -3. Training Configuration -~~~~~~~~~~~~~~~~~~~~~~ - -Key training parameters: - -- Learning rate: 2e-4 -- Batch size: 8 -- Gradient accumulation: 4 steps -- Evaluation every 500 steps -- Checkpoints every 1000 steps - -4. Training Process -~~~~~~~~~~~~~~~~ - -The training process includes: - -- Automatic hardware optimization -- Progress tracking -- Regular evaluation -- Checkpoint saving - -Making Predictions ----------------- - -Use the trained model for predictions: - -.. code-block:: python - - classifier = TextClassificationTrainer() - classifier.train() - - # Single prediction - text = "This movie was fantastic!" - prediction = classifier.predict(text) - print(f"Positive probability: {prediction:.2%}") - - # Batch predictions - texts = ["Great movie!", "Terrible acting", "Mixed feelings"] - predictions = [classifier.predict(text) for text in texts] - -Tips for Better Results --------------------- - -1. Data Quality -~~~~~~~~~~~~~ - -- Clean your input texts -- Balance your dataset -- Use appropriate text length - -2. Model Selection -~~~~~~~~~~~~~~~ - -- Start with smaller models -- Use LoRA for efficiency -- Enable quantization - -3. Training Parameters -~~~~~~~~~~~~~~~~~~ - -- Adjust learning rate -- Increase epochs for better results -- Use gradient accumulation - -4. 
Hardware Utilization -~~~~~~~~~~~~~~~~~~~~ - -- Enable GPU acceleration -- Use gradient checkpointing -- Monitor memory usage - -Next Steps ---------- - -- Try different model architectures -- Experiment with LoRA parameters -- Add custom evaluation metrics -- Implement cross-validation -- Deploy your model - -See Also -------- - -- :doc:`custom_dataset` for using your own data -- :doc:`deployment` for model deployment -- :doc:`advanced_usage/index` for advanced features \ No newline at end of file diff --git a/quantllm/__init__.py b/quantllm/__init__.py index e7cd448..1cc3eab 100644 --- a/quantllm/__init__.py +++ b/quantllm/__init__.py @@ -35,12 +35,12 @@ # Configure package-wide logging configure_logging() -__version__ = "0.1.0" +__version__ = "1.2.0" # Package metadata __title__ = "QuantLLM" __description__ = "Efficient Quantized LLM Fine-Tuning Library" -__author__ = "QuantLLM Team" +__author__ = "Dark Coder" __all__ = [ diff --git a/quantllm/api/high_level.py b/quantllm/api/high_level.py index c9a4ea4..566d137 100644 --- a/quantllm/api/high_level.py +++ b/quantllm/api/high_level.py @@ -3,7 +3,6 @@ from transformers import PreTrainedModel, AutoTokenizer from ..quant.gguf import GGUFQuantizer, SUPPORTED_GGUF_BITS, SUPPORTED_GGUF_TYPES from ..utils.logger import logger -from ..utils.memory_tracker import memory_tracker from ..utils.benchmark import QuantizationBenchmark class QuantLLM: @@ -57,7 +56,6 @@ def quantize_from_pretrained( """ try: logger.log_info(f"Starting GGUF quantization with {bits} bits") - memory_tracker.log_memory("quantization_start") if bits not in SUPPORTED_GGUF_BITS: raise ValueError(f"Unsupported bits: {bits}. Supported values: {SUPPORTED_GGUF_BITS}") @@ -102,7 +100,6 @@ def quantize_from_pretrained( logger.log_info("Starting quantization process") quantized_model = quantizer.quantize(calibration_data) - memory_tracker.log_memory("quantization_complete") benchmark_results = {} if benchmark: @@ -124,7 +121,6 @@ def quantize_from_pretrained( ) benchmark_results = benchmarker.run_all_benchmarks() - memory_tracker.log_memory("benchmarking_complete") logger.log_info("Benchmark Results:") if hasattr(benchmark_results, 'to_dict'): @@ -138,7 +134,8 @@ def quantize_from_pretrained( logger.log_error(f"Quantization failed: {str(e)}") raise finally: - memory_tracker.clear_memory() + if torch.cuda.is_available(): + torch.cuda.empty_cache() @staticmethod def save_quantized_model( @@ -156,7 +153,6 @@ def save_quantized_model( """ try: logger.log_info(f"Saving quantized model to {output_path}") - memory_tracker.log_memory("save_start") # Save model model.save_pretrained(output_path) @@ -174,14 +170,14 @@ def save_quantized_model( except Exception as e: logger.log_warning(f"Failed to save tokenizer: {e}") - memory_tracker.log_memory("save_complete") logger.log_info("Model saved successfully") except Exception as e: logger.log_error(f"Failed to save model: {str(e)}") raise finally: - memory_tracker.clear_memory() + if torch.cuda.is_available(): + torch.cuda.empty_cache() @staticmethod def convert_to_gguf( @@ -199,7 +195,6 @@ def convert_to_gguf( """ try: logger.log_info(f"Converting model to GGUF format: {output_path}") - memory_tracker.log_memory("conversion_start") # Get quantization config from model if not provided if not quant_config and hasattr(model.config, 'quantization_config'): @@ -216,11 +211,11 @@ def convert_to_gguf( # Convert to GGUF quantizer.convert_to_gguf(output_path) - memory_tracker.log_memory("conversion_complete") logger.log_info("GGUF conversion completed 
successfully") except Exception as e: logger.log_error(f"GGUF conversion failed: {str(e)}") raise finally: - memory_tracker.clear_memory() + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/quantllm/quant/gguf.py b/quantllm/quant/gguf.py index d3a8a4e..bee83c3 100644 --- a/quantllm/quant/gguf.py +++ b/quantllm/quant/gguf.py @@ -8,7 +8,6 @@ from transformers import PreTrainedModel from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear from ..utils.logger import logger -from ..utils.memory_tracker import memory_tracker import time from tqdm.auto import tqdm @@ -152,7 +151,7 @@ def quantize( start_time = time.perf_counter() - # Create progress bar for chunks + # Create progress bars chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0) layer_pbar = tqdm(total=total_layers, desc="Quantizing layers", position=1, leave=True) @@ -162,7 +161,10 @@ def quantize( for idx, (name, module) in enumerate(chunk, 1): try: current_layer = idx + chunk_idx * self.chunk_size - layer_pbar.set_description(f"Layer {current_layer}/{total_layers}: {name}") + layer_shape = list(module.weight.shape) + layer_pbar.set_description( + f"Layer {current_layer}/{total_layers}: {name} {layer_shape}" + ) # Move layer to target device if needed if module.weight.device != device: @@ -220,7 +222,7 @@ def quantize( return self.model - except Exception as e: + except Exception as e: logger.log_error(f"GGUF quantization failed: {str(e)}") raise RuntimeError(f"GGUF quantization failed: {str(e)}") from e finally: @@ -238,14 +240,12 @@ def _quantize_layer( raise TypeError(f"Expected nn.Linear layer, got {type(layer)}") device = torch.device('cpu') if self.cpu_offload else self.device_manager.primary_device - memory_tracker.log_memory("layer_quantization_start") try: # Get layer device and ensure it's on the correct device layer_device = next(layer.parameters()).device if layer_device != device: layer = layer.to(device) - memory_tracker.log_memory("layer_moved_to_device") # Get weight tensor weight = layer.weight.data @@ -265,8 +265,6 @@ def _quantize_layer( scales = scale.expand(1) zeros = torch.zeros_like(scales) - memory_tracker.log_memory("scales_computed") - # Quantize weights qweight = torch.clamp( torch.round(weight_scaled * (2**(self.bits-1) - 1)), @@ -274,8 +272,6 @@ def _quantize_layer( 2**(self.bits-1) - 1 ).to(torch.int8) - memory_tracker.log_memory("weights_quantized") - # Create quantized layer qlayer = QuantizedLinear( in_features=layer.in_features, @@ -301,7 +297,6 @@ def _quantize_layer( if layer.bias is not None: qlayer.bias = layer.bias.data.clone() - memory_tracker.log_memory("layer_quantization_complete") return qlayer except Exception as e: @@ -317,49 +312,107 @@ def _quantize_layer( def convert_to_gguf(self, output_path: str): """ - Convert quantized model to GGUF format using ctransformers. + Convert quantized model to GGUF format using llama.cpp conversion tools. 
""" if not CT_AVAILABLE: raise ImportError("CTransformers is required for GGUF conversion") try: - logger.log_info(f"Converting model to GGUF format: {output_path}") + logger.log_info(f"\nConverting model to GGUF format: {output_path}") logger.log_info(f"Using quantization type: {self.quant_type}") - memory_tracker.log_memory("gguf_conversion_start") # Ensure model is on CPU for conversion if not self.cpu_offload: self.model.to('cpu') - memory_tracker.log_memory("model_moved_to_cpu") # Save model in HF format first temp_dir = f"{output_path}_temp_hf" - self.model.save_pretrained(temp_dir) + logger.log_info(f"Saving temporary HF checkpoint to: {temp_dir}") + self.model.save_pretrained(temp_dir, safe_serialization=True) - # Convert using ctransformers - try: - # Use ctransformers to load and save in GGUF format - ct_model = CTAutoModel.from_pretrained( - temp_dir, - model_type="llama", # Default to llama, can be parameterized later - model_file=None, - config={ - "max_new_tokens": 2048, - "context_length": 2048, - "gpu_layers": 0 # CPU conversion - } + # Prepare conversion command + import subprocess + import sys + import os + + # Try to find convert.py from llama.cpp + convert_script = None + potential_paths = [ + "llama.cpp/convert.py", + os.path.join(os.path.dirname(sys.executable), "llama.cpp/convert.py"), + os.path.expanduser("~/.local/lib/python*/site-packages/llama_cpp_python/convert.py"), + "/usr/local/lib/python*/site-packages/llama_cpp_python/convert.py" + ] + + for path in potential_paths: + if "*" in path: + import glob + matches = glob.glob(path) + if matches: + convert_script = matches[0] + break + elif os.path.exists(path): + convert_script = path + break + + if not convert_script: + raise RuntimeError( + "Could not find llama.cpp convert.py script. 
Please install llama-cpp-python: " + "pip install llama-cpp-python" + ) + + # Build conversion command + cmd = [ + sys.executable, + convert_script, + temp_dir, + "--outfile", output_path, + "--outtype", "f16" if self.bits <= 16 else "f32", + ] + + # Add model type specific args + model_type = self.model.config.model_type if hasattr(self.model, 'config') else "llama" + if model_type in ["llama", "mistral"]: + cmd.extend(["--model-type", model_type]) + + # Execute conversion + logger.log_info("Running GGUF conversion...") + logger.log_info(f"Command: {' '.join(cmd)}") + + with tqdm(total=100, desc="Converting to GGUF", unit="%") as pbar: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True ) - ct_model.save_pretrained(output_path) - import shutil - shutil.rmtree(temp_dir, ignore_errors=True) + # Monitor conversion progress + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + if "Converting" in output: + try: + progress = int(output.split("%")[0].split()[-1]) + pbar.n = progress + pbar.refresh() + except: + pass - except Exception as e: - logger.log_error(f"CTTransformers conversion failed: {str(e)}") - raise + # Get return code and output + return_code = process.wait() + if return_code != 0: + error_output = process.stderr.read() + raise RuntimeError(f"GGUF conversion failed with error:\n{error_output}") + + # Cleanup temporary files + import shutil + logger.log_info("Cleaning up temporary files...") + shutil.rmtree(temp_dir, ignore_errors=True) - memory_tracker.log_memory("gguf_conversion_complete") - logger.log_info("GGUF conversion completed successfully") + logger.log_info(f"Successfully saved model in GGUF format to: {output_path}") except Exception as e: logger.log_error(f"GGUF conversion failed: {str(e)}") @@ -373,6 +426,5 @@ def _clear_memory(self): if not self.cpu_offload and torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() - memory_tracker.clear_memory() \ No newline at end of file diff --git a/quantllm/utils/memory_tracker.py b/quantllm/utils/memory_tracker.py deleted file mode 100644 index 016efce..0000000 --- a/quantllm/utils/memory_tracker.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Memory tracking utilities for QuantLLM.""" - -import gc -import psutil -import torch -from typing import Optional, Dict, List, Union -from pathlib import Path -import json -import time -from .logger import logger - -try: - import pynvml - PYNVML_AVAILABLE = True - pynvml.nvmlInit() -except ImportError: - PYNVML_AVAILABLE = False - -class MemoryTracker: - """Enhanced memory tracking for CPU and GPU.""" - - def __init__(self, log_dir: Optional[str] = None): - """Initialize memory tracker.""" - self.log_dir = Path(log_dir) if log_dir else Path("memory_logs") - self.log_dir.mkdir(exist_ok=True) - - self.memory_logs: List[Dict] = [] - self.peak_memory: Dict[str, float] = { - 'cpu': 0.0, - 'gpu': 0.0 - } - - if PYNVML_AVAILABLE: - self.gpu_handles = [] - try: - device_count = pynvml.nvmlDeviceGetCount() - for i in range(device_count): - self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i)) - except Exception as e: - logger.log_warning(f"Failed to initialize GPU handles: {e}") - - def get_cpu_memory(self) -> float: - """Get current CPU memory usage in GB.""" - process = psutil.Process() - return process.memory_info().rss / (1024 ** 3) - - def get_gpu_memory(self, device_index: int = 0) -> Optional[Dict[str, float]]: - """Get current GPU memory 
usage in GB.""" - if not PYNVML_AVAILABLE or not self.gpu_handles: - return None - - try: - handle = self.gpu_handles[device_index] - info = pynvml.nvmlDeviceGetMemoryInfo(handle) - return { - 'total': info.total / (1024 ** 3), - 'used': info.used / (1024 ** 3), - 'free': info.free / (1024 ** 3) - } - except Exception as e: - logger.log_warning(f"Failed to get GPU memory info: {e}") - return None - - def get_torch_memory(self, device: Optional[Union[str, torch.device]] = None) -> Optional[float]: - """Get PyTorch allocated memory in GB.""" - if device is None and torch.cuda.is_available(): - device = torch.cuda.current_device() - - try: - if isinstance(device, str): - device = torch.device(device) - - if device.type == 'cuda': - return torch.cuda.memory_allocated(device) / (1024 ** 3) - return None - except Exception as e: - logger.log_warning(f"Failed to get PyTorch memory info: {e}") - return None - - def log_memory(self, operation: str, extra_info: Optional[Dict] = None): - """Log current memory usage.""" - timestamp = time.time() - cpu_mem = self.get_cpu_memory() - self.peak_memory['cpu'] = max(self.peak_memory['cpu'], cpu_mem) - - memory_info = { - 'timestamp': timestamp, - 'operation': operation, - 'cpu_memory_gb': cpu_mem, - 'peak_cpu_memory_gb': self.peak_memory['cpu'] - } - - if PYNVML_AVAILABLE: - for i, handle in enumerate(self.gpu_handles): - gpu_mem = self.get_gpu_memory(i) - if gpu_mem: - memory_info[f'gpu{i}_memory_gb'] = gpu_mem - self.peak_memory['gpu'] = max( - self.peak_memory['gpu'], - gpu_mem['used'] - ) - memory_info[f'peak_gpu{i}_memory_gb'] = self.peak_memory['gpu'] - - # Get GPU utilization - try: - util = pynvml.nvmlDeviceGetUtilizationRates(handle) - memory_info[f'gpu{i}_utilization'] = { - 'gpu': util.gpu, - 'memory': util.memory - } - except Exception as e: - logger.log_warning(f"Failed to get GPU utilization: {e}") - - # Add PyTorch specific memory info - if torch.cuda.is_available(): - for i in range(torch.cuda.device_count()): - torch_mem = self.get_torch_memory(f'cuda:{i}') - if torch_mem is not None: - memory_info[f'torch_gpu{i}_allocated_gb'] = torch_mem - - if extra_info: - memory_info.update(extra_info) - - self.memory_logs.append(memory_info) - logger.log_memory( - operation, - memory_info['cpu_memory_gb'], - 'cpu' - ) - - # Save to file - self._save_logs() - - def _save_logs(self): - """Save memory logs to file.""" - timestamp = time.strftime("%Y%m%d_%H%M%S") - log_file = self.log_dir / f"memory_log_{timestamp}.json" - - with open(log_file, 'w') as f: - json.dump(self.memory_logs, f, indent=2) - - def clear_memory(self): - """Clear memory and caches.""" - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - for i in range(torch.cuda.device_count()): - torch.cuda.synchronize(i) - - def get_summary(self) -> Dict: - """Get memory usage summary.""" - if not self.memory_logs: - return {} - - summary = { - 'peak_cpu_memory_gb': self.peak_memory['cpu'], - 'peak_gpu_memory_gb': self.peak_memory['gpu'], - 'num_operations': len(self.memory_logs), - 'total_duration': self.memory_logs[-1]['timestamp'] - self.memory_logs[0]['timestamp'] - } - - # Calculate average memory usage - cpu_memories = [log['cpu_memory_gb'] for log in self.memory_logs] - summary['avg_cpu_memory_gb'] = sum(cpu_memories) / len(cpu_memories) - - if PYNVML_AVAILABLE and self.gpu_handles: - gpu_memories = [] - for i in range(len(self.gpu_handles)): - gpu_key = f'gpu{i}_memory_gb' - gpu_mems = [ - log[gpu_key]['used'] - for log in self.memory_logs - if gpu_key in log - ] - 
if gpu_mems:
-                    summary[f'avg_gpu{i}_memory_gb'] = sum(gpu_mems) / len(gpu_mems)
-                    gpu_memories.extend(gpu_mems)
-
-            if gpu_memories:
-                summary['avg_gpu_memory_gb'] = sum(gpu_memories) / len(gpu_memories)
-
-        return summary
-
-    def __del__(self):
-        """Cleanup NVML on deletion."""
-        if PYNVML_AVAILABLE:
-            try:
-                pynvml.nvmlShutdown()
-            except Exception as e:
-                logger.log_warning(f"Failed to shutdown NVML: {e}")
-
-# Global memory tracker instance
-memory_tracker = MemoryTracker()
diff --git a/setup.py b/setup.py
index 52f7b6d..424aa37 100644
--- a/setup.py
+++ b/setup.py
@@ -5,10 +5,10 @@
 setup(
     name="quantllm",
-    version="1.1.0",
+    version="1.2.0",
     author="Dark Coder",
     author_email="codewithdark90@gmail.com",
-    description="A lightweight library for quantized LLM fine-tuning and deployment",
+    description="A lightweight library for quantized LLM fine-tuning and deployment with GGUF support",
     long_description=long_description,
     long_description_content_type="text/markdown",
     project_urls={
@@ -43,7 +43,11 @@
         "protobuf>=3.20.0",
         "einops>=0.6.1",
         "evaluate>=0.4.0",
-        "tensorboard>=2.13.0"
+        "tensorboard>=2.13.0",
+        "llama-cpp-python>=0.2.0",
+        "psutil>=5.9.0",
+        "pandas>=1.5.0",
+        "pynvml>=12.0.0"
     ],
     extras_require={
         "dev": [
@@ -58,6 +62,9 @@
             "isort>=5.10.0",
             "flake8>=4.0.0",
         ],
+        "gguf": [
+            "ctransformers>=0.2.0",
+        ],
     },
     include_package_data=True,
     zip_safe=False,
diff --git a/test/test_gguf_quantization.py b/test/test_gguf_quantization.py
index 14346cd..78a7eb2 100644
--- a/test/test_gguf_quantization.py
+++ b/test/test_gguf_quantization.py
@@ -4,31 +4,18 @@
 import os
 import tempfile
 from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
-from quantllm.quant.gguf import GGUFQuantizer
-from quantllm.quant.quantization_engine import QuantizedLinear
+from quantllm.quant import GGUFQuantizer
+from quantllm.quant.quantization_engine import QuantizedLinear
 from quantllm.utils.benchmark import QuantizationBenchmark
-from quantllm.utils.memory_tracker import MemoryTracker
+from quantllm.utils.logger import logger

 # Define model names for testing
 TEST_MODEL_NAME_SMALL = "facebook/opt-125m"  # Small model for quick unit tests
 TEST_MODEL_NAME_MEDIUM = "facebook/opt-350m" # Slightly larger model for integration tests

-def _get_dummy_calibration_data(batch_size=1, seq_len=128, vocab_size=1000, num_samples=32) -> torch.Tensor:
-    """
-    Generates a random tensor for calibration data.
-
-    Args:
-        batch_size (int): Batch size (typically 1 for calibration samples if processed one by one).
-                          This function generates `num_samples` total, so batch_size here is for shape convention.
-        seq_len (int): Sequence length of the calibration data.
-        vocab_size (int): Vocabulary size to sample token IDs from.
-        num_samples (int): Number of calibration samples to generate.
-
-    Returns:
-        torch.Tensor: A tensor of shape (num_samples, seq_len) with random integer token IDs.
-    """
-    # Simple random integer data
-    return torch.randint(0, vocab_size, (num_samples, seq_len))
+def _get_dummy_calibration_data(vocab_size: int = 32000, seq_len: int = 32, batch_size: int = 4):
+    """Helper function to generate dummy calibration data."""
+    return torch.randint(0, vocab_size, (batch_size, seq_len))

 def _load_model_and_tokenizer(model_name, trust_remote_code=True):
     """Helper to load a Hugging Face model and tokenizer."""
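To make the contract of the slimmed-down calibration helper explicit, here is a small pytest-style check (the test name is hypothetical; it relies only on the helper defined in this diff):

```python
import torch

def test_dummy_calibration_data_shape_and_range():
    """_get_dummy_calibration_data should return int64 token IDs shaped (batch_size, seq_len)."""
    data = _get_dummy_calibration_data(vocab_size=32000, seq_len=32, batch_size=4)
    assert data.shape == (4, 32)
    assert data.dtype == torch.long  # torch.randint defaults to int64
    assert int(data.min()) >= 0 and int(data.max()) < 32000
```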