Support VLM in chat completion (+some specs updates) (#2556)

* Support VLM in chat completion (+some specs updates)
* document VLM
huggingface · Sep 25, 2024 · 12ec449 · 12ec449
1 parent c0fd4e0
commit 12ec449
Show file tree

Hide file tree

Showing 18 changed files with 411 additions and 125 deletions.
diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
@@ -63,9 +63,9 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk
 
-[[autodoc]] huggingface_hub.ChatCompletionInputTool
+[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions
 
-[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass
+[[autodoc]] huggingface_hub.ChatCompletionInputToolType
 
 [[autodoc]] huggingface_hub.ChatCompletionInputURL
 
@@ -103,6 +103,10 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ChatCompletionStreamOutputTopLogprob
 
+[[autodoc]] huggingface_hub.ChatCompletionStreamOutputUsage
+
+[[autodoc]] huggingface_hub.ToolElement
+
 
 
 ## depth_estimation
@@ -219,12 +223,12 @@ This part of the lib is still under development and will be improved in future r
 
 ## summarization
 
-[[autodoc]] huggingface_hub.SummarizationGenerationParameters
-
 [[autodoc]] huggingface_hub.SummarizationInput
 
 [[autodoc]] huggingface_hub.SummarizationOutput
 
+[[autodoc]] huggingface_hub.SummarizationParameters
+
 
 
 ## table_question_answering
@@ -307,6 +311,18 @@ This part of the lib is still under development and will be improved in future r
 
 
 
+## text_to_speech
+
+[[autodoc]] huggingface_hub.TextToSpeechGenerationParameters
+
+[[autodoc]] huggingface_hub.TextToSpeechInput
+
+[[autodoc]] huggingface_hub.TextToSpeechOutput
+
+[[autodoc]] huggingface_hub.TextToSpeechParameters
+
+
+
 ## token_classification
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
@@ -319,12 +335,12 @@ This part of the lib is still under development and will be improved in future r
 
 ## translation
 
-[[autodoc]] huggingface_hub.TranslationGenerationParameters
-
 [[autodoc]] huggingface_hub.TranslationInput
 
 [[autodoc]] huggingface_hub.TranslationOutput
 
+[[autodoc]] huggingface_hub.TranslationParameters
+
 
 
 ## video_classification

diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
@@ -62,9 +62,9 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk
 
-[[autodoc]] huggingface_hub.ChatCompletionInputTool
+[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions
 
-[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass
+[[autodoc]] huggingface_hub.ChatCompletionInputToolType
 
 [[autodoc]] huggingface_hub.ChatCompletionInputURL
 
@@ -102,6 +102,10 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ChatCompletionStreamOutputTopLogprob
 
+[[autodoc]] huggingface_hub.ChatCompletionStreamOutputUsage
+
+[[autodoc]] huggingface_hub.ToolElement
+
 
 
 ## depth_estimation[[huggingface_hub.DepthEstimationInput]]
@@ -216,14 +220,14 @@ rendered properly in your Markdown viewer.
 
 
 
-## summarization[[huggingface_hub.SummarizationGenerationParameters]]
-
-[[autodoc]] huggingface_hub.SummarizationGenerationParameters
+## summarization[[huggingface_hub.SummarizationInput]]
 
 [[autodoc]] huggingface_hub.SummarizationInput
 
 [[autodoc]] huggingface_hub.SummarizationOutput
 
+[[autodoc]] huggingface_hub.SummarizationParameters
+
 
 
 ## table_question_answering[[huggingface_hub.TableQuestionAnsweringInput]]
@@ -306,6 +310,18 @@ rendered properly in your Markdown viewer.
 
 
 
+## text_to_speech[[huggingface_hub.TextToSpeechGenerationParameters]]
+
+[[autodoc]] huggingface_hub.TextToSpeechGenerationParameters
+
+[[autodoc]] huggingface_hub.TextToSpeechInput
+
+[[autodoc]] huggingface_hub.TextToSpeechOutput
+
+[[autodoc]] huggingface_hub.TextToSpeechParameters
+
+
+
 ## token_classification[[huggingface_hub.TokenClassificationInput]]
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
@@ -316,14 +332,14 @@ rendered properly in your Markdown viewer.
 
 
 
-## translation[[huggingface_hub.TranslationGenerationParameters]]
-
-[[autodoc]] huggingface_hub.TranslationGenerationParameters
+## translation[[huggingface_hub.TranslationInput]]
 
 [[autodoc]] huggingface_hub.TranslationInput
 
 [[autodoc]] huggingface_hub.TranslationOutput
 
+[[autodoc]] huggingface_hub.TranslationParameters
+
 
 
 ## video_classification[[huggingface_hub.VideoClassificationInput]]

diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
@@ -294,8 +294,8 @@
         "ChatCompletionInputGrammarType",
         "ChatCompletionInputMessage",
         "ChatCompletionInputMessageChunk",
-        "ChatCompletionInputTool",
-        "ChatCompletionInputToolTypeClass",
+        "ChatCompletionInputStreamOptions",
+        "ChatCompletionInputToolType",
         "ChatCompletionInputURL",
         "ChatCompletionOutput",
         "ChatCompletionOutputComplete",
@@ -314,6 +314,7 @@
         "ChatCompletionStreamOutputLogprob",
         "ChatCompletionStreamOutputLogprobs",
         "ChatCompletionStreamOutputTopLogprob",
+        "ChatCompletionStreamOutputUsage",
         "DepthEstimationInput",
         "DepthEstimationOutput",
         "DocumentQuestionAnsweringInput",
@@ -348,9 +349,9 @@
         "QuestionAnsweringParameters",
         "SentenceSimilarityInput",
         "SentenceSimilarityInputData",
-        "SummarizationGenerationParameters",
         "SummarizationInput",
         "SummarizationOutput",
+        "SummarizationParameters",
         "TableQuestionAnsweringInput",
         "TableQuestionAnsweringInputData",
         "TableQuestionAnsweringOutputElement",
@@ -379,12 +380,17 @@
         "TextToImageOutput",
         "TextToImageParameters",
         "TextToImageTargetSize",
+        "TextToSpeechGenerationParameters",
+        "TextToSpeechInput",
+        "TextToSpeechOutput",
+        "TextToSpeechParameters",
         "TokenClassificationInput",
         "TokenClassificationOutputElement",
         "TokenClassificationParameters",
-        "TranslationGenerationParameters",
+        "ToolElement",
         "TranslationInput",
         "TranslationOutput",
+        "TranslationParameters",
         "VideoClassificationInput",
         "VideoClassificationOutputElement",
         "VideoClassificationParameters",
@@ -802,8 +808,8 @@ def __dir__():
         ChatCompletionInputGrammarType,  # noqa: F401
         ChatCompletionInputMessage,  # noqa: F401
         ChatCompletionInputMessageChunk,  # noqa: F401
-        ChatCompletionInputTool,  # noqa: F401
-        ChatCompletionInputToolTypeClass,  # noqa: F401
+        ChatCompletionInputStreamOptions,  # noqa: F401
+        ChatCompletionInputToolType,  # noqa: F401
         ChatCompletionInputURL,  # noqa: F401
         ChatCompletionOutput,  # noqa: F401
         ChatCompletionOutputComplete,  # noqa: F401
@@ -822,6 +828,7 @@ def __dir__():
         ChatCompletionStreamOutputLogprob,  # noqa: F401
         ChatCompletionStreamOutputLogprobs,  # noqa: F401
         ChatCompletionStreamOutputTopLogprob,  # noqa: F401
+        ChatCompletionStreamOutputUsage,  # noqa: F401
         DepthEstimationInput,  # noqa: F401
         DepthEstimationOutput,  # noqa: F401
         DocumentQuestionAnsweringInput,  # noqa: F401
@@ -856,9 +863,9 @@ def __dir__():
         QuestionAnsweringParameters,  # noqa: F401
         SentenceSimilarityInput,  # noqa: F401
         SentenceSimilarityInputData,  # noqa: F401
-        SummarizationGenerationParameters,  # noqa: F401
         SummarizationInput,  # noqa: F401
         SummarizationOutput,  # noqa: F401
+        SummarizationParameters,  # noqa: F401
         TableQuestionAnsweringInput,  # noqa: F401
         TableQuestionAnsweringInputData,  # noqa: F401
         TableQuestionAnsweringOutputElement,  # noqa: F401
@@ -887,12 +894,17 @@ def __dir__():
         TextToImageOutput,  # noqa: F401
         TextToImageParameters,  # noqa: F401
         TextToImageTargetSize,  # noqa: F401
+        TextToSpeechGenerationParameters,  # noqa: F401
+        TextToSpeechInput,  # noqa: F401
+        TextToSpeechOutput,  # noqa: F401
+        TextToSpeechParameters,  # noqa: F401
         TokenClassificationInput,  # noqa: F401
         TokenClassificationOutputElement,  # noqa: F401
         TokenClassificationParameters,  # noqa: F401
-        TranslationGenerationParameters,  # noqa: F401
+        ToolElement,  # noqa: F401
         TranslationInput,  # noqa: F401
         TranslationOutput,  # noqa: F401
+        TranslationParameters,  # noqa: F401
         VideoClassificationInput,  # noqa: F401
         VideoClassificationOutputElement,  # noqa: F401
         VideoClassificationParameters,  # noqa: F401