chenfei-wu · yucaowang · Mar 30, 2023 · Apr 11, 2023
diff --git a/README.md b/README.md
@@ -63,9 +63,23 @@ pip install -r requirements.txt
 # prepare your private OpenAI key (for Linux)
 export OPENAI_API_KEY={Your_Private_Openai_Key}
 
+# if you're using the Azure OpenAI service, also set the following variables; OPENAI_API_KEY must then be your Azure OpenAI key (for Linux)
+export OPENAI_API_TYPE=azure
+export OPENAI_API_VERSION=2022-12-01
+export OPENAI_API_BASE=https://{your-resource-name}.openai.azure.com
+export OPENAI_API_KEY={Your_Private_Openai_Key}
+export OPENAI_API_AZURE_DEPLOYMENT={Your_Azure_Deployment_Name}
+
 # prepare your private OpenAI key (for Windows)
 set OPENAI_API_KEY={Your_Private_Openai_Key}
 
+# if you're using the Azure OpenAI service, also set the following variables; OPENAI_API_KEY must then be your Azure OpenAI key (for Windows)
+set OPENAI_API_TYPE=azure
+set OPENAI_API_VERSION=2022-12-01
+set OPENAI_API_BASE=https://{your-resource-name}.openai.azure.com
+set OPENAI_API_KEY={Your_Private_Openai_Key}
+set OPENAI_API_AZURE_DEPLOYMENT={Your_Azure_Deployment_Name}
+
 # Start Visual ChatGPT !
 # You can specify the GPU/CPU assignment by "--load", the parameter indicates which 
 # Visual Foundation Model to use and where it will be loaded to
@@ -95,28 +109,28 @@ python visual_chatgpt.py --load "ImageCaptioning_cuda:0,ImageEditing_cuda:0,
 Here we list the GPU memory usage of each visual foundation model, you can specify which one you like:
 
 | Foundation Model        | GPU Memory (MB) |
-|------------------------|-----------------|
-| ImageEditing           | 3981            |
-| InstructPix2Pix        | 2827            |
-| Text2Image             | 3385            |
-| ImageCaptioning        | 1209            |
-| Image2Canny            | 0               |
-| CannyText2Image        | 3531            |
-| Image2Line             | 0               |
-| LineText2Image         | 3529            |
-| Image2Hed              | 0               |
-| HedText2Image          | 3529            |
-| Image2Scribble         | 0               |
-| ScribbleText2Image     | 3531            |
-| Image2Pose             | 0               |
-| PoseText2Image         | 3529            |
-| Image2Seg              | 919             |
-| SegText2Image          | 3529            |
-| Image2Depth            | 0               |
-| DepthText2Image        | 3531            |
-| Image2Normal           | 0               |
-| NormalText2Image       | 3529            |
-| VisualQuestionAnswering| 1495            |
+| ----------------------- | --------------- |
+| ImageEditing            | 3981            |
+| InstructPix2Pix         | 2827            |
+| Text2Image              | 3385            |
+| ImageCaptioning         | 1209            |
+| Image2Canny             | 0               |
+| CannyText2Image         | 3531            |
+| Image2Line              | 0               |
+| LineText2Image          | 3529            |
+| Image2Hed               | 0               |
+| HedText2Image           | 3529            |
+| Image2Scribble          | 0               |
+| ScribbleText2Image      | 3531            |
+| Image2Pose              | 0               |
+| PoseText2Image          | 3529            |
+| Image2Seg               | 919             |
+| SegText2Image           | 3529            |
+| Image2Depth             | 0               |
+| DepthText2Image         | 3531            |
+| Image2Normal            | 0               |
+| NormalText2Image        | 3529            |
+| VisualQuestionAnswering | 1495            |
 
 ## Acknowledgement
 We appreciate the open source of the following projects:

diff --git a/visual_chatgpt.py b/visual_chatgpt.py
@@ -23,7 +23,7 @@
 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
 from langchain.chains.conversation.memory import ConversationBufferMemory
-from langchain.llms.openai import OpenAI
+from langchain.llms.openai import AzureOpenAI,OpenAI
 
 VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
 
@@ -224,6 +224,13 @@ def get_new_image_name(org_img_name, func_name="update"):
     return os.path.join(head, new_file_name)
 
 
+def get_llm_instance():
+    """Return the LLM client: AzureOpenAI when OPENAI_API_TYPE is "azure" (any letter case), otherwise OpenAI."""
+    if os.getenv('OPENAI_API_TYPE', "").lower() != "azure":
+        return OpenAI(temperature=0)
+    # The deployment name is read from OPENAI_API_AZURE_DEPLOYMENT (empty string when unset).
+    return AzureOpenAI(temperature=0, deployment_name=os.getenv('OPENAI_API_AZURE_DEPLOYMENT', ""))
+
 
 class MaskFormer:
     def __init__(self, device):
@@ -921,7 +928,7 @@ def inference(self, inputs):
 class InfinityOutPainting:
     template_model = True # Add this line to show this is a template model.
     def __init__(self, ImageCaptioning, ImageEditing, VisualQuestionAnswering):
-        self.llm = OpenAI(temperature=0)
+        self.llm = get_llm_instance()
         self.ImageCaption = ImageCaptioning
         self.ImageEditing = ImageEditing
         self.ImageVQA = VisualQuestionAnswering
@@ -1042,7 +1049,7 @@ def __init__(self, load_dict):
                 if e.startswith('inference'):
                     func = getattr(instance, e)
                     self.tools.append(Tool(name=func.name, description=func.description, func=func))
-        self.llm = OpenAI(temperature=0)
+        self.llm = get_llm_instance()
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
     def init_agent(self, lang):
         self.memory.clear() #clear previous history