From b3289772c53f5412dbfeef4e139f358bdfe62b5d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 5 Mar 2023 11:11:51 +0200
Subject: [PATCH 1/5] coreml : use Core ML encoder inference

---
 .gitignore                      |   2 +
 CMakeLists.txt                  |  68 +++++++++--
 Makefile                        |  48 +++++---
 coreml/whisper-encoder-impl.h   | 142 +++++++++++++++++++++++
 coreml/whisper-encoder-impl.m   | 197 ++++++++++++++++++++++++++++++++
 coreml/whisper-encoder.h        |  22 ++++
 coreml/whisper-encoder.mm       |  61 ++++++++++
 models/download-coreml-model.sh |  82 +++++++++++++
 whisper.cpp                     |  41 +++++++
 9 files changed, 638 insertions(+), 25 deletions(-)
 create mode 100644 coreml/whisper-encoder-impl.h
 create mode 100644 coreml/whisper-encoder-impl.m
 create mode 100644 coreml/whisper-encoder.h
 create mode 100644 coreml/whisper-encoder.mm
 create mode 100755 models/download-coreml-model.sh

diff --git a/.gitignore b/.gitignore
index 3d51e0be7d4..482d15eed28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 *.o
 *.a
+*.mlmodel
+*.mlmodelc
 .cache/
 .test/
 .vs/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b6d4b709c1..37364239a56 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,8 @@ if (APPLE)
     option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
     option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
     option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
+
+    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
     option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@@ -90,16 +92,33 @@ endif()
 
 find_package(Threads REQUIRED)
 
-# on APPLE - include Accelerate framework
-if (APPLE AND NOT WHISPER_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
+# on APPLE
+if (APPLE)
+    # include Accelerate framework
+    if (NOT WHISPER_NO_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
 
-        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
+            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
+    endif()
+
+    if (WHISPER_COREML)
+        find_library(FOUNDATION_FRAMEWORK Foundation)
+        find_library(COREML_FRAMEWORK CoreML)
+        # the whisper.coreml target links CoreML unconditionally, so a missing framework must stop the configure step
+        if (COREML_FRAMEWORK)
+            message(STATUS "CoreML framework found")
+
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
+        else()
+            message(FATAL_ERROR "CoreML framework not found")
+        endif()
     endif()
 endif()
 
@@ -187,6 +206,33 @@ if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()
 
+#
+# whisper.coreml - Core ML support
+#
+
+if (WHISPER_COREML)
+    set(TARGET whisper.coreml)
+
+    add_library(${TARGET}
+        coreml/whisper-encoder.h
+        coreml/whisper-encoder.mm
+        coreml/whisper-encoder-impl.h
+        coreml/whisper-encoder-impl.m
+        )
+
+    include(DefaultTargetOptions)
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
+    # ARC is mandatory: whisper-encoder-impl.m refuses to compile without -fobjc-arc
+    target_compile_options(${TARGET} PRIVATE
+        -fobjc-arc
+        )
+endif()
+
 #
 # whisper - this is the main library of the project
 #
@@ -206,6 +252,10 @@ target_include_directories(${TARGET} PUBLIC
     .
     )
 
+if (WHISPER_COREML)
+    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
+endif()
+
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
 
diff --git a/Makefile b/Makefile
index 9454b3a9b9c..c452b88ec61 100644
--- a/Makefile
+++ b/Makefile
@@ -140,6 +140,10 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
+ifdef WHISPER_COREML
+	CXXFLAGS += -DWHISPER_USE_COREML
+	LDFLAGS  += -framework Foundation -framework CoreML
+endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@@ -195,11 +199,23 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h ggml.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
 
-libwhisper.a: ggml.o whisper.o
-	$(AR) rcs libwhisper.a ggml.o whisper.o
+ifndef WHISPER_COREML
+WHISPER_OBJ = whisper.o
+else
+whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder.mm -o whisper-encoder.o
+# both Objective-C sources are built with ARC, matching the CMake build of whisper.coreml
+whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
+
+WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+endif
+
+libwhisper.a: ggml.o $(WHISPER_OBJ)
+	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
 
-libwhisper.so: ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
+libwhisper.so: ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
 	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
@@ -213,24 +229,24 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp
 
-main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
 	./main -h
 
-bench: examples/bench/bench.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 
-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
 
-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples
diff --git a/coreml/whisper-encoder-impl.h b/coreml/whisper-encoder-impl.h
new file mode 100644
index 00000000000..9395acb250f
--- /dev/null
+++ b/coreml/whisper-encoder-impl.h
@@ -0,0 +1,142 @@
+//
+// CoremlEncoder.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
+
+/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
+
+/// output as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * output;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoder : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize CoremlEncoder instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize CoremlEncoder instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize CoremlEncoder instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize CoremlEncoder instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize CoremlEncoder instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct CoremlEncoder instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
+
+/**
+    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of CoremlEncoderInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as CoremlEncoderOutput
+*/
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of CoremlEncoderInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as CoremlEncoderOutput
+*/
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as CoremlEncoderOutput
+*/
+- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<CoremlEncoderOutput *>
+*/
+- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/coreml/whisper-encoder-impl.m b/coreml/whisper-encoder-impl.m
new file mode 100644
index 00000000000..9d3a08b8d0b
--- /dev/null
+++ b/coreml/whisper-encoder-impl.m
@@ -0,0 +1,197 @@
+//
+// CoremlEncoder.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-encoder-impl.h"
+
+@implementation CoremlEncoderInput
+
+- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
+    self = [super init];
+    if (self) {
+        _melSegment = melSegment;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"melSegment"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"melSegment"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
+    }
+    return nil;
+}
+
+@end
+
+@implementation CoremlEncoderOutput
+
+- (instancetype)initWithOutput:(MLMultiArray *)output {
+    self = [super init];
+    if (self) {
+        _output = output;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"output"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"output"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.output];
+    }
+    return nil;
+}
+
+@end
+
+@implementation CoremlEncoder
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize CoremlEncoder instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize CoremlEncoder instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize CoremlEncoder instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize CoremlEncoder instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize CoremlEncoder instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct CoremlEncoder instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+}
+
+- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
diff --git a/coreml/whisper-encoder.h b/coreml/whisper-encoder.h
new file mode 100644
index 00000000000..84bbe416505
--- /dev/null
+++ b/coreml/whisper-encoder.h
@@ -0,0 +1,22 @@
+// Wrapper of the Core ML Whisper Encoder model
+//
+// Code is derived from the work of Github user @wangchou
+// ref: https://github.com/wangchou/callCoreMLFromCpp
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context;
+// opaque handle: the struct is only defined inside whisper-encoder.mm
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
+void whisper_coreml_free(struct whisper_coreml_context * ctx);
+// mel is wrapped as a 1 x 80 x 3000 float32 array; the model output is copied into out
+void whisper_coreml_encode(
+        const struct whisper_coreml_context * ctx,
+                                      float * mel,
+                                      float * out);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/coreml/whisper-encoder.mm b/coreml/whisper-encoder.mm
new file mode 100644
index 00000000000..09091c2003c
--- /dev/null
+++ b/coreml/whisper-encoder.mm
@@ -0,0 +1,61 @@
+#import "coreml/whisper-encoder.h"
+#import "coreml/whisper-encoder-impl.h"
+
+#import <CoreML/CoreML.h>
+
+#include <stdlib.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context {
+    const void * data;
+};
+
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
+    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
+
+    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
+
+    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
+
+    if (data == NULL) {
+        return NULL;
+    }
+
+    whisper_coreml_context * ctx = new whisper_coreml_context;
+
+    ctx->data = data;
+
+    return ctx;
+}
+
+void whisper_coreml_free(struct whisper_coreml_context * ctx) {
+    if (ctx) { CFRelease(ctx->data); } // tolerate a null context, like free() and delete do
+    delete ctx;
+}
+
+void whisper_coreml_encode(
+        const whisper_coreml_context * ctx,
+                               float * mel,
+                               float * out) {
+    MLMultiArray * inMultiArray = [
+        [MLMultiArray alloc] initWithDataPointer: mel
+                                           shape: @[@1, @80, @3000]
+                                        dataType: MLMultiArrayDataTypeFloat32
+                                         strides: @[@(240000), @(3000), @1]
+                                     deallocator: nil
+                                           error: nil
+    ];
+
+    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
+
+    MLMultiArray * outMA = outCoreML.output;
+    if (outMA == nil) { return; } // prediction failed: nothing to copy, leave `out` untouched
+    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
+}
+
+#if __cplusplus
+}
+#endif
diff --git a/models/download-coreml-model.sh b/models/download-coreml-model.sh
new file mode 100755
index 00000000000..d46789d7c06
--- /dev/null
+++ b/models/download-coreml-model.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# This script downloads Whisper model files that have already been converted to Core ML format.
+# This way you don't have to convert them yourself.
+
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
+pfx="resolve/main/ggml"
+
+# get the path of this script
+function get_script_path() {
+    if [ -x "$(command -v realpath)" ]; then
+        dirname "$(realpath "$0")"
+    else
+        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
+        echo "$ret"
+    fi
+}
+
+models_path="$(get_script_path)"
+
+# Whisper models
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+
+# list available models
+function list_models {
+    printf "\n"
+    printf "  Available models:"
+    for model in "${models[@]}"; do
+        printf " $model"
+    done
+    printf "\n\n"
+}
+
+if [ "$#" -ne 1 ]; then
+    printf "Usage: $0 <model>\n"
+    list_models
+
+    exit 1
+fi
+
+model=$1
+
+if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
+    printf "Invalid model: $model\n"
+    list_models
+
+    exit 1
+fi
+
+# download Core ML model
+
+printf "Downloading Core ML model $model from '$src' ...\n"
+
+cd "$models_path" || exit 1
+
+if [ -f "ggml-$model.mlmodel" ]; then
+    printf "Model $model already exists. Skipping download.\n"
+    exit 0
+fi
+
+if [ -x "$(command -v wget)" ]; then
+    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+elif [ -x "$(command -v curl)" ]; then
+    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+else
+    printf "Either wget or curl is required to download models.\n"
+    exit 1
+fi
+
+
+if [ $? -ne 0 ]; then
+    printf "Failed to download Core ML model $model \n"
+    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
+    exit 1
+fi
+
+printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
+printf "Run the following command to compile it:\n\n"
+printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
+printf "You can now use it like this:\n\n"
+printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "\n"
diff --git a/whisper.cpp b/whisper.cpp
index 846d3a93dbe..5b68bd816ff 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1,5 +1,8 @@
 #define WHISPER_BUILD
 #include "whisper.h"
+#ifdef WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif
 
 #include "ggml.h"
 
@@ -586,6 +589,11 @@ struct whisper_state {
 
     int lang_id = 0; // english by default
 
+    std::string path_model; // populated by whisper_init_from_file()
+#ifdef WHISPER_USE_COREML
+    whisper_coreml_context * ctx_coreml = nullptr;
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg = 0;
     int64_t t_last = 0;
@@ -1674,6 +1682,9 @@ static bool whisper_encode_internal(
     wstate.use_buf(ctx0, -1);
 
     // run the computation
+#ifdef WHISPER_USE_COREML
+    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
+#else
     {
         struct ggml_cgraph gf = {};
         gf.n_threads = n_threads;
@@ -1683,6 +1694,7 @@ static bool whisper_encode_internal(
 
         //ggml_graph_print(&gf);
     }
+#endif
 
     // cur
     //{
@@ -2524,6 +2536,20 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     return state;
 }
 
+#ifdef WHISPER_USE_COREML
+// replace .bin with .mlmodelc
+static std::string whisper_get_coreml_path(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += ".mlmodelc";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
     whisper_model_loader loader = {};
 
@@ -2536,6 +2562,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
     }
 
     loader.context = &fin;
+
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@@ -2556,6 +2583,16 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
 
     if (ctx) {
         ctx->path_model = path_model;
+#ifdef WHISPER_USE_COREML
+        const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
+        fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+
+        ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+        if (!ctx->ctx_coreml) {
+            fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+            return nullptr;
+        }
+#endif
     }
 
     return ctx;
@@ -2683,6 +2720,10 @@ void whisper_free(struct whisper_context * ctx) {
 
         whisper_free_state(ctx->state);
 
+#ifdef WHISPER_USE_COREML
+        whisper_coreml_free(ctx->ctx_coreml);
+        ctx->ctx_coreml = nullptr;
+#endif
         delete ctx;
     }
 }

From 4f7963ef82dd7775c6982db7f45c934c57b530e2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 5 Mar 2023 18:31:09 +0200
Subject: [PATCH 2/5] coreml : simplify whisper_encode + log messages

---
 whisper.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/whisper.cpp b/whisper.cpp
index 5b68bd816ff..1fa0bb0f741 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1384,6 +1384,7 @@ static bool whisper_encode_internal(
         }
     }
 
+#ifndef WHISPER_USE_COREML
     struct ggml_tensor * cur;
 
     // convolution + gelu
@@ -1682,9 +1683,6 @@ static bool whisper_encode_internal(
     wstate.use_buf(ctx0, -1);
 
     // run the computation
-#ifdef WHISPER_USE_COREML
-    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
-#else
     {
         struct ggml_cgraph gf = {};
         gf.n_threads = n_threads;
@@ -1694,6 +1692,12 @@ static bool whisper_encode_internal(
 
         //ggml_graph_print(&gf);
     }
+#else
+    wctx.use_buf(ctx0, -1);
+
+    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+
+    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
 #endif
 
     // cur
@@ -2586,12 +2590,15 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
 #ifdef WHISPER_USE_COREML
         const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
         fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
 
         ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
         if (!ctx->ctx_coreml) {
             fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
             return nullptr;
         }
+
+        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
 #endif
     }
 

From 73cd2167a155fb78f27eb613ca3f3383f9f76112 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 14 Apr 2023 22:17:08 +0300
Subject: [PATCH 3/5] whisper : resolve rebase conflicts

---
 whisper.cpp | 69 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/whisper.cpp b/whisper.cpp
index 1fa0bb0f741..0cc6f6e380e 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1693,11 +1693,11 @@ static bool whisper_encode_internal(
         //ggml_graph_print(&gf);
     }
 #else
-    wctx.use_buf(ctx0, -1);
+    wstate.use_buf(ctx0, -1);
 
     struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
 
-    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
+    whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
 #endif
 
     // cur
@@ -2491,6 +2491,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //
 
+#ifdef WHISPER_USE_COREML
+// replace .bin with .mlmodelc
+static std::string whisper_get_coreml_path(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += ".mlmodelc";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
     whisper_state * state = new whisper_state;
 
@@ -2518,6 +2532,21 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
         fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
+#ifdef WHISPER_USE_COREML
+    const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
+
+    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+
+    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+    if (!state->ctx_coreml) {
+        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        return nullptr;
+    }
+
+    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+#endif
+
     state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
 
     state->logits_id.reserve(ctx->model.hparams.n_vocab);
@@ -2540,20 +2569,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     return state;
 }
 
-#ifdef WHISPER_USE_COREML
-// replace .bin with .mlmodelc
-static std::string whisper_get_coreml_path(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += ".mlmodelc";
-
-    return path_bin;
-}
-#endif
-
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
     whisper_model_loader loader = {};
 
@@ -2587,19 +2602,6 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
 
     if (ctx) {
         ctx->path_model = path_model;
-#ifdef WHISPER_USE_COREML
-        const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
-        fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-        fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
-
-        ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-        if (!ctx->ctx_coreml) {
-            fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-            return nullptr;
-        }
-
-        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
-#endif
     }
 
     return ctx;
@@ -2712,6 +2714,11 @@ void whisper_free_state(struct whisper_state * state)
             kv_cache_free(state->decoders[i].kv_self);
         }
 
+#ifdef WHISPER_USE_COREML
+        whisper_coreml_free(state->ctx_coreml);
+        state->ctx_coreml = nullptr;
+#endif
+
         delete state;
     }
 }
@@ -2727,10 +2734,6 @@ void whisper_free(struct whisper_context * ctx) {
 
         whisper_free_state(ctx->state);
 
-#ifdef WHISPER_USE_COREML
-        whisper_coreml_free(ctx->ctx_coreml);
-        ctx->ctx_coreml = nullptr;
-#endif
         delete ctx;
     }
 }

From 28b3232aebab56a62561efcc7a9f6a63a7b1bbb6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 15 Apr 2023 10:58:02 +0300
Subject: [PATCH 4/5] coreml : add scripts for CoreML model generation

---
 .gitignore                          |   7 +-
 coreml/whisper-decoder-impl.h       | 146 ++++++++++++
 coreml/whisper-decoder-impl.m       | 201 +++++++++++++++++
 coreml/whisper-encoder-impl.h       |  72 +++---
 coreml/whisper-encoder-impl.m       |  68 +++---
 coreml/whisper-encoder.mm           |  10 +-
 models/convert-whisper-to-coreml.py | 334 ++++++++++++++++++++++++++++
 models/generate-coreml-interface.sh |  29 +++
 models/generate-coreml-model.sh     |  25 +++
 whisper.cpp                         |  17 +-
 10 files changed, 830 insertions(+), 79 deletions(-)
 create mode 100644 coreml/whisper-decoder-impl.h
 create mode 100644 coreml/whisper-decoder-impl.m
 create mode 100644 models/convert-whisper-to-coreml.py
 create mode 100755 models/generate-coreml-interface.sh
 create mode 100755 models/generate-coreml-model.sh

diff --git a/.gitignore b/.gitignore
index 482d15eed28..67ec7c32408 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,7 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
+.coreml/
 .test/
 .vs/
 .vscode/
@@ -37,4 +36,6 @@ examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
 
 extra/bench-gg.txt
 
-*.mlmodel*
+models/*.mlmodel
+models/*.mlmodelc
+models/*.mlpackage
diff --git a/coreml/whisper-decoder-impl.h b/coreml/whisper-decoder-impl.h
new file mode 100644
index 00000000000..c6f2e853118
--- /dev/null
+++ b/coreml/whisper-decoder-impl.h
@@ -0,0 +1,146 @@
+//
+// whisper-decoder-impl.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>
+
+/// token_data as 1 by 1 matrix of 32-bit integers
+@property (readwrite, nonatomic, strong) MLMultiArray * token_data;
+
+/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
+
+/// var_1346 as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_impl : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param token_data as 1 by 1 matrix of 32-bit integers:
+    @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of whisper_decoder_implInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<whisper_decoder_implOutput *>
+*/
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/coreml/whisper-decoder-impl.m b/coreml/whisper-decoder-impl.m
new file mode 100644
index 00000000000..34060e45c71
--- /dev/null
+++ b/coreml/whisper-decoder-impl.m
@@ -0,0 +1,201 @@
+//
+// whisper-decoder-impl.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-decoder-impl.h"
+
+@implementation whisper_decoder_implInput
+
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data {
+    self = [super init];
+    if (self) {
+        _token_data = token_data;
+        _audio_data = audio_data;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"token_data", @"audio_data"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"token_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.token_data];
+    }
+    if ([featureName isEqualToString:@"audio_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.audio_data];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_implOutput
+
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
+    self = [super init];
+    if (self) {
+        _var_1346 = var_1346;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"var_1346"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"var_1346"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_impl
+
+
+/**
+    URL of the `whisper_decoder_impl.mlmodelc` resource in this class's bundle; logs an error and returns nil when it is not found.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper_decoder_impl.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
diff --git a/coreml/whisper-encoder-impl.h b/coreml/whisper-encoder-impl.h
index 9395acb250f..ecb61555b94 100644
--- a/coreml/whisper-encoder-impl.h
+++ b/coreml/whisper-encoder-impl.h
@@ -1,5 +1,5 @@
 //
-// CoremlEncoder.h
+// whisper-encoder-impl.h
 //
 // This file was automatically generated and should not be edited.
 //
@@ -13,20 +13,20 @@ NS_ASSUME_NONNULL_BEGIN
 
 
 /// Model Prediction Input Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>
 
-/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
+/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data;
 - (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER;
 
 @end
 
 
 /// Model Prediction Output Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>
 
 /// output as multidimensional array of floats
 @property (readwrite, nonatomic, strong) MLMultiArray * output;
@@ -37,8 +37,8 @@ API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((
 
 
 /// Class for model loading and prediction
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoder : NSObject
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_impl : NSObject
 @property (readonly, nonatomic, nullable) MLModel * model;
 
 /**
@@ -47,20 +47,20 @@ API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((
 + (nullable NSURL *)URLOfModelInThisBundle;
 
 /**
-    Initialize CoremlEncoder instance from an existing MLModel object.
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
 
 /**
-    Initialize CoremlEncoder instance with the model in this bundle.
+    Initialize whisper_encoder_impl instance with the model in this bundle.
 */
 - (nullable instancetype)init;
 
 /**
-    Initialize CoremlEncoder instance with the model in this bundle.
+    Initialize whisper_encoder_impl instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -68,75 +68,75 @@ API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((
 - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize CoremlEncoder instance from the model URL.
+    Initialize whisper_encoder_impl instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize CoremlEncoder instance from the model URL.
+    Initialize whisper_encoder_impl instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Construct CoremlEncoder instance asynchronously with configuration.
+    Construct whisper_encoder_impl instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
 
 /**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
+    @param input an instance of whisper_encoder_implInput to predict from
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
+    @return the prediction as whisper_encoder_implOutput
 */
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
+    @param input an instance of whisper_encoder_implInput to predict from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
+    @return the prediction as whisper_encoder_implOutput
 */
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make a prediction using the convenience interface
-    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
+    @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
+    @return the prediction as whisper_encoder_implOutput
 */
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Batch prediction
-    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
+    @param inputArray array of whisper_encoder_implInput instances to obtain predictions from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<CoremlEncoderOutput *>
+    @return the predictions as NSArray<whisper_encoder_implOutput *>
 */
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/coreml/whisper-encoder-impl.m b/coreml/whisper-encoder-impl.m
index 9d3a08b8d0b..ee8e506568f 100644
--- a/coreml/whisper-encoder-impl.m
+++ b/coreml/whisper-encoder-impl.m
@@ -1,5 +1,5 @@
 //
-// CoremlEncoder.m
+// whisper-encoder-impl.m
 //
 // This file was automatically generated and should not be edited.
 //
@@ -10,30 +10,30 @@
 
 #import "whisper-encoder-impl.h"
 
-@implementation CoremlEncoderInput
+@implementation whisper_encoder_implInput
 
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data {
     self = [super init];
     if (self) {
-        _melSegment = melSegment;
+        _logmel_data = logmel_data;
     }
     return self;
 }
 
 - (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"melSegment"]];
+    return [NSSet setWithArray:@[@"logmel_data"]];
 }
 
 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"melSegment"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
+    if ([featureName isEqualToString:@"logmel_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.logmel_data];
     }
     return nil;
 }
 
 @end
 
-@implementation CoremlEncoderOutput
+@implementation whisper_encoder_implOutput
 
 - (instancetype)initWithOutput:(MLMultiArray *)output {
     self = [super init];
@@ -56,23 +56,23 @@ - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
 
 @end
 
-@implementation CoremlEncoder
+@implementation whisper_encoder_impl
 
 
 /**
     URL of the underlying .mlmodelc directory.
 */
 + (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper_encoder_impl.mlmodelc in the bundle resource"); return nil; }
     return [NSURL fileURLWithPath:assetPath];
 }
 
 
 /**
-    Initialize CoremlEncoder instance from an existing MLModel object.
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model {
@@ -85,7 +85,7 @@ - (instancetype)initWithMLModel:(MLModel *)model {
 
 
 /**
-    Initialize CoremlEncoder instance with the model in this bundle.
+    Initialize whisper_encoder_impl instance with the model in this bundle.
 */
 - (nullable instancetype)init {
     return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
@@ -93,7 +93,7 @@ - (nullable instancetype)init {
 
 
 /**
-    Initialize CoremlEncoder instance with the model in this bundle.
+    Initialize whisper_encoder_impl instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -104,9 +104,9 @@ - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configura
 
 
 /**
-    Initialize CoremlEncoder instance from the model URL.
+    Initialize whisper_encoder_impl instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -117,9 +117,9 @@ - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError
 
 
 /**
-    Initialize CoremlEncoder instance from the model URL.
+    Initialize whisper_encoder_impl instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
@@ -131,13 +131,13 @@ - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(
 
 
 /**
-    Construct CoremlEncoder instance asynchronously with configuration.
+    Construct whisper_encoder_impl instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
     [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
               configuration:configuration
           completionHandler:handler];
@@ -145,20 +145,20 @@ + (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHa
 
 
 /**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
     [MLModel loadContentsOfURL:modelURL
                  configuration:configuration
              completionHandler:^(MLModel *model, NSError *error) {
         if (model != nil) {
-            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
+            whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model];
             handler(typedModel, nil);
         } else {
             handler(nil, error);
@@ -166,29 +166,29 @@ + (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration
     }];
 }
 
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
 }
 
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
     if (!outFeatures) { return nil; }
-    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
 }
 
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
     return [self predictionFromFeatures:input_ error:error];
 }
 
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
     id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
     if (!outBatch) { return nil; }
-    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    NSMutableArray<whisper_encoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
     for (NSInteger i = 0; i < outBatch.count; i++) {
         id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
         [results addObject:result];
     }
     return results;
diff --git a/coreml/whisper-encoder.mm b/coreml/whisper-encoder.mm
index 09091c2003c..dd08f0f318f 100644
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@@ -18,7 +18,7 @@
 
     NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
 
-    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
+    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
 
     if (data == NULL) {
         return NULL;
@@ -49,10 +49,16 @@ void whisper_coreml_encode(
                                            error: nil
     ];
 
-    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
+    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
 
     MLMultiArray * outMA = outCoreML.output;
 
+    //NSArray<NSNumber *> * shape = outMA.shape;
+    //NSArray<NSNumber *> * strides = outMA.strides;
+
+    //printf("shape:   %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
+    //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);
+
     memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
 }
 
diff --git a/models/convert-whisper-to-coreml.py b/models/convert-whisper-to-coreml.py
new file mode 100644
index 00000000000..489854ede7f
--- /dev/null
+++ b/models/convert-whisper-to-coreml.py
@@ -0,0 +1,334 @@
+import argparse
+import torch
+import torch.nn.functional as F
+import coremltools as ct
+
+from torch import Tensor
+from torch import nn
+from typing import Dict
+from typing import Optional
+from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
+from coremltools.models.neural_network.quantization_utils import quantize_weights
+from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
+from whisper import load_model
+
+# Load-state-dict pre-hook that adapts attention / MLP projection weights for the encoder and decoder
+def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
+                         missing_keys, unexpected_keys, error_msgs):
+    """
+    Append two singleton dims to each 2-D attention / MLP weight in place (nn.Linear layout -> nn.Conv2d layout)
+    """
+    mlp_suffixes = ('mlp.0.weight', 'mlp.2.weight')
+    for name, tensor in state_dict.items():
+        attn_weight = 'attn' in name and '.weight' in name
+        wanted = attn_weight or name.endswith(mlp_suffixes)
+        if wanted and len(tensor.shape) == 2:
+            state_dict[name] = tensor[:, :, None, None]
+
+
+def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
+                                           strict, missing_keys,
+                                           unexpected_keys, error_msgs):
+    state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
+    return state_dict
+
+class LayerNormANE(LayerNormANEBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._register_load_state_dict_pre_hook(
+            correct_for_bias_scale_order_inversion)
+
+class MultiHeadAttentionANE(MultiHeadAttention):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__(n_state, n_head)
+
+        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
+        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
+
+    def forward(self,
+                x: Tensor,
+                xa: Optional[Tensor] = None,
+                mask: Optional[Tensor] = None,
+                kv_cache: Optional[dict] = None):
+
+        q = self.query(x)
+
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+
+        wv, qk = self.qkv_attention_ane(q, k, v, mask)
+
+        return self.out(wv), qk
+
+    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
+
+        _, dim, _, seqlen = q.size()
+
+        dim_per_head = dim // self.n_head
+
+        scale = float(dim_per_head)**-0.5
+
+        q = q * scale
+
+        mh_q = q.split(dim_per_head, dim=1)
+        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
+        mh_v = v.split(dim_per_head, dim=1)
+
+        mh_qk = [
+            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
+            for qi, ki in zip(mh_q, mh_k)
+        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+
+        if mask is not None:
+            for head_idx in range(self.n_head):
+                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]
+
+        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
+        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)
+
+        return attn, torch.cat(mh_qk, dim=1).float().detach()
+
+
+class ResidualAttentionBlockANE(ResidualAttentionBlock):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__(n_state, n_head, cross_attention)
+
+        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
+        setattr(self, 'attn_ln', LayerNormANE(n_state))
+
+        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
+        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
+
+        n_mlp = n_state * 4
+        setattr(self, 'mlp', nn.Sequential(
+            nn.Conv2d(n_state, n_mlp, kernel_size=1),
+            nn.GELU(),
+            nn.Conv2d(n_mlp, n_state, kernel_size=1)
+        ))
+        setattr(self, 'mlp_ln', LayerNormANE(n_state))
+
+
+class AudioEncoderANE(AudioEncoder):
+    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln_post', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+
+        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"
+
+        # Add positional embedding and add dummy dim for ANE
+        x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.ln_post(x)
+
+        # """
+        # TODO:
+        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
+        # However, even doing this, the results are still wrong. Kind of less wrong compared to
+        # not transposing, but still wrong.
+
+        # Also, I don't know why the original OpenAI implementation does not need to transpose
+
+        # transpose to (batch_size, n_ctx, n_state)
+        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
+
+        # """
+        # x = x.transpose(1,3)
+
+        return x
+
+class TextDecoderANE(TextDecoder):
+
+    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_audio_state, 1, n_audio_ctx)
+            the encoded audio features to be attended on
+        """
+        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
+        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
+        x = x.to(xa.dtype)
+
+        # Reformat for ANE
+        mask = self.mask[None, None, :, :].permute(0,3,1,2)
+        x = x.transpose(1,2).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x, xa, mask=mask, kv_cache=kv_cache)
+
+        x = self.ln(x)
+
+        # Reformat back from ANE
+        x = x.permute(0,2,3,1).squeeze(0)
+
+        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
+        n_vocab = self.token_embedding.weight.shape[0]
+        # 51,865 tokens -> 11 chunks of 4715 each; 51,864 tokens -> 12 chunks of 4322 each
+        assert n_vocab in (51864, 51865)
+        n_chunks = 11 if n_vocab == 51865 else 12
+        splits = self.token_embedding.weight.split(n_vocab // n_chunks, dim=0)
+
+        # each chunk yields (1, seq, chunk) logits: join them along the vocab axis (dim=2); concatenating on dim 0
+        # and reshaping with view() only gives the right layout when seq == 1
+        logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits], dim=2)
+
+        return logits
+
+class WhisperANE(Whisper):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__(dims)
+
+        setattr(self, 'encoder', AudioEncoderANE(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        ))
+        setattr(self, 'decoder', TextDecoderANE(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        ))
+
+        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)
+
+    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[3] > self.decoder.positional_embedding.shape[0]:
+                cache[module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = torch.cat([cache[module], output], dim=3).detach()
+            return cache[module]
+
+        def install_hooks(layer: nn.Module):
+            if isinstance(layer, MultiHeadAttentionANE):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+
+def convert_encoder(hparams, model, quantize=False):
+    model.eval()
+
+    input_shape = (1, 80, 3000)
+    input_data = torch.randn(input_shape)
+    traced_model = torch.jit.trace(model, input_data)
+
+    model = ct.convert(
+        traced_model,
+        convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why
+        inputs=[ct.TensorType(name="logmel_data", shape=input_shape)],
+        outputs=[ct.TensorType(name="output")],
+        compute_units=ct.ComputeUnit.ALL
+    )
+
+    if quantize:
+        model = quantize_weights(model, nbits=16)
+
+    return model
+
+def convert_decoder(hparams, model, quantize=False):
+    model.eval()
+
+    tokens_shape = (1, 1)
+    audio_shape = (1, hparams.n_audio_state, 1, 1500)
+
+    audio_data = torch.randn(audio_shape)
+    token_data = torch.randint(50257, tokens_shape).long()
+    traced_model = torch.jit.trace(model, (token_data, audio_data))
+
+    model = ct.convert(
+        traced_model,
+        convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why
+        inputs=[
+            ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
+            ct.TensorType(name="audio_data", shape=audio_shape)
+        ]
+    )
+
+    if quantize:
+        model = quantize_weights(model, nbits=16)
+
+    return model
+
+
+if __name__ == "__main__":
+    # argparse's type=bool is plain bool(str): True for ANY non-empty value (even "False"), so parse the text explicitly
+    str2bool = lambda v: v.strip().lower() in ("1", "true", "yes", "y", "on")
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True)
+    parser.add_argument("--encoder-only", type=str2bool, help="only convert encoder", default=False)
+    parser.add_argument("--quantize",     type=str2bool, help="quantize weights to F16", default=False)
+    parser.add_argument("--optimize-ane", type=str2bool, help="optimize for ANE execution (currently broken)", default=False)
+    args = parser.parse_args()
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]:
+        raise ValueError("Invalid model name")
+
+    whisper = load_model(args.model).cpu()
+    hparams = whisper.dims
+    print(hparams)
+
+    if args.optimize_ane:
+        whisperANE = WhisperANE(hparams).eval()
+        whisperANE.load_state_dict(whisper.state_dict())
+
+        encoder = whisperANE.encoder
+        decoder = whisperANE.decoder
+    else:
+        encoder = whisper.encoder
+        decoder = whisper.decoder
+
+    # Convert encoder
+    encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
+    encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")
+
+    if args.encoder_only is False:
+        # Convert decoder
+        decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
+        decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")
+    print("done converting")
diff --git a/models/generate-coreml-interface.sh b/models/generate-coreml-interface.sh
new file mode 100755
index 00000000000..553d5f654f4
--- /dev/null
+++ b/models/generate-coreml-interface.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# This generates:
+#   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
+#   - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
+#
+
+# abort on the first failing command, so that a failed conversion or coremlc
+# run is not followed by mv/sed/rm operating on missing or stale files
+set -e
+
+# all paths below are relative to the parent of the directory holding this script
+wd=$(dirname "$0")
+cd "$wd/../"
+
+# writes models/coreml-encoder-tiny.en.mlpackage and models/coreml-decoder-tiny.en.mlpackage
+python3 models/convert-whisper-to-coreml.py --model tiny.en
+
+# the sources generated by coremlc are named whisper_<part>_impl.{h,m}: rename them to the
+# dash-separated names and rewrite the references to the old names inside the files
+mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
+xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/
+mv coreml/whisper_encoder_impl.h coreml/whisper-encoder-impl.h
+mv coreml/whisper_encoder_impl.m coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.m/whisper-encoder-impl.m/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.h
+
+mv -v models/coreml-decoder-tiny.en.mlpackage models/whisper-decoder-impl.mlpackage
+xcrun coremlc generate models/whisper-decoder-impl.mlpackage coreml/
+mv coreml/whisper_decoder_impl.h coreml/whisper-decoder-impl.h
+mv coreml/whisper_decoder_impl.m coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.m/whisper-decoder-impl.m/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.h
+
+# the renamed .mlpackage files were only inputs for the code generation
+rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage
diff --git a/models/generate-coreml-model.sh b/models/generate-coreml-model.sh
new file mode 100755
index 00000000000..29d6b1d8777
--- /dev/null
+++ b/models/generate-coreml-model.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Converts a Whisper model to Core ML and compiles its encoder to
+# models/ggml-<model-name>-encoder.mlmodelc
+#
+# Usage: ./generate-coreml-model.sh <model-name>
+if [ $# -eq 0 ]
+  then
+    echo "No model name supplied"
+    echo "Usage: ./generate-coreml-model.sh <model-name>"
+    exit 1
+fi
+
+# abort on the first failing command, so that a failed conversion or compile
+# does not go on to delete the previously generated model
+set -e
+
+mname="$1"
+
+# all paths below are relative to the parent of the directory holding this script
+wd=$(dirname "$0")
+cd "$wd/../"
+
+python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True
+
+xcrun coremlc compile "models/coreml-encoder-${mname}.mlpackage" models/
+# replace any previously compiled model with the fresh one
+rm -rf "models/ggml-${mname}-encoder.mlmodelc"
+mv -v "models/coreml-encoder-${mname}.mlmodelc" "models/ggml-${mname}-encoder.mlmodelc"
+
+# TODO: decoder (sometime in the future maybe)
+#       (needs the conversion above to run without --encoder-only)
+#xcrun coremlc compile models/coreml-decoder-${mname}.mlpackage models/
+#rm -rf models/ggml-${mname}-decoder.mlmodelc
+#mv -v models/coreml-decoder-${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc
diff --git a/whisper.cpp b/whisper.cpp
index 0cc6f6e380e..f4c9c3d02f4 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2492,14 +2492,14 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 //
 
 #ifdef WHISPER_USE_COREML
-// replace .bin with .mlmodelc
-static std::string whisper_get_coreml_path(std::string path_bin) {
+// drop everything from the last '.' in the path (e.g. ".bin"), if any, then append "-encoder.mlmodelc"
+static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
     auto pos = path_bin.rfind('.');
     if (pos != std::string::npos) {
         path_bin = path_bin.substr(0, pos);
     }
 
-    path_bin += ".mlmodelc";
+    path_bin += "-encoder.mlmodelc";
 
     return path_bin;
 }
@@ -2533,7 +2533,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
 #ifdef WHISPER_USE_COREML
-    const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
+    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
 
     fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
     fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
@@ -3140,6 +3140,14 @@ void whisper_reset_timings(struct whisper_context * ctx) {
     }
 }
 
+static int whisper_has_coreml(void) { // 1 if this build defines WHISPER_USE_COREML, 0 otherwise
+#ifndef WHISPER_USE_COREML
+    return 0;
+#else
+    return 1;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
     static std::string s;
 
@@ -3156,6 +3164,7 @@ const char * whisper_print_system_info(void) {
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
     s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
     s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
 
     return s.c_str();
 }

From 5fda9b1a9ceefc93bf30b79e924f1a1187e271db Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 15 Apr 2023 13:19:04 +0300
Subject: [PATCH 5/5] bench-all : recognize COREML flag

---
 extra/bench-all.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/extra/bench-all.sh b/extra/bench-all.sh
index fbfc8772f7c..92973786df3 100755
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@@ -64,6 +64,10 @@ for model in "${models[@]}"; do
         config="$config BLAS"
     fi
 
+    if [[ $system_info == *"COREML = 1"* ]]; then
+        config="$config COREML"
+    fi
+
     commit=$(git rev-parse --short HEAD)
 
     printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"