small stylistic fixes and adjustments, fix bug in Makefile, and change the timing code to skip the first (slow) iteration
karpathy committed Jul 27, 2023
1 parent 0e1b0d4 commit 25b50ee
Showing 2 changed files with 14 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -42,7 +42,7 @@ rungnu:
 	$(CC) -Ofast -std=gnu11 -o run run.c -lm

 .PHONY: runompgnu
-rungnu:
+runompgnu:
 	$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run

 .PHONY: clean
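A note on the bug fixed above: the OpenMP GNU rule had been pasted in under the duplicate target name rungnu, so GNU make warned about an overriding recipe for rungnu and make runompgnu failed for lack of a matching rule. With the rename, the OpenMP build works as expected, e.g. make runompgnu followed by something like OMP_NUM_THREADS=4 ./run model.bin (the thread count and checkpoint name here are illustrative).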
30 changes: 13 additions & 17 deletions run.c
@@ -193,6 +193,7 @@ void softmax(float* x, int size) {

 void matmul(float* xout, float* x, float* w, int n, int d) {
     // W (d,n) @ x (n,) -> xout (d,)
+    // by far the most amount of time is spent inside this little function
     int i;
     #pragma omp parallel for private(i)
     for (i = 0; i < d; i++) {
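The loop body is collapsed by the diff viewer. For context, the whole routine at this point in history reads roughly as follows; the inner loop is a reconstruction, not part of this diff:

void matmul(float* xout, float* x, float* w, int n, int d) {
    // W (d,n) @ x (n,) -> xout (d,)
    // by far the most amount of time is spent inside this little function
    int i;
    #pragma omp parallel for private(i)
    for (i = 0; i < d; i++) {
        float val = 0.0f;
        for (int j = 0; j < n; j++) {
            val += w[i * n + j] * x[j]; // dot product of row i of W with x
        }
        xout[i] = val;
    }
}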
@@ -205,7 +206,7 @@ void matmul(float* xout, float* x, float* w, int n, int d) {
 }

 void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) {
-
+    // a few convenience variables
     float *x = s->x;
     int dim = p->dim;
@@ -222,7 +223,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*

     // forward all the layers
     for(int l = 0; l < p->n_layers; l++) {
-        
+
         // attention rmsnorm
         rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);

@@ -316,7 +317,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
         for (int i = 0; i < hidden_dim; i++) {
             s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
         }
-        
+
         // elementwise multiply with w3(x)
         for (int i = 0; i < hidden_dim; i++) {
             s->hb[i] = s->hb[i] * s->hb2[i];
@@ -398,15 +399,12 @@ int main(int argc, char *argv[]) {
     // read in the model.bin file
     Config config;
     TransformerWeights weights;
-    int fd = 0;
-    float* data = NULL;
-    long file_size;
+    int fd = 0; // file descriptor for memory mapping
+    float* data = NULL; // memory mapped data pointer
+    long file_size; // size of the checkpoint file in bytes
     {
         FILE *file = fopen(checkpoint, "rb");
-        if (!file) {
-            printf("Unable to open the checkpoint file %s!\n", checkpoint);
-            return 1;
-        }
+        if (!file) { printf("Couldn't open file %s\n", checkpoint); return 1; }
         // read in the config header
         if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
         // negative vocab size is hacky way of signaling unshared weights. bit yikes.
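These three variables feed the memory-mapping code just below, which the viewer elides. Presumably it looks something like this (a sketch; needs <fcntl.h> and <sys/mman.h>, and the exact error handling may differ):

fseek(file, 0, SEEK_END); // move to the end of the checkpoint
file_size = ftell(file);  // size of the checkpoint file in bytes
fclose(file);
fd = open(checkpoint, O_RDONLY);
if (fd == -1) { printf("open failed!\n"); return 1; }
data = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) { printf("mmap failed!\n"); return 1; }

Mapping the weights rather than reading them is also what makes the first generated token slow; the timing change at the bottom of this diff works around exactly that.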
@@ -431,11 +429,7 @@ int main(int argc, char *argv[]) {
     char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
     {
         FILE *file = fopen("tokenizer.bin", "rb");
-        if (!file) {
-            printf("Unable to open the tokenizer file tokenizer.bin! Run "
-                   "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
-            return 1;
-        }
+        if (!file) { printf("Couldn't load tokenizer.bin\n"); return 1; }
         int len;
         for (int i = 0; i < config.vocab_size; i++) {
             if(fread(&len, sizeof(int), 1, file) != 1) { return 1; }
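The rest of the loop body is collapsed; it presumably allocates each vocabulary entry and reads its bytes, roughly:

vocab[i] = (char*)malloc(len + 1);
if (fread(vocab[i], len, 1, file) != 1) { return 1; }
vocab[i][len] = '\0'; // NUL-terminate the token string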
@@ -451,7 +445,7 @@ int main(int argc, char *argv[]) {
     malloc_run_state(&state, &config);

     // the current position we are in
-    long start = time_in_ms();
+    long start = 0; // used to time our code, only initialized after first iteration
     int next;
     int token = 1; // 1 = BOS token in Llama-2 sentencepiece
     int pos = 0;
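The time_in_ms helper is not part of this diff. A plausible implementation, assuming clock_gettime is used (a sketch, not necessarily the repository's exact code):

#include <time.h>

long time_in_ms() {
    // wall-clock time in milliseconds, coarse enough for tok/s reporting
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
    return time.tv_sec * 1000 + time.tv_nsec / 1000000;
}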
@@ -479,11 +473,13 @@ int main(int argc, char *argv[]) {
         // advance forward
         token = next;
         pos++;
+        // init our timer here because the first iteration is slow due to memmap
+        if (start == 0) { start = time_in_ms(); }
     }

     // report achieved tok/s
     long end = time_in_ms();
-    printf("\nachieved tok/s: %f\n", steps / (double)(end-start)*1000);
+    printf("\nachieved tok/s: %f\n", (steps-1) / (double)(end-start)*1000);

     // memory and file handles cleanup
     free_run_state(&state);
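The two timing changes work together: start is captured only once the first loop iteration has finished, because that iteration is dominated by page faults while the mmap'd weights are pulled into memory, and the numerator drops to steps-1 so it counts only the tokens generated inside the shortened window. Worked example: with steps = 256 and end-start = 5000 ms, the program reports 255 / 5000 * 1000 = 51 tok/s, instead of averaging the slow first token into the rate.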
