Avoid Storing Large Strings in Memory #194

Merged Jan 5, 2024 · 48 commits (diff below shows changes from 45 commits)
Commits (48, all by ahirreddy):

ff2b110  no buffer strings (Dec 20, 2023)
0a1253a  clsoer (Dec 20, 2023)
f653e53  fix perf (Dec 20, 2023)
79cbbe7  Revert "fix perf" (Dec 21, 2023)
c6e337c  Revert "clsoer" (Dec 21, 2023)
b823e37  fix up naming (Dec 21, 2023)
2757328  this is slow (Dec 21, 2023)
8caa0a5  buffered reader (Dec 21, 2023)
a9bab89  fix jmh (Dec 21, 2023)
f230288  wip (Dec 23, 2023)
597e248  parser input (Dec 23, 2023)
61a9150  8k buffer (Dec 23, 2023)
300688d  xz (Dec 24, 2023)
573c1f7  memory limit (Dec 24, 2023)
95cafc2  memory limit 20 (Dec 24, 2023)
50a2259  wip (Dec 24, 2023)
62e1474  limit 40 (Dec 24, 2023)
9e3bbe4  2gb limit (Dec 25, 2023)
ef1a463  long (Dec 25, 2023)
4f1aa87  fix limit (Dec 25, 2023)
972fa25  size (Dec 25, 2023)
a4dc4d0  split eval works (Dec 26, 2023)
50f127d  eval; (Dec 26, 2023)
1b0394b  Revert "eval;" (Dec 29, 2023)
a99bf47  Revert "split eval works" (Dec 29, 2023)
2530189  Revert "size" (Dec 29, 2023)
73a2283  fast util key map (Dec 29, 2023)
23113c7  object 2 object (Dec 29, 2023)
f9c31f6  Revert "object 2 object" (Dec 29, 2023)
b2a159f  Revert "fast util key map" (Dec 29, 2023)
27ade6f  Revert "Revert "fast util key map"" (Dec 29, 2023)
d8ffc64  Revert "Revert "Revert "fast util key map""" (Dec 29, 2023)
8090664  level (Dec 29, 2023)
aff20d0  xz test (Dec 29, 2023)
1efe031  xz with level (Dec 29, 2023)
05c02b6  revert (Dec 29, 2023)
b65ed5f  cleanup (Dec 29, 2023)
9bbd08f  crc (Dec 29, 2023)
9ef506f  cleanup (Dec 29, 2023)
f71cee6  wip (Dec 29, 2023)
25ee568  add test cases (Dec 29, 2023)
c3ecaff  moved (Dec 30, 2023)
bbad767  closer (Dec 30, 2023)
aba79be  split out test (Dec 30, 2023)
2a170c0  fix compile (Dec 30, 2023)
b8bdff0  avoid extra seek (Jan 2, 2024)
c6956c2  avoid extra copy (Jan 2, 2024)
67953ed  added doc (Jan 2, 2024)
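Before the per-file diffs, it helps to see the abstraction the PR is built around: the importer now returns a ResolvedFile instead of a raw String. The trait itself is not part of these diffs, so the following is a minimal sketch inferred from its call sites (getParserInput(), readString(), contentHash); the member names are taken from the diffs but the exact signatures are assumptions, not the PR's verbatim definition.

```scala
import fastparse.ParserInput

// Hypothetical sketch of the ResolvedFile trait, inferred from call sites in this PR.
trait ResolvedFile {
  // A fastparse input over the file contents; for large files this can stream
  // from disk instead of requiring the whole string in memory.
  def getParserInput(): ParserInput

  // Materialize the full contents as a String (used by importstr and the benchmarks).
  def readString(): String

  // A stable hash of the contents, usable as a cache key or for change detection.
  def contentHash: String
}
```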
11 changes: 7 additions & 4 deletions bench/src/main/scala/sjsonnet/MainBenchmark.scala
@@ -8,9 +8,12 @@ import org.openjdk.jmh.infra._

 object MainBenchmark {
   val mainArgs = Array[String](
-    "../../universe2/rulemanager/deploy/rulemanager.jsonnet",
-    "-J", "../../universe2",
-    "-J", "../../universe2/mt-shards/dev/az-westus-c2",
+    "../../universe/rulemanager/deploy/rulemanager.jsonnet",
+    // "../../universe/kubernetes/admission-controller/gatekeeper/deploy/gatekeeper.jsonnet",
+    "-J", "../../universe",
+    "-J", "../../universe/mt-shards/dev/az-westus-c2",
+    "-J", "../../universe/bazel-bin",
Comment on lines +14 to +15, by ahirreddy (Collaborator, Author):

Added these extra imports as they're now required to compile the given target. I also added the gatekeeper file as another benchmark case.

"--ext-code", "isKubecfg=false"
)

def findFiles(): (IndexedSeq[(Path, String)], EvalScope) = {
@@ -28,7 +28,7 @@ object MainBenchmark {
       parseCache = parseCache
     )
     val renderer = new Renderer(new StringWriter, indent = 3)
-    interp.interpret0(interp.resolver.read(path).get, path, renderer).getOrElse(???)
+    interp.interpret0(interp.resolver.read(path).get.readString(), path, renderer).getOrElse(???)
     (parseCache.keySet.toIndexedSeq, interp.evaluator)
   }

2 changes: 1 addition & 1 deletion bench/src/main/scala/sjsonnet/RunProfiler.scala
@@ -24,7 +24,7 @@ object RunProfiler extends App {

   def run(): Long = {
     val renderer = new Renderer(new StringWriter, indent = 3)
-    val start = interp.resolver.read(path).get
+    val start = interp.resolver.read(path).get.readString()
     val t0 = System.nanoTime()
     interp.interpret0(start, path, renderer).getOrElse(???)
     System.nanoTime() - t0
4 changes: 2 additions & 2 deletions sjsonnet/src-js/sjsonnet/Platform.scala
@@ -6,10 +6,10 @@ object Platform {
   def gzipString(s: String): String = {
     throw new Exception("GZip not implemented in Scala.js")
   }
-  def xzBytes(s: Array[Byte]): String = {
+  def xzBytes(s: Array[Byte], compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala.js")
   }
-  def xzString(s: String): String = {
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala.js")
   }
   def yamlToJson(s: String): String = {
6 changes: 3 additions & 3 deletions sjsonnet/src-js/sjsonnet/SjsonnetMain.scala
@@ -24,8 +24,8 @@ object SjsonnetMain {
         case null => None
         case s => Some(JsVirtualPath(s))
       }
-      def read(path: Path): Option[String] =
-        Option(importLoader(path.asInstanceOf[JsVirtualPath].path))
+      def read(path: Path): Option[ResolvedFile] =
+        Option(StaticResolvedFile(importLoader(path.asInstanceOf[JsVirtualPath].path)))
     },
     parseCache = new DefaultParseCache,
     new Settings(preserveOrder = preserveOrder),
@@ -57,4 +57,4 @@ case class JsVirtualPath(path: String) extends Path{
   def renderOffsetStr(offset: Int, loadedFileContents: mutable.HashMap[Path, Array[Int]]): String = {
     path + ":" + offset
   }
-}
\ No newline at end of file
+}
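StaticResolvedFile also appears throughout the PR without its definition. A minimal sketch consistent with the call sites, wrapping an already-in-memory string, might look like the following; the hash implementation here is an assumption, shown as CRC32 only to mirror what CachedResolvedFile below does for on-disk files.

```scala
import fastparse.{IndexedParserInput, ParserInput}

// Hypothetical sketch of StaticResolvedFile, inferred from its call sites in this PR.
case class StaticResolvedFile(content: String) extends ResolvedFile {
  // The whole string is already in memory, so an indexed input suffices
  def getParserInput(): ParserInput = IndexedParserInput(content)

  def readString(): String = content

  // Assumed hash function; the real implementation may differ
  lazy val contentHash: String = {
    val crc = new java.util.zip.CRC32()
    crc.update(content.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    crc.getValue.toString
  }
}
```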
86 changes: 86 additions & 0 deletions sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
@@ -0,0 +1,86 @@
package sjsonnet

import java.io.{BufferedInputStream, File, FileInputStream}
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.util.zip.CRC32
import fastparse.ParserInput

/**
 * A class that encapsulates a resolved import. This is used to cache the result of resolving
 * an import. If the import is deemed too large (i.e. it is a large file), we avoid keeping it
 * in memory and instead re-read it from disk.
 */
class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long) extends ResolvedFile {

  private val jFile: File = resolvedImportPath.p.toIO

  assert(jFile.exists(), s"Resolved import path $resolvedImportPath does not exist")
  // Assert that the file is within the memory limit
  assert(jFile.length() <= memoryLimitBytes,
    s"Resolved import path $resolvedImportPath is too large: ${jFile.length()} bytes > $memoryLimitBytes bytes")

  private[this] val resolvedImportContent: StaticResolvedFile = {
    if (jFile.length() > 1024 * 1024) {
      // If the file is too large, read it from disk on demand rather than caching it
      null
    } else {
      StaticResolvedFile(readString(jFile))
    }
  }

  private[this] def readString(jFile: File): String = {
    new String(Files.readAllBytes(jFile.toPath), StandardCharsets.UTF_8)
  }

  /**
   * Returns a parser input for the resolved import. If the import is too large, the input
   * reads the file from disk; otherwise it reads from memory.
   */
  def getParserInput(): ParserInput = {
    if (resolvedImportContent == null) {
      FileParserInput(jFile)
    } else {
      resolvedImportContent.getParserInput()
    }
  }

  override def readString(): String = {
    if (resolvedImportContent == null) {
      // If the file is too large, read it from disk
      readString(jFile)
    } else {
      // Otherwise, read it from memory
      resolvedImportContent.readString()
    }
  }

  private def crcHashFile(file: File): Long = {
    val buffer = new Array[Byte](8192)
    val crc = new CRC32()

    val fis = new FileInputStream(file)
    val bis = new BufferedInputStream(fis)

    try {
      var bytesRead = bis.read(buffer)
      while (bytesRead != -1) {
        crc.update(buffer, 0, bytesRead)
        bytesRead = bis.read(buffer)
      }
    } finally {
      bis.close()
      fis.close()
    }

    crc.getValue()
  }

  override lazy val contentHash: String = {
    if (resolvedImportContent == null) {
      // If the file is too large, hash it by streaming from disk
      crcHashFile(jFile).toString
    } else {
      resolvedImportContent.contentHash
    }
  }
}
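FileParserInput, used above when the file exceeds the in-memory threshold, is not part of this diff either. In the spirit of the "buffered reader" and "8k buffer" commits, here is a simplified sketch of what a disk-backed fastparse ParserInput can look like; it is an illustration under stated assumptions, not the PR's actual class, which must at least handle multi-byte UTF-8 and report prettyIndex as line:col.

```scala
import java.io.{File, RandomAccessFile}
import java.nio.charset.StandardCharsets
import fastparse.ParserInput

// Hypothetical sketch: a file-backed ParserInput that keeps only an 8 KB
// window of the file in memory, assuming single-byte characters for brevity.
class WindowedFileParserInput(file: File) extends ParserInput {
  private[this] val raf = new RandomAccessFile(file, "r")
  private[this] val fileLength = raf.length().toInt
  private[this] val window = new Array[Byte](8192)
  private[this] var windowStart = 0
  private[this] var windowEnd = 0 // exclusive; window is empty until the first read

  // Refill the window so that `index` falls inside it
  private[this] def ensure(index: Int): Unit = {
    if (index < windowStart || index >= windowEnd) {
      raf.seek(index.toLong)
      val n = raf.read(window)
      windowStart = index
      windowEnd = index + math.max(n, 0)
    }
  }

  def apply(index: Int): Char = { ensure(index); window(index - windowStart).toChar }
  def dropBuffer(index: Int): Unit = () // nothing is cached beyond the window
  def slice(from: Int, until: Int): String = {
    val buf = new Array[Byte](until - from)
    raf.seek(from.toLong)
    raf.readFully(buf)
    new String(buf, StandardCharsets.US_ASCII)
  }
  def length: Int = fileLength
  def innerLength: Int = fileLength
  def isReachable(index: Int): Boolean = index < fileLength
  def checkTraceable(): Unit = ()
  // NOTE: fastparse expects "line:col" here (see the Error.scala diff below);
  // returning the raw offset keeps this sketch short.
  def prettyIndex(index: Int): String = index.toString
}
```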
24 changes: 20 additions & 4 deletions sjsonnet/src-jvm-native/sjsonnet/SjsonnetMain.scala
@@ -22,8 +22,9 @@ object SjsonnetMain {
       .find(os.exists)
       .flatMap(p => try Some(OsPath(p)) catch{case NonFatal(_) => None})

-    def read(path: Path): Option[String] =
-      try Some(os.read(path.asInstanceOf[OsPath].p)) catch { case NonFatal(_) => None }
+    def read(path: Path): Option[ResolvedFile] = {
+      readPath(path)
+    }
   }

   def main(args: Array[String]): Unit = {
@@ -205,8 +206,9 @@ object SjsonnetMain {
       case Some(i) => new Importer {
         def resolve(docBase: Path, importName: String): Option[Path] =
           i(docBase, importName).map(OsPath)
-        def read(path: Path): Option[String] =
-          try Some(os.read(path.asInstanceOf[OsPath].p)) catch { case NonFatal(_) => None }
+        def read(path: Path): Option[ResolvedFile] = {
+          readPath(path)
+        }
       }
       case None => resolveImport(config.jpaths.map(os.Path(_, wd)).map(OsPath(_)), allowedInputs)
     },
@@ -291,4 +293,18 @@

     }
   }
+
+  /**
+   * Read a path into a [[ResolvedFile]] if it exists and is a file. A resolved file acts as a
+   * layer of caching on top of the underlying file system. Small files are read into memory,
+   * while large files are read from disk.
+   */
+  private[this] def readPath(path: Path): Option[ResolvedFile] = {
+    val osPath = path.asInstanceOf[OsPath].p
+    if (os.exists(osPath) && os.isFile(osPath)) {
+      Some(new CachedResolvedFile(path.asInstanceOf[OsPath], memoryLimitBytes = 2048L * 1024L * 1024L))
Comment (Contributor):

What's the significance of this number? Should it just be Int.MaxValue.toLong or Int.MaxValue.toLong + 1?

Reply (ahirreddy, Collaborator, Author):

Changed to Int.MaxValue.toLong and added scaladoc:

 * @param memoryLimitBytes The maximum size of a file that we will resolve. This is not the size of
 * the buffer, but a mechanism to fail when being asked to resolve (and downstream parse) a file
 * that is beyond this limit.

Basically, we have some pathological imports (1GB+) which I eventually want to ban (all the ones I found could be trivially modified upstream to not produce such huge files). In a followup we can make this param configurable.

cc @carl-db
+    } else {
+      None
+    }
+  }
 }
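To make the flow concrete, here is a hypothetical end-to-end use of the new import path (the file name is made up): files at or under CachedResolvedFile's internal 1 MiB threshold are cached in memory, larger ones are re-read from disk on each access, and anything over memoryLimitBytes trips the constructor's assertion up front.

```scala
// Hypothetical usage; "config.jsonnet" is a stand-in path.
val resolved = new CachedResolvedFile(
  OsPath(os.pwd / "config.jsonnet"),
  memoryLimitBytes = Int.MaxValue.toLong // the value settled on in the review thread above
)
val text  = resolved.readString()     // cached string if small, re-read from disk if large
val input = resolved.getParserInput() // in-memory input, or a file-backed one for large files
val hash  = resolved.contentHash      // hash of the cached string, or a streaming CRC32
```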
16 changes: 12 additions & 4 deletions sjsonnet/src-jvm/sjsonnet/Platform.scala
@@ -23,18 +23,26 @@
   def gzipString(s: String): String = {
     gzipBytes(s.getBytes())
   }
-  def xzBytes(b: Array[Byte]): String = {
+
+  /**
+   * Valid compression levels are 0 (no compression) to 9 (maximum compression).
+   */
+  def xzBytes(b: Array[Byte], compressionLevel: Option[Int]): String = {
     val outputStream: ByteArrayOutputStream = new ByteArrayOutputStream(b.length)
-    val xz: XZOutputStream = new XZOutputStream(outputStream, new LZMA2Options())
+    // Set compression to the specified level
+    val level = compressionLevel.getOrElse(LZMA2Options.PRESET_DEFAULT)
+    val xz: XZOutputStream = new XZOutputStream(outputStream, new LZMA2Options(level))
     xz.write(b)
     xz.close()
     val xzedBase64: String = Base64.getEncoder.encodeToString(outputStream.toByteArray)
     outputStream.close()
     xzedBase64
   }
-  def xzString(s: String): String = {
-    xzBytes(s.getBytes())
+
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
+    xzBytes(s.getBytes(), compressionLevel)
   }
+
   def yamlToJson(yamlString: String): String = {
     val yaml: java.util.LinkedHashMap[String, Object] = new Yaml(new Constructor(classOf[java.util.LinkedHashMap[String, Object]])).load(yamlString)
     new JSONObject(yaml).toString()
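For illustration, the new parameter on the JVM backend can be exercised as follows (a hypothetical snippet; levels follow LZMA2Options, 0 = fastest through 9 = maximum, with None falling back to PRESET_DEFAULT):

```scala
// Hypothetical usage of the updated JVM Platform API.
val payload = """{"replicas": 3}"""
val fastest = Platform.xzString(payload, compressionLevel = Some(0)) // fastest, largest output
val densest = Platform.xzString(payload, compressionLevel = Some(9)) // slowest, smallest output
val default = Platform.xzString(payload, compressionLevel = None)    // LZMA2Options.PRESET_DEFAULT
```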
4 changes: 2 additions & 2 deletions sjsonnet/src-native/sjsonnet/Platform.scala
@@ -6,10 +6,10 @@ object Platform {
   def gzipString(s: String): String = {
     throw new Exception("GZip not implemented in Scala Native")
   }
-  def xzBytes(s: Array[Byte]): String = {
+  def xzBytes(s: Array[Byte], compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala Native")
   }
-  def xzString(s: String): String = {
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala Native")
   }
   def yamlToJson(s: String): String = {
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Error.scala
@@ -103,7 +103,7 @@ trait EvalErrorScope {
   def prettyIndex(pos: Position): Option[(Int, Int)] = {
     importer.read(pos.currentFile).map { s =>
       val Array(line, col) =
-        new IndexedParserInput(s).prettyIndex(pos.offset).split(':')
+        s.getParserInput().prettyIndex(pos.offset).split(':')
       (line.toInt, col.toInt)
     }
   }
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Evaluator.scala
@@ -300,7 +300,7 @@ class Evaluator(resolver: CachedResolver,
   }

   def visitImportStr(e: ImportStr)(implicit scope: ValScope): Val.Str =
-    Val.Str(e.pos, importer.resolveAndReadOrFail(e.value, e.pos)._2)
+    Val.Str(e.pos, importer.resolveAndReadOrFail(e.value, e.pos)._2.readString())

   def visitImport(e: Import)(implicit scope: ValScope): Val = {
     val (p, str) = importer.resolveAndReadOrFail(e.value, e.pos)