Avoid Storing Large Strings in Memory #194

Merged Jan 5, 2024 · 48 commits (diff below shows changes from 45 commits)
Commits (48, all by ahirreddy):

ff2b110  no buffer strings (Dec 20, 2023)
0a1253a  clsoer (Dec 20, 2023)
f653e53  fix perf (Dec 20, 2023)
79cbbe7  Revert "fix perf" (Dec 21, 2023)
c6e337c  Revert "clsoer" (Dec 21, 2023)
b823e37  fix up naming (Dec 21, 2023)
2757328  this is slow (Dec 21, 2023)
8caa0a5  buffered reader (Dec 21, 2023)
a9bab89  fix jmh (Dec 21, 2023)
f230288  wip (Dec 23, 2023)
597e248  parser input (Dec 23, 2023)
61a9150  8k buffer (Dec 23, 2023)
300688d  xz (Dec 24, 2023)
573c1f7  memory limit (Dec 24, 2023)
95cafc2  memory limit 20 (Dec 24, 2023)
50a2259  wip (Dec 24, 2023)
62e1474  limit 40 (Dec 24, 2023)
9e3bbe4  2gb limit (Dec 25, 2023)
ef1a463  long (Dec 25, 2023)
4f1aa87  fix limit (Dec 25, 2023)
972fa25  size (Dec 25, 2023)
a4dc4d0  split eval works (Dec 26, 2023)
50f127d  eval; (Dec 26, 2023)
1b0394b  Revert "eval;" (Dec 29, 2023)
a99bf47  Revert "split eval works" (Dec 29, 2023)
2530189  Revert "size" (Dec 29, 2023)
73a2283  fast util key map (Dec 29, 2023)
23113c7  object 2 object (Dec 29, 2023)
f9c31f6  Revert "object 2 object" (Dec 29, 2023)
b2a159f  Revert "fast util key map" (Dec 29, 2023)
27ade6f  Revert "Revert "fast util key map"" (Dec 29, 2023)
d8ffc64  Revert "Revert "Revert "fast util key map""" (Dec 29, 2023)
8090664  level (Dec 29, 2023)
aff20d0  xz test (Dec 29, 2023)
1efe031  xz with level (Dec 29, 2023)
05c02b6  revert (Dec 29, 2023)
b65ed5f  cleanup (Dec 29, 2023)
9bbd08f  crc (Dec 29, 2023)
9ef506f  cleanup (Dec 29, 2023)
f71cee6  wip (Dec 29, 2023)
25ee568  add test cases (Dec 29, 2023)
c3ecaff  moved (Dec 30, 2023)
bbad767  closer (Dec 30, 2023)
aba79be  split out test (Dec 30, 2023)
2a170c0  fix compile (Dec 30, 2023)
b8bdff0  avoid extra seek (Jan 2, 2024)
c6956c2  avoid extra copy (Jan 2, 2024)
67953ed  added doc (Jan 2, 2024)
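Before the per-file diffs, it helps to see the abstraction the PR is built around: the importer now returns a ResolvedFile instead of a raw String. The trait itself is not part of these diffs, so the following is a minimal sketch inferred from its call sites (getParserInput(), readString(), contentHash); the member names are taken from the diffs but the exact signatures are assumptions, not the PR's verbatim definition.

```scala
import fastparse.ParserInput

// Hypothetical sketch of the ResolvedFile trait, inferred from call sites in this PR.
trait ResolvedFile {
  // A fastparse input over the file contents; for large files this can stream
  // from disk instead of requiring the whole string in memory.
  def getParserInput(): ParserInput

  // Materialize the full contents as a String (used by importstr and the benchmarks).
  def readString(): String

  // A stable hash of the contents, usable as a cache key or for change detection.
  def contentHash: String
}
```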
11 changes: 7 additions & 4 deletions bench/src/main/scala/sjsonnet/MainBenchmark.scala
@@ -8,9 +8,12 @@ import org.openjdk.jmh.infra._

 object MainBenchmark {
   val mainArgs = Array[String](
-    "../../universe2/rulemanager/deploy/rulemanager.jsonnet",
-    "-J", "../../universe2",
-    "-J", "../../universe2/mt-shards/dev/az-westus-c2",
+    "../../universe/rulemanager/deploy/rulemanager.jsonnet",
+    // "../../universe/kubernetes/admission-controller/gatekeeper/deploy/gatekeeper.jsonnet",
+    "-J", "../../universe",
+    "-J", "../../universe/mt-shards/dev/az-westus-c2",
+    "-J", "../../universe/bazel-bin",
Comment on lines +14 to +15, by ahirreddy (Collaborator, Author):

Added these extra imports as they're now required to compile the given target. I also added the gatekeeper file as another benchmark case.

"--ext-code", "isKubecfg=false"
)

def findFiles(): (IndexedSeq[(Path, String)], EvalScope) = {
@@ -28,7 +28,7 @@ object MainBenchmark {
       parseCache = parseCache
     )
     val renderer = new Renderer(new StringWriter, indent = 3)
-    interp.interpret0(interp.resolver.read(path).get, path, renderer).getOrElse(???)
+    interp.interpret0(interp.resolver.read(path).get.readString(), path, renderer).getOrElse(???)
     (parseCache.keySet.toIndexedSeq, interp.evaluator)
   }

2 changes: 1 addition & 1 deletion bench/src/main/scala/sjsonnet/RunProfiler.scala
@@ -24,7 +24,7 @@ object RunProfiler extends App {

   def run(): Long = {
     val renderer = new Renderer(new StringWriter, indent = 3)
-    val start = interp.resolver.read(path).get
+    val start = interp.resolver.read(path).get.readString()
     val t0 = System.nanoTime()
     interp.interpret0(start, path, renderer).getOrElse(???)
     System.nanoTime() - t0
4 changes: 2 additions & 2 deletions sjsonnet/src-js/sjsonnet/Platform.scala
@@ -6,10 +6,10 @@ object Platform {
   def gzipString(s: String): String = {
     throw new Exception("GZip not implemented in Scala.js")
   }
-  def xzBytes(s: Array[Byte]): String = {
+  def xzBytes(s: Array[Byte], compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala.js")
   }
-  def xzString(s: String): String = {
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala.js")
   }
   def yamlToJson(s: String): String = {
6 changes: 3 additions & 3 deletions sjsonnet/src-js/sjsonnet/SjsonnetMain.scala
@@ -24,8 +24,8 @@ object SjsonnetMain {
         case null => None
         case s => Some(JsVirtualPath(s))
       }
-      def read(path: Path): Option[String] =
-        Option(importLoader(path.asInstanceOf[JsVirtualPath].path))
+      def read(path: Path): Option[ResolvedFile] =
+        Option(StaticResolvedFile(importLoader(path.asInstanceOf[JsVirtualPath].path)))
     },
     parseCache = new DefaultParseCache,
     new Settings(preserveOrder = preserveOrder),
@@ -57,4 +57,4 @@ case class JsVirtualPath(path: String) extends Path{
   def renderOffsetStr(offset: Int, loadedFileContents: mutable.HashMap[Path, Array[Int]]): String = {
     path + ":" + offset
   }
-}
\ No newline at end of file
+}
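StaticResolvedFile also appears throughout the PR without its definition. A minimal sketch consistent with the call sites, wrapping an already-in-memory string, might look like the following; the hash implementation here is an assumption, shown as CRC32 only to mirror what CachedResolvedFile below does for on-disk files.

```scala
import fastparse.{IndexedParserInput, ParserInput}

// Hypothetical sketch of StaticResolvedFile, inferred from its call sites in this PR.
case class StaticResolvedFile(content: String) extends ResolvedFile {
  // The whole string is already in memory, so an indexed input suffices
  def getParserInput(): ParserInput = IndexedParserInput(content)

  def readString(): String = content

  // Assumed hash function; the real implementation may differ
  lazy val contentHash: String = {
    val crc = new java.util.zip.CRC32()
    crc.update(content.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    crc.getValue.toString
  }
}
```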
86 changes: 86 additions & 0 deletions sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
@@ -0,0 +1,86 @@
package sjsonnet

import java.io.{BufferedInputStream, File, FileInputStream}
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.util.zip.CRC32
import fastparse.ParserInput

/**
 * A class that encapsulates a resolved import. This is used to cache the result of resolving
 * an import. If the import is deemed too large (i.e. it is a large file), we avoid keeping it
 * in memory and instead re-read it from disk.
 */
class CachedResolvedFile(val resolvedImportPath: OsPath, memoryLimitBytes: Long) extends ResolvedFile {

  private val jFile: File = resolvedImportPath.p.toIO

  assert(jFile.exists(), s"Resolved import path $resolvedImportPath does not exist")
  // Assert that the file is within the memory limit
  assert(jFile.length() <= memoryLimitBytes,
    s"Resolved import path $resolvedImportPath is too large: ${jFile.length()} bytes > $memoryLimitBytes bytes")

  private[this] val resolvedImportContent: StaticResolvedFile = {
    if (jFile.length() > 1024 * 1024) {
      // If the file is too large, read it from disk on demand rather than caching it
      null
    } else {
      StaticResolvedFile(readString(jFile))
    }
  }

  private[this] def readString(jFile: File): String = {
    new String(Files.readAllBytes(jFile.toPath), StandardCharsets.UTF_8)
  }

  /**
   * Returns a parser input for the resolved import. If the import is too large, the input
   * reads the file from disk; otherwise it reads from memory.
   */
  def getParserInput(): ParserInput = {
    if (resolvedImportContent == null) {
      FileParserInput(jFile)
    } else {
      resolvedImportContent.getParserInput()
    }
  }

  override def readString(): String = {
    if (resolvedImportContent == null) {
      // If the file is too large, read it from disk
      readString(jFile)
    } else {
      // Otherwise, read it from memory
      resolvedImportContent.readString()
    }
  }

  private def crcHashFile(file: File): Long = {
    val buffer = new Array[Byte](8192)
    val crc = new CRC32()

    val fis = new FileInputStream(file)
    val bis = new BufferedInputStream(fis)

    try {
      var bytesRead = bis.read(buffer)
      while (bytesRead != -1) {
        crc.update(buffer, 0, bytesRead)
        bytesRead = bis.read(buffer)
      }
    } finally {
      bis.close()
      fis.close()
    }

    crc.getValue()
  }

  override lazy val contentHash: String = {
    if (resolvedImportContent == null) {
      // If the file is too large, hash it by streaming from disk
      crcHashFile(jFile).toString
    } else {
      resolvedImportContent.contentHash
    }
  }
}
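FileParserInput, used above when the file exceeds the in-memory threshold, is not part of this diff either. In the spirit of the "buffered reader" and "8k buffer" commits, here is a simplified sketch of what a disk-backed fastparse ParserInput can look like; it is an illustration under stated assumptions, not the PR's actual class, which must at least handle multi-byte UTF-8 and report prettyIndex as line:col.

```scala
import java.io.{File, RandomAccessFile}
import java.nio.charset.StandardCharsets
import fastparse.ParserInput

// Hypothetical sketch: a file-backed ParserInput that keeps only an 8 KB
// window of the file in memory, assuming single-byte characters for brevity.
class WindowedFileParserInput(file: File) extends ParserInput {
  private[this] val raf = new RandomAccessFile(file, "r")
  private[this] val fileLength = raf.length().toInt
  private[this] val window = new Array[Byte](8192)
  private[this] var windowStart = 0
  private[this] var windowEnd = 0 // exclusive; window is empty until the first read

  // Refill the window so that `index` falls inside it
  private[this] def ensure(index: Int): Unit = {
    if (index < windowStart || index >= windowEnd) {
      raf.seek(index.toLong)
      val n = raf.read(window)
      windowStart = index
      windowEnd = index + math.max(n, 0)
    }
  }

  def apply(index: Int): Char = { ensure(index); window(index - windowStart).toChar }
  def dropBuffer(index: Int): Unit = () // nothing is cached beyond the window
  def slice(from: Int, until: Int): String = {
    val buf = new Array[Byte](until - from)
    raf.seek(from.toLong)
    raf.readFully(buf)
    new String(buf, StandardCharsets.US_ASCII)
  }
  def length: Int = fileLength
  def innerLength: Int = fileLength
  def isReachable(index: Int): Boolean = index < fileLength
  def checkTraceable(): Unit = ()
  // NOTE: fastparse expects "line:col" here (see the Error.scala diff below);
  // returning the raw offset keeps this sketch short.
  def prettyIndex(index: Int): String = index.toString
}
```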
24 changes: 20 additions & 4 deletions sjsonnet/src-jvm-native/sjsonnet/SjsonnetMain.scala
@@ -22,8 +22,9 @@ object SjsonnetMain {
       .find(os.exists)
       .flatMap(p => try Some(OsPath(p)) catch{case NonFatal(_) => None})

-    def read(path: Path): Option[String] =
-      try Some(os.read(path.asInstanceOf[OsPath].p)) catch { case NonFatal(_) => None }
+    def read(path: Path): Option[ResolvedFile] = {
+      readPath(path)
+    }
   }

   def main(args: Array[String]): Unit = {
@@ -205,8 +206,9 @@ object SjsonnetMain {
       case Some(i) => new Importer {
         def resolve(docBase: Path, importName: String): Option[Path] =
           i(docBase, importName).map(OsPath)
-        def read(path: Path): Option[String] =
-          try Some(os.read(path.asInstanceOf[OsPath].p)) catch { case NonFatal(_) => None }
+        def read(path: Path): Option[ResolvedFile] = {
+          readPath(path)
+        }
       }
       case None => resolveImport(config.jpaths.map(os.Path(_, wd)).map(OsPath(_)), allowedInputs)
     },
@@ -291,4 +293,18 @@

     }
   }
+
+  /**
+   * Read a path into a [[ResolvedFile]] if it exists and is a file. A resolved file acts as a
+   * layer of caching on top of the underlying file system. Small files are read into memory,
+   * while large files are read from disk.
+   */
+  private[this] def readPath(path: Path): Option[ResolvedFile] = {
+    val osPath = path.asInstanceOf[OsPath].p
+    if (os.exists(osPath) && os.isFile(osPath)) {
+      Some(new CachedResolvedFile(path.asInstanceOf[OsPath], memoryLimitBytes = 2048L * 1024L * 1024L))
Comment (Contributor):

What's the significance of this number? Should it just be Int.MaxValue.toLong or Int.MaxValue.toLong + 1?

Reply (ahirreddy, Collaborator, Author):

Changed to Int.MaxValue.toLong and added scaladoc:

 * @param memoryLimitBytes The maximum size of a file that we will resolve. This is not the size of
 * the buffer, but a mechanism to fail when being asked to resolve (and downstream parse) a file
 * that is beyond this limit.

Basically, we have some pathological imports (1GB+) which I eventually want to ban (all the ones I found could be trivially modified upstream to not produce such huge files). In a followup we can make this param configurable.

cc @carl-db
+    } else {
+      None
+    }
+  }
 }
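To make the flow concrete, here is a hypothetical end-to-end use of the new import path (the file name is made up): files at or under CachedResolvedFile's internal 1 MiB threshold are cached in memory, larger ones are re-read from disk on each access, and anything over memoryLimitBytes trips the constructor's assertion up front.

```scala
// Hypothetical usage; "config.jsonnet" is a stand-in path.
val resolved = new CachedResolvedFile(
  OsPath(os.pwd / "config.jsonnet"),
  memoryLimitBytes = Int.MaxValue.toLong // the value settled on in the review thread above
)
val text  = resolved.readString()     // cached string if small, re-read from disk if large
val input = resolved.getParserInput() // in-memory input, or a file-backed one for large files
val hash  = resolved.contentHash      // hash of the cached string, or a streaming CRC32
```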
16 changes: 12 additions & 4 deletions sjsonnet/src-jvm/sjsonnet/Platform.scala
@@ -23,18 +23,26 @@
   def gzipString(s: String): String = {
     gzipBytes(s.getBytes())
   }
-  def xzBytes(b: Array[Byte]): String = {
+
+  /**
+   * Valid compression levels are 0 (no compression) to 9 (maximum compression).
+   */
+  def xzBytes(b: Array[Byte], compressionLevel: Option[Int]): String = {
     val outputStream: ByteArrayOutputStream = new ByteArrayOutputStream(b.length)
-    val xz: XZOutputStream = new XZOutputStream(outputStream, new LZMA2Options())
+    // Set compression to the specified level
+    val level = compressionLevel.getOrElse(LZMA2Options.PRESET_DEFAULT)
+    val xz: XZOutputStream = new XZOutputStream(outputStream, new LZMA2Options(level))
     xz.write(b)
     xz.close()
     val xzedBase64: String = Base64.getEncoder.encodeToString(outputStream.toByteArray)
     outputStream.close()
     xzedBase64
   }
-  def xzString(s: String): String = {
-    xzBytes(s.getBytes())
+
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
+    xzBytes(s.getBytes(), compressionLevel)
   }
+
   def yamlToJson(yamlString: String): String = {
     val yaml: java.util.LinkedHashMap[String, Object] = new Yaml(new Constructor(classOf[java.util.LinkedHashMap[String, Object]])).load(yamlString)
     new JSONObject(yaml).toString()
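For illustration, the new parameter on the JVM backend can be exercised as follows (a hypothetical snippet; levels follow LZMA2Options, 0 = fastest through 9 = maximum, with None falling back to PRESET_DEFAULT):

```scala
// Hypothetical usage of the updated JVM Platform API.
val payload = """{"replicas": 3}"""
val fastest = Platform.xzString(payload, compressionLevel = Some(0)) // fastest, largest output
val densest = Platform.xzString(payload, compressionLevel = Some(9)) // slowest, smallest output
val default = Platform.xzString(payload, compressionLevel = None)    // LZMA2Options.PRESET_DEFAULT
```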
4 changes: 2 additions & 2 deletions sjsonnet/src-native/sjsonnet/Platform.scala
@@ -6,10 +6,10 @@ object Platform {
   def gzipString(s: String): String = {
     throw new Exception("GZip not implemented in Scala Native")
   }
-  def xzBytes(s: Array[Byte]): String = {
+  def xzBytes(s: Array[Byte], compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala Native")
   }
-  def xzString(s: String): String = {
+  def xzString(s: String, compressionLevel: Option[Int]): String = {
     throw new Exception("XZ not implemented in Scala Native")
   }
   def yamlToJson(s: String): String = {
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Error.scala
@@ -103,7 +103,7 @@ trait EvalErrorScope {
   def prettyIndex(pos: Position): Option[(Int, Int)] = {
     importer.read(pos.currentFile).map { s =>
       val Array(line, col) =
-        new IndexedParserInput(s).prettyIndex(pos.offset).split(':')
+        s.getParserInput().prettyIndex(pos.offset).split(':')
       (line.toInt, col.toInt)
     }
   }
2 changes: 1 addition & 1 deletion sjsonnet/src/sjsonnet/Evaluator.scala
@@ -300,7 +300,7 @@ class Evaluator(resolver: CachedResolver,
   }

   def visitImportStr(e: ImportStr)(implicit scope: ValScope): Val.Str =
-    Val.Str(e.pos, importer.resolveAndReadOrFail(e.value, e.pos)._2)
+    Val.Str(e.pos, importer.resolveAndReadOrFail(e.value, e.pos)._2.readString())

   def visitImport(e: Import)(implicit scope: ValScope): Val = {
     val (p, str) = importer.resolveAndReadOrFail(e.value, e.pos)