
Ensure that LoadSequentialFile() actually read the whole file #5831

Merged
13 commits merged on Jul 4, 2020
include/xgboost/json_io.h (4 changes: 3 additions & 1 deletion)

@@ -9,12 +9,12 @@
#include <vector>
#include <memory>
#include <string>
#include <cinttypes>
#include <utility>
#include <map>
#include <limits>
#include <sstream>
#include <locale>
#include <cinttypes>

namespace xgboost {
/*
@@ -86,6 +86,8 @@ class JsonReader {
msg += "\", got: \"";
if (got == -1) {
msg += "EOF\"";
} else if (got == 0) {
msg += "\\0\"";
} else {
msg += std::to_string(got) + " \"";
}
src/common/io.cc (32 changes: 14 additions & 18 deletions)

@@ -7,9 +7,10 @@
#include <unistd.h>
#endif // defined(__unix__)
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <string>
#include <utility>
#include <cstdio>

#include "xgboost/logging.h"
#include "io.h"
@@ -120,27 +121,22 @@ std::string LoadSequentialFile(std::string fname) {
#if defined(__linux__)
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
#endif // defined(__linux__)
ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes);
hcho3 (Collaborator, Author) commented on Jun 28, 2020:

@trivialfis Previously, we called read() only once. For a file containing 2896075944 bytes (2.69 GB), read() only reads 2147479552 bytes (1.99 GB), and the variables would be set as follows:

bytes_read = 2147479552, f_size_bytes = 2896075944

This PR ensures that bytes_read ends up identical to f_size_bytes by calling read() multiple times in a while loop.
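
For readers skimming the diff, a self-contained sketch of that retry pattern follows (the helper name ReadFully is an assumption for illustration, not code from this PR; it additionally treats an early EOF as a failure):

#include <unistd.h>  // read()

// Sketch only: read exactly `size` bytes from `fd` into `buf`. A single
// read() may transfer fewer bytes than requested (on Linux at most
// 2147479552 bytes per call), so keep calling it until the buffer is full.
bool ReadFully(int fd, char *buf, ssize_t size) {
  ssize_t bytes_read = 0;
  while (bytes_read < size) {
    ssize_t result = read(fd, buf + bytes_read, size - bytes_read);
    if (result <= 0) {
      return false;  // error (< 0) or unexpected EOF (== 0); caller reports it
    }
    bytes_read += result;
  }
  return true;
}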

if (bytes_read < 0) {
close(fd);
ReadErr();
ssize_t bytes_read = 0;
while (bytes_read < f_size_bytes) {
hcho3 (Collaborator, Author) commented:

@trivialfis Take a look at this while loop. This is a commonly used construct in UNIX programs.

ssize_t result = read(fd, &buffer[bytes_read], f_size_bytes - bytes_read);
if (result < 0) {
close(fd);
ReadErr();
}
bytes_read += result;
}
close(fd);
buffer.back() = '\0';
#else // defined(__unix__)
FILE *f = fopen(fname.c_str(), "r");
if (f == NULL) {
std::string msg;
OpenErr();
}
fseek(f, 0, SEEK_END);
auto fsize = ftell(f);
fseek(f, 0, SEEK_SET);
hcho3 (Collaborator, Author) commented on lines -135 to -137 (Jun 27, 2020):

Note: According to the MSDN documentation, ftell() on Windows does not yield the correct byte offset when the file was opened in text mode.

Fix: we do away with fseek() and ftell() entirely and use std::istreambuf_iterator to read the whole file into std::string. The fix only applies on Windows.

On Linux, we use stat() to measure the file size and then call read() to read all the bytes in the file.
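
The stat() call itself is outside the hunks shown here; as a rough sketch of the size measurement described above (a hedged illustration, not the exact code in this PR):

#include <sys/stat.h>
#include <sys/types.h>
#include <string>

// Sketch only: ask the kernel for the file size so the destination buffer
// can be sized before the read() loop shown above runs.
off_t FileSizeBytes(const std::string &fname) {
  struct stat fs;
  if (stat(fname.c_str(), &fs) != 0) {
    return -1;  // caller would raise the "open failed" error here
  }
  return fs.st_size;
}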


buffer.resize(fsize + 1);
fread(&buffer[0], 1, fsize, f);
fclose(f);
std::ifstream ifs(fname);
Member commented:

Please note that this is terribly slow.

hcho3 (Collaborator, Author) commented on Jun 27, 2020:

Slower than std::fread()? Note that this line is only supposed to run on platforms where POSIX is not available.

hcho3 (Collaborator, Author) commented on Jun 27, 2020:

std::ifstream shows competitive performance with respect to std::fread().

Member commented:

Interesting, as I just tried on Linux it's about 10x slower. But I guess when the file is that huge, JSON parsing would be the bottleneck here.
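
For reference only (this PR keeps the istreambuf_iterator approach): a common way to avoid the per-character overhead discussed above is to size the string once and read it in a single call; opening the stream in binary mode also sidesteps the Windows text-mode offset caveat from the earlier comment.

#include <fstream>
#include <string>

// Reference sketch, not the code added by this PR: read a whole file into a
// pre-sized std::string with one ifstream::read() call.
std::string ReadWholeFileBuffered(const std::string &fname) {
  std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
  if (!ifs) {
    return {};
  }
  std::string buffer(static_cast<size_t>(ifs.tellg()), '\0');
  ifs.seekg(0, std::ios::beg);
  ifs.read(&buffer[0], static_cast<std::streamsize>(buffer.size()));
  return buffer;
}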

buffer = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
#endif // defined(__unix__)
buffer.back() = '\0';
return buffer;
}

src/common/json.cc (2 changes: 2 additions & 0 deletions)

@@ -427,6 +427,8 @@ void JsonReader::Error(std::string msg) const {
for (auto c : raw_portion) {
if (c == '\n') {
portion += "\\n";
} else if (c == '\0') {
portion += "\\0";
} else {
portion += c;
}
tests/ci_build/ci_build.sh (1 change: 1 addition & 0 deletions)

@@ -207,6 +207,7 @@ ${DOCKER_BINARY} run --rm --pid=host \
-v "${WORKSPACE}":/workspace \
-w /workspace \
${USER_IDS} \
--shm-size=3g \
"${CI_DOCKER_EXTRA_PARAMS[@]}" \
"${DOCKER_IMG_NAME}" \
"${COMMAND[@]}"
tests/cpp/common/test_io.cc (50 changes: 50 additions & 0 deletions)

@@ -2,6 +2,11 @@
* Copyright (c) by XGBoost Contributors 2019
*/
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <atomic>
#include <type_traits>
#include <fstream>
#include <cstdint>
#include "../../../src/common/io.h"

namespace xgboost {
@@ -39,5 +44,50 @@ TEST(IO, FixedSizeStream) {
ASSERT_EQ(huge_buffer, out_buffer);
}
}


#if SIZE_MAX == 0xFFFFFFFFFFFFFFFF // Only run this test on 64-bit system
TEST(IO, LoadSequentialFile) {
const size_t nbyte = static_cast<size_t>(2896075944LL); // About 2.69 GB
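// Deliberately larger than 2147479552 bytes, the most a single read() call
// transfers on Linux, so that the retry loop in LoadSequentialFile() is
// exercised.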
static_assert(sizeof(size_t) == 8, "Assumption failed: size_t was assumed to be 8-bytes long");
static_assert(std::is_same<size_t, std::string::size_type>::value,
"Assumption failed: size_type of std::string was assumed to be 8-bytes long");

dmlc::TemporaryDirectory tempdir;
std::string path = "/dev/shm/xgboost_test_io_big_file.txt";
{
std::ofstream f(path);
if (!f) { // /dev/shm not present
LOG(INFO) << "No /dev/shm; using dmlc::TemporaryDirectory instead";
path = tempdir.path + "/xgboost_test_io_big_file.txt";
f = std::ofstream(path);
}
CHECK(f);
std::string str(nbyte, 'a');
CHECK_EQ(str.size(), nbyte);
f << str;
}
{
std::string str = LoadSequentialFile(path);
CHECK_GE(str.size(), nbyte);
dmlc::OMPException omp_exc;
std::atomic<bool> success{true};
#pragma omp parallel for schedule(static)
for (int64_t i = 0; i < static_cast<int64_t>(nbyte); ++i) {
omp_exc.Run([&] {
if (str[i] != 'a' && success.load(std::memory_order_acquire)) {
success.store(false, std::memory_order_release);
LOG(FATAL) << "Big file got corrupted. Expected: str[" << i << "] = 'a', "
<< "Actual: str[" << i << "] = '"
<< (str[i] ? std::string(1, str[i]) : std::string("\\0")) << "'";
}
});
}
omp_exc.Rethrow();
CHECK(success.load(std::memory_order_acquire));
}
}
#endif // SIZE_MAX == 0xFFFFFFFFFFFFFFFF

} // namespace common
} // namespace xgboost