From 681079957e1630e8af5b05355aaadef8759bf4e5 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 06:27:19 +0000 Subject: [PATCH 01/13] Ensure that the whole file is read by LoadSequentialFile() --- src/common/io.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index a8df1640f7a4..3104a81c8f39 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -120,10 +120,14 @@ std::string LoadSequentialFile(std::string fname) { #if defined(__linux__) posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); #endif // defined(__linux__) - ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes); - if (bytes_read < 0) { - close(fd); - ReadErr(); + ssize_t bytes_read = 0; + while (bytes_read < f_size_bytes) { + ssize_t result = read(fd, &buffer[bytes_read], f_size_bytes); + if (result < 0) { + close(fd); + ReadErr(); + } + bytes_read += result; } close(fd); #else // defined(__unix__) From 5a5f99ce0a2700e845176278ad3f6cf29a6e83f1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 06:26:53 +0000 Subject: [PATCH 02/13] Improve error handling in JSON when NUL letter is encountered --- include/xgboost/json_io.h | 7 +++++-- src/common/json.cc | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index 67a829ee777b..96653256f944 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -9,12 +9,13 @@ #include #include #include -#include #include #include #include #include #include +#include +#include namespace xgboost { /* @@ -84,8 +85,10 @@ class JsonReader { std::string msg = "Expecting: \""; msg += c; msg += "\", got: \""; - if (got == -1) { + if (got == EOF) { msg += "EOF\""; + } else if (got == 0) { + msg += "\\0\""; } else { msg += std::to_string(got) + " \""; } diff --git a/src/common/json.cc b/src/common/json.cc index 6ba82aa91105..18d8694d14ad 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -427,6 +427,8 @@ void JsonReader::Error(std::string msg) const { for (auto c : raw_portion) { if (c == '\n') { portion += "\\n"; + } else if (c == '\0') { + portion += "\\0"; } else { portion += c; } From 469705bd4f214c626e57854a15eb04761822741a Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 06:25:19 +0000 Subject: [PATCH 03/13] Add a unit test for LoadSequentialFile() --- tests/cpp/common/test_io.cc | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 957c1748be5a..80608450a039 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -2,6 +2,11 @@ * Copyright (c) by XGBoost Contributors 2019 */ #include +#include +#include +#include +#include +#include #include "../../../src/common/io.h" namespace xgboost { @@ -39,5 +44,51 @@ TEST(IO, FixedSizeStream) { ASSERT_EQ(huge_buffer, out_buffer); } } + + +#if SIZE_MAX == 0xFFFFFFFFFFFFFFFF // Only run this test on 64-bit system +TEST(IO, LoadSequentialFile) { + // bytes_read = 2147479552, f_size_bytes = 2896075944 + const size_t nbyte = static_cast(2896075944LL); + static_assert(sizeof(size_t) == 8, "Assumption failed: size_t was assumed to be 8-bytes long"); + static_assert(std::is_same::value, + "Assumption failed: size_type of std::string was assumed to be 8-bytes long"); + + dmlc::TemporaryDirectory tempdir; + std::string path = "/dev/shm/xgboost_test_io_big_file.txt"; + { + FILE* f = std::fopen(path.c_str(), "w"); + if (!f) { // /dev/shm not present + LOG(INFO) << "No /dev/shm; using dmlc::TemporaryDirectory instead"; + path = tempdir.path + "/xgboost_test_io_big_file.txt"; + f = std::fopen(path.c_str(), "w"); + } + CHECK(f); + std::string str(nbyte, 'a'); + CHECK_EQ(str.size(), nbyte); + CHECK_EQ(std::fwrite(str.data(), sizeof(char), nbyte, f), nbyte); + std::fclose(f); + } + { + std::string str = LoadSequentialFile(path); + dmlc::OMPException omp_exc; + std::atomic success{true}; + #pragma omp parallel for schedule(static) + for (int64_t i = 0; i < static_cast(nbyte); ++i) { + omp_exc.Run([&] { + if (str[i] != 'a' && success.load(std::memory_order_acquire)) { + success.store(false, std::memory_order_release); + LOG(FATAL) << "Big file got corrupted. Expected: str[" << i << "] = 'a', " + << "Actual: str[" << i << "] = '" + << (str[i] ? std::string(1, str[i]) : std::string("\\0")) << "'"; + } + }); + } + omp_exc.Rethrow(); + CHECK(success.load(std::memory_order_acquire)); + } +} +#endif // SIZE_MAX == 0xFFFFFFFFFFFFFFFF + } // namespace common } // namespace xgboost From c8fabb237b7b5494ed97296b8474b8e2cc4e5f5d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 08:01:25 +0000 Subject: [PATCH 04/13] Do not use fseek() / ftell() to measure file size --- src/common/io.cc | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index 3104a81c8f39..bb7395054dc0 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -7,9 +7,10 @@ #include #endif // defined(__unix__) #include -#include +#include #include #include +#include #include "xgboost/logging.h" #include "io.h" @@ -130,21 +131,12 @@ std::string LoadSequentialFile(std::string fname) { bytes_read += result; } close(fd); + buffer.back() = '\0'; #else // defined(__unix__) - FILE *f = fopen(fname.c_str(), "r"); - if (f == NULL) { - std::string msg; - OpenErr(); - } - fseek(f, 0, SEEK_END); - auto fsize = ftell(f); - fseek(f, 0, SEEK_SET); - - buffer.resize(fsize + 1); - fread(&buffer[0], 1, fsize, f); - fclose(f); + std::ifstream ifs(fname); + buffer = std::string( (std::istreambuf_iterator(ifs) ), + (std::istreambuf_iterator()) ); #endif // defined(__unix__) - buffer.back() = '\0'; return buffer; } From 5083ca2bc8ef5a0d33a23ea9e8526ec800c9c8d0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 08:01:46 +0000 Subject: [PATCH 05/13] Use ofstream in the unit test for exception safety --- tests/cpp/common/test_io.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 80608450a039..4a44ce4148ac 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -5,8 +5,8 @@ #include #include #include +#include #include -#include #include "../../../src/common/io.h" namespace xgboost { @@ -57,20 +57,20 @@ TEST(IO, LoadSequentialFile) { dmlc::TemporaryDirectory tempdir; std::string path = "/dev/shm/xgboost_test_io_big_file.txt"; { - FILE* f = std::fopen(path.c_str(), "w"); + std::ofstream f(path); if (!f) { // /dev/shm not present LOG(INFO) << "No /dev/shm; using dmlc::TemporaryDirectory instead"; path = tempdir.path + "/xgboost_test_io_big_file.txt"; - f = std::fopen(path.c_str(), "w"); + f = std::ofstream(path); } CHECK(f); std::string str(nbyte, 'a'); CHECK_EQ(str.size(), nbyte); - CHECK_EQ(std::fwrite(str.data(), sizeof(char), nbyte, f), nbyte); - std::fclose(f); + f << str; } { std::string str = LoadSequentialFile(path); + CHECK_EQ(str.size(), nbyte); dmlc::OMPException omp_exc; std::atomic success{true}; #pragma omp parallel for schedule(static) From efc6f47af360428ee5c27954fe433b1aabb9853d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 01:11:22 -0700 Subject: [PATCH 06/13] Fix lint --- src/common/io.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index bb7395054dc0..3f993443f606 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -134,8 +134,8 @@ std::string LoadSequentialFile(std::string fname) { buffer.back() = '\0'; #else // defined(__unix__) std::ifstream ifs(fname); - buffer = std::string( (std::istreambuf_iterator(ifs) ), - (std::istreambuf_iterator()) ); + buffer = std::string((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); #endif // defined(__unix__) return buffer; } From d57e0ab107a7a18dc98c092726190e9a10ec1c3e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 01:21:03 -0700 Subject: [PATCH 07/13] Trim extra NUL letters from the end of buffer --- src/common/io.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/common/io.cc b/src/common/io.cc index 3f993443f606..9160dcafd9ba 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -132,6 +132,14 @@ std::string LoadSequentialFile(std::string fname) { } close(fd); buffer.back() = '\0'; + // Trim NUL letters at the end + int64_t i; + for (i = static_cast(buffer.size()); i >= 0; --i) { + if (buffer[i - 1] != '\0') { + break; + } + } + buffer = buffer.substr(0, i); #else // defined(__unix__) std::ifstream ifs(fname); buffer = std::string((std::istreambuf_iterator(ifs)), From 4782c7db1384f2ca5722aea892e5069e8f5664d7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 01:25:38 -0700 Subject: [PATCH 08/13] Remove a comment --- tests/cpp/common/test_io.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 4a44ce4148ac..7b1a59222953 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -48,8 +48,7 @@ TEST(IO, FixedSizeStream) { #if SIZE_MAX == 0xFFFFFFFFFFFFFFFF // Only run this test on 64-bit system TEST(IO, LoadSequentialFile) { - // bytes_read = 2147479552, f_size_bytes = 2896075944 - const size_t nbyte = static_cast(2896075944LL); + const size_t nbyte = static_cast(2896075944LL); // About 2.69 GB static_assert(sizeof(size_t) == 8, "Assumption failed: size_t was assumed to be 8-bytes long"); static_assert(std::is_same::value, "Assumption failed: size_type of std::string was assumed to be 8-bytes long"); From 2685ce614de1c6e179a664ac31f514eb9ee99ae6 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 27 Jun 2020 01:50:44 -0700 Subject: [PATCH 09/13] Allow use of /dev/shm inside Docker container --- tests/ci_build/ci_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index 5f0ed112039f..fbb559c6d449 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -207,6 +207,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -v "${WORKSPACE}":/workspace \ -w /workspace \ ${USER_IDS} \ + --shm-size=3g \ "${CI_DOCKER_EXTRA_PARAMS[@]}" \ "${DOCKER_IMG_NAME}" \ "${COMMAND[@]}" From 29708e91f77e3aa13db0dc448b68a23d024fc3ed Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 28 Jun 2020 10:42:26 -0700 Subject: [PATCH 10/13] Address reviewer's feedback --- include/xgboost/json_io.h | 3 +-- src/common/io.cc | 8 -------- tests/cpp/common/test_io.cc | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index 96653256f944..12118e23af43 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -15,7 +15,6 @@ #include #include #include -#include namespace xgboost { /* @@ -85,7 +84,7 @@ class JsonReader { std::string msg = "Expecting: \""; msg += c; msg += "\", got: \""; - if (got == EOF) { + if (got == -1) { msg += "EOF\""; } else if (got == 0) { msg += "\\0\""; diff --git a/src/common/io.cc b/src/common/io.cc index 9160dcafd9ba..3f993443f606 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -132,14 +132,6 @@ std::string LoadSequentialFile(std::string fname) { } close(fd); buffer.back() = '\0'; - // Trim NUL letters at the end - int64_t i; - for (i = static_cast(buffer.size()); i >= 0; --i) { - if (buffer[i - 1] != '\0') { - break; - } - } - buffer = buffer.substr(0, i); #else // defined(__unix__) std::ifstream ifs(fname); buffer = std::string((std::istreambuf_iterator(ifs)), diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 7b1a59222953..7a248035cda4 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -69,7 +69,6 @@ TEST(IO, LoadSequentialFile) { } { std::string str = LoadSequentialFile(path); - CHECK_EQ(str.size(), nbyte); dmlc::OMPException omp_exc; std::atomic success{true}; #pragma omp parallel for schedule(static) From 7bb99c61d8473c0889832eab7268bf97f8d5a8fa Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 28 Jun 2020 10:42:39 -0700 Subject: [PATCH 11/13] Request correct number of bytes --- src/common/io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/io.cc b/src/common/io.cc index 3f993443f606..35572397e480 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -123,7 +123,7 @@ std::string LoadSequentialFile(std::string fname) { #endif // defined(__linux__) ssize_t bytes_read = 0; while (bytes_read < f_size_bytes) { - ssize_t result = read(fd, &buffer[bytes_read], f_size_bytes); + ssize_t result = read(fd, &buffer[bytes_read], f_size_bytes - bytes_read); if (result < 0) { close(fd); ReadErr(); From 4faae5b6f73440e4eff1d450f6f00950413327d1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 28 Jun 2020 10:54:25 -0700 Subject: [PATCH 12/13] Add back an assertion --- tests/cpp/common/test_io.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 7a248035cda4..600850d02435 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -69,6 +69,7 @@ TEST(IO, LoadSequentialFile) { } { std::string str = LoadSequentialFile(path); + CHECK_GE(str.size(), nbyte); dmlc::OMPException omp_exc; std::atomic success{true}; #pragma omp parallel for schedule(static) From c4b7ffaa72b50314689dce87781e813029b5a1cf Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 29 Jun 2020 03:03:58 -0700 Subject: [PATCH 13/13] Use std::ifstream.read() for all cases --- src/common/io.cc | 37 ++++++++------------------- src/common/io.h | 1 - tests/ci_build/ci_build.sh | 1 - tests/cpp/common/test_io.cc | 50 ------------------------------------- 4 files changed, 10 insertions(+), 79 deletions(-) diff --git a/src/common/io.cc b/src/common/io.cc index 35572397e480..cbf11ae0d347 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -109,34 +109,17 @@ std::string LoadSequentialFile(std::string fname) { }; std::string buffer; -#if defined(__unix__) - struct stat fs; - if (stat(fname.c_str(), &fs) != 0) { - OpenErr(); - } - - size_t f_size_bytes = fs.st_size; - buffer.resize(f_size_bytes + 1); - int32_t fd = open(fname.c_str(), O_RDONLY); -#if defined(__linux__) - posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); -#endif // defined(__linux__) - ssize_t bytes_read = 0; - while (bytes_read < f_size_bytes) { - ssize_t result = read(fd, &buffer[bytes_read], f_size_bytes - bytes_read); - if (result < 0) { - close(fd); - ReadErr(); - } - bytes_read += result; - } - close(fd); + // Open in binary mode so that correct file size can be computed with seekg(). + // This accommodates Windows platform: + // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg + std::ifstream ifs(fname, std::ios_base::binary | std::ios_base::in); + ifs.seekg(0, std::ios_base::end); + const size_t file_size = static_cast(ifs.tellg()); + ifs.seekg(0, std::ios_base::beg); + buffer.resize(file_size + 1); + ifs.read(&buffer[0], file_size); buffer.back() = '\0'; -#else // defined(__unix__) - std::ifstream ifs(fname); - buffer = std::string((std::istreambuf_iterator(ifs)), - (std::istreambuf_iterator())); -#endif // defined(__unix__) + return buffer; } diff --git a/src/common/io.h b/src/common/io.h index 528296dc76cc..4ae3fcb02147 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -75,7 +75,6 @@ class FixedSizeStream : public PeekableInStream { std::string buffer_; }; -// Optimized for consecutive file loading in unix like systime. std::string LoadSequentialFile(std::string fname); inline std::string FileExtension(std::string const& fname) { diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index fbb559c6d449..5f0ed112039f 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -207,7 +207,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -v "${WORKSPACE}":/workspace \ -w /workspace \ ${USER_IDS} \ - --shm-size=3g \ "${CI_DOCKER_EXTRA_PARAMS[@]}" \ "${DOCKER_IMG_NAME}" \ "${COMMAND[@]}" diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 600850d02435..957c1748be5a 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -2,11 +2,6 @@ * Copyright (c) by XGBoost Contributors 2019 */ #include -#include -#include -#include -#include -#include #include "../../../src/common/io.h" namespace xgboost { @@ -44,50 +39,5 @@ TEST(IO, FixedSizeStream) { ASSERT_EQ(huge_buffer, out_buffer); } } - - -#if SIZE_MAX == 0xFFFFFFFFFFFFFFFF // Only run this test on 64-bit system -TEST(IO, LoadSequentialFile) { - const size_t nbyte = static_cast(2896075944LL); // About 2.69 GB - static_assert(sizeof(size_t) == 8, "Assumption failed: size_t was assumed to be 8-bytes long"); - static_assert(std::is_same::value, - "Assumption failed: size_type of std::string was assumed to be 8-bytes long"); - - dmlc::TemporaryDirectory tempdir; - std::string path = "/dev/shm/xgboost_test_io_big_file.txt"; - { - std::ofstream f(path); - if (!f) { // /dev/shm not present - LOG(INFO) << "No /dev/shm; using dmlc::TemporaryDirectory instead"; - path = tempdir.path + "/xgboost_test_io_big_file.txt"; - f = std::ofstream(path); - } - CHECK(f); - std::string str(nbyte, 'a'); - CHECK_EQ(str.size(), nbyte); - f << str; - } - { - std::string str = LoadSequentialFile(path); - CHECK_GE(str.size(), nbyte); - dmlc::OMPException omp_exc; - std::atomic success{true}; - #pragma omp parallel for schedule(static) - for (int64_t i = 0; i < static_cast(nbyte); ++i) { - omp_exc.Run([&] { - if (str[i] != 'a' && success.load(std::memory_order_acquire)) { - success.store(false, std::memory_order_release); - LOG(FATAL) << "Big file got corrupted. Expected: str[" << i << "] = 'a', " - << "Actual: str[" << i << "] = '" - << (str[i] ? std::string(1, str[i]) : std::string("\\0")) << "'"; - } - }); - } - omp_exc.Rethrow(); - CHECK(success.load(std::memory_order_acquire)); - } -} -#endif // SIZE_MAX == 0xFFFFFFFFFFFFFFFF - } // namespace common } // namespace xgboost