Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unordered_map for string references #584

Merged
merged 14 commits into from
Mar 22, 2022
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ Config/Needs/website: tidyverse/tidytemplate
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
SystemRequirements: GNU make, C++11, zlib: zlib1g-dev (deb), zlib-devel
(rpm)
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

* Updated to ReadStat 1.1.8 RC (#650).

* `write_dta()` now uses strL when strings are too long to be stored in an str#
variable (#437). strL is used when strings are longer than 2045 characters by
default, which matches Stata's behaviour, but this can be reduced with the
`strl_threshold` argument.

* All `write_` functions can now write custom variable widths by setting the
`width` attribute (#650).

Expand Down
4 changes: 2 additions & 2 deletions R/cpp11.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ write_sav_ <- function(data, path, compress) {
invisible(.Call(`_haven_write_sav_`, data, path, compress))
}

write_dta_ <- function(data, path, version, label) {
invisible(.Call(`_haven_write_dta_`, data, path, version, label))
write_dta_ <- function(data, path, version, label, strl_threshold) {
invisible(.Call(`_haven_write_dta_`, data, path, version, label, strl_threshold))
}

write_sas_ <- function(data, path) {
Expand Down
26 changes: 24 additions & 2 deletions R/haven-stata.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
#' Read and write Stata DTA files
#'
#' @description
#' Currently haven can read and write logical, integer, numeric, character
#' and factors. See [labelled()] for how labelled variables in
#' Stata are handled in R.
#'
#' Character vectors will be stored as `strL` if any components are
#' `strl_threshold` bytes or longer (and `version` >= 13); otherwise they will
#' be stored as the appropriate `str#`.
#'
#' @section Character encoding:
#' Prior to Stata 14, files did not declare a text encoding, and the
#' default encoding differed across platforms. If `encoding = NULL`,
Expand Down Expand Up @@ -64,13 +69,20 @@ read_stata <- read_dta
#' @param version File version to use. Supports versions 8-15.
#' @param label Dataset label to use, or `NULL`. Defaults to the value stored in
#' the "label" attribute of `data`. Must be <= 80 characters.
write_dta <- function(data, path, version = 14, label = attr(data, "label")) {
#' @param strl_threshold Any character vectors with a maximum length greater
#' than `strl_threshold` bytes will be stored as a long string (strL) instead
#' of a standard string (str#) variable if `version` >= 13. This defaults to
#' 2045, the maximum length of str# variables. See the Stata [long
#' string](https://www.stata.com/features/overview/long-strings/)
#' documentation for more details.
write_dta <- function(data, path, version = 14, label = attr(data, "label"), strl_threshold = 2045) {
data <- validate_dta(data, version = version)
validate_dta_label(label)
write_dta_(data,
normalizePath(path, mustWork = FALSE),
version = stata_file_format(version),
label = label
label = label,
strl_threshold = validate_strl_threshold(strl_threshold)
)
invisible(data)
}
Expand All @@ -96,6 +108,16 @@ stata_file_format <- function(version) {
}
}

validate_strl_threshold <- function(strl_threshold) {
stopifnot(is.numeric(strl_threshold), length(strl_threshold) == 1)

if (strl_threshold < 0 || strl_threshold > 2045) {
2045
} else {
strl_threshold
}
}

validate_dta <- function(data, version) {
stopifnot(is.data.frame(data))

Expand Down
3 changes: 1 addition & 2 deletions man/haven-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 21 additions & 4 deletions man/read_dta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/read_spss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions man/read_xpt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 31 additions & 4 deletions src/DfWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "haven_types.h"
#include "tagged_na.h"

#include <unordered_map>
#include "cpp11/doubles.hpp"
#include "cpp11/strings.hpp"
#include "cpp11/integers.hpp"
Expand Down Expand Up @@ -63,13 +64,16 @@ inline int userWidth(cpp11::sexp x) {
class Writer {
FileExt ext_;
FileVendor vendor_;
int version_;
int strl_threshold_;
std::unordered_map<const char*, readstat_string_ref_t*> string_ref_;
gorcha marked this conversation as resolved.
Show resolved Hide resolved

cpp11::list x_;
readstat_writer_t* writer_;
FILE* pOut_;

public:
Writer(FileExt ext, cpp11::list x, cpp11::strings pathEnc): ext_(ext), vendor_(extVendor(ext)), x_(x) {
Writer(FileExt ext, cpp11::list x, cpp11::strings pathEnc): ext_(ext), vendor_(extVendor(ext)), version_(0), x_(x) {
std::string path(Rf_translateChar(pathEnc[0]));

pOut_ = fopen(path.c_str(), "wb");
Expand All @@ -92,6 +96,7 @@ class Writer {
}

void setVersion(int version) {
version_ = version;
readstat_writer_set_file_format_version(writer_, version);
}

Expand All @@ -106,6 +111,10 @@ class Writer {
readstat_writer_set_file_label(writer_, string_utf8(label, 0));
}

void setStrLThreshold(int strl_threshold) {
strl_threshold_ = strl_threshold;
}

void write() {
int p = x_.size();
if (p == 0)
Expand Down Expand Up @@ -369,8 +378,23 @@ class Writer {
}


readstat_variable_t* var =
readstat_add_variable(writer_, name, READSTAT_TYPE_STRING, user_width);
// Use strL for "long" strings in stata. strL has an 80 byte overhead so
// we use it when it's likely to be more efficient. The main downside of
// strL is that it can't be used as a join key but this seems unlikely for
// very long strings.
readstat_variable_t* var;
if (ext_ == HAVEN_DTA && version_ >= 117 && user_width > strl_threshold_) {
var = readstat_add_variable(writer_, name, READSTAT_TYPE_STRING_REF, user_width);
for (int i = 0; i < x.size(); ++i) {
const char* val = string_utf8(x, i);
if (!string_ref_.count(val)) {
string_ref_[val] = readstat_add_string_ref(writer_, val);
}
}
} else {
var = readstat_add_variable(writer_, name, READSTAT_TYPE_STRING, user_width);
}

readstat_variable_set_format(var, format);
readstat_variable_set_label(var, var_label(x));
readstat_variable_set_label_set(var, labelSet);
Expand Down Expand Up @@ -426,6 +450,8 @@ class Writer {
readstat_error_t insertValue(readstat_variable_t* var, const char* val, bool is_missing) {
if (is_missing) {
return readstat_insert_missing_value(writer_, var);
} else if (var->type == READSTAT_TYPE_STRING_REF) {
return readstat_insert_string_ref(writer_, var, string_ref_[val]);
} else {
return readstat_insert_string_value(writer_, var, val);
}
Expand Down Expand Up @@ -461,10 +487,11 @@ void write_sav_(cpp11::list data, cpp11::strings path, std::string compress) {
}

[[cpp11::register]]
void write_dta_(cpp11::list data, cpp11::strings path, int version, cpp11::sexp label) {
void write_dta_(cpp11::list data, cpp11::strings path, int version, cpp11::sexp label, int strl_threshold) {
Writer writer(HAVEN_DTA, data, path);
writer.setVersion(version);
writer.setFileLabel(label);
writer.setStrLThreshold(strl_threshold);
writer.write();
}

Expand Down
25 changes: 6 additions & 19 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "haven_types.h"
#include "cpp11/declarations.hpp"
#include <R_ext/Visibility.h>

// DfReader.cpp
cpp11::list df_parse_sas_file(cpp11::list spec_b7dat, cpp11::list spec_b7cat, std::string encoding, std::string catalog_encoding, std::vector<std::string> cols_skip, long n_max, long rows_skip, std::string name_repair);
Expand Down Expand Up @@ -83,10 +84,10 @@ extern "C" SEXP _haven_write_sav_(SEXP data, SEXP path, SEXP compress) {
END_CPP11
}
// DfWriter.cpp
void write_dta_(cpp11::list data, cpp11::strings path, int version, cpp11::sexp label);
extern "C" SEXP _haven_write_dta_(SEXP data, SEXP path, SEXP version, SEXP label) {
void write_dta_(cpp11::list data, cpp11::strings path, int version, cpp11::sexp label, int strl_threshold);
extern "C" SEXP _haven_write_dta_(SEXP data, SEXP path, SEXP version, SEXP label, SEXP strl_threshold) {
BEGIN_CPP11
write_dta_(cpp11::as_cpp<cpp11::decay_t<cpp11::list>>(data), cpp11::as_cpp<cpp11::decay_t<cpp11::strings>>(path), cpp11::as_cpp<cpp11::decay_t<int>>(version), cpp11::as_cpp<cpp11::decay_t<cpp11::sexp>>(label));
write_dta_(cpp11::as_cpp<cpp11::decay_t<cpp11::list>>(data), cpp11::as_cpp<cpp11::decay_t<cpp11::strings>>(path), cpp11::as_cpp<cpp11::decay_t<int>>(version), cpp11::as_cpp<cpp11::decay_t<cpp11::sexp>>(label), cpp11::as_cpp<cpp11::decay_t<int>>(strl_threshold));
return R_NilValue;
END_CPP11
}
Expand All @@ -109,20 +110,6 @@ extern "C" SEXP _haven_write_xpt_(SEXP data, SEXP path, SEXP version, SEXP name,

extern "C" {
/* .Call calls */
extern SEXP _haven_df_parse_dta_file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_dta_raw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_por_file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_por_raw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_sas_file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_sas_raw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_sav_file(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_sav_raw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_xpt_file(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_df_parse_xpt_raw(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_write_dta_(SEXP, SEXP, SEXP, SEXP);
extern SEXP _haven_write_sas_(SEXP, SEXP);
extern SEXP _haven_write_sav_(SEXP, SEXP, SEXP);
extern SEXP _haven_write_xpt_(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP is_tagged_na_(SEXP, SEXP);
extern SEXP na_tag_(SEXP);
extern SEXP tagged_na_(SEXP);
Expand All @@ -138,7 +125,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_haven_df_parse_sav_raw", (DL_FUNC) &_haven_df_parse_sav_raw, 7},
{"_haven_df_parse_xpt_file", (DL_FUNC) &_haven_df_parse_xpt_file, 5},
{"_haven_df_parse_xpt_raw", (DL_FUNC) &_haven_df_parse_xpt_raw, 5},
{"_haven_write_dta_", (DL_FUNC) &_haven_write_dta_, 4},
{"_haven_write_dta_", (DL_FUNC) &_haven_write_dta_, 5},
{"_haven_write_sas_", (DL_FUNC) &_haven_write_sas_, 2},
{"_haven_write_sav_", (DL_FUNC) &_haven_write_sav_, 3},
{"_haven_write_xpt_", (DL_FUNC) &_haven_write_xpt_, 5},
Expand All @@ -149,7 +136,7 @@ static const R_CallMethodDef CallEntries[] = {
};
}

extern "C" void R_init_haven(DllInfo* dll){
extern "C" attribute_visible void R_init_haven(DllInfo* dll){
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
R_forceSymbols(dll, TRUE);
Expand Down
4 changes: 2 additions & 2 deletions src/readstat/readstat_writer.c
Original file line number Diff line number Diff line change
Expand Up @@ -604,8 +604,8 @@ readstat_error_t readstat_insert_string_ref(readstat_writer_t *writer, const rea
return READSTAT_ERROR_STRING_REFS_NOT_SUPPORTED;

if (ref && ref->first_o == -1 && ref->first_v == -1) {
ref->first_o = writer->current_row;
ref->first_v = variable->index;
ref->first_o = writer->current_row + 1;
ref->first_v = variable->index + 1;
}

return writer->callbacks.write_string_ref(&writer->row[variable->offset], variable, ref);
Expand Down
13 changes: 13 additions & 0 deletions tests/testthat/test-haven-stata.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,16 @@ test_that("can't write non-integer labels (#401)", {
write_dta(df, tempfile())
})
})

test_that("can roundtrip long strings (strL)", {
long_string <- function(n, m) {
do.call("paste0", replicate(m, sample(LETTERS, n, TRUE), simplify = FALSE))
}

x <- long_string(10, 400)
expect_equal(roundtrip_var(x, "dta"), x)
x <- long_string(10, 1000)
expect_equal(roundtrip_var(x, "dta"), x)
x <- long_string(10, 3000)
expect_equal(roundtrip_var(x, "dta"), x)
})