Skip to content

Commit

Permalink
GH-41993 [Go] IPC writer shift voffsets when offsets array does not s…
Browse files Browse the repository at this point in the history
…tart from zero (#43176)

### Rationale for this change

It should be valid to specify offset buffers that do not start from zero. This particularly important for when multiple arrays share a single value buffer.

### What changes are included in this PR?

- Add condition to shift offsets buffer when it does not start from zero
- Test to reproduce failure and then validate fix

### Are these changes tested?

Yes

### Are there any user-facing changes?

Variable-length binary arrays that share a value buffer will not result in errors.

* GitHub Issue: #41993

Authored-by: Joel Lubinitsky <joellubi@gmail.com>
Signed-off-by: Joel Lubinitsky <joellubi@gmail.com>
  • Loading branch information
joellubi committed Jul 15, 2024
1 parent 0051257 commit 1fce293
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 3 deletions.
68 changes: 68 additions & 0 deletions go/arrow/ipc/ipc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (

"github.com/apache/arrow/go/v17/arrow"
"github.com/apache/arrow/go/v17/arrow/array"
"github.com/apache/arrow/go/v17/arrow/bitutil"
"github.com/apache/arrow/go/v17/arrow/ipc"
"github.com/apache/arrow/go/v17/arrow/memory"
)
Expand Down Expand Up @@ -620,3 +621,70 @@ func TestIpcEmptyMap(t *testing.T) {
assert.Zero(t, r.Record().NumRows())
assert.True(t, arrow.TypeEqual(dt, r.Record().Column(0).DataType()))
}

// GH-41993
func TestArrowBinaryIPCWriterTruncatedVOffsets(t *testing.T) {
var buf bytes.Buffer
buf.WriteString("apple")
buf.WriteString("pear")
buf.WriteString("banana")
values := buf.Bytes()

offsets := []int32{5, 9, 15} // <-- only "pear" and "banana"
voffsets := arrow.Int32Traits.CastToBytes(offsets)

validity := []byte{0}
bitutil.SetBit(validity, 0)
bitutil.SetBit(validity, 1)

data := array.NewData(
arrow.BinaryTypes.String,
2, // <-- only "pear" and "banana"
[]*memory.Buffer{
memory.NewBufferBytes(validity),
memory.NewBufferBytes(voffsets),
memory.NewBufferBytes(values),
},
nil,
0,
0,
)

str := array.NewStringData(data)
require.Equal(t, 2, str.Len())
require.Equal(t, "pear", str.Value(0))
require.Equal(t, "banana", str.Value(1))

schema := arrow.NewSchema([]arrow.Field{
{
Name: "string",
Type: arrow.BinaryTypes.String,
Nullable: true,
},
}, nil)
record := array.NewRecord(schema, []arrow.Array{str}, 2)

var output bytes.Buffer
writer := ipc.NewWriter(&output, ipc.WithSchema(schema))

require.NoError(t, writer.Write(record))
require.NoError(t, writer.Close())

reader, err := ipc.NewReader(bytes.NewReader(output.Bytes()), ipc.WithSchema(schema))
require.NoError(t, err)
defer reader.Release()

require.True(t, reader.Next())
require.NoError(t, reader.Err())

rec := reader.Record()
require.EqualValues(t, 1, rec.NumCols())
require.EqualValues(t, 2, rec.NumRows())

col, ok := rec.Column(0).(*array.String)
require.True(t, ok)
require.Equal(t, "pear", col.Value(0))
require.Equal(t, "banana", col.Value(1))

require.False(t, reader.Next())
}
22 changes: 19 additions & 3 deletions go/arrow/ipc/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -853,19 +853,35 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) *memory.Buffer
return nil
}

dataTypeWidth := arr.DataType().Layout().Buffers[1].ByteWidth

// if we have a non-zero offset, then the value offsets do not start at
// zero. we must a) create a new offsets array with shifted offsets and
// b) slice the values array accordingly
//
hasNonZeroOffset := data.Offset() != 0

// or if there are more value offsets than values (the array has been sliced)
// we need to trim off the trailing offsets
needsTruncateAndShift := data.Offset() != 0 || offsetBytesNeeded < voffsets.Len()
hasMoreOffsetsThanValues := offsetBytesNeeded < voffsets.Len()

// or if the offsets do not start from the zero index, we need to shift them
// and slice the values array
var firstOffset int64
if dataTypeWidth == 8 {
firstOffset = arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[0]
} else {
firstOffset = int64(arrow.Int32Traits.CastFromBytes(voffsets.Bytes())[0])
}
offsetsDoNotStartFromZero := firstOffset != 0

// determine whether the offsets array should be shifted
needsTruncateAndShift := hasNonZeroOffset || hasMoreOffsetsThanValues || offsetsDoNotStartFromZero

if needsTruncateAndShift {
shiftedOffsets := memory.NewResizableBuffer(w.mem)
shiftedOffsets.Resize(offsetBytesNeeded)

switch arr.DataType().Layout().Buffers[1].ByteWidth {
switch dataTypeWidth {
case 8:
dest := arrow.Int64Traits.CastFromBytes(shiftedOffsets.Bytes())
offsets := arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : data.Offset()+data.Len()+1]
Expand Down

0 comments on commit 1fce293

Please sign in to comment.