git.net

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[jira] [Created] (ARROW-2500) [Java] IPC Writers/readers are not always setting validity bits correctly


Emilio Lahr-Vivaz created ARROW-2500:
----------------------------------------

             Summary: [Java] IPC Writers/readers are not always setting validity bits correctly
                 Key: ARROW-2500
                 URL: https://issues.apache.org/jira/browse/ARROW-2500
             Project: Apache Arrow
          Issue Type: Bug
          Components: Java - Vectors
    Affects Versions: 0.9.0, 0.8.0
            Reporter: Emilio Lahr-Vivaz


When writing multiple batches to a Stream/File Writer, the first validity bit can get garbled between writing and reading. I couldn't pinpoint the exact issue, but I was able to re-create it with a fairly simple unit test.

in TestArrowStream.java:

{code:java}
  /**
   * Reproduction for ARROW-2500 using the stream format.
   *
   * <p>Writes two record batches through the same {@code IntVector} and
   * {@code VectorSchemaRoot} into an in-memory stream, then reads both batches
   * back and checks the value count, the null slots and the non-null values of
   * each batch. Per the report, the first validity bit is what can come back
   * garbled.
   *
   * @throws IOException if writing to or reading from the in-memory stream fails
   */
  @Test
  public void testReadWriteMultipleBatches() throws IOException {

    ByteArrayOutputStream os = new ByteArrayOutputStream();

    try (IntVector vector = new IntVector("foo", allocator);) {
      Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
      try (VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
           ArrowStreamWriter writer = new ArrowStreamWriter(root, new MapDictionaryProvider(), Channels.newChannel(os));) {
        writer.start();

        // Batch 1: five slots, nulls at indices 0 and 3.
        vector.setNull(0);
        vector.setSafe(1, 1);
        vector.setSafe(2, 2);
        vector.setNull(3);
        vector.setSafe(4, 1);
        vector.setValueCount(5);
        root.setRowCount(5);
        writer.writeBatch();

        // Batch 2: written through the same vector and root (no reset call in
        // between), three slots, null at index 0.
        vector.setNull(0);
        vector.setSafe(1, 1);
        vector.setSafe(2, 2);
        vector.setValueCount(3);
        root.setRowCount(3);
        writer.writeBatch();
      }
    }

    ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());

    try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator);) {
      // The reader's vector is fetched once and inspected again after each load.
      IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

      reader.loadNextBatch();

      // Assertions use JUnit's (expected, actual) order so that a failure
      // reports the corrupted value as "actual".
      assertEquals(5, read.getValueCount());
      assertNull(read.getObject(0));
      assertEquals(Integer.valueOf(1), read.getObject(1));
      assertEquals(Integer.valueOf(2), read.getObject(2));
      assertNull(read.getObject(3));
      assertEquals(Integer.valueOf(1), read.getObject(4));

      reader.loadNextBatch();

      assertEquals(3, read.getValueCount());
      assertNull(read.getObject(0));
      assertEquals(Integer.valueOf(1), read.getObject(1));
      assertEquals(Integer.valueOf(2), read.getObject(2));
    }
  }
{code}

in TestArrowFile.java:

{code}
 /**
  * Reproduction for ARROW-2500 using the file format.
  *
  * <p>Writes two record batches through the same {@code IntVector} and
  * {@code VectorSchemaRoot} into {@code target/mytest_nulls_multibatch.arrow},
  * then reopens that file and checks the value count, the null slots and the
  * non-null values of each batch. Per the report, the first validity bit is
  * what can come back garbled.
  *
  * @throws IOException if writing or reading the file fails
  */
 @Test
  public void testReadWriteMultipleBatches() throws IOException {
    File file = new File("target/mytest_nulls_multibatch.arrow");

    try (IntVector vector = new IntVector("foo", allocator);) {
      Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
      try (FileOutputStream fileOutputStream = new FileOutputStream(file);
           VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
           ArrowFileWriter writer = new ArrowFileWriter(root, new MapDictionaryProvider(), fileOutputStream.getChannel());) {
        writer.start();

        // Batch 1: five slots, nulls at indices 0 and 3.
        vector.setNull(0);
        vector.setSafe(1, 1);
        vector.setSafe(2, 2);
        vector.setNull(3);
        vector.setSafe(4, 1);
        vector.setValueCount(5);
        root.setRowCount(5);
        writer.writeBatch();

        // Batch 2: written through the same vector and root (no reset call in
        // between), three slots, null at index 0.
        vector.setNull(0);
        vector.setSafe(1, 1);
        vector.setSafe(2, 2);
        vector.setValueCount(3);
        root.setRowCount(3);
        writer.writeBatch();
      }
    }

    try (FileInputStream fileInputStream = new FileInputStream(file);
         ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator);) {
      // The reader's vector is fetched once and inspected again after each load.
      IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);

      reader.loadNextBatch();

      // Assertions use JUnit's (expected, actual) order so that a failure
      // reports the corrupted value as "actual".
      assertEquals(5, read.getValueCount());
      assertNull(read.getObject(0));
      assertEquals(Integer.valueOf(1), read.getObject(1));
      assertEquals(Integer.valueOf(2), read.getObject(2));
      assertNull(read.getObject(3));
      assertEquals(Integer.valueOf(1), read.getObject(4));

      reader.loadNextBatch();

      assertEquals(3, read.getValueCount());
      assertNull(read.getObject(0));
      assertEquals(Integer.valueOf(1), read.getObject(1));
      assertEquals(Integer.valueOf(2), read.getObject(2));
    }
  }
{code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)