Java iceberg table tests

platypii · platypii · commit bccaff4a2f24 · 2025-03-30T01:27:32.000-07:00
diff --git a/src/avro.data.js b/src/avro.data.js
@@ -31,7 +31,7 @@ export function avroData({ reader, metadata, syncMarker }) {
     reader.offset += 16
     for (let i = 0; i < 16; i++) {
       if (blockSync[i] !== syncMarker[i]) {
-        throw new Error('Sync marker does not match')
+        throw new Error('sync marker does not match')
       }
     }
     const codec = metadata['avro.codec']
@@ -40,7 +40,7 @@ export function avroData({ reader, metadata, syncMarker }) {
     if (codec === 'deflate') {
       data = gunzip(data)
     } else if (codec !== 'null') {
-      throw new Error(`Unsupported codec: ${codec}`)
+      throw new Error(`unsupported codec: ${codec}`)
     }
 
     // Decode according to binary or json encoding
@@ -61,7 +61,7 @@ export function avroData({ reader, metadata, syncMarker }) {
 }
 
 /**
- * @import {AvroType} from './types.js'
+ * @import {AvroType} from '../src/types.js'
  * @param {DataReader} reader
  * @param {AvroType} type
  * @returns {any}
@@ -123,6 +123,6 @@ function readType(reader, type) {
     return text
   } else {
     // enum, fixed, null, map
-    throw new Error(`Unsupported type: ${type}`)
+    throw new Error(`unsupported type: ${type}`)
   }
 }
diff --git a/src/iceberg.fetch.js b/src/iceberg.fetch.js
@@ -28,7 +28,7 @@ export function translateS3Url(url) {
  * Position deletes are grouped by target data file.
  * Equality deletes are grouped by sequence number.
  *
- * @import {ManifestEntry} from './types.js'
+ * @import {ManifestEntry} from '../src/types.js'
  * @param {ManifestEntry[]} deleteEntries
  * @returns {Promise<{positionDeletesMap: Map<string, Set<bigint>>, equalityDeletesMap: Map<bigint, Record<string, any>[]>}>}
  */
diff --git a/src/iceberg.js b/src/iceberg.js
@@ -16,7 +16,7 @@ export { avroData } from './avro.data.js'
  * TODO:
  *   - Sequence number checks when filtering deletes
  *
- * @import {IcebergMetadata} from './types.js'
+ * @import {IcebergMetadata} from '../src/types.js'
  * @param {object} options
  * @param {string} options.tableUrl - Base S3 URL of the table.
  * @param {number} [options.rowStart] - The starting global row index to fetch (inclusive).
diff --git a/src/iceberg.manifest.js b/src/iceberg.manifest.js
@@ -3,7 +3,7 @@ import { fetchAvroRecords } from './iceberg.fetch.js'
 /**
  * Returns manifest entries for the current snapshot.
  *
- * @import {IcebergMetadata, Manifest, ManifestEntry} from './types.js'
+ * @import {IcebergMetadata, Manifest, ManifestEntry} from '../src/types.js'
  * @typedef {{ url: string, entries: ManifestEntry[] }[]} ManifestList
  * @param {IcebergMetadata} metadata
  * @returns {Promise<ManifestList>}
diff --git a/src/iceberg.metadata.js b/src/iceberg.metadata.js
@@ -17,7 +17,7 @@ export function icebergLatestVersion(tableUrl) {
  * Fetches the Iceberg table metadata.
  * If metadataFileName is not provided, uses icebergLatestVersion to get the version hint.
  *
- * @import {IcebergMetadata} from './types.js'
+ * @import {IcebergMetadata} from '../src/types.js'
  * @param {string} tableUrl - Base URL of the table (e.g. "s3://my-bucket/path/to/table")
  * @param {string} [metadataFileName] - Name of the metadata JSON file
  * @returns {Promise<IcebergMetadata>} The table metadata as a JSON object
diff --git a/test/iceberg.java.test.js b/test/iceberg.java.test.js
@@ -0,0 +1,87 @@
+import { describe, expect, it } from 'vitest'
+import { icebergRead } from '../src/iceberg.js'
+
+describe.concurrent('icebergRead from java iceberg table', () => {
+  const tableUrl = 'https://s3.amazonaws.com/hyperparam-iceberg/java/bunnies'
+
+  it('reads data from iceberg table', async () => {
+    const data = await icebergRead({ tableUrl, metadataFileName: 'v2.metadata.json' })
+
+    // Verify we got correct number of rows
+    expect(data).toBeInstanceOf(Array)
+    expect(data.length).toBe(21)
+
+    // Verify first row has expected structure
+    expect(data[0]).toEqual({
+      'Breed Name': 'Holland Lop',
+      'Average Weight': 1.8,
+      'Fur Length': 3,
+      Lifespan: 7n,
+      'Origin Country': 'The Netherlands',
+      'Ear Type': 'Lop',
+      Temperament: 'Friendly',
+      'Popularity Rank': 1n,
+    })
+
+    // Check we have all expected properties
+    const expectedProperties = [
+      'Breed Name',
+      'Average Weight',
+      'Fur Length',
+      'Lifespan',
+      'Origin Country',
+      'Ear Type',
+      'Temperament',
+      'Popularity Rank',
+    ]
+    data.forEach(row => {
+      expectedProperties.forEach(prop => {
+        expect(row).toHaveProperty(prop)
+      })
+    })
+  })
+
+  it('reads data v3 with added column', async () => {
+    const data = await icebergRead({ tableUrl, metadataFileName: 'v3.metadata.json' })
+
+    expect(data.length).toBe(21)
+    expect(data[2]).toEqual({
+      'Breed Name': 'Flemish Giant',
+      'Average Weight': 4.5,
+      'Fur Length': 4,
+      Lifespan: 5n,
+      'Origin Country': 'Belgium',
+      'Ear Type': 'Lop',
+      Temperament: 'Calm',
+      'Popularity Rank': 3n,
+      __happy__: undefined,
+    })
+  })
+
+  it('reads data v4 with deleted rows', async () => {
+    const data = await icebergRead({ tableUrl, metadataFileName: 'v4.metadata.json' })
+
+    expect(data.length).toBe(15)
+    expect(data[2]).toEqual({
+      'Breed Name': 'American Fuzzy Lop',
+      'Average Weight': 1.4,
+      'Fur Length': 5,
+      Lifespan: 8n,
+      'Origin Country': 'USA',
+      'Ear Type': 'Lop',
+      Temperament: 'Sociable',
+      'Popularity Rank': 8n,
+    })
+    const newZealandRow = data.find(row => row['Breed Name'] === 'New Zealand')
+    expect(newZealandRow).toEqual({
+      'Breed Name': 'New Zealand',
+      'Average Weight': 4,
+      'Fur Length': 2.7,
+      Lifespan: 8n,
+      'Origin Country': 'New Zealand',
+      'Ear Type': 'Erect',
+      Temperament: 'Affectionate',
+      'Popularity Rank': 21n,
+    })
+  })
+})

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ export function avroData({ reader, metadata, syncMarker }) {`
`31`	`31`	`reader.offset += 16`
`32`	`32`	`for (let i = 0; i < 16; i++) {`
`33`	`33`	`if (blockSync[i] !== syncMarker[i]) {`
`34`		`- throw new Error('Sync marker does not match')`
	`34`	`+ throw new Error('sync marker does not match')`
`35`	`35`	`}`
`36`	`36`	`}`
`37`	`37`	`const codec = metadata['avro.codec']`
`@@ -40,7 +40,7 @@ export function avroData({ reader, metadata, syncMarker }) {`
`40`	`40`	`if (codec === 'deflate') {`
`41`	`41`	`data = gunzip(data)`
`42`	`42`	`} else if (codec !== 'null') {`
`43`		- throw new Error(`Unsupported codec: ${codec}`)
	`43`	+ throw new Error(`unsupported codec: ${codec}`)
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`// Decode according to binary or json encoding`
`@@ -61,7 +61,7 @@ export function avroData({ reader, metadata, syncMarker }) {`
`61`	`61`	`}`
`62`	`62`
`63`	`63`	`/**`
`64`		`- * @import {AvroType} from './types.js'`
	`64`	`+ * @import {AvroType} from '../src/types.js'`
`65`	`65`	`* @param {DataReader} reader`
`66`	`66`	`* @param {AvroType} type`
`67`	`67`	`* @returns {any}`
`@@ -123,6 +123,6 @@ function readType(reader, type) {`
`123`	`123`	`return text`
`124`	`124`	`} else {`
`125`	`125`	`// enum, fixed, null, map`
`126`		- throw new Error(`Unsupported type: ${type}`)
	`126`	+ throw new Error(`unsupported type: ${type}`)
`127`	`127`	`}`
`128`	`128`	`}`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ export function translateS3Url(url) {`
`28`	`28`	`* Position deletes are grouped by target data file.`
`29`	`29`	`* Equality deletes are grouped by sequence number.`
`30`	`30`	`*`
`31`		`- * @import {ManifestEntry} from './types.js'`
	`31`	`+ * @import {ManifestEntry} from '../src/types.js'`
`32`	`32`	`* @param {ManifestEntry[]} deleteEntries`
`33`	`33`	`* @returns {Promise<{positionDeletesMap: Map<string, Set<bigint>>, equalityDeletesMap: Map<bigint, Record<string, any>[]>}>}`
`34`	`34`	`*/`
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ export { avroData } from './avro.data.js'`
`16`	`16`	`* TODO:`
`17`	`17`	`* - Sequence number checks when filtering deletes`
`18`	`18`	`*`
`19`		`- * @import {IcebergMetadata} from './types.js'`
	`19`	`+ * @import {IcebergMetadata} from '../src/types.js'`
`20`	`20`	`* @param {object} options`
`21`	`21`	`* @param {string} options.tableUrl - Base S3 URL of the table.`
`22`	`22`	`* @param {number} [options.rowStart] - The starting global row index to fetch (inclusive).`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@ import { fetchAvroRecords } from './iceberg.fetch.js'`
`3`	`3`	`/**`
`4`	`4`	`* Returns manifest entries for the current snapshot.`
`5`	`5`	`*`
`6`		`- * @import {IcebergMetadata, Manifest, ManifestEntry} from './types.js'`
	`6`	`+ * @import {IcebergMetadata, Manifest, ManifestEntry} from '../src/types.js'`
`7`	`7`	`* @typedef {{ url: string, entries: ManifestEntry[] }[]} ManifestList`
`8`	`8`	`* @param {IcebergMetadata} metadata`
`9`	`9`	`* @returns {Promise<ManifestList>}`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ export function icebergLatestVersion(tableUrl) {`
`17`	`17`	`* Fetches the Iceberg table metadata.`
`18`	`18`	`* If metadataFileName is not provided, uses icebergLatestVersion to get the version hint.`
`19`	`19`	`*`
`20`		`- * @import {IcebergMetadata} from './types.js'`
	`20`	`+ * @import {IcebergMetadata} from '../src/types.js'`
`21`	`21`	`* @param {string} tableUrl - Base URL of the table (e.g. "s3://my-bucket/path/to/table")`
`22`	`22`	`* @param {string} [metadataFileName] - Name of the metadata JSON file`
`23`	`23`	`* @returns {Promise<IcebergMetadata>} The table metadata as a JSON object`