Skip to content

Commit ba54f17

Browse files
committed
Split out iceberg metadata functions
1 parent 46f52ae commit ba54f17

File tree

6 files changed

+67
-47
lines changed

6 files changed

+67
-47
lines changed

package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@
3939
},
4040
"devDependencies": {
4141
"@types/node": "22.13.10",
42-
"@vitest/coverage-v8": "3.0.8",
42+
"@vitest/coverage-v8": "3.0.9",
4343
"eslint": "9.22.0",
44-
"eslint-plugin-jsdoc": "50.6.6",
44+
"eslint-plugin-jsdoc": "50.6.8",
4545
"typescript": "5.8.2",
46-
"vitest": "3.0.8"
46+
"vitest": "3.0.9"
4747
}
4848
}

src/iceberg.fetch.js

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,33 +21,6 @@ export function translateS3Url(url) {
2121
return url
2222
}
2323

24-
/**
25-
* Fetches the Iceberg table snapshot version from S3 using the version hint file.
26-
*
27-
* @param {string} tableBaseUrl - Base S3 URL of the table (e.g. "s3://my-bucket/path/to/table")
28-
* @returns {Promise<number>} The snapshot version
29-
*/
30-
export function fetchSnapshotVersion(tableBaseUrl) {
31-
const url = `${tableBaseUrl}/metadata/version-hint.text`
32-
const safeUrl = translateS3Url(url)
33-
// TODO: If version-hint is not found, try listing or binary search.
34-
return fetch(safeUrl).then(res => res.text()).then(text => parseInt(text))
35-
}
36-
37-
/**
38-
* Fetches the Iceberg table metadata JSON from S3
39-
*
40-
* @import {IcebergMetadata} from './types.js'
41-
* @param {string} tableBaseUrl - Base S3 URL of the table (e.g. "s3://my-bucket/path/to/table")
42-
* @param {string} metadataFileName - Name of the metadata JSON file
43-
* @returns {Promise<IcebergMetadata>} The table metadata as a JSON object
44-
*/
45-
export function fetchIcebergMetadata(tableBaseUrl, metadataFileName) {
46-
const url = `${tableBaseUrl}/metadata/${metadataFileName}`
47-
const safeUrl = translateS3Url(url)
48-
return fetch(safeUrl).then(res => res.json())
49-
}
50-
5124
/**
5225
* Fetches data files information from multiple manifest file URLs.
5326
*

src/iceberg.js

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import { asyncBufferFromUrl, parquetReadObjects } from 'hyparquet'
22
import { decompress as ZSTD } from 'fzstd'
3-
import {
4-
fetchAvroRecords, fetchDataFilesFromManifests, fetchIcebergMetadata, fetchSnapshotVersion, translateS3Url,
5-
} from './iceberg.fetch.js'
3+
import { fetchAvroRecords, fetchDataFilesFromManifests, translateS3Url } from './iceberg.fetch.js'
4+
import { fetchIcebergMetadata } from './iceberg.metadata.js'
5+
6+
export { fetchIcebergMetadata }
67

78
/**
89
* Returns manifest URLs for the current snapshot separated into data and delete manifests.
@@ -101,27 +102,22 @@ function equalityMatch(row, deletePredicate) {
101102
* @returns {Promise<Array<Record<string, any>>>} Array of data records.
102103
*/
103104
export async function icebergRead({ tableUrl, rowStart, rowEnd, metadataFileName }) {
104-
// Find the latest snapshot version.
105-
if (!metadataFileName) {
106-
const version = await fetchSnapshotVersion(tableUrl)
107-
metadataFileName = `v${version}.metadata.json`
108-
}
109-
// Fetch table metadata and validate key fields.
105+
// Fetch table metadata
110106
const metadata = await fetchIcebergMetadata(tableUrl, metadataFileName)
111107

112-
// Get manifest URLs for data and delete files.
108+
// Get manifest URLs for data and delete files
113109
const { dataManifestUrls, deleteManifestUrls } = await getManifestUrls(metadata)
114110
if (dataManifestUrls.length === 0) {
115111
throw new Error('No data manifest files found for current snapshot')
116112
}
117113

118-
// Read data file info from data manifests.
114+
// Read data file info from data manifests
119115
const dataFiles = await fetchDataFilesFromManifests(dataManifestUrls)
120116
if (dataFiles.length === 0) {
121117
throw new Error('No data files found in manifests (table may be empty)')
122118
}
123119

124-
// Read delete file info from delete manifests (if any).
120+
// Read delete file info from delete manifests (if any)
125121
const deleteFiles = deleteManifestUrls.length > 0
126122
? await fetchDataFilesFromManifests(deleteManifestUrls)
127123
: []

src/iceberg.metadata.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { translateS3Url } from './iceberg.fetch.js'
2+
3+
/**
4+
* Fetches the Iceberg table snapshot version from S3 using the version hint file.
5+
*
6+
* @param {string} tableUrl - Base S3 URL of the table (e.g. "s3://my-bucket/path/to/table")
7+
* @returns {Promise<number>} The snapshot version
8+
*/
9+
export function fetchSnapshotVersion(tableUrl) {
10+
const url = `${tableUrl}/metadata/version-hint.text`
11+
const safeUrl = translateS3Url(url)
12+
// TODO: If version-hint is not found, try listing or binary search.
13+
return fetch(safeUrl).then(res => res.text()).then(text => parseInt(text))
14+
}
15+
16+
/**
17+
* Fetches the Iceberg table metadata JSON from S3.
18+
* If metadataFileName is not privided, uses fetchSnapshotVersion to get the version hint.
19+
*
20+
* @import {IcebergMetadata} from './types.js'
21+
* @param {string} tableUrl - Base S3 URL of the table (e.g. "s3://my-bucket/path/to/table")
22+
* @param {string} [metadataFileName] - Name of the metadata JSON file
23+
* @returns {Promise<IcebergMetadata>} The table metadata as a JSON object
24+
*/
25+
export async function fetchIcebergMetadata(tableUrl, metadataFileName) {
26+
if (!metadataFileName) {
27+
const version = await fetchSnapshotVersion(tableUrl)
28+
metadataFileName = `v${version}.metadata.json`
29+
}
30+
const url = `${tableUrl}/metadata/${metadataFileName}`
31+
const safeUrl = translateS3Url(url)
32+
return fetch(safeUrl).then(res => res.json())
33+
}

src/types.d.ts

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ export interface IcebergMetadata {
77
'last-updated-ms': number
88
'last-column-id': number
99
'current-schema-id': number
10-
schemas: any[]
10+
schemas: Schema[]
1111
'default-spec-id': number
1212
'partition-specs': any[]
1313
'last-partition-id': number
1414
'default-sort-order-id': number
15-
'sort-orders': any[]
15+
'sort-orders': SortOrder[]
1616
properties: object
1717
'current-snapshot-id': number
1818
refs: object
@@ -22,7 +22,25 @@ export interface IcebergMetadata {
2222
'metadata-log': any[]
2323
}
2424

25-
interface Snapshot {
25+
export interface Schema {
26+
type: string
27+
"schema-id": number
28+
fields: Field[]
29+
}
30+
31+
interface Field {
32+
id: number
33+
name: string
34+
required: boolean
35+
type: string
36+
}
37+
38+
interface SortOrder {
39+
"order-id": number
40+
"fields": any[]
41+
}
42+
43+
export interface Snapshot {
2644
'sequence-number': number
2745
'snapshot-id': number
2846
'timestamp-ms': number
@@ -45,7 +63,7 @@ interface Snapshot {
4563
'schema-id': number
4664
}
4765

48-
interface Manifest {
66+
export interface Manifest {
4967
manifest_path: string
5068
manifest_length: bigint
5169
partition_spec_id: number

test/iceberg.fetch.test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, expect, it } from 'vitest'
2-
import { fetchSnapshotVersion } from '../src/iceberg.fetch.js'
2+
import { fetchSnapshotVersion } from '../src/iceberg.metadata.js'
33

44
describe('Iceberg fetch utils', () => {
55
const tableUrl = 'https://s3.amazonaws.com/hyperparam-iceberg/warehouse/bunnies'

0 commit comments

Comments
 (0)