Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
a6e9c9e
matching the edges
Jason-Benson Jan 5, 2026
6dafa4c
more dependencies
Jason-Benson Jan 5, 2026
ee3769b
more @typescript-eslint dependencies
Jason-Benson Jan 5, 2026
8950899
switch undefinded to null per DK
Jason-Benson Jan 5, 2026
ffe00e3
version matching
Jason-Benson Jan 6, 2026
81efecb
rolled biullmq back to 4
Jason-Benson Jan 6, 2026
bf00abf
Merge remote-tracking branch 'upstream/dev' into depend6
Jason-Benson Jan 6, 2026
aecec96
Fix tablesort nonsense.
demiankatz Jan 6, 2026
26122a8
version matching
Jason-Benson Jan 6, 2026
f7cf790
Merge remote-tracking branch 'jason/depend6' into depend7
Jason-Benson Jan 6, 2026
1915ce5
reupdated express
Jason-Benson Jan 6, 2026
f76c0a5
Downgrade tablesort.
demiankatz Jan 6, 2026
52f84ca
matching the edges
Jason-Benson Jan 5, 2026
f0678e5
more dependencies
Jason-Benson Jan 5, 2026
0aea737
more @typescript-eslint dependencies
Jason-Benson Jan 5, 2026
b010f0e
switch undefinded to null per DK
Jason-Benson Jan 5, 2026
6693d10
version matching
Jason-Benson Jan 6, 2026
0116923
rolled biullmq back to 4
Jason-Benson Jan 6, 2026
60ff6f9
Fix tablesort nonsense.
demiankatz Jan 6, 2026
bd5515c
version matching
Jason-Benson Jan 6, 2026
38ab536
reupdated express
Jason-Benson Jan 6, 2026
698fb76
Downgrade tablesort.
demiankatz Jan 6, 2026
82b040a
Modified file ingest to work around node 2gb limit, but not done
Jason-Benson Jan 13, 2026
76c78a3
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 13, 2026
5261c38
WiP
Jason-Benson Jan 13, 2026
eee9028
Deletes any preexisting metadata before attempting to write new data
Jason-Benson Jan 20, 2026
65f5602
Merge branch 'depend8' into 2gb_limit2
Jason-Benson Jan 20, 2026
d766da9
dependency matching
Jason-Benson Jan 20, 2026
8e0d752
cleaned up lint knitpicks
Jason-Benson Jan 20, 2026
4b432ae
removed unneeded logging
Jason-Benson Jan 20, 2026
c80aea4
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 20, 2026
1ff7bd5
Removed extra console logs
Jason-Benson Jan 21, 2026
d314a62
Added sha-512 to computed digests
Jason-Benson Jan 21, 2026
96f6a75
trimmed some unneeded bits
Jason-Benson Jan 21, 2026
336b33c
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 21, 2026
5510ca3
Modified file ingest to work around node 2gb limit, but not done
Jason-Benson Jan 13, 2026
1e24f81
reorganized and version matched dependencies
Jason-Benson Jan 23, 2026
28a7ffd
Merge commit '1e24f817af4dd720980d15b5038cd0b1d7ce378d' into 2gb_limit2
Jason-Benson Jan 23, 2026
5944a5f
fixed lint knitpick
Jason-Benson Jan 23, 2026
ca88b20
fixed package-lock
Jason-Benson Jan 23, 2026
981285f
fixed unit tests
Jason-Benson Jan 23, 2026
599fa28
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions api/src/jobs/Metadata.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,11 @@ describe("Metadata", () => {
let dataStream: string;
let contentFile;
beforeEach(() => {
contentFile = {
name: "test1",
};
contentFile = "test1";
dataStream = "test2";
fedoraObject = {
addMasterMetadataDatastream: jest.fn(),
getDatastreamAsBuffer: jest.fn(),
downloadDatastreamToTempFile: jest.fn(),
};
job = {
data: {
Expand All @@ -42,15 +40,13 @@ describe("Metadata", () => {
});

it("adds a master data stream", async () => {
fedoraObject.getDatastreamAsBuffer.mockResolvedValue(dataStream);
tmp.fileSync.mockReturnValue(contentFile);
fedoraObject.downloadDatastreamToTempFile.mockResolvedValue(contentFile);

const consoleSpy = jest.spyOn(console, "log").mockImplementation(jest.fn());
await metadata.run(job);
expect(consoleSpy).toHaveBeenCalledTimes(1);
expect(consoleSpy).toHaveBeenCalledWith("Adding metadata...", { pid: 123 });
expect(fs.writeFileSync).toHaveBeenCalledWith(contentFile.name, dataStream);
expect(fedoraObject.addMasterMetadataDatastream).toHaveBeenCalledWith(contentFile.name);
expect(fedoraObject.addMasterMetadataDatastream).toHaveBeenCalledWith(contentFile);
});
});
});
17 changes: 8 additions & 9 deletions api/src/jobs/Metadata.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { Job as QueueJob } from "bullmq";
import fs = require("fs");
import tmp = require("tmp");
import Config from "../models/Config";
import { FedoraObject } from "../models/FedoraObject";
import FedoraObjectFactory from "../services/FedoraObjectFactory";
Expand All @@ -23,15 +22,15 @@ class MetadataProcessor {

async addMasterMetadataDatastream(): Promise<void> {
const fedoraObject: FedoraObject = FedoraObject.build(this.pid, null, this.config);
const dataStream: Buffer = await fedoraObject.getDatastreamAsBuffer("MASTER");
const contentFile = tmp.fileSync();
fs.writeFileSync(contentFile.name, dataStream);
await fedoraObject.addMasterMetadataDatastream(contentFile.name);
fs.truncateSync(contentFile.name, 0);
fs.rmSync(contentFile.name);
// Stream the MASTER datastream directly to a temporary file to avoid
// buffering very large files into memory, then run FITS on that file.
const contentPath = await fedoraObject.downloadDatastreamToTempFile("MASTER");
await fedoraObject.addMasterMetadataDatastream(contentPath);
fs.truncateSync(contentPath, 0);
fs.rmSync(contentPath);
// FITS XML will have been generated in /tmp as a side-effect; clean it up:
fs.truncateSync(contentFile.name + ".fits.xml", 0);
fs.rmSync(contentFile.name + ".fits.xml");
fs.truncateSync(contentPath + ".fits.xml", 0);
fs.rmSync(contentPath + ".fits.xml");
}

async run(): Promise<void> {
Expand Down
43 changes: 42 additions & 1 deletion api/src/models/FedoraObject.ts
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import Config from "./Config";
import { DatastreamParameters, Fedora } from "../services/Fedora";
import FedoraDataCollector from "../services/FedoraDataCollector";
import { execSync } from "child_process";
import crypto = require("crypto");
import { Agent } from "../services/interfaces";

export interface ObjectParameters {
Expand Down Expand Up @@ -76,8 +77,34 @@ export class FedoraObject {
await this.fedora.deleteDatastreamTombstone(this.pid, stream);
}

async computeDigestHeaderForFile(filename: string): Promise<string> {
// Compute digest by streaming the file once (avoids loading the whole file into memory)
const md5Hash = crypto.createHash("md5");
const sha512Hash = crypto.createHash("sha512");
await new Promise<void>((resolve, reject) => {
const rs = fs.createReadStream(filename);
rs.on("data", (chunk: Buffer) => {
md5Hash.update(chunk);
sha512Hash.update(chunk);
});
rs.on("end", () => resolve());
rs.on("error", (err) => reject(err));
});
const md5 = md5Hash.digest("hex");
const sha512 = sha512Hash.digest("hex");
const digestHeader = `md5=${md5}, sha-512=${sha512}`;
return digestHeader;
Comment on lines +95 to +96
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could simplify to a direct return since the variable isn't really used for anything:

Suggested change
const digestHeader = `md5=${md5}, sha-512=${sha512}`;
return digestHeader;
return `md5=${md5}, sha-512=${sha512}`;

}

async addDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> {
await this.addDatastreamFromStringOrBuffer(fs.readFileSync(filename), stream, mimeType, [201]);
// Create a fresh read stream for the upload
const readStream = fs.createReadStream(filename);
const digestHeader = await this.computeDigestHeaderForFile(filename);
const params: DatastreamParameters = {
mimeType: mimeType,
logMessage: "Initial Ingest addDatastream - " + stream,
};
await this.fedora.addDatastream(this.pid, stream, params, readStream, [201], digestHeader);
}

async updateDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> {
Expand Down Expand Up @@ -106,6 +133,16 @@ export class FedoraObject {
logMessage: "Initial Ingest addDatastream - MASTER-MD",
};
const fitsXml = this.fitsMasterMetadata(filename);

// Check if MASTER-MD exists and delete it if it does
try {
const checkResponse = await this.fedora.getDatastream(this.pid, "MASTER-MD");
if (checkResponse.statusCode === 200) {
await this.deleteDatastream("MASTER-MD");
}
} catch (e) {
// No existing MASTER-MD to delete
}
await this.addDatastream("MASTER-MD", params, fitsXml, [201, 204]);
}

Expand Down Expand Up @@ -221,6 +258,10 @@ export class FedoraObject {
return this.fedora.getDatastreamAsBuffer(this.pid, datastream);
}

async downloadDatastreamToTempFile(datastream: string, treatMissingAsEmpty = false): Promise<string> {
return this.fedora.downloadDatastreamToTempFile(this.pid, datastream, treatMissingAsEmpty);
}

async getDatastreamMetadata(datastream: string): Promise<string> {
return await this.fedora.getRdf(`${this.pid}/${datastream}/fcr:metadata`);
}
Expand Down
2 changes: 1 addition & 1 deletion api/src/services/Fedora.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ describe("Fedora", () => {
const putSpy = jest.spyOn(fedora, "putDatastream").mockImplementation(jest.fn());
const patchSpy = jest.spyOn(fedora, "patchRdf").mockReturnValue({ statusCode: 204 });
await fedora.addDatastream(pid, "MASTER", { mimeType: "foo/bar" }, "content");
expect(putSpy).toHaveBeenCalledWith(pid, "MASTER", "foo/bar", [201], "content", "");
expect(putSpy).toHaveBeenCalledWith(pid, "MASTER", "foo/bar", [201], "content", "", "");
const expectedTurtle =
'<> <http://fedora.info/definitions/1/0/access/objState> "A";\n <http://purl.org/dc/terms/title> "test4_MASTER".\n';
expect(patchSpy).toHaveBeenCalledWith(`/${pid}/MASTER/fcr:metadata`, expectedTurtle);
Expand Down
103 changes: 95 additions & 8 deletions api/src/services/Fedora.ts
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import xmlescape = require("xml-escape");
import { HttpError } from "../models/HttpError";
import winston = require("winston");
import SolrCache from "./SolrCache";
import fs = require("fs");
import tmp = require("tmp");

export interface DatastreamParameters {
dsLabel?: string;
Expand Down Expand Up @@ -58,7 +60,7 @@ export class Fedora {
protected _request(
method = "get",
_path = "/",
data: string | Buffer = null,
data: string | Buffer | NodeJS.ReadableStream = null,
_options: Record<string, unknown> = {},
): Promise<NeedleResponse> {
const path = _path[0] == "/" ? _path.slice(1) : _path;
Expand All @@ -70,7 +72,11 @@ export class Fedora {
password: this.config.fedoraPassword,
};
const options = Object.assign({}, auth, _options);
return http(method, url, data, options);

return http(method, url, data, options).catch((err) => {
console.error(`Request failed for ${method.toUpperCase()} ${url}:`, err);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a number of console.error calls added to this file, and it looks like we have not console-logged errors in this layer in the past. I think it might make sense to remove them, since they seem to be associated with logic that is going to throw an exception anyway (which should lead to output elsewhere in the stack). If you think there's a strong reason to keep them around, though, I suspect they're harmless, since they should only show up in situations where things are going wrong.

throw err;
});
}

/**
Expand Down Expand Up @@ -201,6 +207,65 @@ export class Fedora {
);
}

/**
* Download a datastream directly to a temporary file to avoid buffering
* large files into memory.
*
* @param pid Record id
* @param datastream Which stream to request
* @param treatMissingAsEmpty If true, return empty temp file on 404
*/
async downloadDatastreamToTempFile(pid: string, datastream: string, treatMissingAsEmpty = false): Promise<string> {
const url = this.config.restBaseUrl + "/" + pid + "/" + datastream;
const auth = {
username: this.config.fedoraUsername,
password: this.config.fedoraPassword,
};

return new Promise((resolve, reject) => {
const tmpobj = tmp.fileSync();
const writeStream = fs.createWriteStream(tmpobj.name);

const req = http.get(url, auth);

req.on("response", (res) => {
if (res.statusCode === 200) {
req.pipe(writeStream);
writeStream.on("finish", () => {
resolve(tmpobj.name);
});
writeStream.on("error", (err) => {
try {
fs.unlinkSync(tmpobj.name);
} catch (e) {
console.error(e);
}
reject(err);
});
} else if (res.statusCode === 404 && treatMissingAsEmpty) {
// create empty file and return its path
writeStream.end(() => resolve(tmpobj.name));
} else {
try {
fs.unlinkSync(tmpobj.name);
} catch (e) {
console.error(e);
}
reject(new Error("Unexpected response for " + pid + "/" + datastream + ": " + res.statusCode));
}
});

req.on("error", (err) => {
try {
fs.unlinkSync(tmpobj.name);
} catch (e) {
console.error(e);
}
reject(err);
});
});
}

/**
* Get DC datastream from Fedora
*
Expand Down Expand Up @@ -264,18 +329,31 @@ export class Fedora {
stream: string,
mimeType: string,
expectedStatus = [201],
data: string | Buffer,
data: string | Buffer | NodeJS.ReadableStream,
linkHeader = "",
precomputedDigest = "",
): Promise<void> {
this.cache.purgeFromCacheIfEnabled(pid);
const md5 = crypto.createHash("md5").update(data).digest("hex");
const sha = crypto.createHash("sha512").update(data).digest("hex");
const headers: Record<string, string> = {
"Overwrite-Tombstone": "true",
"Content-Disposition": 'attachment; filename="' + stream + '"',
"Content-Type": mimeType,
Digest: "md5=" + md5 + ", sha-512=" + sha,
};

// If caller supplied a precomputed digest (for streaming upload), use it.
if (precomputedDigest && precomputedDigest.length > 0) {
headers.Digest = precomputedDigest;
} else {
// For string/Buffer payloads, compute digests here.
if (typeof data === "string" || Buffer.isBuffer(data)) {
const md5 = crypto.createHash("md5").update(data).digest("hex");
const sha = crypto.createHash("sha512").update(data).digest("hex");
headers.Digest = "md5=" + md5 + ", sha-512=" + sha;
} else {
// No precomputed digest and data is a stream — cannot compute here.
throw new Error("Streaming data requires a precomputed digest header to be provided");
}
}
const options = { headers: headers };
if (linkHeader.length > 0) {
options.headers.Link = linkHeader;
Expand Down Expand Up @@ -306,13 +384,22 @@ export class Fedora {
pid: string,
stream: string,
params: DatastreamParameters,
data: string | Buffer,
data: string | Buffer | NodeJS.ReadableStream,
expectedStatus = [201],
precomputedDigest = "",
): Promise<void> {
this.cache.purgeFromCacheIfEnabled(pid);

// First create the stream:
await this.putDatastream(pid, stream, params.mimeType, expectedStatus, data, params.linkHeader ?? "");
await this.putDatastream(
pid,
stream,
params.mimeType,
expectedStatus,
data,
params.linkHeader ?? "",
precomputedDigest,
);

// Now set appropriate metadata:
const writer = new N3.Writer({ format: "text/turtle" });
Expand Down