From fcc6bc5f86ec39a834494a27794e90e6f7d3d10c Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Thu, 9 Apr 2026 12:58:42 +0200 Subject: [PATCH 01/20] Workflow execution: Instantiate the node executors at the beginning of the execution. --- .../execution/ExecutorRegistry.scala | 18 +++++++ .../workflow/LocalWorkflowExecutor.scala | 49 +++++++------------ .../activity/workflow/WorkflowExecutor.scala | 20 +++++++- 3 files changed, 55 insertions(+), 32 deletions(-) diff --git a/silk-core/src/main/scala/org/silkframework/execution/ExecutorRegistry.scala b/silk-core/src/main/scala/org/silkframework/execution/ExecutorRegistry.scala index 1ba6d518d5..d968f49494 100644 --- a/silk-core/src/main/scala/org/silkframework/execution/ExecutorRegistry.scala +++ b/silk-core/src/main/scala/org/silkframework/execution/ExecutorRegistry.scala @@ -137,6 +137,18 @@ object ExecutorRegistry extends ExecutorRegistry { context: ActivityContext[ExecutionReport] = new ActivityMonitor(getClass.getSimpleName) )(implicit pluginContext: PluginContext): Option[ExecType#DataType] = { val exec = executor(task.data, execution) + executeWith(exec, task, inputs, output, execution, context) + } + + /** Execute with a pre-instantiated executor, skipping executor lookup. */ + def executeWith[TaskType <: TaskSpec, ExecType <: ExecutionType]( + exec: Executor[TaskType, ExecType], + task: Task[TaskType], + inputs: Seq[ExecType#DataType], + output: ExecutorOutput, + execution: ExecType, + context: ActivityContext[ExecutionReport] + )(implicit pluginContext: PluginContext): Option[ExecType#DataType] = { context.status.update(Status.Running("Running", None), logStatus = false) val startTime = System.currentTimeMillis() val result = exec.execute(task, inputs, output, execution, context) @@ -144,6 +156,12 @@ object ExecutorRegistry extends ExecutorRegistry { result } + /** Instantiates the executor for a given task and execution type without executing it. 
*/ + def instantiateExecutor[TaskType <: TaskSpec, ExecType <: ExecutionType]( + task: TaskType, + execution: ExecType + ): Executor[TaskType, ExecType] = executor(task, execution) + /** Fetch the execution specific access to a dataset for the configured execution.*/ def access[DatasetType <: Dataset](task: Task[DatasetSpec[DatasetType]]): DatasetAccess = { diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala index 00becbbea3..57b71eca07 100644 --- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala +++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala @@ -5,7 +5,7 @@ import org.silkframework.config._ import org.silkframework.dataset.DatasetSpec.GenericDatasetSpec import org.silkframework.dataset._ import org.silkframework.execution.local.{ErrorOutputWriter, LocalEntities, LocalExecution} -import org.silkframework.execution.{DatasetExecutor, EntityHolder, ExecutorOutput} +import org.silkframework.execution.{DatasetExecutor, EntityHolder, ExecutorOutput, ExecutorRegistry} import org.silkframework.plugins.dataset.InternalDataset import org.silkframework.rule.TransformSpec import org.silkframework.runtime.activity.{ActivityContext, UserContext} @@ -89,6 +89,7 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], checkReadOnlyDatasets() checkVariableDatasets() + initializeExecutors() if(clearDatasets) { clearOutputDatasets() } @@ -116,6 +117,23 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], } } + private def initializeExecutors()(implicit workflowRunContext: WorkflowRunContext): Unit = { + implicit val userContext: UserContext = workflowRunContext.userContext + for (node <- workflowNodes) { + val taskOpt: Option[Task[_ <: TaskSpec]] = node match { + case datasetNode: WorkflowDataset => + project.taskOption[GenericDatasetSpec](datasetNode.task).map { dt => + resolveDataset(dt, replaceDataSources ++ replaceSinks) + } + case operatorNode: WorkflowOperator => + project.anyTaskOption(operatorNode.task) + } + for (t <- taskOpt) { + workflowRunContext.nodeExecutors.put(node.nodeId, ExecutorRegistry.instantiateExecutor(t.data, executionContext)) + } + } + } + private def clearOutputDatasets()(implicit workflowRunContext: WorkflowRunContext): Unit = { implicit val userContext: UserContext = workflowRunContext.userContext // Clear all internal datasets and input datasets that are configured so @@ -415,35 +433,6 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], } } - /** NOT USED ANYMORE, only here for documentation reasons, should be deleted after everything in here is supported. */ - def executeOperator(operator: WorkflowNode) - (implicit workflowRunContext: WorkflowRunContext): Unit = { - // Get the error sinks for this operator - val errorOutputs = operator match { - case wo: WorkflowOperator => wo.errorOutputs.map(project.anyTask(_)(workflowRunContext.userContext)) - case _ => Seq() - } - var errorSinks: Seq[DatasetWriteAccess] = errorOutputSinks(errorOutputs) - - - if (errorOutputs.exists(!_.data.isInstanceOf[Dataset])) { - // TODO: Needs proper graph - // TODO: How to handle error output in new model? 
-      errorSinks +:= InternalDataset(null)
-    }
-
-    //    val activity = taskExecutor(dataSources, taskData, sinks, errorSinks)
-    //    val report = activityContext.child(activity, 0.0).startBlockingAndGetValue()
-    //    activityContext.value() = activityContext.value().withReport(operator.id, report)
-  }
-
-  private def errorOutputSinks(errorOutputs: Seq[ProjectTask[_ <: TaskSpec]]): Seq[DatasetWriteAccess] = {
-    errorOutputs.collect {
-      case pt: ProjectTask[_] if pt.data.isInstanceOf[Dataset] =>
-        pt.data.asInstanceOf[Dataset]
-    }
-  }
-
   /**
    * Returns the dataset that should be used in the workflow. Specifically [[VariableDataset]]
    * and [[InternalDataset]] need to be replaced by the corresponding real dataset.
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
index 5a9ba1c563..7b1aa9967e 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
@@ -57,7 +57,12 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu
     updateProgress(operation, task)
     val result = try {
-      ExecutorRegistry.execute(task, inputs, output, executionContext, taskContext)
+      workflowRunContext.nodeExecutors.get(nodeId) match {
+        case Some(exec) =>
+          ExecutorRegistry.executeWith(exec.asInstanceOf[Executor[TaskType, ExecType]], task, inputs, output, executionContext, taskContext)
+        case None =>
+          ExecutorRegistry.execute(task, inputs, output, executionContext, taskContext)
+      }
     } catch {
       case NonFatal(ex) =>
         workflowRunContext.activityContext.value.updateWith(_.addFailedNode(nodeId, ex))
@@ -197,11 +202,22 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu
   }
 }

+/**
+  * A context for a single workflow execution.
+  *
+  * @param activityContext The activity context for the workflow execution.
+  * @param workflow The workflow that is being executed.
+  * @param userContext The user that is executing the workflow.
+  * @param alreadyExecuted The workflow nodes that have already been executed.
+  * @param reconfiguredTasks The tasks that have already been reconfigured.
+  * @param nodeExecutors The node executors for each workflow node by node id.
+  */
 case class WorkflowRunContext(activityContext: ActivityContext[WorkflowExecutionReport],
                               workflow: Workflow,
                               userContext: UserContext,
                               alreadyExecuted: mutable.Set[WorkflowNode] = mutable.Set(),
-                              reconfiguredTasks: mutable.Map[WorkflowNode, Task[_ <: TaskSpec]] = mutable.Map()) {
+                              reconfiguredTasks: mutable.Map[WorkflowNode, Task[_ <: TaskSpec]] = mutable.Map(),
+                              nodeExecutors: mutable.Map[Identifier, Executor[_, _]] = mutable.Map()) {
   /**
     * Listeners for updates to task reports.
     * We need to hold them to prevent their garbage collection.
From 6232ecdeb5dd4c6b2320b569aed37877cc42b564 Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Thu, 9 Apr 2026 13:16:10 +0200
Subject: [PATCH 02/20] Add close method to Executor trait that allows
 operators to clean up after workflow execution.
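
Executors that hold per-run state can release it by overriding close(). A
minimal sketch of the intended usage (MyTaskSpec and the cache are
illustrative, not part of this change):

    class MyTaskExecutor extends Executor[MyTaskSpec, LocalExecution] {

      // Per-run state, kept for the duration of one workflow execution.
      private val cache = scala.collection.mutable.ArrayBuffer[LocalEntities]()

      override def execute(task: Task[MyTaskSpec],
                           inputs: Seq[LocalEntities],
                           output: ExecutorOutput,
                           execution: LocalExecution,
                           context: ActivityContext[ExecutionReport])
                          (implicit pluginContext: PluginContext): Option[LocalEntities] = {
        cache ++= inputs // retain inputs for later nodes in the same run
        None
      }

      // Invoked by the workflow executor once the workflow execution has finished.
      override def close(): Unit = {
        cache.clear()
      }
    }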
---
 .../main/scala/org/silkframework/execution/Executor.scala | 7 ++++++-
 .../activity/workflow/LocalWorkflowExecutor.scala         | 8 ++++++++
 .../workspace/activity/workflow/WorkflowExecutor.scala    | 2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/silk-core/src/main/scala/org/silkframework/execution/Executor.scala b/silk-core/src/main/scala/org/silkframework/execution/Executor.scala
index 90c732f88c..7c3abab856 100644
--- a/silk-core/src/main/scala/org/silkframework/execution/Executor.scala
+++ b/silk-core/src/main/scala/org/silkframework/execution/Executor.scala
@@ -13,12 +13,17 @@ import org.silkframework.runtime.plugin.{AnyPlugin, PluginContext}
  * @tparam ExecType The execution type, e.g., SparkExecution
  */
 @PluginType()
-trait Executor[TaskType <: TaskSpec, ExecType <: ExecutionType] extends AnyPlugin {
+trait Executor[TaskType <: TaskSpec, ExecType <: ExecutionType] extends AnyPlugin with java.io.Closeable {

   def execute(task: Task[TaskType],
               inputs: Seq[ExecType#DataType],
               output: ExecutorOutput,
               execution: ExecType,
               context: ActivityContext[ExecutionReport] = new ActivityMonitor(getClass.getSimpleName))
              (implicit pluginContext: PluginContext): Option[ExecType#DataType]

+  /**
+    * Called after the (workflow) execution has finished.
+    */
+  override def close(): Unit = {}
+
 }

 /**
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
index 57b71eca07..8e2973ced4 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
@@ -114,6 +114,14 @@
     } finally {
       context.value.updateWith(_.asDone())
       this.executionContext.executeShutdownHooks()
+      workflowRunContext.nodeExecutors.foreach { case (nodeId, exec) =>
+        try {
+          exec.close()
+        } catch {
+          case NonFatal(ex) =>
+            log.log(Level.WARNING, s"Exception while closing executor for node '$nodeId'.", ex)
+        }
+      }
     }
   }

diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
index 7b1aa9967e..24863f388f 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
@@ -61,7 +61,7 @@
         case Some(exec) =>
           ExecutorRegistry.executeWith(exec.asInstanceOf[Executor[TaskType, ExecType]], task, inputs, output, executionContext, taskContext)
         case None =>
-          ExecutorRegistry.execute(task, inputs, output, executionContext, taskContext)
+          throw WorkflowExecutionException(s"No executor found for node '$nodeId'. 
This is a bug: executors should have been initialized before execution.") } } catch { case NonFatal(ex) => From 86cbcce91585ac7204bf5d872b8acf6c19c69537 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Thu, 9 Apr 2026 14:25:40 +0200 Subject: [PATCH 03/20] Add InWorkflowDataset --- .../plugins/dataset/rdf/RdfPlugins.scala | 6 +- .../rdf/datasets/InMemoryDataset.scala | 4 ++ .../rdf/datasets/InWorkflowDataset.scala | 45 ++++++++++++++ .../datasets/InWorkflowDatasetExecutor.scala | 28 +++++++++ .../rdf/datasets/InWorkflowDatasetTest.scala | 62 +++++++++++++++++++ 5 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala create mode 100644 silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala create mode 100644 silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala index 9342e83304..36e2e88c87 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala @@ -1,6 +1,6 @@ package org.silkframework.plugins.dataset.rdf -import org.silkframework.plugins.dataset.rdf.datasets.{AlignmentDataset, InMemoryDataset, RdfFileDataset, SparqlDataset} +import org.silkframework.plugins.dataset.rdf.datasets.{AlignmentDataset, InMemoryDataset, InWorkflowDataset, InWorkflowDatasetExecutor, RdfFileDataset, SparqlDataset} import org.silkframework.plugins.dataset.rdf.executors.{LocalSparqlCopyExecutor, LocalSparqlSelectExecutor, LocalSparqlUpdateExecutor} import org.silkframework.plugins.dataset.rdf.tasks.{SparqlCopyCustomTask, SparqlSelectCustomTask, SparqlUpdateCustomTask} import org.silkframework.plugins.dataset.rdf.vocab.{InMemoryVocabularyManager, RdfFilesVocabularyManager, RdfProjectFilesVocabularyManager, RdfVocabularyManager} @@ -14,6 +14,7 @@ class RdfPlugins extends PluginModule { classOf[SparqlDataset], classOf[AlignmentDataset], classOf[InMemoryDataset], + classOf[InWorkflowDataset], classOf[RdfVocabularyManager], classOf[RdfFilesVocabularyManager], classOf[RdfProjectFilesVocabularyManager], @@ -26,7 +27,8 @@ class RdfPlugins extends PluginModule { val executors = Seq( classOf[LocalSparqlSelectExecutor], classOf[LocalSparqlUpdateExecutor], - classOf[LocalSparqlCopyExecutor] + classOf[LocalSparqlCopyExecutor], + classOf[InWorkflowDatasetExecutor] ) } diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala index bb4a987335..44f7f7c7c5 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala @@ -15,6 +15,10 @@ import org.silkframework.runtime.plugin.annotations.{Param, Plugin, PluginRefere description = "A Dataset that holds all data in-memory.", documentationFile = "InMemoryDataset.md", relatedPlugins = Array( + 
+    new PluginReference(
+      id = InWorkflowDataset.pluginId,
+      description = "Both datasets hold data in-memory, but the in-workflow dataset is scoped to a single workflow execution and cleared afterwards, while the in-memory dataset persists for the lifetime of the application."
+    ),
     new PluginReference(
       id = SparqlDataset.pluginId,
       description = "Data in the in-memory dataset does not persist beyond the running process. The SPARQL endpoint dataset connects to an external store that persists independently, which means switching between them changes not just where the data lives but whether it survives execution at all."
diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala
new file mode 100644
index 0000000000..674e278d77
--- /dev/null
+++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala
@@ -0,0 +1,45 @@
+package org.silkframework.plugins.dataset.rdf.datasets
+
+import org.apache.jena.rdf.model.ModelFactory
+import org.silkframework.dataset._
+import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint, SparqlParams}
+import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource}
+import org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint
+import org.silkframework.runtime.activity.UserContext
+import org.silkframework.runtime.plugin.annotations.{Plugin, PluginReference}
+
+@Plugin(
+  id = InWorkflowDataset.pluginId,
+  label = "In-workflow dataset",
+  categories = Array(DatasetCategories.embedded),
+  description = "A Dataset that holds all data in-memory for the duration of a single workflow execution. " +
+    "The data is stored separately for each workflow execution. " +
+    "The data is cleared once the workflow execution has finished.",
+  relatedPlugins = Array(
+    new PluginReference(
+      id = InMemoryDataset.pluginId,
+      description = "Both datasets hold data in-memory, but the in-memory dataset persists for the lifetime of the running process, " +
+        "while the in-workflow dataset is scoped to a single workflow execution and cleared afterwards."
+    )
+  )
+)
+case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset {
+
+  // Empty placeholder model. Actual data is held in the executor (InWorkflowDatasetExecutor).
+  // Framework code that bypasses access() and reads sparqlEndpoint directly will see empty results. 
+ private val emptyModel = ModelFactory.createDefaultModel() + + override val sparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(emptyModel) + + override def source(implicit userContext: UserContext): DataSource = new SparqlSource(SparqlParams(), sparqlEndpoint) + + override def entitySink(implicit userContext: UserContext): EntitySink = new SparqlSink(SparqlParams(), sparqlEndpoint) + + override def linkSink(implicit userContext: UserContext): LinkSink = new SparqlSink(SparqlParams(), sparqlEndpoint) + + override def tripleSink(implicit userContext: UserContext): TripleSink = new SparqlSink(SparqlParams(), sparqlEndpoint) +} + +object InWorkflowDataset { + final val pluginId = "inWorkflow" +} diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala new file mode 100644 index 0000000000..aba631c9a2 --- /dev/null +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -0,0 +1,28 @@ +package org.silkframework.plugins.dataset.rdf.datasets + +import org.apache.jena.rdf.model.{Model, ModelFactory} +import org.silkframework.config.Task +import org.silkframework.dataset.{DatasetAccess, DatasetSpec} +import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} + +/** + * Executor for [[InWorkflowDataset]]. + * + * Holds the actual Jena model for the duration of a workflow execution. + * Overrides access() to expose the executor-owned model to the framework, + * ensuring each workflow execution has its own isolated data. + * Clears the model in close() once the workflow execution has finished. + */ +class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { + + private val model: Model = ModelFactory.createDefaultModel() + + // JenaModelDataset wraps the model and provides source/entitySink/linkSink backed by it. 
+  private val modelDataset: JenaModelDataset = JenaModelDataset(model)
+
+  override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = modelDataset
+
+  override def close(): Unit = {
+    model.removeAll()
+  }
+}
diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
new file mode 100644
index 0000000000..fa49e6a5f3
--- /dev/null
+++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
@@ -0,0 +1,62 @@
+package org.silkframework.plugins.dataset.rdf.datasets
+
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.must.Matchers
+import org.silkframework.config.{PlainTask, Prefixes}
+import org.silkframework.dataset.DatasetSpec
+import org.silkframework.dataset.rdf.RdfDataset
+import org.silkframework.execution.local.LocalExecution
+import org.silkframework.runtime.activity.UserContext
+
+class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
+
+  private implicit val userContext: UserContext = UserContext.Empty
+  private implicit val prefixes: Prefixes = Prefixes.empty
+
+  private val dataset = InWorkflowDataset()
+  private val task = PlainTask("test", DatasetSpec(dataset))
+  private val execution = LocalExecution()
+
+  private val tripleCountQuery = "SELECT * WHERE {?s ?p ?o}"
+
+  behavior of "InWorkflowDataset"
+
+  it should "store data in the executor, not in the dataset itself" in {
+    val executor = new InWorkflowDatasetExecutor()
+    val executorEndpoint = executor.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+
+    executorEndpoint.update("INSERT DATA { <http://s> <http://p> <http://o> }")
+
+    // The executor's model contains the written data
+    executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
+    // The dataset's own placeholder endpoint is not affected
+    dataset.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 0
+  }
+
+  it should "clear all data when close() is called" in {
+    val executor = new InWorkflowDatasetExecutor()
+    val executorEndpoint = executor.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+
+    executorEndpoint.update("INSERT DATA { <http://s> <http://p> <http://o> }")
+    executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
+
+    executor.close()
+
+    executorEndpoint.select(tripleCountQuery).bindings.size mustBe 0
+  }
+
+  it should "isolate data between concurrent executions" in {
+    val executor1 = new InWorkflowDatasetExecutor()
+    val executor2 = new InWorkflowDatasetExecutor()
+    val endpoint1 = executor1.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val endpoint2 = executor2.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+
+    endpoint1.update("INSERT DATA { <http://s> <http://p> <http://o1> }")
+    endpoint2.update("INSERT DATA { <http://s> <http://p> <http://o1> }")
+    endpoint2.update("INSERT DATA { <http://s> <http://p> <http://o2> }")
+
+    // Each executor only sees data from its own model
+    endpoint1.select(tripleCountQuery).bindings.size mustBe 1
+    endpoint2.select(tripleCountQuery).bindings.size mustBe 2
+  }
+}
From ccc01ee6ac21340a4cd15be0938c024c719e4cb1 Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Thu, 9 Apr 2026 15:20:13 +0200
Subject: [PATCH 04/20] Add integration test for InWorkflowDataset

---
 .../InWorkflowDatasetIntegrationTest.scala    | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 
silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala new file mode 100644 index 0000000000..54e63ae6ab --- /dev/null +++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala @@ -0,0 +1,145 @@ +package org.silkframework.plugins.dataset.rdf.datasets + +import org.apache.jena.rdf.model.{Model, ModelFactory} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers +import org.silkframework.config.{ConfigTest, MetaData, Prefixes} +import org.silkframework.dataset.DatasetSpec +import org.silkframework.entity.paths.UntypedPath +import org.silkframework.plugins.dataset.rdf.tasks.SparqlCopyCustomTask +import org.silkframework.rule._ +import org.silkframework.runtime.activity.{ActivityMonitor, UserContext} +import org.silkframework.util.{ConfigTestTrait, Uri} +import org.silkframework.workspace.activity.workflow.{LocalWorkflowExecutorGeneratingProvenance, Workflow, WorkflowDataset, WorkflowExecutionReportWithProvenance, WorkflowOperator} +import org.silkframework.workspace.{InMemoryWorkspaceTestTrait, ProjectConfig, WorkspaceFactory} + +/** + * Integration test for [[InWorkflowDataset]] within a real workflow execution. + * + * Tests that: + * - Two InWorkflowDataset instances are fully isolated from each other. + * - Multiple uses of the same InWorkflowDataset instance within one workflow + * execution all see the same data. + * + * Workflow structure: + * source1 → copyToInWorkflow1 → inWorkflow1 → readFromInWorkflow1A → output1A + * └──────→ readFromInWorkflow1B → output1B + * source2 → copyToInWorkflow2 → inWorkflow2 → readFromInWorkflow2A → output2A + * └──────→ readFromInWorkflow2B → output2B + * + * Writing to each InWorkflowDataset goes through SparqlCopyCustomTask (QuadEntitySchema → + * withEntitySink → access() → executor model). Reading goes through TransformSpec + * (FixedSchemaPort(MultiEntitySchema) → handleMultiEntitySchema → access().source → + * executor model), which is the only read path that correctly reaches the executor model. + */ +class InWorkflowDatasetIntegrationTest extends AnyFlatSpec with Matchers with ConfigTestTrait { + + implicit val userContext: UserContext = UserContext.Empty + implicit val prefixes: Prefixes = Prefixes.empty + + override def propertyMap: Map[String, Option[String]] = Map( + "workspace.provider.plugin" -> Some("inMemoryWorkspaceProvider") + ) + + "InWorkflowDataset" should "isolate data between two instances and share data across multiple uses of the same instance within a workflow" in { + val workspace = WorkspaceFactory().workspace + val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("inWorkflowIntegrationTest")))) + + // Source datasets pre-populated with distinct triples. 
+ val source1Model: Model = ModelFactory.createDefaultModel() + source1Model.createResource("http://s1") + .addProperty(source1Model.createProperty("http://p"), source1Model.createResource("http://o1")) + + val source2Model: Model = ModelFactory.createDefaultModel() + source2Model.createResource("http://s2") + .addProperty(source2Model.createProperty("http://p"), source2Model.createResource("http://o2")) + + // Output datasets — empty initially, filled by the workflow. + val output1AModel: Model = ModelFactory.createDefaultModel() + val output1BModel: Model = ModelFactory.createDefaultModel() + val output2AModel: Model = ModelFactory.createDefaultModel() + val output2BModel: Model = ModelFactory.createDefaultModel() + + // Register dataset tasks. + project.addTask("source1", DatasetSpec(JenaModelDataset(source1Model))) + project.addTask("source2", DatasetSpec(JenaModelDataset(source2Model))) + project.addTask("inWorkflow1", DatasetSpec(InWorkflowDataset())) + project.addTask("inWorkflow2", DatasetSpec(InWorkflowDataset())) + project.addTask("output1A", DatasetSpec(JenaModelDataset(output1AModel))) + project.addTask("output1B", DatasetSpec(JenaModelDataset(output1BModel))) + project.addTask("output2A", DatasetSpec(JenaModelDataset(output2AModel))) + project.addTask("output2B", DatasetSpec(JenaModelDataset(output2BModel))) + + // SparqlCopyCustomTask: reads via SparqlEndpointEntitySchema, outputs QuadEntitySchema. + // Quads are written to InWorkflowDataset via withEntitySink → access() → executor model. + val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" + project.addTask("copyToInWorkflow1", SparqlCopyCustomTask(copyQuery, tempFile = false)) + project.addTask("copyToInWorkflow2", SparqlCopyCustomTask(copyQuery, tempFile = false)) + + // TransformSpec: reads via FixedSchemaPort(MultiEntitySchema) → handleMultiEntitySchema + // → access().source → executor model. Identity mapping preserves the property. + val identityTransform = TransformSpec( + selection = DatasetSelection("dummy", Uri("")), + mappingRule = RootMappingRule(MappingRules( + propertyRules = Seq( + DirectMapping( + id = "pmap", + sourcePath = UntypedPath(Uri("http://p")), + mappingTarget = MappingTarget(Uri("http://p")) + ) + ) + )) + ) + project.addTask("readFromInWorkflow1A", identityTransform) + project.addTask("readFromInWorkflow1B", identityTransform) + project.addTask("readFromInWorkflow2A", identityTransform) + project.addTask("readFromInWorkflow2B", identityTransform) + + // Build the workflow. + // inWorkflow1 is written to once (by copyToInWorkflow1) and then read twice + // (by readFromInWorkflow1A and readFromInWorkflow1B), exercising the + // alreadyExecuted / multiple-reads behaviour. 
+    val workflow = Workflow(
+      operators = Seq(
+        WorkflowOperator(Seq(Some("source1")), "copyToInWorkflow1", Seq("inWorkflow1"), Seq.empty, (0, 0), "copyToInWorkflow1", None, Seq.empty, Seq.empty),
+        WorkflowOperator(Seq(Some("source2")), "copyToInWorkflow2", Seq("inWorkflow2"), Seq.empty, (0, 300), "copyToInWorkflow2", None, Seq.empty, Seq.empty),
+        WorkflowOperator(Seq(Some("inWorkflow1")), "readFromInWorkflow1A", Seq("output1A"), Seq.empty, (200, 0), "readFromInWorkflow1A", None, Seq.empty, Seq.empty),
+        WorkflowOperator(Seq(Some("inWorkflow1")), "readFromInWorkflow1B", Seq("output1B"), Seq.empty, (200, 100), "readFromInWorkflow1B", None, Seq.empty, Seq.empty),
+        WorkflowOperator(Seq(Some("inWorkflow2")), "readFromInWorkflow2A", Seq("output2A"), Seq.empty, (200, 300), "readFromInWorkflow2A", None, Seq.empty, Seq.empty),
+        WorkflowOperator(Seq(Some("inWorkflow2")), "readFromInWorkflow2B", Seq("output2B"), Seq.empty, (200, 400), "readFromInWorkflow2B", None, Seq.empty, Seq.empty)
+      ),
+      datasets = Seq(
+        WorkflowDataset(Seq.empty, "source1", Seq("copyToInWorkflow1"), (0, 0), "source1", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq.empty, "source2", Seq("copyToInWorkflow2"), (0, 300), "source2", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("copyToInWorkflow1")), "inWorkflow1", Seq("readFromInWorkflow1A", "readFromInWorkflow1B"), (100, 0), "inWorkflow1", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("copyToInWorkflow2")), "inWorkflow2", Seq("readFromInWorkflow2A", "readFromInWorkflow2B"), (100, 300), "inWorkflow2", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("readFromInWorkflow1A")), "output1A", Seq.empty, (300, 0), "output1A", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("readFromInWorkflow1B")), "output1B", Seq.empty, (300, 100), "output1B", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("readFromInWorkflow2A")), "output2A", Seq.empty, (300, 300), "output2A", None, Seq.empty, Seq.empty),
+        WorkflowDataset(Seq(Some("readFromInWorkflow2B")), "output2B", Seq.empty, (300, 400), "output2B", None, Seq.empty, Seq.empty)
+      )
+    )
+    project.addTask("workflow", workflow)
+    val workflowTask = project.task[Workflow]("workflow")
+
+    // Execute the workflow.
+    val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask)
+    val monitor = new ActivityMonitor("monitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty))
+    executor.run(monitor)
+
+    // Each InWorkflowDataset instance received data and fed its downstream operators.
+    output1AModel.size() must be > 0L
+    output1BModel.size() must be > 0L
+    output2AModel.size() must be > 0L
+    output2BModel.size() must be > 0L
+
+    // Multiple uses of the same instance: both reads of inWorkflow1 see identical data.
+    output1AModel.isIsomorphicWith(output1BModel) mustBe true
+
+    // Multiple uses of the same instance: both reads of inWorkflow2 see identical data.
+    output2AModel.isIsomorphicWith(output2BModel) mustBe true
+
+    // Isolation: inWorkflow1 (source1 data) and inWorkflow2 (source2 data) are separate.
+    output1AModel.isIsomorphicWith(output2AModel) mustBe false
+  }
+}
From 8128921693211709a7cb10a57ac37ad5c1eca8ea Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Thu, 9 Apr 2026 15:46:38 +0200
Subject: [PATCH 05/20] SparkWorkflowExecutor should also create executors for
 each execution.
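
With executor instantiation moved into the shared WorkflowExecutor trait,
every execution backend obtains pre-instantiated node executors the same
way. Roughly how an implementation uses it (simplified from the
LocalWorkflowExecutor change below):

    private def runWorkflow(implicit context: ActivityContext[WorkflowExecutionReport],
                            userContext: UserContext): Unit = {
      // Instantiates one executor per workflow node up front and stores it
      // in workflowRunContext.nodeExecutors, keyed by node id.
      implicit val workflowRunContext: WorkflowRunContext = createRunContext
      // ... execute the workflow nodes; each node runs via its
      // pre-instantiated executor instead of a per-call registry lookup ...
    }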
--- .../workflow/LocalWorkflowExecutor.scala | 52 +--------------- .../activity/workflow/WorkflowExecutor.scala | 61 ++++++++++++++++++- 2 files changed, 61 insertions(+), 52 deletions(-) diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala index 8e2973ced4..8d5d3429b8 100644 --- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala +++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala @@ -5,7 +5,7 @@ import org.silkframework.config._ import org.silkframework.dataset.DatasetSpec.GenericDatasetSpec import org.silkframework.dataset._ import org.silkframework.execution.local.{ErrorOutputWriter, LocalEntities, LocalExecution} -import org.silkframework.execution.{DatasetExecutor, EntityHolder, ExecutorOutput, ExecutorRegistry} +import org.silkframework.execution.{DatasetExecutor, EntityHolder, ExecutorOutput} import org.silkframework.plugins.dataset.InternalDataset import org.silkframework.rule.TransformSpec import org.silkframework.runtime.activity.{ActivityContext, UserContext} @@ -81,15 +81,10 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], } private def runWorkflow(implicit context: ActivityContext[WorkflowExecutionReport], userContext: UserContext): Unit = { - implicit val workflowRunContext: WorkflowRunContext = WorkflowRunContext( - activityContext = context, - workflow = currentWorkflow, - userContext = userContext - ) + implicit val workflowRunContext: WorkflowRunContext = createRunContext checkReadOnlyDatasets() checkVariableDatasets() - initializeExecutors() if(clearDatasets) { clearOutputDatasets() } @@ -125,23 +120,6 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], } } - private def initializeExecutors()(implicit workflowRunContext: WorkflowRunContext): Unit = { - implicit val userContext: UserContext = workflowRunContext.userContext - for (node <- workflowNodes) { - val taskOpt: Option[Task[_ <: TaskSpec]] = node match { - case datasetNode: WorkflowDataset => - project.taskOption[GenericDatasetSpec](datasetNode.task).map { dt => - resolveDataset(dt, replaceDataSources ++ replaceSinks) - } - case operatorNode: WorkflowOperator => - project.anyTaskOption(operatorNode.task) - } - for (t <- taskOpt) { - workflowRunContext.nodeExecutors.put(node.nodeId, ExecutorRegistry.instantiateExecutor(t.data, executionContext)) - } - } - } - private def clearOutputDatasets()(implicit workflowRunContext: WorkflowRunContext): Unit = { implicit val userContext: UserContext = workflowRunContext.userContext // Clear all internal datasets and input datasets that are configured so @@ -441,32 +419,6 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow], } } - /** - * Returns the dataset that should be used in the workflow. Specifically [[VariableDataset]] - * and [[InternalDataset]] need to be replaced by the corresponding real dataset. - * - * @param datasetTask - * @param replaceDatasets A map with replacement datasets for [[VariableDataset]] objects. 
- * @return - */ - private def resolveDataset(datasetTask: Task[GenericDatasetSpec], - replaceDatasets: Map[String, Dataset]): Task[GenericDatasetSpec] = { - replaceDatasets.get(datasetTask.id.toString) match { - case Some(d) => - PlainTask(datasetTask.id, datasetTask.data.copy(plugin = d), metaData = datasetTask.metaData) - case None => - datasetTask.data.plugin match { - case _: VariableDataset => - throw new IllegalArgumentException("No replacement found for variable dataset " + datasetTask.id.toString) - case _: InternalDataset => - val internalDataset = executionContext.createInternalDataset(Some(datasetTask.id.toString)) - PlainTask(datasetTask.id, datasetTask.data.copy(plugin = internalDataset), metaData = datasetTask.metaData) - case _: Dataset => - datasetTask - } - } - } - override protected val executionContext: LocalExecution = LocalExecution( useLocalInternalDatasets, replaceDataSources, diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala index 24863f388f..d8afc4e807 100644 --- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala +++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala @@ -1,9 +1,11 @@ package org.silkframework.workspace.activity.workflow -import org.silkframework.config.{Prefixes, Task, TaskSpec} -import org.silkframework.dataset.Dataset +import org.silkframework.config.{PlainTask, Prefixes, Task, TaskSpec} +import org.silkframework.dataset.{Dataset, VariableDataset} import org.silkframework.dataset.DatasetSpec.GenericDatasetSpec import org.silkframework.execution._ +import org.silkframework.execution.local.LocalExecution +import org.silkframework.plugins.dataset.InternalDataset import org.silkframework.runtime.activity.Status.Canceling import org.silkframework.runtime.activity._ import org.silkframework.runtime.plugin.PluginContext @@ -93,6 +95,30 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu } } + protected def createRunContext(implicit userContext: UserContext, context: ActivityContext[WorkflowExecutionReport]): WorkflowRunContext = { + val workflowRunContext = WorkflowRunContext( + activityContext = context, + workflow = currentWorkflow, + userContext = userContext + ) + + for (node <- workflowNodes) { + val taskOpt: Option[Task[_ <: TaskSpec]] = node match { + case datasetNode: WorkflowDataset => + project.taskOption[GenericDatasetSpec](datasetNode.task).map { dt => + resolveDataset(dt, replaceDataSources ++ replaceSinks) + } + case operatorNode: WorkflowOperator => + project.anyTaskOption(operatorNode.task) + } + for (t <- taskOpt) { + workflowRunContext.nodeExecutors.put(node.nodeId, ExecutorRegistry.instantiateExecutor(t.data, executionContext)) + } + } + + workflowRunContext + } + /** * Update the progress and write a log message. * @@ -194,6 +220,37 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu } } + /** + * Returns the dataset that should be used in the workflow. Specifically [[VariableDataset]] + * and [[InternalDataset]] need to be replaced by the corresponding real dataset. + * + * @param datasetTask + * @param replaceDatasets A map with replacement datasets for [[VariableDataset]] objects. 
+    * @return
+    */
+  protected def resolveDataset(datasetTask: Task[GenericDatasetSpec],
+                               replaceDatasets: Map[String, Dataset]): Task[GenericDatasetSpec] = {
+    replaceDatasets.get(datasetTask.id.toString) match {
+      case Some(d) =>
+        PlainTask(datasetTask.id, datasetTask.data.copy(plugin = d), metaData = datasetTask.metaData)
+      case None =>
+        datasetTask.data.plugin match {
+          case _: VariableDataset =>
+            throw new IllegalArgumentException("No replacement found for variable dataset " + datasetTask.id.toString)
+          case _: InternalDataset =>
+            executionContext match {
+              case localExecution: LocalExecution =>
+                val internalDataset = localExecution.createInternalDataset(Some(datasetTask.id.toString))
+                PlainTask(datasetTask.id, datasetTask.data.copy(plugin = internalDataset), metaData = datasetTask.metaData)
+              case _ =>
+                datasetTask
+            }
+          case _: Dataset =>
+            datasetTask
+        }
+    }
+  }
+
   /** Necessary update for the user context, so external datasets can be accessed in safe-mode inside a workflow execution. */
   def updateUserContext(userContext: UserContext): UserContext = {
     val executionContext = userContext.executionContext
From 0e505fc8a57ce32fa32f3a7131178723d6354cf6 Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Fri, 10 Apr 2026 12:03:30 +0200
Subject: [PATCH 06/20] InWorkflowDataset should hold the data from the most
 recent execution.

---
 .../dataset/rdf/datasets/InWorkflowDataset.md |  9 +++++++
 .../rdf/datasets/InWorkflowDataset.scala      | 27 ++++++++++++-------
 .../datasets/InWorkflowDatasetExecutor.scala  | 15 +++++++----
 .../rdf/datasets/InWorkflowDatasetTest.scala  | 24 ++++++++++++++---
 4 files changed, 57 insertions(+), 18 deletions(-)
 create mode 100644 silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md

diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md
new file mode 100644
index 0000000000..2639289a66
--- /dev/null
+++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md
@@ -0,0 +1,9 @@
+The **in-workflow dataset** is an embedded RDF store that holds all data **in memory**, scoped to a single workflow execution. It is intended as a **transient working graph** for passing data between operators within one run.
+
+Typical use cases:
+- Passing intermediate RDF results between operators within a single workflow execution.
+- Storing triples produced by one operator for consumption by a downstream operator in the same run.
+- Keeping workflow-local data isolated from other concurrent workflow executions.
+
+If the dataset is read from outside a workflow, the data from the most recently started executor will be returned.
+For large graphs, use an external RDF store. 
\ No newline at end of file
diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala
index 674e278d77..6c297badf1 100644
--- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala
+++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala
@@ -1,6 +1,6 @@
 package org.silkframework.plugins.dataset.rdf.datasets

-import org.apache.jena.rdf.model.ModelFactory
+import org.apache.jena.rdf.model.{Model, ModelFactory}
 import org.silkframework.dataset._
 import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint, SparqlParams}
 import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource}
@@ -12,24 +12,33 @@
   id = InWorkflowDataset.pluginId,
   label = "In-workflow dataset",
   categories = Array(DatasetCategories.embedded),
-  description = "A Dataset that holds all data in-memory for the duration of a single workflow execution. " +
-    "The data is stored separately for each workflow execution. " +
-    "The data is cleared once the workflow execution has finished.",
+  description = "A Dataset that holds all data in-memory, scoped to a single workflow execution. " +
+    "The data is stored separately for each workflow execution.",
+  documentationFile = "InWorkflowDataset.md",
   relatedPlugins = Array(
     new PluginReference(
       id = InMemoryDataset.pluginId,
       description = "Both datasets hold data in-memory, but the in-memory dataset persists for the lifetime of the running process, " +
-        "while the in-workflow dataset is scoped to a single workflow execution and cleared afterwards."
+        "while the in-workflow dataset is scoped to a single workflow execution."
     )
   )
 )
 case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset {

-  // Empty placeholder model. Actual data is held in the executor (InWorkflowDatasetExecutor).
-  // Framework code that bypasses access() and reads sparqlEndpoint directly will see empty results.
-  private val emptyModel = ModelFactory.createDefaultModel()
+  // Starts as an empty model so reads before any execution see empty (not null) results.
+  // Replaced by a new JenaModelEndpoint when an executor registers its model via updateData.
+  @volatile
+  private var mostRecentSparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel())

-  override val sparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(emptyModel)
+  /**
+    * Called by [[InWorkflowDatasetExecutor]] when a new execution starts.
+    * Updates sparqlEndpoint so that direct reads see the latest executor's model. 
+ */ + private[datasets] def updateData(model: Model): Unit = { + mostRecentSparqlEndpoint = new JenaModelEndpoint(model) + } + + override def sparqlEndpoint: SparqlEndpoint = mostRecentSparqlEndpoint override def source(implicit userContext: UserContext): DataSource = new SparqlSource(SparqlParams(), sparqlEndpoint) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index aba631c9a2..ca03623a35 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -11,7 +11,11 @@ import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} * Holds the actual Jena model for the duration of a workflow execution. * Overrides access() to expose the executor-owned model to the framework, * ensuring each workflow execution has its own isolated data. - * Clears the model in close() once the workflow execution has finished. + * When access() is called, the dataset's sparqlEndpoint is updated to point + * to this executor's model so that framework code reading sparqlEndpoint + * directly sees the data from the most recently started execution. + * The model is retained after the execution ends so that the data remains + * accessible via sparqlEndpoint until a new execution overwrites it. */ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { @@ -20,9 +24,10 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] // JenaModelDataset wraps the model and provides source/entitySink/linkSink backed by it. 
   private val modelDataset: JenaModelDataset = JenaModelDataset(model)

-  override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = modelDataset
-
-  override def close(): Unit = {
-    model.removeAll()
+  override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = {
+    task.data.plugin.updateData(model)
+    modelDataset
   }
+
+  override def close(): Unit = {}
 }
diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
index fa49e6a5f3..0997b54095 100644
--- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
+++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
@@ -29,11 +29,11 @@ class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {

     executorEndpoint.update("INSERT DATA { <http://s> <http://p> <http://o> }")

     // The executor's model contains the written data
     executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
-    // The dataset's own placeholder endpoint is not affected
-    dataset.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 0
+    // After access() the dataset's sparqlEndpoint reflects the executor's model
+    dataset.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 1
   }

-  it should "clear all data when close() is called" in {
+  it should "retain data after close() is called" in {
     val executor = new InWorkflowDatasetExecutor()
     val executorEndpoint = executor.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint

     executorEndpoint.update("INSERT DATA { <http://s> <http://p> <http://o> }")
     executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1

     executor.close()

-    executorEndpoint.select(tripleCountQuery).bindings.size mustBe 0
+    executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
+  }
+
+  it should "update the dataset sparqlEndpoint to the latest executor's model" in {
+    val dataset2 = InWorkflowDataset()
+    val task2 = PlainTask("test2", DatasetSpec(dataset2))
+    val executor1 = new InWorkflowDatasetExecutor()
+    val executor2 = new InWorkflowDatasetExecutor()
+
+    val endpoint1 = executor1.access(task2, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    endpoint1.update("INSERT DATA { <http://s> <http://p> <http://o> }")
+    // dataset2 now points to executor1's model — one triple visible
+    dataset2.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 1
+
+    // executor2.access() replaces the endpoint — dataset2 now sees executor2's (empty) model
+    executor2.access(task2, execution)
+    dataset2.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 0
   }

   it should "isolate data between concurrent executions" in {
From c1477fd6a79248be8562e1ce328a78f80e0daa58 Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Fri, 10 Apr 2026 13:08:12 +0200
Subject: [PATCH 07/20] InWorkflowDataset: Nested workflow executions should
 use the same data as the parent execution. 
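
A nested execution keeps a reference to its parent, and the dataset walks
that chain to find the closest registered model. A sketch of the intended
setup (the values are illustrative):

    // The parent workflow execution registers its model on first access.
    val parent = LocalExecution(useLocalInternalDatasets = false)
    // The nested workflow execution references its parent, so findModel falls
    // back to the parent's model if the nested execution has none of its own.
    val nested = LocalExecution(useLocalInternalDatasets = false, parentExecution = Some(parent))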
--- .../execution/local/LocalExecution.scala | 6 +- .../rdf/datasets/InWorkflowDataset.scala | 33 ++++++++ .../datasets/InWorkflowDatasetExecutor.scala | 35 ++++++++- .../InWorkflowDatasetIntegrationTest.scala | 76 +++++++++++++++++++ .../rdf/datasets/InWorkflowDatasetTest.scala | 60 +++++++++++++++ .../LocalWorkflowAsTaskExecutor.scala | 3 +- .../workflow/LocalWorkflowExecutor.scala | 6 +- 7 files changed, 211 insertions(+), 8 deletions(-) diff --git a/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala b/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala index 49a648359b..b9a1db6a23 100644 --- a/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala +++ b/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala @@ -21,10 +21,14 @@ import scala.jdk.CollectionConverters.ConcurrentMapHasAsScala case class LocalExecution(useLocalInternalDatasets: Boolean, replaceDataSources: Map[String, Dataset] = Map.empty, replaceSinks: Map[String, Dataset] = Map.empty, - workflowId: Option[Identifier] = None) extends ExecutionType { + workflowId: Option[Identifier] = None, + parentExecution: Option[LocalExecution] = None) extends ExecutionType { type DataType = LocalEntities + /** Unique identifier for this execution instance. */ + val executionId: Identifier = Identifier.random + private val log: Logger = Logger.getLogger(this.getClass.getName) private val internalDatasets: mutable.Map[Option[String], InternalDatasetTrait] = mutable.Map.empty diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index 6c297badf1..49519ecf71 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -3,10 +3,14 @@ package org.silkframework.plugins.dataset.rdf.datasets import org.apache.jena.rdf.model.{Model, ModelFactory} import org.silkframework.dataset._ import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint, SparqlParams} +import org.silkframework.execution.local.LocalExecution import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource} import org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint import org.silkframework.runtime.activity.UserContext import org.silkframework.runtime.plugin.annotations.{Plugin, PluginReference} +import org.silkframework.util.Identifier + +import java.util.Collections @Plugin( id = InWorkflowDataset.pluginId, @@ -30,6 +34,35 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { @volatile private var mostRecentSparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel()) + /** + * Models for all current workflow executions, keyed by execution ID. + * Uses a WeakHashMap so entries are cleaned up by GC if removeModel() is not called. + * The key (Identifier) is only strongly referenced by the LocalExecution instance, + * so when that execution is garbage collected, the entry is automatically removed. + */ + private val executionModels: java.util.Map[Identifier, Model] = + Collections.synchronizedMap(new java.util.WeakHashMap[Identifier, Model]()) + + /** Registers the model for a given execution. 
+    */
+  private[datasets] def registerModel(executionId: Identifier, model: Model): Unit = {
+    executionModels.put(executionId, model)
+  }
+
+  /**
+    * Finds the model for the closest ancestor execution that has one registered.
+    * Walks up the parentExecution chain.
+    */
+  private[datasets] def findModel(execution: LocalExecution): Option[Model] = {
+    Option(executionModels.get(execution.executionId)).orElse(
+      execution.parentExecution.flatMap(findModel)
+    )
+  }
+
+  /** Removes the model for a given execution. Called by the executor on close(). */
+  private[datasets] def removeModel(executionId: Identifier): Unit = {
+    executionModels.remove(executionId)
+  }
+
   /**
     * Called by [[InWorkflowDatasetExecutor]] when a new execution starts.
     * Updates sparqlEndpoint so that direct reads see the latest executor's model.
diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
index ca03623a35..9b825444c6 100644
--- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
+++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
@@ -4,6 +4,7 @@
 import org.apache.jena.rdf.model.{Model, ModelFactory}
 import org.silkframework.config.Task
 import org.silkframework.dataset.{DatasetAccess, DatasetSpec}
 import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution}
+import org.silkframework.util.Identifier

 /**
  * Executor for [[InWorkflowDataset]].
  *
  * Holds the actual Jena model for the duration of a workflow execution.
  * Overrides access() to expose the executor-owned model to the framework,
  * ensuring each workflow execution has its own isolated data.
  * When access() is called, the dataset's sparqlEndpoint is updated to point
  * to this executor's model so that framework code reading sparqlEndpoint
  * directly sees the data from the most recently started execution.
  * The model is retained after the execution ends so that the data remains
  * accessible via sparqlEndpoint until a new execution overwrites it.
+ *
+ * If the execution has a parent (nested workflow), the parent's model is
+ * reused by this executor on first access, so that the nested
+ * workflow sees the data written by the parent.
  */
 class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] {

-  private val model: Model = ModelFactory.createDefaultModel()
+  private var model: Model = _
+  private var modelDataset: JenaModelDataset = _
+
+  @volatile private var initialized: Boolean = false

-  // JenaModelDataset wraps the model and provides source/entitySink/linkSink backed by it.
-  private val modelDataset: JenaModelDataset = JenaModelDataset(model)
+  // Stored on first access for cleanup in close().
+  private var executionId: Option[Identifier] = None
+  private var plugin: Option[InWorkflowDataset] = None

   override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = {
+    if (!initialized) {
+      initialized = true
+      val datasetPlugin = task.data.plugin
+      // Reuse the parent execution's model if available, otherwise create a new one.
+      model = execution.parentExecution.flatMap(datasetPlugin.findModel).getOrElse(ModelFactory.createDefaultModel())
+      modelDataset = JenaModelDataset(model)
+      // Register this executor's model so nested workflows can find it. 
+ datasetPlugin.registerModel(execution.executionId, model) + executionId = Some(execution.executionId) + plugin = Some(datasetPlugin) + } task.data.plugin.updateData(model) modelDataset } - override def close(): Unit = {} + override def close(): Unit = { + for { + eid <- executionId + p <- plugin + } { + p.removeModel(eid) + } + } } diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala index 54e63ae6ab..9ae68cd761 100644 --- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala +++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala @@ -142,4 +142,80 @@ class InWorkflowDatasetIntegrationTest extends AnyFlatSpec with Matchers with Co // Isolation: inWorkflow1 (source1 data) and inWorkflow2 (source2 data) are separate. output1AModel.isIsomorphicWith(output2AModel) mustBe false } + + it should "propagate InWorkflowDataset data from a parent workflow to a nested workflow" in { + val workspace = WorkspaceFactory().workspace + val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("nestedWorkflowTest")))) + + // Source dataset with test data. + val sourceModel: Model = ModelFactory.createDefaultModel() + sourceModel.createResource("http://nested/s1") + .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o1")) + sourceModel.createResource("http://nested/s2") + .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o2")) + + // Output dataset — empty initially, filled by the nested workflow. + val outputModel: Model = ModelFactory.createDefaultModel() + + // Register tasks. + project.addTask("source", DatasetSpec(JenaModelDataset(sourceModel))) + project.addTask("inWorkflowDs", DatasetSpec(InWorkflowDataset())) + project.addTask("output", DatasetSpec(JenaModelDataset(outputModel))) + + // SparqlCopyCustomTask: copies triples into the InWorkflowDataset. + val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" + project.addTask("copyToInWorkflow", SparqlCopyCustomTask(copyQuery, tempFile = false)) + + // TransformSpec: reads from InWorkflowDataset via MultiEntitySchema path. + val identityTransform = TransformSpec( + selection = DatasetSelection("dummy", Uri("")), + mappingRule = RootMappingRule(MappingRules( + propertyRules = Seq( + DirectMapping( + id = "pmap", + sourcePath = UntypedPath(Uri("http://p")), + mappingTarget = MappingTarget(Uri("http://p")) + ) + ) + )) + ) + project.addTask("readFromInWorkflow", identityTransform) + + // Nested workflow: reads from inWorkflowDs and writes to output. 
+ val nestedWorkflow = Workflow( + operators = Seq( + WorkflowOperator(Seq(Some("inWorkflowDs")), "readFromInWorkflow", Seq("output"), Seq.empty, (100, 0), "readFromInWorkflow", None, Seq.empty, Seq.empty) + ), + datasets = Seq( + WorkflowDataset(Seq.empty, "inWorkflowDs", Seq("readFromInWorkflow"), (0, 0), "inWorkflowDs", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInWorkflow")), "output", Seq.empty, (200, 0), "output", None, Seq.empty, Seq.empty) + ) + ) + project.addTask("nestedWorkflow", nestedWorkflow) + + // Parent workflow: source → copyToInWorkflow → inWorkflowDs → nestedWorkflow + val parentWorkflow = Workflow( + operators = Seq( + WorkflowOperator(Seq(Some("source")), "copyToInWorkflow", Seq("inWorkflowDs"), Seq.empty, (100, 0), "copyToInWorkflow", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inWorkflowDs")), "nestedWorkflow", Seq.empty, Seq.empty, (300, 0), "nestedWorkflow", None, Seq.empty, Seq.empty) + ), + datasets = Seq( + WorkflowDataset(Seq.empty, "source", Seq("copyToInWorkflow"), (0, 0), "source", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("copyToInWorkflow")), "inWorkflowDs", Seq("nestedWorkflow"), (200, 0), "inWorkflowDs", None, Seq.empty, Seq.empty) + ) + ) + project.addTask("parentWorkflow", parentWorkflow) + val workflowTask = project.task[Workflow]("parentWorkflow") + + // Execute the parent workflow. + val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask) + val monitor = new ActivityMonitor("nestedMonitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty)) + executor.run(monitor) + + // The nested workflow must have read the data written by the parent. + outputModel.size() must be > 0L + + // Verify the output contains exactly the source triples (2 resources with property). 
+    outputModel.listSubjects().toList.size() mustBe 2
+  }
 }
diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
index 0997b54095..0168ba5499 100644
--- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
+++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
@@ -7,6 +7,7 @@ import org.silkframework.dataset.DatasetSpec
 import org.silkframework.dataset.rdf.RdfDataset
 import org.silkframework.execution.local.LocalExecution
 import org.silkframework.runtime.activity.UserContext
+import org.silkframework.util.Identifier
 
 class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
 
@@ -75,4 +76,63 @@ class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
     endpoint1.select(tripleCountQuery).bindings.size mustBe 1
     endpoint2.select(tripleCountQuery).bindings.size mustBe 2
   }
+
+  it should "use parent execution data in the nested executor" in {
+    val nestedDataset = InWorkflowDataset()
+    val nestedTask = PlainTask("nestedTest", DatasetSpec(nestedDataset))
+
+    // Parent execution writes data
+    val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
+    val parentExecutor = new InWorkflowDatasetExecutor()
+    val parentEndpoint = parentExecutor.access(nestedTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    parentEndpoint.update("INSERT DATA { <http://example.org/s1> <http://example.org/p> <http://example.org/o1> }")
+    parentEndpoint.update("INSERT DATA { <http://example.org/s2> <http://example.org/p> <http://example.org/o2> }")
+    parentEndpoint.select(tripleCountQuery).bindings.size mustBe 2
+
+    // Child execution with parent reference
+    val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
+    val childExecutor = new InWorkflowDatasetExecutor()
+    val childEndpoint = childExecutor.access(nestedTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+
+    // Child sees the parent's data
+    childEndpoint.select(tripleCountQuery).bindings.size mustBe 2
+  }
+
+  it should "share the model between parent and child executions" in {
+    val nestedDataset = InWorkflowDataset()
+    val nestedTask = PlainTask("nestedTest", DatasetSpec(nestedDataset))
+
+    // Parent execution writes one triple
+    val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
+    val parentExecutor = new InWorkflowDatasetExecutor()
+    val parentEndpoint = parentExecutor.access(nestedTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    parentEndpoint.update("INSERT DATA { <http://example.org/s1> <http://example.org/p> <http://example.org/o1> }")
+
+    // Child execution references the same model and writes more
+    val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
+    val childExecutor = new InWorkflowDatasetExecutor()
+    val childEndpoint = childExecutor.access(nestedTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    childEndpoint.update("INSERT DATA { <http://example.org/s2> <http://example.org/p> <http://example.org/o2> }")
+
+    // Both see the same data since they share the model
+    childEndpoint.select(tripleCountQuery).bindings.size mustBe 2
+    parentEndpoint.select(tripleCountQuery).bindings.size mustBe 2
+  }
+
+  it should "clean up model on close()" in {
+    val nestedDataset = InWorkflowDataset()
+    val nestedTask = PlainTask("cleanupTest", DatasetSpec(nestedDataset))
+
+    val exec = LocalExecution(false, workflowId = Some(Identifier("wf")))
+    val executor = new InWorkflowDatasetExecutor()
+    executor.access(nestedTask, exec).asInstanceOf[RdfDataset].sparqlEndpoint
+      .update("INSERT DATA { <http://example.org/s1> <http://example.org/p> <http://example.org/o1> }")
+
+    // Model is registered
+    nestedDataset.findModel(exec) must not be empty
+
+    // After close, model is removed from the dataset's map
+    executor.close()
+    nestedDataset.findModel(exec) mustBe empty
+  }
 }
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowAsTaskExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowAsTaskExecutor.scala
index ab62f1bf51..5104c1967c 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowAsTaskExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowAsTaskExecutor.scala
@@ -30,7 +30,8 @@ class LocalWorkflowAsTaskExecutor extends Executor[Workflow, LocalExecution] {
       projectTask,
       clearDatasets = false,
       replaceDataSources = execution.replaceDataSources,
-      replaceSinks = execution.replaceSinks
+      replaceSinks = execution.replaceSinks,
+      parentExecution = Some(execution)
     ).run(workflowContext)
 
     None
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
index 8d5d3429b8..d206fc245a 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
@@ -38,7 +38,8 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow],
                                  replaceDataSources: Map[String, Dataset] = Map.empty,
                                  replaceSinks: Map[String, Dataset] = Map.empty,
                                  useLocalInternalDatasets: Boolean = false,
-                                 clearDatasets: Boolean = true)
+                                 clearDatasets: Boolean = true,
+                                 parentExecution: Option[LocalExecution] = None)
   extends WorkflowExecutor[LocalExecution] {
 
   private val log = Logger.getLogger(getClass.getName)
@@ -423,7 +424,8 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow],
       useLocalInternalDatasets,
       replaceDataSources,
       replaceSinks,
-      Some(workflowTask.id)
+      Some(workflowTask.id),
+      parentExecution
     )
 
   override protected def workflowNodeEntities[T](workflowDependencyNode: WorkflowDependencyNode,

From 1cfccd88cc7cd01003306c28484c65affa1edb07 Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Fri, 10 Apr 2026 13:09:44 +0200
Subject: [PATCH 08/20] InWorkflowDataset: Update doc

---
 .../plugins/dataset/rdf/datasets/InWorkflowDataset.md    | 2 ++
 .../plugins/dataset/rdf/datasets/InWorkflowDataset.scala | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md
index 2639289a66..6cdca0de97 100644
--- a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md
+++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md
@@ -5,5 +5,7 @@ Typical use cases:
 - Storing triples produced by one operator for consumption by a downstream operator in the same run.
 - Keeping workflow-local data isolated from other concurrent workflow executions.
+
+**Nested workflows:** When a workflow contains a nested workflow, the nested workflow shares the same in-workflow dataset model as the parent. Data written by the parent workflow is available in the nested workflow, and data written by the nested workflow is visible to the parent after the nested workflow completes. + If the dataset is read from outside a workflow, the data from the most recently started executor will be returned. For large graphs, use an external RDF store. \ No newline at end of file diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index 49519ecf71..d89d1ddbff 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -17,7 +17,8 @@ import java.util.Collections label = "In-workflow dataset", categories = Array(DatasetCategories.embedded), description = "A Dataset that holds all data in-memory, scoped to a single workflow execution. " + - "The data is stored separately for each workflow execution.", + "The data is stored separately for each workflow execution. " + + "Nested workflows share the same model as the parent, so data written by the parent is available in the nested workflow and vice versa.", documentationFile = "InWorkflowDataset.md", relatedPlugins = Array( new PluginReference( From a4a73e928509c87c33051f0bf5a2bb80d7b7ddb0 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Fri, 10 Apr 2026 13:27:06 +0200 Subject: [PATCH 09/20] InWorkflowDataset: Improve automatic cleanup in case the execution identifier is held elsewhere. --- .../rdf/datasets/InWorkflowDataset.scala | 27 +++++++++++-------- .../datasets/InWorkflowDatasetExecutor.scala | 25 +++++++---------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index d89d1ddbff..dd53718170 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -36,17 +36,17 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { private var mostRecentSparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel()) /** - * Models for all current workflow executions, keyed by execution ID. - * Uses a WeakHashMap so entries are cleaned up by GC if removeModel() is not called. - * The key (Identifier) is only strongly referenced by the LocalExecution instance, - * so when that execution is garbage collected, the entry is automatically removed. + * Models for all current workflow executions, keyed by [[ExecutionModelKey]]. + * Uses a WeakHashMap so entries are automatically cleaned up by GC when the key is no longer referenced. + * When the executor is GC'd, the entry is cleaned up. + * Entries are also explicitly removed by [[InWorkflowDatasetExecutor.close()]] when the execution finishes. 
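+   * For example, if the execution's Identifier is also held elsewhere (e.g. by a
+   * report or a log entry), an Identifier key would never become weakly reachable;
+   * the dedicated [[ExecutionModelKey]] wrapper is strongly referenced only by the
+   * executor itself, so the entry can be collected once the executor is gone.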
*/ - private val executionModels: java.util.Map[Identifier, Model] = - Collections.synchronizedMap(new java.util.WeakHashMap[Identifier, Model]()) + private val executionModels: java.util.Map[ExecutionModelKey, Model] = + Collections.synchronizedMap(new java.util.WeakHashMap[ExecutionModelKey, Model]()) /** Registers the model for a given execution. */ - private[datasets] def registerModel(executionId: Identifier, model: Model): Unit = { - executionModels.put(executionId, model) + private[datasets] def registerModel(key: ExecutionModelKey, model: Model): Unit = { + executionModels.put(key, model) } /** @@ -54,14 +54,14 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { * Walks up the parentExecution chain. */ private[datasets] def findModel(execution: LocalExecution): Option[Model] = { - Option(executionModels.get(execution.executionId)).orElse( + Option(executionModels.get(ExecutionModelKey(execution.executionId))).orElse( execution.parentExecution.flatMap(findModel) ) } /** Removes the model for a given execution. Called by the executor on close(). */ - private[datasets] def removeModel(executionId: Identifier): Unit = { - executionModels.remove(executionId) + private[datasets] def removeModel(key: ExecutionModelKey): Unit = { + executionModels.remove(key) } /** @@ -86,3 +86,8 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { object InWorkflowDataset { final val pluginId = "inWorkflow" } + +/** + * Key for the [[InWorkflowDataset.executionModels]] WeakHashMap. + */ +private[datasets] case class ExecutionModelKey(executionId: Identifier) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index 9b825444c6..81459fbe6b 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -4,23 +4,14 @@ import org.apache.jena.rdf.model.{Model, ModelFactory} import org.silkframework.config.Task import org.silkframework.dataset.{DatasetAccess, DatasetSpec} import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} -import org.silkframework.util.Identifier /** * Executor for [[InWorkflowDataset]]. * * Holds the actual Jena model for the duration of a workflow execution. - * Overrides access() to expose the executor-owned model to the framework, - * ensuring each workflow execution has its own isolated data. - * When access() is called, the dataset's sparqlEndpoint is updated to point - * to this executor's model so that framework code reading sparqlEndpoint - * directly sees the data from the most recently started execution. - * The model is retained after the execution ends so that the data remains - * accessible via sparqlEndpoint until a new execution overwrites it. * - * If the execution has a parent (nested workflow), the parent's model data - * is copied into this executor's model on first access, so that the nested - * workflow sees the data written by the parent. + * If the execution has a parent (nested workflow), the parent's model is + * reused so that the nested workflow sees the data written by the parent. 
*/ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { @@ -30,7 +21,8 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] @volatile private var initialized: Boolean = false // Stored on first access for cleanup in close(). - private var executionId: Option[Identifier] = None + // The executor is the only strong reference holder for the key, enabling WeakHashMap cleanup. + private var modelKey: Option[ExecutionModelKey] = None private var plugin: Option[InWorkflowDataset] = None override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = { @@ -41,8 +33,9 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] model = execution.parentExecution.flatMap(datasetPlugin.findModel).getOrElse(ModelFactory.createDefaultModel()) modelDataset = JenaModelDataset(model) // Register this executor's model so nested workflows can find it. - datasetPlugin.registerModel(execution.executionId, model) - executionId = Some(execution.executionId) + val key = ExecutionModelKey(execution.executionId) + datasetPlugin.registerModel(key, model) + modelKey = Some(key) plugin = Some(datasetPlugin) } task.data.plugin.updateData(model) @@ -51,10 +44,10 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] override def close(): Unit = { for { - eid <- executionId + key <- modelKey p <- plugin } { - p.removeModel(eid) + p.removeModel(key) } } } From 6e3d049e1b49d389ff0f8dae091ad4fec6da4d5a Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Fri, 10 Apr 2026 14:45:43 +0200 Subject: [PATCH 10/20] InWorkflowDataset bugfixes. --- .../rdf/datasets/InWorkflowDataset.scala | 8 +-- .../datasets/InWorkflowDatasetExecutor.scala | 6 +-- .../rdf/datasets/InWorkflowDatasetTest.scala | 52 ++++++++++++++++++- .../workflow/LocalWorkflowExecutor.scala | 4 +- .../activity/workflow/WorkflowExecutor.scala | 10 ++-- 5 files changed, 64 insertions(+), 16 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index dd53718170..e3b5fe96c8 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -53,9 +53,9 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { * Finds the model for the closest ancestor execution that has one registered. * Walks up the parentExecution chain. */ - private[datasets] def findModel(execution: LocalExecution): Option[Model] = { - Option(executionModels.get(ExecutionModelKey(execution.executionId))).orElse( - execution.parentExecution.flatMap(findModel) + private[datasets] def findModel(execution: LocalExecution, taskId: Identifier): Option[Model] = { + Option(executionModels.get(ExecutionModelKey(execution.executionId, taskId))).orElse( + execution.parentExecution.flatMap(findModel(_, taskId)) ) } @@ -90,4 +90,4 @@ object InWorkflowDataset { /** * Key for the [[InWorkflowDataset.executionModels]] WeakHashMap. 
 */
-private[datasets] case class ExecutionModelKey(executionId: Identifier)
+private[datasets] case class ExecutionModelKey(executionId: Identifier, taskId: Identifier)
diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
index 81459fbe6b..55586518d4 100644
--- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
+++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala
@@ -29,11 +29,11 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset]
     if (!initialized) {
       initialized = true
       val datasetPlugin = task.data.plugin
-      // Reuse the parent execution's model if available, otherwise create a new one.
-      model = execution.parentExecution.flatMap(datasetPlugin.findModel).getOrElse(ModelFactory.createDefaultModel())
+      // Reuse a matching model from an ancestor execution if available, otherwise create a new one.
+      model = execution.parentExecution.flatMap(datasetPlugin.findModel(_, task.id)).getOrElse(ModelFactory.createDefaultModel())
       modelDataset = JenaModelDataset(model)
       // Register this executor's model so nested workflows can find it.
-      val key = ExecutionModelKey(execution.executionId)
+      val key = ExecutionModelKey(execution.executionId, task.id)
       datasetPlugin.registerModel(key, model)
       modelKey = Some(key)
       plugin = Some(datasetPlugin)
diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
index 0168ba5499..230c3b9679 100644
--- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
+++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala
@@ -119,6 +119,54 @@ class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
     parentEndpoint.select(tripleCountQuery).bindings.size mustBe 2
   }
 
+  it should "only reuse the parent model with the same task id in a nested execution" in {
+    val sharedDataset = InWorkflowDataset()
+    val taskA = PlainTask("datasetA", DatasetSpec(sharedDataset))
+    val taskB = PlainTask("datasetB", DatasetSpec(sharedDataset))
+
+    val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
+
+    // Parent registers models for both taskA and taskB
+    val parentExecutorA = new InWorkflowDatasetExecutor()
+    val endpointA = parentExecutorA.access(taskA, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    endpointA.update("INSERT DATA { <http://example.org/a1> <http://example.org/p> <http://example.org/o1> }")
+
+    val parentExecutorB = new InWorkflowDatasetExecutor()
+    val endpointB = parentExecutorB.access(taskB, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    endpointB.update("INSERT DATA { <http://example.org/b1> <http://example.org/p> <http://example.org/o1> }")
+    endpointB.update("INSERT DATA { <http://example.org/b2> <http://example.org/p> <http://example.org/o2> }")
+
+    // Child execution for taskA — must see only taskA's data (1 triple), not taskB's (2 triples)
+    val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
+    val childExecutorA = new InWorkflowDatasetExecutor()
+    val childEndpointA = childExecutorA.access(taskA, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    childEndpointA.select(tripleCountQuery).bindings.size mustBe 1
+
+    // Child execution for taskB — must see only taskB's data (2 triples)
+    val childExecutorB = new InWorkflowDatasetExecutor()
+    val childEndpointB = childExecutorB.access(taskB, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    childEndpointB.select(tripleCountQuery).bindings.size mustBe 2
+  }
+
+  it should "create a new model in a nested execution if the parent has no matching task id" in {
+    val sharedDataset = InWorkflowDataset()
+    val parentTask = PlainTask("parentOnly", DatasetSpec(sharedDataset))
+    val childTask = PlainTask("childOnly", DatasetSpec(sharedDataset))
+
+    val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
+
+    // Parent registers a model for parentOnly
+    val parentExecutor = new InWorkflowDatasetExecutor()
+    val parentEndpoint = parentExecutor.access(parentTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    parentEndpoint.update("INSERT DATA { <http://example.org/s1> <http://example.org/p> <http://example.org/o1> }")
+
+    // Child execution for a different task id — must NOT see parent data
+    val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
+    val childExecutor = new InWorkflowDatasetExecutor()
+    val childEndpoint = childExecutor.access(childTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    childEndpoint.select(tripleCountQuery).bindings.size mustBe 0
+  }
+
   it should "clean up model on close()" in {
     val nestedDataset = InWorkflowDataset()
     val nestedTask = PlainTask("cleanupTest", DatasetSpec(nestedDataset))
@@ -129,10 +177,10 @@ class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
       .update("INSERT DATA { <http://example.org/s1> <http://example.org/p> <http://example.org/o1> }")
 
     // Model is registered
-    nestedDataset.findModel(exec) must not be empty
+    nestedDataset.findModel(exec, nestedTask.id) must not be empty
 
     // After close, model is removed from the dataset's map
     executor.close()
-    nestedDataset.findModel(exec) mustBe empty
+    nestedDataset.findModel(exec, nestedTask.id) mustBe empty
   }
 }
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
index d206fc245a..ff00400ffb 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/LocalWorkflowExecutor.scala
@@ -110,12 +110,12 @@ case class LocalWorkflowExecutor(workflowTask: ProjectTask[Workflow],
     } finally {
       context.value.updateWith(_.asDone())
       this.executionContext.executeShutdownHooks()
-      workflowRunContext.nodeExecutors.foreach { case (nodeId, exec) =>
+      workflowRunContext.taskExecutors.foreach { case (taskId, exec) =>
         try {
           exec.close()
         } catch {
           case NonFatal(ex) =>
-            log.log(Level.WARNING, s"Exception while closing executor for node '$nodeId'.", ex)
+            log.log(Level.WARNING, s"Exception while closing executor for task '$taskId'.", ex)
         }
       }
     }
diff --git a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
index d8afc4e807..c02ef81d83 100644
--- a/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
+++ b/silk-workspace/src/main/scala/org/silkframework/workspace/activity/workflow/WorkflowExecutor.scala
@@ -59,11 +59,11 @@ trait
WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu updateProgress(operation, task) val result = try { - workflowRunContext.nodeExecutors.get(nodeId) match { + workflowRunContext.taskExecutors.get(task.id) match { case Some(exec) => ExecutorRegistry.executeWith(exec.asInstanceOf[Executor[TaskType, ExecType]], task, inputs, output, executionContext, taskContext) case None => - throw WorkflowExecutionException(s"No executor found for node '$nodeId'. This is a bug: executors should have been initialized before execution.") + throw WorkflowExecutionException(s"No executor found for task '${task.id}'. This is a bug: executors should have been initialized before execution.") } } catch { case NonFatal(ex) => @@ -112,7 +112,7 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu project.anyTaskOption(operatorNode.task) } for (t <- taskOpt) { - workflowRunContext.nodeExecutors.put(node.nodeId, ExecutorRegistry.instantiateExecutor(t.data, executionContext)) + workflowRunContext.taskExecutors.getOrElseUpdate(t.id, ExecutorRegistry.instantiateExecutor(t.data, executionContext)) } } @@ -267,14 +267,14 @@ trait WorkflowExecutor[ExecType <: ExecutionType] extends Activity[WorkflowExecu * @param userContext The user that is executing the workflow. * @param alreadyExecuted The workflow nodes that have already been executed. * @param reconfiguredTasks The already tasks that have been reconfigured. - * @param nodeExecutors The node executors for each workflow node by node id. + * @param taskExecutors The executors for each task by task id. */ case class WorkflowRunContext(activityContext: ActivityContext[WorkflowExecutionReport], workflow: Workflow, userContext: UserContext, alreadyExecuted: mutable.Set[WorkflowNode] = mutable.Set(), reconfiguredTasks: mutable.Map[WorkflowNode, Task[_ <: TaskSpec]] = mutable.Map(), - nodeExecutors: mutable.Map[Identifier, Executor[_, _]] = mutable.Map()) { + taskExecutors: mutable.Map[Identifier, Executor[_, _]] = mutable.Map()) { /** * Listeners for updates to task reports. * We need to hold them to prevent their garbage collection. From 76580ba17713cd9475bf7d885d2e6f709a6c6044 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Fri, 10 Apr 2026 14:49:05 +0200 Subject: [PATCH 11/20] Update doc --- .../plugins/dataset/rdf/datasets/InWorkflowDataset.md | 2 +- .../plugins/dataset/rdf/datasets/InWorkflowDataset.scala | 6 +++--- .../dataset/rdf/datasets/InWorkflowDatasetExecutor.scala | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md index 6cdca0de97..a61e592817 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md +++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md @@ -5,7 +5,7 @@ Typical use cases: - Storing triples produced by one operator for consumption by a downstream operator in the same run. - Keeping workflow-local data isolated from other concurrent workflow executions. -**Nested workflows:** When a workflow contains a nested workflow, the nested workflow shares the same in-workflow dataset model as the parent. 
Data written by the parent workflow is available in the nested workflow, and data written by the nested workflow is visible to the parent after the nested workflow completes. +**Nested workflows:** When a workflow contains a nested workflow, the nested workflow shares the same in-workflow dataset model as the parent for each dataset task with the same identifier. Data written by the parent workflow is available in the nested workflow, and data written by the nested workflow is visible to the parent after the nested workflow completes. Dataset tasks with different identifiers remain isolated. If the dataset is read from outside a workflow, the data from the most recently started executor will be returned. For large graphs, use an external RDF store. \ No newline at end of file diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index e3b5fe96c8..2d6ac692bd 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -18,7 +18,7 @@ import java.util.Collections categories = Array(DatasetCategories.embedded), description = "A Dataset that holds all data in-memory, scoped to a single workflow execution. " + "The data is stored separately for each workflow execution. " + - "Nested workflows share the same model as the parent, so data written by the parent is available in the nested workflow and vice versa.", + "A dataset in a nested workflow shares the same model as the parent, so data written by the parent is available in the nested workflow and vice versa.", documentationFile = "InWorkflowDataset.md", relatedPlugins = Array( new PluginReference( @@ -50,8 +50,8 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { } /** - * Finds the model for the closest ancestor execution that has one registered. - * Walks up the parentExecution chain. + * Finds the model for the closest ancestor execution that has one registered for the given task. + * Walks up the parentExecution chain, matching by both execution ID and task ID. */ private[datasets] def findModel(execution: LocalExecution, taskId: Identifier): Option[Model] = { Option(executionModels.get(ExecutionModelKey(execution.executionId, taskId))).orElse( diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index 55586518d4..e564a67e39 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -10,8 +10,8 @@ import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} * * Holds the actual Jena model for the duration of a workflow execution. * - * If the execution has a parent (nested workflow), the parent's model is - * reused so that the nested workflow sees the data written by the parent. 
+ * If the execution has a parent (nested workflow), the parent's model for the + * same task is reused so that the nested workflow sees the data written by the parent. */ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { From 23df68a893cf75c09bb42f26b9f2ee5dd9c6a6d3 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Wed, 22 Apr 2026 14:19:03 +0200 Subject: [PATCH 12/20] InWorkflowDataset fix: Don't remove graph on clear --- .../dataset/rdf/datasets/InWorkflowDataset.scala | 6 +++--- .../rdf/datasets/InWorkflowDatasetExecutor.scala | 13 ++++++++----- .../dataset/rdf/datasets/JenaModelDataset.scala | 10 +++++----- .../dataset/rdf/endpoint/JenaModelEndpoint.scala | 1 + .../InWorkflowDatasetIntegrationTest.scala | 16 ++++++++-------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala index 2d6ac692bd..d07f674a10 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala @@ -76,11 +76,11 @@ case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { override def source(implicit userContext: UserContext): DataSource = new SparqlSource(SparqlParams(), sparqlEndpoint) - override def entitySink(implicit userContext: UserContext): EntitySink = new SparqlSink(SparqlParams(), sparqlEndpoint) + override def entitySink(implicit userContext: UserContext): EntitySink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) - override def linkSink(implicit userContext: UserContext): LinkSink = new SparqlSink(SparqlParams(), sparqlEndpoint) + override def linkSink(implicit userContext: UserContext): LinkSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) - override def tripleSink(implicit userContext: UserContext): TripleSink = new SparqlSink(SparqlParams(), sparqlEndpoint) + override def tripleSink(implicit userContext: UserContext): TripleSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) } object InWorkflowDataset { diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index e564a67e39..40957de6d7 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -15,28 +15,31 @@ import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} */ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { - private var model: Model = _ - private var modelDataset: JenaModelDataset = _ + @volatile private var model: Model = _ + @volatile private var modelDataset: JenaModelDataset = _ @volatile private var initialized: Boolean = false // Stored on first access for cleanup in close(). // The executor is the only strong reference holder for the key, enabling WeakHashMap cleanup. 
- private var modelKey: Option[ExecutionModelKey] = None - private var plugin: Option[InWorkflowDataset] = None + @volatile private var modelKey: Option[ExecutionModelKey] = None + @volatile private var plugin: Option[InWorkflowDataset] = None override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = { if (!initialized) { + println("INITIALIZED") initialized = true val datasetPlugin = task.data.plugin // Reuse the execution's model if available, otherwise create a new one. model = execution.parentExecution.flatMap(datasetPlugin.findModel(_, task.id)).getOrElse(ModelFactory.createDefaultModel()) - modelDataset = JenaModelDataset(model) + modelDataset = JenaModelDataset.fromModel(model, dropGraphOnClear = false) // Register this executor's model so nested workflows can find it. val key = ExecutionModelKey(execution.executionId, task.id) datasetPlugin.registerModel(key, model) modelKey = Some(key) plugin = Some(datasetPlugin) + } else { + println("FOUND: " + model) } task.data.plugin.updateData(model) modelDataset diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/JenaModelDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/JenaModelDataset.scala index 096ef77f90..c7d38564ac 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/JenaModelDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/JenaModelDataset.scala @@ -7,7 +7,7 @@ import org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource} import org.silkframework.runtime.activity.UserContext -case class JenaModelDataset() extends RdfDataset { +case class JenaModelDataset(dropGraphOnClear: Boolean = true) extends RdfDataset { private val sparqlParams = SparqlParams() @@ -27,21 +27,21 @@ case class JenaModelDataset() extends RdfDataset { * Returns a link sink for writing entity links to the data set. */ override def linkSink(implicit userContext: UserContext): LinkSink = { - new SparqlSink(sparqlParams, sparqlEndpoint, dropGraphOnClear = true) + new SparqlSink(sparqlParams, sparqlEndpoint, dropGraphOnClear = dropGraphOnClear) } /** * Returns an entity sink for writing entities to the data set. 
*/ override def entitySink(implicit userContext: UserContext): EntitySink = { - new SparqlSink(sparqlParams, sparqlEndpoint, dropGraphOnClear = true) + new SparqlSink(sparqlParams, sparqlEndpoint, dropGraphOnClear = dropGraphOnClear) } } object JenaModelDataset { - def apply(model: Model): JenaModelDataset = { - val ds = JenaModelDataset() + def fromModel(model: Model, dropGraphOnClear: Boolean = true): JenaModelDataset = { + val ds = new JenaModelDataset(dropGraphOnClear) ds.sparqlEndpoint = new JenaModelEndpoint(model) ds } diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala index 178deb70e2..27e34ebd66 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala @@ -44,6 +44,7 @@ class JenaModelEndpoint(model: Model) extends JenaEndpoint { override def update(query: String) (implicit userContext: UserContext): Unit = { this.synchronized { + println("Update query: " + query) super.update(query) } } diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala index 9ae68cd761..c07265c27c 100644 --- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala +++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala @@ -61,14 +61,14 @@ class InWorkflowDatasetIntegrationTest extends AnyFlatSpec with Matchers with Co val output2BModel: Model = ModelFactory.createDefaultModel() // Register dataset tasks. - project.addTask("source1", DatasetSpec(JenaModelDataset(source1Model))) - project.addTask("source2", DatasetSpec(JenaModelDataset(source2Model))) + project.addTask("source1", DatasetSpec(JenaModelDataset.fromModel(source1Model))) + project.addTask("source2", DatasetSpec(JenaModelDataset.fromModel(source2Model))) project.addTask("inWorkflow1", DatasetSpec(InWorkflowDataset())) project.addTask("inWorkflow2", DatasetSpec(InWorkflowDataset())) - project.addTask("output1A", DatasetSpec(JenaModelDataset(output1AModel))) - project.addTask("output1B", DatasetSpec(JenaModelDataset(output1BModel))) - project.addTask("output2A", DatasetSpec(JenaModelDataset(output2AModel))) - project.addTask("output2B", DatasetSpec(JenaModelDataset(output2BModel))) + project.addTask("output1A", DatasetSpec(JenaModelDataset.fromModel(output1AModel))) + project.addTask("output1B", DatasetSpec(JenaModelDataset.fromModel(output1BModel))) + project.addTask("output2A", DatasetSpec(JenaModelDataset.fromModel(output2AModel))) + project.addTask("output2B", DatasetSpec(JenaModelDataset.fromModel(output2BModel))) // SparqlCopyCustomTask: reads via SparqlEndpointEntitySchema, outputs QuadEntitySchema. // Quads are written to InWorkflowDataset via withEntitySink → access() → executor model. @@ -158,9 +158,9 @@ class InWorkflowDatasetIntegrationTest extends AnyFlatSpec with Matchers with Co val outputModel: Model = ModelFactory.createDefaultModel() // Register tasks. 
- project.addTask("source", DatasetSpec(JenaModelDataset(sourceModel))) + project.addTask("source", DatasetSpec(JenaModelDataset.fromModel(sourceModel))) project.addTask("inWorkflowDs", DatasetSpec(InWorkflowDataset())) - project.addTask("output", DatasetSpec(JenaModelDataset(outputModel))) + project.addTask("output", DatasetSpec(JenaModelDataset.fromModel(outputModel))) // SparqlCopyCustomTask: copies triples into the InWorkflowDataset. val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" From c23cc9a7febd83014db18048b97725b39a206205 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Wed, 22 Apr 2026 14:19:54 +0200 Subject: [PATCH 13/20] Remove printlns --- .../dataset/rdf/datasets/InWorkflowDatasetExecutor.scala | 3 --- .../plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala | 1 - 2 files changed, 4 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index 40957de6d7..040d251b75 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -27,7 +27,6 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = { if (!initialized) { - println("INITIALIZED") initialized = true val datasetPlugin = task.data.plugin // Reuse the execution's model if available, otherwise create a new one. @@ -38,8 +37,6 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] datasetPlugin.registerModel(key, model) modelKey = Some(key) plugin = Some(datasetPlugin) - } else { - println("FOUND: " + model) } task.data.plugin.updateData(model) modelDataset diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala index 27e34ebd66..178deb70e2 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala @@ -44,7 +44,6 @@ class JenaModelEndpoint(model: Model) extends JenaEndpoint { override def update(query: String) (implicit userContext: UserContext): Unit = { this.synchronized { - println("Update query: " + query) super.update(query) } } From abffed208102f1dbab7e9c931a70b544cabfc228 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Thu, 23 Apr 2026 10:56:49 +0200 Subject: [PATCH 14/20] Deprecate internal datasets. 
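
Migration sketch (illustrative; the task and project names here are examples only):

    // Replace an "internal" dataset task with one of the recommended alternatives:
    project.addTask("intermediate", DatasetSpec(InWorkflowDataset()))  // scoped to one workflow run
    project.addTask("lookupStore", DatasetSpec(InMemoryDataset()))     // lives for the whole process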
--- .../org/silkframework/execution/local/LocalExecution.scala | 3 ++- .../org/silkframework/plugins/dataset/InternalDataset.scala | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala b/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala index b9a1db6a23..f7dbd28578 100644 --- a/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala +++ b/silk-core/src/main/scala/org/silkframework/execution/local/LocalExecution.scala @@ -111,7 +111,8 @@ object LocalExecution { id = "LocalInternalDataset", label = "Internal dataset (single graph)", description = - """Dataset for storing entities between workflow steps. This variant does use the same graph for all internal datasets in a workflow. The underlying dataset type can be configured using the `dataset.internal.*` configuration parameters.""" + """Dataset for storing entities between workflow steps. This variant does use the same graph for all internal datasets in a workflow. The underlying dataset type can be configured using the `dataset.internal.*` configuration parameters.""", + deprecation = "This dataset is deprecated and will be removed in a future version. Instead use either the \"In-workflow dataset\" or the \"In-memory dataset\"." ) case class LocalInternalDataset() extends InternalDatasetTrait { override protected def internalDatasetPluginImpl: Dataset = InternalDataset.createInternalDataset() diff --git a/silk-core/src/main/scala/org/silkframework/plugins/dataset/InternalDataset.scala b/silk-core/src/main/scala/org/silkframework/plugins/dataset/InternalDataset.scala index 90669d51cc..728a2d6d0a 100644 --- a/silk-core/src/main/scala/org/silkframework/plugins/dataset/InternalDataset.scala +++ b/silk-core/src/main/scala/org/silkframework/plugins/dataset/InternalDataset.scala @@ -16,7 +16,8 @@ import scala.util.Try id = "internal", label = "Internal dataset", categories = Array(DatasetCategories.embedded), - description = """Dataset for storing entities between workflow steps. The underlying dataset type can be configured using the `dataset.internal.*` configuration parameters.""" + description = """Dataset for storing entities between workflow steps. The underlying dataset type can be configured using the `dataset.internal.*` configuration parameters.""", + deprecation = "This dataset is deprecated and will be removed in a future version. Instead use either the \"In-workflow dataset\" or the \"In-memory dataset\"." ) case class InternalDataset( @Param(label = "graph URI", value = "The RDF graph that is used for storing internal data") From bdeac5e564be15df9b984e4d6e3a99e0c16eb344 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Tue, 28 Apr 2026 17:04:47 +0200 Subject: [PATCH 15/20] InWorkflowDatasetExecutor bugfix: Need to use DatasetSpecAccess so that RDF types are also written. 
--- .../dataset/rdf/datasets/InWorkflowDatasetExecutor.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala index 040d251b75..701c552103 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala @@ -2,7 +2,7 @@ package org.silkframework.plugins.dataset.rdf.datasets import org.apache.jena.rdf.model.{Model, ModelFactory} import org.silkframework.config.Task -import org.silkframework.dataset.{DatasetAccess, DatasetSpec} +import org.silkframework.dataset.{DatasetAccess, DatasetSpec, DatasetSpecAccess} import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} /** @@ -39,7 +39,7 @@ class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] plugin = Some(datasetPlugin) } task.data.plugin.updateData(model) - modelDataset + DatasetSpecAccess(task.data, modelDataset) } override def close(): Unit = { From d82eca9b102f2a8f301e331e5677fea9f30fea2a Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Tue, 28 Apr 2026 17:47:15 +0200 Subject: [PATCH 16/20] Merge InWorkflowDataset into InMemoryDataset and simplify code --- .../dataset/rdf/datasets/InMemoryDataset.md | 69 ++++-- .../dataset/rdf/datasets/InWorkflowDataset.md | 11 - .../plugins/dataset/rdf/RdfPlugins.scala | 5 +- .../rdf/datasets/InMemoryDataset.scala | 96 +++++--- .../datasets/InMemoryDatasetExecutor.scala | 54 +++++ .../rdf/datasets/InWorkflowDataset.scala | 93 -------- .../datasets/InWorkflowDatasetExecutor.scala | 53 ----- ...DatasetWorkflowScopedIntegrationTest.scala | 176 ++++++++++++++ ...> InMemoryDatasetWorkflowScopedTest.scala} | 111 ++++----- .../InWorkflowDatasetIntegrationTest.scala | 221 ------------------ 10 files changed, 400 insertions(+), 489 deletions(-) delete mode 100644 silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md create mode 100644 silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetExecutor.scala delete mode 100644 silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala delete mode 100644 silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala create mode 100644 silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedIntegrationTest.scala rename silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/{InWorkflowDatasetTest.scala => InMemoryDatasetWorkflowScopedTest.scala} (55%) delete mode 100644 silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md index 71cd845a7e..88a945e041 100644 --- 
a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md +++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md @@ -9,23 +9,27 @@ Typical use cases: ## 2. Behaviour and lifecycle -- The dataset maintains a single in-memory RDF model. -- All read and write operations go through a SPARQL endpoint over this model. -- Data exists only **in memory**: - - It is not persisted to disk by this dataset. - - After an application restart, the dataset contents are empty again. +The dataset maintains a single in-memory RDF model and exposes it via a SPARQL endpoint. Two lifecycle modes are available, controlled by the `workflowScoped` parameter: -Within a workflow: -- The dataset can be used as both **input** and **output**: - - Upstream operators can write triples/entities/links into it. - - Downstream operators can read from it via SPARQL-based mechanisms. +**Application-scoped mode** (default, `workflowScoped = false`): +- A single shared model is created when the dataset is instantiated. +- Data persists for the lifetime of the running application process. +- All workflow executions share the same in-memory graph. +- After an application restart, the dataset contents are empty again. + +**Workflow-scoped mode** (`workflowScoped = true`): +- A separate model is created for each workflow execution. +- Concurrent workflow executions are fully isolated from each other. +- A dataset task in a nested workflow shares the same model as the parent workflow for the same task identifier. Data written by the parent is available in the nested workflow and vice versa. +- If the dataset is read from outside a workflow context, the data from the most recently started executor is returned. +- When the workflow execution ends, the per-execution data is removed automatically. ## 3. Reading data - When used as a **source**, the dataset exposes its data as a SPARQL endpoint. - Queries and retrievals behave like against a normal SPARQL dataset: - Entity retrieval, path/type discovery, sampling, etc. are executed via SPARQL. -- There is no file backing this dataset; everything comes from what has been written into the in-memory model during the lifetime of the process. +- There is no file backing this dataset; everything comes from what has been written into the in-memory model during the lifetime of the process (application-scoped) or the workflow execution (workflow-scoped). ## 4. Writing data @@ -44,22 +48,40 @@ All three sinks ultimately write into the same in-memory graph; there is no sepa ## 5. Configuration -### Clear graph before workflow execution +### Workflow scoped + +- **Parameter:** `workflowScoped` (boolean) +- **Default:** `false` + +When `true` (workflow-scoped mode): +- Data is stored in a separate in-memory graph for each workflow execution. +- Concurrent workflow executions are fully isolated from each other. +- A dataset task in a nested workflow shares the same graph as the parent for the same task identifier. Data written by the parent is available in the nested workflow and vice versa. +- If the dataset is read from outside a workflow context, the data from the most recently started executor is returned. +- When the workflow execution ends, the per-execution data is removed automatically. + +When `false` (default, application-scoped mode): +- Data persists in a single shared graph for the lifetime of the running process. +- All workflow executions share the same graph. 
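+
+A minimal sketch of the difference (illustrative; assumes an implicit Silk `UserContext` is in scope and uses placeholder URIs):
+
+```scala
+// Application-scoped (default): one model, shared by every access for the
+// lifetime of the process.
+val shared = InMemoryDataset()
+shared.sparqlEndpoint.update("INSERT DATA { <urn:ex:s> <urn:ex:p> <urn:ex:o> }")
+shared.sparqlEndpoint.select("SELECT * WHERE { ?s ?p ?o }").bindings.size // 1
+
+// Workflow-scoped: the dataset executor creates and registers one model per
+// workflow execution, so concurrent runs do not see each other's triples.
+val scoped = InMemoryDataset(workflowScoped = true)
+```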
-- **Parameter:** `Clear graph before workflow execution` (boolean) -- **Default:** `true` +### Clear graph before workflow execution -Behaviour: +- **Parameter:** `clearGraphBeforeExecution` (boolean, **deprecated**) +- **Default:** `false` + +This parameter is deprecated. Use the **Clear dataset** operator in the workflow instead. + +Behaviour (application-scoped mode only): - If **true**: - - Before the dataset is used in a workflow execution, the graph is cleared (for writes via this dataset). + - Before the dataset is used in a workflow execution, the graph is cleared. - The workflow sees a **fresh, empty in-memory graph** at the start of the run. - If **false**: - Existing data in the in-memory graph is **preserved** when the workflow starts. - New data is added on top of whatever is already stored in the model. -This parameter controls whether the dataset behaves as a **fresh scratch graph per workflow run** or as a **longer-lived in-memory graph** within the lifetime of the running application. +This parameter has no effect when `workflowScoped = true` (the executor manages the lifecycle). ## 6. Limitations and recommendations @@ -79,14 +101,23 @@ This parameter controls whether the dataset behaves as a **fresh scratch graph p ## 7. Example usage scenarios -- Use as a **temporary integration graph**: +- Use as a **temporary integration graph** (application-scoped): - Multiple sources write into the in-memory dataset. - A downstream SPARQL-based operator queries the combined graph. -- Use as a **scratch area for experimentation**: +- Use as a **scratch area for experimentation** (application-scoped): - Quickly test mapping or linking logic by writing output into the in-memory dataset. - Inspect the result via SPARQL without configuring an external endpoint. -- Use as a **small lookup store**: +- Use as a **small lookup store** (application-scoped): - Preload a small set of reference triples (e.g. codes or mappings). - Let workflows query these during execution. + +- Use as a **workflow-local intermediate store** (workflow-scoped): + - Multiple operators in a single workflow run write intermediate RDF results. + - Downstream operators in the same run read from the dataset without affecting parallel runs. + +- Use in **nested workflows** (workflow-scoped): + - A parent workflow writes data into a workflow-scoped dataset. + - A nested sub-workflow reads and enriches the same data. + - After the nested workflow completes, the parent can read the enriched result. diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md deleted file mode 100644 index a61e592817..0000000000 --- a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.md +++ /dev/null @@ -1,11 +0,0 @@ -The **in-workflow dataset** is an embedded RDF store that holds all data **in memory**, scoped to a single workflow execution. It is intended as a **transient working graph** for passing data between operators within one run. - -Typical use cases: -- Passing intermediate RDF results between operators within a single workflow execution. -- Storing triples produced by one operator for consumption by a downstream operator in the same run. -- Keeping workflow-local data isolated from other concurrent workflow executions. 
- -**Nested workflows:** When a workflow contains a nested workflow, the nested workflow shares the same in-workflow dataset model as the parent for each dataset task with the same identifier. Data written by the parent workflow is available in the nested workflow, and data written by the nested workflow is visible to the parent after the nested workflow completes. Dataset tasks with different identifiers remain isolated. - -If the dataset is read from outside a workflow, the data from the most recently started executor will be returned. -For large graphs, use an external RDF store. \ No newline at end of file diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala index 36e2e88c87..835a2491e0 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/RdfPlugins.scala @@ -1,6 +1,6 @@ package org.silkframework.plugins.dataset.rdf -import org.silkframework.plugins.dataset.rdf.datasets.{AlignmentDataset, InMemoryDataset, InWorkflowDataset, InWorkflowDatasetExecutor, RdfFileDataset, SparqlDataset} +import org.silkframework.plugins.dataset.rdf.datasets.{AlignmentDataset, InMemoryDataset, InMemoryDatasetExecutor, RdfFileDataset, SparqlDataset} import org.silkframework.plugins.dataset.rdf.executors.{LocalSparqlCopyExecutor, LocalSparqlSelectExecutor, LocalSparqlUpdateExecutor} import org.silkframework.plugins.dataset.rdf.tasks.{SparqlCopyCustomTask, SparqlSelectCustomTask, SparqlUpdateCustomTask} import org.silkframework.plugins.dataset.rdf.vocab.{InMemoryVocabularyManager, RdfFilesVocabularyManager, RdfProjectFilesVocabularyManager, RdfVocabularyManager} @@ -14,7 +14,6 @@ class RdfPlugins extends PluginModule { classOf[SparqlDataset], classOf[AlignmentDataset], classOf[InMemoryDataset], - classOf[InWorkflowDataset], classOf[RdfVocabularyManager], classOf[RdfFilesVocabularyManager], classOf[RdfProjectFilesVocabularyManager], @@ -28,7 +27,7 @@ class RdfPlugins extends PluginModule { classOf[LocalSparqlSelectExecutor], classOf[LocalSparqlUpdateExecutor], classOf[LocalSparqlCopyExecutor], - classOf[InWorkflowDatasetExecutor] + classOf[InMemoryDatasetExecutor] ) } diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala index 44f7f7c7c5..d0a40efde6 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala @@ -1,24 +1,26 @@ package org.silkframework.plugins.dataset.rdf.datasets -import org.apache.jena.rdf.model.ModelFactory +import org.apache.jena.rdf.model.{Model, ModelFactory} import org.silkframework.dataset._ import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint, SparqlParams} -import org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint +import org.silkframework.execution.local.LocalExecution import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource} +import org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint import org.silkframework.runtime.activity.UserContext import 
org.silkframework.runtime.plugin.annotations.{Param, Plugin, PluginReference} +import org.silkframework.util.Identifier + +import java.util.Collections @Plugin( id = InMemoryDataset.pluginId, label = "In-memory dataset", categories = Array(DatasetCategories.embedded), - description = "A Dataset that holds all data in-memory.", + description = "A dataset that holds all data in-memory. " + + "In the default (application-scoped) mode, data persists for the lifetime of the running process. " + + "In workflow-scoped mode, data is isolated per workflow execution and shared with nested workflows that reference the same dataset task.", documentationFile = "InMemoryDataset.md", relatedPlugins = Array( - new PluginReference( - id = InWorkflowDataset.pluginId, - description = "Both datasets hold data in-memory, but the in-workflow dataset is scoped to a single workflow execution and cleared afterwards, while the in-memory dataset persists for the lifetime of the application." - ), new PluginReference( id = SparqlDataset.pluginId, description = "Data in the in-memory dataset does not persist beyond the running process. The SPARQL endpoint dataset connects to an external store that persists independently, which means switching between them changes not just where the data lives but whether it survives execution at all." @@ -29,33 +31,75 @@ import org.silkframework.runtime.plugin.annotations.{Param, Plugin, PluginRefere ) ) ) -case class InMemoryDataset(@Param(label = "Clear graph before workflow execution (deprecated)", - value = "This is deprecated, use the 'Clear dataset' operator instead to clear a dataset in a workflow. If set to true this will clear this dataset before it is used in a workflow execution.", - advanced = true) - clearGraphBeforeExecution: Boolean = false) extends RdfDataset with TripleSinkDataset { - - private val model = ModelFactory.createDefaultModel() - - override val sparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(model) +case class InMemoryDataset( + @Param(label = "Workflow-scoped", + value = "If true, data is isolated per workflow execution and cleared after the execution ends, " + + "sharing data with nested workflows that reference the same dataset task. " + + "If false (default), data persists for the lifetime of the application process.") + workflowScoped: Boolean = false, + @Param(label = "Clear graph before workflow execution (deprecated)", + value = "This is deprecated, use the 'Clear dataset' operator instead to clear a dataset in a workflow. If set to true this will clear this dataset before it is used in a workflow execution.", + advanced = true) + clearGraphBeforeExecution: Boolean = false +) extends RdfDataset with TripleSinkDataset { /** - * Returns a data source for reading entities from the data set. - */ - override def source(implicit userContext: UserContext): DataSource = new SparqlSource(SparqlParams(), sparqlEndpoint) + * The active Jena model backing this dataset. + * + * Application-scoped mode: initialised once and never reassigned; holds data for the + * lifetime of the process. + * + * Workflow-scoped mode: replaced by [[updateData]] each time [[InMemoryDatasetExecutor]] + * activates a new execution. + */ + @volatile private[datasets] var model: Model = ModelFactory.createDefaultModel() /** - * Returns a entity sink for writing entities to the data set. 
- */ - override def entitySink(implicit userContext: UserContext): EntitySink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = clearGraphBeforeExecution) + * Models for all current workflow executions, keyed by [[ExecutionModelKey]]. + * Uses a WeakHashMap so entries are automatically cleaned up by GC when the key is no longer referenced. + * Entries are also explicitly removed by [[InMemoryDatasetExecutor.close()]] when the execution finishes. + */ + private val executionModels: java.util.Map[ExecutionModelKey, Model] = + Collections.synchronizedMap(new java.util.WeakHashMap[ExecutionModelKey, Model]()) - /** - * Returns a link sink for writing entity links to the data set. - */ - override def linkSink(implicit userContext: UserContext): LinkSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = clearGraphBeforeExecution) + private[datasets] def registerModel(key: ExecutionModelKey, model: Model): Unit = + executionModels.put(key, model) + + private[datasets] def findModel(execution: LocalExecution, taskId: Identifier): Option[Model] = + Option(executionModels.get(ExecutionModelKey(execution.executionId, taskId))).orElse( + execution.parentExecution.flatMap(findModel(_, taskId)) + ) + + private[datasets] def removeModel(key: ExecutionModelKey): Unit = + executionModels.remove(key) + + /** Switches [[model]] to the given execution's model so out-of-workflow reads see current data. */ + private[datasets] def updateData(newModel: Model): Unit = + model = newModel + + // In workflow-scoped mode the executor owns the model lifecycle, so sinks must not drop the graph. + private def dropGraph: Boolean = !workflowScoped && clearGraphBeforeExecution + + override def sparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(model) - override def tripleSink(implicit userContext: UserContext): TripleSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = clearGraphBeforeExecution) + override def source(implicit userContext: UserContext): DataSource = + new SparqlSource(SparqlParams(), sparqlEndpoint) + + override def entitySink(implicit userContext: UserContext): EntitySink = + new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = dropGraph) + + override def linkSink(implicit userContext: UserContext): LinkSink = + new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = dropGraph) + + override def tripleSink(implicit userContext: UserContext): TripleSink = + new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = dropGraph) } object InMemoryDataset { final val pluginId = "inMemory" } + +/** + * Key for the [[InMemoryDataset.executionModels]] WeakHashMap (workflow-scoped mode). 
+ */ +private[datasets] case class ExecutionModelKey(executionId: Identifier, taskId: Identifier) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetExecutor.scala new file mode 100644 index 0000000000..82ad59d2fc --- /dev/null +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetExecutor.scala @@ -0,0 +1,54 @@ +package org.silkframework.plugins.dataset.rdf.datasets + +import org.apache.jena.rdf.model.{Model, ModelFactory} +import org.silkframework.config.Task +import org.silkframework.dataset.{DatasetAccess, DatasetSpec, DatasetSpecAccess} +import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} + +/** + * Executor for [[InMemoryDataset]]. + * + * In application-scoped mode (`workflowScoped == false`), wraps the dataset's static model. + * + * In workflow-scoped mode (`workflowScoped == true`), holds a separate Jena model for the + * duration of a workflow execution. If the execution has a parent (nested workflow), the parent's + * model for the same task is reused so that the nested workflow sees the data written by the parent. + */ +class InMemoryDatasetExecutor extends LocalDatasetExecutor[InMemoryDataset] { + + // Used only in workflow-scoped mode + @volatile private var model: Model = _ + @volatile private var modelDataset: JenaModelDataset = _ + @volatile private var initialized: Boolean = false + @volatile private var modelKey: Option[ExecutionModelKey] = None + @volatile private var plugin: Option[InMemoryDataset] = None + + override def access(task: Task[DatasetSpec[InMemoryDataset]], execution: LocalExecution): DatasetAccess = { + val datasetPlugin = task.data.plugin + if (datasetPlugin.workflowScoped) { + if (!initialized) { + initialized = true + model = execution.parentExecution + .flatMap(datasetPlugin.findModel(_, task.id)) + .getOrElse(ModelFactory.createDefaultModel()) + modelDataset = JenaModelDataset.fromModel(model, dropGraphOnClear = false) + val key = ExecutionModelKey(execution.executionId, task.id) + datasetPlugin.registerModel(key, model) + modelKey = Some(key) + plugin = Some(datasetPlugin) + } + datasetPlugin.updateData(model) + DatasetSpecAccess(task.data, modelDataset) + } else { + val ds = JenaModelDataset.fromModel(datasetPlugin.model, dropGraphOnClear = datasetPlugin.clearGraphBeforeExecution) + DatasetSpecAccess(task.data, ds) + } + } + + override def close(): Unit = { + for { + key <- modelKey + p <- plugin + } p.removeModel(key) + } +} diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala deleted file mode 100644 index d07f674a10..0000000000 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDataset.scala +++ /dev/null @@ -1,93 +0,0 @@ -package org.silkframework.plugins.dataset.rdf.datasets - -import org.apache.jena.rdf.model.{Model, ModelFactory} -import org.silkframework.dataset._ -import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint, SparqlParams} -import org.silkframework.execution.local.LocalExecution -import org.silkframework.plugins.dataset.rdf.access.{SparqlSink, SparqlSource} -import 
org.silkframework.plugins.dataset.rdf.endpoint.JenaModelEndpoint -import org.silkframework.runtime.activity.UserContext -import org.silkframework.runtime.plugin.annotations.{Plugin, PluginReference} -import org.silkframework.util.Identifier - -import java.util.Collections - -@Plugin( - id = InWorkflowDataset.pluginId, - label = "In-workflow dataset", - categories = Array(DatasetCategories.embedded), - description = "A Dataset that holds all data in-memory, scoped to a single workflow execution. " + - "The data is stored separately for each workflow execution. " + - "A dataset in a nested workflow shares the same model as the parent, so data written by the parent is available in the nested workflow and vice versa.", - documentationFile = "InWorkflowDataset.md", - relatedPlugins = Array( - new PluginReference( - id = InMemoryDataset.pluginId, - description = "Both datasets hold data in-memory, but the in-memory dataset persists for the lifetime of the running process, " + - "while the in-workflow dataset is scoped to a single workflow execution." - ) - ) -) -case class InWorkflowDataset() extends RdfDataset with TripleSinkDataset { - - // Starts as an empty model so reads before any execution see empty (not null) results. - // Replaced by a new JenaModelEndpoint when an executor registers its model via updateData. - @volatile - private var mostRecentSparqlEndpoint: SparqlEndpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel()) - - /** - * Models for all current workflow executions, keyed by [[ExecutionModelKey]]. - * Uses a WeakHashMap so entries are automatically cleaned up by GC when the key is no longer referenced. - * When the executor is GC'd, the entry is cleaned up. - * Entries are also explicitly removed by [[InWorkflowDatasetExecutor.close()]] when the execution finishes. - */ - private val executionModels: java.util.Map[ExecutionModelKey, Model] = - Collections.synchronizedMap(new java.util.WeakHashMap[ExecutionModelKey, Model]()) - - /** Registers the model for a given execution. */ - private[datasets] def registerModel(key: ExecutionModelKey, model: Model): Unit = { - executionModels.put(key, model) - } - - /** - * Finds the model for the closest ancestor execution that has one registered for the given task. - * Walks up the parentExecution chain, matching by both execution ID and task ID. - */ - private[datasets] def findModel(execution: LocalExecution, taskId: Identifier): Option[Model] = { - Option(executionModels.get(ExecutionModelKey(execution.executionId, taskId))).orElse( - execution.parentExecution.flatMap(findModel(_, taskId)) - ) - } - - /** Removes the model for a given execution. Called by the executor on close(). */ - private[datasets] def removeModel(key: ExecutionModelKey): Unit = { - executionModels.remove(key) - } - - /** - * Called by [[InWorkflowDatasetExecutor]] when a new execution starts. - * Updates sparqlEndpoint so that direct reads see the latest executor's model. 
- */ - private[datasets] def updateData(model: Model): Unit = { - mostRecentSparqlEndpoint = new JenaModelEndpoint(model) - } - - override def sparqlEndpoint: SparqlEndpoint = mostRecentSparqlEndpoint - - override def source(implicit userContext: UserContext): DataSource = new SparqlSource(SparqlParams(), sparqlEndpoint) - - override def entitySink(implicit userContext: UserContext): EntitySink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) - - override def linkSink(implicit userContext: UserContext): LinkSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) - - override def tripleSink(implicit userContext: UserContext): TripleSink = new SparqlSink(SparqlParams(), sparqlEndpoint, dropGraphOnClear = false) -} - -object InWorkflowDataset { - final val pluginId = "inWorkflow" -} - -/** - * Key for the [[InWorkflowDataset.executionModels]] WeakHashMap. - */ -private[datasets] case class ExecutionModelKey(executionId: Identifier, taskId: Identifier) diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala deleted file mode 100644 index 701c552103..0000000000 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetExecutor.scala +++ /dev/null @@ -1,53 +0,0 @@ -package org.silkframework.plugins.dataset.rdf.datasets - -import org.apache.jena.rdf.model.{Model, ModelFactory} -import org.silkframework.config.Task -import org.silkframework.dataset.{DatasetAccess, DatasetSpec, DatasetSpecAccess} -import org.silkframework.execution.local.{LocalDatasetExecutor, LocalExecution} - -/** - * Executor for [[InWorkflowDataset]]. - * - * Holds the actual Jena model for the duration of a workflow execution. - * - * If the execution has a parent (nested workflow), the parent's model for the - * same task is reused so that the nested workflow sees the data written by the parent. - */ -class InWorkflowDatasetExecutor extends LocalDatasetExecutor[InWorkflowDataset] { - - @volatile private var model: Model = _ - @volatile private var modelDataset: JenaModelDataset = _ - - @volatile private var initialized: Boolean = false - - // Stored on first access for cleanup in close(). - // The executor is the only strong reference holder for the key, enabling WeakHashMap cleanup. - @volatile private var modelKey: Option[ExecutionModelKey] = None - @volatile private var plugin: Option[InWorkflowDataset] = None - - override def access(task: Task[DatasetSpec[InWorkflowDataset]], execution: LocalExecution): DatasetAccess = { - if (!initialized) { - initialized = true - val datasetPlugin = task.data.plugin - // Reuse the execution's model if available, otherwise create a new one. - model = execution.parentExecution.flatMap(datasetPlugin.findModel(_, task.id)).getOrElse(ModelFactory.createDefaultModel()) - modelDataset = JenaModelDataset.fromModel(model, dropGraphOnClear = false) - // Register this executor's model so nested workflows can find it. 
- val key = ExecutionModelKey(execution.executionId, task.id) - datasetPlugin.registerModel(key, model) - modelKey = Some(key) - plugin = Some(datasetPlugin) - } - task.data.plugin.updateData(model) - DatasetSpecAccess(task.data, modelDataset) - } - - override def close(): Unit = { - for { - key <- modelKey - p <- plugin - } { - p.removeModel(key) - } - } -} diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedIntegrationTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedIntegrationTest.scala new file mode 100644 index 0000000000..6d7a8d16ee --- /dev/null +++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedIntegrationTest.scala @@ -0,0 +1,176 @@ +package org.silkframework.plugins.dataset.rdf.datasets + +import org.apache.jena.rdf.model.{Model, ModelFactory} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers +import org.silkframework.config.{MetaData, Prefixes} +import org.silkframework.dataset.DatasetSpec +import org.silkframework.entity.paths.UntypedPath +import org.silkframework.plugins.dataset.rdf.tasks.SparqlCopyCustomTask +import org.silkframework.rule._ +import org.silkframework.runtime.activity.{ActivityMonitor, UserContext} +import org.silkframework.util.{ConfigTestTrait, Uri} +import org.silkframework.workspace.activity.workflow.{LocalWorkflowExecutorGeneratingProvenance, Workflow, WorkflowDataset, WorkflowExecutionReportWithProvenance, WorkflowOperator} +import org.silkframework.workspace.{InMemoryWorkspaceTestTrait, ProjectConfig, WorkspaceFactory} + +/** + * Integration test for [[InMemoryDataset]] with `workflowScoped = true` within a real workflow execution. 
+ */ +class InMemoryDatasetWorkflowScopedIntegrationTest extends AnyFlatSpec with Matchers with ConfigTestTrait { + + implicit val userContext: UserContext = UserContext.Empty + implicit val prefixes: Prefixes = Prefixes.empty + + override def propertyMap: Map[String, Option[String]] = Map( + "workspace.provider.plugin" -> Some("inMemoryWorkspaceProvider") + ) + + "InMemoryDataset (workflowScoped = true)" should "isolate data between two instances and share data across multiple uses of the same instance within a workflow" in { + val workspace = WorkspaceFactory().workspace + val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("inMemoryWorkflowScopedIntegrationTest")))) + + val source1Model: Model = ModelFactory.createDefaultModel() + source1Model.createResource("http://s1") + .addProperty(source1Model.createProperty("http://p"), source1Model.createResource("http://o1")) + + val source2Model: Model = ModelFactory.createDefaultModel() + source2Model.createResource("http://s2") + .addProperty(source2Model.createProperty("http://p"), source2Model.createResource("http://o2")) + + val output1AModel: Model = ModelFactory.createDefaultModel() + val output1BModel: Model = ModelFactory.createDefaultModel() + val output2AModel: Model = ModelFactory.createDefaultModel() + val output2BModel: Model = ModelFactory.createDefaultModel() + + project.addTask("source1", DatasetSpec(JenaModelDataset.fromModel(source1Model))) + project.addTask("source2", DatasetSpec(JenaModelDataset.fromModel(source2Model))) + project.addTask("inMemory1", DatasetSpec(InMemoryDataset(workflowScoped = true))) + project.addTask("inMemory2", DatasetSpec(InMemoryDataset(workflowScoped = true))) + project.addTask("output1A", DatasetSpec(JenaModelDataset.fromModel(output1AModel))) + project.addTask("output1B", DatasetSpec(JenaModelDataset.fromModel(output1BModel))) + project.addTask("output2A", DatasetSpec(JenaModelDataset.fromModel(output2AModel))) + project.addTask("output2B", DatasetSpec(JenaModelDataset.fromModel(output2BModel))) + + val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" + project.addTask("copyToInMemory1", SparqlCopyCustomTask(copyQuery, tempFile = false)) + project.addTask("copyToInMemory2", SparqlCopyCustomTask(copyQuery, tempFile = false)) + + val identityTransform = TransformSpec( + selection = DatasetSelection("dummy", Uri("")), + mappingRule = RootMappingRule(MappingRules( + propertyRules = Seq( + DirectMapping( + id = "pmap", + sourcePath = UntypedPath(Uri("http://p")), + mappingTarget = MappingTarget(Uri("http://p")) + ) + ) + )) + ) + project.addTask("readFromInMemory1A", identityTransform) + project.addTask("readFromInMemory1B", identityTransform) + project.addTask("readFromInMemory2A", identityTransform) + project.addTask("readFromInMemory2B", identityTransform) + + val workflow = Workflow( + operators = Seq( + WorkflowOperator(Seq(Some("source1")), "copyToInMemory1", Seq("inMemory1"), Seq.empty, (0, 0), "copyToInMemory1", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("source2")), "copyToInMemory2", Seq("inMemory2"), Seq.empty, (0, 300), "copyToInMemory2", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inMemory1")), "readFromInMemory1A", Seq("output1A"), Seq.empty, (200, 0), "readFromInMemory1A", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inMemory1")), "readFromInMemory1B", Seq("output1B"), Seq.empty, (200,100), "readFromInMemory1B", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inMemory2")), "readFromInMemory2A", 
Seq("output2A"), Seq.empty, (200,300), "readFromInMemory2A", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inMemory2")), "readFromInMemory2B", Seq("output2B"), Seq.empty, (200,400), "readFromInMemory2B", None, Seq.empty, Seq.empty) + ), + datasets = Seq( + WorkflowDataset(Seq.empty, "source1", Seq("copyToInMemory1"), (0, 0), "source1", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq.empty, "source2", Seq("copyToInMemory2"), (0, 300), "source2", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("copyToInMemory1")), "inMemory1", Seq("readFromInMemory1A", "readFromInMemory1B"), (100, 0), "inMemory1", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("copyToInMemory2")), "inMemory2", Seq("readFromInMemory2A", "readFromInMemory2B"), (100,300), "inMemory2", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInMemory1A")), "output1A", Seq.empty, (300, 0), "output1A", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInMemory1B")), "output1B", Seq.empty, (300,100), "output1B", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInMemory2A")), "output2A", Seq.empty, (300,300), "output2A", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInMemory2B")), "output2B", Seq.empty, (300,400), "output2B", None, Seq.empty, Seq.empty) + ) + ) + project.addTask("workflow", workflow) + val workflowTask = project.task[Workflow]("workflow") + + val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask) + val monitor = new ActivityMonitor("monitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty)) + executor.run(monitor) + + output1AModel.size() must be > 0L + output1BModel.size() must be > 0L + output2AModel.size() must be > 0L + output2BModel.size() must be > 0L + + output1AModel.isIsomorphicWith(output1BModel) mustBe true + output2AModel.isIsomorphicWith(output2BModel) mustBe true + output1AModel.isIsomorphicWith(output2AModel) mustBe false + } + + it should "propagate InMemoryDataset (workflowScoped) data from a parent workflow to a nested workflow" in { + val workspace = WorkspaceFactory().workspace + val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("nestedWorkflowScopedTest")))) + + val sourceModel: Model = ModelFactory.createDefaultModel() + sourceModel.createResource("http://nested/s1") + .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o1")) + sourceModel.createResource("http://nested/s2") + .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o2")) + + val outputModel: Model = ModelFactory.createDefaultModel() + + project.addTask("source", DatasetSpec(JenaModelDataset.fromModel(sourceModel))) + project.addTask("inMemoryDs", DatasetSpec(InMemoryDataset(workflowScoped = true))) + project.addTask("output", DatasetSpec(JenaModelDataset.fromModel(outputModel))) + + val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" + project.addTask("copyToInMemory", SparqlCopyCustomTask(copyQuery, tempFile = false)) + + val identityTransform = TransformSpec( + selection = DatasetSelection("dummy", Uri("")), + mappingRule = RootMappingRule(MappingRules( + propertyRules = Seq( + DirectMapping( + id = "pmap", + sourcePath = UntypedPath(Uri("http://p")), + mappingTarget = MappingTarget(Uri("http://p")) + ) + ) + )) + ) + project.addTask("readFromInMemory", identityTransform) + + val nestedWorkflow = Workflow( + operators = Seq( + WorkflowOperator(Seq(Some("inMemoryDs")), "readFromInMemory", 
Seq("output"), Seq.empty, (100, 0), "readFromInMemory", None, Seq.empty, Seq.empty) + ), + datasets = Seq( + WorkflowDataset(Seq.empty, "inMemoryDs", Seq("readFromInMemory"), (0, 0), "inMemoryDs", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("readFromInMemory")), "output", Seq.empty, (200, 0), "output", None, Seq.empty, Seq.empty) + ) + ) + project.addTask("nestedWorkflow", nestedWorkflow) + + val parentWorkflow = Workflow( + operators = Seq( + WorkflowOperator(Seq(Some("source")), "copyToInMemory", Seq("inMemoryDs"), Seq.empty, (100, 0), "copyToInMemory", None, Seq.empty, Seq.empty), + WorkflowOperator(Seq(Some("inMemoryDs")), "nestedWorkflow", Seq.empty, Seq.empty, (300, 0), "nestedWorkflow", None, Seq.empty, Seq.empty) + ), + datasets = Seq( + WorkflowDataset(Seq.empty, "source", Seq("copyToInMemory"), (0, 0), "source", None, Seq.empty, Seq.empty), + WorkflowDataset(Seq(Some("copyToInMemory")), "inMemoryDs", Seq("nestedWorkflow"), (200, 0), "inMemoryDs", None, Seq.empty, Seq.empty) + ) + ) + project.addTask("parentWorkflow", parentWorkflow) + val workflowTask = project.task[Workflow]("parentWorkflow") + + val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask) + val monitor = new ActivityMonitor("nestedMonitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty)) + executor.run(monitor) + + outputModel.size() must be > 0L + outputModel.listSubjects().toList.size() mustBe 2 + } +} diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedTest.scala similarity index 55% rename from silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala rename to silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedTest.scala index 230c3b9679..6745122745 100644 --- a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetTest.scala +++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDatasetWorkflowScopedTest.scala @@ -3,40 +3,38 @@ package org.silkframework.plugins.dataset.rdf.datasets import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.must.Matchers import org.silkframework.config.{PlainTask, Prefixes} -import org.silkframework.dataset.DatasetSpec -import org.silkframework.dataset.rdf.RdfDataset +import org.silkframework.dataset.{DatasetAccess, DatasetSpec, DatasetSpecAccess} +import org.silkframework.dataset.rdf.{RdfDataset, SparqlEndpoint} import org.silkframework.execution.local.LocalExecution import org.silkframework.runtime.activity.UserContext import org.silkframework.util.Identifier -class InWorkflowDatasetTest extends AnyFlatSpec with Matchers { +class InMemoryDatasetWorkflowScopedTest extends AnyFlatSpec with Matchers { private implicit val userContext: UserContext = UserContext.Empty private implicit val prefixes: Prefixes = Prefixes.empty - private val dataset = InWorkflowDataset() + private val dataset = InMemoryDataset(workflowScoped = true) private val task = PlainTask("test", DatasetSpec(dataset)) private val execution = LocalExecution() private val tripleCountQuery = "SELECT * WHERE {?s ?p ?o}" - behavior of "InWorkflowDataset" + behavior of "InMemoryDataset (workflowScoped = true)" it should "store 
data in the executor, not in the dataset itself" in {
-    val executor = new InWorkflowDatasetExecutor()
-    val executorEndpoint = executor.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val executor = new InMemoryDatasetExecutor()
+    val executorEndpoint = sparqlEndpoint(executor.access(task, execution))
     executorEndpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
-    // The executor's model contains the written data
     executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
-    // After access() the dataset's sparqlEndpoint reflects the executor's model
     dataset.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 1
   }

   it should "retain data after close() is called" in {
-    val executor = new InWorkflowDatasetExecutor()
-    val executorEndpoint = executor.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val executor = new InMemoryDatasetExecutor()
+    val executorEndpoint = sparqlEndpoint(executor.access(task, execution))
     executorEndpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
     executorEndpoint.select(tripleCountQuery).bindings.size mustBe 1
@@ -47,140 +45,127 @@ class InWorkflowDatasetTest extends AnyFlatSpec with Matchers {
   }

   it should "update the dataset sparqlEndpoint to the latest executor's model" in {
-    val dataset2 = InWorkflowDataset()
+    val dataset2 = InMemoryDataset(workflowScoped = true)
     val task2 = PlainTask("test2", DatasetSpec(dataset2))
-    val executor1 = new InWorkflowDatasetExecutor()
-    val executor2 = new InWorkflowDatasetExecutor()
+    val executor1 = new InMemoryDatasetExecutor()
+    val executor2 = new InMemoryDatasetExecutor()

-    val endpoint1 = executor1.access(task2, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val endpoint1 = sparqlEndpoint(executor1.access(task2, execution))
     endpoint1.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
-    // dataset2 now points to executor1's model — one triple visible
     dataset2.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 1

-    // executor2.access() replaces the endpoint — dataset2 now sees executor2's (empty) model
     executor2.access(task2, execution)
     dataset2.sparqlEndpoint.select(tripleCountQuery).bindings.size mustBe 0
   }

   it should "isolate data between concurrent executions" in {
-    val executor1 = new InWorkflowDatasetExecutor()
-    val executor2 = new InWorkflowDatasetExecutor()
-    val endpoint1 = executor1.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
-    val endpoint2 = executor2.access(task, execution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val executor1 = new InMemoryDatasetExecutor()
+    val executor2 = new InMemoryDatasetExecutor()
+    val endpoint1 = sparqlEndpoint(executor1.access(task, execution))
+    val endpoint2 = sparqlEndpoint(executor2.access(task, execution))

     endpoint1.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
     endpoint2.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
     endpoint2.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")

-    // Each executor only sees data from its own model
     endpoint1.select(tripleCountQuery).bindings.size mustBe 1
     endpoint2.select(tripleCountQuery).bindings.size mustBe 2
   }
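
  // A minimal sketch of the registry behind this isolation: executors register
  // their models in a synchronized WeakHashMap keyed by the ExecutionModelKey
  // case class. Structural equality drives lookups, while entry reclamation
  // follows reachability of the *stored* key object (illustrative, GC-dependent):
  //
  //   val registry = java.util.Collections.synchronizedMap(
  //     new java.util.WeakHashMap[ExecutionModelKey, Model]())
  //   var key = ExecutionModelKey(Identifier("exec1"), Identifier("taskA"))
  //   registry.put(key, ModelFactory.createDefaultModel())
  //   registry.get(ExecutionModelKey(Identifier("exec1"), Identifier("taskA"))) // found: equal key
  //   key = null // once no strong reference to the stored key remains,
  //              // the entry may be reclaimed even without an explicit close()
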

   it should "use parent execution data in the nested executor" in {
-    val nestedDataset = InWorkflowDataset()
+    val nestedDataset = InMemoryDataset(workflowScoped = true)
     val nestedTask = PlainTask("nestedTest", DatasetSpec(nestedDataset))

-    // Parent execution writes data
     val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
-    val parentExecutor = new InWorkflowDatasetExecutor()
-    val parentEndpoint = parentExecutor.access(nestedTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val parentExecutor = new InMemoryDatasetExecutor()
+    val parentEndpoint = sparqlEndpoint(parentExecutor.access(nestedTask, parentExecution))
     parentEndpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
     parentEndpoint.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")
     parentEndpoint.select(tripleCountQuery).bindings.size mustBe 2

-    // Child execution with parent reference
     val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
-    val childExecutor = new InWorkflowDatasetExecutor()
-    val childEndpoint = childExecutor.access(nestedTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val childExecutor = new InMemoryDatasetExecutor()
+    val childEndpoint = sparqlEndpoint(childExecutor.access(nestedTask, childExecution))

-    // Child sees the parent's data
     childEndpoint.select(tripleCountQuery).bindings.size mustBe 2
   }

   it should "share the model between parent and child executions" in {
-    val nestedDataset = InWorkflowDataset()
+    val nestedDataset = InMemoryDataset(workflowScoped = true)
     val nestedTask = PlainTask("nestedTest", DatasetSpec(nestedDataset))

-    // Parent execution writes one triple
     val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
-    val parentExecutor = new InWorkflowDatasetExecutor()
-    val parentEndpoint = parentExecutor.access(nestedTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val parentExecutor = new InMemoryDatasetExecutor()
+    val parentEndpoint = sparqlEndpoint(parentExecutor.access(nestedTask, parentExecution))
     parentEndpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")

-    // Child execution references the same model and writes more
     val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
-    val childExecutor = new InWorkflowDatasetExecutor()
-    val childEndpoint = childExecutor.access(nestedTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val childExecutor = new InMemoryDatasetExecutor()
+    val childEndpoint = sparqlEndpoint(childExecutor.access(nestedTask, childExecution))
     childEndpoint.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")

-    // Both see the same data since they share the model
     childEndpoint.select(tripleCountQuery).bindings.size mustBe 2
     parentEndpoint.select(tripleCountQuery).bindings.size mustBe 2
   }
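
  // Why parent and child agree above, as a minimal sketch: both endpoints wrap
  // the *same* Jena Model instance, so no copying or synchronisation of data is
  // involved (JenaModelEndpoint is the constructor used throughout this series):
  //
  //   val shared = ModelFactory.createDefaultModel()
  //   shared.add(shared.createResource("http://s1"),
  //              shared.createProperty("http://p"),
  //              shared.createResource("http://o1"))
  //   val endpointA = new JenaModelEndpoint(shared)
  //   val endpointB = new JenaModelEndpoint(shared)
  //   // endpointB immediately sees the statement written via endpointA.
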

   it should "only reuse the parent model with the same task id in a nested execution" in {
-    val sharedDataset = InWorkflowDataset()
+    val sharedDataset = InMemoryDataset(workflowScoped = true)
     val taskA = PlainTask("datasetA", DatasetSpec(sharedDataset))
     val taskB = PlainTask("datasetB", DatasetSpec(sharedDataset))

     val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))

-    // Parent registers models for both taskA and taskB
-    val parentExecutorA = new InWorkflowDatasetExecutor()
-    val endpointA = parentExecutorA.access(taskA, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val parentExecutorA = new InMemoryDatasetExecutor()
+    val endpointA = sparqlEndpoint(parentExecutorA.access(taskA, parentExecution))
     endpointA.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
-    val parentExecutorB = new InWorkflowDatasetExecutor()
-    val endpointB = parentExecutorB.access(taskB, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val parentExecutorB = new InMemoryDatasetExecutor()
+    val endpointB = sparqlEndpoint(parentExecutorB.access(taskB, parentExecution))
     endpointB.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
     endpointB.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")

-    // Child execution for taskA — must see only taskA's data (1 triple), not taskB's (2 triples)
     val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
-    val childExecutorA = new InWorkflowDatasetExecutor()
-    val childEndpointA = childExecutorA.access(taskA, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val childExecutorA = new InMemoryDatasetExecutor()
+    val childEndpointA = sparqlEndpoint(childExecutorA.access(taskA, childExecution))
     childEndpointA.select(tripleCountQuery).bindings.size mustBe 1

-    // Child execution for taskB — must see only taskB's data (2 triples)
-    val childExecutorB = new InWorkflowDatasetExecutor()
-    val childEndpointB = childExecutorB.access(taskB, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val childExecutorB = new InMemoryDatasetExecutor()
+    val childEndpointB = sparqlEndpoint(childExecutorB.access(taskB, childExecution))
     childEndpointB.select(tripleCountQuery).bindings.size mustBe 2
   }

   it should "create a new model in a nested execution if the parent has no matching task id" in {
-    val sharedDataset = InWorkflowDataset()
+    val sharedDataset = InMemoryDataset(workflowScoped = true)
     val parentTask = PlainTask("parentOnly", DatasetSpec(sharedDataset))
     val childTask = PlainTask("childOnly", DatasetSpec(sharedDataset))

     val parentExecution = LocalExecution(false, workflowId = Some(Identifier("parentWf")))
-    // Parent registers a model for parentOnly
-    val parentExecutor = new InWorkflowDatasetExecutor()
-    val parentEndpoint = parentExecutor.access(parentTask, parentExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val parentExecutor = new InMemoryDatasetExecutor()
+    val parentEndpoint = sparqlEndpoint(parentExecutor.access(parentTask, parentExecution))
     parentEndpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")

-    // Child execution for a different task id — must NOT see parent data
     val childExecution = LocalExecution(false, workflowId = Some(Identifier("childWf")), parentExecution = Some(parentExecution))
-    val childExecutor = new InWorkflowDatasetExecutor()
-    val childEndpoint = childExecutor.access(childTask, childExecution).asInstanceOf[RdfDataset].sparqlEndpoint
+    val childExecutor = new InMemoryDatasetExecutor()
+    val childEndpoint = sparqlEndpoint(childExecutor.access(childTask, childExecution))
     childEndpoint.select(tripleCountQuery).bindings.size mustBe 0
   }

   it should "clean up model on close()" in {
-    val nestedDataset = InWorkflowDataset()
+    val nestedDataset = InMemoryDataset(workflowScoped = true)
     val nestedTask = PlainTask("cleanupTest", DatasetSpec(nestedDataset))
     val exec = LocalExecution(false, workflowId = Some(Identifier("wf")))

-    val executor = new InWorkflowDatasetExecutor()
-    executor.access(nestedTask, exec).asInstanceOf[RdfDataset].sparqlEndpoint
+    val executor = new InMemoryDatasetExecutor()
+    sparqlEndpoint(executor.access(nestedTask, exec))
       .update("INSERT DATA { <http://s1> <http://p> <http://o1> }")

-    // Model is registered
     nestedDataset.findModel(exec, nestedTask.id) must not be empty

-    // After close, model is removed from the dataset's map
     executor.close()
     nestedDataset.findModel(exec, nestedTask.id) mustBe empty
   }
-}
+
+  private def sparqlEndpoint(access: DatasetAccess): SparqlEndpoint =
+    access.asInstanceOf[DatasetSpecAccess].datasetAccess.asInstanceOf[RdfDataset].sparqlEndpoint
+}
\ No newline at end of file
diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala
deleted file mode 100644
index c07265c27c..0000000000
--- 
a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/datasets/InWorkflowDatasetIntegrationTest.scala +++ /dev/null @@ -1,221 +0,0 @@ -package org.silkframework.plugins.dataset.rdf.datasets - -import org.apache.jena.rdf.model.{Model, ModelFactory} -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.matchers.must.Matchers -import org.silkframework.config.{ConfigTest, MetaData, Prefixes} -import org.silkframework.dataset.DatasetSpec -import org.silkframework.entity.paths.UntypedPath -import org.silkframework.plugins.dataset.rdf.tasks.SparqlCopyCustomTask -import org.silkframework.rule._ -import org.silkframework.runtime.activity.{ActivityMonitor, UserContext} -import org.silkframework.util.{ConfigTestTrait, Uri} -import org.silkframework.workspace.activity.workflow.{LocalWorkflowExecutorGeneratingProvenance, Workflow, WorkflowDataset, WorkflowExecutionReportWithProvenance, WorkflowOperator} -import org.silkframework.workspace.{InMemoryWorkspaceTestTrait, ProjectConfig, WorkspaceFactory} - -/** - * Integration test for [[InWorkflowDataset]] within a real workflow execution. - * - * Tests that: - * - Two InWorkflowDataset instances are fully isolated from each other. - * - Multiple uses of the same InWorkflowDataset instance within one workflow - * execution all see the same data. - * - * Workflow structure: - * source1 → copyToInWorkflow1 → inWorkflow1 → readFromInWorkflow1A → output1A - * └──────→ readFromInWorkflow1B → output1B - * source2 → copyToInWorkflow2 → inWorkflow2 → readFromInWorkflow2A → output2A - * └──────→ readFromInWorkflow2B → output2B - * - * Writing to each InWorkflowDataset goes through SparqlCopyCustomTask (QuadEntitySchema → - * withEntitySink → access() → executor model). Reading goes through TransformSpec - * (FixedSchemaPort(MultiEntitySchema) → handleMultiEntitySchema → access().source → - * executor model), which is the only read path that correctly reaches the executor model. - */ -class InWorkflowDatasetIntegrationTest extends AnyFlatSpec with Matchers with ConfigTestTrait { - - implicit val userContext: UserContext = UserContext.Empty - implicit val prefixes: Prefixes = Prefixes.empty - - override def propertyMap: Map[String, Option[String]] = Map( - "workspace.provider.plugin" -> Some("inMemoryWorkspaceProvider") - ) - - "InWorkflowDataset" should "isolate data between two instances and share data across multiple uses of the same instance within a workflow" in { - val workspace = WorkspaceFactory().workspace - val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("inWorkflowIntegrationTest")))) - - // Source datasets pre-populated with distinct triples. - val source1Model: Model = ModelFactory.createDefaultModel() - source1Model.createResource("http://s1") - .addProperty(source1Model.createProperty("http://p"), source1Model.createResource("http://o1")) - - val source2Model: Model = ModelFactory.createDefaultModel() - source2Model.createResource("http://s2") - .addProperty(source2Model.createProperty("http://p"), source2Model.createResource("http://o2")) - - // Output datasets — empty initially, filled by the workflow. - val output1AModel: Model = ModelFactory.createDefaultModel() - val output1BModel: Model = ModelFactory.createDefaultModel() - val output2AModel: Model = ModelFactory.createDefaultModel() - val output2BModel: Model = ModelFactory.createDefaultModel() - - // Register dataset tasks. 
- project.addTask("source1", DatasetSpec(JenaModelDataset.fromModel(source1Model))) - project.addTask("source2", DatasetSpec(JenaModelDataset.fromModel(source2Model))) - project.addTask("inWorkflow1", DatasetSpec(InWorkflowDataset())) - project.addTask("inWorkflow2", DatasetSpec(InWorkflowDataset())) - project.addTask("output1A", DatasetSpec(JenaModelDataset.fromModel(output1AModel))) - project.addTask("output1B", DatasetSpec(JenaModelDataset.fromModel(output1BModel))) - project.addTask("output2A", DatasetSpec(JenaModelDataset.fromModel(output2AModel))) - project.addTask("output2B", DatasetSpec(JenaModelDataset.fromModel(output2BModel))) - - // SparqlCopyCustomTask: reads via SparqlEndpointEntitySchema, outputs QuadEntitySchema. - // Quads are written to InWorkflowDataset via withEntitySink → access() → executor model. - val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" - project.addTask("copyToInWorkflow1", SparqlCopyCustomTask(copyQuery, tempFile = false)) - project.addTask("copyToInWorkflow2", SparqlCopyCustomTask(copyQuery, tempFile = false)) - - // TransformSpec: reads via FixedSchemaPort(MultiEntitySchema) → handleMultiEntitySchema - // → access().source → executor model. Identity mapping preserves the property. - val identityTransform = TransformSpec( - selection = DatasetSelection("dummy", Uri("")), - mappingRule = RootMappingRule(MappingRules( - propertyRules = Seq( - DirectMapping( - id = "pmap", - sourcePath = UntypedPath(Uri("http://p")), - mappingTarget = MappingTarget(Uri("http://p")) - ) - ) - )) - ) - project.addTask("readFromInWorkflow1A", identityTransform) - project.addTask("readFromInWorkflow1B", identityTransform) - project.addTask("readFromInWorkflow2A", identityTransform) - project.addTask("readFromInWorkflow2B", identityTransform) - - // Build the workflow. - // inWorkflow1 is written to once (by copyToInWorkflow1) and then read twice - // (by readFromInWorkflow1A and readFromInWorkflow1B), exercising the - // alreadyExecuted / multiple-reads behaviour. 
- val workflow = Workflow( - operators = Seq( - WorkflowOperator(Seq(Some("source1")), "copyToInWorkflow1", Seq("inWorkflow1"), Seq.empty, (0, 0), "copyToInWorkflow1", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("source2")), "copyToInWorkflow2", Seq("inWorkflow2"), Seq.empty, (0, 300), "copyToInWorkflow2", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("inWorkflow1")), "readFromInWorkflow1A", Seq("output1A"), Seq.empty, (200, 0), "readFromInWorkflow1A", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("inWorkflow1")), "readFromInWorkflow1B", Seq("output1B"), Seq.empty, (200,100), "readFromInWorkflow1B", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("inWorkflow2")), "readFromInWorkflow2A", Seq("output2A"), Seq.empty, (200,300), "readFromInWorkflow2A", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("inWorkflow2")), "readFromInWorkflow2B", Seq("output2B"), Seq.empty, (200,400), "readFromInWorkflow2B", None, Seq.empty, Seq.empty) - ), - datasets = Seq( - WorkflowDataset(Seq.empty, "source1", Seq("copyToInWorkflow1"), (0, 0), "source1", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq.empty, "source2", Seq("copyToInWorkflow2"), (0, 300), "source2", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("copyToInWorkflow1")), "inWorkflow1", Seq("readFromInWorkflow1A", "readFromInWorkflow1B"), (100, 0), "inWorkflow1", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("copyToInWorkflow2")), "inWorkflow2", Seq("readFromInWorkflow2A", "readFromInWorkflow2B"), (100,300), "inWorkflow2", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("readFromInWorkflow1A")), "output1A", Seq.empty, (300, 0), "output1A", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("readFromInWorkflow1B")), "output1B", Seq.empty, (300,100), "output1B", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("readFromInWorkflow2A")), "output2A", Seq.empty, (300,300), "output2A", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("readFromInWorkflow2B")), "output2B", Seq.empty, (300,400), "output2B", None, Seq.empty, Seq.empty) - ) - ) - project.addTask("workflow", workflow) - val workflowTask = project.task[Workflow]("workflow") - - // Execute the workflow. - val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask) - val monitor = new ActivityMonitor("monitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty)) - executor.run(monitor) - - // Each InWorkflowDataset instance received data and fed its downstream operators. - output1AModel.size() must be > 0L - output1BModel.size() must be > 0L - output2AModel.size() must be > 0L - output2BModel.size() must be > 0L - - // Multiple uses of the same instance: both reads of inWorkflow1 see identical data. - output1AModel.isIsomorphicWith(output1BModel) mustBe true - - // Multiple uses of the same instance: both reads of inWorkflow2 see identical data. - output2AModel.isIsomorphicWith(output2BModel) mustBe true - - // Isolation: inWorkflow1 (source1 data) and inWorkflow2 (source2 data) are separate. - output1AModel.isIsomorphicWith(output2AModel) mustBe false - } - - it should "propagate InWorkflowDataset data from a parent workflow to a nested workflow" in { - val workspace = WorkspaceFactory().workspace - val project = workspace.createProject(ProjectConfig(metaData = MetaData(Some("nestedWorkflowTest")))) - - // Source dataset with test data. 
- val sourceModel: Model = ModelFactory.createDefaultModel() - sourceModel.createResource("http://nested/s1") - .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o1")) - sourceModel.createResource("http://nested/s2") - .addProperty(sourceModel.createProperty("http://p"), sourceModel.createResource("http://nested/o2")) - - // Output dataset — empty initially, filled by the nested workflow. - val outputModel: Model = ModelFactory.createDefaultModel() - - // Register tasks. - project.addTask("source", DatasetSpec(JenaModelDataset.fromModel(sourceModel))) - project.addTask("inWorkflowDs", DatasetSpec(InWorkflowDataset())) - project.addTask("output", DatasetSpec(JenaModelDataset.fromModel(outputModel))) - - // SparqlCopyCustomTask: copies triples into the InWorkflowDataset. - val copyQuery = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }" - project.addTask("copyToInWorkflow", SparqlCopyCustomTask(copyQuery, tempFile = false)) - - // TransformSpec: reads from InWorkflowDataset via MultiEntitySchema path. - val identityTransform = TransformSpec( - selection = DatasetSelection("dummy", Uri("")), - mappingRule = RootMappingRule(MappingRules( - propertyRules = Seq( - DirectMapping( - id = "pmap", - sourcePath = UntypedPath(Uri("http://p")), - mappingTarget = MappingTarget(Uri("http://p")) - ) - ) - )) - ) - project.addTask("readFromInWorkflow", identityTransform) - - // Nested workflow: reads from inWorkflowDs and writes to output. - val nestedWorkflow = Workflow( - operators = Seq( - WorkflowOperator(Seq(Some("inWorkflowDs")), "readFromInWorkflow", Seq("output"), Seq.empty, (100, 0), "readFromInWorkflow", None, Seq.empty, Seq.empty) - ), - datasets = Seq( - WorkflowDataset(Seq.empty, "inWorkflowDs", Seq("readFromInWorkflow"), (0, 0), "inWorkflowDs", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("readFromInWorkflow")), "output", Seq.empty, (200, 0), "output", None, Seq.empty, Seq.empty) - ) - ) - project.addTask("nestedWorkflow", nestedWorkflow) - - // Parent workflow: source → copyToInWorkflow → inWorkflowDs → nestedWorkflow - val parentWorkflow = Workflow( - operators = Seq( - WorkflowOperator(Seq(Some("source")), "copyToInWorkflow", Seq("inWorkflowDs"), Seq.empty, (100, 0), "copyToInWorkflow", None, Seq.empty, Seq.empty), - WorkflowOperator(Seq(Some("inWorkflowDs")), "nestedWorkflow", Seq.empty, Seq.empty, (300, 0), "nestedWorkflow", None, Seq.empty, Seq.empty) - ), - datasets = Seq( - WorkflowDataset(Seq.empty, "source", Seq("copyToInWorkflow"), (0, 0), "source", None, Seq.empty, Seq.empty), - WorkflowDataset(Seq(Some("copyToInWorkflow")), "inWorkflowDs", Seq("nestedWorkflow"), (200, 0), "inWorkflowDs", None, Seq.empty, Seq.empty) - ) - ) - project.addTask("parentWorkflow", parentWorkflow) - val workflowTask = project.task[Workflow]("parentWorkflow") - - // Execute the parent workflow. - val executor = LocalWorkflowExecutorGeneratingProvenance(workflowTask) - val monitor = new ActivityMonitor("nestedMonitor", initialValue = Some(WorkflowExecutionReportWithProvenance.empty)) - executor.run(monitor) - - // The nested workflow must have read the data written by the parent. - outputModel.size() must be > 0L - - // Verify the output contains exactly the source triples (2 resources with property). 
- outputModel.listSubjects().toList.size() mustBe 2 - } -} From a0674165e17ee9085d43b449eb753bb5ff8f7895 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Tue, 28 Apr 2026 17:55:26 +0200 Subject: [PATCH 17/20] Update In-Memory dataset icon --- libs/gui-elements | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/gui-elements b/libs/gui-elements index 64359f57da..945fe9637d 160000 --- a/libs/gui-elements +++ b/libs/gui-elements @@ -1 +1 @@ -Subproject commit 64359f57dad3410bef34cc87d2f1e8a7cd7b8708 +Subproject commit 945fe9637dedb053be6fafea166376b7e8e75a1a From 032b12c672ad5f1aad03653376afabd4ee2c38e6 Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Tue, 28 Apr 2026 18:00:20 +0200 Subject: [PATCH 18/20] Updated doc of In-Memory dataset and renamed it to "In-memory Knowledge Graph" --- .../plugins/dataset/rdf/datasets/InMemoryDataset.md | 6 ++++++ .../plugins/dataset/rdf/datasets/InMemoryDataset.scala | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md index 88a945e041..59cc976a98 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md +++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md @@ -93,6 +93,12 @@ This parameter has no effect when `workflowScoped = true` (the executor manages - Contents are lost when the application/server is restarted. - Do not treat this dataset as long-term storage. +- **SPARQL engine** + - The dataset is backed by [Apache Jena](https://jena.apache.org/), exposed through a Jena in-memory SPARQL endpoint. + +- **No named-graph support** + - Only the **default graph** is available. Writing triples into a named graph is not possible. + - **Scope** - Best suited for: - small to medium intermediate results, diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala index d0a40efde6..f15ed10dac 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.scala @@ -14,7 +14,7 @@ import java.util.Collections @Plugin( id = InMemoryDataset.pluginId, - label = "In-memory dataset", + label = "In-memory Knowledge Graph", categories = Array(DatasetCategories.embedded), description = "A dataset that holds all data in-memory. " + "In the default (application-scoped) mode, data persists for the lifetime of the running process. 
" + From ba4d2ab0dc5522393eb0531d4bfa2d90b0961f8f Mon Sep 17 00:00:00 2001 From: Robert Isele Date: Tue, 28 Apr 2026 18:37:16 +0200 Subject: [PATCH 19/20] JenaModelEndpoint: Limit the memory that can be written --- .../rdf/endpoint/JenaModelEndpoint.scala | 28 ++++++++++- .../rdf/endpoint/JenaModelEndpointTest.scala | 49 +++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpointTest.scala diff --git a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala index 178deb70e2..437c7e45cc 100644 --- a/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala +++ b/silk-plugins/silk-plugins-rdf/src/main/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpoint.scala @@ -1,17 +1,33 @@ package org.silkframework.plugins.dataset.rdf.endpoint import org.apache.jena.query.{DatasetFactory, Query, QueryExecution, QueryExecutionFactory} -import org.apache.jena.rdf.model.Model -import org.apache.jena.sparql.core.DatasetGraphFactory +import org.apache.jena.rdf.listeners.StatementListener +import org.apache.jena.rdf.model.{Model, Statement} import org.apache.jena.update.{UpdateExecutionFactory, UpdateFactory, UpdateProcessor} import org.silkframework.dataset.rdf.{QuadIterator, SparqlEndpoint, SparqlParams, SparqlResults} import org.silkframework.runtime.activity.UserContext +import org.silkframework.runtime.resource.Resource /** * A SPARQL endpoint which executes all queries on a Jena Model. */ class JenaModelEndpoint(model: Model) extends JenaEndpoint { + private val byteLimit: Long = Resource.maxInMemorySize() + @volatile private var estimatedBytesWritten: Long = 0L + + model.register(new StatementListener { + override def addedStatement(s: Statement): Unit = + estimatedBytesWritten += statementBytes(s) + override def removedStatement(s: Statement): Unit = + estimatedBytesWritten = math.max(0L, estimatedBytesWritten - statementBytes(s)) + }) + + private def statementBytes(s: Statement): Long = + s.getSubject.toString.length.toLong + + s.getPredicate.toString.length.toLong + + s.getObject.toString.length.toLong + override protected def createQueryExecution(query: Query): QueryExecution = { QueryExecutionFactory.create(query, model) } @@ -45,6 +61,14 @@ class JenaModelEndpoint(model: Model) extends JenaEndpoint { (implicit userContext: UserContext): Unit = { this.synchronized { super.update(query) + if (estimatedBytesWritten > byteLimit) { + throw new RuntimeException( + s"In-memory Knowledge Graph has exceeded the size limit of $byteLimit bytes " + + s"(estimated bytes written: $estimatedBytesWritten). " + + s"Reduce the amount of data written or increase the limit by configuring " + + s"'${Resource.maxInMemorySizeParameterName}'." 

diff --git a/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpointTest.scala b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpointTest.scala
new file mode 100644
index 0000000000..5e3e4a3965
--- /dev/null
+++ b/silk-plugins/silk-plugins-rdf/src/test/scala/org/silkframework/plugins/dataset/rdf/endpoint/JenaModelEndpointTest.scala
@@ -0,0 +1,49 @@
+package org.silkframework.plugins.dataset.rdf.endpoint
+
+import org.apache.jena.rdf.model.ModelFactory
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.must.Matchers
+import org.silkframework.runtime.activity.UserContext
+import org.silkframework.runtime.resource.Resource
+import org.silkframework.util.ConfigTestTrait
+
+class JenaModelEndpointTest extends AnyFlatSpec with Matchers {
+
+  private implicit val userContext: UserContext = UserContext.Empty
+
+  behavior of "JenaModelEndpoint"
+
+  it should "not throw when data written is within the memory limit" in {
+    ConfigTestTrait.withConfig(Resource.maxInMemorySizeParameterName -> Some("100b")) {
+      val endpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel())
+      // Two triples at 9+8+9 = 26 estimated bytes each, 52 in total: under the 100b limit
+      endpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
+      endpoint.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")
+    }
+  }
+
+  it should "throw when data written exceeds the memory limit" in {
+    ConfigTestTrait.withConfig(Resource.maxInMemorySizeParameterName -> Some("50b")) {
+      val endpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel())
+      // First triple: 26 estimated bytes, within the 50b limit
+      endpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
+      // Second triple pushes the total to 52 bytes, exceeding the limit
+      an[RuntimeException] must be thrownBy {
+        endpoint.update("INSERT DATA { <http://s2> <http://p> <http://o2> }")
+      }
+    }
+  }
+
+  it should "throw for a short generative update that produces more data than the query string" in {
+    ConfigTestTrait.withConfig(Resource.maxInMemorySizeParameterName -> Some("50b")) {
+      val endpoint = new JenaModelEndpoint(ModelFactory.createDefaultModel())
+      // Write one triple (26 estimated bytes), within the 50b limit
+      endpoint.update("INSERT DATA { <http://s1> <http://p> <http://o1> }")
+      // A short WHERE-clause query that generates a new triple from the existing one:
+      // <http://s1> <http://generate> <http://o1> → 9+15+9 = 33 more bytes, total 59 > 50b
+      an[RuntimeException] must be thrownBy {
+        endpoint.update("INSERT { ?s <http://generate> ?o } WHERE { ?s ?p ?o }")
+      }
+    }
+  }
+}

From c19134248969bbffa611a11d2ea133bbaee1bacb Mon Sep 17 00:00:00 2001
From: Robert Isele
Date: Tue, 28 Apr 2026 18:41:16 +0200
Subject: [PATCH 20/20] JenaModelEndpoint: document memory limit

---
 .../plugins/dataset/rdf/datasets/InMemoryDataset.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md
index 59cc976a98..878ba25dc7 100644
--- a/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md
+++ b/silk-plugins/silk-plugins-rdf/src/main/resources/org/silkframework/plugins/dataset/rdf/datasets/InMemoryDataset.md
@@ -88,6 +88,7 @@ This parameter has no effect when `workflowScoped = true` (the executor manages
 
 - **Memory-bound**
   - All data is kept in memory; large graphs will increase memory usage and may impact performance.
   - For large or production RDF graphs, use an external RDF store and a SPARQL dataset instead.
+  - A size limit is enforced: once the estimated size of data written to the dataset exceeds the value of `org.silkframework.runtime.resource.Resource.maxInMemorySize`, the workflow fails with an error. This prevents the JVM from running out of heap memory.
 
 - **No persistence**
   - Contents are lost when the application/server is restarted.
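
To inspect the limit that applies at runtime, the two Resource members that
patch 19 itself uses can be read back directly. A minimal sketch; the concrete
configuration key and its default value are deliberately not stated here, as
they are not shown in this series:

    import org.silkframework.runtime.resource.Resource

    object MemoryLimitSketch extends App {
      val limitBytes: Long = Resource.maxInMemorySize()
      println(s"In-memory datasets may hold about $limitBytes estimated bytes " +
        s"(configured via '${Resource.maxInMemorySizeParameterName}')")
    }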