From f8394bb787d8f8b1594c61383d7bb57a6c278565 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Mon, 1 Dec 2025 08:48:05 +0100 Subject: [PATCH 01/11] Adding space-id-to-space-key map --- src/Analyzer/ConfluenceAnalyzer.php | 17 +++++++++++------ src/Converter/ConfluenceConverter.php | 18 +++++++++++++++++- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index 1063a0a..97cfe8a 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -95,6 +95,7 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets parent::__construct( $config, $workspace, $buckets ); $this->customBuckets = new DataBuckets( [ 'space-id-to-prefix-map', + 'space-id-to-space-key-map', 'space-key-to-prefix-map', 'space-name-to-prefix-map', 'space-id-to-name-map', @@ -351,6 +352,9 @@ private function buildSpaceMaps( DOMDocument $dom ): void { $this->customBuckets->addData( 'space-key-to-prefix-map', $spaceKey, $customSpacePrefix, false, true ); + $this->customBuckets->addData( + 'space-id-to-space-key-map', $spaceId, $spaceKey, false, true + ); $this->customBuckets->addData( 'space-name-to-prefix-map', $spaceName, $customSpacePrefix, false, true ); @@ -619,6 +623,7 @@ private function buildUserMap( DOMDocument $dom ): void { */ private function buildPageMaps( DOMDocument $dom ): void { $spaceIdToPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); + $spaceIdToSpaceKeyMap = $this->customBuckets->getBucketData( 'space-id-to-space-key-map' ); $spaceIdHomepages = $this->customBuckets->getBucketData( 'space-id-homepages' ); $pageIdParentPageIdMap = $this->customBuckets->getBucketData( 'page-id-to-parent-page-id-map' ); $pageIdConfluendTitleMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-title-map' ); @@ -643,13 +648,14 @@ private function buildPageMaps( DOMDocument $dom ): void { if ( $spaceId === null ) { return; } 
- if ( !isset( $spaceIdToPrefixMap[$spaceId] ) ) { + if ( !isset( $spaceIdToSpaceKeyMap[$spaceId] ) ) { return; } - $prefix = $spaceIdToPrefixMap[$spaceId]; + $spaceKey = $spaceIdToSpaceKeyMap[$spaceId]; + if ( isset( $this->advancedConfig['analyzer-include-spacekey'] ) - && !in_array( strtolower( $prefix ), $this->advancedConfig['analyzer-include-spacekey'] ) + && !in_array( strtolower( $spaceKey ), $this->advancedConfig['analyzer-include-spacekey'] ) ) { return; } @@ -1078,7 +1084,6 @@ private function makeAttachmentReference( XMLHelper $xmlHelper, DOMElement $atta } private function checkTitles(): void { - $spacePrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); $pagesTitlesMap = $this->customBuckets->getBucketData( 'pages-titles-map' ); $hasInvalidTitles = false; @@ -1115,7 +1120,7 @@ private function checkTitles(): void { $hasInvalidNamespaces = true; } - if ( mb_strlen( urlencode( $text ) ) > 255 ) { + if ( strlen( $text ) > 255 ) { $this->customBuckets->addData( 'invalid-titles', 'length', $title, @@ -1124,7 +1129,7 @@ private function checkTitles(): void { $hasInvalidTitles = true; } } else { - if ( mb_strlen( urlencode( $title ) ) > 255 ) { + if ( strlen( $title ) > 255 ) { $this->customBuckets->addData( 'invalid-titles', 'length', $title, diff --git a/src/Converter/ConfluenceConverter.php b/src/Converter/ConfluenceConverter.php index a7404a4..f53f94e 100644 --- a/src/Converter/ConfluenceConverter.php +++ b/src/Converter/ConfluenceConverter.php @@ -8,6 +8,7 @@ use DOMXPath; use HalloWelt\MediaWiki\Lib\Migration\Converter\PandocHTML; use HalloWelt\MediaWiki\Lib\Migration\DataBuckets; +use HalloWelt\MediaWiki\Lib\Migration\ExecutionTime; use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface; use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Converter\Postprocessor\FixImagesWithExternalUrl; @@ -134,7 +135,8 @@ public function __construct( $config, Workspace $workspace ) { $this->customBuckets = 
new DataBuckets( [ 'title-uploads', - 'title-uploads-fail' + 'title-uploads-fail', + 'converter-body-content-id-execution-time', ] ); } @@ -149,6 +151,7 @@ public function setOutput( Output $output ) { * @inheritDoc */ protected function doConvert( SplFileInfo $file ): string { + $executionTime = new ExecutionTime(); $this->output->writeln( $file->getPathname() ); $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->dataBuckets ); $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->dataBuckets ); @@ -218,6 +221,15 @@ protected function doConvert( SplFileInfo $file ): string { $this->postProcessLinks(); $this->postprocessWikiText(); + $executionTimeString = $executionTime->getHumanReadableTime(); + $this->customBuckets->addData( + 'converter-body-content-id-execution-time', + $bodyContentId, + $executionTimeString, + false, + true + ); + $this->customBuckets->saveToWorkspace( $this->workspace ); return $this->wikiText; @@ -782,7 +794,11 @@ private function buildMediaExcludeList( $wikiText ): array { * @return string */ private function getCurrentPageTitle(): string { + $prefix = ''; $spaceIdPrefixMap = $this->dataBuckets->getBucketData( 'space-id-to-prefix-map' ); + if ( !isset( $spaceIdPrefixMap[$this->currentSpace] ) ) { + $this->output->writeln( "SpaceId {$this->currentSpace} not found in spaceIdPrefixMap" ); + } $prefix = $spaceIdPrefixMap[$this->currentSpace]; $currentPageTitle = $this->currentPageTitle; From fa5dc7fa78ad42e3e13b5f8b52cfc484ee087296 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Mon, 1 Dec 2025 08:49:50 +0100 Subject: [PATCH 02/11] Fix space id hompage composer override --- src/Composer/ConfluenceComposer.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index b5b8d09..fe3e9eb 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -92,6 +92,9 @@ public function buildXML( Builder $builder ) 
{ $bodyContentIDMainpageID = []; $pagesToBodyContents = array_flip( $bodyContentsToPagesMap ); foreach ( $spaceIDHomepagesMap as $spaceID => $homepageID ) { + if ( !isset( $pagesToBodyContents[$homepageID] ) ) { + continue; + } $bodyContentsID = $pagesToBodyContents[$homepageID]; $bodyContentIDMainpageID[$bodyContentsID] = $homepageID; } From 03d2f68a7d4ea2d1836cfcce95346849b42886d6 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Tue, 2 Dec 2025 08:25:17 +0100 Subject: [PATCH 03/11] Fix composer --- src/Composer/ConfluenceComposer.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index fe3e9eb..43ceb43 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -144,8 +144,10 @@ public function buildXML( Builder $builder ) { $namespace = $this->getNamespace( $pageTitle ); if ( isset( $this->advancedConfig['composer-include-namespace'] ) - && in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) + && !in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) ) { + $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); + } else { $builder->addRevision( $pageTitle, $pageContent, $timestamp ); // Append attachments @@ -175,8 +177,6 @@ public function buildXML( Builder $builder ) { } } } - } else { - $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); } } From 47db56a601a46bac6aff1eaf626e580509a30d04 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Tue, 2 Dec 2025 13:55:38 +0100 Subject: [PATCH 04/11] Improve invalid title handling --- src/Analyzer/ConfluenceAnalyzer.php | 27 +++++++-------------------- src/Command/Analyze.php | 12 ++++++++++++ 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index 97cfe8a..c035f75 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ 
b/src/Analyzer/ConfluenceAnalyzer.php @@ -23,22 +23,11 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, IOutputAwareInterface { - /** - * - * @var DOMDocument - */ - private $dom = null; - /** * @var DataBuckets */ private $customBuckets = null; - /** - * @var XMLHelper - */ - private $helper = null; - /** * @var LoggerInterface */ @@ -123,15 +112,12 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets 'attachment-orig-filename-target-filename-map', 'attachment-id-to-target-filename-map', 'filenames-to-filetitles-map', + 'invalid-titles-page-id-to-title', + 'invalid-titles-attachment-id-to-title', 'invalid-titles', 'invalid-namespaces', - 'debug-attachment-id-to-target-filename', - 'debug-missing-attachment-id-to-filename', - 'debug-attachment-page-to-attachment-id', - 'debug-fallback-attachment-id-to-target-filename', - 'debug-additional-attachment-id-to-target-filename', ] ); $this->logger = new NullLogger(); @@ -673,12 +659,13 @@ private function buildPageMaps( DOMDocument $dom ): void { try { $targetTitle = $titleBuilder->buildTitle( $pageNode ); } catch ( InvalidTitleException $ex ) { - $this->buckets->addData( 'title-invalids', $pageId, $ex->getInvalidTitle() ); - return; + $this->customBuckets->addData( 'invalid-titles-page-id-to-title', $pageId, $ex->getInvalidTitle() ); + // We don't want to loose this page. 
Title can be modified after analyze process + $targetTitle = $ex->getInvalidTitle(); } if ( $targetTitle === '' ) { - $this->buckets->addData( 'title-invalids', $pageId, $targetTitle ); + $this->customBuckets->addData( 'invalid-titles-page-id-to-title', $pageId, $targetTitle ); return; } @@ -1023,7 +1010,7 @@ private function makeAttachmentTargetFilenameFromData( $targetName = $filenameBuilder->buildFromAttachmentData( $attachmentSpaceId, $attachmentOrigFilename, $shortTargetTitle ); } catch ( InvalidTitleException $ex ) { - $this->buckets->addData( 'title-invalids', $attachmentId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( 'invalid-titles-attachment-id-to-title', $attachmentId, $ex->getInvalidTitle() ); $this->logger->error( $ex->getMessage() ); return '###INVALID###'; } diff --git a/src/Command/Analyze.php b/src/Command/Analyze.php index b4e4de7..a379a6f 100644 --- a/src/Command/Analyze.php +++ b/src/Command/Analyze.php @@ -65,4 +65,16 @@ private function readConfigFile( &$config ): void { } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + 'files', + 'title-attachments', + 'title-revisions' + ]; + } } From 13e44a11af0cc4ac83e59d5e52499b4878d27b85 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Thu, 4 Dec 2025 10:15:11 +0100 Subject: [PATCH 05/11] Rename bucktes for better workflow --- src/Analyzer/ConfluenceAnalyzer.php | 274 +++++++++++------- src/Command/Analyze.php | 20 +- src/Command/Compose.php | 17 ++ src/Command/Extract.php | 12 + src/Composer/ConfluenceComposer.php | 32 +- .../_defaultpages/Template/SpaceDetails | 2 +- src/Converter/ConfluenceConverter.php | 69 +++-- .../Processor/StructuredMacroGliffy.php | 2 +- .../Processor/StructuredMacroSpaceDetails.php | 2 +- src/Extractor/ConfluenceExtractor.php | 34 ++- src/Utility/ConversionDataLookup.php | 14 +- src/Utility/ConversionDataWriter.php | 2 +- 12 files changed, 288 insertions(+), 192 deletions(-) diff --git 
a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index c035f75..44c95ee 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -9,6 +9,7 @@ use HalloWelt\MediaWiki\Lib\Migration\InvalidTitleException; use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface; use HalloWelt\MediaWiki\Lib\Migration\TitleBuilder as GenericTitleBuilder; +use HalloWelt\MediaWiki\Lib\Migration\WindowsFilename; use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Utility\FilenameBuilder; use HalloWelt\MigrateConfluence\Utility\TitleBuilder; @@ -83,41 +84,28 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); $this->customBuckets = new DataBuckets( [ - 'space-id-to-prefix-map', - 'space-id-to-space-key-map', - 'space-key-to-prefix-map', - 'space-name-to-prefix-map', - 'space-id-to-name-map', - 'space-key-to-name-map', - 'space-id-homepages', - 'space-id-to-description-id-map', - 'space-description-id-to-body-id-map', - 'space-details', - 'page-id-to-confluence-title-map', - 'page-id-to-parent-page-id-map', - 'body-content-id-to-page-id-map', - 'attachment-id-to-orig-filename-map', - 'attachment-id-to-space-id-map', - 'attachment-id-to-reference-map', - 'attachment-id-to-container-content-id-map', - 'attachment-id-to-content-status-map', - 'userkey-to-username-map', - 'pages-titles-map', - 'page-id-to-confluence-key-map', - 'page-id-to-title-map', - 'page-id-to-space-id', - 'body-contents-to-pages-map', - 'title-files', - 'additional-files', - 'attachment-orig-filename-target-filename-map', - 'attachment-id-to-target-filename-map', - 'filenames-to-filetitles-map', - 'invalid-titles-page-id-to-title', - 'invalid-titles-attachment-id-to-title', - - 'invalid-titles', - 'invalid-namespaces', - + 'analyze-space-id-to-space-key-map', 
+ 'analyze-space-name-to-prefix-map', + 'analyze-space-id-to-name-map', + 'analyze-space-key-to-name-map', + 'analyze-page-id-to-confluence-title-map', + 'analyze-page-id-to-parent-page-id-map', + 'analyze-body-content-id-to-page-id-map', + 'analyze-attachment-id-to-orig-filename-map', + 'analyze-attachment-id-to-space-id-map', + 'analyze-attachment-id-to-reference-map', + 'analyze-attachment-id-to-container-content-id-map', + 'analyze-attachment-id-to-content-status-map', + 'analyze-page-id-to-confluence-key-map', + 'analyze-title-to-attachment-title', + 'analyze-attachment-id-to-target-filename-map', + + 'debug-analyze-invalid-titles-page-id-to-title', + 'debug-analyze-invalid-titles-attachment-id-to-title', + + 'warning-analyze-invalid-namespaces', + 'warning-analyze-invalid-titles', + 'warning-analyze-invalid-filenames', ] ); $this->logger = new NullLogger(); @@ -332,23 +320,23 @@ private function buildSpaceMaps( DOMDocument $dom ): void { return; } - $this->customBuckets->addData( - 'space-id-to-prefix-map', $spaceId, $customSpacePrefix, false, true + $this->buckets->addData( + 'global-space-id-to-prefix-map', $spaceId, $customSpacePrefix, false, true ); - $this->customBuckets->addData( - 'space-key-to-prefix-map', $spaceKey, $customSpacePrefix, false, true + $this->buckets->addData( + 'global-space-key-to-prefix-map', $spaceKey, $customSpacePrefix, false, true ); $this->customBuckets->addData( - 'space-id-to-space-key-map', $spaceId, $spaceKey, false, true + 'analyze-space-id-to-space-key-map', $spaceId, $spaceKey, false, true ); $this->customBuckets->addData( - 'space-name-to-prefix-map', $spaceName, $customSpacePrefix, false, true + 'analyze-space-name-to-prefix-map', $spaceName, $customSpacePrefix, false, true ); $this->customBuckets->addData( - 'space-id-to-name-map', $spaceId, $spaceName, false, true + 'analyze-space-id-to-name-map', $spaceId, $spaceName, false, true ); $this->customBuckets->addData( - 'space-key-to-name-map', $spaceKey, $spaceName, 
false, true + 'analyze-space-key-to-name-map', $spaceKey, $spaceName, false, true ); $homePageId = -1; @@ -357,7 +345,7 @@ private function buildSpaceMaps( DOMDocument $dom ): void { $homePageId = $xmlHelper->getIDNodeValue( $homePagePropertyNode ); } if ( $homePageId > -1 ) { - $this->customBuckets->addData( 'space-id-homepages', $spaceId, $homePageId, false, true ); + $this->buckets->addData( 'global-space-id-homepages', $spaceId, $homePageId, false, true ); } $details = []; @@ -380,8 +368,8 @@ private function buildSpaceMaps( DOMDocument $dom ): void { $propertyNode = $xmlHelper->getPropertyNode( 'description' ); if ( $propertyNode !== null ) { $details['description'] = $xmlHelper->getIDNodeValue( $propertyNode ); - $this->customBuckets->addData( - 'space-id-to-description-id-map', + $this->buckets->addData( + 'global-space-id-to-description-id-map', $spaceId, $details['description'], false, @@ -409,7 +397,7 @@ private function buildSpaceMaps( DOMDocument $dom ): void { } if ( !empty( $details ) ) { - $this->customBuckets->addData( 'space-details', $spaceId, $details, false, true ); + $this->buckets->addData( 'global-space-details', $spaceId, $details, false, true ); $this->output->writeln( "Add details description ($spaceId)" ); } } @@ -433,7 +421,7 @@ private function buildSpaceDescriptionMap( DOMDocument $dom ): void { $bodyContents = $xmlHelper->getElementsFromCollection( 'bodyContents', $spaceDescription ); foreach ( $bodyContents as $bodyContent ) { $id = $xmlHelper->getIDNodeValue( $bodyContent ); - $this->customBuckets->addData( 'space-description-id-to-body-id-map', $descID, $id, false, true ); + $this->buckets->addData( 'global-space-description-id-to-body-id-map', $descID, $id, false, true ); $this->output->writeln( "\nAdd space description ($id)" ); } } @@ -470,13 +458,13 @@ private function buildParentPageMap( DOMDocument $dom ): void { $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $parentPageId = $xmlHelper->getPropertyValue( 'parent', 
$pageNode ); if ( $parentPageId !== null ) { - $this->customBuckets->addData( 'page-id-to-parent-page-id-map', $pageId, $parentPageId, false, true ); + $this->customBuckets->addData( 'analyze-page-id-to-parent-page-id-map', $pageId, $parentPageId, false, true ); } $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $confluenceTitle = $xmlHelper->getPropertyValue( 'title', $pageNode ); if ( $confluenceTitle !== null ) { - $this->customBuckets->addData( 'page-id-to-confluence-title-map', $pageId, $confluenceTitle, false, true ); + $this->customBuckets->addData( 'analyze-page-id-to-confluence-title-map', $pageId, $confluenceTitle, false, true ); } } @@ -497,7 +485,7 @@ private function buildBodyContentMap( DOMDocument $dom ): void { $bodyContentId = $xmlHelper->getIDNodeValue( $bodyContentObject ); $pageId = $xmlHelper->getPropertyValue( 'content', $bodyContentObject ); - $this->customBuckets->addData( 'body-content-id-to-page-id-map', + $this->customBuckets->addData( 'analyze-body-content-id-to-page-id-map', $bodyContentId, $pageId, false, true ); } @@ -530,29 +518,29 @@ private function buildAttachmentMaps( DOMDocument $dom ): void { if ( $attachmentFilename !== '' && is_int( $attachmentId ) ) { $this->customBuckets->addData( - 'attachment-id-to-orig-filename-map', $attachmentId, $attachmentFilename, false, true ); + 'analyze-attachment-id-to-orig-filename-map', $attachmentId, $attachmentFilename, false, true ); } $attachmentSpaceId = $xmlHelper->getPropertyValue( 'space', $attachmentNode ); if ( is_int( $attachmentId ) ) { $this->customBuckets->addData( - 'attachment-id-to-space-id-map', $attachmentId, $attachmentSpaceId, false, true ); + 'analyze-attachment-id-to-space-id-map', $attachmentId, $attachmentSpaceId, false, true ); } $attachmentReference = $this->makeAttachmentReference( $xmlHelper, $attachmentNode ); if ( $attachmentReference !== '' ) { $this->customBuckets->addData( - 'attachment-id-to-reference-map', $attachmentId, $attachmentReference, false, true 
); + 'analyze-attachment-id-to-reference-map', $attachmentId, $attachmentReference, false, true ); } $containerContent = $xmlHelper->getPropertyNode( 'containerContent', $attachmentNode ); if ( $containerContent instanceof DOMElement ) { $containerContentId = $xmlHelper->getIDNodeValue( $containerContent ); if ( $containerContentId >= 0 ) { $this->customBuckets->addData( - 'attachment-id-to-container-content-id-map', $attachmentId, $containerContentId, false, true ); + 'analyze-attachment-id-to-container-content-id-map', $attachmentId, $containerContentId, false, true ); } } $attachmentNodeContentStatus = $xmlHelper->getPropertyValue( 'contentStatus', $attachmentNode ); $this->customBuckets->addData( - 'attachment-id-to-content-status-map', $attachmentId, $attachmentNodeContentStatus, false, true ); + 'analyze-attachment-id-to-content-status-map', $attachmentId, $attachmentNodeContentStatus, false, true ); } /** @@ -583,8 +571,8 @@ private function buildUserMap( DOMDocument $dom ): void { $mediaWikiUsername = $this->makeMWUserName( $lcUserName ); - $this->customBuckets->addData( - 'userkey-to-username-map', + $this->buckets->addData( + 'global-userkey-to-username-map', $userImplKey, $mediaWikiUsername, false @@ -608,12 +596,12 @@ private function buildUserMap( DOMDocument $dom ): void { * @return void */ private function buildPageMaps( DOMDocument $dom ): void { - $spaceIdToPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $spaceIdToSpaceKeyMap = $this->customBuckets->getBucketData( 'space-id-to-space-key-map' ); - $spaceIdHomepages = $this->customBuckets->getBucketData( 'space-id-homepages' ); - $pageIdParentPageIdMap = $this->customBuckets->getBucketData( 'page-id-to-parent-page-id-map' ); - $pageIdConfluendTitleMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-title-map' ); - $bodyContents = $this->customBuckets->getBucketData( 'body-content-id-to-page-id-map' ); + $spaceIdToPrefixMap = $this->buckets->getBucketData( 
'global-space-id-to-prefix-map' ); + $spaceIdToSpaceKeyMap = $this->customBuckets->getBucketData( 'analyze-space-id-to-space-key-map' ); + $spaceIdHomepages = $this->buckets->getBucketData( 'global-space-id-homepages' ); + $pageIdParentPageIdMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-parent-page-id-map' ); + $pageIdConfluendTitleMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-title-map' ); + $bodyContents = $this->customBuckets->getBucketData( 'analyze-body-content-id-to-page-id-map' ); $xmlHelper = new XMLHelper( $dom ); @@ -659,20 +647,20 @@ private function buildPageMaps( DOMDocument $dom ): void { try { $targetTitle = $titleBuilder->buildTitle( $pageNode ); } catch ( InvalidTitleException $ex ) { - $this->customBuckets->addData( 'invalid-titles-page-id-to-title', $pageId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( 'debug-analyze-invalid-titles-page-id-to-title', $pageId, $ex->getInvalidTitle() ); // We don't want to loose this page. Title can be modified after analyze process $targetTitle = $ex->getInvalidTitle(); } if ( $targetTitle === '' ) { - $this->customBuckets->addData( 'invalid-titles-page-id-to-title', $pageId, $targetTitle ); + $this->customBuckets->addData( 'debug-analyze-invalid-titles-page-id-to-title', $pageId, $targetTitle ); return; } $this->output->writeln( "Add page '$targetTitle' (ID:$pageId)" ); /** - * Adds data bucket "pages-titles-map", which contains mapping from page title itself to full page title. + * Adds data bucket "global-pages-titles-map", which contains mapping from page title itself to full page title. * Full page title contains parent pages and namespace (if it is not general space). 
* Example: * "Detailed_planning" -> "Dokumentation/Detailed_planning" @@ -686,14 +674,14 @@ private function buildPageMaps( DOMDocument $dom ): void { $pageConfluenceTitle = "$spaceId---{$pageConfluenceTitle}"; // Some normalization $pageConfluenceTitle = str_replace( ' ', '_', $pageConfluenceTitle ); - $this->customBuckets->addData( 'pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); - $this->customBuckets->addData( 'page-id-to-confluence-key-map', $pageId, $pageConfluenceTitle, false, true ); + $this->buckets->addData( 'global-pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); + $this->customBuckets->addData( 'analyze-page-id-to-confluence-key-map', $pageId, $pageConfluenceTitle, false, true ); // Also add pages IDs in Confluence to full page title mapping. // It is needed to have enough context on converting stage, // to know from filename which page is currently being converted. - $this->customBuckets->addData( 'page-id-to-title-map', $pageId, $targetTitle, false, true ); - $this->customBuckets->addData( 'page-id-to-space-id', $pageId, $spaceId, false, true ); + $this->buckets->addData( 'global-page-id-to-title-map', $pageId, $targetTitle, false, true ); + $this->buckets->addData( 'global-page-id-to-space-id', $pageId, $spaceId, false, true ); $revisionTimestamp = $this->buildRevisionTimestamp( $xmlHelper, $pageNode ); $bodyContentIds = $this->getBodyContentIds( $xmlHelper, $pageNode ); @@ -701,7 +689,7 @@ private function buildPageMaps( DOMDocument $dom ): void { foreach ( $bodyContentIds as $bodyContentId ) { // TODO: Add UserImpl-key or directly MediaWiki username // (could also be done in `extract` as "metadata" ) - $this->customBuckets->addData( 'body-contents-to-pages-map', $bodyContentId, $pageId, false, true ); + $this->buckets->addData( 'global-body-contents-to-pages-map', $bodyContentId, $pageId, false, true ); } } else { $bodyContentIds = []; @@ -710,8 +698,8 @@ private function buildPageMaps( DOMDocument $dom ): 
void { if ( $pageId === $contentPageId ) { $bodyContentIds[] = $bodyContentId; - $this->customBuckets->addData( - 'body-contents-to-pages-map', + $this->buckets->addData( + 'global-body-contents-to-pages-map', $bodyContentId, $pageId, false, @@ -737,13 +725,13 @@ private function buildPageMaps( DOMDocument $dom ): void { * @return void */ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $element, int $spaceId ): void { - $pageIdConflueTitleMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-title-map' ); - $pageIdConfluenKeyMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-key-map' ); - $pagesTitlesMap = $this->customBuckets->getBucketData( 'pages-titles-map' ); - $spaceIdToPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'attachment-id-to-orig-filename-map' ); - $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'attachment-id-to-space-id-map' ); - $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'attachment-id-to-reference-map' ); + $pageIdConflueTitleMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-title-map' ); + $pageIdConfluenKeyMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-key-map' ); + $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); + $spaceIdToPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-orig-filename-map' ); + $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); + $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); $pageId = $xmlHelper->getIDNodeValue( $element ); $confluenceTitle = $pageIdConflueTitleMap[$pageId]; @@ -771,6 +759,10 @@ private 
function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $confluenceTitle, $attachmentId, $attachmentSpaceId, $attachmentOrigFilename, $wikiTitle, $spaceIdToPrefixMap ); + if ( $attachmentTargetFilename === '' ) { + $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $attachmentTargetFilename ); + continue; + } if ( !isset( $attachmentIdToReferenceMap[$attachmentId] ) ) { continue; } @@ -779,12 +771,12 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement // In case of ERM34465 no files are added to title-attachments $this->addTitleAttachment( $wikiTitle, $attachmentTargetFilename ); $this->addFile( $attachmentTargetFilename, $attachmentReference ); - $this->customBuckets->addData( 'title-files', $wikiTitle, $attachmentTargetFilename, false, true ); + $this->customBuckets->addData( 'analyze-title-to-attachment-title', $wikiTitle, $attachmentTargetFilename, false, true ); $this->addedAttachmentIds[] = $attachmentId; $confluenceFileKey = str_replace( ' ', '_', "{$spaceId}---{$confluenceTitle}---{$attachmentOrigFilename}" ); - $this->customBuckets->addData( - 'filenames-to-filetitles-map', + $this->buckets->addData( + 'global-filenames-to-filetitles-map', $confluenceFileKey, $attachmentTargetFilename, false, @@ -792,13 +784,13 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement ); $this->customBuckets->addData( - 'attachment-id-to-target-filename-map', + 'analyze-attachment-id-to-target-filename-map', $attachmentId, $attachmentTargetFilename ); - $this->customBuckets->addData( - 'attachment-orig-filename-target-filename-map', + $this->buckets->addData( + 'global-attachment-orig-filename-target-filename-map', $attachmentOrigFilename, $attachmentTargetFilename ); @@ -810,12 +802,12 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement * @return void */ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom 
): void { - $spaceIdPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'attachment-id-to-orig-filename-map' ); - $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'attachment-id-to-reference-map' ); - $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'attachment-id-to-space-id-map' ); - $pageIdToTitleMap = $this->customBuckets->getBucketData( 'page-id-to-title-map' ); - $pageIdToConfluenceKey = $this->customBuckets->getBucketData( 'page-id-to-confluence-key-map' ); + $spaceIdPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-orig-filename-map' ); + $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); + $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); + $pageIdToTitleMap = $this->buckets->getBucketData( 'global-page-id-to-title-map' ); + $pageIdToConfluenceKey = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-key-map' ); $xmlHelper = new XMLHelper( $dom ); @@ -866,6 +858,10 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $confluenceKey, $attachmentId, $attachmentSpaceId, $attachmentOrigFilename, $targetTitle, $spaceIdPrefixMap ); + if ( $attachmentTargetFilename === '' ) { + $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $attachmentTargetFilename ); + return; + } if ( !isset( $attachmentIdToReferenceMap[$attachmentId] ) ) { $this->output->writeln( @@ -881,8 +877,8 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $this->addTitleAttachment( $targetTitle, $attachmentTargetFilename ); $this->output->writeln( "Add attachment $attachmentTargetFilename (fallback: 
{$confluenceKey})" ); } else { - $this->customBuckets->addData( - 'additional-files', $attachmentTargetFilename, $attachmentReference, false, true ); + $this->buckets->addData( + 'global-additional-files', $attachmentTargetFilename, $attachmentReference, false, true ); $this->output->writeln( "Add attachment $attachmentTargetFilename (additional)" ); } @@ -890,8 +886,8 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $this->addedAttachmentIds[] = $attachmentId; $confluenceFileKey = str_replace( ' ', '', "{$confluenceKey}---{$attachmentOrigFilename}" ); - $this->customBuckets->addData( - 'filenames-to-filetitles-map', + $this->buckets->addData( + 'global-filenames-to-filetitles-map', $confluenceFileKey, $attachmentTargetFilename, false, @@ -899,13 +895,13 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { ); $this->customBuckets->addData( - 'attachment-id-to-target-filename-map', + 'analyze-attachment-id-to-target-filename-map', $attachmentId, $attachmentTargetFilename ); - $this->customBuckets->addData( - 'attachment-orig-filename-target-filename-map', + $this->buckets->addData( + 'global-attachment-orig-filename-target-filename-map', $attachmentOrigFilename, $attachmentTargetFilename ); @@ -1010,9 +1006,9 @@ private function makeAttachmentTargetFilenameFromData( $targetName = $filenameBuilder->buildFromAttachmentData( $attachmentSpaceId, $attachmentOrigFilename, $shortTargetTitle ); } catch ( InvalidTitleException $ex ) { - $this->customBuckets->addData( 'invalid-titles-attachment-id-to-title', $attachmentId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $ex->getInvalidTitle() ); $this->logger->error( $ex->getMessage() ); - return '###INVALID###'; + $targetName = $ex->getInvalidTitle(); } } @@ -1032,7 +1028,7 @@ private function makeAttachmentTargetFilenameFromData( $fileKey = 
"{$pageConfluenceTitle}---$attachmentOrigFilename"; // Some normalization $fileKey = str_replace( ' ', '_', $fileKey ); - $this->customBuckets->addData( 'filenames-to-filetitles-map', $fileKey, $targetName, false, true ); + $this->buckets->addData( 'global-filenames-to-filetitles-map', $fileKey, $targetName, false, true ); return $targetName; } @@ -1071,14 +1067,15 @@ private function makeAttachmentReference( XMLHelper $xmlHelper, DOMElement $atta } private function checkTitles(): void { - $pagesTitlesMap = $this->customBuckets->getBucketData( 'pages-titles-map' ); + $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); + $hasInvalidTitles = false; $hasInvalidNamespaces = false; foreach ( $pagesTitlesMap as $key => $title ) { if ( str_ends_with( 'title', '_' ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'invalid_ending', $title, true, true ); @@ -1087,7 +1084,7 @@ private function checkTitles(): void { if ( str_contains( $title, ':' ) ) { if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'multiple_collons', $title, true, true ); @@ -1100,7 +1097,7 @@ private function checkTitles(): void { preg_match( '#(\d*)([a-zA-Z0-9_]*)#', $namespace, $matches ); if ( empty( $matches ) || $matches[1] !== '' ) { $this->customBuckets->addData( - 'invalid-namespaces', + 'warning-analyze-invalid-namespaces', 'invalid_char', $namespace, true, true ); @@ -1109,7 +1106,7 @@ private function checkTitles(): void { if ( strlen( $text ) > 255 ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'length', $title, true, true ); @@ -1118,7 +1115,7 @@ private function checkTitles(): void { } else { if ( strlen( $title ) > 255 ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'length', $title, true, true ); @@ -1127,6 +1124,19 @@ private function 
checkTitles(): void { } } + $files = $this->buckets->getBucketData( 'global-files' ); + $hasInvalidFilenames = false; + foreach ( $files as $title => $paths ) { + if ( strlen( $title ) > 255 ) { + $this->customBuckets->addData( + 'warning-analyze-invalid-filenames', + 'length', $title, + true, true + ); + $hasInvalidFilenames = true; + } + } + if ( $hasInvalidNamespaces === true || $hasInvalidTitles === true ) { $this->output->writeln( "\n\nWarning:\n" ); @@ -1138,9 +1148,53 @@ private function checkTitles(): void { $this->output->writeln( ' - Analyze process found invalid titles' ); } + if ( $hasInvalidFilenames === true ) { + $this->output->writeln( ' - Analyze process found invalid filenames' ); + } + $this->output->writeln( - "\nPlease check invalid-namespaces.php and/or invalid-titles.php before continuing with extract step" + "\nPlease check \"warning-analyze-invalid-namespaces.php\", \"warning-analyze-invalid-titles.php\" and \"warning-analyze-invalid-filenames.php\" before continuing with extract step" ); } } + + /** + * + * @param string $titleText + * @param string $contentReference + * @return void + */ + protected function addTitleRevision( $titleText, $contentReference = 'n/a' ) { + $this->buckets->addData( 'global-title-revisions', $titleText, $contentReference ); + } + + /** + * + * @param string $titleText + * @param string $attachmentReference + * @return void + */ + protected function addTitleAttachment( $titleText, $attachmentReference = 'n/a' ) { + $this->buckets->addData( 'global-title-attachments', $titleText, $attachmentReference ); + } + + /** + * + * @param string $rawFilename + * @param string $attachmentReference + * @return void + */ + protected function addFile( $rawFilename, $attachmentReference = 'n/a' ) { + try { + $filename = $this->getFilename( $rawFilename, $attachmentReference ); + $filename = ( new WindowsFilename( $filename ) ) . 
''; + } catch ( InvalidTitleException $ex ) { + $this->logger->error( $ex->getMessage() ); + return; + } + + $prefixedFilename = $this->maybePrefixFilename( $filename ); + + $this->buckets->addData( 'global-files', $prefixedFilename, $attachmentReference ); + } } diff --git a/src/Command/Analyze.php b/src/Command/Analyze.php index a379a6f..70f6a77 100644 --- a/src/Command/Analyze.php +++ b/src/Command/Analyze.php @@ -72,9 +72,23 @@ private function readConfigFile( &$config ): void { */ protected function getBucketKeys() { return [ - 'files', - 'title-attachments', - 'title-revisions' + 'global-files', + 'global-title-attachments', + 'global-title-revisions', + 'global-space-id-to-prefix-map', + 'global-space-key-to-prefix-map', + 'global-space-id-homepages', + 'global-space-id-to-description-id-map', + 'global-space-description-id-to-body-id-map', + 'global-space-details', + 'global-userkey-to-username-map', + 'global-pages-titles-map', + 'global-page-id-to-title-map', + 'global-page-id-to-space-id', + 'global-body-contents-to-pages-map', + 'global-additional-files', + 'global-attachment-orig-filename-target-filename-map', + 'global-filenames-to-filetitles-map', ]; } } diff --git a/src/Command/Compose.php b/src/Command/Compose.php index b2a3a9c..b9b9640 100644 --- a/src/Command/Compose.php +++ b/src/Command/Compose.php @@ -66,4 +66,21 @@ private function readConfigFile( &$config ): void { } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + 'global-space-id-homepages', + 'global-space-id-to-description-id-map', + 'global-space-description-id-to-body-id-map', + 'global-body-contents-to-pages-map', + 'global-title-attachments', + 'global-title-revisions', + 'global-files', + 'global-additional-files' + ]; + } } diff --git a/src/Command/Extract.php b/src/Command/Extract.php index 59369be..322c4e8 100644 --- a/src/Command/Extract.php +++ b/src/Command/Extract.php @@ -69,4 +69,16 @@ private function readConfigFile( &$config ): void 
{ } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + // From this step + 'global-title-metadata', + 'global-revision-contents', + ]; + } } diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index 43ceb43..448e9c8 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -14,11 +14,6 @@ class ConfluenceComposer extends ComposerBase implements IOutputAwareInterface { - /** - * @var DataBuckets - */ - private $dataBuckets; - /** * @var DataBuckets */ @@ -40,23 +35,12 @@ class ConfluenceComposer extends ComposerBase implements IOutputAwareInterface { public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); - $this->dataBuckets = new DataBuckets( [ - 'space-id-homepages', - 'space-id-to-description-id-map', - 'space-description-id-to-body-id-map', - 'body-contents-to-pages-map', - 'title-attachments', - 'title-revisions', - 'files', - 'additional-files' - ] ); - $this->customBuckets = new DataBuckets( [ 'title-uploads', 'title-uploads-fail' ] ); - $this->dataBuckets->loadFromWorkspace( $this->workspace ); + $this->customBuckets->loadFromWorkspace( $this->workspace ); if ( isset( $config['config'] ) ) { $this->advancedConfig = $config['config']; @@ -78,16 +62,16 @@ public function buildXML( Builder $builder ) { $this->appendDefaultPages( $builder ); $this->addDefaultFiles(); - $bodyContentsToPagesMap = $this->dataBuckets->getBucketData( 'body-contents-to-pages-map' ); - $spaceIDHomepagesMap = $this->dataBuckets->getBucketData( 'space-id-homepages' ); + $bodyContentsToPagesMap = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); + $spaceIDHomepagesMap = $this->buckets->getBucketData( 'global-space-id-homepages' ); $homepageSpaceIDMap = array_flip( $spaceIDHomepagesMap ); - $spaceIDDescriptionIDMap = $this->dataBuckets->getBucketData( 
'space-id-to-description-id-map' ); - $spaceDescriptionIDBodyIDMap = $this->dataBuckets->getBucketData( 'space-description-id-to-body-id-map' ); + $spaceIDDescriptionIDMap = $this->buckets->getBucketData( 'global-space-id-to-description-id-map' ); + $spaceDescriptionIDBodyIDMap = $this->buckets->getBucketData( 'global-space-description-id-to-body-id-map' ); - $pagesRevisions = $this->dataBuckets->getBucketData( 'title-revisions' ); - $filesMap = $this->dataBuckets->getBucketData( 'files' ); - $pageAttachmentsMap = $this->dataBuckets->getBucketData( 'title-attachments' ); + $pagesRevisions = $this->buckets->getBucketData( 'global-title-revisions' ); + $filesMap = $this->buckets->getBucketData( 'global-files' ); + $pageAttachmentsMap = $this->buckets->getBucketData( 'global-title-attachments' ); $bodyContentIDMainpageID = []; $pagesToBodyContents = array_flip( $bodyContentsToPagesMap ); diff --git a/src/Composer/_defaultpages/Template/SpaceDetails b/src/Composer/_defaultpages/Template/SpaceDetails index ce01f5e..83caf63 100644 --- a/src/Composer/_defaultpages/Template/SpaceDetails +++ b/src/Composer/_defaultpages/Template/SpaceDetails @@ -1,4 +1,4 @@ -Template for macro "space-details" +Template for macro "global-space-details" {| class="wikitable" style="width:{{{width}}}" |- | Name || {{NAMESPACE}} diff --git a/src/Converter/ConfluenceConverter.php b/src/Converter/ConfluenceConverter.php index f53f94e..6c61173 100644 --- a/src/Converter/ConfluenceConverter.php +++ b/src/Converter/ConfluenceConverter.php @@ -68,10 +68,10 @@ class ConfluenceConverter extends PandocHTML implements IOutputAwareInterface { protected $bodyContentFile = null; /** @var DataBuckets */ - private $dataBuckets = null; + private $executionTimeBuckets = null; /** @var DataBuckets */ - private $customBuckets = null; + private $buckets = null; /** @var ConversionDataLookup */ private $dataLookup = null; @@ -113,29 +113,26 @@ class ConfluenceConverter extends PandocHTML implements 
IOutputAwareInterface { public function __construct( $config, Workspace $workspace ) { parent::__construct( $config, $workspace ); - $this->dataBuckets = new DataBuckets( [ - 'page-id-to-title-map', - 'pages-titles-map', - 'title-attachments', - 'body-contents-to-pages-map', - 'page-id-to-space-id', - 'space-id-to-prefix-map', - 'space-key-to-prefix-map', - 'filenames-to-filetitles-map', - 'title-metadata', - 'attachment-orig-filename-target-filename-map', - 'files', - 'userkey-to-username-map', - 'space-description-id-to-body-id-map', - 'gliffy-map', - 'attachment-confluence-file-key-to-target-filename-map' + $this->buckets = new DataBuckets( [ + 'global-page-id-to-title-map', + 'global-pages-titles-map', + 'global-title-attachments', + 'global-body-contents-to-pages-map', + 'global-page-id-to-space-id', + 'global-space-id-to-prefix-map', + 'global-space-key-to-prefix-map', + 'global-filenames-to-filetitles-map', + 'global-title-metadata', + 'global-attachment-orig-filename-target-filename-map', + 'global-files', + 'global-userkey-to-username-map', + 'global-space-description-id-to-body-id-map', + 'global-gliffy-map' ] ); - $this->dataBuckets->loadFromWorkspace( $this->workspace ); + $this->buckets->loadFromWorkspace( $this->workspace ); - $this->customBuckets = new DataBuckets( [ - 'title-uploads', - 'title-uploads-fail', + $this->executionTimeBuckets = new DataBuckets( [ 'converter-body-content-id-execution-time', ] ); } @@ -153,8 +150,8 @@ public function setOutput( Output $output ) { protected function doConvert( SplFileInfo $file ): string { $executionTime = new ExecutionTime(); $this->output->writeln( $file->getPathname() ); - $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->dataBuckets ); - $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->dataBuckets ); + $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->buckets ); + $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->buckets ); 
$this->rawFile = $file; if ( isset( $this->config['config']['ext-ns-file-repo-compat'] ) @@ -177,7 +174,7 @@ protected function doConvert( SplFileInfo $file ): string { } $this->currentSpace = $this->getSpaceIdFromPageId( $pageId ); - $pagesIdsToTitlesMap = $this->dataBuckets->getBucketData( 'page-id-to-title-map' ); + $pagesIdsToTitlesMap = $this->buckets->getBucketData( 'global-page-id-to-title-map' ); if ( isset( $pagesIdsToTitlesMap[$pageId] ) ) { $this->currentPageTitle = $pagesIdsToTitlesMap[$pageId]; } else { @@ -222,7 +219,7 @@ protected function doConvert( SplFileInfo $file ): string { $this->postprocessWikiText(); $executionTimeString = $executionTime->getHumanReadableTime(); - $this->customBuckets->addData( + $this->executionTimeBuckets->addData( 'converter-body-content-id-execution-time', $bodyContentId, $executionTimeString, @@ -230,7 +227,7 @@ protected function doConvert( SplFileInfo $file ): string { true ); - $this->customBuckets->saveToWorkspace( $this->workspace ); + $this->executionTimeBuckets->saveToWorkspace( $this->workspace ); return $this->wikiText; } @@ -286,7 +283,7 @@ private function runProcessors( $dom ) { ), new StructuredMacroGliffy( $this->dataLookup, $this->conversionDataWriter, $this->currentSpace, - $currentPageTitle, $this->customBuckets, $this->nsFileRepoCompat + $currentPageTitle, $this->buckets, $this->nsFileRepoCompat ), new StructuredMacroContenByLabel( $this->currentPageTitle ), new StructuredMacroAttachments(), @@ -347,7 +344,7 @@ private function getBodyContentIdFromFilename() { * @return int */ private function getPageIdFromBodyContentId( $bodyContentId ) { - $map = $this->dataBuckets->getBucketData( 'body-contents-to-pages-map' ); + $map = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); return $map[$bodyContentId] ?? 
-1; } @@ -357,7 +354,7 @@ private function getPageIdFromBodyContentId( $bodyContentId ) { * @return int */ private function getSpaceDescriptionIDFromBodyContentId( $bodyContentId ) { - $map = $this->dataBuckets->getBucketData( 'space-description-id-to-body-id-map' ); + $map = $this->buckets->getBucketData( 'global-space-description-id-to-body-id-map' ); $map = array_flip( $map ); return $map[$bodyContentId] ?? -1; } @@ -368,7 +365,7 @@ private function getSpaceDescriptionIDFromBodyContentId( $bodyContentId ) { * @return int */ private function getSpaceIdFromPageId( $pageId ) { - $map = $this->dataBuckets->getBucketData( 'page-id-to-space-id' ); + $map = $this->buckets->getBucketData( 'global-page-id-to-space-id' ); return $map[$pageId] ?? -1; } @@ -427,7 +424,7 @@ private function processMacro( $sender, $match, $dom, $xpath ) { 'panel', 'recently-updated', 'section', - 'space-details', + 'global-space-details', 'status', 'task', 'task-list', @@ -544,7 +541,7 @@ protected function preprocessHTMLSource( $oHTMLSourceFile ) { $sContent = str_replace( 'dataBuckets->getBucketData( 'title-metadata' ); + $categorieMap = $this->buckets->getBucketData( 'global-title-metadata' ); $categories = ''; if ( isset( $categorieMap[$pageId] ) && isset( $categorieMap[$pageId]['categories'] ) ) { foreach ( $categorieMap[$pageId]['categories'] as $key => $category ) { @@ -679,7 +676,7 @@ public function postProcessDOM( $dom, $xpath ) { * @return void */ public function postProcessLinks() { - $oldToNewTitlesMap = $this->dataBuckets->getBucketData( 'pages-titles-map' ); + $oldToNewTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); $this->wikiText = preg_replace_callback( "/\[\[Media:(.*)]]/", @@ -735,7 +732,7 @@ static function ( $aMatches ) { private function addAdditionalAttachments(): string { $wikiText = ''; - $attachmentsMap = $this->dataBuckets->getBucketData( 'title-attachments' ); + $attachmentsMap = $this->buckets->getBucketData( 'global-title-attachments' ); 
$currentPageTitle = $this->getCurrentPageTitle(); @@ -795,7 +792,7 @@ private function buildMediaExcludeList( $wikiText ): array { */ private function getCurrentPageTitle(): string { $prefix = ''; - $spaceIdPrefixMap = $this->dataBuckets->getBucketData( 'space-id-to-prefix-map' ); + $spaceIdPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); if ( !isset( $spaceIdPrefixMap[$this->currentSpace] ) ) { $this->output->writeln( "SpaceId {$this->currentSpace} not found in spaceIdPrefixMap" ); } diff --git a/src/Converter/Processor/StructuredMacroGliffy.php b/src/Converter/Processor/StructuredMacroGliffy.php index 5ef1559..86bf832 100644 --- a/src/Converter/Processor/StructuredMacroGliffy.php +++ b/src/Converter/Processor/StructuredMacroGliffy.php @@ -114,7 +114,7 @@ private function makeParamsString( array $params ): string { $params['name'] = $filename; } - $this->dataBuckets->addData( 'gliffy-map', $key, $filename, true, true ); + $this->dataBuckets->addData( 'global-gliffy-map', $key, $filename, true, true ); } else { return ''; } diff --git a/src/Converter/Processor/StructuredMacroSpaceDetails.php b/src/Converter/Processor/StructuredMacroSpaceDetails.php index 5b663f4..db07371 100644 --- a/src/Converter/Processor/StructuredMacroSpaceDetails.php +++ b/src/Converter/Processor/StructuredMacroSpaceDetails.php @@ -11,7 +11,7 @@ class StructuredMacroSpaceDetails extends StructuredMacroProcessorBase { * @return string */ protected function getMacroName(): string { - return 'space-details'; + return 'global-space-details'; } /** diff --git a/src/Extractor/ConfluenceExtractor.php b/src/Extractor/ConfluenceExtractor.php index f6b437c..16e6fc1 100644 --- a/src/Extractor/ConfluenceExtractor.php +++ b/src/Extractor/ConfluenceExtractor.php @@ -31,8 +31,8 @@ class ConfluenceExtractor extends ExtractorBase { public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); 
$this->customBuckets = new DataBuckets( [ - 'labelling-id-to-label-id-map', - 'label-id-to-name-map', + 'extract-labelling-id-to-label-id-map', + 'extract-label-id-to-name-map', ] ); } @@ -140,12 +140,12 @@ private function buildLabellingMap( DOMDocument $dom ): void { $labelProp = $xmlHelper->getPropertyNode( 'label', $labelling ); $labelId = $xmlHelper->getIDNodeValue( $labelProp ); - $labelMap = $this->customBuckets->getBucketData( 'label-id-to-name-map' ); + $labelMap = $this->customBuckets->getBucketData( 'extract-label-id-to-name-map' ); if ( isset( $labelMap[$labelId] ) ) { $categories[] = $labelMap[$labelId]; } - $this->customBuckets->addData( 'labelling-id-to-label-id-map', $id, $labelId, false, true ); + $this->customBuckets->addData( 'extract-labelling-id-to-label-id-map', $id, $labelId, false, true ); } /** @@ -173,7 +173,7 @@ private function buildLabelMap( DOMDocument $dom ): void { $id = $xmlHelper->getIDNodeValue( $label ); $name = $xmlHelper->getPropertyValue( 'name', $label ); - $this->customBuckets->addData( 'label-id-to-name-map', $id, $name, false, true ); + $this->customBuckets->addData( 'extract-label-id-to-name-map', $id, $name, false, true ); } /** @@ -193,8 +193,8 @@ private function getBodyContentHTML( XMLHelper $xmlHelper, DOMElement $bodyConte * @return void */ private function extractPageMetaData( DOMDocument $dom ) { - $labellingMap = $this->customBuckets->getBucketData( 'labelling-id-to-label-id-map' ); - $labelMap = $this->customBuckets->getBucketData( 'label-id-to-name-map' ); + $labellingMap = $this->customBuckets->getBucketData( 'extract-labelling-id-to-label-id-map' ); + $labelMap = $this->customBuckets->getBucketData( 'extract-label-id-to-name-map' ); $xmlHelper = new XMLHelper( $dom ); @@ -229,7 +229,25 @@ private function extractPageMetaData( DOMDocument $dom ) { 'categories' => $categories ]; - $this->buckets->addData( 'title-metadata', $id, $meta, false ); + $this->buckets->addData( 'global-title-metadata', $id, $meta, 
false ); } } + + /** + * + * @param string $revisionReference + * @param string $contentReference + */ + protected function addRevisionContent( $revisionReference, $contentReference = 'n/a' ) { + $this->buckets->addData( 'global-revision-contents', $revisionReference, $contentReference ); + } + + /** + * + * @param string $titleText + * @param string $meta + */ + protected function addTitleMetaData( $titleText, $meta = [] ) { + $this->buckets->addData( 'global-title-metadata', $titleText, $meta, false ); + } } diff --git a/src/Utility/ConversionDataLookup.php b/src/Utility/ConversionDataLookup.php index 3bc887e..54595ec 100644 --- a/src/Utility/ConversionDataLookup.php +++ b/src/Utility/ConversionDataLookup.php @@ -59,13 +59,13 @@ class ConversionDataLookup { */ public static function newFromBuckets( DataBuckets $buckets ) { return new static( - $buckets->getBucketData( 'space-id-to-prefix-map' ), - $buckets->getBucketData( 'pages-titles-map' ), - $buckets->getBucketData( 'filenames-to-filetitles-map' ), - $buckets->getBucketData( 'attachment-orig-filename-target-filename-map' ), - $buckets->getBucketData( 'files' ), - $buckets->getBucketData( 'userkey-to-username-map' ), - $buckets->getBucketData( 'space-key-to-prefix-map' ), + $buckets->getBucketData( 'global-space-id-to-prefix-map' ), + $buckets->getBucketData( 'global-pages-titles-map' ), + $buckets->getBucketData( 'global-filenames-to-filetitles-map' ), + $buckets->getBucketData( 'global-attachment-orig-filename-target-filename-map' ), + $buckets->getBucketData( 'global-files' ), + $buckets->getBucketData( 'global-userkey-to-username-map' ), + $buckets->getBucketData( 'global-space-key-to-prefix-map' ), ); } diff --git a/src/Utility/ConversionDataWriter.php b/src/Utility/ConversionDataWriter.php index e80c1fe..2e23a8c 100644 --- a/src/Utility/ConversionDataWriter.php +++ b/src/Utility/ConversionDataWriter.php @@ -18,7 +18,7 @@ class ConversionDataWriter { */ public static function newFromBuckets( DataBuckets 
$buckets ) { return new static( - $buckets->getBucketData( 'files' ) + $buckets->getBucketData( 'global-files' ) ); } From 6d797471d5c9dc801a30a1163483ef1c63657d04 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Thu, 4 Dec 2025 10:26:41 +0100 Subject: [PATCH 06/11] Add TitleValidityChecker class --- src/Analyzer/ConfluenceAnalyzer.php | 18 +++--- src/Utility/TitleValidityChecker.php | 86 ++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 9 deletions(-) create mode 100644 src/Utility/TitleValidityChecker.php diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index 44c95ee..f4508fe 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -13,6 +13,7 @@ use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Utility\FilenameBuilder; use HalloWelt\MigrateConfluence\Utility\TitleBuilder; +use HalloWelt\MigrateConfluence\Utility\TitleValidityChecker; use HalloWelt\MigrateConfluence\Utility\XMLHelper; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerInterface; @@ -1068,12 +1069,13 @@ private function makeAttachmentReference( XMLHelper $xmlHelper, DOMElement $atta private function checkTitles(): void { $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); - + + $validityChecker = new TitleValidityChecker(); $hasInvalidTitles = false; $hasInvalidNamespaces = false; foreach ( $pagesTitlesMap as $key => $title ) { - if ( str_ends_with( 'title', '_' ) ) { + if ( !$validityChecker->hasValidEnding( $title ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-titles', 'invalid_ending', $title, @@ -1082,7 +1084,7 @@ private function checkTitles(): void { $hasInvalidTitles = true; } if ( str_contains( $title, ':' ) ) { - if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { + if ( $validityChecker->hasDoubleCollon( $title ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-titles', 'multiple_collons', $title, @@ -1093,9 
+1095,7 @@ private function checkTitles(): void { $namespace = substr( $title, 0, strpos( $title, ':' ) ); $text = substr( $title, strpos( $title, ':' ) + 1 ); - $matches = []; - preg_match( '#(\d*)([a-zA-Z0-9_]*)#', $namespace, $matches ); - if ( empty( $matches ) || $matches[1] !== '' ) { + if ( !$validityChecker->hasValidNamespace( $namespace ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-namespaces', 'invalid_char', $namespace, @@ -1104,7 +1104,7 @@ private function checkTitles(): void { $hasInvalidNamespaces = true; } - if ( strlen( $text ) > 255 ) { + if ( !$validityChecker->hasValidLength( $text ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-titles', 'length', $title, @@ -1113,7 +1113,7 @@ private function checkTitles(): void { $hasInvalidTitles = true; } } else { - if ( strlen( $title ) > 255 ) { + if ( !$validityChecker->hasValidLength( $title ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-titles', 'length', $title, @@ -1127,7 +1127,7 @@ private function checkTitles(): void { $files = $this->buckets->getBucketData( 'global-files' ); $hasInvalidFilenames = false; foreach ( $files as $title => $paths ) { - if ( strlen( $title ) > 255 ) { + if ( $validityChecker->hasValidLength( $title ) ) { $this->customBuckets->addData( 'warning-analyze-invalid-filenames', 'length', $title, diff --git a/src/Utility/TitleValidityChecker.php b/src/Utility/TitleValidityChecker.php new file mode 100644 index 0000000..2f8609e --- /dev/null +++ b/src/Utility/TitleValidityChecker.php @@ -0,0 +1,86 @@ +hasValidEnding( $title ) ) { + return false; + } + + if ( str_contains( $title, ':' ) ) { + if ( $this->hasDoubleCollon( $title ) ) { + return false; + } + + $namespace = substr( $title, 0, strpos( $title, ':' ) ); + $text = substr( $title, strpos( $title, ':' ) + 1 ); + + if ( !$this->hasValidNamespace( $namespace ) ) { + return false; + } + + if ( !$this->hasValidLength( $text ) ) { + return false; + } + } else { + if ( 
!$this->hasValidLength( $title ) ) { + return false; + } + } + + return true; + } + + /** + * @param string $title + * @return boolean + */ + public function hasValidEnding( string $title ): bool { + if ( str_ends_with( $title, '_' ) ) { + return false; + } + return true; + } + + /** + * @param string $title + * @return boolean + */ + public function hasDoubleCollon( string $title ): bool { + if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { + return true; + } + return false; + } + + /** + * @param string $namespace + * @return boolean + */ + public function hasValidNamespace( string $namespace ): bool { + $matches = []; + preg_match( '#(\d*)([a-zA-Z0-9_]*)#', $namespace, $matches ); + if ( empty( $matches ) || $matches[1] !== '' ) { + return false; + } + return true; + } + + /** + * @param string $title + * @return boolean + */ + public function hasValidLength( string $title ): bool { + if ( strlen( $title ) > 255 ) { + return false; + } + return true; + } + +} \ No newline at end of file From e14c49a5258da22365aec620582603b8b55231ae Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Thu, 4 Dec 2025 10:34:52 +0100 Subject: [PATCH 07/11] CC --- src/Analyzer/ConfluenceAnalyzer.php | 66 +++++++++++++++++++++++----- src/Utility/TitleValidityChecker.php | 14 +++--- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index f4508fe..8d2073e 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -459,13 +459,19 @@ private function buildParentPageMap( DOMDocument $dom ): void { $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $parentPageId = $xmlHelper->getPropertyValue( 'parent', $pageNode ); if ( $parentPageId !== null ) { - $this->customBuckets->addData( 'analyze-page-id-to-parent-page-id-map', $pageId, $parentPageId, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-parent-page-id-map', + $pageId, $parentPageId, 
false, true + ); } $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $confluenceTitle = $xmlHelper->getPropertyValue( 'title', $pageNode ); if ( $confluenceTitle !== null ) { - $this->customBuckets->addData( 'analyze-page-id-to-confluence-title-map', $pageId, $confluenceTitle, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-confluence-title-map', + $pageId, $confluenceTitle, false, true + ); } } @@ -536,7 +542,9 @@ private function buildAttachmentMaps( DOMDocument $dom ): void { $containerContentId = $xmlHelper->getIDNodeValue( $containerContent ); if ( $containerContentId >= 0 ) { $this->customBuckets->addData( - 'analyze-attachment-id-to-container-content-id-map', $attachmentId, $containerContentId, false, true ); + 'analyze-attachment-id-to-container-content-id-map', + $attachmentId, $containerContentId, false, true + ); } } $attachmentNodeContentStatus = $xmlHelper->getPropertyValue( 'contentStatus', $attachmentNode ); @@ -648,7 +656,10 @@ private function buildPageMaps( DOMDocument $dom ): void { try { $targetTitle = $titleBuilder->buildTitle( $pageNode ); } catch ( InvalidTitleException $ex ) { - $this->customBuckets->addData( 'debug-analyze-invalid-titles-page-id-to-title', $pageId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-page-id-to-title', + $pageId, $ex->getInvalidTitle() + ); // We don't want to loose this page. 
Title can be modified after analyze process $targetTitle = $ex->getInvalidTitle(); } @@ -676,7 +687,10 @@ private function buildPageMaps( DOMDocument $dom ): void { // Some normalization $pageConfluenceTitle = str_replace( ' ', '_', $pageConfluenceTitle ); $this->buckets->addData( 'global-pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); - $this->customBuckets->addData( 'analyze-page-id-to-confluence-key-map', $pageId, $pageConfluenceTitle, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-confluence-key-map', + $pageId, $pageConfluenceTitle, false, true + ); // Also add pages IDs in Confluence to full page title mapping. // It is needed to have enough context on converting stage, @@ -730,7 +744,9 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $pageIdConfluenKeyMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-key-map' ); $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); $spaceIdToPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-orig-filename-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( + 'analyze-attachment-id-to-orig-filename-map' + ); $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); @@ -761,7 +777,10 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $attachmentOrigFilename, $wikiTitle, $spaceIdToPrefixMap ); if ( $attachmentTargetFilename === '' ) { - $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $attachmentTargetFilename ); + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-attachment-id-to-title', + 
$attachmentId, $attachmentTargetFilename + ); continue; } if ( !isset( $attachmentIdToReferenceMap[$attachmentId] ) ) { @@ -772,7 +791,10 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement // In case of ERM34465 no files are added to title-attachments $this->addTitleAttachment( $wikiTitle, $attachmentTargetFilename ); $this->addFile( $attachmentTargetFilename, $attachmentReference ); - $this->customBuckets->addData( 'analyze-title-to-attachment-title', $wikiTitle, $attachmentTargetFilename, false, true ); + $this->customBuckets->addData( + 'analyze-title-to-attachment-title', + $wikiTitle, $attachmentTargetFilename, false, true + ); $this->addedAttachmentIds[] = $attachmentId; $confluenceFileKey = str_replace( ' ', '_', "{$spaceId}---{$confluenceTitle}---{$attachmentOrigFilename}" ); @@ -804,7 +826,9 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement */ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $spaceIdPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-orig-filename-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( + 'analyze-attachment-id-to-orig-filename-map' + ); $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); $pageIdToTitleMap = $this->buckets->getBucketData( 'global-page-id-to-title-map' ); @@ -860,7 +884,10 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $targetTitle, $spaceIdPrefixMap ); if ( $attachmentTargetFilename === '' ) { - $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $attachmentTargetFilename ); + $this->customBuckets->addData( + 
'debug-analyze-invalid-titles-attachment-id-to-title', + $attachmentId, $attachmentTargetFilename + ); return; } @@ -1007,7 +1034,10 @@ private function makeAttachmentTargetFilenameFromData( $targetName = $filenameBuilder->buildFromAttachmentData( $attachmentSpaceId, $attachmentOrigFilename, $shortTargetTitle ); } catch ( InvalidTitleException $ex ) { - $this->customBuckets->addData( 'debug-analyze-invalid-titles-attachment-id-to-title', $attachmentId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-attachment-id-to-title', + $attachmentId, $ex->getInvalidTitle() + ); $this->logger->error( $ex->getMessage() ); $targetName = $ex->getInvalidTitle(); } @@ -1153,7 +1183,19 @@ private function checkTitles(): void { } $this->output->writeln( - "\nPlease check \"warning-analyze-invalid-namespaces.php\", \"warning-analyze-invalid-titles.php\" and \"warning-analyze-invalid-filenames.php\" before continuing with extract step" + "\nPlease check" + ); + $this->output->writeln( + "\n - \"warning-analyze-invalid-namespaces.php\"" + ); + $this->output->writeln( + "\n - \"warning-analyze-invalid-titles.php\"" + ); + $this->output->writeln( + "\n - \"warning-analyze-invalid-filenames.php\"" + ); + $this->output->writeln( + "\nbefore continuing with extract step" ); } } diff --git a/src/Utility/TitleValidityChecker.php b/src/Utility/TitleValidityChecker.php index 2f8609e..0ebe627 100644 --- a/src/Utility/TitleValidityChecker.php +++ b/src/Utility/TitleValidityChecker.php @@ -6,7 +6,7 @@ class TitleValidityChecker { /** * @param string $title - * @return boolean + * @return bool */ public function validate( string $title ): bool { if ( !$this->hasValidEnding( $title ) ) { @@ -17,7 +17,7 @@ public function validate( string $title ): bool { if ( $this->hasDoubleCollon( $title ) ) { return false; } - + $namespace = substr( $title, 0, strpos( $title, ':' ) ); $text = substr( $title, strpos( $title, ':' ) + 1 ); @@ -39,7 +39,7 @@ public function 
validate( string $title ): bool { /** * @param string $title - * @return boolean + * @return bool */ public function hasValidEnding( string $title ): bool { if ( str_ends_with( $title, '_' ) ) { @@ -50,7 +50,7 @@ public function hasValidEnding( string $title ): bool { /** * @param string $title - * @return boolean + * @return bool */ public function hasDoubleCollon( string $title ): bool { if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { @@ -61,7 +61,7 @@ public function hasDoubleCollon( string $title ): bool { /** * @param string $namespace - * @return boolean + * @return bool */ public function hasValidNamespace( string $namespace ): bool { $matches = []; @@ -74,7 +74,7 @@ public function hasValidNamespace( string $namespace ): bool { /** * @param string $title - * @return boolean + * @return bool */ public function hasValidLength( string $title ): bool { if ( strlen( $title ) > 255 ) { @@ -83,4 +83,4 @@ public function hasValidLength( string $title ): bool { return true; } -} \ No newline at end of file +} From 3f3f08c788ed0fb81810436110e0a1b28f3752eb Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Thu, 4 Dec 2025 10:42:32 +0100 Subject: [PATCH 08/11] Fix space details macro --- src/Converter/Processor/StructuredMacroSpaceDetails.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Converter/Processor/StructuredMacroSpaceDetails.php b/src/Converter/Processor/StructuredMacroSpaceDetails.php index db07371..5b663f4 100644 --- a/src/Converter/Processor/StructuredMacroSpaceDetails.php +++ b/src/Converter/Processor/StructuredMacroSpaceDetails.php @@ -11,7 +11,7 @@ class StructuredMacroSpaceDetails extends StructuredMacroProcessorBase { * @return string */ protected function getMacroName(): string { - return 'global-space-details'; + return 'space-details'; } /** From 72eca34a19558f19c615d45052464c26243137e1 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Thu, 4 Dec 2025 10:43:57 +0100 Subject: [PATCH 09/11] Fix space details macro --- 
src/Composer/_defaultpages/Template/SpaceDetails | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Composer/_defaultpages/Template/SpaceDetails b/src/Composer/_defaultpages/Template/SpaceDetails index 83caf63..ce01f5e 100644 --- a/src/Composer/_defaultpages/Template/SpaceDetails +++ b/src/Composer/_defaultpages/Template/SpaceDetails @@ -1,4 +1,4 @@ -Template for macro "global-space-details" +Template for macro "space-details" {| class="wikitable" style="width:{{{width}}}" |- | Name || {{NAMESPACE}} From 5079d5c61e0657fd93345b9b5a2bc3e372c7ab54 Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Fri, 12 Dec 2025 09:40:18 +0100 Subject: [PATCH 10/11] Add title skip list and compress long titles --- doc/config.sample.yaml | 8 +++ src/Analyzer/ConfluenceAnalyzer.php | 89 ++++++++++++++++++++++++--- src/Command/Extract.php | 1 + src/Composer/ConfluenceComposer.php | 89 ++++++++++++++++----------- src/Converter/ConfluenceConverter.php | 38 ++++++++++-- src/Extractor/ConfluenceExtractor.php | 9 ++- 6 files changed, 182 insertions(+), 52 deletions(-) diff --git a/doc/config.sample.yaml b/doc/config.sample.yaml index db0e6db..4cf1784 100644 --- a/doc/config.sample.yaml +++ b/doc/config.sample.yaml @@ -4,7 +4,15 @@ config: ABC: "MY_NAMESPACE:ABC/" DEF: "MY_NAMESPACE:DEf/" GHI: "GHI_NAMESPACE:" + analyzer-include-spacekey: + - ABC + - DEF + composer-include-namespace: + - ABC + composer-skip-titles: + - ABC:DEF/GHI categories: - My Category 1 - My Category 2 ext-ns-file-repo-compat: true + include-history: false diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index 8d2073e..c2a419d 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -5,16 +5,19 @@ use DOMDocument; use DOMElement; use HalloWelt\MediaWiki\Lib\Migration\AnalyzerBase; +use HalloWelt\MediaWiki\Lib\Migration\ApplyCompressedTitle; use HalloWelt\MediaWiki\Lib\Migration\DataBuckets; use 
HalloWelt\MediaWiki\Lib\Migration\InvalidTitleException; use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface; use HalloWelt\MediaWiki\Lib\Migration\TitleBuilder as GenericTitleBuilder; +use HalloWelt\MediaWiki\Lib\Migration\TitleCompressor; use HalloWelt\MediaWiki\Lib\Migration\WindowsFilename; use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Utility\FilenameBuilder; use HalloWelt\MigrateConfluence\Utility\TitleBuilder; use HalloWelt\MigrateConfluence\Utility\TitleValidityChecker; use HalloWelt\MigrateConfluence\Utility\XMLHelper; +use phpDocumentor\Reflection\Types\Boolean; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; @@ -55,11 +58,14 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I */ private $addedAttachmentIds = []; - /** - * - * @var string - */ - private $pageConfluenceTitle = ''; + /** @var array */ + private $pagesTitlesMap = []; + + /** @var array */ + private $pageIdToTitleMap = []; + + /** @var array */ + private $titleRevision = []; /** * @var string @@ -76,6 +82,9 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I */ private $advancedConfig = []; + /** @var Boolean */ + private $includeHistory = false; + /** * * @param array $config @@ -89,6 +98,8 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets 'analyze-space-name-to-prefix-map', 'analyze-space-id-to-name-map', 'analyze-space-key-to-name-map', + 'analyze-pages-titles-map', + 'analyze-page-id-to-title-map', 'analyze-page-id-to-confluence-title-map', 'analyze-page-id-to-parent-page-id-map', 'analyze-body-content-id-to-page-id-map', @@ -100,6 +111,7 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets 'analyze-page-id-to-confluence-key-map', 'analyze-title-to-attachment-title', 'analyze-attachment-id-to-target-filename-map', + 'analzye-title-revisions', 
'debug-analyze-invalid-titles-page-id-to-title', 'debug-analyze-invalid-titles-attachment-id-to-title', @@ -140,6 +152,12 @@ private function setConfigVars(): void { } $this->advancedConfig['analyzer-include-spacekey'] = $normalizedAnalyzerIncludeSpacekey; } + + if ( isset( $this->advancedConfig['include-history'] ) ) { + if ( is_bool( $this->advancedConfig['include-history'] ) ) { + $this->includeHistory = $this->advancedConfig['include-history']; + } + } } /** @@ -252,6 +270,29 @@ protected function doAnalyze( SplFileInfo $file ): bool { } $xmlReader->close(); + // compress title lenght and create pages-titles-map and page-id-to-title-map + $titleCompressor = new TitleCompressor(); + $compressedTitlesMap = $titleCompressor->execute( $this->pagesTitlesMap ); + + $applyCompressedTitles = new ApplyCompressedTitle( $compressedTitlesMap ); + $compressedPagesTitlesMap = $applyCompressedTitles->toMapValues( $this->pagesTitlesMap ); + foreach ( $compressedPagesTitlesMap as $key => $title ) { + $this->buckets->addData( 'global-pages-titles-map', $key, $title, false, true ); + } + $compressedPageIdToTitleMap = $applyCompressedTitles->toMapValues( $this->pageIdToTitleMap ); + ksort( $compressedPageIdToTitleMap ); + foreach ( $compressedPageIdToTitleMap as $id => $title ) { + $this->buckets->addData( 'global-page-id-to-title-map', $id, $title, false, true ); + } + + $compressedTitleRevison = $applyCompressedTitles->toMapKeys( $this->titleRevision ); + ksort( $compressedTitleRevison ); + foreach ( $compressedTitleRevison as $title => $revisions ) { + foreach( $revisions as $revision ) { + $this->addTitleRevision( $title, $revision ); + } + } + // Process title attachments fallback $xmlReader->open( $file->getPathname() ); $read = $xmlReader->read(); @@ -624,7 +665,7 @@ private function buildPageMaps( DOMDocument $dom ): void { return; } $status = $xmlHelper->getPropertyValue( 'contentStatus', $pageNode ); - if ( $status !== 'current' ) { + if ( !$this->includeHistory && ( 
$status !== 'current' ) ) { return; } $spaceId = $xmlHelper->getPropertyValue( 'space', $pageNode ); @@ -672,8 +713,10 @@ private function buildPageMaps( DOMDocument $dom ): void { $this->output->writeln( "Add page '$targetTitle' (ID:$pageId)" ); /** - * Adds data bucket "global-pages-titles-map", which contains mapping from page title itself to full page title. + * Adds data bucket "analyze-pages-titles-map", which contains mapping from page title itself to full page title. * Full page title contains parent pages and namespace (if it is not general space). + * + * After testing for title validity and sanitizing titles they will be added to global-pages-titles-map later. * Example: * "Detailed_planning" -> "Dokumentation/Detailed_planning" */ @@ -686,17 +729,19 @@ private function buildPageMaps( DOMDocument $dom ): void { $pageConfluenceTitle = "$spaceId---{$pageConfluenceTitle}"; // Some normalization $pageConfluenceTitle = str_replace( ' ', '_', $pageConfluenceTitle ); - $this->buckets->addData( 'global-pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); $this->customBuckets->addData( 'analyze-page-id-to-confluence-key-map', $pageId, $pageConfluenceTitle, false, true ); + $this->buckets->addData( 'analyze-pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); + $this->pagesTitlesMap[$pageConfluenceTitle] = $targetTitle; // Also add pages IDs in Confluence to full page title mapping. // It is needed to have enough context on converting stage, // to know from filename which page is currently being converted. 
- $this->buckets->addData( 'global-page-id-to-title-map', $pageId, $targetTitle, false, true ); + $this->buckets->addData( 'analyze-page-id-to-title-map', $pageId, $targetTitle, false, true ); $this->buckets->addData( 'global-page-id-to-space-id', $pageId, $spaceId, false, true ); + $this->pageIdToTitleMap[$pageId] = $targetTitle; $revisionTimestamp = $this->buildRevisionTimestamp( $xmlHelper, $pageNode ); $bodyContentIds = $this->getBodyContentIds( $xmlHelper, $pageNode ); @@ -725,8 +770,13 @@ private function buildPageMaps( DOMDocument $dom ): void { } $version = $xmlHelper->getPropertyValue( 'version', $pageNode ); + $revision = implode( '/', $bodyContentIds ) . "@$version-$revisionTimestamp"; - $this->addTitleRevision( $targetTitle, implode( '/', $bodyContentIds ) . "@$version-$revisionTimestamp" ); + if ( !isset( $this->titleRevision[$targetTitle] ) ) { + $this->titleRevision[$targetTitle] = []; + } + $this->titleRevision[$targetTitle][] = $revision; + $this->addAnalyzerTitleRevision( $targetTitle, $revision ); // Find attachments @@ -751,8 +801,17 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); $pageId = $xmlHelper->getIDNodeValue( $element ); + if ( !isset( $pageIdConflueTitleMap[$pageId] ) ) { + return; + } $confluenceTitle = $pageIdConflueTitleMap[$pageId]; + if ( !isset( $pageIdConfluenKeyMap[$pageId] ) ) { + return; + } $confluenceKey = $pageIdConfluenKeyMap[$pageId]; + if ( !isset( $pagesTitlesMap[$confluenceKey] ) ) { + return; + } $wikiTitle = $pagesTitlesMap[$confluenceKey]; // In case of ERM34465 this seems to be empty because @@ -1200,6 +1259,16 @@ private function checkTitles(): void { } } + /** + * + * @param string $titleText + * @param string $contentReference + * @return void + */ + private function addAnalyzerTitleRevision( $titleText, $contentReference = 'n/a' ) { + $this->buckets->addData( 
'analzye-title-revisions', $titleText, $contentReference ); + } + /** * * @param string $titleText diff --git a/src/Command/Extract.php b/src/Command/Extract.php index 322c4e8..b94ffb7 100644 --- a/src/Command/Extract.php +++ b/src/Command/Extract.php @@ -79,6 +79,7 @@ protected function getBucketKeys() { // From this step 'global-title-metadata', 'global-revision-contents', + 'global-body-contents-to-pages-map', ]; } } diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index 448e9c8..d51556a 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -83,55 +83,70 @@ public function buildXML( Builder $builder ) { $bodyContentIDMainpageID[$bodyContentsID] = $homepageID; } - foreach ( $pagesRevisions as $pageTitle => $pageRevision ) { + foreach ( $pagesRevisions as $pageTitle => $pageRevisions ) { $this->output->writeln( "\nProcessing: $pageTitle\n" ); - $pageRevisionData = explode( '@', $pageRevision[0] ); + // Sometimes not all namespaces should be used for the import. To skip this namespaces + // use this option + $namespace = $this->getNamespace( $pageTitle ); + if ( + isset( $this->advancedConfig['composer-include-namespace'] ) + && !in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) + ) { + $this->output->writeln( "Namespace {$namespace} skipped by configuration" ); + continue; + } - $timestamp = explode( '-', $pageRevisionData[1] )[1]; + // Sometimes titles have contents >256kB which might break the import. 
To skip this titles + // use this option + if ( + isset( $this->advancedConfig['composer-skip-titles'] ) + && in_array( $pageTitle, $this->advancedConfig['composer-skip-titles'] ) + ) { + $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); + continue; + } - $bodyContentIds = $pageRevisionData[0]; - $bodyContentIdsArr = explode( '/', $bodyContentIds ); + foreach( $pageRevisions as $pageRevision ) { + $pageRevisionData = explode( '@', $pageRevision ); - $pageContent = ""; - foreach ( $bodyContentIdsArr as $bodyContentId ) { - if ( $bodyContentId === '' ) { - // Skip if no reference to a body content is not set - continue; - } + $timestamp = explode( '-', $pageRevisionData[1] )[1]; - $this->output->writeln( "Getting '$bodyContentId' body content..." ); - - $pageContent .= $this->workspace->getConvertedContent( $bodyContentId ) . "\n"; - - // Add space description to homepage - if ( isset( $bodyContentIDMainpageID[$bodyContentId] ) ) { - // get homepage id if it is a homepage - $mainpageID = $bodyContentIDMainpageID[$bodyContentId]; - if ( isset( $homepageSpaceIDMap[$mainpageID] ) ) { - // get space id - $spaceID = $homepageSpaceIDMap[$mainpageID]; - if ( isset( $spaceIDDescriptionIDMap[$spaceID] ) ) { - // get description id - $descID = $spaceIDDescriptionIDMap[$spaceID]; - if ( isset( $spaceDescriptionIDBodyIDMap[$descID] ) ) { + $bodyContentIds = $pageRevisionData[0]; + $bodyContentIdsArr = explode( '/', $bodyContentIds ); + + $pageContent = ""; + foreach ( $bodyContentIdsArr as $bodyContentId ) { + if ( $bodyContentId === '' ) { + // Skip if no reference to a body content is not set + continue; + } + + $this->output->writeln( "Getting '$bodyContentId' body content..." ); + + $pageContent .= $this->workspace->getConvertedContent( $bodyContentId ) . 
"\n"; + + // Add space description to homepage + if ( isset( $bodyContentIDMainpageID[$bodyContentId] ) ) { + // get homepage id if it is a homepage + $mainpageID = $bodyContentIDMainpageID[$bodyContentId]; + if ( isset( $homepageSpaceIDMap[$mainpageID] ) ) { + // get space id + $spaceID = $homepageSpaceIDMap[$mainpageID]; + if ( isset( $spaceIDDescriptionIDMap[$spaceID] ) ) { // get description id - $descBodyID = $spaceDescriptionIDBodyIDMap[$descID]; - $description = $this->workspace->getConvertedContent( $descBodyID ); - $pageContent .= "[[Space description::$description]]\n"; + $descID = $spaceIDDescriptionIDMap[$spaceID]; + if ( isset( $spaceDescriptionIDBodyIDMap[$descID] ) ) { + // get description id + $descBodyID = $spaceDescriptionIDBodyIDMap[$descID]; + $description = $this->workspace->getConvertedContent( $descBodyID ); + $pageContent .= "[[Space description::$description]]\n"; + } } } } } - } - $namespace = $this->getNamespace( $pageTitle ); - if ( - isset( $this->advancedConfig['composer-include-namespace'] ) - && !in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) - ) { - $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); - } else { $builder->addRevision( $pageTitle, $pageContent, $timestamp ); // Append attachments diff --git a/src/Converter/ConfluenceConverter.php b/src/Converter/ConfluenceConverter.php index 6c61173..e25c514 100644 --- a/src/Converter/ConfluenceConverter.php +++ b/src/Converter/ConfluenceConverter.php @@ -73,6 +73,9 @@ class ConfluenceConverter extends PandocHTML implements IOutputAwareInterface { /** @var DataBuckets */ private $buckets = null; + /** @var DataBuckets */ + private $customBuckets = null; + /** @var ConversionDataLookup */ private $dataLookup = null; @@ -127,13 +130,16 @@ public function __construct( $config, Workspace $workspace ) { 'global-files', 'global-userkey-to-username-map', 'global-space-description-id-to-body-id-map', - 'global-gliffy-map' + 'global-gliffy-map', 
] ); $this->buckets->loadFromWorkspace( $this->workspace ); + $this->customBuckets = new DataBuckets( [ + 'warning-convert-body-content-id-content-size', + ] ); $this->executionTimeBuckets = new DataBuckets( [ - 'converter-body-content-id-execution-time', + 'convert-body-content-id-execution-time', ] ); } @@ -149,6 +155,10 @@ public function setOutput( Output $output ) { */ protected function doConvert( SplFileInfo $file ): string { $executionTime = new ExecutionTime(); + + $this->customBuckets->loadFromWorkspace( $this->workspace ); + $this->executionTimeBuckets->loadFromWorkspace( $this->workspace ); + $this->output->writeln( $file->getPathname() ); $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->buckets ); $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->buckets ); @@ -218,16 +228,37 @@ protected function doConvert( SplFileInfo $file ): string { $this->postProcessLinks(); $this->postprocessWikiText(); + // Content size sometimes breaks import + $exceed = ''; + $wikiTextLength = strlen( $this->wikiText ); + $wikiTextLength = $wikiTextLength / 1000; + if ( $wikiTextLength > 512 ) { + $exceed = '512'; + } elseif ( $wikiTextLength > 256 ) { + $exceed = '256'; + } elseif ( $wikiTextLength > 100 ) { + $exceed = '100'; + } + if ( $exceed !== '' ) { + $this->customBuckets->addData( + 'warning-convert-body-content-id-content-size', + $exceed, + $bodyContentId + ); + $this->output->writeln( "bodyContentId {$bodyContentId} contains large content" ); + } + $executionTimeString = $executionTime->getHumanReadableTime(); $this->executionTimeBuckets->addData( - 'converter-body-content-id-execution-time', + 'convert-body-content-id-execution-time', $bodyContentId, $executionTimeString, false, true ); $this->executionTimeBuckets->saveToWorkspace( $this->workspace ); + $this->customBuckets->saveToWorkspace( $this->workspace ); return $this->wikiText; } diff --git a/src/Extractor/ConfluenceExtractor.php 
b/src/Extractor/ConfluenceExtractor.php index 16e6fc1..38d9971 100644 --- a/src/Extractor/ConfluenceExtractor.php +++ b/src/Extractor/ConfluenceExtractor.php @@ -30,6 +30,7 @@ class ConfluenceExtractor extends ExtractorBase { */ public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); + $this->customBuckets = new DataBuckets( [ 'extract-labelling-id-to-label-id-map', 'extract-label-id-to-name-map', @@ -41,6 +42,7 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets * @return bool */ protected function doExtract( SplFileInfo $file ): bool { + $this->buckets->loadFromWorkspace( $this->workspace ); $this->customBuckets->loadFromWorkspace( $this->workspace ); if ( isset( $this->config['config']['categories'] ) ) { @@ -109,11 +111,16 @@ protected function doExtract( SplFileInfo $file ): bool { * @return void */ private function extractBodyContents( DOMDocument $dom ): void { + $bodyContentsToPagesMap = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); + $xmlHelper = new XMLHelper( $dom ); $bodyContents = $xmlHelper->getObjectNodes( 'BodyContent' ); foreach ( $bodyContents as $bodyContent ) { $id = $xmlHelper->getIDNodeValue( $bodyContent ); + if ( !isset( $bodyContentsToPagesMap[ $id ] ) ) { + continue; + } $bodyContentHTML = $this->getBodyContentHTML( $xmlHelper, $bodyContent ); $targetFileName = $this->workspace->saveRawContent( $id, $bodyContentHTML ); $this->addRevisionContent( $id, $targetFileName ); @@ -229,7 +236,7 @@ private function extractPageMetaData( DOMDocument $dom ) { 'categories' => $categories ]; - $this->buckets->addData( 'global-title-metadata', $id, $meta, false ); + $this->addTitleMetaData( $id, $meta ); } } From 78558d3faaa0a7f646ea98ad8be961d909fdae7d Mon Sep 17 00:00:00 2001 From: Daniel Vogel Date: Fri, 19 Dec 2025 07:17:23 +0100 Subject: [PATCH 11/11] Fix adding revisions; sort revisions --- 
src/Analyzer/ConfluenceAnalyzer.php | 4 ++-- src/Composer/ConfluenceComposer.php | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index c2a419d..9f4b1f4 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -154,7 +154,7 @@ private function setConfigVars(): void { } if ( isset( $this->advancedConfig['include-history'] ) ) { - if ( is_bool( $this->advancedConfig['include-history'] ) ) { + if ( $this->advancedConfig['include-history'] === true ) { $this->includeHistory = $this->advancedConfig['include-history']; } } @@ -288,6 +288,7 @@ protected function doAnalyze( SplFileInfo $file ): bool { $compressedTitleRevison = $applyCompressedTitles->toMapKeys( $this->titleRevision ); ksort( $compressedTitleRevison ); foreach ( $compressedTitleRevison as $title => $revisions ) { + $revisions = array_unique( $revisions ); foreach( $revisions as $revision ) { $this->addTitleRevision( $title, $revision ); } @@ -657,7 +658,6 @@ private function buildPageMaps( DOMDocument $dom ): void { $pages = $xmlHelper->getObjectNodes( 'Page' ); if ( count( $pages ) < 1 ) { - return; } $pageNode = $pages->item( 0 ); diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index d51556a..233aea5 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -107,12 +107,29 @@ public function buildXML( Builder $builder ) { continue; } + $sortedRevisions = []; foreach( $pageRevisions as $pageRevision ) { $pageRevisionData = explode( '@', $pageRevision ); + $bodyContentIds = $pageRevisionData[0]; - $timestamp = explode( '-', $pageRevisionData[1] )[1]; + $versionTimestamp = explode( '-', $pageRevisionData[1] ); + $version = $versionTimestamp[0]; + $timestamp = $versionTimestamp[1]; - $bodyContentIds = $pageRevisionData[0]; + $sortedRevisions[$timestamp] = $bodyContentIds; + } + + ksort( 
$sortedRevisions ); + if ( !isset( $this->advancedConfig['include-history'] ) + || $this->advancedConfig['include-history'] !== true + ) { + $bodyContentIds = end( $sortedRevisions ); + $timestamp = array_key_last( $sortedRevisions ); + $sortedRevisions = []; // Reset sortedRevisions + $sortedRevisions[$timestamp] = $bodyContentIds; + } + + foreach( $sortedRevisions as $timestamp => $bodyContentIds ) { $bodyContentIdsArr = explode( '/', $bodyContentIds ); $pageContent = "";