diff --git a/doc/config.sample.yaml b/doc/config.sample.yaml index db0e6db..4cf1784 100644 --- a/doc/config.sample.yaml +++ b/doc/config.sample.yaml @@ -4,7 +4,15 @@ config: ABC: "MY_NAMESPACE:ABC/" DEF: "MY_NAMESPACE:DEf/" GHI: "GHI_NAMESPACE:" + analyzer-include-spacekey: + - ABC + - DEF + composer-include-namespace: + - ABC + composer-skip-titles: + - ABC:DEF/GHI categories: - My Category 1 - My Category 2 ext-ns-file-repo-compat: true + include-history: false diff --git a/src/Analyzer/ConfluenceAnalyzer.php b/src/Analyzer/ConfluenceAnalyzer.php index 1063a0a..9f4b1f4 100644 --- a/src/Analyzer/ConfluenceAnalyzer.php +++ b/src/Analyzer/ConfluenceAnalyzer.php @@ -5,14 +5,19 @@ use DOMDocument; use DOMElement; use HalloWelt\MediaWiki\Lib\Migration\AnalyzerBase; +use HalloWelt\MediaWiki\Lib\Migration\ApplyCompressedTitle; use HalloWelt\MediaWiki\Lib\Migration\DataBuckets; use HalloWelt\MediaWiki\Lib\Migration\InvalidTitleException; use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface; use HalloWelt\MediaWiki\Lib\Migration\TitleBuilder as GenericTitleBuilder; +use HalloWelt\MediaWiki\Lib\Migration\TitleCompressor; +use HalloWelt\MediaWiki\Lib\Migration\WindowsFilename; use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Utility\FilenameBuilder; use HalloWelt\MigrateConfluence\Utility\TitleBuilder; +use HalloWelt\MigrateConfluence\Utility\TitleValidityChecker; use HalloWelt\MigrateConfluence\Utility\XMLHelper; +use phpDocumentor\Reflection\Types\Boolean; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; @@ -23,22 +28,11 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, IOutputAwareInterface { - /** - * - * @var DOMDocument - */ - private $dom = null; - /** * @var DataBuckets */ private $customBuckets = null; - /** - * @var XMLHelper - */ - private $helper = null; - /** * @var LoggerInterface */ @@ -64,11 +58,14 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I */ private $addedAttachmentIds = []; - /** - * - * @var string - */ - private $pageConfluenceTitle = ''; + /** @var array */ + private $pagesTitlesMap = []; + + /** @var array */ + private $pageIdToTitleMap = []; + + /** @var array */ + private $titleRevision = []; /** * @var string @@ -85,6 +82,9 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I */ private $advancedConfig = []; + /** @var Boolean */ + private $includeHistory = false; + /** * * @param array $config @@ -94,43 +94,31 @@ class ConfluenceAnalyzer extends AnalyzerBase implements LoggerAwareInterface, I public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); $this->customBuckets = new DataBuckets( [ - 'space-id-to-prefix-map', - 'space-key-to-prefix-map', - 'space-name-to-prefix-map', - 'space-id-to-name-map', - 'space-key-to-name-map', - 'space-id-homepages', - 'space-id-to-description-id-map', - 'space-description-id-to-body-id-map', - 'space-details', - 'page-id-to-confluence-title-map', - 'page-id-to-parent-page-id-map', - 'body-content-id-to-page-id-map', - 'attachment-id-to-orig-filename-map', - 'attachment-id-to-space-id-map', - 'attachment-id-to-reference-map', - 'attachment-id-to-container-content-id-map', - 'attachment-id-to-content-status-map', - 'userkey-to-username-map', - 'pages-titles-map', - 'page-id-to-confluence-key-map', - 'page-id-to-title-map', - 'page-id-to-space-id', - 'body-contents-to-pages-map', - 'title-files', - 'additional-files', - 'attachment-orig-filename-target-filename-map', - 'attachment-id-to-target-filename-map', - 'filenames-to-filetitles-map', - - 'invalid-titles', - 'invalid-namespaces', - - 'debug-attachment-id-to-target-filename', - 'debug-missing-attachment-id-to-filename', - 'debug-attachment-page-to-attachment-id', - 'debug-fallback-attachment-id-to-target-filename', - 'debug-additional-attachment-id-to-target-filename', + 'analyze-space-id-to-space-key-map', + 'analyze-space-name-to-prefix-map', + 'analyze-space-id-to-name-map', + 'analyze-space-key-to-name-map', + 'analyze-pages-titles-map', + 'analyze-page-id-to-title-map', + 'analyze-page-id-to-confluence-title-map', + 'analyze-page-id-to-parent-page-id-map', + 'analyze-body-content-id-to-page-id-map', + 'analyze-attachment-id-to-orig-filename-map', + 'analyze-attachment-id-to-space-id-map', + 'analyze-attachment-id-to-reference-map', + 'analyze-attachment-id-to-container-content-id-map', + 'analyze-attachment-id-to-content-status-map', + 'analyze-page-id-to-confluence-key-map', + 'analyze-title-to-attachment-title', + 'analyze-attachment-id-to-target-filename-map', + 'analzye-title-revisions', + + 'debug-analyze-invalid-titles-page-id-to-title', + 'debug-analyze-invalid-titles-attachment-id-to-title', + + 'warning-analyze-invalid-namespaces', + 'warning-analyze-invalid-titles', + 'warning-analyze-invalid-filenames', ] ); $this->logger = new NullLogger(); @@ -164,6 +152,12 @@ private function setConfigVars(): void { } $this->advancedConfig['analyzer-include-spacekey'] = $normalizedAnalyzerIncludeSpacekey; } + + if ( isset( $this->advancedConfig['include-history'] ) ) { + if ( $this->advancedConfig['include-history'] === true ) { + $this->includeHistory = $this->advancedConfig['include-history']; + } + } } /** @@ -276,6 +270,30 @@ protected function doAnalyze( SplFileInfo $file ): bool { } $xmlReader->close(); + // compress title lenght and create pages-titles-map and page-id-to-title-map + $titleCompressor = new TitleCompressor(); + $compressedTitlesMap = $titleCompressor->execute( $this->pagesTitlesMap ); + + $applyCompressedTitles = new ApplyCompressedTitle( $compressedTitlesMap ); + $compressedPagesTitlesMap = $applyCompressedTitles->toMapValues( $this->pagesTitlesMap ); + foreach ( $compressedPagesTitlesMap as $key => $title ) { + $this->buckets->addData( 'global-pages-titles-map', $key, $title, false, true ); + } + $compressedPageIdToTitleMap = $applyCompressedTitles->toMapValues( $this->pageIdToTitleMap ); + ksort( $compressedPageIdToTitleMap ); + foreach ( $compressedPageIdToTitleMap as $id => $title ) { + $this->buckets->addData( 'global-page-id-to-title-map', $id, $title, false, true ); + } + + $compressedTitleRevison = $applyCompressedTitles->toMapKeys( $this->titleRevision ); + ksort( $compressedTitleRevison ); + foreach ( $compressedTitleRevison as $title => $revisions ) { + $revisions = array_unique( $revisions ); + foreach( $revisions as $revision ) { + $this->addTitleRevision( $title, $revision ); + } + } + // Process title attachments fallback $xmlReader->open( $file->getPathname() ); $read = $xmlReader->read(); @@ -345,20 +363,23 @@ private function buildSpaceMaps( DOMDocument $dom ): void { return; } - $this->customBuckets->addData( - 'space-id-to-prefix-map', $spaceId, $customSpacePrefix, false, true + $this->buckets->addData( + 'global-space-id-to-prefix-map', $spaceId, $customSpacePrefix, false, true + ); + $this->buckets->addData( + 'global-space-key-to-prefix-map', $spaceKey, $customSpacePrefix, false, true ); $this->customBuckets->addData( - 'space-key-to-prefix-map', $spaceKey, $customSpacePrefix, false, true + 'analyze-space-id-to-space-key-map', $spaceId, $spaceKey, false, true ); $this->customBuckets->addData( - 'space-name-to-prefix-map', $spaceName, $customSpacePrefix, false, true + 'analyze-space-name-to-prefix-map', $spaceName, $customSpacePrefix, false, true ); $this->customBuckets->addData( - 'space-id-to-name-map', $spaceId, $spaceName, false, true + 'analyze-space-id-to-name-map', $spaceId, $spaceName, false, true ); $this->customBuckets->addData( - 'space-key-to-name-map', $spaceKey, $spaceName, false, true + 'analyze-space-key-to-name-map', $spaceKey, $spaceName, false, true ); $homePageId = -1; @@ -367,7 +388,7 @@ private function buildSpaceMaps( DOMDocument $dom ): void { $homePageId = $xmlHelper->getIDNodeValue( $homePagePropertyNode ); } if ( $homePageId > -1 ) { - $this->customBuckets->addData( 'space-id-homepages', $spaceId, $homePageId, false, true ); + $this->buckets->addData( 'global-space-id-homepages', $spaceId, $homePageId, false, true ); } $details = []; @@ -390,8 +411,8 @@ private function buildSpaceMaps( DOMDocument $dom ): void { $propertyNode = $xmlHelper->getPropertyNode( 'description' ); if ( $propertyNode !== null ) { $details['description'] = $xmlHelper->getIDNodeValue( $propertyNode ); - $this->customBuckets->addData( - 'space-id-to-description-id-map', + $this->buckets->addData( + 'global-space-id-to-description-id-map', $spaceId, $details['description'], false, @@ -419,7 +440,7 @@ private function buildSpaceMaps( DOMDocument $dom ): void { } if ( !empty( $details ) ) { - $this->customBuckets->addData( 'space-details', $spaceId, $details, false, true ); + $this->buckets->addData( 'global-space-details', $spaceId, $details, false, true ); $this->output->writeln( "Add details description ($spaceId)" ); } } @@ -443,7 +464,7 @@ private function buildSpaceDescriptionMap( DOMDocument $dom ): void { $bodyContents = $xmlHelper->getElementsFromCollection( 'bodyContents', $spaceDescription ); foreach ( $bodyContents as $bodyContent ) { $id = $xmlHelper->getIDNodeValue( $bodyContent ); - $this->customBuckets->addData( 'space-description-id-to-body-id-map', $descID, $id, false, true ); + $this->buckets->addData( 'global-space-description-id-to-body-id-map', $descID, $id, false, true ); $this->output->writeln( "\nAdd space description ($id)" ); } } @@ -480,13 +501,19 @@ private function buildParentPageMap( DOMDocument $dom ): void { $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $parentPageId = $xmlHelper->getPropertyValue( 'parent', $pageNode ); if ( $parentPageId !== null ) { - $this->customBuckets->addData( 'page-id-to-parent-page-id-map', $pageId, $parentPageId, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-parent-page-id-map', + $pageId, $parentPageId, false, true + ); } $pageId = $xmlHelper->getIDNodeValue( $pageNode ); $confluenceTitle = $xmlHelper->getPropertyValue( 'title', $pageNode ); if ( $confluenceTitle !== null ) { - $this->customBuckets->addData( 'page-id-to-confluence-title-map', $pageId, $confluenceTitle, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-confluence-title-map', + $pageId, $confluenceTitle, false, true + ); } } @@ -507,7 +534,7 @@ private function buildBodyContentMap( DOMDocument $dom ): void { $bodyContentId = $xmlHelper->getIDNodeValue( $bodyContentObject ); $pageId = $xmlHelper->getPropertyValue( 'content', $bodyContentObject ); - $this->customBuckets->addData( 'body-content-id-to-page-id-map', + $this->customBuckets->addData( 'analyze-body-content-id-to-page-id-map', $bodyContentId, $pageId, false, true ); } @@ -540,29 +567,31 @@ private function buildAttachmentMaps( DOMDocument $dom ): void { if ( $attachmentFilename !== '' && is_int( $attachmentId ) ) { $this->customBuckets->addData( - 'attachment-id-to-orig-filename-map', $attachmentId, $attachmentFilename, false, true ); + 'analyze-attachment-id-to-orig-filename-map', $attachmentId, $attachmentFilename, false, true ); } $attachmentSpaceId = $xmlHelper->getPropertyValue( 'space', $attachmentNode ); if ( is_int( $attachmentId ) ) { $this->customBuckets->addData( - 'attachment-id-to-space-id-map', $attachmentId, $attachmentSpaceId, false, true ); + 'analyze-attachment-id-to-space-id-map', $attachmentId, $attachmentSpaceId, false, true ); } $attachmentReference = $this->makeAttachmentReference( $xmlHelper, $attachmentNode ); if ( $attachmentReference !== '' ) { $this->customBuckets->addData( - 'attachment-id-to-reference-map', $attachmentId, $attachmentReference, false, true ); + 'analyze-attachment-id-to-reference-map', $attachmentId, $attachmentReference, false, true ); } $containerContent = $xmlHelper->getPropertyNode( 'containerContent', $attachmentNode ); if ( $containerContent instanceof DOMElement ) { $containerContentId = $xmlHelper->getIDNodeValue( $containerContent ); if ( $containerContentId >= 0 ) { $this->customBuckets->addData( - 'attachment-id-to-container-content-id-map', $attachmentId, $containerContentId, false, true ); + 'analyze-attachment-id-to-container-content-id-map', + $attachmentId, $containerContentId, false, true + ); } } $attachmentNodeContentStatus = $xmlHelper->getPropertyValue( 'contentStatus', $attachmentNode ); $this->customBuckets->addData( - 'attachment-id-to-content-status-map', $attachmentId, $attachmentNodeContentStatus, false, true ); + 'analyze-attachment-id-to-content-status-map', $attachmentId, $attachmentNodeContentStatus, false, true ); } /** @@ -593,8 +622,8 @@ private function buildUserMap( DOMDocument $dom ): void { $mediaWikiUsername = $this->makeMWUserName( $lcUserName ); - $this->customBuckets->addData( - 'userkey-to-username-map', + $this->buckets->addData( + 'global-userkey-to-username-map', $userImplKey, $mediaWikiUsername, false @@ -618,17 +647,17 @@ private function buildUserMap( DOMDocument $dom ): void { * @return void */ private function buildPageMaps( DOMDocument $dom ): void { - $spaceIdToPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $spaceIdHomepages = $this->customBuckets->getBucketData( 'space-id-homepages' ); - $pageIdParentPageIdMap = $this->customBuckets->getBucketData( 'page-id-to-parent-page-id-map' ); - $pageIdConfluendTitleMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-title-map' ); - $bodyContents = $this->customBuckets->getBucketData( 'body-content-id-to-page-id-map' ); + $spaceIdToPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + $spaceIdToSpaceKeyMap = $this->customBuckets->getBucketData( 'analyze-space-id-to-space-key-map' ); + $spaceIdHomepages = $this->buckets->getBucketData( 'global-space-id-homepages' ); + $pageIdParentPageIdMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-parent-page-id-map' ); + $pageIdConfluendTitleMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-title-map' ); + $bodyContents = $this->customBuckets->getBucketData( 'analyze-body-content-id-to-page-id-map' ); $xmlHelper = new XMLHelper( $dom ); $pages = $xmlHelper->getObjectNodes( 'Page' ); if ( count( $pages ) < 1 ) { - return; } $pageNode = $pages->item( 0 ); @@ -636,20 +665,21 @@ private function buildPageMaps( DOMDocument $dom ): void { return; } $status = $xmlHelper->getPropertyValue( 'contentStatus', $pageNode ); - if ( $status !== 'current' ) { + if ( !$this->includeHistory && ( $status !== 'current' ) ) { return; } $spaceId = $xmlHelper->getPropertyValue( 'space', $pageNode ); if ( $spaceId === null ) { return; } - if ( !isset( $spaceIdToPrefixMap[$spaceId] ) ) { + if ( !isset( $spaceIdToSpaceKeyMap[$spaceId] ) ) { return; } - $prefix = $spaceIdToPrefixMap[$spaceId]; + $spaceKey = $spaceIdToSpaceKeyMap[$spaceId]; + if ( isset( $this->advancedConfig['analyzer-include-spacekey'] ) - && !in_array( strtolower( $prefix ), $this->advancedConfig['analyzer-include-spacekey'] ) + && !in_array( strtolower( $spaceKey ), $this->advancedConfig['analyzer-include-spacekey'] ) ) { return; } @@ -667,20 +697,26 @@ private function buildPageMaps( DOMDocument $dom ): void { try { $targetTitle = $titleBuilder->buildTitle( $pageNode ); } catch ( InvalidTitleException $ex ) { - $this->buckets->addData( 'title-invalids', $pageId, $ex->getInvalidTitle() ); - return; + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-page-id-to-title', + $pageId, $ex->getInvalidTitle() + ); + // We don't want to loose this page. Title can be modified after analyze process + $targetTitle = $ex->getInvalidTitle(); } if ( $targetTitle === '' ) { - $this->buckets->addData( 'title-invalids', $pageId, $targetTitle ); + $this->customBuckets->addData( 'debug-analyze-invalid-titles-page-id-to-title', $pageId, $targetTitle ); return; } $this->output->writeln( "Add page '$targetTitle' (ID:$pageId)" ); /** - * Adds data bucket "pages-titles-map", which contains mapping from page title itself to full page title. + * Adds data bucket "analyze-pages-titles-map", which contains mapping from page title itself to full page title. * Full page title contains parent pages and namespace (if it is not general space). + * + * After testing for title validity and sanitizing titles they will be added to global-pages-titles-map later. * Example: * "Detailed_planning" -> "Dokumentation/Detailed_planning" */ @@ -693,14 +729,19 @@ private function buildPageMaps( DOMDocument $dom ): void { $pageConfluenceTitle = "$spaceId---{$pageConfluenceTitle}"; // Some normalization $pageConfluenceTitle = str_replace( ' ', '_', $pageConfluenceTitle ); - $this->customBuckets->addData( 'pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); - $this->customBuckets->addData( 'page-id-to-confluence-key-map', $pageId, $pageConfluenceTitle, false, true ); + $this->customBuckets->addData( + 'analyze-page-id-to-confluence-key-map', + $pageId, $pageConfluenceTitle, false, true + ); + $this->buckets->addData( 'analyze-pages-titles-map', $pageConfluenceTitle, $targetTitle, false, true ); + $this->pagesTitlesMap[$pageConfluenceTitle] = $targetTitle; // Also add pages IDs in Confluence to full page title mapping. // It is needed to have enough context on converting stage, // to know from filename which page is currently being converted. - $this->customBuckets->addData( 'page-id-to-title-map', $pageId, $targetTitle, false, true ); - $this->customBuckets->addData( 'page-id-to-space-id', $pageId, $spaceId, false, true ); + $this->buckets->addData( 'analyze-page-id-to-title-map', $pageId, $targetTitle, false, true ); + $this->buckets->addData( 'global-page-id-to-space-id', $pageId, $spaceId, false, true ); + $this->pageIdToTitleMap[$pageId] = $targetTitle; $revisionTimestamp = $this->buildRevisionTimestamp( $xmlHelper, $pageNode ); $bodyContentIds = $this->getBodyContentIds( $xmlHelper, $pageNode ); @@ -708,7 +749,7 @@ private function buildPageMaps( DOMDocument $dom ): void { foreach ( $bodyContentIds as $bodyContentId ) { // TODO: Add UserImpl-key or directly MediaWiki username // (could also be done in `extract` as "metadata" ) - $this->customBuckets->addData( 'body-contents-to-pages-map', $bodyContentId, $pageId, false, true ); + $this->buckets->addData( 'global-body-contents-to-pages-map', $bodyContentId, $pageId, false, true ); } } else { $bodyContentIds = []; @@ -717,8 +758,8 @@ private function buildPageMaps( DOMDocument $dom ): void { if ( $pageId === $contentPageId ) { $bodyContentIds[] = $bodyContentId; - $this->customBuckets->addData( - 'body-contents-to-pages-map', + $this->buckets->addData( + 'global-body-contents-to-pages-map', $bodyContentId, $pageId, false, @@ -729,8 +770,13 @@ private function buildPageMaps( DOMDocument $dom ): void { } $version = $xmlHelper->getPropertyValue( 'version', $pageNode ); + $revision = implode( '/', $bodyContentIds ) . "@$version-$revisionTimestamp"; - $this->addTitleRevision( $targetTitle, implode( '/', $bodyContentIds ) . "@$version-$revisionTimestamp" ); + if ( !isset( $this->titleRevision[$targetTitle] ) ) { + $this->titleRevision[$targetTitle] = []; + } + $this->titleRevision[$targetTitle][] = $revision; + $this->addAnalyzerTitleRevision( $targetTitle, $revision ); // Find attachments @@ -744,17 +790,28 @@ private function buildPageMaps( DOMDocument $dom ): void { * @return void */ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $element, int $spaceId ): void { - $pageIdConflueTitleMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-title-map' ); - $pageIdConfluenKeyMap = $this->customBuckets->getBucketData( 'page-id-to-confluence-key-map' ); - $pagesTitlesMap = $this->customBuckets->getBucketData( 'pages-titles-map' ); - $spaceIdToPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'attachment-id-to-orig-filename-map' ); - $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'attachment-id-to-space-id-map' ); - $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'attachment-id-to-reference-map' ); + $pageIdConflueTitleMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-title-map' ); + $pageIdConfluenKeyMap = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-key-map' ); + $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); + $spaceIdToPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( + 'analyze-attachment-id-to-orig-filename-map' + ); + $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); + $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); $pageId = $xmlHelper->getIDNodeValue( $element ); + if ( !isset( $pageIdConflueTitleMap[$pageId] ) ) { + return; + } $confluenceTitle = $pageIdConflueTitleMap[$pageId]; + if ( !isset( $pageIdConfluenKeyMap[$pageId] ) ) { + return; + } $confluenceKey = $pageIdConfluenKeyMap[$pageId]; + if ( !isset( $pagesTitlesMap[$confluenceKey] ) ) { + return; + } $wikiTitle = $pagesTitlesMap[$confluenceKey]; // In case of ERM34465 this seems to be empty because @@ -778,6 +835,13 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement $confluenceTitle, $attachmentId, $attachmentSpaceId, $attachmentOrigFilename, $wikiTitle, $spaceIdToPrefixMap ); + if ( $attachmentTargetFilename === '' ) { + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-attachment-id-to-title', + $attachmentId, $attachmentTargetFilename + ); + continue; + } if ( !isset( $attachmentIdToReferenceMap[$attachmentId] ) ) { continue; } @@ -786,12 +850,15 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement // In case of ERM34465 no files are added to title-attachments $this->addTitleAttachment( $wikiTitle, $attachmentTargetFilename ); $this->addFile( $attachmentTargetFilename, $attachmentReference ); - $this->customBuckets->addData( 'title-files', $wikiTitle, $attachmentTargetFilename, false, true ); + $this->customBuckets->addData( + 'analyze-title-to-attachment-title', + $wikiTitle, $attachmentTargetFilename, false, true + ); $this->addedAttachmentIds[] = $attachmentId; $confluenceFileKey = str_replace( ' ', '_', "{$spaceId}---{$confluenceTitle}---{$attachmentOrigFilename}" ); - $this->customBuckets->addData( - 'filenames-to-filetitles-map', + $this->buckets->addData( + 'global-filenames-to-filetitles-map', $confluenceFileKey, $attachmentTargetFilename, false, @@ -799,13 +866,13 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement ); $this->customBuckets->addData( - 'attachment-id-to-target-filename-map', + 'analyze-attachment-id-to-target-filename-map', $attachmentId, $attachmentTargetFilename ); - $this->customBuckets->addData( - 'attachment-orig-filename-target-filename-map', + $this->buckets->addData( + 'global-attachment-orig-filename-target-filename-map', $attachmentOrigFilename, $attachmentTargetFilename ); @@ -817,12 +884,14 @@ private function getAttachmentsFromCollection( XMLHelper $xmlHelper, DOMElement * @return void */ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { - $spaceIdPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( 'attachment-id-to-orig-filename-map' ); - $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'attachment-id-to-reference-map' ); - $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'attachment-id-to-space-id-map' ); - $pageIdToTitleMap = $this->customBuckets->getBucketData( 'page-id-to-title-map' ); - $pageIdToConfluenceKey = $this->customBuckets->getBucketData( 'page-id-to-confluence-key-map' ); + $spaceIdPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + $attachmentIdToOrigFilenameMap = $this->customBuckets->getBucketData( + 'analyze-attachment-id-to-orig-filename-map' + ); + $attachmentIdToReferenceMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-reference-map' ); + $attachmentIdToSpaceIdMap = $this->customBuckets->getBucketData( 'analyze-attachment-id-to-space-id-map' ); + $pageIdToTitleMap = $this->buckets->getBucketData( 'global-page-id-to-title-map' ); + $pageIdToConfluenceKey = $this->customBuckets->getBucketData( 'analyze-page-id-to-confluence-key-map' ); $xmlHelper = new XMLHelper( $dom ); @@ -873,6 +942,13 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $confluenceKey, $attachmentId, $attachmentSpaceId, $attachmentOrigFilename, $targetTitle, $spaceIdPrefixMap ); + if ( $attachmentTargetFilename === '' ) { + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-attachment-id-to-title', + $attachmentId, $attachmentTargetFilename + ); + return; + } if ( !isset( $attachmentIdToReferenceMap[$attachmentId] ) ) { $this->output->writeln( @@ -888,8 +964,8 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $this->addTitleAttachment( $targetTitle, $attachmentTargetFilename ); $this->output->writeln( "Add attachment $attachmentTargetFilename (fallback: {$confluenceKey})" ); } else { - $this->customBuckets->addData( - 'additional-files', $attachmentTargetFilename, $attachmentReference, false, true ); + $this->buckets->addData( + 'global-additional-files', $attachmentTargetFilename, $attachmentReference, false, true ); $this->output->writeln( "Add attachment $attachmentTargetFilename (additional)" ); } @@ -897,8 +973,8 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { $this->addedAttachmentIds[] = $attachmentId; $confluenceFileKey = str_replace( ' ', '', "{$confluenceKey}---{$attachmentOrigFilename}" ); - $this->customBuckets->addData( - 'filenames-to-filetitles-map', + $this->buckets->addData( + 'global-filenames-to-filetitles-map', $confluenceFileKey, $attachmentTargetFilename, false, @@ -906,13 +982,13 @@ private function buildTitleAttachmentsFallbackMaps( DOMDocument $dom ): void { ); $this->customBuckets->addData( - 'attachment-id-to-target-filename-map', + 'analyze-attachment-id-to-target-filename-map', $attachmentId, $attachmentTargetFilename ); - $this->customBuckets->addData( - 'attachment-orig-filename-target-filename-map', + $this->buckets->addData( + 'global-attachment-orig-filename-target-filename-map', $attachmentOrigFilename, $attachmentTargetFilename ); @@ -1017,9 +1093,12 @@ private function makeAttachmentTargetFilenameFromData( $targetName = $filenameBuilder->buildFromAttachmentData( $attachmentSpaceId, $attachmentOrigFilename, $shortTargetTitle ); } catch ( InvalidTitleException $ex ) { - $this->buckets->addData( 'title-invalids', $attachmentId, $ex->getInvalidTitle() ); + $this->customBuckets->addData( + 'debug-analyze-invalid-titles-attachment-id-to-title', + $attachmentId, $ex->getInvalidTitle() + ); $this->logger->error( $ex->getMessage() ); - return '###INVALID###'; + $targetName = $ex->getInvalidTitle(); } } @@ -1039,7 +1118,7 @@ private function makeAttachmentTargetFilenameFromData( $fileKey = "{$pageConfluenceTitle}---$attachmentOrigFilename"; // Some normalization $fileKey = str_replace( ' ', '_', $fileKey ); - $this->customBuckets->addData( 'filenames-to-filetitles-map', $fileKey, $targetName, false, true ); + $this->buckets->addData( 'global-filenames-to-filetitles-map', $fileKey, $targetName, false, true ); return $targetName; } @@ -1078,24 +1157,25 @@ private function makeAttachmentReference( XMLHelper $xmlHelper, DOMElement $atta } private function checkTitles(): void { - $spacePrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' ); - $pagesTitlesMap = $this->customBuckets->getBucketData( 'pages-titles-map' ); + $pagesTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); + + $validityChecker = new TitleValidityChecker(); $hasInvalidTitles = false; $hasInvalidNamespaces = false; foreach ( $pagesTitlesMap as $key => $title ) { - if ( str_ends_with( 'title', '_' ) ) { + if ( !$validityChecker->hasValidEnding( $title ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'invalid_ending', $title, true, true ); $hasInvalidTitles = true; } if ( str_contains( $title, ':' ) ) { - if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { + if ( $validityChecker->hasDoubleCollon( $title ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'multiple_collons', $title, true, true ); @@ -1104,29 +1184,27 @@ private function checkTitles(): void { $namespace = substr( $title, 0, strpos( $title, ':' ) ); $text = substr( $title, strpos( $title, ':' ) + 1 ); - $matches = []; - preg_match( '#(\d*)([a-zA-Z0-9_]*)#', $namespace, $matches ); - if ( empty( $matches ) || $matches[1] !== '' ) { + if ( !$validityChecker->hasValidNamespace( $namespace ) ) { $this->customBuckets->addData( - 'invalid-namespaces', + 'warning-analyze-invalid-namespaces', 'invalid_char', $namespace, true, true ); $hasInvalidNamespaces = true; } - if ( mb_strlen( urlencode( $text ) ) > 255 ) { + if ( !$validityChecker->hasValidLength( $text ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'length', $title, true, true ); $hasInvalidTitles = true; } } else { - if ( mb_strlen( urlencode( $title ) ) > 255 ) { + if ( !$validityChecker->hasValidLength( $title ) ) { $this->customBuckets->addData( - 'invalid-titles', + 'warning-analyze-invalid-titles', 'length', $title, true, true ); @@ -1135,6 +1213,19 @@ private function checkTitles(): void { } } + $files = $this->buckets->getBucketData( 'global-files' ); + $hasInvalidFilenames = false; + foreach ( $files as $title => $paths ) { + if ( $validityChecker->hasValidLength( $title ) ) { + $this->customBuckets->addData( + 'warning-analyze-invalid-filenames', + 'length', $title, + true, true + ); + $hasInvalidFilenames = true; + } + } + if ( $hasInvalidNamespaces === true || $hasInvalidTitles === true ) { $this->output->writeln( "\n\nWarning:\n" ); @@ -1146,9 +1237,75 @@ private function checkTitles(): void { $this->output->writeln( ' - Analyze process found invalid titles' ); } + if ( $hasInvalidFilenames === true ) { + $this->output->writeln( ' - Analyze process found invalid filenames' ); + } + + $this->output->writeln( + "\nPlease check" + ); + $this->output->writeln( + "\n - \"warning-analyze-invalid-namespaces.php\"" + ); + $this->output->writeln( + "\n - \"warning-analyze-invalid-titles.php\"" + ); $this->output->writeln( - "\nPlease check invalid-namespaces.php and/or invalid-titles.php before continuing with extract step" + "\n - \"warning-analyze-invalid-filenames.php\"" + ); + $this->output->writeln( + "\nbefore continuing with extract step" ); } } + + /** + * + * @param string $titleText + * @param string $contentReference + * @return void + */ + private function addAnalyzerTitleRevision( $titleText, $contentReference = 'n/a' ) { + $this->buckets->addData( 'analzye-title-revisions', $titleText, $contentReference ); + } + + /** + * + * @param string $titleText + * @param string $contentReference + * @return void + */ + protected function addTitleRevision( $titleText, $contentReference = 'n/a' ) { + $this->buckets->addData( 'global-title-revisions', $titleText, $contentReference ); + } + + /** + * + * @param string $titleText + * @param string $attachmentReference + * @return void + */ + protected function addTitleAttachment( $titleText, $attachmentReference = 'n/a' ) { + $this->buckets->addData( 'global-title-attachments', $titleText, $attachmentReference ); + } + + /** + * + * @param string $rawFilename + * @param string $attachmentReference + * @return void + */ + protected function addFile( $rawFilename, $attachmentReference = 'n/a' ) { + try { + $filename = $this->getFilename( $rawFilename, $attachmentReference ); + $filename = ( new WindowsFilename( $filename ) ) . ''; + } catch ( InvalidTitleException $ex ) { + $this->logger->error( $ex->getMessage() ); + return; + } + + $prefixedFilename = $this->maybePrefixFilename( $filename ); + + $this->buckets->addData( 'global-files', $prefixedFilename, $attachmentReference ); + } } diff --git a/src/Command/Analyze.php b/src/Command/Analyze.php index b4e4de7..70f6a77 100644 --- a/src/Command/Analyze.php +++ b/src/Command/Analyze.php @@ -65,4 +65,30 @@ private function readConfigFile( &$config ): void { } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + 'global-files', + 'global-title-attachments', + 'global-title-revisions', + 'global-space-id-to-prefix-map', + 'global-space-key-to-prefix-map', + 'global-space-id-homepages', + 'global-space-id-to-description-id-map', + 'global-space-description-id-to-body-id-map', + 'global-space-details', + 'global-userkey-to-username-map', + 'global-pages-titles-map', + 'global-page-id-to-title-map', + 'global-page-id-to-space-id', + 'global-body-contents-to-pages-map', + 'global-additional-files', + 'global-attachment-orig-filename-target-filename-map', + 'global-filenames-to-filetitles-map', + ]; + } } diff --git a/src/Command/Compose.php b/src/Command/Compose.php index b2a3a9c..b9b9640 100644 --- a/src/Command/Compose.php +++ b/src/Command/Compose.php @@ -66,4 +66,21 @@ private function readConfigFile( &$config ): void { } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + 'global-space-id-homepages', + 'global-space-id-to-description-id-map', + 'global-space-description-id-to-body-id-map', + 'global-body-contents-to-pages-map', + 'global-title-attachments', + 'global-title-revisions', + 'global-files', + 'global-additional-files' + ]; + } } diff --git a/src/Command/Extract.php b/src/Command/Extract.php index 59369be..b94ffb7 100644 --- a/src/Command/Extract.php +++ b/src/Command/Extract.php @@ -69,4 +69,17 @@ private function readConfigFile( &$config ): void { } } } + + /** + * + * @inheritDoc + */ + protected function getBucketKeys() { + return [ + // From this step + 'global-title-metadata', + 'global-revision-contents', + 'global-body-contents-to-pages-map', + ]; + } } diff --git a/src/Composer/ConfluenceComposer.php b/src/Composer/ConfluenceComposer.php index b5b8d09..233aea5 100644 --- a/src/Composer/ConfluenceComposer.php +++ b/src/Composer/ConfluenceComposer.php @@ -14,11 +14,6 @@ class ConfluenceComposer extends ComposerBase implements IOutputAwareInterface { - /** - * @var DataBuckets - */ - private $dataBuckets; - /** * @var DataBuckets */ @@ -40,23 +35,12 @@ class ConfluenceComposer extends ComposerBase implements IOutputAwareInterface { public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); - $this->dataBuckets = new DataBuckets( [ - 'space-id-homepages', - 'space-id-to-description-id-map', - 'space-description-id-to-body-id-map', - 'body-contents-to-pages-map', - 'title-attachments', - 'title-revisions', - 'files', - 'additional-files' - ] ); - $this->customBuckets = new DataBuckets( [ 'title-uploads', 'title-uploads-fail' ] ); - $this->dataBuckets->loadFromWorkspace( $this->workspace ); + $this->customBuckets->loadFromWorkspace( $this->workspace ); if ( isset( $config['config'] ) ) { $this->advancedConfig = $config['config']; @@ -78,71 +62,108 @@ public function buildXML( Builder $builder ) { $this->appendDefaultPages( $builder ); $this->addDefaultFiles(); - $bodyContentsToPagesMap = $this->dataBuckets->getBucketData( 'body-contents-to-pages-map' ); - $spaceIDHomepagesMap = $this->dataBuckets->getBucketData( 'space-id-homepages' ); + $bodyContentsToPagesMap = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); + $spaceIDHomepagesMap = $this->buckets->getBucketData( 'global-space-id-homepages' ); $homepageSpaceIDMap = array_flip( $spaceIDHomepagesMap ); - $spaceIDDescriptionIDMap = $this->dataBuckets->getBucketData( 'space-id-to-description-id-map' ); - $spaceDescriptionIDBodyIDMap = $this->dataBuckets->getBucketData( 'space-description-id-to-body-id-map' ); + $spaceIDDescriptionIDMap = $this->buckets->getBucketData( 'global-space-id-to-description-id-map' ); + $spaceDescriptionIDBodyIDMap = $this->buckets->getBucketData( 'global-space-description-id-to-body-id-map' ); - $pagesRevisions = $this->dataBuckets->getBucketData( 'title-revisions' ); - $filesMap = $this->dataBuckets->getBucketData( 'files' ); - $pageAttachmentsMap = $this->dataBuckets->getBucketData( 'title-attachments' ); + $pagesRevisions = $this->buckets->getBucketData( 'global-title-revisions' ); + $filesMap = $this->buckets->getBucketData( 'global-files' ); + $pageAttachmentsMap = $this->buckets->getBucketData( 'global-title-attachments' ); $bodyContentIDMainpageID = []; $pagesToBodyContents = array_flip( $bodyContentsToPagesMap ); foreach ( $spaceIDHomepagesMap as $spaceID => $homepageID ) { + if ( !isset( $pagesToBodyContents[$homepageID] ) ) { + continue; + } $bodyContentsID = $pagesToBodyContents[$homepageID]; $bodyContentIDMainpageID[$bodyContentsID] = $homepageID; } - foreach ( $pagesRevisions as $pageTitle => $pageRevision ) { + foreach ( $pagesRevisions as $pageTitle => $pageRevisions ) { $this->output->writeln( "\nProcessing: $pageTitle\n" ); - $pageRevisionData = explode( '@', $pageRevision[0] ); + // Sometimes not all namespaces should be used for the import. To skip this namespaces + // use this option + $namespace = $this->getNamespace( $pageTitle ); + if ( + isset( $this->advancedConfig['composer-include-namespace'] ) + && !in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) + ) { + $this->output->writeln( "Namespace {$namespace} skipped by configuration" ); + continue; + } + + // Sometimes titles have contents >256kB which might break the import. To skip this titles + // use this option + if ( + isset( $this->advancedConfig['composer-skip-titles'] ) + && in_array( $pageTitle, $this->advancedConfig['composer-skip-titles'] ) + ) { + $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); + continue; + } + + $sortedRevisions = []; + foreach( $pageRevisions as $pageRevision ) { + $pageRevisionData = explode( '@', $pageRevision ); + $bodyContentIds = $pageRevisionData[0]; - $timestamp = explode( '-', $pageRevisionData[1] )[1]; + $versionTimestamp = explode( '-', $pageRevisionData[1] ); + $version = $versionTimestamp[0]; + $timestamp = $versionTimestamp[1]; - $bodyContentIds = $pageRevisionData[0]; - $bodyContentIdsArr = explode( '/', $bodyContentIds ); + $sortedRevisions[$timestamp] = $bodyContentIds; + } - $pageContent = ""; - foreach ( $bodyContentIdsArr as $bodyContentId ) { - if ( $bodyContentId === '' ) { - // Skip if no reference to a body content is not set - continue; - } + ksort( $sortedRevisions ); + if ( !isset( $this->advancedConfig['include-history'] ) + || $this->advancedConfig['include-history'] !== true + ) { + $bodyContentIds = end( $sortedRevisions ); + $timestamp = array_search( $bodyContentIds, $sortedRevisions ); + $sortedRevisions = []; // Reset sortedRevisions + $sortedRevisions[$timestamp] = $bodyContentIds; + } + + foreach( $sortedRevisions as $timestamp => $bodyContentIds ) { + $bodyContentIdsArr = explode( '/', $bodyContentIds ); + + $pageContent = ""; + foreach ( $bodyContentIdsArr as $bodyContentId ) { + if ( $bodyContentId === '' ) { + // Skip if no reference to a body content is not set + continue; + } + + $this->output->writeln( "Getting '$bodyContentId' body content..." ); + + $pageContent .= $this->workspace->getConvertedContent( $bodyContentId ) . "\n"; - $this->output->writeln( "Getting '$bodyContentId' body content..." ); - - $pageContent .= $this->workspace->getConvertedContent( $bodyContentId ) . "\n"; - - // Add space description to homepage - if ( isset( $bodyContentIDMainpageID[$bodyContentId] ) ) { - // get homepage id if it is a homepage - $mainpageID = $bodyContentIDMainpageID[$bodyContentId]; - if ( isset( $homepageSpaceIDMap[$mainpageID] ) ) { - // get space id - $spaceID = $homepageSpaceIDMap[$mainpageID]; - if ( isset( $spaceIDDescriptionIDMap[$spaceID] ) ) { - // get description id - $descID = $spaceIDDescriptionIDMap[$spaceID]; - if ( isset( $spaceDescriptionIDBodyIDMap[$descID] ) ) { + // Add space description to homepage + if ( isset( $bodyContentIDMainpageID[$bodyContentId] ) ) { + // get homepage id if it is a homepage + $mainpageID = $bodyContentIDMainpageID[$bodyContentId]; + if ( isset( $homepageSpaceIDMap[$mainpageID] ) ) { + // get space id + $spaceID = $homepageSpaceIDMap[$mainpageID]; + if ( isset( $spaceIDDescriptionIDMap[$spaceID] ) ) { // get description id - $descBodyID = $spaceDescriptionIDBodyIDMap[$descID]; - $description = $this->workspace->getConvertedContent( $descBodyID ); - $pageContent .= "[[Space description::$description]]\n"; + $descID = $spaceIDDescriptionIDMap[$spaceID]; + if ( isset( $spaceDescriptionIDBodyIDMap[$descID] ) ) { + // get description id + $descBodyID = $spaceDescriptionIDBodyIDMap[$descID]; + $description = $this->workspace->getConvertedContent( $descBodyID ); + $pageContent .= "[[Space description::$description]]\n"; + } } } } } - } - $namespace = $this->getNamespace( $pageTitle ); - if ( - isset( $this->advancedConfig['composer-include-namespace'] ) - && in_array( $namespace, $this->advancedConfig['composer-include-namespace'] ) - ) { $builder->addRevision( $pageTitle, $pageContent, $timestamp ); // Append attachments @@ -172,8 +193,6 @@ public function buildXML( Builder $builder ) { } } } - } else { - $this->output->writeln( "Page {$pageTitle} skipped by configuration" ); } } diff --git a/src/Converter/ConfluenceConverter.php b/src/Converter/ConfluenceConverter.php index a7404a4..e25c514 100644 --- a/src/Converter/ConfluenceConverter.php +++ b/src/Converter/ConfluenceConverter.php @@ -8,6 +8,7 @@ use DOMXPath; use HalloWelt\MediaWiki\Lib\Migration\Converter\PandocHTML; use HalloWelt\MediaWiki\Lib\Migration\DataBuckets; +use HalloWelt\MediaWiki\Lib\Migration\ExecutionTime; use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface; use HalloWelt\MediaWiki\Lib\Migration\Workspace; use HalloWelt\MigrateConfluence\Converter\Postprocessor\FixImagesWithExternalUrl; @@ -67,7 +68,10 @@ class ConfluenceConverter extends PandocHTML implements IOutputAwareInterface { protected $bodyContentFile = null; /** @var DataBuckets */ - private $dataBuckets = null; + private $executionTimeBuckets = null; + + /** @var DataBuckets */ + private $buckets = null; /** @var DataBuckets */ private $customBuckets = null; @@ -112,29 +116,30 @@ class ConfluenceConverter extends PandocHTML implements IOutputAwareInterface { public function __construct( $config, Workspace $workspace ) { parent::__construct( $config, $workspace ); - $this->dataBuckets = new DataBuckets( [ - 'page-id-to-title-map', - 'pages-titles-map', - 'title-attachments', - 'body-contents-to-pages-map', - 'page-id-to-space-id', - 'space-id-to-prefix-map', - 'space-key-to-prefix-map', - 'filenames-to-filetitles-map', - 'title-metadata', - 'attachment-orig-filename-target-filename-map', - 'files', - 'userkey-to-username-map', - 'space-description-id-to-body-id-map', - 'gliffy-map', - 'attachment-confluence-file-key-to-target-filename-map' + $this->buckets = new DataBuckets( [ + 'global-page-id-to-title-map', + 'global-pages-titles-map', + 'global-title-attachments', + 'global-body-contents-to-pages-map', + 'global-page-id-to-space-id', + 'global-space-id-to-prefix-map', + 'global-space-key-to-prefix-map', + 'global-filenames-to-filetitles-map', + 'global-title-metadata', + 'global-attachment-orig-filename-target-filename-map', + 'global-files', + 'global-userkey-to-username-map', + 'global-space-description-id-to-body-id-map', + 'global-gliffy-map', ] ); - $this->dataBuckets->loadFromWorkspace( $this->workspace ); + $this->buckets->loadFromWorkspace( $this->workspace ); $this->customBuckets = new DataBuckets( [ - 'title-uploads', - 'title-uploads-fail' + 'warning-convert-body-content-id-content-size', + ] ); + $this->executionTimeBuckets = new DataBuckets( [ + 'convert-body-content-id-execution-time', ] ); } @@ -149,9 +154,14 @@ public function setOutput( Output $output ) { * @inheritDoc */ protected function doConvert( SplFileInfo $file ): string { + $executionTime = new ExecutionTime(); + + $this->customBuckets->loadFromWorkspace( $this->workspace ); + $this->executionTimeBuckets->loadFromWorkspace( $this->workspace ); + $this->output->writeln( $file->getPathname() ); - $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->dataBuckets ); - $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->dataBuckets ); + $this->dataLookup = ConversionDataLookup::newFromBuckets( $this->buckets ); + $this->conversionDataWriter = ConversionDataWriter::newFromBuckets( $this->buckets ); $this->rawFile = $file; if ( isset( $this->config['config']['ext-ns-file-repo-compat'] ) @@ -174,7 +184,7 @@ protected function doConvert( SplFileInfo $file ): string { } $this->currentSpace = $this->getSpaceIdFromPageId( $pageId ); - $pagesIdsToTitlesMap = $this->dataBuckets->getBucketData( 'page-id-to-title-map' ); + $pagesIdsToTitlesMap = $this->buckets->getBucketData( 'global-page-id-to-title-map' ); if ( isset( $pagesIdsToTitlesMap[$pageId] ) ) { $this->currentPageTitle = $pagesIdsToTitlesMap[$pageId]; } else { @@ -218,6 +228,35 @@ protected function doConvert( SplFileInfo $file ): string { $this->postProcessLinks(); $this->postprocessWikiText(); + // Content size sometimes breakes import + $exceed = ''; + $wikiTextLength = strlen( $this->wikiText ); + $wikiTextLength = $wikiTextLength / 1000; + if ( $wikiTextLength > 512 ) { + $exceed = '512'; + } elseif ( $wikiTextLength > 256 ) { + $exceed = '256'; + } elseif ( $wikiTextLength > 100 ) { + $exceed = '100'; + } + if ( $exceed !== '' ) { + $this->buckets->addData( + 'warning-convert-body-content-id-content-size', + $exceed, + $bodyContentId + ); + $this->output->writeln( "bodyContentId {$this->currentSpace} contains large content" ); + } + + $executionTimeString = $executionTime->getHumanReadableTime(); + $this->executionTimeBuckets->addData( + 'convert-body-content-id-execution-time', + $bodyContentId, + $executionTimeString, + false, + true + ); + $this->executionTimeBuckets->saveToWorkspace( $this->workspace ); $this->customBuckets->saveToWorkspace( $this->workspace ); return $this->wikiText; @@ -274,7 +313,7 @@ private function runProcessors( $dom ) { ), new StructuredMacroGliffy( $this->dataLookup, $this->conversionDataWriter, $this->currentSpace, - $currentPageTitle, $this->customBuckets, $this->nsFileRepoCompat + $currentPageTitle, $this->buckets, $this->nsFileRepoCompat ), new StructuredMacroContenByLabel( $this->currentPageTitle ), new StructuredMacroAttachments(), @@ -335,7 +374,7 @@ private function getBodyContentIdFromFilename() { * @return int */ private function getPageIdFromBodyContentId( $bodyContentId ) { - $map = $this->dataBuckets->getBucketData( 'body-contents-to-pages-map' ); + $map = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); return $map[$bodyContentId] ?? -1; } @@ -345,7 +384,7 @@ private function getPageIdFromBodyContentId( $bodyContentId ) { * @return int */ private function getSpaceDescriptionIDFromBodyContentId( $bodyContentId ) { - $map = $this->dataBuckets->getBucketData( 'space-description-id-to-body-id-map' ); + $map = $this->buckets->getBucketData( 'global-space-description-id-to-body-id-map' ); $map = array_flip( $map ); return $map[$bodyContentId] ?? -1; } @@ -356,7 +395,7 @@ private function getSpaceDescriptionIDFromBodyContentId( $bodyContentId ) { * @return int */ private function getSpaceIdFromPageId( $pageId ) { - $map = $this->dataBuckets->getBucketData( 'page-id-to-space-id' ); + $map = $this->buckets->getBucketData( 'global-page-id-to-space-id' ); return $map[$pageId] ?? -1; } @@ -415,7 +454,7 @@ private function processMacro( $sender, $match, $dom, $xpath ) { 'panel', 'recently-updated', 'section', - 'space-details', + 'global-space-details', 'status', 'task', 'task-list', @@ -532,7 +571,7 @@ protected function preprocessHTMLSource( $oHTMLSourceFile ) { $sContent = str_replace( 'dataBuckets->getBucketData( 'title-metadata' ); + $categorieMap = $this->buckets->getBucketData( 'global-title-metadata' ); $categories = ''; if ( isset( $categorieMap[$pageId] ) && isset( $categorieMap[$pageId]['categories'] ) ) { foreach ( $categorieMap[$pageId]['categories'] as $key => $category ) { @@ -667,7 +706,7 @@ public function postProcessDOM( $dom, $xpath ) { * @return void */ public function postProcessLinks() { - $oldToNewTitlesMap = $this->dataBuckets->getBucketData( 'pages-titles-map' ); + $oldToNewTitlesMap = $this->buckets->getBucketData( 'global-pages-titles-map' ); $this->wikiText = preg_replace_callback( "/\[\[Media:(.*)]]/", @@ -723,7 +762,7 @@ static function ( $aMatches ) { private function addAdditionalAttachments(): string { $wikiText = ''; - $attachmentsMap = $this->dataBuckets->getBucketData( 'title-attachments' ); + $attachmentsMap = $this->buckets->getBucketData( 'global-title-attachments' ); $currentPageTitle = $this->getCurrentPageTitle(); @@ -782,7 +821,11 @@ private function buildMediaExcludeList( $wikiText ): array { * @return string */ private function getCurrentPageTitle(): string { - $spaceIdPrefixMap = $this->dataBuckets->getBucketData( 'space-id-to-prefix-map' ); + $prefix = ''; + $spaceIdPrefixMap = $this->buckets->getBucketData( 'global-space-id-to-prefix-map' ); + if ( !isset( $spaceIdPrefixMap[$this->currentSpace] ) ) { + $this->output->writeln( "SpaceId {$this->currentSpace} not found in spaceIdPrefixMap" ); + } $prefix = $spaceIdPrefixMap[$this->currentSpace]; $currentPageTitle = $this->currentPageTitle; diff --git a/src/Converter/Processor/StructuredMacroGliffy.php b/src/Converter/Processor/StructuredMacroGliffy.php index 5ef1559..86bf832 100644 --- a/src/Converter/Processor/StructuredMacroGliffy.php +++ b/src/Converter/Processor/StructuredMacroGliffy.php @@ -114,7 +114,7 @@ private function makeParamsString( array $params ): string { $params['name'] = $filename; } - $this->dataBuckets->addData( 'gliffy-map', $key, $filename, true, true ); + $this->dataBuckets->addData( 'global-gliffy-map', $key, $filename, true, true ); } else { return ''; } diff --git a/src/Extractor/ConfluenceExtractor.php b/src/Extractor/ConfluenceExtractor.php index f6b437c..38d9971 100644 --- a/src/Extractor/ConfluenceExtractor.php +++ b/src/Extractor/ConfluenceExtractor.php @@ -30,9 +30,10 @@ class ConfluenceExtractor extends ExtractorBase { */ public function __construct( $config, Workspace $workspace, DataBuckets $buckets ) { parent::__construct( $config, $workspace, $buckets ); + $this->customBuckets = new DataBuckets( [ - 'labelling-id-to-label-id-map', - 'label-id-to-name-map', + 'extract-labelling-id-to-label-id-map', + 'extract-label-id-to-name-map', ] ); } @@ -41,6 +42,7 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets * @return bool */ protected function doExtract( SplFileInfo $file ): bool { + $this->buckets->loadFromWorkspace( $this->workspace ); $this->customBuckets->loadFromWorkspace( $this->workspace ); if ( isset( $this->config['config']['categories'] ) ) { @@ -109,11 +111,16 @@ protected function doExtract( SplFileInfo $file ): bool { * @return void */ private function extractBodyContents( DOMDocument $dom ): void { + $bodyContentsToPagesMap = $this->buckets->getBucketData( 'global-body-contents-to-pages-map' ); + $xmlHelper = new XMLHelper( $dom ); $bodyContents = $xmlHelper->getObjectNodes( 'BodyContent' ); foreach ( $bodyContents as $bodyContent ) { $id = $xmlHelper->getIDNodeValue( $bodyContent ); + if ( !isset( $bodyContentsToPagesMap[ $id ] ) ) { + continue; + } $bodyContentHTML = $this->getBodyContentHTML( $xmlHelper, $bodyContent ); $targetFileName = $this->workspace->saveRawContent( $id, $bodyContentHTML ); $this->addRevisionContent( $id, $targetFileName ); @@ -140,12 +147,12 @@ private function buildLabellingMap( DOMDocument $dom ): void { $labelProp = $xmlHelper->getPropertyNode( 'label', $labelling ); $labelId = $xmlHelper->getIDNodeValue( $labelProp ); - $labelMap = $this->customBuckets->getBucketData( 'label-id-to-name-map' ); + $labelMap = $this->customBuckets->getBucketData( 'extract-label-id-to-name-map' ); if ( isset( $labelMap[$labelId] ) ) { $categories[] = $labelMap[$labelId]; } - $this->customBuckets->addData( 'labelling-id-to-label-id-map', $id, $labelId, false, true ); + $this->customBuckets->addData( 'extract-labelling-id-to-label-id-map', $id, $labelId, false, true ); } /** @@ -173,7 +180,7 @@ private function buildLabelMap( DOMDocument $dom ): void { $id = $xmlHelper->getIDNodeValue( $label ); $name = $xmlHelper->getPropertyValue( 'name', $label ); - $this->customBuckets->addData( 'label-id-to-name-map', $id, $name, false, true ); + $this->customBuckets->addData( 'extract-label-id-to-name-map', $id, $name, false, true ); } /** @@ -193,8 +200,8 @@ private function getBodyContentHTML( XMLHelper $xmlHelper, DOMElement $bodyConte * @return void */ private function extractPageMetaData( DOMDocument $dom ) { - $labellingMap = $this->customBuckets->getBucketData( 'labelling-id-to-label-id-map' ); - $labelMap = $this->customBuckets->getBucketData( 'label-id-to-name-map' ); + $labellingMap = $this->customBuckets->getBucketData( 'extract-labelling-id-to-label-id-map' ); + $labelMap = $this->customBuckets->getBucketData( 'extract-label-id-to-name-map' ); $xmlHelper = new XMLHelper( $dom ); @@ -229,7 +236,25 @@ private function extractPageMetaData( DOMDocument $dom ) { 'categories' => $categories ]; - $this->buckets->addData( 'title-metadata', $id, $meta, false ); + $this->addTitleMetaData( $id, $meta ); } } + + /** + * + * @param string $revisionReference + * @param string $contentReference + */ + protected function addRevisionContent( $revisionReference, $contentReference = 'n/a' ) { + $this->buckets->addData( 'global-revision-contents', $revisionReference, $contentReference ); + } + + /** + * + * @param string $titleText + * @param string $meta + */ + protected function addTitleMetaData( $titleText, $meta = [] ) { + $this->buckets->addData( 'global-title-metadata', $titleText, $meta, false ); + } } diff --git a/src/Utility/ConversionDataLookup.php b/src/Utility/ConversionDataLookup.php index 3bc887e..54595ec 100644 --- a/src/Utility/ConversionDataLookup.php +++ b/src/Utility/ConversionDataLookup.php @@ -59,13 +59,13 @@ class ConversionDataLookup { */ public static function newFromBuckets( DataBuckets $buckets ) { return new static( - $buckets->getBucketData( 'space-id-to-prefix-map' ), - $buckets->getBucketData( 'pages-titles-map' ), - $buckets->getBucketData( 'filenames-to-filetitles-map' ), - $buckets->getBucketData( 'attachment-orig-filename-target-filename-map' ), - $buckets->getBucketData( 'files' ), - $buckets->getBucketData( 'userkey-to-username-map' ), - $buckets->getBucketData( 'space-key-to-prefix-map' ), + $buckets->getBucketData( 'global-space-id-to-prefix-map' ), + $buckets->getBucketData( 'global-pages-titles-map' ), + $buckets->getBucketData( 'global-filenames-to-filetitles-map' ), + $buckets->getBucketData( 'global-attachment-orig-filename-target-filename-map' ), + $buckets->getBucketData( 'global-files' ), + $buckets->getBucketData( 'global-userkey-to-username-map' ), + $buckets->getBucketData( 'global-space-key-to-prefix-map' ), ); } diff --git a/src/Utility/ConversionDataWriter.php b/src/Utility/ConversionDataWriter.php index e80c1fe..2e23a8c 100644 --- a/src/Utility/ConversionDataWriter.php +++ b/src/Utility/ConversionDataWriter.php @@ -18,7 +18,7 @@ class ConversionDataWriter { */ public static function newFromBuckets( DataBuckets $buckets ) { return new static( - $buckets->getBucketData( 'files' ) + $buckets->getBucketData( 'global-files' ) ); } diff --git a/src/Utility/TitleValidityChecker.php b/src/Utility/TitleValidityChecker.php new file mode 100644 index 0000000..0ebe627 --- /dev/null +++ b/src/Utility/TitleValidityChecker.php @@ -0,0 +1,86 @@ +hasValidEnding( $title ) ) { + return false; + } + + if ( str_contains( $title, ':' ) ) { + if ( $this->hasDoubleCollon( $title ) ) { + return false; + } + + $namespace = substr( $title, 0, strpos( $title, ':' ) ); + $text = substr( $title, strpos( $title, ':' ) + 1 ); + + if ( !$this->hasValidNamespace( $namespace ) ) { + return false; + } + + if ( !$this->hasValidLength( $text ) ) { + return false; + } + } else { + if ( !$this->hasValidLength( $title ) ) { + return false; + } + } + + return true; + } + + /** + * @param string $title + * @return bool + */ + public function hasValidEnding( string $title ): bool { + if ( str_ends_with( $title, '_' ) ) { + return false; + } + return true; + } + + /** + * @param string $title + * @return bool + */ + public function hasDoubleCollon( string $title ): bool { + if ( strpos( $title, ':' ) !== strrpos( $title, ':' ) ) { + return true; + } + return false; + } + + /** + * @param string $namespace + * @return bool + */ + public function hasValidNamespace( string $namespace ): bool { + $matches = []; + preg_match( '#(\d*)([a-zA-Z0-9_]*)#', $namespace, $matches ); + if ( empty( $matches ) || $matches[1] !== '' ) { + return false; + } + return true; + } + + /** + * @param string $title + * @return bool + */ + public function hasValidLength( string $title ): bool { + if ( strlen( $title ) > 255 ) { + return false; + } + return true; + } + +}