11import AdmZip from 'adm-zip' ;
22import { deserialize } from 'bson' ;
3- import { ObjectId } from 'mongodb' ;
4- import { insert } from '../connector' ;
3+ import isEqual from 'fast-deep-equal' ;
4+ import { AnyBulkWriteOperation , Document , FindCursor , ObjectId } from 'mongodb' ;
5+ import { bulkWrite , db , insert } from '../connector' ;
6+
/**
 * Reference to a static asset attached to a page.
 * NOTE(review): field meanings are inferred from their names only — confirm
 * against the producer of these documents.
 */
interface StaticAsset {
  /** Checksum string recorded for the asset. */
  checksum: string;
  /** Key string recorded for the asset. */
  key: string;
}
11+
/**
 * AST of a single page. The shape is not constrained here: any string key
 * with any value is accepted.
 */
interface PageAst {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- AST nodes are free-form
  [key: string]: any;
}
15+
/**
 * Shape of a document stored in the updated-pages collection.
 */
export interface UpdatedPage {
  /** Identifier of the page; used as the filter key for upserts. */
  page_id: string;
  /** Source filename of the page. */
  filename: string;
  /** Page AST as of the last detected change. */
  ast: PageAst;
  /** Static assets referenced by the page. */
  static_assets: StaticAsset[];

  /** Set once, when the document is first inserted. */
  created_at: Date;
  /** Refreshed whenever the page changes or is marked as deleted. */
  updated_at: Date;
  /** True when the page was not present in the latest build. */
  deleted: boolean;
}
526
// Collection that receives every page document of a build.
const COLLECTION_NAME = 'documents';
// Collection that keeps one copy per page_id, updated only when a page changes.
const UPDATED_AST_COLL_NAME = 'updated_documents';
729
// Service responsible for memoization of page-level documents.
// Any extraneous logic performed on page-level documents as part of upload should be added here.
@@ -15,12 +37,175 @@ const pagesFromZip = (zip: AdmZip) => {
1537 . map ( ( entry ) => deserialize ( entry . getData ( ) ) ) ;
1638} ;
1739
18- export const insertPages = async ( buildId : ObjectId , zip : AdmZip ) => {
/**
 * Escapes every character that is special inside a regular expression so that
 * `value` is matched literally when embedded in a pattern.
 */
const escapeRegExp = (value: string) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 *
 * Finds the page documents for a given Snooty project name + branch combination.
 * If this is the first build for the Snooty project name + branch, no documents
 * will be found.
 *
 * Only documents not flagged as deleted are queried, and only their `page_id`
 * and `ast` fields are projected.
 *
 * @param pageIdPrefix - Includes the Snooty project name, user (docsworker-xlarge), and branch
 * @param collection - The collection to perform the find query on
 * @returns A cursor over the matching documents
 * @throws Rethrows any error raised while building the query, after logging it
 */
const findPrevPageDocs = async (pageIdPrefix: string, collection: string) => {
  const dbSession = await db();

  // The prefix is matched literally (branch names such as "v1.0" contain regex
  // metacharacters) and must end on a path boundary, so that "proj/user/master"
  // does not also select pages belonging to "proj/user/master-old".
  const boundary = pageIdPrefix.endsWith('/') ? '' : '(?:/|$)';
  const findQuery = {
    page_id: { $regex: new RegExp(`^${escapeRegExp(pageIdPrefix)}${boundary}`) },
    deleted: false,
  };
  const projection = {
    _id: 0,
    page_id: 1,
    ast: 1,
  };

  try {
    // NOTE(review): find()/project() only build the cursor; query errors
    // presumably surface when the cursor is iterated by the caller — confirm.
    return dbSession.collection<UpdatedPage>(collection).find(findQuery).project(projection);
  } catch (error) {
    console.error(
      `Error trying to find previous page documents using prefix ${pageIdPrefix} in ${collection}: ${error}`
    );
    throw error;
  }
};
/**
 * Drains a cursor of previous page documents into two lookup structures.
 *
 * @param docsCursor - Cursor yielding documents that carry `page_id` and `ast`
 * @returns `mapping`, from page id to that page's AST, and `pageIds`, the set
 *   of every page id seen (used to track pages missing from the current build)
 */
const createPageAstMapping = async (docsCursor: FindCursor) => {
  // Null-prototype dictionary: page ids come from stored data, so keys such as
  // "__proto__" or "constructor" must behave like ordinary entries, and lookups
  // for unknown ids must yield undefined rather than an inherited member.
  const mapping: Record<string, object> = Object.create(null);
  // Set of all page ids. To be used for tracking unseen pages in the current build
  const pageIds = new Set<string>();

  for await (const doc of docsCursor) {
    mapping[doc.page_id] = doc.ast;
    pageIds.add(doc.page_id);
  }

  return { mapping, pageIds };
};
82+
/**
 * Computes the bulk write operations needed to bring the stored page copies in
 * line with the pages of the current build. All work happens in the
 * constructor; the result is read with `getOperations()`.
 */
class UpdatedPagesManager {
  currentPages: Document[];
  operations: AnyBulkWriteOperation[];
  prevPageDocsMapping: Record<string, object>;
  prevPageIds: Set<string>;

  /**
   * @param prevPageDocsMapping - page id to AST, taken from the previous build
   * @param prevPagesIds - ids of all previous pages; entries are removed from
   *   this set as the matching current pages are encountered
   * @param pages - page documents of the current build
   */
  constructor(prevPageDocsMapping: Record<string, object>, prevPagesIds: Set<string>, pages: Document[]) {
    this.currentPages = pages;
    this.operations = [];
    this.prevPageDocsMapping = prevPageDocsMapping;
    this.prevPageIds = prevPagesIds;

    // A single timestamp is shared by every operation produced for this build.
    const now = new Date();
    this.checkForPageDiffs(now);
    this.markUnseenPagesAsDeleted(now);
  }

  /**
   * Walks the current pages and queues an upsert for each one whose AST is not
   * deep-equal to the AST stored for the same page id. Every visited page id is
   * dropped from `prevPageIds`, marking the previous page as "seen".
   *
   * @param updateTime - the time to set updates to
   */
  checkForPageDiffs(updateTime: Date) {
    for (const page of this.currentPages) {
      // Only .txt files are pages; other (rst) files are ignored entirely
      const isPageFile = page.filename.endsWith('.txt');
      if (!isPageFile) {
        continue;
      }

      const pageId = page.page_id;
      this.prevPageIds.delete(pageId);

      // A brand-new page has no previous AST, so it never compares as equal
      // and is always written.
      const previousAst = this.prevPageDocsMapping[pageId];
      if (isEqual(page.ast, previousAst)) {
        continue;
      }

      const upsertPage = {
        updateOne: {
          filter: { page_id: pageId },
          update: {
            $set: {
              page_id: pageId,
              filename: page.filename,
              ast: page.ast,
              static_assets: page.static_assets,
              updated_at: updateTime,
              deleted: false,
            },
            $setOnInsert: {
              created_at: updateTime,
            },
          },
          upsert: true,
        },
      };
      this.operations.push(upsertPage);
    }
  }

  /**
   * Queues an update flagging as deleted every previous page whose id is still
   * present in `prevPageIds`, i.e. was not encountered in the current build.
   *
   * @param updateTime - the time to set updates to
   */
  markUnseenPagesAsDeleted(updateTime: Date) {
    for (const leftoverPageId of this.prevPageIds) {
      const flagDeleted = {
        updateOne: {
          filter: { page_id: leftoverPageId },
          update: {
            $set: {
              deleted: true,
              updated_at: updateTime,
            },
          },
        },
      };
      this.operations.push(flagDeleted);
    }
  }

  /** Returns the operations accumulated during construction. */
  getOperations() {
    return this.operations;
  }
}
172+
/**
 *
 * Upserts pages into a separate collection, keeping one copy per page_id.
 * Only documents sharing the Snooty project name + branch of the given pages
 * are looked up and touched. Does nothing when `pages` is empty or when no
 * operation results from the comparison.
 *
 * @param pages - page documents of the current build
 * @param collection - name of the collection holding the page copies
 */
const updatePages = async (pages: Document[], collection: string) => {
  if (!pages.length) {
    return;
  }

  // The first three "/"-separated segments of a page id identify the project
  // name + branch. Expects page IDs to include these properties after parse,
  // and the first page is taken as representative for the whole build.
  const prefixSegments = pages[0].page_id.split('/').slice(0, 3);
  const pageIdPrefix = prefixSegments.join('/');

  const prevDocsCursor = await findPrevPageDocs(pageIdPrefix, collection);
  const previous = await createPageAstMapping(prevDocsCursor);

  const manager = new UpdatedPagesManager(previous.mapping, previous.pageIds, pages);
  const pendingOperations = manager.getOperations();
  if (pendingOperations.length === 0) {
    return;
  }

  await bulkWrite(pendingOperations, collection);
};
200+
/**
 * Extracts the page documents from the build zip, then inserts them for the
 * given build and refreshes the per-page copies, both concurrently.
 *
 * @param buildId - id of the build the pages belong to
 * @param zip - archive containing the build's page documents
 * @returns The results of the insert and of the page update, in that order
 * @throws Rethrows any extraction, insertion or update error after logging it
 */
export const insertAndUpdatePages = async (buildId: ObjectId, zip: AdmZip) => {
  try {
    const pages = pagesFromZip(zip);
    // `await` is required: returning the bare promise would leave the try
    // block before it settles, so rejections would skip the catch below.
    return await Promise.all([insert(pages, COLLECTION_NAME, buildId), updatePages(pages, UPDATED_AST_COLL_NAME)]);
  } catch (error) {
    console.error(`Error at insertion time for ${COLLECTION_NAME}: ${error}`);
    throw error;
  }
};
210+
// Exposes the otherwise module-private updatePages under an underscore-prefixed
// name. NOTE(review): presumably intended for tests only — confirm before using elsewhere.
export const _updatePages = updatePages;
0 commit comments