From 1edab1e5eac8fb7a8172bac87f4960bd646ce58d Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Sat, 5 Apr 2025 18:17:26 -0400 Subject: [PATCH 01/11] refactor: dependencies for document processor --- backend/Dockerfile | 3 ++- backend/app/services/document_processor.py | 2 +- backend/requirements.txt | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/backend/Dockerfile b/backend/Dockerfile index 9d76a13..82c5ea2 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -27,7 +27,8 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . # Install the app as a module for proper imports in tests -RUN pip install --no-cache-dir -e . --no-deps +RUN pip install --no-cache-dir -e . +RUN pip freeze > /app/installed_packages.txt # Expose the port the app runs on EXPOSE 8000 diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index ee340eb..218cfb5 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -3,7 +3,7 @@ import re from pathlib import Path import tempfile -import fitz # PyMuPDF +import fitz # pymupdf from marker import extract_from_file import openai import asyncio diff --git a/backend/requirements.txt b/backend/requirements.txt index 69391f4..e3a563f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,17 +1,26 @@ -fastapi==0.89.1 +# Core dependencies +fastapi>=0.104.1,<0.105.0 uvicorn==0.20.0 sqlalchemy==1.4.41 asyncpg==0.29.0 python-multipart==0.0.5 PyPDF2==3.0.1 -python-dotenv==0.21.1 +python-dotenv>=0.21.1,<2.0.0 # Updated to be compatible with marker-pdf python-decouple==3.7 alembic==1.9.2 pgvector==0.2.4 +pydantic>=2.4.2,<3.0.0 # Testing dependencies pytest==7.2.1 httpx==0.23.3 pytest-asyncio==0.20.3 pytest-cov==4.0.0 -pytest-env==0.8.1 \ No newline at end of file +pytest-env==0.8.1 + +# Document processing +PyMuPDF==1.23.8 # for fitz +marker-pdf==1.6.2 # PDF processing and marking 
+beautifulsoup4==4.12.3 # for BeautifulSoup +aiofiles==23.2.1 # for async file operations +openai>=1.12.0,<2.0.0 # for OpenAI integration \ No newline at end of file From cc5aa26f4b42f9e524856d16a44ee236a6639d39 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Sat, 5 Apr 2025 18:17:39 -0400 Subject: [PATCH 02/11] chore: change to AGPLv3 license --- LICENSE | 682 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 661 insertions(+), 21 deletions(-) diff --git a/LICENSE b/LICENSE index c919e6d..ccd6af4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,661 @@ -MIT License - -Copyright (c) 2025 YCombuster - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +GNU AFFERO GENERAL PUBLIC LICENSE +Version 3, 19 November 2007 + +Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. 
+ + Preamble + +The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + +The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + +When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + +Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + +A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + +The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + +An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + +The precise terms and conditions for copying, distribution and +modification follow. + +TERMS AND CONDITIONS + +0. Definitions. + +"This License" refers to version 3 of the GNU Affero General Public License. + +"Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +"The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + +To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + +A "covered work" means either the unmodified Program or a work based +on the Program. + +To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + +To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + +An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +1. Source Code. + +The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + +A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + +The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ +The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + +The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + +The Corresponding Source for a work in source code form is that +same work. + +2. Basic Permissions. + +All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + +Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + +3. Protecting Users' Legal Rights From Anti-Circumvention Law. + +No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + +When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + +4. Conveying Verbatim Copies. + +You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + +5. Conveying Modified Source Versions. 
+ +You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + +a) The work must carry prominent notices stating that you modified +it, and giving a relevant date. + +b) The work must carry prominent notices stating that it is +released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to +"keep intact all notices". + +c) You must license the entire work, as a whole, under this +License to anyone who comes into possession of a copy. This +License will therefore apply, along with any applicable section 7 +additional terms, to the whole of the work, and all its parts, +regardless of how they are packaged. This License gives no +permission to license the work in any other way, but it does not +invalidate such permission if you have separately received it. + +d) If the work has interactive user interfaces, each must display +Appropriate Legal Notices; however, if the Program has interactive +interfaces that do not display Appropriate Legal Notices, your +work need not make them do so. + +A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + +6. Conveying Non-Source Forms. 
+ +You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + +a) Convey the object code in, or embodied in, a physical product +(including a physical distribution medium), accompanied by the +Corresponding Source fixed on a durable physical medium +customarily used for software interchange. + +b) Convey the object code in, or embodied in, a physical product +(including a physical distribution medium), accompanied by a +written offer, valid for at least three years and valid for as +long as you offer spare parts or customer support for that product +model, to give anyone who possesses the object code either (1) a +copy of the Corresponding Source for all the software in the +product that is covered by this License, on a durable physical +medium customarily used for software interchange, for a price no +more than your reasonable cost of physically performing this +conveying of source, or (2) access to copy the +Corresponding Source from a network server at no charge. + +c) Convey individual copies of the object code with a copy of the +written offer to provide the Corresponding Source. This +alternative is allowed only occasionally and noncommercially, and +only if you received the object code with such an offer, in accord +with subsection 6b. + +d) Convey the object code by offering access from a designated +place (gratis or for a charge), and offer equivalent access to the +Corresponding Source in the same way through the same place at no +further charge. You need not require recipients to copy the +Corresponding Source along with the object code. 
If the place to +copy the object code is a network server, the Corresponding Source +may be on a different server (operated by you or a third party) +that supports equivalent copying facilities, provided you maintain +clear directions next to the object code saying where to find the +Corresponding Source. Regardless of what server hosts the +Corresponding Source, you remain obligated to ensure that it is +available for as long as needed to satisfy these requirements. + +e) Convey the object code using peer-to-peer transmission, provided +you inform other peers where the object code and Corresponding +Source of the work are being offered to the general public at no +charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + +A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + +"Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. 
The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + +If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + +The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + +Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + +7. Additional Terms. + +"Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + +Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + +a) Disclaiming warranty or limiting liability differently from the +terms of sections 15 and 16 of this License; or + +b) Requiring preservation of specified reasonable legal notices or +author attributions in that material or in the Appropriate Legal +Notices displayed by works containing it; or + +c) Prohibiting misrepresentation of the origin of that material, or +requiring that modified versions of such material be marked in +reasonable ways as different from the original version; or + +d) Limiting the use for publicity purposes of names of licensors or +authors of the material; or + +e) Declining to grant rights under trademark law for use of some +trade names, trademarks, or service marks; or + +f) Requiring indemnification of licensors and authors of that +material by anyone who conveys the material (or modified versions of +it) with contractual assumptions of liability to the recipient, for +any liability that these contractual assumptions directly impose on +those licensors and authors. + +All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + +If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + +8. Termination. + +You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + +However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ +Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + +9. Acceptance Not Required for Having Copies. + +You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + +10. Automatic Licensing of Downstream Recipients. + +Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + +An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + +11. Patents. + +A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + +A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + +In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ +If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + +A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + +12. No Surrender of Others' Freedom. + +If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + +13. Remote Network Interaction; Use with the GNU General Public License. 
+ +Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + +Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + +14. Revised Versions of this License. + +The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ +If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + +Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + +15. Disclaimer of Warranty. + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +16. Limitation of Liability. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + +17. Interpretation of Sections 15 and 16. 
+ +If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + +An open-source self-study platform inspired by modern courseware tools. Built independently to promote free and accessible education. + +Copyright (C) 2025 Reese Chong + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + +If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source.
For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + +You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. From 0c9594dfd0202c5982485cbcf7033882840411f6 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Sun, 6 Apr 2025 15:35:10 -0400 Subject: [PATCH 03/11] chore: implement internal pdf to json function with marker --- backend/app/services/document_processor.py | 40 +++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 218cfb5..ea0b994 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -4,13 +4,17 @@ from pathlib import Path import tempfile import fitz # pymupdf -from marker import extract_from_file +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict +from marker.schema import BlockTypes +from marker.schema import SourceMetadata import openai import asyncio import asyncpg import aiofiles import uuid import os +import json # for the sectioning of the chunks from bs4 import BeautifulSoup @@ -48,6 +52,40 @@ def __init__(self): def html_to_text(html): return BeautifulSoup(html, "html.parser").get_text() +def pdf_to_json(path, output_path): + # config = { + # "output_format": "json", + # } + # config_parser = ConfigParser(config) + + converter = PdfConverter( + # config=config_parser.generate_config_dict(), + artifact_dict=create_model_dict(), + # processor_list=config_parser.get_processors(), + # renderer=config_parser.get_renderer(), + # 
llm_service=config_parser.get_llm_service() + ) + + document = converter.build_document( + path, + output_format="json" + ) + + metadata = SourceMetadata( + source=path # Just use the file path or any identifier + ) + + data = { + document: document, + metadata: metadata + } + + # for debugging purposes + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(document.dict(), f, ensure_ascii=False, indent=2) + + return data + def sub_chunk(json_data, db: Session, source_metadata: dict): """ This function takes our marker JSON, takes the sections, From cab1c773b11cd4ab87ba78d9dd155287c32d1a85 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Sun, 6 Apr 2025 21:44:25 -0400 Subject: [PATCH 04/11] chore: implement fastapi route changes --- backend/app/main.py | 1 + backend/app/services/document_processor.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/backend/app/main.py b/backend/app/main.py index 1810c16..a72e474 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -44,6 +44,7 @@ def health_check(): async def upload_document( file: UploadFile = File(...), db: AsyncSession = Depends(get_db) + ): """ Upload and process a document (currently supports PDF). 
diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index ea0b994..d467489 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -7,7 +7,7 @@ from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.schema import BlockTypes -from marker.schema import SourceMetadata +# from marker.schema import SourceMetadata import openai import asyncio import asyncpg @@ -71,13 +71,16 @@ def pdf_to_json(path, output_path): output_format="json" ) - metadata = SourceMetadata( - source=path # Just use the file path or any identifier - ) + # metadata = SourceMetadata( + # source=path # Just use the file path or any identifier + # ) + + temp_dict = dict() + temp_dict.add("placeholder") data = { document: document, - metadata: metadata + metadata: temp_dict } # for debugging purposes @@ -133,7 +136,7 @@ def insert_content(block, parent_id=None): """ content_text = html_to_text(block.get("html", "")) block_type = block.get("block_type", "Unknown") - embedding = generate_embedding(content_text) # your embedding function + # embedding = generate_embedding(content_text) # your embedding function # the reason why title is None for only text and not all others: blocks can be SectionHeader, ListGroup, ListItem, Page, so it makes sense to keep track of those # we don't want to embed the text itself as a title @@ -146,7 +149,7 @@ def insert_content(block, parent_id=None): title=None if block_type == "Text" else content_text[:100], # add a label if it's not pure text content=content_text, # content is as extracted content_type=block_type, # keep track of type - embedding=embedding, # TODO: add embedding + # embedding=embedding, # TODO: add embedding created_at=datetime.utcnow(), updated_at=datetime.utcnow() ) @@ -276,6 +279,7 @@ async def _split_pdf_into_chunks( return chunks +# TODO: rename to be more descriptive async def process_file( self, 
upload_file: UploadFile, From 2245f1b1ff9217483d741ec0fdd47fe28a70578a Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Sun, 6 Apr 2025 23:40:47 -0400 Subject: [PATCH 05/11] fix: address various issues in content insertion and configuration - Replace invalid temp_dict.add() usage with a dict literal to fix runtime errors. - Pass section_stack into traverse_blocks to ensure proper block hierarchy traversal. - Pass db parameter into insert_content to allow database access where needed. - Remove dead or commented-out config_parser code to clean up unused logic. --- backend/app/services/document_processor.py | 179 +++++++++++++-------- 1 file changed, 112 insertions(+), 67 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index d467489..28cdfa5 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -1,3 +1,15 @@ +""" +This is a very important file. It uploads our PDFs to the database, no matter how large the PDF is + +- User uploads PDF +- process_file → split into text chunks (for preview maybe) → +- pdf_to_json → semantic JSON from Marker +- sub_chunk → create source metadata row in KnowledgeBaseSource +- traverse_blocks → recurse over semantic blocks and call insert_content depending on the type +- insert_content → save each paragraph/section/list/etc into KnowledgeBaseContent. + +""" + from typing import Dict, List, Optional from fastapi import UploadFile import re @@ -21,6 +33,12 @@ from datetime import datetime from sqlalchemy.orm import Session +# TODO: +# - [ ] (Optional) Bundle common params (db, source_id, section_stack) into a context object for cleaner recursion. +# - [ ] (Optional) Improve error handling around database commits (rollback on failure). +# - [ ] (Optional) Remove dead/commented out code (old config_parser stuff in `pdf_to_json`). +# - [ ] (Optional) Add type hints to all functions for better readability and IDE support. 
+ class DocumentChunk: def __init__( self, @@ -52,18 +70,29 @@ def __init__(self): def html_to_text(html): return BeautifulSoup(html, "html.parser").get_text() +""" +What is a "block" in marker? +they are the "smallest structural units in the PDF" +e.g., a section header, a paragraph of text, a list, an image, a page footer +""" def pdf_to_json(path, output_path): - # config = { - # "output_format": "json", - # } - # config_parser = ConfigParser(config) + """ + config reference + config = settings (what output you want, OCR yes/no, use LLM yes/no) + + artifact_dict = the brains (the models/knowledge about documents) + processor_list = cleanup helpers (optional extras to improve block extraction/formatting) + renderer = pretty printer (turns blocks into Markdown/JSON/HTML) + llm_service = smart assistant (an optional LLM to "fix" hard cases) + + config = { + "output_format": "json", + } + config_parser = ConfigParser(config) + """ converter = PdfConverter( - # config=config_parser.generate_config_dict(), artifact_dict=create_model_dict(), - # processor_list=config_parser.get_processors(), - # renderer=config_parser.get_renderer(), - # llm_service=config_parser.get_llm_service() ) document = converter.build_document( @@ -71,16 +100,18 @@ def pdf_to_json(path, output_path): output_format="json" ) + # TODO: update metadata retrieval # metadata = SourceMetadata( # source=path # Just use the file path or any identifier # ) + # TODO: pass in real metadata. relies on the above temp_dict = dict() - temp_dict.add("placeholder") + temp_dict = {"placeholder": True} data = { - document: document, - metadata: temp_dict + "document": document, + "metadata": temp_dict } # for debugging purposes @@ -93,17 +124,22 @@ def sub_chunk(json_data, db: Session, source_metadata: dict): """ This function takes our marker JSON, takes the sections, and formulates metadata and extracts content. It saves it into the database with insert_content + It prepares for block traversal. 
+ Args: JSON data from marker, database session, a dictionary which we plug into the knowledge_base_source Returns: success if we get through all of the subchunks """ # 1. Insert knowledge_base_sources row (created empty one) AKA the metadata # remember that this is from @models.py courtesy of SQLAlchemy + + # error: models is not defined source = models.KnowledgeBaseSource( name=source_metadata.get("title", "Untitled"), author=source_metadata.get("author", "Unknown"), language=source_metadata.get("language", "en"), license=source_metadata.get("license", "unknown"), source_type="textbook", + # TODO: figure out non-deprecated version created_at=datetime.utcnow(), updated_at=datetime.utcnow() ) @@ -116,72 +152,81 @@ def sub_chunk(json_data, db: Session, source_metadata: dict): # reread the new row db.refresh(source) - section_stack = [] # stack of (section_id, level) + section_stack = [] # initialize stack of (section_id, level) + + # Pass db and stack + traverse_blocks(json_data.get("children", []), parent_id=None, db=db, section_stack=section_stack) + + return {"status": "success", "source_id": source.source_id} +def insert_content(block, db=None, parent_id=None): """ - What is a "block" in marker? 
- they are the "smallest structural units in the PDF" - e.g., a section header, a paragraph of text, a list, an image, a page footer + Inserts the HTML that we got from the marker JSON + into the corresponding database table + + title: + + Args: + Returns: """ - def insert_content(block, parent_id=None): - """ - Inserts the HTML that we got from the marker JSON - into the corresponding database table + # GET THE TEXT + # TODO: update how we get this + # it should be extracting from json + content_text = html_to_text(block.get("html", "")) + block_type = block.get("block_type", "Unknown") + # embedding = generate_embedding(content_text) # your embedding function - title: + # the reason why title is None for only text and not all others: blocks can be SectionHeader, ListGroup, ListItem, Page, so it makes sense to keep track of those + # we don't want to embed the text itself as a title + + # also, source is from sub_chunk, our parent function + # so we're updating the knowledge_base_content row with SQLAlchemy ORM + + # error: models is not defined + content = models.KnowledgeBaseContent( + source_id=source.source_id, # we keep the key consistent + parent_content_id=parent_id, # parent is from params + title=None if block_type == "Text" else content_text[:100], # add a label if it's not pure text + content=content_text, # content is as extracted + content_type=block_type, # keep track of type + # embedding=embedding, # TODO: add embedding + created_at=datetime.utcnow(), + updated_at=datetime.utcnow() + ) + db.add(content) + db.commit() + db.refresh(content) + return content.content_id - Args: - Returns: - """ - content_text = html_to_text(block.get("html", "")) - block_type = block.get("block_type", "Unknown") - # embedding = generate_embedding(content_text) # your embedding function +def traverse_blocks(blocks, parent_id=None, db=None, section_stack=None): + section_stack = section_stack or [] - # the reason why title is None for only text and not all others: blocks 
can be SectionHeader, ListGroup, ListItem, Page, so it makes sense to keep track of those - # we don't want to embed the text itself as a title + for block in blocks: + block_type = block.get("block_type") - # also, source is from sub_chunk, our parent function - # so we're updating the knowledge_base_content row with SQLAlchemy ORM - content = models.KnowledgeBaseContent( - source_id=source.source_id, # we keep the key consistent - parent_content_id=parent_id, # parent is from params - title=None if block_type == "Text" else content_text[:100], # add a label if it's not pure text - content=content_text, # content is as extracted - content_type=block_type, # keep track of type - # embedding=embedding, # TODO: add embedding - created_at=datetime.utcnow(), - updated_at=datetime.utcnow() - ) - db.add(content) - db.commit() - db.refresh(content) - return content.content_id - - def traverse_blocks(blocks, parent_id=None): - for block in blocks: - block_type = block.get("block_type") - - # if page, then recurse through it as pages are containers not content - # if section header (like "Chapter 1: Limits"), then insert the block with parent id being the page - # if text OR ListItem, then we insert the block with parent id being the latest nest (if exists) - # if ListGroup, then recurse through the bullet points/numbered list containers with parent id being latest nest (if exists) - - # note: section stack keeps track of nested section headers. [-1][0] gets the top of the stack. - if block_type == "Page": - traverse_blocks(block.get("children", []), parent_id=parent_id) - elif block_type == "SectionHeader": - section_id = insert_content(block, parent_id=parent_id) - # update the stack for this new section level - section_stack.append((section_id, block.get("section_hierarchy", {}))) - traverse_blocks(block.get("children", []), parent_id=section_id) - elif block_type in {"Text", "ListItem"}: - # its parent is the latest nested item if we even have one. 
else it's just the page. - insert_content(block, parent_id=section_stack[-1][0] if section_stack else parent_id) - elif block_type == "ListGroup": - traverse_blocks(block.get("children", []), parent_id=section_stack[-1][0] if section_stack else parent_id) + # if page, then recurse through it as pages are containers not content + # if section header (like "Chapter 1: Limits"), then insert the block with parent id being the page + # if text OR ListItem, then we insert the block with parent id being the latest nest (if exists) + # if ListGroup, then recurse through the bullet points/numbered list containers with parent id being latest nest (if exists) + + # note: section stack keeps track of nested section headers. [-1][0] gets the top of the stack. + if block_type == "Page": + traverse_blocks(block.get("children", []), parent_id=parent_id, db=db, section_stack=section_stack) + elif block_type == "SectionHeader": + section_id = insert_content(block, db=db, parent_id=parent_id) + # update the stack for this new section level + section_stack.append((section_id, block.get("section_hierarchy", {}))) + traverse_blocks(block.get("children", []), parent_id=section_id, db=db, section_stack=section_stack) + elif block_type in {"Text", "ListItem"}: + # its parent is the latest nested item if we even have one. else it's just the page. + insert_content(block, db=db, parent_id=section_stack[-1][0] if section_stack else parent_id) + elif block_type == "ListGroup": + traverse_blocks(block.get("children", []), parent_id=section_stack[-1][0] if section_stack else parent_id, db=db, section_stack=section_stack) # 2. 
Begin traversal + + # error: json_data and source is not defined traverse_blocks(json_data.get("children", [])) return {"status": "success", "source_id": source.source_id} From d4c73f062841901256feb5ad2e71f5133f9741dc Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:03:00 -0400 Subject: [PATCH 06/11] refactor: typing and pass missing arguments --- backend/app/services/document_processor.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 28cdfa5..a293557 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -10,7 +10,7 @@ """ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any from fastapi import UploadFile import re from pathlib import Path @@ -120,7 +120,7 @@ def pdf_to_json(path, output_path): return data -def sub_chunk(json_data, db: Session, source_metadata: dict): +def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, Any]): """ This function takes our marker JSON, takes the sections, and formulates metadata and extracts content. 
It saves it into the database with insert_content @@ -170,6 +170,9 @@ def insert_content(block, db=None, parent_id=None): Returns: """ + if not source: + raise ValueError("Source object is required") + # GET THE TEXT # TODO: update how we get this # it should be extracting from json @@ -199,7 +202,7 @@ def insert_content(block, db=None, parent_id=None): db.refresh(content) return content.content_id -def traverse_blocks(blocks, parent_id=None, db=None, section_stack=None): +def traverse_blocks(json_data: dict[str, Any], blocks, parent_id=None, db=None, section_stack=None): section_stack = section_stack or [] for block in blocks: @@ -212,22 +215,22 @@ def traverse_blocks(blocks, parent_id=None, db=None, section_stack=None): # note: section stack keeps track of nested section headers. [-1][0] gets the top of the stack. if block_type == "Page": - traverse_blocks(block.get("children", []), parent_id=parent_id, db=db, section_stack=section_stack) + traverse_blocks(json_data, block.get("children", []), parent_id=parent_id, db=db, section_stack=section_stack) elif block_type == "SectionHeader": section_id = insert_content(block, db=db, parent_id=parent_id) # update the stack for this new section level section_stack.append((section_id, block.get("section_hierarchy", {}))) - traverse_blocks(block.get("children", []), parent_id=section_id, db=db, section_stack=section_stack) + traverse_blocks(json_data, block.get("children", []), parent_id=section_id, db=db, section_stack=section_stack) elif block_type in {"Text", "ListItem"}: # its parent is the latest nested item if we even have one. else it's just the page. 
insert_content(block, db=db, parent_id=section_stack[-1][0] if section_stack else parent_id) elif block_type == "ListGroup": - traverse_blocks(block.get("children", []), parent_id=section_stack[-1][0] if section_stack else parent_id, db=db, section_stack=section_stack) + traverse_blocks(json_data, block.get("children", []), parent_id=section_stack[-1][0] if section_stack else parent_id, db=db, section_stack=section_stack) # 2. Begin traversal # error: json_data and source is not defined - traverse_blocks(json_data.get("children", [])) + traverse_blocks(json_data, json_data.get("children", [])) return {"status": "success", "source_id": source.source_id} # ------------------- From 347c40ff226a39c1de2d180fc0d63776d0b9074f Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:14:45 -0400 Subject: [PATCH 07/11] refactor: eliminate prop drilling, NameError, package reused params into a context class --- backend/app/services/document_processor.py | 42 +++++++++++++--------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index a293557..5fb074d 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -27,6 +27,7 @@ import uuid import os import json +from app import models # Import models from app package # for the sectioning of the chunks from bs4 import BeautifulSoup @@ -59,6 +60,13 @@ def __init__(self): self.max_chunk_size = 1000 # maximum characters per chunk self.overlap = 50 # number of characters to overlap between chunks +class DocumentProcessingContext: + def __init__(self, db: Session, source: "models.KnowledgeBaseSource"): + self.db = db + self.source = source + self.section_stack = [] + self.metadata = {} # optional extras + # CONSTANTS UPLOAD_DIR = os.path.join(os.path.dirname(__file__), "uploads") CHUNK_SIZE = 64 * 1024 # 64KB chunks for better performance @@ -120,7 +128,7 @@ def 
pdf_to_json(path, output_path): return data -def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, Any]): +def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, Any]) -> dict[str, Any]: """ This function takes our marker JSON, takes the sections, and formulates metadata and extracts content. It saves it into the database with insert_content @@ -152,14 +160,17 @@ def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, # reread the new row db.refresh(source) + # initialize the context + context = DocumentProcessingContext(db=db, source=source) + section_stack = [] # initialize stack of (section_id, level) # Pass db and stack - traverse_blocks(json_data.get("children", []), parent_id=None, db=db, section_stack=section_stack) + traverse_blocks(json_data.get("children", []), parent_id=None, context=context) - return {"status": "success", "source_id": source.source_id} + return {"status": "success", "source_id": context.source.source_id} -def insert_content(block, db=None, parent_id=None): +def insert_content(block, parent_id=None, context: DocumentProcessingContext = None): """ Inserts the HTML that we got from the marker JSON into the corresponding database table @@ -170,7 +181,7 @@ def insert_content(block, db=None, parent_id=None): Returns: """ - if not source: + if not context or not context.source: raise ValueError("Source object is required") # GET THE TEXT @@ -188,7 +199,7 @@ def insert_content(block, db=None, parent_id=None): # error: models is not defined content = models.KnowledgeBaseContent( - source_id=source.source_id, # we keep the key consistent + source_id=context.source.source_id, # we keep the key consistent parent_content_id=parent_id, # parent is from params title=None if block_type == "Text" else content_text[:100], # add a label if it's not pure text content=content_text, # content is as extracted @@ -197,13 +208,13 @@ def insert_content(block, db=None, parent_id=None): 
created_at=datetime.utcnow(), updated_at=datetime.utcnow() ) - db.add(content) - db.commit() - db.refresh(content) + context.db.add(content) + context.db.commit() + context.db.refresh(content) return content.content_id -def traverse_blocks(json_data: dict[str, Any], blocks, parent_id=None, db=None, section_stack=None): - section_stack = section_stack or [] +def traverse_blocks(blocks: list, parent_id=None, context: DocumentProcessingContext = None): + section_stack = context.section_stack for block in blocks: block_type = block.get("block_type") @@ -215,23 +226,22 @@ def traverse_blocks(json_data: dict[str, Any], blocks, parent_id=None, db=None, # note: section stack keeps track of nested section headers. [-1][0] gets the top of the stack. if block_type == "Page": - traverse_blocks(json_data, block.get("children", []), parent_id=parent_id, db=db, section_stack=section_stack) + traverse_blocks(block.get("children", []), parent_id=parent_id, context=context) elif block_type == "SectionHeader": section_id = insert_content(block, db=db, parent_id=parent_id) # update the stack for this new section level section_stack.append((section_id, block.get("section_hierarchy", {}))) - traverse_blocks(json_data, block.get("children", []), parent_id=section_id, db=db, section_stack=section_stack) + traverse_blocks(block.get("children", []), parent_id=section_id, context=context) elif block_type in {"Text", "ListItem"}: # its parent is the latest nested item if we even have one. else it's just the page. insert_content(block, db=db, parent_id=section_stack[-1][0] if section_stack else parent_id) elif block_type == "ListGroup": - traverse_blocks(json_data, block.get("children", []), parent_id=section_stack[-1][0] if section_stack else parent_id, db=db, section_stack=section_stack) + traverse_blocks(block.get("children", []), parent_id=section_stack[-1][0], context=context) # 2. 
Begin traversal # error: json_data and source is not defined - traverse_blocks(json_data, json_data.get("children", [])) - return {"status": "success", "source_id": source.source_id} + return {"status": "success", "source_id": context.source.source_id} # ------------------- From ac3f7667df18292024db2f91b5b3c3364866f137 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:22:32 -0400 Subject: [PATCH 08/11] chore: update comments --- backend/app/services/document_processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 5fb074d..19972c6 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -35,7 +35,7 @@ from sqlalchemy.orm import Session # TODO: -# - [ ] (Optional) Bundle common params (db, source_id, section_stack) into a context object for cleaner recursion. +# - [x] (Optional) Bundle common params (db, source_id, section_stack) into a context object for cleaner recursion. # - [ ] (Optional) Improve error handling around database commits (rollback on failure). # - [ ] (Optional) Remove dead/commented out code (old config_parser stuff in `pdf_to_json`). # - [ ] (Optional) Add type hints to all functions for better readability and IDE support. @@ -240,7 +240,6 @@ def traverse_blocks(blocks: list, parent_id=None, context: DocumentProcessingCon # 2. 
Begin traversal - # error: json_data and source is not defined return {"status": "success", "source_id": context.source.source_id} # ------------------- From 00f70227c4c43c48f24ca6c12d5ab0ba86a61998 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:25:38 -0400 Subject: [PATCH 09/11] refactor: batch inserts during document processing for faster uploads --- backend/app/services/document_processor.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 19972c6..3fcec0c 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -153,22 +153,21 @@ def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, ) # ADD to database (stages it) db.add(source) - - # commit the transaction db.commit() - - # reread the new row db.refresh(source) # initialize the context context = DocumentProcessingContext(db=db, source=source) - section_stack = [] # initialize stack of (section_id, level) - - # Pass db and stack - traverse_blocks(json_data.get("children", []), parent_id=None, context=context) + try: + # Pass db and stack + traverse_blocks(json_data.get("children", []), parent_id=None, context=context) + context.db.commit() + except Exception as e: + context.db.rollback() + raise e - return {"status": "success", "source_id": context.source.source_id} + return {"status": "success", "source_id": source.source_id} def insert_content(block, parent_id=None, context: DocumentProcessingContext = None): """ @@ -209,9 +208,7 @@ def insert_content(block, parent_id=None, context: DocumentProcessingContext = N updated_at=datetime.utcnow() ) context.db.add(content) - context.db.commit() - context.db.refresh(content) - return content.content_id + return content def traverse_blocks(blocks: list, parent_id=None, context: DocumentProcessingContext = None): section_stack = 
context.section_stack From 5045459fadf6fb08a357e515adb9a334423c1669 Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:29:10 -0400 Subject: [PATCH 10/11] perf: optimize document ingestion using bulk_save_objects for massive speedup - Added content_buffer to DocumentProcessingContext - Modified insert_content to buffer KnowledgeBaseContent objects - Used bulk_save_objects for batched database insertions - Improved processing speed for large PDFs and textbooks --- backend/app/services/document_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 3fcec0c..64651f5 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -66,6 +66,7 @@ def __init__(self, db: Session, source: "models.KnowledgeBaseSource"): self.source = source self.section_stack = [] self.metadata = {} # optional extras + self.content_buffer = [] # used to hold all the blocks before committing to database # CONSTANTS UPLOAD_DIR = os.path.join(os.path.dirname(__file__), "uploads") @@ -162,6 +163,7 @@ def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, try: # Pass db and stack traverse_blocks(json_data.get("children", []), parent_id=None, context=context) + context.db.bulk_save_objects(context.content_buffer) context.db.commit() except Exception as e: context.db.rollback() @@ -207,7 +209,7 @@ def insert_content(block, parent_id=None, context: DocumentProcessingContext = N created_at=datetime.utcnow(), updated_at=datetime.utcnow() ) - context.db.add(content) + context.content_buffer.append(content) return content def traverse_blocks(blocks: list, parent_id=None, context: DocumentProcessingContext = None): From 9a2535b24f4fff8975715b8ef8c07fcc01955d0c Mon Sep 17 00:00:00 2001 From: Reese Chong Date: Mon, 7 Apr 2025 00:42:37 -0400 Subject: [PATCH 11/11] perf: stream buffer with auto-flush to 
prevent memory blowup - Implemented dynamic buffering for database inserts - Auto-flushes content when buffer fills to maintain constant memory usage - Enables smooth streaming ingestion for multi-thousand page documents without stalls --- backend/app/services/document_processor.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/backend/app/services/document_processor.py b/backend/app/services/document_processor.py index 64651f5..02defc5 100644 --- a/backend/app/services/document_processor.py +++ b/backend/app/services/document_processor.py @@ -67,6 +67,7 @@ def __init__(self, db: Session, source: "models.KnowledgeBaseSource"): self.section_stack = [] self.metadata = {} # optional extras self.content_buffer = [] # used to hold all the blocks before committing to database + self.buffer_limit = 500 # how big the batch is before saving # CONSTANTS UPLOAD_DIR = os.path.join(os.path.dirname(__file__), "uploads") @@ -74,6 +75,12 @@ def __init__(self, db: Session, source: "models.KnowledgeBaseSource"): # ------------------- +def flush_buffer(context): + if context.content_buffer: + context.db.bulk_save_objects(context.content_buffer) + context.db.commit() + context.content_buffer.clear() + # for the sectioning of the chunking def html_to_text(html): @@ -163,8 +170,10 @@ def sub_chunk(json_data: dict[str, Any], db: Session, source_metadata: dict[str, try: # Pass db and stack traverse_blocks(json_data.get("children", []), parent_id=None, context=context) - context.db.bulk_save_objects(context.content_buffer) - context.db.commit() + + # Final flush if anything remains in buffer + flush_buffer(context) + except Exception as e: context.db.rollback() raise e @@ -183,7 +192,7 @@ def insert_content(block, parent_id=None, context: DocumentProcessingContext = N """ if not context or not context.source: - raise ValueError("Source object is required") + raise ValueError("Context with source is required") # GET THE TEXT # TODO: update how we get 
this @@ -210,6 +219,11 @@ def insert_content(block, parent_id=None, context: DocumentProcessingContext = N updated_at=datetime.utcnow() ) context.content_buffer.append(content) + + # If buffer is full, bulk save and reset + if len(context.content_buffer) >= context.buffer_limit: + flush_buffer(context) + return content def traverse_blocks(blocks: list, parent_id=None, context: DocumentProcessingContext = None):