diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
deleted file mode 100644
index 6c2b096..0000000
--- a/.github/workflows/CI.yml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: Continuous Integration
-on:
-  schedule:
-    - cron: "0 8 * * 1-5"
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-  workflow_dispatch:
-
-concurrency:
-  group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  check-formatting:
-    name: Check Formatting Errors
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Install Dependencies
-        run: |
-          python -m pip install pycodestyle autopep8
-          python -m pip install .
-      
-      - name: Run pycodestyle
-        run: |
-          pycodestyle --statistics --count --max-line-length=150 --show-source .
-
-  build-and-test:
-    needs: check-formatting
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
-        os: [ubuntu-latest, windows-latest, macos-latest]
-
-    runs-on: ${{ matrix.os }}
-    defaults:
-      run:
-        shell: bash -el {0}
-    name: ${{ matrix.os }} Python ${{ matrix.python-version }} Subtest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: mamba-org/setup-micromamba@main
-        with:
-          environment-name: temp
-          condarc: |
-            channels:
-              - defaults
-              - conda-forge
-            channel_priority: flexible
-          create-args: |
-            python=${{ matrix.python-version }}
-      - name: Install Dependencies
-        run: |
-          python -m pip install .
-          python -m pip install coverage pytest
-      - name: Run Tests
-        run: |
-          coverage run --source=. -m pytest -v
-      - name: Show Coverage
-        run: |
-          coverage report -m
-  
-  ci-report-status:
-    name: report CI status
-    needs: build-and-test
-    runs-on: ubuntu-latest
-    steps:
-      - run: |
-          result="${{ needs.build-and-test.result }}"
-          if [[ $result == "success" ]] ; then
-            exit 0
-          else
-            exit 1
-          fi
-  
-  check-for-new-release:
-    runs-on: ubuntu-latest
-    needs: build-and-test
-    steps:
-      - uses: actions/checkout@v3
-      - name: Check PyPI version
-        uses: maybe-hello-world/pyproject-check-version@v3
-        id: versioncheck
-        with:
-          pyproject-path: "./pyproject.toml"
-    
-      - name: Report Results
-        run: |
-            echo "New Release found? ${{ steps.versioncheck.outputs.local_version_is_higher }}"
-            echo "Local version: ${{ steps.versioncheck.outputs.local_version }}"
-            echo "Public version: ${{ steps.versioncheck.outputs.public_version }}"
-    outputs:
-      do_publish: ${{ steps.versioncheck.outputs.local_version_is_higher }} 
-  
-  pypi-package:
-    name: Build and publish Python 🐍 distributions 📦 to PyPI
-    runs-on: ubuntu-latest
-    needs: [check-for-new-release, ci-report-status]
-    if: ${{ needs.check-for-new-release.outputs.do_publish == 'true' && github.ref == 'refs/heads/main' && github.repository == 'QuantumPioneer/tristate20'}}
-    steps:
-    - uses: actions/checkout@master
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v3
-      with:
-        python-version: "3.10"
-    - name: Install pypa/build
-      run: >-
-        python -m
-        pip install
-        build
-        --user
-    - name: Build a binary wheel and a source tarball
-      run: >-
-        python -m
-        build
-        --sdist
-        --wheel
-        --outdir dist/
-        .
-    - name: Publish distribution 📦 to PyPI
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}
-        skip-existing: true
-        verbose: true
-        
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index aac7f39..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2022 Jackson Burns
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
index 61a65c2..892711f 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,9 @@
-<h1 align="center">tristate20</h1> 
-<h3 align="center">TRISTATE20 Dataset</h3>
+<h1 align="center">QuantumPioneer</h1> 
 
-<p align="center">  
-  <img alt="tristate20logo" src="https://github.com/QuantumPioneer/tristate20/blob/main/tristate20_logo.png">
-</p> 
-<p align="center">
-  <img alt="GitHub Repo Stars" src="https://img.shields.io/github/stars/QuantumPioneer/tristate20?style=social">
-  <img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/tristate20">
-  <img alt="PyPI" src="https://img.shields.io/pypi/v/tristate20">
-  <img alt="PyPI - License" src="https://img.shields.io/github/license/QuantumPioneer/tristate20">
-</p>
+## Data availability 
+The QuantumPioneer datasets are available for download on [Zenodo](). Python notebooks are provided alongside the datasets to demonstrate how to load them.
 
-## Online Documentation
-[Click here to read the documentation](https://QuantumPioneer.github.io/tristate20/)
+## Data provenance
+Log files generated by Gaussian and ORCA were parsed using [generator.py](https://github.com/QuantumPioneer/databases/blob/main/scripts/generator.py), which relies on the [FastLogfileParser](https://github.com/QuantumPioneer/FastLogfileParser) package. The resulting parquet files were further processed using the scripts in [scripts/qm\_results](https://github.com/QuantumPioneer/databases/blob/main/scripts/qm_results). Importantly, these scripts matched the atom-mapped SMILES to the respective data points.
 
-## SuperCloud
-Use this command to upgrade the installed version of `fastlogfileparser` (for initial development purposes only):
-```bash
-pip install --force-reinstall --no-cache-dir git+https://github.com/QuantumPioneer/FastLogfileParser.git@release/quantumpioneer_v1
-```
+Files generated by COSMOtherm were separately parsed and filtered to produce two master CSV files, one for transition states and the other for ground-state species. These CSV files were then split by solvent using the scripts in [scripts/solvation](https://github.com/QuantumPioneer/databases/blob/main/scripts/solvation).
diff --git a/databases/__init__.py b/databases/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/demo.ipynb b/demo.ipynb
deleted file mode 100644
index f5efa7a..0000000
--- a/demo.ipynb
+++ /dev/null
@@ -1,1130 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# `QuantumPioneer/databases` Demo\n",
-    "This notebook demonstrates how to interact with the `QuantumPioneer` databases.\n",
-    "\n",
-    "See inline comments for more information."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "# both pandas and polars can read parquet files - pick whichever you prefer!\n",
-    "# there are advantages and disadvantages to both\n",
-    "import pandas as pd\n",
-    "import polars as pl\n",
-    "\n",
-    "import pyarrow as pa\n",
-    "# this library interacts with the parquet format directly, and both pandas and polars can use it too\n",
-    "import pyarrow.parquet as pq\n",
-    "\n",
-    "# schema = layout of the database (what are the datatypes, etc.)\n",
-    "# the schema for the quantumpioneer databases are stored in databases.schema and vary depending on the type of data\n",
-    "from databases.schema import DLPNO_SCHEMA, DFT_SCHEMA"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## DLPNO"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# set this to match wherever you have the database file located\n",
-    "DLPNO_DATABASE_FPATH = \"QuantumPioneer_v1_DLPNO.parquet\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>route_section</th>\n",
-       "      <th>charge</th>\n",
-       "      <th>multiplicity</th>\n",
-       "      <th>energy</th>\n",
-       "      <th>run_time</th>\n",
-       "      <th>input_coordinates</th>\n",
-       "      <th>dipole_au</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-400.826710</td>\n",
-       "      <td>995.0</td>\n",
-       "      <td>[[1.966178, 1.124096, -0.127534], [0.66917, 1....</td>\n",
-       "      <td>1.19932</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-522.304109</td>\n",
-       "      <td>3716.0</td>\n",
-       "      <td>[[3.793948, 0.51694, -0.021986], [2.578084, 0....</td>\n",
-       "      <td>0.52795</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-422.263499</td>\n",
-       "      <td>1073.0</td>\n",
-       "      <td>[[-1.503284, -1.357277, -0.49672], [-0.952133,...</td>\n",
-       "      <td>0.30215</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-454.045509</td>\n",
-       "      <td>990.0</td>\n",
-       "      <td>[[-1.905391, -1.469124, -0.652625], [-0.866371...</td>\n",
-       "      <td>1.55913</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source  \\\n",
-       "0  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "1  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "2  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "3  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "\n",
-       "                                       route_section  charge  multiplicity  \\\n",
-       "0  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "1  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "2  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "3  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "\n",
-       "       energy  run_time                                  input_coordinates  \\\n",
-       "0 -400.826710     995.0  [[1.966178, 1.124096, -0.127534], [0.66917, 1....   \n",
-       "1 -522.304109    3716.0  [[3.793948, 0.51694, -0.021986], [2.578084, 0....   \n",
-       "2 -422.263499    1073.0  [[-1.503284, -1.357277, -0.49672], [-0.952133,...   \n",
-       "3 -454.045509     990.0  [[-1.905391, -1.469124, -0.652625], [-0.866371...   \n",
-       "\n",
-       "   dipole_au  \n",
-       "0    1.19932  \n",
-       "1    0.52795  \n",
-       "2    0.30215  \n",
-       "3    1.55913  "
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# just open and read the entire dataset (very practical with the DLPNO data, which is small), which will be slow with pandas\n",
-    "df = pd.read_parquet(\n",
-    "    DLPNO_DATABASE_FPATH,\n",
-    "    schema=DLPNO_SCHEMA,  # pandas will try and guess this on its own if you don't provide it - it gets it right, but is slower\n",
-    ")\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J cc-pvqz/c cc-pvqz-f12-cabs RIJCOSX NormalSCF NormalPNO',\n",
-       " array([array([ 1.966178,  1.124096, -0.127534]),\n",
-       "        array([0.66917 , 1.414874, 0.013796]),\n",
-       "        array([-0.144905,  0.22587 ,  0.479877]),\n",
-       "        array([-1.324741, -0.093627, -0.41136 ]),\n",
-       "        array([-2.652683,  0.470902, -0.158743]),\n",
-       "        array([-2.352086, -0.868398,  0.16531 ]),\n",
-       "        array([ 0.897242, -0.899951,  0.499408]),\n",
-       "        array([ 1.521856, -1.190401, -0.772729]),\n",
-       "        array([ 2.222622, -0.324108,  0.166081]),\n",
-       "        array([ 2.746526,  1.814837, -0.449243]),\n",
-       "        array([ 0.21828 ,  2.389503, -0.184036]),\n",
-       "        array([-0.532997,  0.393642,  1.499854]),\n",
-       "        array([-1.055263, -0.374274, -1.438704]),\n",
-       "        array([-3.350308,  0.617562, -0.992022]),\n",
-       "        array([-2.797654,  1.146153,  0.694767]),\n",
-       "        array([ 0.737028, -1.760108,  1.157542]),\n",
-       "        array([ 3.189127, -0.669057,  0.545677])], dtype=object)]"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# from here you can do all of your usual pandas manipulations\n",
-    "df.iloc[0][[\"route_section\", \"input_coordinates\"]].to_list()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>energy</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-400.826710</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-522.304109</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-422.263499</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-454.045509</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source      energy\n",
-       "0  /data1/groups/co2_capture/reactant_product_cal... -400.826710\n",
-       "1  /data1/groups/co2_capture/reactant_product_cal... -522.304109\n",
-       "2  /data1/groups/co2_capture/reactant_product_cal... -422.263499\n",
-       "3  /data1/groups/co2_capture/reactant_product_cal... -454.045509"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# you can reduce the amount of memory consumed by only loading the columns that you care about using columns=...\n",
-    "df = pd.read_parquet(DLPNO_DATABASE_FPATH, columns=[\"source\", \"energy\"])\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>route_section</th>\n",
-       "      <th>charge</th>\n",
-       "      <th>multiplicity</th>\n",
-       "      <th>energy</th>\n",
-       "      <th>run_time</th>\n",
-       "      <th>input_coordinates</th>\n",
-       "      <th>dipole_au</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-400.826710</td>\n",
-       "      <td>995.0</td>\n",
-       "      <td>[[1.966178, 1.124096, -0.127534], [0.66917, 1....</td>\n",
-       "      <td>1.19932</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-522.304109</td>\n",
-       "      <td>3716.0</td>\n",
-       "      <td>[[3.793948, 0.51694, -0.021986], [2.578084, 0....</td>\n",
-       "      <td>0.52795</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-422.263499</td>\n",
-       "      <td>1073.0</td>\n",
-       "      <td>[[-1.503284, -1.357277, -0.49672], [-0.952133,...</td>\n",
-       "      <td>0.30215</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-454.045509</td>\n",
-       "      <td>990.0</td>\n",
-       "      <td>[[-1.905391, -1.469124, -0.652625], [-0.866371...</td>\n",
-       "      <td>1.55913</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source  \\\n",
-       "0  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "1  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "2  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "3  /data1/groups/co2_capture/reactant_product_cal...   \n",
-       "\n",
-       "                                       route_section  charge  multiplicity  \\\n",
-       "0  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "1  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "2  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "3  uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...       0             2   \n",
-       "\n",
-       "       energy  run_time                                  input_coordinates  \\\n",
-       "0 -400.826710     995.0  [[1.966178, 1.124096, -0.127534], [0.66917, 1....   \n",
-       "1 -522.304109    3716.0  [[3.793948, 0.51694, -0.021986], [2.578084, 0....   \n",
-       "2 -422.263499    1073.0  [[-1.503284, -1.357277, -0.49672], [-0.952133,...   \n",
-       "3 -454.045509     990.0  [[-1.905391, -1.469124, -0.652625], [-0.866371...   \n",
-       "\n",
-       "   dipole_au  \n",
-       "0    1.19932  \n",
-       "1    0.52795  \n",
-       "2    0.30215  \n",
-       "3    1.55913  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# and you filter out specific rows _when reading_ the database to further reduce memory consumption (and speed things up)\n",
-    "# these statements can be complex, but the pandas docs explain it well:\n",
-    "# https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html\n",
-    "df = pd.read_parquet(\n",
-    "    DLPNO_DATABASE_FPATH,\n",
-    "    # SKIP rows where...\n",
-    "    filters=[\n",
-    "        [\n",
-    "            (  # multiplicity is equal to 1\n",
-    "                \"multiplicity\",\n",
-    "                \"=\",\n",
-    "                1,\n",
-    "            ),  # AND\n",
-    "            (  # energy is less than -500\n",
-    "                \"energy\",\n",
-    "                \"<\",\n",
-    "                -500,\n",
-    "            ),\n",
-    "        ],\n",
-    "        [  # OR\n",
-    "            (  # these two specific files\n",
-    "                \"source\",\n",
-    "                \"not in\",\n",
-    "                (\n",
-    "                    \"/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146857.log\",\n",
-    "                    \"/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146989.log\",\n",
-    "                ),\n",
-    "            ),\n",
-    "        ],\n",
-    "    ],\n",
-    ")\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>charge</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source  charge\n",
-       "0  /data1/groups/co2_capture/reactant_product_cal...       0\n",
-       "1  /data1/groups/co2_capture/reactant_product_cal...       0\n",
-       "2  /data1/groups/co2_capture/reactant_product_cal...       0\n",
-       "3  /data1/groups/co2_capture/reactant_product_cal...       0"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# you can also filter out based on one row without actually loading it\n",
-    "df = pd.read_parquet(DLPNO_DATABASE_FPATH, filters=[(\"run_time\", \"<\", 100)], columns=[\"source\", \"charge\"])\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (4, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>source</th><th>route_section</th><th>charge</th><th>multiplicity</th><th>energy</th><th>run_time</th><th>input_coordinates</th><th>dipole_au</th></tr><tr><td>str</td><td>str</td><td>u8</td><td>u8</td><td>f64</td><td>u32</td><td>list[list[f64]]</td><td>f32</td></tr></thead><tbody><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>&quot;uHF UNO DLPNO-CCSD(T)-F12D cc-…</td><td>0</td><td>2</td><td>-400.82671</td><td>995</td><td>[[1.966178, 1.124096, -0.127534], [0.66917, 1.414874, 0.013796], … [3.189127, -0.669057, 0.545677]]</td><td>1.19932</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>&quot;uHF UNO DLPNO-CCSD(T)-F12D cc-…</td><td>0</td><td>2</td><td>-522.304109</td><td>3716</td><td>[[3.793948, 0.51694, -0.021986], [2.578084, 0.203352, 0.838065], … [-1.554274, 2.973256, -0.233796]]</td><td>0.52795</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>&quot;uHF UNO DLPNO-CCSD(T)-F12D cc-…</td><td>0</td><td>2</td><td>-422.263499</td><td>1073</td><td>[[-1.503284, -1.357277, -0.49672], [-0.952133, -0.060015, 0.07242], … [2.290489, 0.453102, 0.719722]]</td><td>0.30215</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>&quot;uHF UNO DLPNO-CCSD(T)-F12D cc-…</td><td>0</td><td>2</td><td>-454.045509</td><td>990</td><td>[[-1.905391, -1.469124, -0.652625], [-0.866371, -0.836593, -0.159013], … [-0.069671, 2.402362, -0.805667]]</td><td>1.55913</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (4, 8)\n",
-       "┌─────────────┬─────────────┬────────┬────────────┬────────────┬──────────┬────────────┬───────────┐\n",
-       "│ source      ┆ route_secti ┆ charge ┆ multiplici ┆ energy     ┆ run_time ┆ input_coor ┆ dipole_au │\n",
-       "│ ---         ┆ on          ┆ ---    ┆ ty         ┆ ---        ┆ ---      ┆ dinates    ┆ ---       │\n",
-       "│ str         ┆ ---         ┆ u8     ┆ ---        ┆ f64        ┆ u32      ┆ ---        ┆ f32       │\n",
-       "│             ┆ str         ┆        ┆ u8         ┆            ┆          ┆ list[list[ ┆           │\n",
-       "│             ┆             ┆        ┆            ┆            ┆          ┆ f64]]      ┆           │\n",
-       "╞═════════════╪═════════════╪════════╪════════════╪════════════╪══════════╪════════════╪═══════════╡\n",
-       "│ /data1/grou ┆ uHF UNO DLP ┆ 0      ┆ 2          ┆ -400.82671 ┆ 995      ┆ [[1.966178 ┆ 1.19932   │\n",
-       "│ ps/co2_capt ┆ NO-CCSD(T)- ┆        ┆            ┆            ┆          ┆ ,          ┆           │\n",
-       "│ ure/reac…   ┆ F12D cc-…   ┆        ┆            ┆            ┆          ┆ 1.124096,  ┆           │\n",
-       "│             ┆             ┆        ┆            ┆            ┆          ┆ -0.12753…  ┆           │\n",
-       "│ /data1/grou ┆ uHF UNO DLP ┆ 0      ┆ 2          ┆ -522.30410 ┆ 3716     ┆ [[3.793948 ┆ 0.52795   │\n",
-       "│ ps/co2_capt ┆ NO-CCSD(T)- ┆        ┆            ┆ 9          ┆          ┆ , 0.51694, ┆           │\n",
-       "│ ure/reac…   ┆ F12D cc-…   ┆        ┆            ┆            ┆          ┆ -0.021986… ┆           │\n",
-       "│ /data1/grou ┆ uHF UNO DLP ┆ 0      ┆ 2          ┆ -422.26349 ┆ 1073     ┆ [[-1.50328 ┆ 0.30215   │\n",
-       "│ ps/co2_capt ┆ NO-CCSD(T)- ┆        ┆            ┆ 9          ┆          ┆ 4,         ┆           │\n",
-       "│ ure/reac…   ┆ F12D cc-…   ┆        ┆            ┆            ┆          ┆ -1.357277, ┆           │\n",
-       "│             ┆             ┆        ┆            ┆            ┆          ┆ -0.496…    ┆           │\n",
-       "│ /data1/grou ┆ uHF UNO DLP ┆ 0      ┆ 2          ┆ -454.04550 ┆ 990      ┆ [[-1.90539 ┆ 1.55913   │\n",
-       "│ ps/co2_capt ┆ NO-CCSD(T)- ┆        ┆            ┆ 9          ┆          ┆ 1,         ┆           │\n",
-       "│ ure/reac…   ┆ F12D cc-…   ┆        ┆            ┆            ┆          ┆ -1.469124, ┆           │\n",
-       "│             ┆             ┆        ┆            ┆            ┆          ┆ -0.652…    ┆           │\n",
-       "└─────────────┴─────────────┴────────┴────────────┴────────────┴──────────┴────────────┴───────────┘"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# you can run everything from above using polars as well, and in my experience it uses less memory and is faster\n",
-    "df = pl.read_parquet(DLPNO_DATABASE_FPATH)\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# a notable difference is that polars sets `memory_map=True` by default (pandas supports it, but is False and accessible via kwarg only)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (4, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>source</th><th>charge</th></tr><tr><td>str</td><td>u8</td></tr></thead><tbody><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>0</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>0</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>0</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>0</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (4, 2)\n",
-       "┌─────────────────────────────────┬────────┐\n",
-       "│ source                          ┆ charge │\n",
-       "│ ---                             ┆ ---    │\n",
-       "│ str                             ┆ u8     │\n",
-       "╞═════════════════════════════════╪════════╡\n",
-       "│ /data1/groups/co2_capture/reac… ┆ 0      │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ 0      │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ 0      │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ 0      │\n",
-       "└─────────────────────────────────┴────────┘"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# to pass filters to polars, you have to use the `pyarrow_options` argument (polars only supports limiting the number of rows in\n",
-    "# in sequential order via `n_rows`)\n",
-    "df = pl.read_parquet(\n",
-    "    DLPNO_DATABASE_FPATH,\n",
-    "    columns=[\"source\", \"charge\"],\n",
-    "    pyarrow_options=dict(\n",
-    "        filters=[(\"run_time\", \"<\", 100)],\n",
-    "        schema=DLPNO_SCHEMA,\n",
-    "    ),\n",
-    ")\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (4, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>source</th><th>energy</th></tr><tr><td>str</td><td>f64</td></tr></thead><tbody><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>-623.790852</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>-170.225353</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>-134.31814</td></tr><tr><td>&quot;/data1/groups/co2_capture/reac…</td><td>-515.612084</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (4, 2)\n",
-       "┌─────────────────────────────────┬─────────────┐\n",
-       "│ source                          ┆ energy      │\n",
-       "│ ---                             ┆ ---         │\n",
-       "│ str                             ┆ f64         │\n",
-       "╞═════════════════════════════════╪═════════════╡\n",
-       "│ /data1/groups/co2_capture/reac… ┆ -623.790852 │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ -170.225353 │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ -134.31814  │\n",
-       "│ /data1/groups/co2_capture/reac… ┆ -515.612084 │\n",
-       "└─────────────────────────────────┴─────────────┘"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# ...or just use polars other functions\n",
-    "df = (\n",
-    "    pl.scan_parquet(\n",
-    "        DLPNO_DATABASE_FPATH,\n",
-    "    )  # opens the file, but does not actually read it (LazyFrame)\n",
-    "    .filter(\n",
-    "        pl.col(\"run_time\") < 100,\n",
-    "    )  # sets up our filters, but still does not run the query\n",
-    "    .select(pl.col(\"source\"), pl.col(\"energy\"))\n",
-    "    .collect()  # actually runs the query\n",
-    ")\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>energy</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-623.790852</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-170.225353</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-134.318140</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/co2_capture/reactant_product_cal...</td>\n",
-       "      <td>-515.612084</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source      energy\n",
-       "0  /data1/groups/co2_capture/reactant_product_cal... -623.790852\n",
-       "1  /data1/groups/co2_capture/reactant_product_cal... -170.225353\n",
-       "2  /data1/groups/co2_capture/reactant_product_cal... -134.318140\n",
-       "3  /data1/groups/co2_capture/reactant_product_cal... -515.612084"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# the final option is to interact with the data via pyarrow directly, which takes all the same arguments as before but in a slightly\n",
-    "# different setup - this is the single fastest way to read the data\n",
-    "table = pq.ParquetDataset(DLPNO_DATABASE_FPATH, schema=DLPNO_SCHEMA, filters=[(\"run_time\", \"<\", 100)]).read(columns=[\"source\", \"energy\"])\n",
-    "df = table.to_pandas()\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## DFT\n",
-    "All of the above applies for the DFT data also - this will just demonstrate some of the fields that are specific to the DFT data, as well as limiting the number of rows (since this dataset is much bigger).\n",
-    "\n",
-    "This section will just use `pyarrow.parquet` since this dataset is more complicated and this library can do a lot more \"nuts and bolts\" interactions with the data.\n",
-    "Everything shown could _probably_ be done with pandas or polars, but not as easily."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# set this to match wherever you have the database file located\n",
-    "DFT_DATABASE_FPATH = \"QuantumPioneer_v1_DFT.parquet\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<pyarrow._parquet.FileMetaData object at 0x7f69ac9edc60>\n",
-       "  created_by: parquet-cpp-arrow version 16.1.0\n",
-       "  num_columns: 26\n",
-       "  num_rows: 453472\n",
-       "  num_row_groups: 886\n",
-       "  format_version: 2.6\n",
-       "  serialized_size: 3008687"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# look at the information about the database\n",
-    "f = pq.ParquetFile(DFT_DATABASE_FPATH)\n",
-    "f.metadata"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>source</th>\n",
-       "      <th>route_section</th>\n",
-       "      <th>charge</th>\n",
-       "      <th>multiplicity</th>\n",
-       "      <th>max_steps</th>\n",
-       "      <th>normal_termination</th>\n",
-       "      <th>cpu_time</th>\n",
-       "      <th>wall_time</th>\n",
-       "      <th>e0_h</th>\n",
-       "      <th>hf</th>\n",
-       "      <th>...</th>\n",
-       "      <th>aniso_polarizability_au</th>\n",
-       "      <th>iso_polarizability_au</th>\n",
-       "      <th>scf</th>\n",
-       "      <th>dipole_moment_debye</th>\n",
-       "      <th>frequencies</th>\n",
-       "      <th>mulliken_charges_summed</th>\n",
-       "      <th>frequency_modes</th>\n",
-       "      <th>xyz</th>\n",
-       "      <th>std_xyz</th>\n",
-       "      <th>std_forces</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...</td>\n",
-       "      <td>P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>100</td>\n",
-       "      <td>True</td>\n",
-       "      <td>2374</td>\n",
-       "      <td>614</td>\n",
-       "      <td>-382.633530</td>\n",
-       "      <td>-382.760525</td>\n",
-       "      <td>...</td>\n",
-       "      <td>18.4085</td>\n",
-       "      <td>62.8190</td>\n",
-       "      <td>[382.758210298, 382.759560141, 382.760381356, ...</td>\n",
-       "      <td>[1.8824, 1.7059, 0.6066]</td>\n",
-       "      <td>[131.7364, 188.0713, 194.6236, 263.776, 273.62...</td>\n",
-       "      <td>[[1.0, 0.234094], [2.0, -0.375229], [3.0, 0.37...</td>\n",
-       "      <td>[[[1.0, 6.0, -0.07, -0.1, -0.16], [2.0, 6.0, -...</td>\n",
-       "      <td>[[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],...</td>\n",
-       "      <td>[[[1.0, 6.0, 0.0, -0.175993, 1.912209, -0.1176...</td>\n",
-       "      <td>[[[1.0, 6.0, -2.303e-05, 0.000148725, 7.95e-06...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...</td>\n",
-       "      <td>P opt=(calcall,maxcycle=64,noeig,nomicro,carte...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>64</td>\n",
-       "      <td>True</td>\n",
-       "      <td>5603</td>\n",
-       "      <td>1415</td>\n",
-       "      <td>-319.766421</td>\n",
-       "      <td>-319.876823</td>\n",
-       "      <td>...</td>\n",
-       "      <td>37.2826</td>\n",
-       "      <td>58.9151</td>\n",
-       "      <td>[319.870024548, 319.873712193, 319.875876296, ...</td>\n",
-       "      <td>[-2.9536, 6.6691, 0.0]</td>\n",
-       "      <td>[125.7135, 225.1377, 243.6628, 275.6355, 307.2...</td>\n",
-       "      <td>[[1.0, 0.177478], [2.0, -0.11589], [3.0, -0.08...</td>\n",
-       "      <td>[[[1.0, 6.0, -0.0, -0.0, -0.02], [2.0, 6.0, -0...</td>\n",
-       "      <td>[[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]...</td>\n",
-       "      <td>[[[1.0, 6.0, 0.0, -2.050841, -0.610164, -0.000...</td>\n",
-       "      <td>[[[1.0, 6.0, 0.001360017, 0.000413491, -5.4464...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...</td>\n",
-       "      <td>P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>120</td>\n",
-       "      <td>True</td>\n",
-       "      <td>9251</td>\n",
-       "      <td>2448</td>\n",
-       "      <td>-423.099887</td>\n",
-       "      <td>-423.281993</td>\n",
-       "      <td>...</td>\n",
-       "      <td>14.4899</td>\n",
-       "      <td>74.3067</td>\n",
-       "      <td>[423.274519549, 423.277512792, 423.280067289, ...</td>\n",
-       "      <td>[0.6979, 0.8904, 0.3061]</td>\n",
-       "      <td>[61.6212, 178.6859, 229.9573, 253.2535, 273.82...</td>\n",
-       "      <td>[[1.0, 0.178076], [2.0, -0.425597], [3.0, 0.11...</td>\n",
-       "      <td>[[[1.0, 6.0, -0.08, 0.02, -0.05], [2.0, 6.0, 0...</td>\n",
-       "      <td>[[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]...</td>\n",
-       "      <td>[[[1.0, 6.0, 0.0, -0.459041, 2.197114, -0.2423...</td>\n",
-       "      <td>[[[1.0, 6.0, -4.106e-06, 1.1987e-05, -2.732e-0...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...</td>\n",
-       "      <td>P opt=(calcall,maxcycle=64,noeig,nomicro,carte...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>64</td>\n",
-       "      <td>True</td>\n",
-       "      <td>8237</td>\n",
-       "      <td>537</td>\n",
-       "      <td>-303.770502</td>\n",
-       "      <td>-303.893091</td>\n",
-       "      <td>...</td>\n",
-       "      <td>46.1818</td>\n",
-       "      <td>63.9089</td>\n",
-       "      <td>[303.886122623, 303.88988402, 303.892570737, 3...</td>\n",
-       "      <td>[3.0462, 0.8742, 0.0015]</td>\n",
-       "      <td>[128.1369, 165.4973, 216.2848, 249.2191, 292.2...</td>\n",
-       "      <td>[[1.0, 0.154953], [2.0, -0.118076], [3.0, -0.0...</td>\n",
-       "      <td>[[[1.0, 6.0, -0.0, 0.0, -0.03], [2.0, 6.0, -0....</td>\n",
-       "      <td>[[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927...</td>\n",
-       "      <td>[[[1.0, 6.0, 0.0, 1.994908, -0.843335, 0.00100...</td>\n",
-       "      <td>[[[1.0, 6.0, -0.000275241, 6.5551e-05, 1.2646e...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>4 rows × 26 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                              source  \\\n",
-       "0  /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...   \n",
-       "1  /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...   \n",
-       "2  /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...   \n",
-       "3  /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...   \n",
-       "\n",
-       "                                       route_section  charge  multiplicity  \\\n",
-       "0  P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...       0             2   \n",
-       "1  P opt=(calcall,maxcycle=64,noeig,nomicro,carte...       0             2   \n",
-       "2  P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...       0             2   \n",
-       "3  P opt=(calcall,maxcycle=64,noeig,nomicro,carte...       0             2   \n",
-       "\n",
-       "   max_steps  normal_termination  cpu_time  wall_time        e0_h          hf  \\\n",
-       "0        100                True      2374        614 -382.633530 -382.760525   \n",
-       "1         64                True      5603       1415 -319.766421 -319.876823   \n",
-       "2        120                True      9251       2448 -423.099887 -423.281993   \n",
-       "3         64                True      8237        537 -303.770502 -303.893091   \n",
-       "\n",
-       "   ...  aniso_polarizability_au  iso_polarizability_au  \\\n",
-       "0  ...                  18.4085                62.8190   \n",
-       "1  ...                  37.2826                58.9151   \n",
-       "2  ...                  14.4899                74.3067   \n",
-       "3  ...                  46.1818                63.9089   \n",
-       "\n",
-       "                                                 scf  \\\n",
-       "0  [382.758210298, 382.759560141, 382.760381356, ...   \n",
-       "1  [319.870024548, 319.873712193, 319.875876296, ...   \n",
-       "2  [423.274519549, 423.277512792, 423.280067289, ...   \n",
-       "3  [303.886122623, 303.88988402, 303.892570737, 3...   \n",
-       "\n",
-       "        dipole_moment_debye  \\\n",
-       "0  [1.8824, 1.7059, 0.6066]   \n",
-       "1    [-2.9536, 6.6691, 0.0]   \n",
-       "2  [0.6979, 0.8904, 0.3061]   \n",
-       "3  [3.0462, 0.8742, 0.0015]   \n",
-       "\n",
-       "                                         frequencies  \\\n",
-       "0  [131.7364, 188.0713, 194.6236, 263.776, 273.62...   \n",
-       "1  [125.7135, 225.1377, 243.6628, 275.6355, 307.2...   \n",
-       "2  [61.6212, 178.6859, 229.9573, 253.2535, 273.82...   \n",
-       "3  [128.1369, 165.4973, 216.2848, 249.2191, 292.2...   \n",
-       "\n",
-       "                             mulliken_charges_summed  \\\n",
-       "0  [[1.0, 0.234094], [2.0, -0.375229], [3.0, 0.37...   \n",
-       "1  [[1.0, 0.177478], [2.0, -0.11589], [3.0, -0.08...   \n",
-       "2  [[1.0, 0.178076], [2.0, -0.425597], [3.0, 0.11...   \n",
-       "3  [[1.0, 0.154953], [2.0, -0.118076], [3.0, -0.0...   \n",
-       "\n",
-       "                                     frequency_modes  \\\n",
-       "0  [[[1.0, 6.0, -0.07, -0.1, -0.16], [2.0, 6.0, -...   \n",
-       "1  [[[1.0, 6.0, -0.0, -0.0, -0.02], [2.0, 6.0, -0...   \n",
-       "2  [[[1.0, 6.0, -0.08, 0.02, -0.05], [2.0, 6.0, 0...   \n",
-       "3  [[[1.0, 6.0, -0.0, 0.0, -0.03], [2.0, 6.0, -0....   \n",
-       "\n",
-       "                                                 xyz  \\\n",
-       "0  [[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],...   \n",
-       "1  [[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]...   \n",
-       "2  [[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]...   \n",
-       "3  [[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927...   \n",
-       "\n",
-       "                                             std_xyz  \\\n",
-       "0  [[[1.0, 6.0, 0.0, -0.175993, 1.912209, -0.1176...   \n",
-       "1  [[[1.0, 6.0, 0.0, -2.050841, -0.610164, -0.000...   \n",
-       "2  [[[1.0, 6.0, 0.0, -0.459041, 2.197114, -0.2423...   \n",
-       "3  [[[1.0, 6.0, 0.0, 1.994908, -0.843335, 0.00100...   \n",
-       "\n",
-       "                                          std_forces  \n",
-       "0  [[[1.0, 6.0, -2.303e-05, 0.000148725, 7.95e-06...  \n",
-       "1  [[[1.0, 6.0, 0.001360017, 0.000413491, -5.4464...  \n",
-       "2  [[[1.0, 6.0, -4.106e-06, 1.1987e-05, -2.732e-0...  \n",
-       "3  [[[1.0, 6.0, -0.000275241, 6.5551e-05, 1.2646e...  \n",
-       "\n",
-       "[4 rows x 26 columns]"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# many rows in this data! just load some of them (see https://stackoverflow.com/a/69888274)\n",
-    "first_ten_rows = next(f.iter_batches(batch_size=64))\n",
-    "df = pa.Table.from_batches([first_ten_rows]).to_pandas()\n",
-    "df.head(4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "   multiplicity          hf\n",
-      "0             2 -382.760525\n",
-      "1             2 -319.876823\n",
-      "2             2 -423.281993\n",
-      "3             2 -303.893091\n"
-     ]
-    }
-   ],
-   "source": [
-    "# we can also apply the many filters from above on this data\n",
-    "table = pq.ParquetDataset(\n",
-    "    DFT_DATABASE_FPATH,\n",
-    "    schema=DFT_SCHEMA,\n",
-    "    filters=[(\"hf\", \"!=\", np.nan)],  # skip rows where hf is missing\n",
-    "    memory_map=True,  # reduce memory usage by delaying reads into memory\n",
-    ").read(columns=[\"multiplicity\", \"hf\"])\n",
-    "\n",
-    "for batch in table.to_batches(512):\n",
-    "    print(batch.to_pandas().head(4))\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0    [[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],...\n",
-      "1    [[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]...\n",
-      "2    [[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]...\n",
-      "3    [[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927...\n",
-      "Name: xyz, dtype: object\n"
-     ]
-    }
-   ],
-   "source": [
-    "# the last thing I will point to is the pyarrow scanner, which makes a lot of these operations easy too\n",
-    "# https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html\n",
-    "# This part of the pyarrow API is currently experimental, so it might change, but it is very useful\n",
-    "\n",
-    "ds = pa.dataset.dataset(DFT_DATABASE_FPATH, schema=DFT_SCHEMA)\n",
-    "s = pa.dataset.Scanner.from_dataset(\n",
-    "    ds,\n",
-    "    columns=[\"charge\", \"multiplicity\", \"xyz\"],\n",
-    "    filter=(pa.compute.field(\"hf\") != pa.compute.scalar(np.nan)),  # must use pyarrow.compute syntax instead of plain strings...\n",
-    "    batch_size=5_096,  # reduce this to fit in your memory limitations\n",
-    ")\n",
-    "for batch in s.to_batches():\n",
-    "    print(batch.to_pandas()['xyz'].head(4))\n",
-    "    break"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "quantumpioneer",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index fd878a3..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,37 +0,0 @@
-[build-system]
-requires = ["setuptools>=64"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "tristate20"
-version = "0.0.0a2"
-authors = [
-    { name = "Jackson Burns" },
-]
-license = { text = "MIT" }
-description = "TRISTATE20 Dataset"
-classifiers = [
-    "Programming Language :: Python :: 3",
-]
-urls = { Homepage = "https://github.com/QuantumPioneer/tristate20" }
-requires-python = ">=3.8"
-dependencies = ["pyarrow"]
-
-[project.optional-dependencies]
-dev = ["black", "isort", "pytest"]
-demos = ["jupyter"]
-
-[project.readme]
-file = "README.md"
-content-type = "text/markdown"
-
-[tool.isort]
-profile = "black"
-
-[tool.setuptools]
-include-package-data = true
-
-[tool.setuptools.packages.find]
-where = ["."]
-include = ["tristate20*"]
-exclude = ["docs*", "examples*", "test*"]
diff --git a/databases/generator.py b/scripts/generator.py
similarity index 100%
rename from databases/generator.py
rename to scripts/generator.py
diff --git a/databases/schema.py b/scripts/schema.py
similarity index 100%
rename from databases/schema.py
rename to scripts/schema.py