diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml deleted file mode 100644 index 6c2b096..0000000 --- a/.github/workflows/CI.yml +++ /dev/null @@ -1,129 +0,0 @@ -name: Continuous Integration -on: - schedule: - - cron: "0 8 * * 1-5" - push: - branches: [main] - pull_request: - branches: [main] - workflow_dispatch: - -concurrency: - group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - check-formatting: - name: Check Formatting Errors - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Install Dependencies - run: | - python -m pip install pycodestyle autopep8 - python -m pip install . - - - name: Run pycodestyle - run: | - pycodestyle --statistics --count --max-line-length=150 --show-source . - - build-and-test: - needs: check-formatting - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - os: [ubuntu-latest, windows-latest, macos-latest] - - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -el {0} - name: ${{ matrix.os }} Python ${{ matrix.python-version }} Subtest - steps: - - uses: actions/checkout@v3 - - uses: mamba-org/setup-micromamba@main - with: - environment-name: temp - condarc: | - channels: - - defaults - - conda-forge - channel_priority: flexible - create-args: | - python=${{ matrix.python-version }} - - name: Install Dependencies - run: | - python -m pip install . - python -m pip install coverage pytest - - name: Run Tests - run: | - coverage run --source=. 
-m pytest -v - - name: Show Coverage - run: | - coverage report -m - - ci-report-status: - name: report CI status - needs: build-and-test - runs-on: ubuntu-latest - steps: - - run: | - result="${{ needs.build-and-test.result }}" - if [[ $result == "success" ]] ; then - exit 0 - else - exit 1 - fi - - check-for-new-release: - runs-on: ubuntu-latest - needs: build-and-test - steps: - - uses: actions/checkout@v3 - - name: Check PyPI version - uses: maybe-hello-world/pyproject-check-version@v3 - id: versioncheck - with: - pyproject-path: "./pyproject.toml" - - - name: Report Results - run: | - echo "New Release found? ${{ steps.versioncheck.outputs.local_version_is_higher }}" - echo "Local version: ${{ steps.versioncheck.outputs.local_version }}" - echo "Public version: ${{ steps.versioncheck.outputs.public_version }}" - outputs: - do_publish: ${{ steps.versioncheck.outputs.local_version_is_higher }} - - pypi-package: - name: Build and publish Python 🐍 distributions 📦 to PyPI - runs-on: ubuntu-latest - needs: [check-for-new-release, ci-report-status] - if: ${{ needs.check-for-new-release.outputs.do_publish == 'true' && github.ref == 'refs/heads/main' && github.repository == 'QuantumPioneer/tristate20'}} - steps: - - uses: actions/checkout@master - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - .
- - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - skip-existing: true - verbose: true - \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index aac7f39..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2022 Jackson Burns - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md index 61a65c2..892711f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,9 @@ -

tristate20

-

TRISTATE20 Dataset

+

QuantumPioneer

-

- tristate20logo -

-

- GitHub Repo Stars - PyPI - Downloads - PyPI - PyPI - License -

+## Data availability +The QuantumPioneer datasets are available for download on [Zenodo](). Python notebooks are provided with the datasets to demonstrate how to load them. -## Online Documentation -[Click here to read the documentation](https://QuantumPioneer.github.io/tristate20/) +## Data provenance +Log files generated by Gaussian and ORCA were parsed using [generator.py](https://github.com/QuantumPioneer/databases/blob/main/scripts/generator.py), which relies on the [FastLogfileParser](https://github.com/QuantumPioneer/FastLogfileParser) package. The resulting parquet files were further processed using the scripts in [scripts/qm\_results](https://github.com/QuantumPioneer/databases/blob/main/scripts/qm_results). Importantly, these scripts matched the atom-mapped SMILES to the respective data points. -## SuperCloud -Use this command to upgrade the installed version of `fastlogfileparser` (for initial development purposes only): -```bash -pip install --force-reinstall --no-cache-dir git+https://github.com/QuantumPioneer/FastLogfileParser.git@release/quantumpioneer_v1 -``` +Files generated by COSMOtherm were separately parsed and filtered to produce two master CSV files, one for transition states and the other for ground-state species. These CSV files were then split by solvent using the scripts in [scripts/solvation](https://github.com/QuantumPioneer/databases/blob/main/scripts/solvation). diff --git a/databases/__init__.py b/databases/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index f5efa7a..0000000 --- a/demo.ipynb +++ /dev/null @@ -1,1130 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `QuantumPioneer/databases` Demo\n", - "This notebook demonstrates how to interact with the `QuantumPioneer` databases.\n", - "\n", - "See inline comments for more information."
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# both pandas and polars can read parquet files - pick whichever you prefer!\n", - "# there are advantages and disadvantages to both\n", - "import pandas as pd\n", - "import polars as pl\n", - "\n", - "import pyarrow as pa\n", - "# this library interacts with the parquet format directly, and both pandas and polars can use it too\n", - "import pyarrow.parquet as pq\n", - "\n", - "# schema = layout of the database (what are the datatypes, etc.)\n", - "# the schema for the quantumpioneer databases are stored in databases.schema and vary depending on the type of data\n", - "from databases.schema import DLPNO_SCHEMA, DFT_SCHEMA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DLPNO" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# set this to match wherever you have the database file located\n", - "DLPNO_DATABASE_FPATH = \"QuantumPioneer_v1_DLPNO.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceroute_sectionchargemultiplicityenergyrun_timeinput_coordinatesdipole_au
0/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-400.826710995.0[[1.966178, 1.124096, -0.127534], [0.66917, 1....1.19932
1/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-522.3041093716.0[[3.793948, 0.51694, -0.021986], [2.578084, 0....0.52795
2/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-422.2634991073.0[[-1.503284, -1.357277, -0.49672], [-0.952133,...0.30215
3/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-454.045509990.0[[-1.905391, -1.469124, -0.652625], [-0.866371...1.55913
\n", - "
" - ], - "text/plain": [ - " source \\\n", - "0 /data1/groups/co2_capture/reactant_product_cal... \n", - "1 /data1/groups/co2_capture/reactant_product_cal... \n", - "2 /data1/groups/co2_capture/reactant_product_cal... \n", - "3 /data1/groups/co2_capture/reactant_product_cal... \n", - "\n", - " route_section charge multiplicity \\\n", - "0 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "1 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "2 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "3 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "\n", - " energy run_time input_coordinates \\\n", - "0 -400.826710 995.0 [[1.966178, 1.124096, -0.127534], [0.66917, 1.... \n", - "1 -522.304109 3716.0 [[3.793948, 0.51694, -0.021986], [2.578084, 0.... \n", - "2 -422.263499 1073.0 [[-1.503284, -1.357277, -0.49672], [-0.952133,... \n", - "3 -454.045509 990.0 [[-1.905391, -1.469124, -0.652625], [-0.866371... \n", - "\n", - " dipole_au \n", - "0 1.19932 \n", - "1 0.52795 \n", - "2 0.30215 \n", - "3 1.55913 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# just open and read the entire dataset (very practical with the DLPNO data, which is small), which will be slow with pandas\n", - "df = pd.read_parquet(\n", - " DLPNO_DATABASE_FPATH,\n", - " schema=DLPNO_SCHEMA, # pandas will try and guess this on its own if you don't provide it - it gets it right, but is slower\n", - ")\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J cc-pvqz/c cc-pvqz-f12-cabs RIJCOSX NormalSCF NormalPNO',\n", - " array([array([ 1.966178, 1.124096, -0.127534]),\n", - " array([0.66917 , 1.414874, 0.013796]),\n", - " array([-0.144905, 0.22587 , 0.479877]),\n", - " array([-1.324741, -0.093627, -0.41136 ]),\n", - " array([-2.652683, 0.470902, -0.158743]),\n", - 
" array([-2.352086, -0.868398, 0.16531 ]),\n", - " array([ 0.897242, -0.899951, 0.499408]),\n", - " array([ 1.521856, -1.190401, -0.772729]),\n", - " array([ 2.222622, -0.324108, 0.166081]),\n", - " array([ 2.746526, 1.814837, -0.449243]),\n", - " array([ 0.21828 , 2.389503, -0.184036]),\n", - " array([-0.532997, 0.393642, 1.499854]),\n", - " array([-1.055263, -0.374274, -1.438704]),\n", - " array([-3.350308, 0.617562, -0.992022]),\n", - " array([-2.797654, 1.146153, 0.694767]),\n", - " array([ 0.737028, -1.760108, 1.157542]),\n", - " array([ 3.189127, -0.669057, 0.545677])], dtype=object)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# from here you can do all of your usual pandas manipulations\n", - "df.iloc[0][[\"route_section\", \"input_coordinates\"]].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceenergy
0/data1/groups/co2_capture/reactant_product_cal...-400.826710
1/data1/groups/co2_capture/reactant_product_cal...-522.304109
2/data1/groups/co2_capture/reactant_product_cal...-422.263499
3/data1/groups/co2_capture/reactant_product_cal...-454.045509
\n", - "
" - ], - "text/plain": [ - " source energy\n", - "0 /data1/groups/co2_capture/reactant_product_cal... -400.826710\n", - "1 /data1/groups/co2_capture/reactant_product_cal... -522.304109\n", - "2 /data1/groups/co2_capture/reactant_product_cal... -422.263499\n", - "3 /data1/groups/co2_capture/reactant_product_cal... -454.045509" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# you can reduce the amount of memory consumed by only loading the columns that you care about using columns=...\n", - "df = pd.read_parquet(DLPNO_DATABASE_FPATH, columns=[\"source\", \"energy\"])\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceroute_sectionchargemultiplicityenergyrun_timeinput_coordinatesdipole_au
0/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-400.826710995.0[[1.966178, 1.124096, -0.127534], [0.66917, 1....1.19932
1/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-522.3041093716.0[[3.793948, 0.51694, -0.021986], [2.578084, 0....0.52795
2/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-422.2634991073.0[[-1.503284, -1.357277, -0.49672], [-0.952133,...0.30215
3/data1/groups/co2_capture/reactant_product_cal...uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ...02-454.045509990.0[[-1.905391, -1.469124, -0.652625], [-0.866371...1.55913
\n", - "
" - ], - "text/plain": [ - " source \\\n", - "0 /data1/groups/co2_capture/reactant_product_cal... \n", - "1 /data1/groups/co2_capture/reactant_product_cal... \n", - "2 /data1/groups/co2_capture/reactant_product_cal... \n", - "3 /data1/groups/co2_capture/reactant_product_cal... \n", - "\n", - " route_section charge multiplicity \\\n", - "0 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "1 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "2 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "3 uHF UNO DLPNO-CCSD(T)-F12D cc-pvtz-f12 def2/J ... 0 2 \n", - "\n", - " energy run_time input_coordinates \\\n", - "0 -400.826710 995.0 [[1.966178, 1.124096, -0.127534], [0.66917, 1.... \n", - "1 -522.304109 3716.0 [[3.793948, 0.51694, -0.021986], [2.578084, 0.... \n", - "2 -422.263499 1073.0 [[-1.503284, -1.357277, -0.49672], [-0.952133,... \n", - "3 -454.045509 990.0 [[-1.905391, -1.469124, -0.652625], [-0.866371... \n", - "\n", - " dipole_au \n", - "0 1.19932 \n", - "1 0.52795 \n", - "2 0.30215 \n", - "3 1.55913 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# and you filter out specific rows _when reading_ the database to further reduce memory consumption (and speed things up)\n", - "# these statements can be complex, but the pandas docs explain it well:\n", - "# https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html\n", - "df = pd.read_parquet(\n", - " DLPNO_DATABASE_FPATH,\n", - " # SKIP rows where...\n", - " filters=[\n", - " [\n", - " ( # multiplicity is equal to 1\n", - " \"multiplicity\",\n", - " \"=\",\n", - " 1,\n", - " ), # AND\n", - " ( # energy is less than -500\n", - " \"energy\",\n", - " \"<\",\n", - " -500,\n", - " ),\n", - " ],\n", - " [ # OR\n", - " ( # these two specific files\n", - " \"source\",\n", - " \"not in\",\n", - " (\n", - " 
\"/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146857.log\",\n", - " \"/data1/groups/co2_capture/reactant_product_calculation/ts_nho_round1/output/DLPNO_sp_f12/outputs/outputs_146/146989.log\",\n", - " ),\n", - " ),\n", - " ],\n", - " ],\n", - ")\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourcecharge
0/data1/groups/co2_capture/reactant_product_cal...0
1/data1/groups/co2_capture/reactant_product_cal...0
2/data1/groups/co2_capture/reactant_product_cal...0
3/data1/groups/co2_capture/reactant_product_cal...0
\n", - "
" - ], - "text/plain": [ - " source charge\n", - "0 /data1/groups/co2_capture/reactant_product_cal... 0\n", - "1 /data1/groups/co2_capture/reactant_product_cal... 0\n", - "2 /data1/groups/co2_capture/reactant_product_cal... 0\n", - "3 /data1/groups/co2_capture/reactant_product_cal... 0" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# you can also filter out based on one row without actually loading it\n", - "df = pd.read_parquet(DLPNO_DATABASE_FPATH, filters=[(\"run_time\", \"<\", 100)], columns=[\"source\", \"charge\"])\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (4, 8)
sourceroute_sectionchargemultiplicityenergyrun_timeinput_coordinatesdipole_au
strstru8u8f64u32list[list[f64]]f32
"/data1/groups/co2_capture/reac…"uHF UNO DLPNO-CCSD(T)-F12D cc-…02-400.82671995[[1.966178, 1.124096, -0.127534], [0.66917, 1.414874, 0.013796], … [3.189127, -0.669057, 0.545677]]1.19932
"/data1/groups/co2_capture/reac…"uHF UNO DLPNO-CCSD(T)-F12D cc-…02-522.3041093716[[3.793948, 0.51694, -0.021986], [2.578084, 0.203352, 0.838065], … [-1.554274, 2.973256, -0.233796]]0.52795
"/data1/groups/co2_capture/reac…"uHF UNO DLPNO-CCSD(T)-F12D cc-…02-422.2634991073[[-1.503284, -1.357277, -0.49672], [-0.952133, -0.060015, 0.07242], … [2.290489, 0.453102, 0.719722]]0.30215
"/data1/groups/co2_capture/reac…"uHF UNO DLPNO-CCSD(T)-F12D cc-…02-454.045509990[[-1.905391, -1.469124, -0.652625], [-0.866371, -0.836593, -0.159013], … [-0.069671, 2.402362, -0.805667]]1.55913
" - ], - "text/plain": [ - "shape: (4, 8)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ source ┆ route_secti ┆ charge ┆ multiplici ┆ energy ┆ run_time ┆ input_coor ┆ dipole_au β”‚\n", - "β”‚ --- ┆ on ┆ --- ┆ ty ┆ --- ┆ --- ┆ dinates ┆ --- β”‚\n", - "β”‚ str ┆ --- ┆ u8 ┆ --- ┆ f64 ┆ u32 ┆ --- ┆ f32 β”‚\n", - "β”‚ ┆ str ┆ ┆ u8 ┆ ┆ ┆ list[list[ ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ f64]] ┆ β”‚\n", - "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════β•ͺ════════β•ͺ════════════β•ͺ════════════β•ͺ══════════β•ͺ════════════β•ͺ═══════════║\n", - "β”‚ /data1/grou ┆ uHF UNO DLP ┆ 0 ┆ 2 ┆ -400.82671 ┆ 995 ┆ [[1.966178 ┆ 1.19932 β”‚\n", - "β”‚ ps/co2_capt ┆ NO-CCSD(T)- ┆ ┆ ┆ ┆ ┆ , ┆ β”‚\n", - "β”‚ ure/reac… ┆ F12D cc-… ┆ ┆ ┆ ┆ ┆ 1.124096, ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ -0.12753… ┆ β”‚\n", - "β”‚ /data1/grou ┆ uHF UNO DLP ┆ 0 ┆ 2 ┆ -522.30410 ┆ 3716 ┆ [[3.793948 ┆ 0.52795 β”‚\n", - "β”‚ ps/co2_capt ┆ NO-CCSD(T)- ┆ ┆ ┆ 9 ┆ ┆ , 0.51694, ┆ β”‚\n", - "β”‚ ure/reac… ┆ F12D cc-… ┆ ┆ ┆ ┆ ┆ -0.021986… ┆ β”‚\n", - "β”‚ /data1/grou ┆ uHF UNO DLP ┆ 0 ┆ 2 ┆ -422.26349 ┆ 1073 ┆ [[-1.50328 ┆ 0.30215 β”‚\n", - "β”‚ ps/co2_capt ┆ NO-CCSD(T)- ┆ ┆ ┆ 9 ┆ ┆ 4, ┆ β”‚\n", - "β”‚ ure/reac… ┆ F12D cc-… ┆ ┆ ┆ ┆ ┆ -1.357277, ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ -0.496… ┆ β”‚\n", - "β”‚ /data1/grou ┆ uHF UNO DLP ┆ 0 ┆ 2 ┆ -454.04550 ┆ 990 ┆ [[-1.90539 ┆ 1.55913 β”‚\n", - "β”‚ ps/co2_capt ┆ NO-CCSD(T)- ┆ ┆ ┆ 9 ┆ ┆ 1, ┆ β”‚\n", - "β”‚ ure/reac… ┆ F12D cc-… ┆ ┆ ┆ ┆ ┆ -1.469124, ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ -0.652… ┆ β”‚\n", - 
"β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# you can run everything from above using polars as well, and in my experience it uses less memory and is faster\n", - "df = pl.read_parquet(DLPNO_DATABASE_FPATH)\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# a notable difference is that polars sets `memory_map=True` by default (pandas supports it, but is False and accessible via kwarg only)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (4, 2)
sourcecharge
stru8
"/data1/groups/co2_capture/reac…0
"/data1/groups/co2_capture/reac…0
"/data1/groups/co2_capture/reac…0
"/data1/groups/co2_capture/reac…0
" - ], - "text/plain": [ - "shape: (4, 2)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ source ┆ charge β”‚\n", - "β”‚ --- ┆ --- β”‚\n", - "β”‚ str ┆ u8 β”‚\n", - "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ════════║\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ 0 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ 0 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ 0 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ 0 β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# to pass filters to polars, you have to use the `pyarrow_options` argument (polars only supports limiting the number of rows in\n", - "# in sequential order via `n_rows`)\n", - "df = pl.read_parquet(\n", - " DLPNO_DATABASE_FPATH,\n", - " columns=[\"source\", \"charge\"],\n", - " pyarrow_options=dict(\n", - " filters=[(\"run_time\", \"<\", 100)],\n", - " schema=DLPNO_SCHEMA,\n", - " ),\n", - ")\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (4, 2)
sourceenergy
strf64
"/data1/groups/co2_capture/reac…-623.790852
"/data1/groups/co2_capture/reac…-170.225353
"/data1/groups/co2_capture/reac…-134.31814
"/data1/groups/co2_capture/reac…-515.612084
" - ], - "text/plain": [ - "shape: (4, 2)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ source ┆ energy β”‚\n", - "β”‚ --- ┆ --- β”‚\n", - "β”‚ str ┆ f64 β”‚\n", - "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════║\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ -623.790852 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ -170.225353 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ -134.31814 β”‚\n", - "β”‚ /data1/groups/co2_capture/reac… ┆ -515.612084 β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# ...or just use polars other functions\n", - "df = (\n", - " pl.scan_parquet(\n", - " DLPNO_DATABASE_FPATH,\n", - " ) # opens the file, but does not actually read it (LazyFrame)\n", - " .filter(\n", - " pl.col(\"run_time\") < 100,\n", - " ) # sets up our filters, but still does not run the query\n", - " .select(pl.col(\"source\"), pl.col(\"energy\"))\n", - " .collect() # actually runs the query\n", - ")\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceenergy
0/data1/groups/co2_capture/reactant_product_cal...-623.790852
1/data1/groups/co2_capture/reactant_product_cal...-170.225353
2/data1/groups/co2_capture/reactant_product_cal...-134.318140
3/data1/groups/co2_capture/reactant_product_cal...-515.612084
\n", - "
" - ], - "text/plain": [ - " source energy\n", - "0 /data1/groups/co2_capture/reactant_product_cal... -623.790852\n", - "1 /data1/groups/co2_capture/reactant_product_cal... -170.225353\n", - "2 /data1/groups/co2_capture/reactant_product_cal... -134.318140\n", - "3 /data1/groups/co2_capture/reactant_product_cal... -515.612084" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# the final option is to interact with the data via pyarrow directly, which takes all the same arguments as before but in a slightly\n", - "# different setup - this is the single fastest way to read the data\n", - "table = pq.ParquetDataset(DLPNO_DATABASE_FPATH, schema=DLPNO_SCHEMA, filters=[(\"run_time\", \"<\", 100)]).read(columns=[\"source\", \"energy\"])\n", - "df = table.to_pandas()\n", - "df.head(4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DFT\n", - "All of the above applies for the DFT data also - this will just demonstrate some of the fields that are specific to the DFT data, as well as limiting the number of rows (since this dataset is much bigger).\n", - "\n", - "This section will just use `pyarrow.parquet` since this dataset is more complicated and this library can do a lot more \"nuts and bolts\" interactions with the data.\n", - "Everything shown could _probably_ be done with pandas or polars, but not as easily." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# set this to match wherever you have the database file located\n", - "DFT_DATABASE_FPATH = \"QuantumPioneer_v1_DFT.parquet\"" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - " created_by: parquet-cpp-arrow version 16.1.0\n", - " num_columns: 26\n", - " num_rows: 453472\n", - " num_row_groups: 886\n", - " format_version: 2.6\n", - " serialized_size: 3008687" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at the information about the database\n", - "f = pq.ParquetFile(DFT_DATABASE_FPATH)\n", - "f.metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceroute_sectionchargemultiplicitymax_stepsnormal_terminationcpu_timewall_timee0_hhf...aniso_polarizability_auiso_polarizability_auscfdipole_moment_debyefrequenciesmulliken_charges_summedfrequency_modesxyzstd_xyzstd_forces
0/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...02100True2374614-382.633530-382.760525...18.408562.8190[382.758210298, 382.759560141, 382.760381356, ...[1.8824, 1.7059, 0.6066][131.7364, 188.0713, 194.6236, 263.776, 273.62...[[1.0, 0.234094], [2.0, -0.375229], [3.0, 0.37...[[[1.0, 6.0, -0.07, -0.1, -0.16], [2.0, 6.0, -...[[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],...[[[1.0, 6.0, 0.0, -0.175993, 1.912209, -0.1176...[[[1.0, 6.0, -2.303e-05, 0.000148725, 7.95e-06...
1/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...P opt=(calcall,maxcycle=64,noeig,nomicro,carte...0264True56031415-319.766421-319.876823...37.282658.9151[319.870024548, 319.873712193, 319.875876296, ...[-2.9536, 6.6691, 0.0][125.7135, 225.1377, 243.6628, 275.6355, 307.2...[[1.0, 0.177478], [2.0, -0.11589], [3.0, -0.08...[[[1.0, 6.0, -0.0, -0.0, -0.02], [2.0, 6.0, -0...[[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]...[[[1.0, 6.0, 0.0, -2.050841, -0.610164, -0.000...[[[1.0, 6.0, 0.001360017, 0.000413491, -5.4464...
2/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...P opt=(calcfc,maxcycle=128,noeig,nomicro,carte...02120True92512448-423.099887-423.281993...14.489974.3067[423.274519549, 423.277512792, 423.280067289, ...[0.6979, 0.8904, 0.3061][61.6212, 178.6859, 229.9573, 253.2535, 273.82...[[1.0, 0.178076], [2.0, -0.425597], [3.0, 0.11...[[[1.0, 6.0, -0.08, 0.02, -0.05], [2.0, 6.0, 0...[[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]...[[[1.0, 6.0, 0.0, -0.459041, 2.197114, -0.2423...[[[1.0, 6.0, -4.106e-06, 1.1987e-05, -2.732e-0...
3/data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi...P opt=(calcall,maxcycle=64,noeig,nomicro,carte...0264True8237537-303.770502-303.893091...46.181863.9089[303.886122623, 303.88988402, 303.892570737, 3...[3.0462, 0.8742, 0.0015][128.1369, 165.4973, 216.2848, 249.2191, 292.2...[[1.0, 0.154953], [2.0, -0.118076], [3.0, -0.0...[[[1.0, 6.0, -0.0, 0.0, -0.03], [2.0, 6.0, -0....[[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927...[[[1.0, 6.0, 0.0, 1.994908, -0.843335, 0.00100...[[[1.0, 6.0, -0.000275241, 6.5551e-05, 1.2646e...
\n", - "

4 rows Γ— 26 columns

\n", - "
" - ], - "text/plain": [ - " source \\\n", - "0 /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi... \n", - "1 /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi... \n", - "2 /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi... \n", - "3 /data1/groups/RMG/Projects/Hao-Wei-Oscar-Yunsi... \n", - "\n", - " route_section charge multiplicity \\\n", - "0 P opt=(calcfc,maxcycle=128,noeig,nomicro,carte... 0 2 \n", - "1 P opt=(calcall,maxcycle=64,noeig,nomicro,carte... 0 2 \n", - "2 P opt=(calcfc,maxcycle=128,noeig,nomicro,carte... 0 2 \n", - "3 P opt=(calcall,maxcycle=64,noeig,nomicro,carte... 0 2 \n", - "\n", - " max_steps normal_termination cpu_time wall_time e0_h hf \\\n", - "0 100 True 2374 614 -382.633530 -382.760525 \n", - "1 64 True 5603 1415 -319.766421 -319.876823 \n", - "2 120 True 9251 2448 -423.099887 -423.281993 \n", - "3 64 True 8237 537 -303.770502 -303.893091 \n", - "\n", - " ... aniso_polarizability_au iso_polarizability_au \\\n", - "0 ... 18.4085 62.8190 \n", - "1 ... 37.2826 58.9151 \n", - "2 ... 14.4899 74.3067 \n", - "3 ... 46.1818 63.9089 \n", - "\n", - " scf \\\n", - "0 [382.758210298, 382.759560141, 382.760381356, ... \n", - "1 [319.870024548, 319.873712193, 319.875876296, ... \n", - "2 [423.274519549, 423.277512792, 423.280067289, ... \n", - "3 [303.886122623, 303.88988402, 303.892570737, 3... \n", - "\n", - " dipole_moment_debye \\\n", - "0 [1.8824, 1.7059, 0.6066] \n", - "1 [-2.9536, 6.6691, 0.0] \n", - "2 [0.6979, 0.8904, 0.3061] \n", - "3 [3.0462, 0.8742, 0.0015] \n", - "\n", - " frequencies \\\n", - "0 [131.7364, 188.0713, 194.6236, 263.776, 273.62... \n", - "1 [125.7135, 225.1377, 243.6628, 275.6355, 307.2... \n", - "2 [61.6212, 178.6859, 229.9573, 253.2535, 273.82... \n", - "3 [128.1369, 165.4973, 216.2848, 249.2191, 292.2... \n", - "\n", - " mulliken_charges_summed \\\n", - "0 [[1.0, 0.234094], [2.0, -0.375229], [3.0, 0.37... \n", - "1 [[1.0, 0.177478], [2.0, -0.11589], [3.0, -0.08... \n", - "2 [[1.0, 0.178076], [2.0, -0.425597], [3.0, 0.11... 
\n", - "3 [[1.0, 0.154953], [2.0, -0.118076], [3.0, -0.0... \n", - "\n", - " frequency_modes \\\n", - "0 [[[1.0, 6.0, -0.07, -0.1, -0.16], [2.0, 6.0, -... \n", - "1 [[[1.0, 6.0, -0.0, -0.0, -0.02], [2.0, 6.0, -0... \n", - "2 [[[1.0, 6.0, -0.08, 0.02, -0.05], [2.0, 6.0, 0... \n", - "3 [[[1.0, 6.0, -0.0, 0.0, -0.03], [2.0, 6.0, -0.... \n", - "\n", - " xyz \\\n", - "0 [[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],... \n", - "1 [[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]... \n", - "2 [[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]... \n", - "3 [[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927... \n", - "\n", - " std_xyz \\\n", - "0 [[[1.0, 6.0, 0.0, -0.175993, 1.912209, -0.1176... \n", - "1 [[[1.0, 6.0, 0.0, -2.050841, -0.610164, -0.000... \n", - "2 [[[1.0, 6.0, 0.0, -0.459041, 2.197114, -0.2423... \n", - "3 [[[1.0, 6.0, 0.0, 1.994908, -0.843335, 0.00100... \n", - "\n", - " std_forces \n", - "0 [[[1.0, 6.0, -2.303e-05, 0.000148725, 7.95e-06... \n", - "1 [[[1.0, 6.0, 0.001360017, 0.000413491, -5.4464... \n", - "2 [[[1.0, 6.0, -4.106e-06, 1.1987e-05, -2.732e-0... \n", - "3 [[[1.0, 6.0, -0.000275241, 6.5551e-05, 1.2646e... \n", - "\n", - "[4 rows x 26 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# many rows in this data! 
just load some of them (see https://stackoverflow.com/a/69888274)\n", - "first_ten_rows = next(f.iter_batches(batch_size=64))\n", - "df = pa.Table.from_batches([first_ten_rows]).to_pandas()\n", - "df.head(4)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " multiplicity hf\n", - "0 2 -382.760525\n", - "1 2 -319.876823\n", - "2 2 -423.281993\n", - "3 2 -303.893091\n" - ] - } - ], - "source": [ - "# we can also apply the many filters from above on this data\n", - "table = pq.ParquetDataset(\n", - " DFT_DATABASE_FPATH,\n", - " schema=DFT_SCHEMA,\n", - " filters=[(\"hf\", \"!=\", np.nan)], # skip rows where hf is missing\n", - " memory_map=True, # reduce memory usage by delaying reads into memory\n", - ").read(columns=[\"multiplicity\", \"hf\"])\n", - "\n", - "for batch in table.to_batches(512):\n", - " print(batch.to_pandas().head(4))\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 [[1.0, 6.0, 0.0, 0.178454, 1.90411, 0.117548],...\n", - "1 [[1.0, 6.0, 0.0, -2.04409, -0.600169, 0.01305]...\n", - "2 [[1.0, 6.0, 0.0, 0.243366, 2.083137, 0.689603]...\n", - "3 [[1.0, 6.0, 0.0, -1.37189, -1.175717, -0.07927...\n", - "Name: xyz, dtype: object\n" - ] - } - ], - "source": [ - "# the last thing I will point to is the pyarrow scanner, which makes a lot of these operations easy too\n", - "# https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html\n", - "# This part of the pyarrow API is currently experimental, so it might change, but it is very useful\n", - "\n", - "ds = pa.dataset.dataset(DFT_DATABASE_FPATH, schema=DFT_SCHEMA)\n", - "s = pa.dataset.Scanner.from_dataset(\n", - " ds,\n", - " columns=[\"charge\", \"multiplicity\", \"xyz\"],\n", - " filter=(pa.compute.field(\"hf\") != pa.compute.scalar(np.nan)), # must use 
pyarrow.compute syntax instead of plain strings...\n", - " batch_size=5_096, # reduce this to fit in your memory limitations\n", - ")\n", - "for batch in s.to_batches():\n", - " print(batch.to_pandas()['xyz'].head(4))\n", - " break" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "quantumpioneer", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index fd878a3..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,37 +0,0 @@ -[build-system] -requires = ["setuptools>=64"] -build-backend = "setuptools.build_meta" - -[project] -name = "tristate20" -version = "0.0.0a2" -authors = [ - { name = "Jackson Burns" }, -] -license = { text = "MIT" } -description = "TRISTATE20 Dataset" -classifiers = [ - "Programming Language :: Python :: 3", -] -urls = { Homepage = "https://github.com/QuantumPioneer/tristate20" } -requires-python = ">=3.8" -dependencies = ["pyarrow"] - -[project.optional-dependencies] -dev = ["black", "isort", "pytest"] -demos = ["jupyter"] - -[project.readme] -file = "README.md" -content-type = "text/markdown" - -[tool.isort] -profile = "black" - -[tool.setuptools] -include-package-data = true - -[tool.setuptools.packages.find] -where = ["."] -include = ["tristate20*"] -exclude = ["docs*", "examples*", "test*"] diff --git a/databases/generator.py b/scripts/generator.py similarity index 100% rename from databases/generator.py rename to scripts/generator.py diff --git a/databases/schema.py b/scripts/schema.py similarity index 100% rename from databases/schema.py rename to scripts/schema.py