From 6f51fc0d4547840ad53280e225072783b0068f43 Mon Sep 17 00:00:00 2001 From: Dennis Moschina <45356478+DennisMoschina@users.noreply.github.com> Date: Mon, 16 Feb 2026 15:08:31 +0100 Subject: [PATCH] docs: update changelog and add documentation for getting started, data model, and API reference --- CHANGELOG.md | 4 +- README.md | 100 ++++++-------------------------- docs/README.md | 16 +++++ docs/api-reference.md | 125 ++++++++++++++++++++++++++++++++++++++++ docs/data-model.md | 61 ++++++++++++++++++++ docs/getting-started.md | 96 ++++++++++++++++++++++++++++++ 6 files changed, 318 insertions(+), 84 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/api-reference.md create mode 100644 docs/data-model.md create mode 100644 docs/getting-started.md diff --git a/CHANGELOG.md b/CHANGELOG.md index c87b7cb..2e3621d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ ## 0.0.2 * added access to raw microphone dataframes +* added a dedicated `docs/` documentation set (getting started, data model, API reference) +* updated `README.md` as a concise entry point and linked full docs ## 0.0.1 * Initial release of the Open Earable Python SDK. -* parse Open Earable data files \ No newline at end of file +* parse Open Earable data files diff --git a/README.md b/README.md index 6628afd..c3d8953 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,14 @@ # Open Earable Python -A Python toolkit for parsing and analyzing multi-sensor recordings from an OpenEarable device. The library provides pandas-friendly accessors for IMU, barometer, PPG, bone accelerometer, optical temperature, and microphone data, along with audio utilities. - -## Features -- Load `.oe` recordings into a single time-aligned pandas DataFrame. -- Convenient attribute and key-based accessors for grouped sensors and individual channels. -- Play or export microphone audio directly from notebooks. -- Export combined sensor data to CSV for downstream analysis. 
+Python toolkit for parsing and analyzing multi-sensor OpenEarable recordings. ## Installation -The package targets Python 3.9+. - -Once published to PyPI: ```bash pip install open-earable-python ``` -From source (for development): +For local development: ```bash git clone https://github.com/OpenEarable/open-earable-python.git @@ -27,86 +18,29 @@ source .venv/bin/activate pip install -e . ``` -## Quickstart -Load a recording and explore the combined DataFrame: +## Quick Example ```python from open_earable_python import SensorDataset -# Load a single .oe file -recording = SensorDataset("my_recording.oe") +dataset = SensorDataset("recording.oe") -# Time-indexed dataframe containing all available sensors -full_df = recording.get_dataframe() -print(full_df.head()) +# Combined time-indexed DataFrame of all parsed sensors +df = dataset.get_dataframe() -# Export to CSV -recording.save_csv("my_recording.csv") +# Per-sensor views +imu_df = dataset.imu.df +ppg_red = dataset.ppg["ppg.red"] +audio_df = dataset.get_audio_dataframe() ``` -### Sensor access patterns -Each sensor has an accessor exposing both grouped views and individual channels using attribute or key syntax. 
For IMU data: - -```python -imu = recording.imu - -# Full IMU dataframe (original column names retained) -imu.df # or imu.to_dataframe() -imu["acc.x"] # Column-style access - -# Accelerometer -imu.acc # Accelerometer dataframe -imu.acc["x"] # Accelerometer X channel -imu.acc["y"] -imu.acc["z"] +## Documentation -# Gyroscope -imu.gyro # Gyroscope dataframe -imu.gyro["x"] -imu.gyro["y"] -imu.gyro["z"] +- [Documentation index](docs/README.md) +- [Getting started](docs/getting-started.md) +- [Data model and sensor channels](docs/data-model.md) +- [API reference](docs/api-reference.md) -# Magnetometer -imu.mag # Magnetometer dataframe -imu.mag["x"] -imu.mag["y"] -imu.mag["z"] -``` - -PPG channels follow the same pattern: - -```python -ppg = recording.ppg -ppg.df # Full PPG dataframe -ppg["ppg.red"] # Column-style access -ppg["red"] # Channel shortcut -ppg.ir -ppg.green -ppg.ambient -``` +## License -### Working with multiple recordings -Load several files at once and iterate over them: - -```python -from open_earable_python.dataset import load_recordings - -paths = ["session1.oe", "session2.oe"] -recordings = load_recordings(paths) - -# Access a specific recording -first = recordings[0] -print(first.list_sensors()) -``` - -### Audio utilities -- `play_audio(sampling_rate=48000)`: play stereo microphone data in a Jupyter environment. -- `save_audio(path, sampling_rate=48000)`: export microphone audio to WAV. -- `get_audio_dataframe(sampling_rate=48000)`: return microphone PCM as a timestamp-indexed DataFrame (`mic.inner`, `mic.outer`). - -Example: - -```python -audio_df = recording.get_audio_dataframe() -print(audio_df.head()) -``` +MIT. See `LICENSE`. diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..52001eb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,16 @@ +# Open Earable Python Documentation + +`open-earable-python` parses `.oe` recordings into pandas DataFrames and exposes convenient accessors for OpenEarable sensor streams. 
+ +## Contents + +- [Getting started](getting-started.md) +- [Data model and sensor channels](data-model.md) +- [API reference](api-reference.md) + +## Package Scope + +- Parse binary OpenEarable streams into structured sensor samples. +- Build per-sensor and combined time-indexed DataFrames. +- Decode microphone PCM samples and export/play audio. +- Load one or multiple recordings with the same API. diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..d62c9cc --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,125 @@ +# API Reference + +## Package Exports + +```python +from open_earable_python import SensorDataset, load_recordings +``` + +## `SensorDataset` + +High-level API for loading and analyzing a single `.oe` recording. + +### Constructor + +```python +SensorDataset(filename: str, verbose: bool = False) +``` + +- `filename`: path to `.oe` file. +- `verbose`: enables parser diagnostic output. + +Parsing happens during initialization. + +### Attributes + +- `filename: str` source file path. +- `verbose: bool` parser verbosity flag. +- `parse_result: parser.ParseResult` raw parse output. +- `sensor_dfs: Dict[int, pandas.DataFrame]` per-SID DataFrames. +- `df: pandas.DataFrame` lazily built combined DataFrame. +- `audio_stereo: Optional[numpy.ndarray]` stereo audio frames (`int16`, shape `(N, 2)`). +- `audio_df: pandas.DataFrame` cached audio DataFrame. + +Sensor accessor attributes: + +- `dataset.imu` +- `dataset.barometer` +- `dataset.microphone` +- `dataset.ppg` +- `dataset.optical_temp` +- `dataset.bone_acc` + +Each accessor supports grouped and channel-level access (see data model docs). + +### Methods + +#### `parse() -> None` + +Re-parses the recording file and updates `parse_result`. + +#### `list_sensors() -> List[str]` + +Returns sensor names with non-empty DataFrames. + +#### `get_sensor_dataframe(name: str) -> pandas.DataFrame` + +Returns one sensor DataFrame by name. 
+ +- Valid names: `imu`, `barometer`, `microphone`, `ppg`, `optical_temp`, `bone_acc` +- Raises `KeyError` for unknown names. + +#### `get_dataframe() -> pandas.DataFrame` + +Builds and caches a merged DataFrame across all non-empty sensor streams. + +#### `get_audio_dataframe(sampling_rate: int = 48000) -> pandas.DataFrame` + +Returns timestamp-indexed audio DataFrame with columns: + +- `mic.inner` +- `mic.outer` + +Behavior: + +- Raises `ValueError` if `sampling_rate <= 0`. +- Returns empty DataFrame with expected columns if no mic packets exist. +- Caches by sampling rate. + +#### `export_csv() -> None` + +Writes combined DataFrame to `.csv` by delegating to `save_csv()`. + +#### `save_csv(path: str) -> None` + +Saves the combined DataFrame to CSV if `self.df` is non-empty. + +Call `get_dataframe()` first to ensure `self.df` is populated. + +#### `play_audio(sampling_rate: int = 48000) -> None` + +Plays audio in IPython/Jupyter via `IPython.display.Audio`. + +#### `save_audio(path: str, sampling_rate: int = 48000) -> None` + +Writes WAV audio with `scipy.io.wavfile.write`. + +## `load_recordings` + +```python +load_recordings(file_paths: Sequence[str]) -> List[SensorDataset] +``` + +Creates `SensorDataset` objects for existing files only. + +## Parser Module (`open_earable_python.parser`) + +Core classes and helpers for decoding binary packets: + +- `Parser`: stream parser over packetized binary data. +- `PayloadParser`: base parser interface. +- `SchemePayloadParser`: parser built from `SensorScheme`. +- `MicPayloadParser`: parser for microphone payloads. +- `ParseResult`: parse container with per-SID DataFrames and microphone artifacts. +- `interleaved_mic_to_stereo(samples)`: converts interleaved samples to stereo. +- `mic_packet_to_stereo_frames(packet, sampling_rate)`: timestamp + stereo frame conversion. 
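The `[outer, inner, ...]` input interleaving and `[inner, outer]` output order matter if you ever reimplement the conversion downstream. A rough sketch of what `interleaved_mic_to_stereo` plausibly does, based only on the documented orderings (an illustration, not the library's actual implementation):

```python
import numpy as np

def interleaved_to_stereo_sketch(samples):
    """Convert interleaved int16 mic samples [outer, inner, outer, inner, ...]
    into stereo frames of shape (N, 2), column order [inner, outer]."""
    samples = np.asarray(samples, dtype=np.int16)
    # Drop a trailing unpaired sample, if any
    samples = samples[: len(samples) - (len(samples) % 2)]
    outer = samples[0::2]
    inner = samples[1::2]
    return np.column_stack((inner, outer))

frames = interleaved_to_stereo_sketch(np.array([10, 1, 20, 2, 30, 3], dtype=np.int16))
# frames[:, 0] is the inner channel, frames[:, 1] the outer channel
```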
+ +## Scheme Module (`open_earable_python.scheme`) + +Defines sensor schema primitives: + +- `ParseType` enum +- `SensorComponentScheme` +- `SensorComponentGroupScheme` +- `SensorScheme` +- `build_default_sensor_schemes(sensor_sid)` diff --git a/docs/data-model.md b/docs/data-model.md new file mode 100644 index 0000000..59ae273 --- /dev/null +++ b/docs/data-model.md @@ -0,0 +1,61 @@ +# Data Model and Sensor Channels + +## Time Index + +All sensor DataFrames are indexed by `timestamp` in seconds (`float`), derived from packet timestamps in microseconds. + +## Sensor Streams + +`SensorDataset` defines these sensor streams: + +- `imu` (SID 0) +- `barometer` (SID 1) +- `microphone` (SID 2) +- `ppg` (SID 4) +- `optical_temp` (SID 6) +- `bone_acc` (SID 7) + +## Default Columns by Sensor + +- `imu`: `acc.x`, `acc.y`, `acc.z`, `gyro.x`, `gyro.y`, `gyro.z`, `mag.x`, `mag.y`, `mag.z` +- `barometer`: `barometer.temperature`, `barometer.pressure` +- `ppg`: `ppg.red`, `ppg.ir`, `ppg.green`, `ppg.ambient` +- `bone_acc`: `bone_acc.x`, `bone_acc.y`, `bone_acc.z` +- `optical_temp`: `optical_temp` +- `microphone`: `mic.inner`, `mic.outer` + +## Accessor Semantics + +Each sensor is exposed as a `_SensorAccessor` object: + +- `sensor.df` or `sensor.to_dataframe()` returns the full sensor DataFrame with original column names. +- Group columns are available as sub-DataFrames: + - `dataset.imu.acc` -> columns `x`, `y`, `z` + - `dataset.imu.gyro` -> columns `x`, `y`, `z` + - `dataset.imu.mag` -> columns `x`, `y`, `z` + - `dataset.ppg.ppg` -> columns `red`, `ir`, `green`, `ambient` +- Original columns remain directly accessible: + - `dataset.imu["acc.x"]` + - `dataset.ppg["ppg.red"]` + +## Combined DataFrame + +`get_dataframe()` merges all non-empty per-sensor DataFrames: + +- Creates a union of all sensor timestamps. +- Reindexes each sensor DataFrame onto that common index. +- Concatenates columns into one DataFrame. + +This preserves each stream while aligning them on time. 
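The three merge steps above can be sketched with plain pandas. This is a simplified illustration of the union-reindex-concatenate strategy using synthetic frames, not the library's exact code:

```python
import pandas as pd

# Two per-sensor frames with different timestamp indices (seconds)
imu = pd.DataFrame({"acc.x": [0.1, 0.2]},
                   index=pd.Index([0.00, 0.01], name="timestamp"))
baro = pd.DataFrame({"barometer.pressure": [1013.0]},
                    index=pd.Index([0.005], name="timestamp"))

# 1. Union of all sensor timestamps
common = imu.index.union(baro.index)

# 2. Reindex each stream onto the common index, 3. concatenate columns
combined = pd.concat([df.reindex(common) for df in (imu, baro)], axis=1)
# Rows where a stream has no sample at that timestamp hold NaN
```

Because each stream is only reindexed, no values are interpolated: a sensor's column is NaN at timestamps that belong to other streams.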
+ +## Microphone Data Details + +Microphone payloads are parsed as interleaved `int16` samples and converted to stereo frames: + +- Input interleaving: `[outer, inner, outer, inner, ...]` +- Output stereo columns/order: `[inner, outer]` + +The audio DataFrame generated by `get_audio_dataframe()` uses: + +- index: `timestamp` in seconds +- columns: `mic.inner`, `mic.outer` diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..d5eeb31 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,96 @@ +# Getting Started + +## Requirements + +- Python 3.9+ +- `numpy`, `pandas`, `scipy`, `ipython` (installed automatically with this package) + +## Installation + +```bash +pip install open-earable-python +``` + +From source: + +```bash +git clone https://github.com/OpenEarable/open-earable-python.git +cd open-earable-python +python -m venv .venv +source .venv/bin/activate +pip install -e . +``` + +## Load a Recording + +```python +from open_earable_python import SensorDataset + +dataset = SensorDataset("my_recording.oe") +``` + +`SensorDataset` parses the file immediately during initialization. + +## Work with Sensor Data + +```python +# Combined DataFrame (all available non-empty sensor streams) +df = dataset.get_dataframe() +print(df.head()) + +# List non-empty sensor streams +print(dataset.list_sensors()) + +# Access one sensor DataFrame directly +imu_df = dataset.get_sensor_dataframe("imu") +print(imu_df.columns) +``` + +## Access Channels via Accessors + +```python +# Full IMU DataFrame (columns: acc.x, acc.y, ...) 
+imu = dataset.imu.df + +# Group-level access (columns renamed to x, y, z) +acc = dataset.imu.acc +gyro = dataset.imu.gyro + +# Channel-level access +acc_x = dataset.imu.acc["x"] +mag_z = dataset.imu.mag.z +``` + +## Work with Audio + +```python +# Timestamp-indexed stereo audio DataFrame +audio_df = dataset.get_audio_dataframe() # default 48_000 Hz +print(audio_df.columns) # mic.inner, mic.outer + +# Save WAV +dataset.save_audio("recording.wav") + +# Play in Jupyter/IPython environments +dataset.play_audio() +``` + +## Export CSV + +```python +# Build combined DataFrame, then export it +dataset.get_dataframe() +dataset.save_csv("recording.csv") +``` + +`save_csv()` writes only if the combined DataFrame is already populated (for example after calling `get_dataframe()`). + +## Load Multiple Files + +```python +from open_earable_python import load_recordings + +recordings = load_recordings(["session1.oe", "session2.oe"]) +for rec in recordings: + print(rec.filename, rec.list_sensors()) +```
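A common follow-up when analyzing several sessions together is to stack the per-recording DataFrames under a session label. A sketch using synthetic frames in place of real `get_dataframe()` output (the column name follows the documented schema; the session keys are placeholders):

```python
import pandas as pd

# Stand-ins for dataset.get_dataframe() results from two recordings
session1 = pd.DataFrame({"acc.x": [0.1, 0.2]},
                        index=pd.Index([0.00, 0.01], name="timestamp"))
session2 = pd.DataFrame({"acc.x": [0.3]},
                        index=pd.Index([0.00], name="timestamp"))

# Stack with a 'session' level so rows stay attributable to their source file
stacked = pd.concat({"session1.oe": session1, "session2.oe": session2},
                    names=["session", "timestamp"])
# stacked now has a (session, timestamp) MultiIndex
```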