Skip to content

Commit 41ee298

Browse files
committed
complete revise the architecture
1 parent f97d4e0 commit 41ee298

File tree

15 files changed

+943
-1112
lines changed

15 files changed

+943
-1112
lines changed

.gitignore

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,32 @@ Thumbs.db
3636
# Misc
3737
node_modules/
3838

39+
40+
# Below is for notebooks folder
41+
# Ignore Python cache and compiled files everywhere
42+
__pycache__/
43+
*.pyc
44+
*.pyo
45+
46+
# Ignore virtual environments at root or in notebooks/
47+
.venv/
48+
venv/
49+
env/
50+
notebooks/.venv/
51+
notebooks/venv/
52+
notebooks/env/
53+
notebooks/uv/
54+
55+
# Ignore marimo and Jupyter checkpoints in notebooks/
56+
notebooks/.ipynb_checkpoints/
57+
notebooks/.marimo_checkpoints/
58+
59+
# Ignore data/output folders and files in notebooks/
60+
notebooks/results/
61+
notebooks/output/
62+
notebooks/*.parquet
63+
notebooks/*.csv
64+
65+
notebooks/archived
66+
notebooks/.ruff_cache/
67+
.ruff_cache/

notebooks/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.13

notebooks/post_run_analysis.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
# /// script
2+
# requires-python = ">=3.13"
3+
# dependencies = [
4+
# "marimo",
5+
# "matplotlib==3.10.3",
6+
# "numpy==2.3.1",
7+
# "pandas==2.3.1",
8+
# "polars==1.31.0",
9+
# "pyarrow==20.0.0",
10+
# "pygam==0.9.1",
11+
# "scikit-learn==1.7.0",
12+
# "scipy==1.16.0",
13+
# "seaborn==0.13.2",
14+
# ]
15+
# ///
16+
17+
import marimo
18+
19+
__generated_with = "0.14.10"
20+
app = marimo.App(width="medium")
21+
22+
23+
@app.cell
24+
def _():
25+
import marimo as mo
26+
import polars as pl
27+
import pandas as pd
28+
import seaborn as sns
29+
30+
import numpy as np
31+
import matplotlib.pyplot as plt
32+
from sklearn.preprocessing import SplineTransformer
33+
from sklearn.linear_model import Ridge
34+
from sklearn.pipeline import make_pipeline
35+
return Ridge, SplineTransformer, make_pipeline, mo, np, pd, pl, plt, sns
36+
37+
38+
@app.cell
39+
def _(mo):
40+
mo.md(
41+
r"""
42+
## Abstract
43+
44+
This notebook shows how the premium rates are smoothed after being calculated by the Rust model.
45+
46+
The original intention was to use Rust for the analysis as well. However, at the time of writing, the compiled nature of the Rust language means it takes significant time to produce the output. On the other hand, Python reactive notebooks are more advanced and packed with features. Hence, Python is used for the post-run analysis.
47+
48+
After the premium rates are calculated by the Rust model, they are still jumpy at several model points. This notebook shows how the premium rates are smoothed using the common P-spline technique.
49+
50+
In practice, many actuaries perform this step manually with interpolation. However, this is not recommended because it is not reproducible and is labor-intensive.
51+
52+
## Crude premium rate
53+
54+
"""
55+
)
56+
return
57+
58+
59+
@app.cell
def _(pl):
    import os

    # Location of the projected output read below. The original development
    # path remains the default so existing runs are unaffected; set the
    # PROJECTED_DF_PATH environment variable to run the notebook on a machine
    # where that absolute Windows path does not exist.
    _projected_path = os.environ.get(
        "PROJECTED_DF_PATH",
        "D:/proj/term-sm-rust/results/first_test/run_0/projected_df.parquet",
    )

    # Load the DataFrame from the Parquet file
    df = pl.read_parquet(_projected_path)

    # Transform data to obtain premium rate: premium per 1000 of sum insured,
    # restricted to rows with t == 0 and averaged within each (age, term) pair.
    prem_rate_df = (
        df.lazy()
        .with_columns(
            (pl.col("prem_pp") / pl.col("sum_insured") * pl.lit(1000.0)).alias(
                "prem_rate"
            )
        )
        .filter(pl.col("t") == 0)
        .group_by(["age", "term"])
        .agg(pl.col("prem_rate").mean().alias("ave_prem_rate"))
        # group_by does not guarantee row order, so sort explicitly.
        .sort(["term", "age"])
        .collect()
    )

    # Last expression of the cell is what marimo renders as the cell output.
    prem_rate_df
    return (prem_rate_df,)
83+
84+
85+
@app.cell
86+
def _(mo):
87+
mo.md(r"""As observed from the scatter plots, there are several points that are not smoothed aka jumpy. We expect that the premium rate will gradually increase with positive slope as the age increases.""")
88+
return
89+
90+
91+
@app.cell
def _(plt, prem_rate_df, sns):
    # seaborn works on pandas objects, so hand it a pandas copy of the frame
    prem_rate_pd = prem_rate_df.to_pandas()

    # One line per term, crude average premium rate against age
    _fig, _ax = plt.subplots(figsize=(8, 5))
    sns.lineplot(
        data=prem_rate_pd,
        x="age",
        y="ave_prem_rate",
        hue="term",
        marker="o",
        ax=_ax,
    )
    _ax.set_xlabel("Age")
    _ax.set_ylabel("Average Premium Rate")
    _ax.set_title("Average Premium Rate by Age and Term")
    _ax.grid(True)
    _fig.tight_layout()
    plt.show()
    return (prem_rate_pd,)
106+
107+
108+
@app.cell
109+
def _(mo):
110+
mo.md(r"""## Smooth the crude premium rate""")
111+
return
112+
113+
114+
@app.cell
def _(Ridge, SplineTransformer, make_pipeline, np, plt, prem_rate_pd):
    # Fit one cubic spline regression of average premium rate on age per term,
    # plot each fitted curve over the crude points, and collect the fitted
    # value at every observed age in `smoothed_points` (one frame per term).
    plt.figure(figsize=(8, 5))

    # To store smoothed values
    smoothed_points = []

    for term, group in prem_rate_pd.groupby("term"):
        group = group.sort_values("age")
        X = group["age"].values.reshape(-1, 1)
        y = group["ave_prem_rate"].values

        group_smoothed = group.copy()

        _n_unique_ages = len(np.unique(X))
        if _n_unique_ages < 2:
            # SplineTransformer requires n_knots >= 2, so a term with a single
            # distinct age cannot be fitted. There is nothing to smooth in
            # that case: pass the crude rate through instead of crashing.
            group_smoothed["smoothed_prem_rate"] = y
            smoothed_points.append(group_smoothed)
            continue

        # Spline smoothing pipeline (degree=3 for cubic, at most 8 knots;
        # fewer when the term has fewer distinct ages).
        n_knots = min(8, _n_unique_ages)
        # NOTE(review): alpha=0.0 means no ridge penalty is applied, so the
        # smoothing comes only from the limited number of knots. scikit-learn
        # advises LinearRegression over Ridge for alpha=0 — confirm whether a
        # positive penalty was intended.
        model = make_pipeline(
            SplineTransformer(degree=3, n_knots=n_knots, include_bias=False),
            Ridge(alpha=0.0),
        )
        model.fit(X, y)

        # Predict at each observed age
        y_smooth = model.predict(X)

        # Store smoothed values
        group_smoothed["smoothed_prem_rate"] = y_smooth
        smoothed_points.append(group_smoothed)

        # Plot smooth curve (dense grid over the observed age range) and the
        # crude points it was fitted to
        XX = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)
        plt.plot(XX, model.predict(XX), label=f"Term {term}", linewidth=1)
        plt.scatter(X, y, s=20, alpha=0.5)

    plt.xlabel("Age")
    plt.ylabel("Average Premium Rate")
    plt.title("Smoothed Average Premium Rate by Age and Term (Spline)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    return (smoothed_points,)
156+
157+
158+
@app.cell
159+
def _(mo):
160+
mo.md(r"""After creating smooth curve for each term, we obtained values of the smooth curve""")
161+
return
162+
163+
164+
@app.cell
def _(pd, smoothed_points):
    # Stack the per-term smoothed frames and renumber the rows from zero
    _stacked = pd.concat(smoothed_points)
    smoothed_df = _stacked.reset_index(drop=True)

    smoothed_df
    return (smoothed_df,)
171+
172+
173+
@app.cell
def _(pl, smoothed_df):
    # Bring the pandas result back into Polars, ordered by term then age
    _ordered = pl.from_pandas(smoothed_df).sort(["term", "age"])

    # Within each term: next smoothed rate minus the current smoothed rate
    _rate = pl.col("smoothed_prem_rate")
    _step_to_next = (_rate.shift(-1) - _rate).over("term").alias("diff_to_next")

    smoothed_pl = _ordered.with_columns(_step_to_next)

    smoothed_pl
    return (smoothed_pl,)
191+
192+
193+
@app.cell
def _(pl, smoothed_pl):
    # Number of rows whose step to the next age drops by more than `tol`
    tol = 1e-4
    _drops_beyond_tol = pl.col("diff_to_next") < -tol
    negative_count = smoothed_pl.filter(_drops_beyond_tol).height
    negative_count
    return
200+
201+
202+
@app.cell
203+
def _(mo):
204+
mo.md(
205+
r"""
206+
As observed, there is still one point that has a negative slope. One option is to iterate over different subsets of variables to smooth out the curve.
207+
208+
However, I apply an objective rule instead: if the negative difference is within the tolerance, it is ignored. Otherwise, the rate from the previous period is carried forward so that the difference becomes 0.
209+
"""
210+
)
211+
return
212+
213+
214+
if __name__ == "__main__":
215+
app.run()

notebooks/pyproject.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[project]
2+
name = "notebooks"
3+
version = "0.1.0"
4+
description = "Post premium rate run analysis"
5+
readme = "README.md"
6+
requires-python = ">=3.13"
7+
dependencies = [
8+
"marimo>=0.14.10",
9+
"matplotlib>=3.10.3",
10+
"numpy>=2.3.1",
11+
"pandas>=2.3.1",
12+
"polars>=1.31.0",
13+
"pyarrow>=20.0.0",
14+
"seaborn>=0.13.2",
15+
]

0 commit comments

Comments
 (0)