Skip to content

Commit 28840bf

Browse files
Add more tests and update docstring
1 parent da05a64 commit 28840bf

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

pyspark_datasources/jsonplaceholder.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# pyspark_datasources/jsonplaceholder.py
2-
31
from typing import Dict, Any, List, Iterator
42
import requests
53
from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
@@ -45,6 +43,29 @@ class JSONPlaceholderDataSource(DataSource):
4543
Read specific item:
4644
4745
>>> spark.read.format("jsonplaceholder").option("endpoint", "posts").option("id", "1").load().show()
46+
47+
Referential Integrity
48+
---------------------
49+
The data source supports joining related datasets:
50+
51+
1. Posts and Users relationship:
52+
posts.userId = users.id
53+
>>> posts_df = spark.read.format("jsonplaceholder").option("endpoint", "posts").load()
54+
>>> users_df = spark.read.format("jsonplaceholder").option("endpoint", "users").load()
55+
>>> posts_with_authors = posts_df.join(users_df, posts_df.userId == users_df.id)
56+
57+
2. Posts and Comments relationship:
58+
comments.postId = posts.id
59+
>>> comments_df = spark.read.format("jsonplaceholder").option("endpoint", "comments").load()
60+
>>> posts_with_comments = posts_df.join(comments_df, posts_df.id == comments_df.postId)
61+
62+
3. Users, Albums and Photos relationship:
63+
albums.userId = users.id
64+
photos.albumId = albums.id
65+
>>> albums_df = spark.read.format("jsonplaceholder").option("endpoint", "albums").load()
66+
>>> photos_df = spark.read.format("jsonplaceholder").option("endpoint", "photos").load()
67+
>>> user_albums = users_df.join(albums_df, users_df.id == albums_df.userId)
68+
>>> user_photos = user_albums.join(photos_df, albums_df.id == photos_df.albumId)
4869
"""
4970

5071
@classmethod

tests/test_data_sources.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ def test_jsonplaceholder_posts():
7272
assert posts_df.count() > 0 # Ensure we have some posts
7373

7474

75-
def test_jsonplaceholder_users():
75+
def test_jsonplaceholder_referential_integrity():
7676
from pyspark_datasources.jsonplaceholder import JSONPlaceholderDataSource
7777
spark.dataSource.register(JSONPlaceholderDataSource)
7878
users_df = spark.read.format("jsonplaceholder").option("endpoint", "users").load()
79-
assert users_df.count() > 0 # Ensure we have some users
79+
posts_df = spark.read.format("jsonplaceholder").option("endpoint", "posts").load()
80+
posts_with_authors = posts_df.join(users_df, posts_df.userId == users_df.id)
81+
assert posts_with_authors.count() > 0 # Ensure join is valid and we have posts with authors

0 commit comments

Comments
 (0)