|
1 | | -# pyspark_datasources/jsonplaceholder.py |
2 | | - |
3 | 1 | from typing import Dict, Any, List, Iterator |
4 | 2 | import requests |
5 | 3 | from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition |
@@ -45,6 +43,29 @@ class JSONPlaceholderDataSource(DataSource): |
45 | 43 | Read specific item: |
46 | 44 |
|
47 | 45 | >>> spark.read.format("jsonplaceholder").option("endpoint", "posts").option("id", "1").load().show() |
| 46 | +
|
| 47 | + Referential Integrity |
| 48 | + --------------------- |
| 49 | + The data source supports joining related datasets: |
| 50 | +
|
| 51 | + 1. Posts and Users relationship: |
| 52 | + posts.userId = users.id |
| 53 | + >>> posts_df = spark.read.format("jsonplaceholder").option("endpoint", "posts").load() |
| 54 | + >>> users_df = spark.read.format("jsonplaceholder").option("endpoint", "users").load() |
| 55 | + >>> posts_with_authors = posts_df.join(users_df, posts_df.userId == users_df.id) |
| 56 | +
|
| 57 | + 2. Posts and Comments relationship: |
| 58 | + comments.postId = posts.id |
| 59 | + >>> comments_df = spark.read.format("jsonplaceholder").option("endpoint", "comments").load() |
| 60 | + >>> posts_with_comments = posts_df.join(comments_df, posts_df.id == comments_df.postId) |
| 61 | +
|
| 62 | + 3. Users, Albums and Photos relationship: |
| 63 | + albums.userId = users.id |
| 64 | + photos.albumId = albums.id |
| 65 | + >>> albums_df = spark.read.format("jsonplaceholder").option("endpoint", "albums").load() |
| 66 | + >>> photos_df = spark.read.format("jsonplaceholder").option("endpoint", "photos").load() |
| 67 | + >>> user_albums = users_df.join(albums_df, users_df.id == albums_df.userId) |
| 68 | + >>> user_photos = user_albums.join(photos_df, albums_df.id == photos_df.albumId) |
48 | 69 | """ |
49 | 70 |
|
50 | 71 | @classmethod |
|
0 commit comments