Skip to content

Commit 0163102

Browse files
authored
fix: support new images schema by converting to legacy schema (#41)
fix: support new images schema by converting to legacy schema. See openfoodfacts/openfoodfacts-server#11818 for more details about the schema changes.
1 parent 944e17c commit 0163102

File tree

4 files changed

+123
-8
lines changed

4 files changed

+123
-8
lines changed

openfoodfacts_exports/exports/parquet/common.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pyarrow as pa
55
from huggingface_hub import HfApi
6+
from openfoodfacts.images import convert_to_legacy_schema
67
from pydantic import BaseModel, Field, model_validator
78

89
logger = logging.getLogger(__name__)
@@ -249,13 +250,15 @@ def parse_images(cls, data: dict) -> dict:
249250
key as the key and the image data as the value.
250251
251252
To make the schema compatible with Parquet, we convert these fields
252-
into a list of dictionaries with `key`, `imgid`, `rev`, `sizes`, `uploaded_t`,
253-
and `uploader` keys. We copy the image key (ex: `3`, `nutrition_fr`,...)
254-
from the original dictionary and add it as a field under the `key` key.
253+
into a list of dictionaries with `key`, `imgid`, `rev`, `sizes`,
254+
`uploaded_t`, and `uploader` keys. We copy the image key (ex: `3`,
255+
`nutrition_fr`,...) from the original dictionary and add it as a field
256+
under the `key` key.
255257
"""
256258
images = data.pop("images", None)
257259
data["images"] = []
258260
if images:
261+
images = convert_to_legacy_schema(images)
259262
for key, value in images.items():
260263
data["images"].append({"key": key, **value})
261264
return data

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ dependencies = [
1010
"huggingface-hub>=0.26.2",
1111
"minio>=7.2.10",
1212
"more-itertools>=10.5.0",
13-
"openfoodfacts>=2.2.0",
13+
"openfoodfacts==2.5.2",
1414
"orjson>=3.10.11",
1515
"pyarrow>=18.0.0",
1616
"pytz>=2024.2",

tests/unit/exports/test_parquet.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
BEAUTY_PRODUCT_SCHEMA,
99
BeautyProduct,
1010
)
11+
from openfoodfacts_exports.exports.parquet.common import Product
1112

1213

1314
class TestConvertJSONLToParquet:
@@ -21,3 +22,114 @@ def test_convert_jsonl_to_parquet_data_missing(self):
2122
schema=BEAUTY_PRODUCT_SCHEMA,
2223
dtype_map=BEAUTY_DTYPE_MAP,
2324
)
25+
26+
27+
PARSED_IMAGES_WITH_LEGACY_SCHEMA = [
28+
{
29+
"key": "1",
30+
"sizes": {
31+
"100": {"h": 100, "w": 56},
32+
"400": {"h": 400, "w": 225},
33+
"full": {"h": 3555, "w": 2000},
34+
},
35+
"uploaded_t": "1490702616",
36+
"uploader": "user1",
37+
},
38+
{
39+
"key": "nutrition_fr",
40+
"angle": None,
41+
"geometry": "0x0-0-0",
42+
"imgid": "1",
43+
"normalize": "0",
44+
"ocr": 1,
45+
"orientation": "0",
46+
"rev": "18",
47+
"sizes": {
48+
"100": {"h": 53, "w": 100},
49+
"200": {"h": 107, "w": 200},
50+
"400": {"h": 213, "w": 400},
51+
"full": {"h": 1093, "w": 2050},
52+
},
53+
"white_magic": "0",
54+
"x1": None,
55+
"x2": None,
56+
"y1": None,
57+
"y2": None,
58+
},
59+
]
60+
61+
62+
IMAGES_WITH_NEW_SCHEMA = {
63+
"uploaded": {
64+
"1": {
65+
"sizes": {
66+
"100": {
67+
"h": 100,
68+
"w": 56,
69+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/1.100.jpg",
70+
},
71+
"400": {
72+
"h": 400,
73+
"w": 225,
74+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/1.400.jpg",
75+
},
76+
"full": {
77+
"h": 3555,
78+
"w": 2000,
79+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/1.jpg",
80+
},
81+
},
82+
"uploaded_t": "1490702616",
83+
"uploader": "user1",
84+
},
85+
},
86+
"selected": {
87+
"nutrition": {
88+
"fr": {
89+
"imgid": "1",
90+
"rev": "18",
91+
"sizes": {
92+
"100": {
93+
"h": 53,
94+
"w": 100,
95+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/nutrition_fr.18.100.jpg",
96+
},
97+
"200": {
98+
"h": 107,
99+
"w": 200,
100+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/nutrition_fr.18.200.jpg",
101+
},
102+
"400": {
103+
"h": 213,
104+
"w": 400,
105+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/nutrition_fr.18.400.jpg",
106+
},
107+
"full": {
108+
"h": 1093,
109+
"w": 2050,
110+
"url": "https://images.openfoodfacts.org/images/products/326/385/950/6216/nutrition_fr.18.full.jpg",
111+
},
112+
},
113+
"generation": {
114+
"white_magic": "0",
115+
"x1": None,
116+
"x2": None,
117+
"y1": None,
118+
"y2": None,
119+
"normalize": "0",
120+
"ocr": 1,
121+
"orientation": "0",
122+
"angle": None,
123+
"geometry": "0x0-0-0",
124+
},
125+
},
126+
}
127+
},
128+
}
129+
130+
131+
class TestProduct:
132+
def test_parse_images(self):
133+
assert Product.parse_images({"images": IMAGES_WITH_NEW_SCHEMA}) == {
134+
"images": PARSED_IMAGES_WITH_LEGACY_SCHEMA
135+
}

uv.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)