From d696ce5fc175fafadc922e7e243e8983fe448842 Mon Sep 17 00:00:00 2001 From: Yauhen Yavorski Date: Fri, 18 Apr 2025 05:57:47 +0200 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20script=20to=20generate?= =?UTF-8?q?=20image=20captions=20using=20BLIP=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generate_prompts.py | 78 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 generate_prompts.py diff --git a/generate_prompts.py b/generate_prompts.py new file mode 100644 index 0000000..c5e7396 --- /dev/null +++ b/generate_prompts.py @@ -0,0 +1,78 @@ +""" + +This script generates text prompts (captions) for images in a specified folder + +using a BLIP image-to-text pipeline. The generated prompts are saved as key-value +pairs in a JSON file, where the key is the image filename and the value is the +corresponding prompt + +Added as part of masters project Spring 2025 +""" + +from json import dump +from os import listdir, path + +from PIL import Image +from transformers import pipeline + + +def generate_prompts_for_folder(images_folder: str, output_file: str) -> None: + """ + Processes all images in `images_folder` to generate a text prompt (caption) + for each image using a BLIP image-to-text pipeline, and then saves them as + key-value pairs in a JSON file (image filename: prompt). + """ + captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") + + prompts_dict = {} + + for filename in listdir(images_folder): + if not filename.lower().endswith((".png", ".jpg", ".jpeg")): + continue # Skip non-image files. 
+ + image_path = path.join(images_folder, filename) + try: + image = Image.open(image_path).convert("RGB") + except Exception as e: + print(f"Error loading {image_path}: {e}") + continue + + try: + generated = captioner(image) + + prompt_text = generated[0][ + 'generated_text' + ] # Extract the generated text from the output + except Exception as e: + print(f"Error generating caption for {filename}: {e}") + prompt_text = "error generating caption" + + prompts_dict[filename] = prompt_text + print(f"Processed {filename}: {prompt_text}") + + with open(output_file, "w") as f: + dump(prompts_dict, f, indent=4) + print(f"Saved prompts to {output_file}") + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser( + description="Generate image captions for all images in a folder and save to a JSON file." + ) + parser.add_argument( + "--images_folder", + type=str, + required=True, + help="Path to the folder containing images", + ) + parser.add_argument( + "--output_file", + type=str, + default="image_prompts.json", + help="Path to the output JSON file", + ) + args = parser.parse_args() + + generate_prompts_for_folder(args.images_folder, args.output_file) From 46fc93c5b63fcf2194157c1d8161326fd8b0a983 Mon Sep 17 00:00:00 2001 From: Yauhen Yavorski Date: Fri, 18 Apr 2025 05:58:06 +0200 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=94=A7=20config:=20SLURM=20script=20f?= =?UTF-8?q?or=20generating=20prompts=20with=20specified=20dependencies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Promt.slump | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Promt.slump diff --git a/Promt.slump b/Promt.slump new file mode 100644 index 0000000..cc97500 --- /dev/null +++ b/Promt.slump @@ -0,0 +1,55 @@ +#!/bin/bash +#SBATCH --partition=GPUQ +#SBATCH --account=share-ie-idi +#SBATCH --time=9-99:99:99 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH 
--gres=gpu:a100:4 +#SBATCH --constraint="gpu40g|gpu80g" +#SBATCH --job-name="generate_prompts" +#SBATCH --output=generate_prompts.out +#SBATCH --mem=64G + +module purge +module --ignore_cache load foss/2022a +module --ignore_cache load Python/3.10.4-GCCcore-11.3.0 + +VENV_DIR=$(mktemp -d -t env-repaint-XXXXXXXXXX) +python3 -m venv "$VENV_DIR" +source "$VENV_DIR/bin/activate" + +python -m pip install --upgrade pip + +pip uninstall -y numpy opencv-python pillow scipy scikit-image # remove conflicting libs + +pip install --no-cache-dir --force-reinstall \ + numpy==1.23.5 \ + opencv-python==4.6.0.66 \ + pillow==9.3.0 \ + scipy==1.9.3 \ + scikit-image==0.19.3 \ + einops==0.6.0 \ + lmdb==1.3.0 \ + lpips==0.1.4 \ + PyYAML==6.0 \ + tensorboardX==2.5.1 \ + timm==0.6.12 \ + torch==1.13.0 \ + torchsummaryX==1.3.0 \ + torchvision==0.14.0 \ + tqdm \ + gradio==3.39.0 + +pip install Ninja +pip install tensorboard scikit-image +pip install -U torch torchvision +pip install ema-pytorch +pip install diffusers transformers accelerate scipy safetensors + +export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" + + +python ./generate_prompts.py + +deactivate +rm -rf "$VENV_DIR" From 34aac2954958d5b1d2ba5e5490c7fb779ec96291 Mon Sep 17 00:00:00 2001 From: Yauhen Yavorski Date: Fri, 18 Apr 2025 06:00:48 +0200 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=94=A7=20fix:=20update=20SLURM=20scri?= =?UTF-8?q?pt=20to=20specify=20images=20folder=20and=20output=20file=20for?= =?UTF-8?q?=20prompt=20generation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Promt.slump | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Promt.slump b/Promt.slump index cc97500..d46d2a8 100644 --- a/Promt.slump +++ b/Promt.slump @@ -49,7 +49,7 @@ pip install diffusers transformers accelerate scipy safetensors export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" -python ./generate_prompts.py +python generate_prompts.py --images_folder /path/to/images 
--output_file prompts.json deactivate rm -rf "$VENV_DIR" From e5885774855c4ac6327c55ae759ae3393b7898ec Mon Sep 17 00:00:00 2001 From: Yauhen Yavorski Date: Fri, 18 Apr 2025 06:05:39 +0200 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=94=A7=20fix:=20update=20SLURM=20scri?= =?UTF-8?q?pt=20account=20placeholder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Promt.slump | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Promt.slump b/Promt.slump index d46d2a8..a40b9e1 100644 --- a/Promt.slump +++ b/Promt.slump @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --partition=GPUQ -#SBATCH --account=share-ie-idi +#SBATCH --account= #SBATCH --time=9-99:99:99 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=4 From 04570d12d95d9a4d0a2e0590a4c090ac0b6db822 Mon Sep 17 00:00:00 2001 From: Yauhen Yavorski Date: Fri, 18 Apr 2025 06:05:56 +0200 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9A=20docs:=20update=20README=20to?= =?UTF-8?q?=20improve=20clarity=20and=20add=20prompt=20generation=20instru?= =?UTF-8?q?ctions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a244a4c..6810621 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,32 @@ # Flickr Diverse Faces - FDF + Flickr Diverse Faces (FDF) is a dataset with **1.5M faces** "in the wild". FDF has a large diversity in terms of facial pose, age, ethnicity, occluding objects, facial painting, and image background. 
-The dataset is designed for generative models for face anonymization, and it was released with the paper "*DeepPrivacy: A Generative Adversarial Network for Face Anonymization.* - +The dataset is designed for generative models for face anonymization, and it was released with the paper "_DeepPrivacy: A Generative Adversarial Network for Face Anonymization._ ![](media/header_im.jpg) The dataset was crawled from the website Flickr ([YFCC-100M dataset](http://projects.dfki.uni-kl.de/yfcc100m/)) and automatically annotated. Each face is annotated with **7 facial landmarks** (left/right ear, lef/right eye, left/right shoulder, and nose), and a **bounding box** of the face. [Our paper]() goes into more detail about the automatic annotation. - - ## Licenses + The images are collected from images in the YFCC-100M dataset and each image in our dataset is free to use for **academic** or **open source** projects. For each face, the corresponding original license is given in the metadata. Some of the images require giving proper credit to the original author, as well as indicating any changes that were made to the images. The original author is given in the metadata. The dataset contains images with the following licenses: - - [CC BY-NC-SA 2.0](https://creativecommons.org/licenses/by-nc-sa/2.0/): 623,598 Images (23.4 GB) - - [CC BY-SA 2.0](https://creativecommons.org/licenses/by-sa/2.0/): 199,502 Images 7.4 GB) + +- [CC BY-NC-SA 2.0](https://creativecommons.org/licenses/by-nc-sa/2.0/): 623,598 Images (23.4 GB) +- [CC BY-SA 2.0](https://creativecommons.org/licenses/by-sa/2.0/): 199,502 Images (7.4 GB) - [CC BY 2.0](https://creativecommons.org/licenses/by/2.0/): 352,961 Images (13.1 GB) - [CC BY-NC 2.0](https://creativecommons.org/licenses/by-nc/2.0/): 295,192 Images (10.9 GB) The FDF metadata is under [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0). 
## Citation + If you find this code or dataset useful, please cite the following: + ``` @InProceedings{10.1007/978-3-030-33720-9_44, author="Hukkel{\aa}s, H{\aa}kon isbn="978-3-030-33720-9" pip install wget, tqdm ``` -2. To download metadata, run (expects python 3.6+): +2. To download metadata, run (expects python 3.6+): ``` python download.py --target_directory data/fdf ``` 3. If you want to download including images: + ``` python download.py --target_directory data/fdf --download_images ``` - ## Metainfo + For each face in the dataset, follows the following metainfo: ``` @@ -68,7 +71,7 @@ For each face in the dataset, follows the following metainfo: "author": "flickr_username", "bounding_box": [], # List with 4 eleemnts [xmin, ymin, xmax, ymax] indicating the bounding box of the face in the FDF image. In range 0-1. "category": "validation", # validation or training set - "date_crawled": "2019-3-6", + "date_crawled": "2019-3-6", "date_taken": "2010-01-16 21:47:59.0", "date_uploaded": "2010-01-16", "landmark": [], # List with shape (7,2). Each row is (x0, y0) indicating the position of the landmark. Landmark order: [nose, r_eye, l_eye, r_ear, l_ear, r_shoulder, l_shoulder]. In range 0-1. @@ -84,23 +87,36 @@ For each face in the dataset, follows the following metainfo: } ``` +## Prompt Generation + +For tasks that require text prompts, we have added a separate script `generate_prompts.py`. This script can be run using the following command. The workload can be demanding so we urge users to submit it using the ['Promt.slump' script](https://www.hpc.ntnu.no/idun/documentation/running-jobs/). + +``` + python generate_prompts.py --images_folder /path/to/images --output_file prompts.json +``` + ## Statistics + #### Distribution of image licenses ![](media/license_pie_chart.png) ### Training vs Validation Percentage + There are 50,000 validation images, 1,421,253 training images. 
![](media/category_pie_chart.png) ### Original Face size -Each face in the original image has a resolution of minimum: + +Each face in the original image has a resolution of minimum: ![](media/face_size_chart.png) ## Citation + If you find the dataset useful, please cite the following: + ``` @InProceedings{10.1007/978-3-030-33720-9_44, author="Hukkel{\aa}s, H{\aa}kon