From d88c35f7b2f1cdf3f15be11a6392d821d223548e Mon Sep 17 00:00:00 2001 From: Matteo Date: Tue, 18 Feb 2025 17:47:33 +0100 Subject: [PATCH 1/2] save chunk, hash and files on database --- .vscode/settings.json | 5 +- .../controller/api/DuplicationController.java | 62 +++-- .../duplication/DuplicationService.java | 255 ++++++++++++++---- .../goofy/GoofyFiles/model/ChunkEntity.java | 82 ++++++ .../GoofyFiles/model/FileChunkEntity.java | 62 +++++ .../goofy/GoofyFiles/model/FileEntity.java | 70 +++++ .../repository/ChunkRepository.java | 16 ++ .../repository/FileChunkRepository.java | 8 + .../GoofyFiles/repository/FileRepository.java | 8 + 9 files changed, 493 insertions(+), 75 deletions(-) create mode 100644 java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java create mode 100644 java/src/main/java/com/goofy/GoofyFiles/model/FileChunkEntity.java create mode 100644 java/src/main/java/com/goofy/GoofyFiles/model/FileEntity.java create mode 100644 java/src/main/java/com/goofy/GoofyFiles/repository/ChunkRepository.java create mode 100644 java/src/main/java/com/goofy/GoofyFiles/repository/FileChunkRepository.java create mode 100644 java/src/main/java/com/goofy/GoofyFiles/repository/FileRepository.java diff --git a/.vscode/settings.json b/.vscode/settings.json index f54d20c..4eb66f3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,8 @@ "editor.formatOnSave": true, "editor.defaultFormatter": "esbenp.prettier-vscode", "java.compile.nullAnalysis.mode": "automatic", - "java.configuration.updateBuildConfiguration": "automatic" + "java.configuration.updateBuildConfiguration": "automatic", + "[java]": { + "editor.defaultFormatter": "redhat.java" + } } diff --git a/java/src/main/java/com/goofy/GoofyFiles/controller/api/DuplicationController.java b/java/src/main/java/com/goofy/GoofyFiles/controller/api/DuplicationController.java index 898ba6e..72b2b70 100644 --- a/java/src/main/java/com/goofy/GoofyFiles/controller/api/DuplicationController.java +++ b/java/src/main/java/com/goofy/GoofyFiles/controller/api/DuplicationController.java @@ -18,27 +18,49 @@ @RequestMapping("api/duplication") public class DuplicationController { - private final DuplicationService duplicationService; + private final DuplicationService duplicationService; - public DuplicationController(DuplicationService duplicationService) { - this.duplicationService = duplicationService; + public DuplicationController(DuplicationService duplicationService) { + this.duplicationService = duplicationService; + } + + @PostMapping("/analyze") + public ResponseEntity analyzeFile( + @RequestParam("file") MultipartFile file, + @RequestParam(value = "algorithm", defaultValue = "SHA256") HashingAlgorithm algorithm) { + try { + File tempFile = File.createTempFile("upload-", "-" + file.getOriginalFilename()); + file.transferTo(tempFile); + + Map result = duplicationService.analyzeFile(tempFile, algorithm); + + tempFile.delete(); + return ResponseEntity.ok(result); + } catch (IOException e) { + return ResponseEntity.internalServerError() + .body(Map.of("error", "Échec du traitement du fichier: " + e.getMessage())); } + } + + @PostMapping("/process") + public ResponseEntity processFile( + @RequestParam("file") MultipartFile file, + @RequestParam(value = "algorithm", defaultValue = "SHA256") HashingAlgorithm algorithm) { + try { + File tempFile = File.createTempFile("upload-", "-" + file.getOriginalFilename()); + file.transferTo(tempFile); + + Map result = duplicationService.processAndStoreFile( + tempFile, + file.getOriginalFilename(), + file.getSize(), + algorithm); - @PostMapping("/analyze") - public ResponseEntity analyzeFile( - @RequestParam("file") MultipartFile file, - @RequestParam(value = "algorithm", defaultValue = "SHA256") HashingAlgorithm algorithm) { - try { - File tempFile = File.createTempFile("upload-", "-" + file.getOriginalFilename()); - file.transferTo(tempFile); - - Map result = duplicationService.analyzeFile(tempFile, algorithm); - - tempFile.delete(); - return ResponseEntity.ok(result); - } catch (IOException e) { - return ResponseEntity.internalServerError() - .body(Map.of("error", "Échec du traitement du fichier: " + e.getMessage())); - } + tempFile.delete(); + return ResponseEntity.ok(result); + } catch (IOException e) { + return ResponseEntity.internalServerError() + .body(Map.of("error", "Échec du traitement et de l'enregistrement du fichier: " + e.getMessage())); } -} + } +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/duplication/DuplicationService.java b/java/src/main/java/com/goofy/GoofyFiles/duplication/DuplicationService.java index 1def8fc..b9b44b4 100644 --- a/java/src/main/java/com/goofy/GoofyFiles/duplication/DuplicationService.java +++ b/java/src/main/java/com/goofy/GoofyFiles/duplication/DuplicationService.java @@ -5,80 +5,227 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.digest.Blake3; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; import com.goofy.GoofyFiles.chunking.Chunk; import com.goofy.GoofyFiles.chunking.ChunkingService; +import com.goofy.GoofyFiles.model.ChunkEntity; +import com.goofy.GoofyFiles.model.FileChunkEntity; +import com.goofy.GoofyFiles.model.FileEntity; +import com.goofy.GoofyFiles.repository.ChunkRepository; +import com.goofy.GoofyFiles.repository.FileChunkRepository; +import com.goofy.GoofyFiles.repository.FileRepository; import com.google.common.hash.Hashing; @Service public class DuplicationService { - private static final Logger logger = LoggerFactory.getLogger(DuplicationService.class); - private final ChunkingService chunkingService; + private static final Logger logger = LoggerFactory.getLogger(DuplicationService.class); - public DuplicationService(ChunkingService chunkingService) { - this.chunkingService = chunkingService; + private final ChunkingService chunkingService; + private final FileRepository fileRepository; + private final ChunkRepository chunkRepository; + private final FileChunkRepository fileChunkRepository; + + /** + * Constructeur principal pour l'utilisation en production + */ + @Autowired + public DuplicationService( + ChunkingService chunkingService, + FileRepository fileRepository, + ChunkRepository chunkRepository, + FileChunkRepository fileChunkRepository) { + this.chunkingService = chunkingService; + this.fileRepository = fileRepository; + this.chunkRepository = chunkRepository; + this.fileChunkRepository = fileChunkRepository; + } + + /** + * Constructeur simplifié pour les tests + * Ne prend que le ChunkingService, les opérations de base de données ne seront + * pas disponibles + */ + public DuplicationService(ChunkingService chunkingService) { + this.chunkingService = chunkingService; + this.fileRepository = null; + this.chunkRepository = null; + this.fileChunkRepository = null; + } + + public Map analyzeFile(File file, HashingAlgorithm algorithm) throws IOException { + List chunks = chunkingService.chunkFile(file); + Map duplicates = new HashMap<>(); + + for (Chunk chunk : chunks) { + String hash = calculateHash(chunk.getData(), algorithm); + duplicates.merge(hash, 1, Integer::sum); + logger.debug("Chunk at position {} with size {} bytes has hash: {}", + chunk.getPosition(), chunk.getData().length, hash); } - public Map analyzeFile(File file, HashingAlgorithm algorithm) throws IOException { - List chunks = chunkingService.chunkFile(file); - Map duplicates = new HashMap<>(); - - for (Chunk chunk : chunks) { - String hash = calculateHash(chunk.getData(), algorithm); - duplicates.merge(hash, 1, Integer::sum); - logger.debug("Chunk at position {} with size {} bytes has hash: {}", - chunk.getPosition(), chunk.getData().length, hash); - } + // Filtrer les chunks qui apparaissent plus d'une fois (vous pouvez logguer ou + // utiliser ce résultat) + duplicates.entrySet().stream() + .filter(e -> e.getValue() > 1); + + long uniqueChunks = duplicates.size(); + long totalChunks = chunks.size(); + long duplicatedChunks = duplicates.entrySet().stream() + .filter(e -> e.getValue() > 1) + .count(); + + return Map.of( + "fileName", file.getName(), + "totalChunks", totalChunks, + "uniqueChunks", uniqueChunks, + "duplicatedChunks", duplicatedChunks, + "algorithm", algorithm.name(), + "duplicateDetails", duplicates.entrySet().stream() + .filter(e -> e.getValue() > 1) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue))); + } + + private String calculateHash(byte[] data, HashingAlgorithm algorithm) { + try { + switch (algorithm) { + case SHA1: + return Hashing.sha1().hashBytes(data).toString(); + case SHA256: + return Hashing.sha256().hashBytes(data).toString(); + case BLAKE3: + // Utilisation de Apache Commons Codec pour BLAKE3 + byte[] hashBytes = Blake3.hash(data); + return Hex.encodeHexString(hashBytes); + default: + throw new IllegalArgumentException("Algorithme de hachage non supporté: " + algorithm); + } + } catch (Exception e) { + throw new RuntimeException("Erreur lors du calcul du hash", e); + } + } + + @Transactional + public Map processAndStoreFile( + File file, + String fileName, + long fileSize, + HashingAlgorithm algorithm) throws IOException { + if (fileRepository == null || chunkRepository == null || fileChunkRepository == null) { + throw new UnsupportedOperationException( + "Cette méthode nécessite les repositories qui n'ont pas été injectés. " + + "Utilisez le constructeur avec tous les paramètres pour cette fonctionnalité."); + } - // Filtrer les chunks qui apparaissent plus d'une fois (vous pouvez logguer ou utiliser ce résultat) - duplicates.entrySet().stream() - .filter(e -> e.getValue() > 1); - - long uniqueChunks = duplicates.size(); - long totalChunks = chunks.size(); - long duplicatedChunks = duplicates.entrySet().stream() - .filter(e -> e.getValue() > 1) - .count(); - - return Map.of( - "fileName", file.getName(), - "totalChunks", totalChunks, - "uniqueChunks", uniqueChunks, - "duplicatedChunks", duplicatedChunks, - "algorithm", algorithm.name(), - "duplicateDetails", duplicates.entrySet().stream() - .filter(e -> e.getValue() > 1) - .collect(Collectors.toMap( - Map.Entry::getKey, - Map.Entry::getValue - )) - ); + // 1. Extraire le nom et l'extension + String name = fileName; + String extension = ""; + int lastDotIndex = fileName.lastIndexOf('.'); + if (lastDotIndex > 0) { + name = fileName.substring(0, lastDotIndex); + extension = fileName.substring(lastDotIndex + 1); } - private String calculateHash(byte[] data, HashingAlgorithm algorithm) { - try { - switch (algorithm) { - case SHA1: - return Hashing.sha1().hashBytes(data).toString(); - case SHA256: - return Hashing.sha256().hashBytes(data).toString(); - case BLAKE3: - // Utilisation de Apache Commons Codec pour BLAKE3 - byte[] hashBytes = Blake3.hash(data); - return Hex.encodeHexString(hashBytes); - default: - throw new IllegalArgumentException("Algorithme de hachage non supporté: " + algorithm); - } - } catch (Exception e) { - throw new RuntimeException("Erreur lors du calcul du hash", e); + // 2. Créer et sauvegarder l'entité de fichier + FileEntity fileEntity = new FileEntity(); + fileEntity.setName(name); + fileEntity.setExtension(extension); + fileEntity.setSize(fileSize); + fileEntity = fileRepository.save(fileEntity); + + // 3. Découper le fichier + List chunks = chunkingService.chunkFile(file); + + // Statistiques pour le résultat + int totalChunks = chunks.size(); + int duplicateChunks = 0; + int uniqueChunks = 0; + long savedStorage = 0; + + // 4. Traiter chaque chunk + for (Chunk chunk : chunks) { + String hash = calculateHash(chunk.getData(), algorithm); + + // Chercher si ce chunk existe déjà en base + Optional existingChunk; + switch (algorithm) { + case SHA1: + existingChunk = chunkRepository.findByHashSha1(hash); + break; + case SHA256: + existingChunk = chunkRepository.findByHashSha256(hash); + break; + case BLAKE3: + existingChunk = chunkRepository.findByHashBlake3(hash); + break; + default: + existingChunk = Optional.empty(); + } + + // Traiter le chunk (nouveau ou existant) + ChunkEntity chunkEntity; + if (existingChunk.isPresent()) { + chunkEntity = existingChunk.get(); + duplicateChunks++; + savedStorage += chunk.getOriginalSize(); + logger.info("Chunk dupliqué trouvé: {}", hash); + } else { + chunkEntity = new ChunkEntity(); + chunkEntity.setData(chunk.getData()); + + // Stocker le hash selon l'algorithme + switch (algorithm) { + case SHA1: + chunkEntity.setHashSha1(hash); + break; + case SHA256: + chunkEntity.setHashSha256(hash); + break; + case BLAKE3: + chunkEntity.setHashBlake3(hash); + break; } + + chunkEntity = chunkRepository.save(chunkEntity); + uniqueChunks++; + } + + // Créer la relation entre le fichier et le chunk + FileChunkEntity fileChunk = new FileChunkEntity(); + fileChunk.setFile(fileEntity); + fileChunk.setChunk(chunkEntity); + fileChunk.setPosition(chunk.getPosition()); + fileChunkRepository.save(fileChunk); } -} + + // 5. Préparer le résultat + Map result = new HashMap<>(); + result.put("fileId", fileEntity.getId()); + result.put("fileName", fileEntity.getName()); + result.put("extension", fileEntity.getExtension()); + result.put("fileSize", fileEntity.getSize()); + result.put("algorithm", algorithm.name()); + result.put("totalChunks", totalChunks); + result.put("uniqueChunks", uniqueChunks); + result.put("duplicateChunks", duplicateChunks); + result.put("savedStorage", savedStorage); + result.put("deduplicationRatio", totalChunks > 0 ? (double) duplicateChunks / totalChunks : 0); + + logger.info("Fichier traité: id={}, nom={}, chunks={}, uniques={}, doublons={}", + fileEntity.getId(), fileName, totalChunks, uniqueChunks, duplicateChunks); + + return result; + } +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java b/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java new file mode 100644 index 0000000..0fa0591 --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java @@ -0,0 +1,82 @@ +package com.goofy.GoofyFiles.model; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +import jakarta.persistence.*; + +@Entity +@Table(name = "chunk") +public class ChunkEntity { + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + @Lob + private byte[] data; + + @Column(name = "hash_sha_1") + private String hashSha1; + + @Column(name = "hash_sha_256") + private String hashSha256; + + @Column(name = "hash_blake3") + private String hashBlake3; + + @Column(name = "created_at") + private LocalDateTime createdAt = LocalDateTime.now(); + + @OneToMany(mappedBy = "chunk") + private List fileChunks = new ArrayList<>(); + + // Getters and setters + public Long getId() { + return id; + } + + public void setId(Long id) { + this.id = id; + } + + public byte[] getData() { + return data; + } + + public void setData(byte[] data) { + this.data = data; + } + + public String getHashSha1() { + return hashSha1; + } + + public void setHashSha1(String hashSha1) { + this.hashSha1 = hashSha1; + } + + public String getHashSha256() { + return hashSha256; + } + + public void setHashSha256(String hashSha256) { + this.hashSha256 = hashSha256; + } + + public String getHashBlake3() { + return hashBlake3; + } + + public void setHashBlake3(String hashBlake3) { + this.hashBlake3 = hashBlake3; + } + + public LocalDateTime getCreatedAt() { + return createdAt; + } + + public List getFileChunks() { + return fileChunks; + } +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/model/FileChunkEntity.java b/java/src/main/java/com/goofy/GoofyFiles/model/FileChunkEntity.java new file mode 100644 index 0000000..f6442c7 --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/model/FileChunkEntity.java @@ -0,0 +1,62 @@ +package com.goofy.GoofyFiles.model; + +import java.time.LocalDateTime; + +import jakarta.persistence.*; + +@Entity +@Table(name = "file_chunk") +public class FileChunkEntity { + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + @ManyToOne + @JoinColumn(name = "file_id") + private FileEntity file; + + @ManyToOne + @JoinColumn(name = "chunk_id") + private ChunkEntity chunk; + + private Integer position; + + @Column(name = "created_at") + private LocalDateTime createdAt = LocalDateTime.now(); + + public Long getId() { + return id; + } + + public void setId(Long id) { + this.id = id; + } + + public FileEntity getFile() { + return file; + } + + public void setFile(FileEntity file) { + this.file = file; + } + + public ChunkEntity getChunk() { + return chunk; + } + + public void setChunk(ChunkEntity chunk) { + this.chunk = chunk; + } + + public Integer getPosition() { + return position; + } + + public void setPosition(Integer position) { + this.position = position; + } + + public LocalDateTime getCreatedAt() { + return createdAt; + } +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/model/FileEntity.java b/java/src/main/java/com/goofy/GoofyFiles/model/FileEntity.java new file mode 100644 index 0000000..4b4f8b6 --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/model/FileEntity.java @@ -0,0 +1,70 @@ +package com.goofy.GoofyFiles.model; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +import jakarta.persistence.*; + +@Entity +@Table(name = "files") +public class FileEntity { + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + private String name; + private String extension; + private Long size; + + @Column(name = "created_at") + private LocalDateTime createdAt = LocalDateTime.now(); + + @OneToMany(mappedBy = "file", cascade = CascadeType.ALL) + private List fileChunks = new ArrayList<>(); + + public Long getId() { + return id; + } + + public void setId(Long id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getExtension() { + return extension; + } + + public void setExtension(String extension) { + this.extension = extension; + } + + public Long getSize() { + return size; + } + + public void setSize(Long size) { + this.size = size; + } + + public LocalDateTime getCreatedAt() { + return createdAt; + } + + public List getFileChunks() { + return fileChunks; + } + + public void addFileChunk(FileChunkEntity fileChunk) { + this.fileChunks.add(fileChunk); + fileChunk.setFile(this); + } +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/repository/ChunkRepository.java b/java/src/main/java/com/goofy/GoofyFiles/repository/ChunkRepository.java new file mode 100644 index 0000000..336ab34 --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/repository/ChunkRepository.java @@ -0,0 +1,16 @@ +package com.goofy.GoofyFiles.repository; + +import java.util.Optional; + +import org.springframework.data.jpa.repository.JpaRepository; + +import com.goofy.GoofyFiles.model.ChunkEntity; + +public interface ChunkRepository extends JpaRepository { + + Optional findByHashSha1(String hash); + + Optional findByHashSha256(String hash); + + Optional findByHashBlake3(String hash); +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/repository/FileChunkRepository.java b/java/src/main/java/com/goofy/GoofyFiles/repository/FileChunkRepository.java new file mode 100644 index 0000000..e1c672e --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/repository/FileChunkRepository.java @@ -0,0 +1,8 @@ +package com.goofy.GoofyFiles.repository; + +import org.springframework.data.jpa.repository.JpaRepository; + +import com.goofy.GoofyFiles.model.FileChunkEntity; + +public interface FileChunkRepository extends JpaRepository { +} \ No newline at end of file diff --git a/java/src/main/java/com/goofy/GoofyFiles/repository/FileRepository.java b/java/src/main/java/com/goofy/GoofyFiles/repository/FileRepository.java new file mode 100644 index 0000000..62b0557 --- /dev/null +++ b/java/src/main/java/com/goofy/GoofyFiles/repository/FileRepository.java @@ -0,0 +1,8 @@ +package com.goofy.GoofyFiles.repository; + +import org.springframework.data.jpa.repository.JpaRepository; + +import com.goofy.GoofyFiles.model.FileEntity; + +public interface FileRepository extends JpaRepository { +} \ No newline at end of file From 9dec75f51f51fd59b0f4ba0f31ea61461e2d78f5 Mon Sep 17 00:00:00 2001 From: Matteo Date: Tue, 18 Feb 2025 18:20:56 +0100 Subject: [PATCH 2/2] fix: save hexa to data --- java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java b/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java index 0fa0591..0d24ea4 100644 --- a/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java +++ b/java/src/main/java/com/goofy/GoofyFiles/model/ChunkEntity.java @@ -13,7 +13,7 @@ public class ChunkEntity { @GeneratedValue(strategy = GenerationType.IDENTITY) private Long id; - @Lob + @Column(columnDefinition = "bytea") private byte[] data; @Column(name = "hash_sha_1")