Merge pull request #43 from Gradiant/release/v1.1.2_hotfix

tantalizer · web-flow · commit caa3ebae0f45 · 2020-01-09T18:01:01.000+01:00
Fix logging issues while OCRing raster PDF files
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,10 +1,14 @@
+1.1.2
+-----
+* Fix issue with logging while forcing OCR on PDF documents
+
 1.1.1
 -----
 
 * Update to tika 1.23
 * Add dockerhub image and update documentation on its use: https://hub.docker.com/r/gradiant/faro
 * Fix #32: logging duplicates
-* Fix #37 : fixing metadata when a list is extracted in some fields (dates and pages)	
+* Fix #37 : fixing metadata when a list is extracted in some fields (dates and pages)
 
 1.1.0
 -----
diff --git a/faro/io_parser.py b/faro/io_parser.py
@@ -72,6 +72,7 @@ def parse_file(file_path):
                     force_ocr = True
                 else:
                     filesize_chars_ratio = filesize / chars
+                    logger.debug("PDF filesize_chars_ratio: {:.2f}".format(filesize_chars_ratio))
                     if filesize_chars_ratio > pdf_ocr_ratio:
                         force_ocr = True
                         logger.debug('size: {}, chars: {}, ratio: {}'.format(
@@ -80,8 +81,8 @@ def parse_file(file_path):
                             filesize_chars_ratio))
 
                 if force_ocr:
+                    logger.info("performing OCR on PDF file: {}".format(file_path))
                     parsed['metadata']['ocr_parsing'] = True
-                    logger.info("PDF filesize_chars_ratio: {:.2f}...performing OCR".format(filesize_chars_ratio))
                     parsed_ocr_text = parser.from_file(
                         file_path,
                         service='text',