Actually, keep the original untouched

This commit is contained in:
shamoon 2024-09-06 18:12:41 -07:00
parent 9e59e02ffa
commit f892538fda
2 changed files with 14 additions and 5 deletions

View File

@ -423,10 +423,6 @@ Insurances/ # Insurances
Paperless will attempt to "clean" certain invalid PDFs with `qpdf` before processing if, for example, the mime_type
detection is incorrect. This can happen if the PDF is not properly formatted or contains errors.
!!! warning
This process will technically modify the document before processing.
## Celery Monitoring {#celery-monitoring}
The monitoring tool

View File

@ -532,6 +532,7 @@ class ConsumerPlugin(
) )
self.working_copy = Path(tempdir.name) / Path(self.filename) self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy) copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
self.unmodified_original = None
# Determine the parser class. # Determine the parser class.
@ -559,6 +560,14 @@ class ConsumerPlugin(
) )
mime_type = magic.from_file(self.working_copy, mime=True) mime_type = magic.from_file(self.working_copy, mime=True)
self.log.debug(f"Detected mime type after qpdf: {mime_type}") self.log.debug(f"Detected mime type after qpdf: {mime_type}")
# Save the original file for later
self.unmodified_original = (
Path(tempdir.name) / Path("uo") / Path(self.filename)
)
copy_file_with_basic_stats(
self.input_doc.original_file,
self.unmodified_original,
)
except Exception as e: except Exception as e:
self.log.error(f"Error attempting to clean PDF: {e}") self.log.error(f"Error attempting to clean PDF: {e}")
@ -712,7 +721,9 @@ class ConsumerPlugin(
self._write( self._write(
document.storage_type, document.storage_type,
self.working_copy, self.unmodified_original
if self.unmodified_original is not None
else self.working_copy,
document.source_path, document.source_path,
) )
@ -748,6 +759,8 @@ class ConsumerPlugin(
self.log.debug(f"Deleting file {self.working_copy}") self.log.debug(f"Deleting file {self.working_copy}")
self.input_doc.original_file.unlink() self.input_doc.original_file.unlink()
self.working_copy.unlink() self.working_copy.unlink()
if self.unmodified_original is not None: # pragma: no cover
self.unmodified_original.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join( shadow_file = os.path.join(