Compare commits

..

14 Commits

Author SHA1 Message Date
lemmi
27906df149 paperless_cmd.sh: use exec to run supervisord
Without exec, signals aren't passed through properly. Stopping the
container won't have any effect unless killed.

closes #1616
2022-09-20 07:23:04 -07:00
Paperless-ngx Translation Bot [bot]
b8e7f0b45f New Crowdin updates (#1607)
* bugfix: increase delay

partially reverts 86358d5561
re-implements 4fbabe43ea

Signed-off-by: Florian Brandes <florian.brandes@posteo.de>

* New translations messages.xlf (Finnish) [ci skip]

Signed-off-by: Florian Brandes <florian.brandes@posteo.de>
Co-authored-by: Florian Brandes <florian.brandes@posteo.de>
2022-09-16 15:45:17 -07:00
Trenton H
355b3fcb3d Fixes grammar in comment
Co-authored-by: Florian <florian.brandes@posteo.de>
2022-09-16 09:08:16 -07:00
Trenton Holmes
7aa0e5650b Updates how barcodes are detected, using pikepdf images, instead of converting each page to an image 2022-09-16 09:08:16 -07:00
Paperless-ngx Translation Bot [bot]
f9a0adc64e New Crowdin updates (#1580)
* New translations messages.xlf (French)
[ci skip]

* New translations messages.xlf (Finnish)
[ci skip]

* New translations messages.xlf (Finnish)
[ci skip]
2022-09-16 08:37:07 -07:00
shamoon
3b34aed64f Merge pull request #1605 from paperless-ngx/fix/1599-consume-permissions
Fix: Consume directory permissions were not updated
2022-09-16 08:31:41 -07:00
Trenton H
3a7cbd3a42 Fixes an issue where the consume directory wasn't included in the permissions fix at Docker entry 2022-09-16 07:52:33 -07:00
Trenton H
0e443ba017 Merge pull request #1596 from paperless-ngx/fix/1590-barcodes
Fix: Double barcode separation creates empty file
2022-09-15 14:18:01 -07:00
shamoon
8ed401aec1 Merge pull request #1591 from paperless-ngx/fix/1583-tika-str
Fix: Parsing Tika documents fails with AttributeError
2022-09-14 21:52:49 -07:00
Trenton Holmes
9ae847039b Fixes the separation of files by barcode, during the case where 2 barcodes appear back to back 2022-09-14 14:00:37 -07:00
Trenton Holmes
d4cb84ff76 Ensure the tika parse function gets a string, not a PathLike 2022-09-14 07:48:12 -07:00
shamoon
17ae2aacbf Merge pull request #1576 from paperless-ngx/fix/slow-classifier
Fix: Resolve issue with slow classifier
2022-09-13 11:57:58 -07:00
Michael Shamoon
82d03f2dc6 fix tag list vertical space 2022-09-13 11:54:25 -07:00
Trenton Holmes
3cf2aaf8ff Locks numpy version until an upstream fix or resolution to aarch64 classifier slowdown 2022-09-13 10:33:07 -07:00
14 changed files with 537 additions and 460 deletions

View File

@@ -33,6 +33,8 @@ redis = "*"
scikit-learn = "~=1.1"
# Pin this until piwheels is building 1.9 (see https://www.piwheels.org/project/scipy/)
scipy = "==1.8.1"
# https://github.com/paperless-ngx/paperless-ngx/issues/1364
numpy = "==1.22.3"
whitenoise = "~=6.2"
watchdog = "~=2.1"
whoosh="~=2.7"
@@ -51,7 +53,6 @@ concurrent-log-handler = "*"
"importlib-resources" = {version = "*", markers = "python_version < '3.9'"}
zipp = {version = "*", markers = "python_version < '3.9'"}
pyzbar = "*"
pdf2image = "*"
mysqlclient = "*"
setproctitle = "*"

50
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e187d1abccb2e393ef0fe452737dca7b19aca430117edccfe95160d1902faa21"
"sha256": "896665b8ff6d8a99af44b729c581033add1ba5cbd927723ef275649491c92a4f"
},
"pipfile-spec": 6,
"requires": {},
@@ -712,37 +712,57 @@
},
"numpy": {
"hashes": [
"sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676",
"sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4",
"sha256:17e5226674f6ea79e14e3b91bfbc153fdf3ac13f5cc54ee7bc8fdbe820a32da0",
"sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce",
"sha256:2bd879d3ca4b6f39b7770829f73278b7c5e248c91d538aab1e506c628353e47f",
"sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123",
"sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1",
"sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e",
"sha256:4f41f5bf20d9a521f8cab3a34557cd77b6f205ab2116651f12959714494268b0",
"sha256:5593f67e66dea4e237f5af998d31a43e447786b2154ba1ad833676c788f37cde",
"sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5",
"sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d",
"sha256:5e28cd64624dc2354a349152599e55308eb6ca95a13ce6a7d5679ebff2962913",
"sha256:633679a472934b1c20a12ed0c9a6c9eb167fbb4cb89031939bfd03dd9dbc62b8",
"sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a",
"sha256:806970e69106556d1dd200e26647e9bee5e2b3f1814f9da104a943e8d548ca38",
"sha256:806cc25d5c43e240db709875e947076b2826f47c2c340a5a2f36da5bb10c58d6",
"sha256:8247f01c4721479e482cc2f9f7d973f3f47810cbc8c65e38fd1bbd3141cc9842",
"sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab",
"sha256:8ebf7e194b89bc66b78475bd3624d92980fca4e5bb86dda08d677d786fefc414",
"sha256:8ecb818231afe5f0f568c81f12ce50f2b828ff2b27487520d85eb44c71313b9e",
"sha256:8f9d84a24889ebb4c641a9b99e54adb8cab50972f0166a3abc14c3b93163f074",
"sha256:909c56c4d4341ec8315291a105169d8aae732cfb4c250fbc375a1efb7a844f8f",
"sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75",
"sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168",
"sha256:9b83d48e464f393d46e8dd8171687394d39bc5abfe2978896b77dc2604e8635d",
"sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4",
"sha256:ac987b35df8c2a2eab495ee206658117e9ce867acf3ccb376a19e83070e69418",
"sha256:b78d00e48261fbbd04aa0d7427cf78d18401ee0abd89c7559bbf422e5b1c7d01",
"sha256:b8b97a8a87cadcd3f94659b4ef6ec056261fa1e1c3317f4193ac231d4df70215",
"sha256:bd5b7ccae24e3d8501ee5563e82febc1771e73bd268eef82a1e8d2b4d556ae66",
"sha256:bdc02c0235b261925102b1bd586579b7158e9d0d07ecb61148a1799214a4afd5",
"sha256:be6b350dfbc7f708d9d853663772a9310783ea58f6035eec649fb9c4371b5389",
"sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f",
"sha256:c403c81bb8ffb1c993d0165a11493fd4bf1353d258f6997b3ee288b0a48fce77",
"sha256:cf8c6aed12a935abf2e290860af8e77b26a042eb7f2582ff83dc7ed5f963340c",
"sha256:d98addfd3c8728ee8b2c49126f3c44c703e2b005d4a95998e2167af176a9e722",
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18",
"sha256:dc76bca1ca98f4b122114435f83f1fcf3c0fe48e4e6f660e07996abf2f53903c",
"sha256:dec198619b7dbd6db58603cd256e092bcadef22a796f778bf87f8592b468441d",
"sha256:df28dda02c9328e122661f399f7655cdcbcf22ea42daa3650a26bce08a187450",
"sha256:e603ca1fb47b913942f3e660a15e55a9ebca906857edfea476ae5f0fe9b457d5",
"sha256:ecfdd68d334a6b97472ed032b5b37a30d8217c097acfff15e8452c710e775524"
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62",
"sha256:ecfdd68d334a6b97472ed032b5b37a30d8217c097acfff15e8452c710e775524",
"sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe",
"sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430",
"sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802",
"sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"
],
"markers": "python_version >= '3.8'",
"version": "==1.23.2"
"index": "pypi",
"version": "==1.22.3"
},
"ocrmypdf": {
"hashes": [
@@ -768,14 +788,6 @@
"index": "pypi",
"version": "==2.5.2"
},
"pdf2image": {
"hashes": [
"sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65",
"sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb"
],
"index": "pypi",
"version": "==1.16.0"
},
"pdfminer.six": {
"hashes": [
"sha256:5a64c924410ac48501d6060b21638bf401db69f5b1bd57207df7fbc070ac8ae2",
@@ -1035,6 +1047,7 @@
},
"pyyaml": {
"hashes": [
"sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf",
"sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
"sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b",
"sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57",
@@ -1046,26 +1059,32 @@
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287",
"sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513",
"sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0",
"sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782",
"sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0",
"sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92",
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f",
"sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2",
"sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc",
"sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1",
"sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c",
"sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86",
"sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4",
"sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c",
"sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34",
"sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b",
"sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d",
"sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c",
"sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb",
"sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7",
"sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737",
"sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3",
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d",
"sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358",
"sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53",
"sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78",
"sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803",
"sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a",
"sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f",
"sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174",
"sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"
],
@@ -2241,6 +2260,7 @@
},
"pyyaml": {
"hashes": [
"sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf",
"sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293",
"sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b",
"sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57",
@@ -2252,26 +2272,32 @@
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287",
"sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513",
"sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0",
"sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782",
"sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0",
"sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92",
"sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f",
"sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2",
"sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc",
"sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1",
"sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c",
"sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86",
"sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4",
"sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c",
"sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34",
"sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b",
"sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d",
"sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c",
"sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb",
"sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7",
"sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737",
"sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3",
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d",
"sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358",
"sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53",
"sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78",
"sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803",
"sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a",
"sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f",
"sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174",
"sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"
],

View File

@@ -50,6 +50,7 @@ map_folders() {
# Export these so they can be used in docker-prepare.sh
export DATA_DIR="${PAPERLESS_DATA_DIR:-/usr/src/paperless/data}"
export MEDIA_ROOT_DIR="${PAPERLESS_MEDIA_ROOT:-/usr/src/paperless/media}"
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}
initialize() {
@@ -77,7 +78,11 @@ initialize() {
local export_dir="/usr/src/paperless/export"
for dir in "${export_dir}" "${DATA_DIR}" "${DATA_DIR}/index" "${MEDIA_ROOT_DIR}" "${MEDIA_ROOT_DIR}/documents" "${MEDIA_ROOT_DIR}/documents/originals" "${MEDIA_ROOT_DIR}/documents/thumbnails"; do
for dir in \
"${export_dir}" \
"${DATA_DIR}" "${DATA_DIR}/index" \
"${MEDIA_ROOT_DIR}" "${MEDIA_ROOT_DIR}/documents" "${MEDIA_ROOT_DIR}/documents/originals" "${MEDIA_ROOT_DIR}/documents/thumbnails" \
"${CONSUME_DIR}"; do
if [[ ! -d "${dir}" ]]; then
echo "Creating directory ${dir}"
mkdir "${dir}"
@@ -91,7 +96,11 @@ initialize() {
set +e
echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir}
for dir in "${export_dir}" "${DATA_DIR}" "${MEDIA_ROOT_DIR}"; do
for dir in \
"${export_dir}" \
"${DATA_DIR}" \
"${MEDIA_ROOT_DIR}" \
"${CONSUME_DIR}"; do
find "${dir}" -not \( -user paperless -and -group paperless \) -exec chown paperless:paperless {} +
done
set -e

View File

@@ -12,4 +12,4 @@ if [ "$(id -u)" == "$(id -u paperless)" ]; then
)
fi
/usr/local/bin/supervisord -c /etc/supervisord.conf "${rootless_args[@]}"
exec /usr/local/bin/supervisord -c /etc/supervisord.conf "${rootless_args[@]}"

View File

@@ -80,7 +80,7 @@ a {
}
.tags {
top: 0;
top: .2rem;
right: 0;
max-width: 80%;
row-gap: .2rem;

File diff suppressed because it is too large Load Diff

View File

@@ -345,7 +345,7 @@
<context context-type="sourcefile">src/app/app.component.ts</context>
<context context-type="linenumber">146</context>
</context-group>
<target state="translated">Début de l'envoi …</target>
<target state="translated">Début du téléversement...</target>
</trans-unit>
<trans-unit id="2173456130768795374" datatype="html">
<source>Paperless-ngx</source>

View File

@@ -3,12 +3,15 @@ import os
import shutil
import tempfile
from functools import lru_cache
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
from typing import List
from typing import Optional
from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -31,7 +34,7 @@ def supported_file_type(mime_type) -> bool:
return mime_type in supported_mime
def barcode_reader(image) -> List[str]:
def barcode_reader(image: Image) -> List[str]:
"""
Read any barcodes contained in image
Returns a list containing all found barcodes
@@ -98,21 +101,39 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
"""
Scan the provided pdf file for page separating barcodes
Returns a list of pagenumbers, which separate the file
Returns a PDF filepath and a list of pagenumbers,
which separate the file into new files
"""
separator_page_numbers = []
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if separator_barcode in current_barcodes:
separator_page_numbers.append(current_page_number)
return separator_page_numbers
pdf_filepath = None
mime_type = get_file_mime_type(filepath)
if supported_file_type(mime_type):
pdf_filepath = filepath
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
pdf = Pdf.open(pdf_filepath)
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
separator_page_numbers.append(page_num)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
)
return pdf_filepath, separator_page_numbers
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
@@ -122,47 +143,56 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
document_paths = []
if not pages_to_split_on:
logger.warning("No pages to split on!")
return document_paths
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
document_paths = []
logger.debug(f"Temp dir is {str(tempdir)}")
if not pages_to_split_on:
logger.warning("No pages to split on!")
else:
# go from the first page to the first separator page
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
documents = [x for x in documents if len(x)]
logger.debug(f"Split into {len(documents)} new documents")
# Write the new documents out
for doc_idx, document in enumerate(documents):
dst = Pdf.new()
for n, page in enumerate(pdf.pages):
if n < pages_to_split_on[0]:
dst.pages.append(page)
output_filename = f"{fname}_document_0.pdf"
dst.pages.extend(document)
output_filename = f"{fname}_document_{doc_idx}.pdf"
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths = [savepath]
document_paths.append(savepath)
# iterate through the rest of the document
for count, page_number in enumerate(pages_to_split_on):
logger.debug(f"Count: {str(count)} page_number: {str(page_number)}")
dst = Pdf.new()
try:
next_page = pages_to_split_on[count + 1]
except IndexError:
next_page = len(pdf.pages)
# skip the first page_number. This contains the barcode page
for page in range(page_number + 1, next_page):
logger.debug(
f"page_number: {str(page_number)} next_page: {str(next_page)}",
)
dst.pages.append(pdf.pages[page])
output_filename = f"{fname}_document_{str(count + 1)}.pdf"
logger.debug(f"pdf no:{str(count)} has {str(len(dst.pages))} pages")
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths.append(savepath)
logger.debug(f"Temp files are {str(document_paths)}")
return document_paths

View File

@@ -96,29 +96,13 @@ def consume_file(
# check for separators in current document
if settings.CONSUMER_ENABLE_BARCODES:
mime_type = barcodes.get_file_mime_type(path)
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
if not barcodes.supported_file_type(mime_type):
# if not supported, skip this routine
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
else:
separators = []
document_list = []
if mime_type == "image/tiff":
file_to_process = barcodes.convert_from_tiff_to_pdf(path)
else:
file_to_process = path
separators = barcodes.scan_file_for_separating_barcodes(file_to_process)
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
document_list = barcodes.separate_pages(file_to_process, separators)
document_list = barcodes.separate_pages(pdf_filepath, separators)
if document_list:
for n, document in enumerate(document_list):
@@ -134,15 +118,13 @@ def consume_file(
target_dir=path.parent,
)
# if we got here, the document was successfully split
# and can safely be deleted
if mime_type == "image/tiff":
# Remove the TIFF converted to PDF file
logger.debug(f"Deleting file {file_to_process}")
os.unlink(file_to_process)
# Remove the original file (new file is saved above)
logger.debug(f"Deleting file {path}")
os.unlink(path)
# Delete the PDF file which was split
os.remove(pdf_filepath)
# If the original was a TIFF, remove the original file as well
if str(pdf_filepath) != str(path):
logger.debug(f"Deleting file {path}")
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck

View File

@@ -13,22 +13,23 @@ from PIL import Image
class TestBarcode(DirectoriesMixin, TestCase):
SAMPLE_DIR = os.path.join(
os.path.dirname(__file__),
"samples",
)
BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
def test_barcode_reader(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT.png",
)
test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png")
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pbm",
)
img = Image.open(test_file)
@@ -37,9 +38,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion.png",
)
img = Image.open(test_file)
@@ -48,9 +47,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion2.png",
)
img = Image.open(test_file)
@@ -59,9 +56,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_unreadable(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-unreadable.png",
)
img = Image.open(test_file)
@@ -69,9 +64,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_qr(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"qr-code-PATCHT.png",
)
img = Image.open(test_file)
@@ -80,9 +73,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_128(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-PATCHT.png",
)
img = Image.open(test_file)
@@ -90,15 +81,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png")
test_file = os.path.join(self.SAMPLE_DIR, "simple.png")
img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.png",
)
img = Image.open(test_file)
@@ -106,9 +95,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_qr_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.png",
)
img = Image.open(test_file)
@@ -116,9 +103,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_128_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
img = Image.open(test_file)
@@ -126,19 +111,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_get_mime_type(self):
tiff_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.tiff",
)
pdf_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
png_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
@@ -173,8 +154,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_convert_error_from_pdf_to_pdf(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
@@ -183,117 +163,155 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes2(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_separating_barcodes3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes4(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"several-patcht-codes.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [2, 5])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle_reverse.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-qr.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_separate_pages(self):
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.separate_pages(test_file, [1])
self.assertEqual(len(pages), 2)
def test_separate_pages_double_code(self):
"""
GIVEN:
- Input PDF with two patch code pages in a row
WHEN:
- The input file is split
THEN:
- Only two files are output
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
"patch-code-t-double.pdf",
)
pages = barcodes.separate_pages(test_file, [1])
pages = barcodes.separate_pages(test_file, [1, 2])
self.assertEqual(len(pages), 2)
def test_separate_pages_no_list(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
@@ -308,9 +326,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -320,9 +336,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
nonexistingdir = "/nowhere"
@@ -340,9 +354,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -352,31 +364,36 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_splitter(self):
    """
    GIVEN:
        - The sample file "patch-code-t-middle.pdf"
    WHEN:
        - The file is scanned for separating barcodes
        - The file is split on the reported separator pages
        - Every resulting document is saved to a scratch directory
    THEN:
        - The scan returns the input path and at least one separator page
        - "patch-code-t-middle_document_0.pdf" and
          "patch-code-t-middle_document_1.pdf" exist in the scratch directory
    """
    # NOTE(review): assumes BARCODE_SAMPLE_DIR is already the full path of the
    # barcode sample directory (test_separate_pages joins it directly with a
    # file name) - confirm against the class definition.
    test_file = os.path.join(
        self.BARCODE_SAMPLE_DIR,
        "patch-code-t-middle.pdf",
    )
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)

    # The scan result is unpacked as a (file, separator pages) pair. The stale
    # code bound the whole pair to one name and passed it to separate_pages
    # as if it were the list of page numbers.
    pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
        test_file,
    )

    self.assertEqual(test_file, pdf_file)
    self.assertTrue(len(separator_page_numbers) > 0)

    document_list = barcodes.separate_pages(test_file, separator_page_numbers)
    self.assertTrue(document_list)
    for document in document_list:
        barcodes.save_to_dir(document, target_dir=tempdir)

    target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
    target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")

    self.assertTrue(os.path.isfile(target_file1))
    self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
@@ -388,9 +405,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
)
def test_consume_barcode_tiff_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
@@ -412,18 +427,17 @@ class TestBarcode(DirectoriesMixin, TestCase):
and continue archiving the file as is.
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.jpg",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
shutil.copy(test_file, dst)
with self.assertLogs("paperless.tasks", level="WARNING") as cm:
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
self.assertIn("Success", tasks.consume_file(dst))
self.assertListEqual(
cm.output,
[
"WARNING:paperless.tasks:Unsupported file format for barcode reader: image/jpeg",
"WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
],
)
m.assert_called_once()
@@ -445,9 +459,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
the user uploads a supported image file, but without extension
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")

View File

@@ -283,7 +283,9 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=1,
# please leave the delay here and down below
# see https://github.com/paperless-ngx/paperless-ngx/pull/66
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
class TestConsumerPolling(TestConsumer):
@@ -300,7 +302,7 @@ class TestConsumerRecursive(TestConsumer):
@override_settings(
CONSUMER_RECURSIVE=True,
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
class TestConsumerRecursivePolling(TestConsumer):
@@ -345,7 +347,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
def test_consume_file_with_path_tags_polling(self):

View File

@@ -1,4 +1,6 @@
import grp
import os
import pwd
import shutil
import stat
@@ -32,12 +34,15 @@ def path_check(var, directory):
with open(test_file, "w"):
pass
except PermissionError:
dir_stat = os.stat(directory)
dir_mode = stat.filemode(dir_stat.st_mode)
dir_owner = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group = grp.getgrgid(dir_stat.st_gid).gr_name
messages.append(
Error(
writeable_message.format(var),
writeable_hint.format(
f"\n{stat.filemode(os.stat(directory).st_mode)} "
f"{directory}\n",
f"\n{dir_mode} {dir_owner} {dir_group} " f"{directory}\n",
),
),
)

View File

@@ -1,4 +1,5 @@
import os
from pathlib import Path
import dateutil.parser
import requests
@@ -28,6 +29,11 @@ class TikaDocumentParser(DocumentParser):
def extract_metadata(self, document_path, mime_type):
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as e:
@@ -47,10 +53,14 @@ class TikaDocumentParser(DocumentParser):
for key in parsed["metadata"]
]
def parse(self, document_path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server")
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as err: