2018-09-06 12:00:01 +02:00

409 lines
12 KiB
Python

# coding=utf-8
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
from fuzzywuzzy import fuzz
from django.conf import settings
try:
from django.core.urlresolvers import reverse
except ImportError:
from django.urls import reverse
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class MatchingModel(models.Model):
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True)
automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
class Meta:
abstract = True
ordering = ("name",)
def __str__(self):
return self.name
def save(self, *args, **kwargs):
if not self.slug:
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)
class Correspondent(MatchingModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")
class Meta:
ordering = ("name",)
class Tag(MatchingModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
colour = models.PositiveIntegerField(choices=COLOURS, default=1)
is_inbox_tag = models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.")
is_archived_tag = models.BooleanField(
default=False,
help_text="Marks this tag as an archive tag: All documents tagged with archive tags will never be modified automatically (i.e., modifying tags by matching rules)")
class DocumentType(MatchingModel):
pass
class Document(models.Model):
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (
(STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
(STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard")
)
correspondent = models.ForeignKey(
Correspondent,
blank=True,
null=True,
related_name="documents",
on_delete=models.SET_NULL
)
title = models.CharField(max_length=128, blank=True, db_index=True)
document_type = models.ForeignKey(
DocumentType,
blank=True,
null=True,
related_name="documents",
on_delete=models.SET_NULL
)
content = models.TextField(
db_index=True,
blank=True,
help_text="The raw, text-only data of the document. This field is "
"primarily used for searching."
)
file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
)
tags = models.ManyToManyField(
Tag, related_name="documents", blank=True)
checksum = models.CharField(
max_length=32,
editable=False,
unique=True,
help_text="The checksum of the original document (before it was "
"encrypted). We use this to prevent duplicate document "
"imports."
)
created = models.DateTimeField(
default=timezone.now, db_index=True)
modified = models.DateTimeField(
auto_now=True, editable=False, db_index=True)
storage_type = models.CharField(
max_length=11,
choices=STORAGE_TYPES,
default=STORAGE_TYPE_UNENCRYPTED,
editable=False
)
added = models.DateTimeField(
default=timezone.now, editable=False, db_index=True)
archive_serial_number = models.IntegerField(
blank=True,
null=True,
unique=True,
db_index=True,
help_text="The position of this document in your physical document archive.")
class Meta:
ordering = ("correspondent", "title")
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
if self.correspondent or self.title:
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
@property
def source_path(self):
file_name = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.MEDIA_ROOT,
"documents",
"originals",
file_name
)
@property
def source_file(self):
return open(self.source_path, "rb")
@property
def file_name(self):
return slugify(str(self)) + "." + self.file_type
@property
def download_url(self):
return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})
@property
def thumbnail_path(self):
file_name = "{:07}.png".format(self.pk)
if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.MEDIA_ROOT,
"documents",
"thumbnails",
file_name
)
@property
def thumbnail_file(self):
return open(self.thumbnail_path, "rb")
@property
def thumbnail_url(self):
return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
class Log(models.Model):
LEVELS = (
(logging.DEBUG, "Debugging"),
(logging.INFO, "Informational"),
(logging.WARNING, "Warning"),
(logging.ERROR, "Error"),
(logging.CRITICAL, "Critical"),
)
group = models.UUIDField(blank=True)
message = models.TextField()
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
objects = LogManager()
class Meta:
ordering = ("-modified",)
def __str__(self):
return self.message
def save(self, *args, **kwargs):
"""
To allow for the case where we don't want to group the message, we
shouldn't force the caller to specify a one-time group value. However,
allowing group=None means that the manager can't differentiate the
different un-grouped messages, so instead we set a random one here.
"""
if not self.group:
self.group = uuid.uuid4()
models.Model.save(self, *args, **kwargs)
class FileInfo:
# This epic regex *almost* worked for our needs, so I'm keeping it here for
# posterity, in the hopes that we might find a way to make it work one day.
ALMOST_REGEX = re.compile(
r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
r"((?P<correspondent>{non_separated_word}+){separator})??"
r"(?P<title>{non_separated_word}+)"
r"({separator}(?P<tags>[a-z,0-9-]+))?"
r"\.(?P<extension>[a-zA-Z.-]+)$".format(
separator=r"\s+-\s+",
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
flags=re.IGNORECASE
))
])
def __init__(self, created=None, correspondent=None, title=None, tags=(),
extension=None):
self.created = created
self.title = title
self.extension = extension
self.correspondent = correspondent
self.tags = tags
@classmethod
def _get_created(cls, created):
try:
return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
except ValueError:
return None
@classmethod
def _get_correspondent(cls, name):
if not name:
return None
return Correspondent.objects.get_or_create(name=name, defaults={
"slug": slugify(name)
})[0]
@classmethod
def _get_title(cls, title):
return title
@classmethod
def _get_tags(cls, tags):
r = []
for t in tags.split(","):
r.append(Tag.objects.get_or_create(
slug=t.lower(),
defaults={"name": t}
)[0])
return tuple(r)
@classmethod
def _get_extension(cls, extension):
r = extension.lower()
if r == "jpeg":
return "jpg"
if r == "tif":
return "tiff"
return r
@classmethod
def _mangle_property(cls, properties, name):
if name in properties:
properties[name] = getattr(cls, "_get_{}".format(name))(
properties[name]
)
@classmethod
def from_path(cls, path):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<date> - <correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title>.<suffix>"
"<title>.<suffix>"
"""
for regex in cls.REGEXES.values():
m = regex.match(os.path.basename(path))
if m:
properties = m.groupdict()
cls._mangle_property(properties, "created")
cls._mangle_property(properties, "correspondent")
cls._mangle_property(properties, "title")
cls._mangle_property(properties, "tags")
cls._mangle_property(properties, "extension")
return cls(**properties)