Version bump

Fixed migration order
2018-12-12 13:25:28 +01:00 · 2018-12-12 13:13:21 +01:00 · 2018-12-12 13:11:30 +01:00 · 2018-12-11 22:58:14 +01:00 · 2018-12-11 22:36:26 +01:00 · 2018-12-11 22:26:20 +01:00
43 changed files with 56519 additions and 284 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -82,3 +82,6 @@ scripts/nuke

 # Static files collected by the collectstatic command
 static/
+
+# Classification Models
+models/
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,7 @@ language: python

 before_install:
 - sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng tesseract-ocr-cat
+- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng tesseract-ocr-cat tesseract-ocr-deu

 sudo: false

--- a/4
+++ b/4
@@ -13,10 +13,10 @@ ENV PAPERLESS_EXPORT_DIR=/export \
    PAPERLESS_CONSUMPTION_DIR=/consume


-RUN apk update --no-cache && apk add python3 gnupg libmagic bash shadow curl \
+RUN apk update --no-cache && apk add python3 gnupg libmagic libpq bash shadow curl \
        sudo poppler tesseract-ocr imagemagick ghostscript unpaper optipng && \
    apk add --virtual .build-dependencies \
-        python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
+        python3-dev poppler-dev postgresql-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
 # Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
--- a/2
+++ b/2
@@ -25,7 +25,6 @@ python-dateutil = "*"
 python-dotenv = "*"
 python-gnupg = "*"
 pytz = "*"
-ipython = "*"
 sphinx = "*"
 tox = "*"
 pycodestyle = "*"
@@ -37,3 +36,4 @@ pytest-env = "*"
 pytest-xdist = "*"

 [dev-packages]
+ipython = "*"
--- a/docs/changelog_jonaswinkler.rst
+++ b/docs/changelog_jonaswinkler.rst
@@ -0,0 +1,20 @@
+Changelog (jonaswinkler)
+########################
+
+1.0.0
+=====
+
+* First release based on paperless 2.6.0
+* Added: Automatic document classification using neural networks (replaces
+  regex-based tagging)
+* Added: Document types
+* Added: Archive serial number allows easy referencing of physical document
+  copies
+* Added: Inbox tags (added automatically to newly consumed documents)
+* Added: Document viewer on document edit page
+* Database backend is now configurable
+
+1.0.1
+=====
+
+* Fixed migration order
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -46,3 +46,4 @@ Contents
   contributing
   scanners
   changelog
+   changelog_jonaswinkler
--- a/models/.keep
+++ b/models/.keep
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -3,6 +3,16 @@
 # As this file contains passwords it should only be readable by the user
 # running paperless.

+###############################################################################
+####                        Database Settings                              ####
+###############################################################################
+
+# By default, sqlite is used as the database backend. This can be changed here.
+#PAPERLESS_DBENGINE="django.db.backends.postgresql_psycopg2"
+#PAPERLESS_DBNAME="paperless"
+#PAPERLESS_DBUSER="paperless"
+#PAPERLESS_DBPASS="paperless"
+

 ###############################################################################
 ####                         Paths & Folders                               ####
@@ -38,6 +48,13 @@ PAPERLESS_CONSUMPTION_DIR=""
 #PAPERLESS_STATIC_URL="/static/"


+# You can specify where the document classification model file should be
+# stored. Make sure that this file is writeable by the user executing the
+# management command "document_create_classifier" and that the path exists.
+# The default location is /models/model.pickle within the install folder.
+#PAPERLESS_MODEL_FILE=/path/to/model/file
+
+
 # These values are required if you want paperless to check a particular email
 # box every 10 minutes and attempt to consume documents from there.  If you
 # don't define a HOST, mail checking will just be disabled.
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,6 +36,7 @@ jinja2==2.10
 langdetect==1.0.7
 markupsafe==1.0
 more-itertools==4.3.0
+numpy==1.15.1
 packaging==18.0
 parso==0.3.1
 pdftotext==2.1.1
@@ -43,6 +44,7 @@ pexpect==4.6.0
 pickleshare==0.7.5
 pillow==5.3.0
 pluggy==0.8.0
+psycopg2==2.7.6.1
 prompt-toolkit==2.0.7
 ptyprocess==0.6.0
 py==1.7.0
@@ -65,6 +67,8 @@ pytz==2018.7
 regex==2018.11.2
 requests==2.20.0
 six==1.11.0
+scikit-learn==0.19.2
+scipy==1.1.0
 snowballstemmer==1.2.1
 sphinx==1.8.1
 sphinxcontrib-websupport==1.1.0
--- a/src/documents/actions.py
+++ b/src/documents/actions.py
@@ -4,7 +4,8 @@ from django.contrib.admin.utils import model_ngettext
 from django.core.exceptions import PermissionDenied
 from django.template.response import TemplateResponse

-from documents.models import Correspondent, Tag
+from documents.classifier import DocumentClassifier
+from documents.models import Correspondent, DocumentType, Tag


 def select_action(
@@ -17,9 +18,9 @@ def select_action(
    if not modeladmin.has_change_permission(request):
        raise PermissionDenied

-    if request.POST.get('post'):
+    if request.POST.get("post"):
        n = queryset.count()
-        selected_object = modelclass.objects.get(id=request.POST.get('obj_id'))
+        selected_object = modelclass.objects.get(id=request.POST.get("obj_id"))
        if n:
            for document in queryset:
                if document_action:
@@ -137,6 +138,57 @@ def remove_correspondent_from_selected(modeladmin, request, queryset):
    )


+def set_document_type_on_selected(modeladmin, request, queryset):
+    return select_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        title="Set document type on multiple documents",
+        action="set_document_type_on_selected",
+        modelclass=DocumentType,
+        success_message="Successfully set document type %(selected_object)s "
+                        "on %(count)d %(items)s.",
+        queryset_action=lambda qs, document_type: qs.update(
+            document_type=document_type)
+    )
+
+
+def remove_document_type_from_selected(modeladmin, request, queryset):
+    return simple_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        success_message="Successfully removed document type from %(count)d "
+                        "%(items)s.",
+        queryset_action=lambda qs: qs.update(document_type=None)
+    )
+
+
+def run_document_classifier_on_selected(modeladmin, request, queryset):
+    clf = DocumentClassifier()
+    try:
+        clf.reload()
+        return simple_action(
+            modeladmin=modeladmin,
+            request=request,
+            queryset=queryset,
+            success_message="Successfully applied document classifier to "
+                            "%(count)d %(items)s.",
+            document_action=lambda doc: clf.classify_document(
+                doc,
+                classify_correspondent=True,
+                classify_tags=True,
+                classify_document_type=True)
+        )
+    except FileNotFoundError:
+        modeladmin.message_user(
+            request,
+            "Classifier model file not found.",
+            messages.ERROR
+        )
+        return None
+
+
 add_tag_to_selected.short_description = "Add tag to selected documents"
 remove_tag_from_selected.short_description = \
    "Remove tag from selected documents"
@@ -144,3 +196,9 @@ set_correspondent_on_selected.short_description = \
    "Set correspondent on selected documents"
 remove_correspondent_from_selected.short_description = \
    "Remove correspondent from selected documents"
+set_document_type_on_selected.short_description = \
+    "Set document type on selected documents"
+remove_document_type_from_selected.short_description = \
+    "Remove document type from selected documents"
+run_document_classifier_on_selected.short_description = \
+    "Run document classifier on selected"
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -16,10 +16,13 @@ from documents.actions import (
    add_tag_to_selected,
    remove_correspondent_from_selected,
    remove_tag_from_selected,
-    set_correspondent_on_selected
+    set_correspondent_on_selected,
+    set_document_type_on_selected,
+    remove_document_type_from_selected,
+    run_document_classifier_on_selected
 )

-from .models import Correspondent, Document, Log, Tag
+from .models import Correspondent, Document, DocumentType, Log, Tag


 class FinancialYearFilter(admin.SimpleListFilter):
@@ -116,13 +119,11 @@ class CorrespondentAdmin(CommonAdmin):

    list_display = (
        "name",
-        "match",
-        "matching_algorithm",
+        "automatic_classification",
        "document_count",
        "last_correspondence"
    )
-    list_filter = ("matching_algorithm",)
-    list_editable = ("match", "matching_algorithm")
+    list_editable = ("automatic_classification",)

    readonly_fields = ("slug",)

@@ -146,9 +147,12 @@ class CorrespondentAdmin(CommonAdmin):
 class TagAdmin(CommonAdmin):

    list_display = (
-        "name", "colour", "match", "matching_algorithm", "document_count")
-    list_filter = ("colour", "matching_algorithm")
-    list_editable = ("colour", "match", "matching_algorithm")
+        "name",
+        "colour",
+        "automatic_classification",
+        "document_count")
+    list_filter = ("colour",)
+    list_editable = ("colour", "automatic_classification")

    readonly_fields = ("slug",)

@@ -165,6 +169,23 @@ class TagAdmin(CommonAdmin):
    document_count.admin_order_field = "document_count"


+class DocumentTypeAdmin(CommonAdmin):
+
+    list_display = ("name", "automatic_classification", "document_count")
+    list_editable = ("automatic_classification",)
+
+    readonly_fields = ("slug",)
+
+    def get_queryset(self, request):
+        qs = super(DocumentTypeAdmin, self).get_queryset(request)
+        qs = qs.annotate(document_count=models.Count("documents"))
+        return qs
+
+    def document_count(self, obj):
+        return obj.document_count
+    document_count.admin_order_field = "document_count"
+
+
 class DocumentAdmin(CommonAdmin):

    class Media:
@@ -175,8 +196,9 @@ class DocumentAdmin(CommonAdmin):
    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = ("added", "file_type", "storage_type",)
    list_display = ("title", "created", "added", "thumbnail", "correspondent",
-                    "tags_")
+                    "tags_", "archive_serial_number", "document_type")
    list_filter = (
+        "document_type",
        "tags",
        ("correspondent", RecentCorrespondentFilter),
        FinancialYearFilter
@@ -190,7 +212,10 @@ class DocumentAdmin(CommonAdmin):
        add_tag_to_selected,
        remove_tag_from_selected,
        set_correspondent_on_selected,
-        remove_correspondent_from_selected
+        remove_correspondent_from_selected,
+        set_document_type_on_selected,
+        remove_document_type_from_selected,
+        run_document_classifier_on_selected
    ]

    date_hierarchy = "created"
@@ -223,6 +248,9 @@ class DocumentAdmin(CommonAdmin):
                    extra_context=None):

        extra_context = extra_context or {}
+        doc = Document.objects.get(id=object_id)
+        extra_context["download_url"] = doc.download_url
+        extra_context["file_type"] = doc.file_type

        if self.document_queue and object_id:
            if int(object_id) in self.document_queue:
@@ -346,6 +374,7 @@ class LogAdmin(CommonAdmin):

 admin.site.register(Correspondent, CorrespondentAdmin)
 admin.site.register(Tag, TagAdmin)
+admin.site.register(DocumentType, DocumentTypeAdmin)
 admin.site.register(Document, DocumentAdmin)
 admin.site.register(Log, LogAdmin)

--- a/src/documents/apps.py
+++ b/src/documents/apps.py
@@ -11,8 +11,8 @@ class DocumentsConfig(AppConfig):
        from .signals import document_consumption_started
        from .signals import document_consumption_finished
        from .signals.handlers import (
-            set_correspondent,
-            set_tags,
+            classify_document,
+            add_inbox_tags,
            run_pre_consume_script,
            run_post_consume_script,
            cleanup_document_deletion,
@@ -21,8 +21,8 @@ class DocumentsConfig(AppConfig):

        document_consumption_started.connect(run_pre_consume_script)

-        document_consumption_finished.connect(set_tags)
-        document_consumption_finished.connect(set_correspondent)
+        document_consumption_finished.connect(classify_document)
+        document_consumption_finished.connect(add_inbox_tags)
        document_consumption_finished.connect(set_log_entry)
        document_consumption_finished.connect(run_post_consume_script)

--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -0,0 +1,240 @@
+import logging
+import os
+import pickle
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
+
+from documents.models import Correspondent, DocumentType, Tag, Document
+from paperless import settings
+
+
+def preprocess_content(content):
+    content = content.lower()
+    content = content.strip()
+    content = content.replace("\n", " ")
+    content = content.replace("\r", " ")
+    while content.find("  ") > -1:
+        content = content.replace("  ", " ")
+    return content
+
+
+class DocumentClassifier(object):
+
+    def __init__(self):
+        self.classifier_version = 0
+
+        self.data_vectorizer = None
+
+        self.tags_binarizer = None
+        self.correspondent_binarizer = None
+        self.document_type_binarizer = None
+
+        self.tags_classifier = None
+        self.correspondent_classifier = None
+        self.document_type_classifier = None
+
+    def reload(self):
+        if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
+            logging.getLogger(__name__).info("Reloading classifier models")
+            with open(settings.MODEL_FILE, "rb") as f:
+                self.data_vectorizer = pickle.load(f)
+                self.tags_binarizer = pickle.load(f)
+                self.correspondent_binarizer = pickle.load(f)
+                self.document_type_binarizer = pickle.load(f)
+
+                self.tags_classifier = pickle.load(f)
+                self.correspondent_classifier = pickle.load(f)
+                self.document_type_classifier = pickle.load(f)
+            self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
+
+    def save_classifier(self):
+        with open(settings.MODEL_FILE, "wb") as f:
+            pickle.dump(self.data_vectorizer, f)
+
+            pickle.dump(self.tags_binarizer, f)
+            pickle.dump(self.correspondent_binarizer, f)
+            pickle.dump(self.document_type_binarizer, f)
+
+            pickle.dump(self.tags_classifier, f)
+            pickle.dump(self.correspondent_classifier, f)
+            pickle.dump(self.document_type_classifier, f)
+
+    def train(self):
+        data = list()
+        labels_tags = list()
+        labels_correspondent = list()
+        labels_document_type = list()
+
+        # Step 1: Extract and preprocess training data from the database.
+        logging.getLogger(__name__).info("Gathering data from database...")
+        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
+            data.append(preprocess_content(doc.content))
+
+            y = -1
+            if doc.document_type:
+                if doc.document_type.automatic_classification:
+                    y = doc.document_type.id
+            labels_document_type.append(y)
+
+            y = -1
+            if doc.correspondent:
+                if doc.correspondent.automatic_classification:
+                    y = doc.correspondent.id
+            labels_correspondent.append(y)
+
+            tags = [tag.id for tag in doc.tags.filter(
+                automatic_classification=True
+            )]
+            labels_tags.append(tags)
+
+        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
+        logging.getLogger(__name__).info(
+            "{} documents, {} tag(s), {} correspondent(s), "
+            "{} document type(s).".format(
+                len(data),
+                len(labels_tags_unique),
+                len(set(labels_correspondent)),
+                len(set(labels_document_type))
+            )
+        )
+
+        # Step 2: vectorize data
+        logging.getLogger(__name__).info("Vectorizing data...")
+        self.data_vectorizer = CountVectorizer(
+            analyzer="char",
+            ngram_range=(3, 5),
+            min_df=0.1
+        )
+        data_vectorized = self.data_vectorizer.fit_transform(data)
+
+        self.tags_binarizer = MultiLabelBinarizer()
+        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
+
+        self.correspondent_binarizer = LabelBinarizer()
+        labels_correspondent_vectorized = \
+            self.correspondent_binarizer.fit_transform(labels_correspondent)
+
+        self.document_type_binarizer = LabelBinarizer()
+        labels_document_type_vectorized = \
+            self.document_type_binarizer.fit_transform(labels_document_type)
+
+        # Step 3: train the classifiers
+        if len(self.tags_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info("Training tags classifier...")
+            self.tags_classifier = MLPClassifier(verbose=True)
+            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
+        else:
+            self.tags_classifier = None
+            logging.getLogger(__name__).info(
+                "There are no tags. Not training tags classifier."
+            )
+
+        if len(self.correspondent_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info(
+                "Training correspondent classifier..."
+            )
+            self.correspondent_classifier = MLPClassifier(verbose=True)
+            self.correspondent_classifier.fit(
+                data_vectorized,
+                labels_correspondent_vectorized
+            )
+        else:
+            self.correspondent_classifier = None
+            logging.getLogger(__name__).info(
+                "There are no correspondents. Not training correspondent "
+                "classifier."
+            )
+
+        if len(self.document_type_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info(
+                "Training document type classifier..."
+            )
+            self.document_type_classifier = MLPClassifier(verbose=True)
+            self.document_type_classifier.fit(
+                data_vectorized,
+                labels_document_type_vectorized
+            )
+        else:
+            self.document_type_classifier = None
+            logging.getLogger(__name__).info(
+                "There are no document types. Not training document type "
+                "classifier."
+            )
+
+    def classify_document(
+            self, document, classify_correspondent=False,
+            classify_document_type=False, classify_tags=False,
+            replace_tags=False):
+
+        X = self.data_vectorizer.transform(
+            [preprocess_content(document.content)]
+        )
+
+        if classify_correspondent and self.correspondent_classifier:
+            self._classify_correspondent(X, document)
+
+        if classify_document_type and self.document_type_classifier:
+            self._classify_document_type(X, document)
+
+        if classify_tags and self.tags_classifier:
+            self._classify_tags(X, document, replace_tags)
+
+        document.save(update_fields=("correspondent", "document_type"))
+
+    def _classify_correspondent(self, X, document):
+        y = self.correspondent_classifier.predict(X)
+        correspondent_id = self.correspondent_binarizer.inverse_transform(y)[0]
+        try:
+            correspondent = None
+            if correspondent_id != -1:
+                correspondent = Correspondent.objects.get(id=correspondent_id)
+                logging.getLogger(__name__).info(
+                    "Detected correspondent: {}".format(correspondent.name)
+                )
+            else:
+                logging.getLogger(__name__).info("Detected correspondent: -")
+            document.correspondent = correspondent
+        except Correspondent.DoesNotExist:
+            logging.getLogger(__name__).warning(
+                "Detected correspondent with id {} does not exist "
+                "anymore! Did you delete it?".format(correspondent_id)
+            )
+
+    def _classify_document_type(self, X, document):
+        y = self.document_type_classifier.predict(X)
+        document_type_id = self.document_type_binarizer.inverse_transform(y)[0]
+        try:
+            document_type = None
+            if document_type_id != -1:
+                document_type = DocumentType.objects.get(id=document_type_id)
+                logging.getLogger(__name__).info(
+                    "Detected document type: {}".format(document_type.name)
+                )
+            else:
+                logging.getLogger(__name__).info("Detected document type: -")
+            document.document_type = document_type
+        except DocumentType.DoesNotExist:
+            logging.getLogger(__name__).warning(
+                "Detected document type with id {} does not exist "
+                "anymore! Did you delete it?".format(document_type_id)
+            )
+
+    def _classify_tags(self, X, document, replace_tags):
+        y = self.tags_classifier.predict(X)
+        tags_ids = self.tags_binarizer.inverse_transform(y)[0]
+        if replace_tags:
+            document.tags.clear()
+        for tag_id in tags_ids:
+            try:
+                tag = Tag.objects.get(id=tag_id)
+                logging.getLogger(__name__).info(
+                    "Detected tag: {}".format(tag.name)
+                )
+                document.tags.add(tag)
+            except Tag.DoesNotExist:
+                logging.getLogger(__name__).warning(
+                    "Detected tag with id {} does not exist anymore! Did "
+                    "you delete it?".format(tag_id)
+                )
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -225,7 +225,7 @@ class Consumer:
                storage_type=self.storage_type
            )

-        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
+        relevant_tags = set(file_info.tags)
        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self.log("debug", "Tagging with {}".format(tag_names))
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -1,6 +1,6 @@
 from django_filters.rest_framework import BooleanFilter, FilterSet

-from .models import Correspondent, Document, Tag
+from .models import Correspondent, Document, Tag, DocumentType


 CHAR_KWARGS = (
@@ -35,6 +35,19 @@ class TagFilterSet(FilterSet):
        }


+class DocumentTypeFilterSet(FilterSet):
+
+    class Meta:
+        model = DocumentType
+        fields = {
+            "name": [
+                "startswith", "endswith", "contains",
+                "istartswith", "iendswith", "icontains"
+            ],
+            "slug": ["istartswith", "iendswith", "icontains"]
+        }
+
+
 class DocumentFilterSet(FilterSet):

    tags_empty = BooleanFilter(
@@ -57,4 +70,7 @@ class DocumentFilterSet(FilterSet):
            "tags__name": CHAR_KWARGS,
            "tags__slug": CHAR_KWARGS,

+            "document_type__name": CHAR_KWARGS,
+            "document_type__slug": CHAR_KWARGS,
+
        }
--- a/src/documents/management/commands/document_correspondents.py
+++ b/src/documents/management/commands/document_correspondents.py
@@ -1,82 +0,0 @@
-import sys
-
-from django.core.management.base import BaseCommand
-
-from documents.models import Correspondent, Document
-
-from ...mixins import Renderable
-
-
-class Command(Renderable, BaseCommand):
-
-    help = """
-        Using the current set of correspondent rules, apply said rules to all
-        documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with correspondent created (or modified)
-        after their initial import.
-    """.replace("    ", "")
-
-    TOO_MANY_CONTINUE = (
-        "Detected {} potential correspondents for {}, so we've opted for {}")
-    TOO_MANY_SKIP = (
-        "Detected {} potential correspondents for {}, so we're skipping it")
-    CHANGE_MESSAGE = (
-        'Document {}: "{}" was given the correspondent id {}: "{}"')
-
-    def __init__(self, *args, **kwargs):
-        self.verbosity = 0
-        BaseCommand.__init__(self, *args, **kwargs)
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "--use-first",
-            default=False,
-            action="store_true",
-            help="By default this command won't try to assign a correspondent "
-                 "if more than one matches the document.  Use this flag if "
-                 "you'd rather it just pick the first one it finds."
-        )
-
-    def handle(self, *args, **options):
-
-        self.verbosity = options["verbosity"]
-
-        for document in Document.objects.filter(correspondent__isnull=True):
-
-            potential_correspondents = list(
-                Correspondent.match_all(document.content))
-
-            if not potential_correspondents:
-                continue
-
-            potential_count = len(potential_correspondents)
-            correspondent = potential_correspondents[0]
-
-            if potential_count > 1:
-                if not options["use_first"]:
-                    print(
-                        self.TOO_MANY_SKIP.format(potential_count, document),
-                        file=sys.stderr
-                    )
-                    continue
-                print(
-                    self.TOO_MANY_CONTINUE.format(
-                        potential_count,
-                        document,
-                        correspondent
-                    ),
-                    file=sys.stderr
-                )
-
-            document.correspondent = correspondent
-            document.save(update_fields=("correspondent",))
-
-            print(
-                self.CHANGE_MESSAGE.format(
-                    document.pk,
-                    document.title,
-                    correspondent.pk,
-                    correspondent.name
-                ),
-                file=sys.stderr
-            )
--- a/src/documents/management/commands/document_create_classifier.py
+++ b/src/documents/management/commands/document_create_classifier.py
@@ -0,0 +1,25 @@
+import logging
+
+from django.core.management.base import BaseCommand
+from documents.classifier import DocumentClassifier
+from paperless import settings
+from ...mixins import Renderable
+
+
+class Command(Renderable, BaseCommand):
+
+    help = """
+        Trains the classifier on your data and saves the resulting models to a
+        file. The document consumer will then automatically use this new model.
+    """.replace("    ", "")
+
+    def __init__(self, *args, **kwargs):
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    def handle(self, *args, **options):
+        clf = DocumentClassifier()
+        clf.train()
+        logging.getLogger(__name__).info(
+            "Saving models to {}...".format(settings.MODEL_FILE)
+        )
+        clf.save_classifier()
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -6,7 +6,7 @@ import shutil
 from django.core.management.base import BaseCommand, CommandError
 from django.core import serializers

-from documents.models import Document, Correspondent, Tag
+from documents.models import Document, Correspondent, Tag, DocumentType
 from paperless.db import GnuPG

 from ...mixins import Renderable
@@ -96,6 +96,9 @@ class Command(Renderable, BaseCommand):
        manifest += json.loads(serializers.serialize(
            "json", Tag.objects.all()))

+        manifest += json.loads(serializers.serialize(
+            "json", DocumentType.objects.all()))
+
        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -1,5 +1,8 @@
+import logging
+
 from django.core.management.base import BaseCommand

+from documents.classifier import DocumentClassifier
 from documents.models import Document, Tag

 from ...mixins import Renderable
@@ -8,25 +11,66 @@ from ...mixins import Renderable
 class Command(Renderable, BaseCommand):

    help = """
-        Using the current set of tagging rules, apply said rules to all
-        documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with tags created (or modified) after
-        their initial import.
+        Using the current classification model, assigns correspondents, tags
+        and document types to all documents, effectively allowing you to
+        back-tag all previously indexed documents with metadata created (or
+        modified) after their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-c", "--correspondent",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-T", "--tags",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-t", "--type",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-i", "--inbox-only",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-r", "--replace-tags",
+            action="store_true"
+        )
+
    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

-        for document in Document.objects.all():
+        if options["inbox_only"]:
+            queryset = Document.objects.filter(tags__is_inbox_tag=True)
+        else:
+            queryset = Document.objects.all()
+        documents = queryset.distinct()

-            tags = Tag.objects.exclude(
-                pk__in=document.tags.values_list("pk", flat=True))
+        logging.getLogger(__name__).info("Loading classifier")
+        clf = DocumentClassifier()
+        try:
+            clf.reload()
+        except FileNotFoundError:
+            logging.getLogger(__name__).fatal("Cannot classify documents, "
+                                              "classifier model file was not "
+                                              "found.")
+            return

-            for tag in Tag.match_all(document.content, tags):
-                print('Tagging {} with "{}"'.format(document, tag))
-                document.tags.add(tag)
+        for document in documents:
+            logging.getLogger(__name__).info(
+                "Processing document {}".format(document.title)
+            )
+            clf.classify_document(
+                document,
+                classify_document_type=options["type"],
+                classify_tags=options["tags"],
+                classify_correspondent=options["correspondent"],
+                replace_tags=options["replace_tags"]
+            )
--- a/src/documents/migrations/0022_auto_20181007_1420.py
+++ b/src/documents/migrations/0022_auto_20181007_1420.py
@@ -11,9 +11,10 @@ def re_slug_all_the_things(apps, schema_editor):
    """

    Tag = apps.get_model("documents", "Tag")
-    Correspondent = apps.get_model("documents", "Tag")
+    Correspondent = apps.get_model("documents", "Correspondent")
+    DocumentType = apps.get_model("documents", "DocumentType")

-    for klass in (Tag, Correspondent):
+    for klass in (Tag, Correspondent, DocumentType):
        for instance in klass.objects.all():
            klass.objects.filter(
                pk=instance.pk
@@ -25,7 +26,7 @@ def re_slug_all_the_things(apps, schema_editor):
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0021_document_storage_type'),
+        ('documents', '1003_auto_20180904_1425'),
    ]

    operations = [
@@ -48,5 +49,10 @@ class Migration(migrations.Migration):
            name='slug',
            field=models.SlugField(blank=True, editable=False),
        ),
+        migrations.AlterField(
+            model_name='documenttype',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
    ]
--- a/src/documents/migrations/1001_workflow_improvements.py
+++ b/src/documents/migrations/1001_workflow_improvements.py
@@ -0,0 +1,23 @@
+# Generated by Django 2.0.7 on 2018-07-12 09:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0021_document_storage_type'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_serial_number',
+            field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='is_inbox_tag',
+            field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
+        ),
+    ]
--- a/src/documents/migrations/1002_auto_20180823_1155.py
+++ b/src/documents/migrations/1002_auto_20180823_1155.py
@@ -0,0 +1,33 @@
+# Generated by Django 2.0.7 on 2018-08-23 11:55
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1001_workflow_improvements'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DocumentType',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('name', models.CharField(max_length=128, unique=True)),
+                ('slug', models.SlugField(blank=True)),
+                ('match', models.CharField(blank=True, max_length=256)),
+                ('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
+                ('is_insensitive', models.BooleanField(default=True)),
+            ],
+            options={
+                'abstract': False,
+            },
+        ),
+        migrations.AddField(
+            model_name='document',
+            name='document_type',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.DocumentType'),
+        ),
+    ]
--- a/src/documents/migrations/1003_auto_20180904_1425.py
+++ b/src/documents/migrations/1003_auto_20180904_1425.py
@@ -0,0 +1,77 @@
+# Generated by Django 2.0.8 on 2018-09-04 14:25
+
+from django.db import migrations, models
+
+
+def transfer_automatic_classification(apps, schema_editor):
+    for model_name in ["Tag", "Correspondent", "DocumentType"]:
+        model_class = apps.get_model("documents", model_name)
+        for o in model_class.objects.all():
+            o.automatic_classification = o.match is not None and len(o.match) > 0
+            o.save()
+
+
+def reverse_automatic_classification(apps, schema_editor):
+    pass
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1002_auto_20180823_1155'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='correspondent',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='documenttype',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='matching_algorithm',
+        ),
+    ]
--- a/src/documents/mixins.py
+++ b/src/documents/mixins.py
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -24,43 +24,15 @@ except ImportError:

 class MatchingModel(models.Model):

-    MATCH_ANY = 1
-    MATCH_ALL = 2
-    MATCH_LITERAL = 3
-    MATCH_REGEX = 4
-    MATCH_FUZZY = 5
-    MATCHING_ALGORITHMS = (
-        (MATCH_ANY, "Any"),
-        (MATCH_ALL, "All"),
-        (MATCH_LITERAL, "Literal"),
-        (MATCH_REGEX, "Regular Expression"),
-        (MATCH_FUZZY, "Fuzzy Match"),
-    )
-
    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True, editable=False)

-    match = models.CharField(max_length=256, blank=True)
-    matching_algorithm = models.PositiveIntegerField(
-        choices=MATCHING_ALGORITHMS,
-        default=MATCH_ANY,
-        help_text=(
-            "Which algorithm you want to use when matching text to the OCR'd "
-            "PDF.  Here, \"any\" looks for any occurrence of any word "
-            "provided in the PDF, while \"all\" requires that every word "
-            "provided appear in the PDF, albeit not in the order provided.  A "
-            "\"literal\" match means that the text you enter must appear in "
-            "the PDF exactly as you've entered it, and \"regular expression\" "
-            "uses a regex to match the PDF.  (If you don't know what a regex "
-            "is, you probably don't want this option.)  Finally, a \"fuzzy "
-            "match\" looks for words or phrases that are mostly—but not "
-            "exactly—the same, which can be useful for matching against "
-            "documents containg imperfections that foil accurate OCR."
-        )
+    automatic_classification = models.BooleanField(
+        default=False,
+        help_text="Automatically assign to newly added documents based on "
+                  "current usage in your document collection."
    )

-    is_insensitive = models.BooleanField(default=True)
-
    class Meta:
        abstract = True
        ordering = ("name",)
@@ -68,86 +40,8 @@ class MatchingModel(models.Model):
    def __str__(self):
        return self.name

-    @property
-    def conditions(self):
-        return "{}: \"{}\" ({})".format(
-            self.name, self.match, self.get_matching_algorithm_display())
-
-    @classmethod
-    def match_all(cls, text, tags=None):
-
-        if tags is None:
-            tags = cls.objects.all()
-
-        text = text.lower()
-        for tag in tags:
-            if tag.matches(text):
-                yield tag
-
-    def matches(self, text):
-
-        search_kwargs = {}
-
-        # Check that match is not empty
-        if self.match.strip() == "":
-            return False
-
-        if self.is_insensitive:
-            search_kwargs = {"flags": re.IGNORECASE}
-
-        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self._split_match():
-                search_result = re.search(
-                    r"\b{}\b".format(word), text, **search_kwargs)
-                if not search_result:
-                    return False
-            return True
-
-        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self._split_match():
-                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
-                    return True
-            return False
-
-        if self.matching_algorithm == self.MATCH_LITERAL:
-            return bool(re.search(
-                r"\b{}\b".format(self.match), text, **search_kwargs))
-
-        if self.matching_algorithm == self.MATCH_REGEX:
-            return bool(re.search(
-                re.compile(self.match, **search_kwargs), text))
-
-        if self.matching_algorithm == self.MATCH_FUZZY:
-            match = re.sub(r'[^\w\s]', '', self.match)
-            text = re.sub(r'[^\w\s]', '', text)
-            if self.is_insensitive:
-                match = match.lower()
-                text = text.lower()
-
-            return True if fuzz.partial_ratio(match, text) >= 90 else False
-
-        raise NotImplementedError("Unsupported matching algorithm")
-
-    def _split_match(self):
-        """
-        Splits the match to individual keywords, getting rid of unnecessary
-        spaces and grouping quoted words together.
-
-        Example:
-          '  some random  words "with   quotes  " and   spaces'
-            ==>
-          ["some", "random", "words", "with+quotes", "and", "spaces"]
-        """
-        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
-        normspace = re.compile(r"\s+").sub
-        return [
-            normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
-            for t in findterms(self.match)
-        ]
-
    def save(self, *args, **kwargs):

-        self.match = self.match.lower()
        self.slug = slugify(self.name)

        models.Model.save(self, *args, **kwargs)
@@ -183,6 +77,17 @@ class Tag(MatchingModel):

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)

+    is_inbox_tag = models.BooleanField(
+        default=False,
+        help_text="Marks this tag as an inbox tag: All newly consumed "
+                  "documents will be tagged with inbox tags."
+    )
+
+
+class DocumentType(MatchingModel):
+
+    pass
+

 class Document(models.Model):

@@ -214,6 +119,14 @@ class Document(models.Model):

    title = models.CharField(max_length=128, blank=True, db_index=True)

+    document_type = models.ForeignKey(
+        DocumentType,
+        blank=True,
+        null=True,
+        related_name="documents",
+        on_delete=models.SET_NULL
+    )
+
    content = models.TextField(
        db_index=True,
        blank=True,
@@ -254,6 +167,15 @@ class Document(models.Model):
    added = models.DateTimeField(
        default=timezone.now, editable=False, db_index=True)

+    archive_serial_number = models.IntegerField(
+        blank=True,
+        null=True,
+        unique=True,
+        db_index=True,
+        help_text="The position of this document in your physical document "
+                  "archive."
+    )
+
    class Meta:
        ordering = ("correspondent", "title")

--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1,13 +1,20 @@
 from rest_framework import serializers

-from .models import Correspondent, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log, DocumentType


 class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):

    class Meta:
        model = Correspondent
-        fields = ("id", "slug", "name")
+        fields = ("id", "slug", "name", "automatic_classification")
+
+
+class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
+
+    class Meta:
+        model = DocumentType
+        fields = ("id", "slug", "name", "automatic_classification")


 class TagSerializer(serializers.HyperlinkedModelSerializer):
@@ -15,7 +22,7 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
    class Meta:
        model = Tag
        fields = (
-            "id", "slug", "name", "colour", "match", "matching_algorithm")
+            "id", "slug", "name", "colour", "automatic_classification")


 class CorrespondentField(serializers.HyperlinkedRelatedField):
@@ -28,17 +35,25 @@ class TagsField(serializers.HyperlinkedRelatedField):
        return Tag.objects.all()


+class DocumentTypeField(serializers.HyperlinkedRelatedField):
+    def get_queryset(self):
+        return DocumentType.objects.all()
+
+
 class DocumentSerializer(serializers.ModelSerializer):

    correspondent = CorrespondentField(
        view_name="drf:correspondent-detail", allow_null=True)
    tags = TagsField(view_name="drf:tag-detail", many=True)
+    document_type = DocumentTypeField(
+        view_name="drf:documenttype-detail", allow_null=True)

    class Meta:
        model = Document
        fields = (
            "id",
            "correspondent",
+            "document_type",
            "title",
            "content",
            "file_type",
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -8,57 +8,36 @@ from django.contrib.auth.models import User
 from django.contrib.contenttypes.models import ContentType
 from django.utils import timezone

-from ..models import Correspondent, Document, Tag
+from documents.classifier import DocumentClassifier
+from ..models import Document, Tag


 def logger(message, group):
    logging.getLogger(__name__).debug(message, extra={"group": group})


-def set_correspondent(sender, document=None, logging_group=None, **kwargs):
+classifier = DocumentClassifier()

-    # No sense in assigning a correspondent when one is already set.
-    if document.correspondent:
-        return

-    # No matching correspondents, so no need to continue
-    potential_correspondents = list(Correspondent.match_all(document.content))
-    if not potential_correspondents:
-        return
-
-    potential_count = len(potential_correspondents)
-    selected = potential_correspondents[0]
-    if potential_count > 1:
-        message = "Detected {} potential correspondents, so we've opted for {}"
-        logger(
-            message.format(potential_count, selected),
-            logging_group
+def classify_document(sender, document=None, logging_group=None, **kwargs):
+    global classifier
+    try:
+        classifier.reload()
+        classifier.classify_document(
+            document,
+            classify_correspondent=True,
+            classify_tags=True,
+            classify_document_type=True
+        )
+    except FileNotFoundError:
+        logging.getLogger(__name__).fatal(
+            "Cannot classify document, classifier model file was not found."
        )

-    logger(
-        'Assigning correspondent "{}" to "{}" '.format(selected, document),
-        logging_group
-    )

-    document.correspondent = selected
-    document.save(update_fields=("correspondent",))
-
-
-def set_tags(sender, document=None, logging_group=None, **kwargs):
-
-    current_tags = set(document.tags.all())
-    relevant_tags = set(Tag.match_all(document.content)) - current_tags
-
-    if not relevant_tags:
-        return
-
-    message = 'Tagging "{}" with "{}"'
-    logger(
-        message.format(document, ", ".join([t.slug for t in relevant_tags])),
-        logging_group
-    )
-
-    document.tags.add(*relevant_tags)
+def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
+    inbox_tags = Tag.objects.filter(is_inbox_tag=True)
+    document.tags.add(*inbox_tags)


 def run_pre_consume_script(sender, filename, **kwargs):
--- a/src/documents/static/documents/js/pdf.js
+++ b/src/documents/static/documents/js/pdf.js
--- a/src/documents/static/documents/js/pdf.js.map
+++ b/src/documents/static/documents/js/pdf.js.map
--- a/src/documents/static/documents/js/pdf.worker.js
+++ b/src/documents/static/documents/js/pdf.worker.js
--- a/src/documents/static/documents/js/pdf.worker.js.map
+++ b/src/documents/static/documents/js/pdf.worker.js.map
--- a/src/documents/static/paperless.css
+++ b/src/documents/static/paperless.css
@@ -20,4 +20,17 @@ td a.tag {
 #result_list td textarea {
  width: 90%;
  height: 5em;
+}
+
+#change_form_twocolumn_parent {
+  display: flex;
+}
+#change_form_form_parent {
+  flex:50%;
+  margin-right: 10px;
+}
+#change_form_viewer_parent {
+  flex:50%;
+  margin-left: 10px;
+  text-align: center;
 }
--- a/src/documents/templates/admin/documents/document/change_form.html
+++ b/src/documents/templates/admin/documents/document/change_form.html
@@ -4,6 +4,27 @@

 {{ block.super }}

+{% if file_type in "pdf jpg png" %}
+
+	<div id="change_form_twocolumn_parent">
+		<div id="change_form_form_parent"></div>
+		<div id="change_form_viewer_parent">
+			{% if file_type == "pdf" %}
+				{% include "admin/documents/document/viewers/viewer_pdf.html" %}
+			{% endif %}
+			{% if file_type in "jpg png" %}
+				{% include "admin/documents/document/viewers/viewer_image.html" %}
+			{% endif %}
+		</div>
+	</div>
+
+	<script>
+		django.jQuery("#change_form_form_parent").append(django.jQuery("#document_form"));
+		django.jQuery("#content-main").append(django.jQuery("#change_form_twocolumn_parent"));
+	</script>
+
+{% endif %}
+
 {% if next_object %}
 	<script type="text/javascript">//<![CDATA[
 		(function($){
--- a/src/documents/templates/admin/documents/document/change_list_results.html
+++ b/src/documents/templates/admin/documents/document/change_list_results.html
@@ -24,7 +24,8 @@
    border: 1px solid #cccccc;
    border-radius: 2%;
    overflow: hidden;
-    height: 300px;
+    height: 350px;
+    position: relative;
  }
  .result .header {
    padding: 5px;
@@ -60,6 +61,11 @@
  .result a.tag {
    color: #ffffff;
  }
+  .result .documentType {
+    padding: 5px;
+    background-color: #eeeeee;
+    text-align: center;
+  }
  .result .date {
    padding: 5px;
  }
@@ -79,6 +85,15 @@
  .result .image img {
    width: 100%;
  }
+  .result .footer {
+    position: absolute;
+    bottom: 0;
+    right: 0;
+    border-left: 1px solid #cccccc;
+    border-top: 1px solid #cccccc;
+    padding: 4px 10px 4px 10px;
+    background: white;
+  }

  .grid {
    margin-right: 260px;
@@ -152,7 +167,9 @@
    {# 4: Image #}
    {# 5: Correspondent #}
    {# 6: Tags #}
-    {# 7: Document edit url #}
+    {# 7: Archive serial number #}
+    {# 8: Document type #}
+    {# 9: Document edit url #}
    <div class="box">
      <div class="result">
        <div class="header">
@@ -166,7 +183,7 @@
            selection would not be possible with mouse click + drag. Instead,
            the underlying link would be dragged.
          {% endcomment %}
-          <div class="headerLink" onclick="location.href='{{ result.7 }}';"></div>
+          <div class="headerLink" onclick="location.href='{{ result.9 }}';"></div>
          <div class="checkbox">{{ result.0 }}</div>
          <div class="info">
            {{ result.5 }}
@@ -174,10 +191,14 @@
          {{ result.1 }}
          <div style="clear: both;"></div>
        </div>
+        {% if '>-<' not in result.8 %}<div class="documentType">{{ result.8 }}</div>{% endif %}
        <div class="tags">{{ result.6 }}</div>
        <div class="date">{{ result.2 }}</div>
        <div style="clear: both;"></div>
        <div class="image">{{ result.4 }}</div>
+        {# Only show the archive serial number if it is set on the document. #}
+        {# checking for >-< (i.e., will a dash be displayed) doesn't feel like a very good solution to me. #}
+        {% if '>-<' not in result.7 %}<div class="footer">#{{ result.7 }}</div>{% endif %}
      </div>
    </div>
  {% endfor %}
--- a/src/documents/templates/admin/documents/document/select_object.html
+++ b/src/documents/templates/admin/documents/document/select_object.html
--- a/src/documents/templates/admin/documents/document/viewers/viewer_image.html
+++ b/src/documents/templates/admin/documents/document/viewers/viewer_image.html
@@ -0,0 +1 @@
+<img src="{{download_url}}" style="max-width: 100%">
--- a/src/documents/templates/admin/documents/document/viewers/viewer_pdf.html
+++ b/src/documents/templates/admin/documents/document/viewers/viewer_pdf.html
@@ -0,0 +1,130 @@
+{% load static %}
+
+<div>
+    <input id="prev" value="Previous" class="default" type="button">
+    <input id="next" value="Next" class="default" type="button">
+    &nbsp; &nbsp;
+    <span>Page: <span id="page_num"></span> / <span id="page_count"></span></span>
+    &nbsp; &nbsp;
+    <input id="zoomin" value="+" class="default" type="button">
+    <input id="zoomout" value="-" class="default" type="button">
+</div>
+
+<div style="width: 100%; overflow: auto;">
+    <canvas id="the-canvas"></canvas>
+</div>
+<script type="text/javascript" src="{% static 'documents/js/pdf.js' %}"></script>
+<script type="text/javascript" src="{% static 'documents/js/pdf.worker.js' %}"></script>
+
+{# Load and display PDF document#}
+<script>
+var pdfjsLib = window['pdfjs-dist/build/pdf'];
+
+var pdfDoc = null,
+    pageNum = 1,
+    pageRendering = false,
+    pageNumPending = null,
+    scale = 1.0,
+    canvas = document.getElementById('the-canvas'),
+    ctx = canvas.getContext('2d');
+
+/**
+ * Get page info from document, resize canvas accordingly, and render page.
+ * @param num Page number.
+ */
+function renderPage(num) {
+    pageRendering = true;
+    // Using promise to fetch the page
+    pdfDoc.getPage(num).then(function(page) {
+        var viewport = page.getViewport(scale);
+        canvas.height = viewport.height;
+        canvas.width = viewport.width;
+        // Render PDF page into canvas context
+        var renderContext = {
+            canvasContext: ctx,
+            viewport: viewport
+        };
+        var renderTask = page.render(renderContext);
+        // Wait for rendering to finish
+        renderTask.promise.then(function () {
+            pageRendering = false;
+            if (pageNumPending !== null) {
+                // New page rendering is pending
+                renderPage(pageNumPending);
+                pageNumPending = null;
+            }
+        });
+    });
+    // Update the displayed current page number
+    document.getElementById('page_num').textContent = num;
+}
+
+/**
+ * If another page rendering is in progress, waits until the rendering is
+ * finished. Otherwise, executes rendering immediately.
+ */
+function queueRenderPage(num) {
+    if (pageRendering) {
+        pageNumPending = num;
+    } else {
+        renderPage(num);
+    }
+}
+
+/**
+ * Displays previous page.
+ */
+function onPrevPage() {
+    if (pageNum <= 1) {
+        return;
+    }
+    pageNum--;
+    queueRenderPage(pageNum);
+}
+
+document.getElementById('prev').addEventListener('click', onPrevPage);
+
+/**
+ * Displays next page.
+ */
+function onNextPage() {
+    if (pageNum >= pdfDoc.numPages) {
+        return;
+    }
+    pageNum++;
+    queueRenderPage(pageNum);
+}
+
+document.getElementById('next').addEventListener('click', onNextPage);
+
+/**
+ * Zooms in by a factor of 1.2 and re-renders the current page.
+ */
+function onZoomIn() {
+    scale *= 1.2;
+    queueRenderPage(pageNum);
+}
+
+document.getElementById('zoomin').addEventListener('click', onZoomIn);
+
+/**
+ * Zooms out by a factor of 1.2 and re-renders the current page.
+ */
+function onZoomOut() {
+    scale /= 1.2;
+    queueRenderPage(pageNum);
+}
+
+document.getElementById('zoomout').addEventListener('click', onZoomOut);
+
+/**
+ * Asynchronously downloads PDF.
+ */
+pdfjsLib.getDocument("{{download_url}}").then(function (pdfDoc_) {
+    pdfDoc = pdfDoc_;
+    document.getElementById('page_count').textContent = pdfDoc.numPages;
+    // Initial/first page rendering
+    renderPage(pageNum);
+});
+</script>
+
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -20,14 +20,21 @@ from rest_framework.viewsets import (
    ReadOnlyModelViewSet
 )

-from .filters import CorrespondentFilterSet, DocumentFilterSet, TagFilterSet
+from .filters import (
+    CorrespondentFilterSet,
+    DocumentFilterSet,
+    TagFilterSet,
+    DocumentTypeFilterSet
+)
+
 from .forms import UploadForm
-from .models import Correspondent, Document, Log, Tag
+from .models import Correspondent, Document, Log, Tag, DocumentType
 from .serialisers import (
    CorrespondentSerializer,
    DocumentSerializer,
    LogSerializer,
-    TagSerializer
+    TagSerializer,
+    DocumentTypeSerializer
 )


@@ -116,6 +123,17 @@ class TagViewSet(ModelViewSet):
    ordering_fields = ("name", "slug")


+class DocumentTypeViewSet(ModelViewSet):
+    model = DocumentType
+    queryset = DocumentType.objects.all()
+    serializer_class = DocumentTypeSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+    filter_backends = (DjangoFilterBackend, OrderingFilter)
+    filter_class = DocumentTypeFilterSet
+    ordering_fields = ("name", "slug")
+
+
 class DocumentViewSet(RetrieveModelMixin,
                      UpdateModelMixin,
                      DestroyModelMixin,
--- a/src/manage.py
+++ b/src/manage.py
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -58,7 +58,7 @@ if _allowed_hosts:
    ALLOWED_HOSTS = _allowed_hosts.split(",")

 FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")
-    
+
 # Application definition

 INSTALLED_APPS = [
@@ -144,14 +144,18 @@ DATABASES = {
    }
 }

-if os.getenv("PAPERLESS_DBUSER"):
+if os.getenv("PAPERLESS_DBENGINE"):
    DATABASES["default"] = {
-        "ENGINE": "django.db.backends.postgresql_psycopg2",
+        "ENGINE": os.getenv("PAPERLESS_DBENGINE"),
        "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
        "USER": os.getenv("PAPERLESS_DBUSER"),
    }
    if os.getenv("PAPERLESS_DBPASS"):
        DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS")
+    if os.getenv("PAPERLESS_DBHOST"):
+        DATABASES["default"]["HOST"] = os.getenv("PAPERLESS_DBHOST")
+    if os.getenv("PAPERLESS_DBPORT"):
+        DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")


 # Password validation
@@ -209,6 +213,14 @@ MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")
 DATA_UPLOAD_MAX_NUMBER_FIELDS = None


+# Document classification models location
+MODEL_FILE = os.getenv(
+    "PAPERLESS_MODEL_FILE", os.path.join(
+        BASE_DIR, "..", "models", "model.pickle"
+    )
+)
+
+
 # Paperless-specific stuff
 # You shouldn't have to edit any of these values.  Rather, you can set these
 # values in /etc/paperless.conf instead.
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -12,12 +12,14 @@ from documents.views import (
    FetchView,
    LogViewSet,
    PushView,
-    TagViewSet
+    TagViewSet,
+    DocumentTypeViewSet
 )
 from reminders.views import ReminderViewSet

 router = DefaultRouter()
 router.register(r"correspondents", CorrespondentViewSet)
+router.register(r"document_types", DocumentTypeViewSet)
 router.register(r"documents", DocumentViewSet)
 router.register(r"logs", LogViewSet)
 router.register(r"reminders", ReminderViewSet)
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (2, 6, 0)
+__version__ = (1, 0, 1)
--- a/src/paperless_tesseract/tests/samples/tests_date_3.png
+++ b/src/paperless_tesseract/tests/samples/tests_date_3.png
Author	SHA1	Message	Date
Jonas Winkler	c6a51a1cdc	Version bump	2018-12-12 13:25:28 +01:00
Jonas Winkler	4b20d5d4b9	Fixed migration order	2018-12-12 13:13:21 +01:00
Jonas Winkler	cccd183c31	Fixed migration order	2018-12-12 13:11:30 +01:00
Jonas Winkler	1baa203ef2	Merge branch 'release-1.0.0' into dev	2018-12-11 22:58:14 +01:00
Jonas Winkler	c3ce05e1cd	Merge branch 'master' into dev	2018-12-11 22:36:26 +01:00
Jonas Winkler	7659dde16c	Merge remote-tracking branch 'origin/patch-1' into dev	2018-12-11 22:26:20 +01:00
Jonas Winkler	872d657361	Version bump	2018-12-11 14:32:30 +01:00
Daniel Quinn	7b4785bdb9	Merge pull request #450 from erikarvstedt/fix-parser-test Fix date test sample image	2018-12-11 11:43:14 +00:00
Jonas Winkler	ea58c66fd4	Merge branch 'master' into dev	2018-12-11 12:38:15 +01:00
Jonas Winkler	bcd9220021	minor changes	2018-12-11 12:26:44 +01:00
Jonas Winkler	766109ae4e	Merge remote-tracking branch 'upstream/master'	2018-12-11 12:06:15 +01:00
jonaswinkler	baf89cad8e	Update 0022_auto_20181007_1420.py copy paste error.	2018-12-10 18:38:19 +01:00
Daniel Quinn	3c2a1a8c13	Merge pull request #451 from speshak/remote_pg Add DBHOST & DBPORT parameters to settings	2018-12-06 23:38:50 +00:00
Daniel Quinn	1c7047bbb8	Move ipython out of the base dependencies	2018-12-06 23:28:33 +00:00
Scott Peshak	96dafe8c43	Add psycopg2 dependencies to Dockerfile	2018-12-02 16:14:58 -06:00
Scott Peshak	d6896daece	Add psycopg2 to requirements.txt	2018-12-02 16:14:58 -06:00
Scott Peshak	d12f0642f2	Add DBHOST & DBPORT parameters Resolves #445	2018-12-02 15:20:29 -06:00
Erik Arvstedt	a19f0ef97e	Fix date test sample image The previous version of `tests_date_3.png` had too much spacing between the `0` and the `8` glyphs, which resulted in the year getting parsed as `200 8` in Tesseract 3.05.00 (+ tessdata 3.04.00). This caused the date parsing test to fail.	2018-12-02 15:10:21 +01:00
Erik Arvstedt	ec7125b6bb	Fix travis ocr languages The tests need German language support for Tesseract	2018-12-02 15:10:20 +01:00
Jonas Winkler	b347e3347d	Restored tagging functionality	2018-09-27 20:41:16 +02:00
Jonas Winkler	7257cece30	Code style changes	2018-09-26 10:51:42 +02:00
Jonas Winkler	5b9f38d398	Removed the archive tag, as it wasn't really used anyway.	2018-09-25 21:51:38 +02:00
Jonas Winkler	b31d4779bf	Code style changes	2018-09-25 21:12:47 +02:00
Jonas Winkler	60618381f8	Code style adjustments	2018-09-25 16:09:33 +02:00
Jonas Winkler	779ea6a015	Merge branch 'master' into dev	2018-09-25 14:53:21 +02:00
Jonas Winkler	94ede7389d	Merge remote-tracking branch 'upstream/master'	2018-09-25 14:47:12 +02:00
Jonas Winkler	03beca7838	Fixed api issue (some parameter name got renamed)	2018-09-16 13:29:56 +02:00
Jonas Winkler	fb1dcb6e08	Merge branch 'fix-document-viewer' into dev	2018-09-14 16:48:37 +02:00
Jonas Winkler	a298cbd4ce	Merge branch 'fix-document-viewer'	2018-09-14 16:48:27 +02:00
Jonas Winkler	f1a1e7f1a4	fixed document viewer	2018-09-14 16:48:08 +02:00
Jonas Winkler	8371c2399f	Merge branch 'dev'	2018-09-13 14:15:33 +02:00
Jonas Winkler	909586bf25	Code style changed	2018-09-13 14:15:16 +02:00
Jonas Winkler	8d003a6a85	Save and edit next button appears on documents without viewer as well. Made the new recent correspondents filter optional. Disabled by default.	2018-09-13 13:10:05 +02:00
Jonas Winkler	0209b71404	Merge branch 'dev'	2018-09-13 10:29:10 +02:00
Jonas Winkler	0dc3644cc1	Added missing dependencies	2018-09-12 17:43:13 +02:00
Jonas Winkler	fb1a2ee577	Merge branch 'dev'	2018-09-12 17:20:12 +02:00
Jonas Winkler	7c589f71a4	Fixed a few minor issues.	2018-09-12 16:25:23 +02:00
Jonas Winkler	25a6aa909b	removed duplicate code	2018-09-12 13:43:28 +02:00
Jonas Winkler	ef0d37985b	Merge branch 'master' into dev	2018-09-12 11:47:35 +02:00
Jonas Winkler	898931cc03	bugfix	2018-09-11 20:45:36 +02:00
Jonas Winkler	17803e7936	fixed settings	2018-09-11 17:30:46 +02:00
Jonas Winkler	e72735c4f0	Merge remote-tracking branch 'upstream/master'	2018-09-11 14:43:59 +02:00
Jonas Winkler	46a5bc00d7	Merge branch 'machine-learning' into dev	2018-09-11 14:36:21 +02:00
Jonas Winkler	d46ee11143	The classifier works with ids now, not names. Minor changes.	2018-09-11 14:30:18 +02:00
Jonas Winkler	d2534a73e5	changed classifier	2018-09-11 00:33:07 +02:00
Jonas Winkler	11adc94e5e	mode change	2018-09-06 12:00:01 +02:00
Jonas Winkler	04bf5fc094	fixed merge error	2018-09-06 10:15:15 +02:00
Jonas Winkler	d26f940a91	Merge branch 'dev' into machine-learning	2018-09-06 00:29:41 +02:00
Jonas Winkler	13725ef8ee	Merge branch 'master' into dev	2018-09-06 00:28:58 +02:00
Jonas Winkler	6f0ca432c4	Added scikit-learn to requirements	2018-09-06 00:20:44 +02:00
Jonas Winkler	dd8746bac7	fixed the api	2018-09-05 15:29:05 +02:00
Jonas Winkler	8eeded95c4	Merge branch 'dev' into machine-learning	2018-09-05 15:26:39 +02:00
Jonas Winkler	131e1c9dd8	fixed the api	2018-09-05 15:25:14 +02:00
Jonas Winkler	a6b4fc7e81	fixed api	2018-09-05 14:57:37 +02:00
Jonas Winkler	cea880f245	implemented automatic classification field functionality	2018-09-05 14:31:02 +02:00
Jonas Winkler	82bc0e3368	Fixed a few things	2018-09-05 12:43:11 +02:00
Jonas Winkler	70bd05450a	removed matching model fields, automatic classifier reloading, added automatic_classification field to matching model	2018-09-04 18:40:26 +02:00
Jonas Winkler	c765ef5eeb	Merge remote-tracking branch 'upstream/master'	2018-09-04 16:02:48 +02:00
Jonas Winkler	30134034e2	Fixed documents not being saved after modification	2018-09-04 15:33:51 +02:00
Jonas Winkler	8a1a736340	Merge branch 'document-type' into dev	2018-09-04 14:55:59 +02:00
Jonas Winkler	68652c8c37	Document Type exporting	2018-09-04 14:55:29 +02:00
Jonas Winkler	c091eba26e	Implemented the classifier model, including automatic tagging of new documents	2018-09-04 14:39:55 +02:00
Jonas Winkler	ca315ba76c	Added code that trains models based on data from the database	2018-09-03 15:55:41 +02:00
Jonas Winkler	350da81081	Added command to create datasets	2018-09-02 12:47:19 +02:00
Jonas Winkler	4129002086	Added static to ignore	2018-09-02 11:46:45 +02:00
Jonas Winkler	781a1dae71	- added recent correspondents filter - sortable document_count fields - added last correspondence field to CorrespondentAdmin	2018-08-28 15:42:39 +02:00
Jonas Winkler	01fed4f49d	Removed WebDAV from dev, since it is kind of broken.	2018-08-28 12:12:29 +02:00
Jonas Winkler	d7ab69fed9	Added document type	2018-08-24 13:45:15 +02:00
Jonas Winkler	dfa5ea423f	Merge branch 'ui-improvements' into dev	2018-07-16 20:56:49 +02:00
Jonas Winkler	a698a1b66b	Different way to get the changelist.	2018-07-16 18:35:01 +02:00
Jonas Winkler	a5129018d2	Merge branch 'ui-improvements' into dev	2018-07-16 18:19:05 +02:00
Jonas Winkler	e3974c68ba	bugfix	2018-07-16 18:01:27 +02:00
Jonas Winkler	d72604eb86	Merge branch 'ui-improvements' into dev	2018-07-16 16:09:41 +02:00
Jonas Winkler	f0c94cc65f	Added 'save and edit next' functionality	2018-07-16 16:08:51 +02:00
Jonas Winkler	f21debe95d	css stuff	2018-07-16 14:39:09 +02:00
Jonas Winkler	033ab72475	Merge branch 'workflow-improvements' into dev	2018-07-15 13:42:00 +02:00
Jonas Winkler	b059602050	Merge branch 'db-config' into dev	2018-07-15 13:41:54 +02:00
Jonas Winkler	2775dfb735	Merge branch 'ui-improvements' into dev	2018-07-15 13:41:49 +02:00
Jonas Winkler	04384c7037	Merge branch 'master' into dev	2018-07-15 13:41:43 +02:00
Jonas Winkler	75beb91791	added options to change database backend	2018-07-15 13:40:38 +02:00
Jonas Winkler	b138f4b52b	fixed image width	2018-07-15 13:07:00 +02:00
Jonas Winkler	d108a69f1b	added document viewers on document change form for easier editing of metadata, supports pdf, png, jpg	2018-07-14 23:05:28 +02:00
Jonas Winkler	bdaea3915e	Merge branch 'master' into ui-improvements	2018-07-13 11:24:19 +02:00
Jonas Winkler	9e71b70d4b	fixed the api	2018-07-13 11:20:45 +02:00
Jonas Winkler	960340a5db	updated migrations	2018-07-12 11:54:03 +02:00
Jonas Winkler	b3709663f1	Merge branch 'ui-improvements' into dev	2018-07-11 15:07:30 +02:00
Jonas Winkler	9f20175cd3	Merge branch 'workflow-improvements' into dev	2018-07-11 15:05:56 +02:00
Jonas Winkler	adf57b2669	Merge branch 'master' into webdav	2018-07-11 15:02:50 +02:00
Jonas Winkler	f2c32d840e	Added setting to enable webdav (default: disabled), cleaned up the code somewhat.	2018-07-11 14:59:47 +02:00
Jonas Winkler	ba9d7c8892	Moved actions to separate file	2018-07-11 13:02:18 +02:00
Jonas Winkler	270b0487ec	Merge branch 'master' into workflow-improvements	2018-07-10 15:53:38 +02:00
Jonas Winkler	a63880ed19	Merge remote-tracking branch 'upstream/master'	2018-07-10 15:46:46 +02:00
Jonas Winkler	a40737bd0e	Added actions to modify tags and correspondents on multiple documents	2018-07-10 15:39:24 +02:00
Jonas Winkler	c5b315f518	Show document serial number on change list	2018-07-06 18:04:31 +02:00
Jonas Winkler	e143a20f50	automatically update documents whenever a tag or correspondent is changed (this should make the document_retagger and document_correspondent managers somewhat obsolete (?))	2018-07-06 13:51:50 +02:00
Jonas Winkler	c3a144f2ca	inbox tags, archive tags, archive serial number for documents	2018-07-06 13:25:02 +02:00
Jonas Winkler	38bb1f9672	Some minor changes	2018-07-06 11:53:08 +02:00
Jonas Winkler	22da848be4	Updated WebDAV filtering. Filters resulting in empty results are not available anymore.	2018-07-05 17:21:13 +02:00
Jonas Winkler	a53e30e0a5	Initial support for WebDAV. Lots of stuff is not there yet and most of the stuff which is there is not really tested. But it kind of already works.	2018-07-05 16:18:20 +02:00
Jonas Winkler	7a2bd58ef8	Updated date filter to use the drilldown feature of django	2018-07-04 17:10:56 +02:00
Jonas Winkler	8f6231bd34	Updated to Django 2	2018-07-04 17:03:59 +02:00
				`@@ -0,0 +1 @@`
				`<img src="{{download_url}}" style="max-width: 100%">`