infra/paperless.compose.yml

123 lines
4.4 KiB
YAML

# Paperless-ngx — docs.manohargupta.com
# OCR + full-text search for PDFs and Office docs (lender drafts, tariff schedules, etc.)
# 5 containers: webserver, redis broker, postgres, tika (Office), gotenberg (PDF render).
# Tika + Gotenberg add ~400 MB RAM but are essential for .docx/.xlsx indexing.
# First boot is slow (~90s) -- DB migrations run before the web UI becomes available.
services:
  # Redis: job queue between the web UI and the OCR/consumer worker.
  paperless-broker:
    image: redis:7-alpine
    restart: unless-stopped
    volumes:
      - paperless_redis:/data
    networks:
      - paperless_internal

  # Postgres: document metadata, tags, correspondents.
  paperless-db:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: paperless
      POSTGRES_USER: paperless
      # ':?' makes Compose abort with this message when the variable is unset
      # or empty, instead of silently substituting an empty password.
      POSTGRES_PASSWORD: "${PAPERLESS_DB_PASSWORD:?PAPERLESS_DB_PASSWORD must be set}"
    volumes:
      - paperless_db_data:/var/lib/postgresql/data
    networks:
      - paperless_internal
    # The paperless service waits on this check (depends_on: service_healthy).
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U paperless -d paperless"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Gotenberg: renders Office files (docx, xlsx) to PDF before OCR.
  paperless-gotenberg:
    image: docker.io/gotenberg/gotenberg:8
    restart: unless-stopped
    command:
      - "gotenberg"
      - "--chromium-disable-javascript=true"  # Security: no JS execution
      - "--chromium-allow-list=file:///tmp/.*"  # Chromium may only load file:// URLs under /tmp
    networks:
      - paperless_internal

  # Tika: extracts text from Office formats that Gotenberg can't handle alone.
  # NOTE(review): ':latest' is unpinned, so a re-pull can change the version
  # without any change to this file -- consider pinning a release tag.
  paperless-tika:
    image: docker.io/apache/tika:latest
    restart: unless-stopped
    networks:
      - paperless_internal

  # Main app: web UI + OCR worker + consumer (watches the consume volume).
  # NOTE(review): ':latest' is unpinned here as well -- consider pinning a
  # release tag so upgrades (and their DB migrations) happen deliberately.
  paperless:
    image: ghcr.io/paperless-ngx/paperless-ngx:latest
    restart: unless-stopped
    # Only Postgres has a healthcheck to wait on; the other dependencies are
    # merely required to have been started.
    depends_on:
      paperless-db:
        condition: service_healthy
      paperless-broker:
        condition: service_started
      paperless-gotenberg:
        condition: service_started
      paperless-tika:
        condition: service_started
    environment:
      PAPERLESS_REDIS: redis://paperless-broker:6379
      PAPERLESS_DBHOST: paperless-db
      PAPERLESS_DBNAME: paperless
      PAPERLESS_DBUSER: paperless
      # Must match POSTGRES_PASSWORD on paperless-db; fails fast when missing.
      PAPERLESS_DBPASS: "${PAPERLESS_DB_PASSWORD:?PAPERLESS_DB_PASSWORD must be set}"
      # Secret key for Django session signing -- must be stable across restarts.
      PAPERLESS_SECRET_KEY: "${PAPERLESS_SECRET_KEY:?PAPERLESS_SECRET_KEY must be set}"
      PAPERLESS_URL: https://docs.manohargupta.com
      # Office doc support via Tika + Gotenberg.
      PAPERLESS_TIKA_ENABLED: "1"
      PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://paperless-gotenberg:3000
      PAPERLESS_TIKA_ENDPOINT: http://paperless-tika:9998
      # OCR: 'skip' means don't re-OCR docs that already have a text layer (faster).
      # For Hindi documents set the language to eng+hin (adds ~200 MB) and also
      # add PAPERLESS_OCR_LANGUAGES: hin so the container installs that
      # Tesseract language pack.
      PAPERLESS_OCR_LANGUAGE: eng
      PAPERLESS_OCR_MODE: skip
      PAPERLESS_TIME_ZONE: Asia/Kolkata
      # Quoted so the IDs reach the container as strings, not YAML integers.
      USERMAP_UID: "1000"
      USERMAP_GID: "1000"
    volumes:
      - paperless_data:/usr/src/paperless/data  # search index, models
      - paperless_media:/usr/src/paperless/media  # original files + thumbnails
      - paperless_export:/usr/src/paperless/export  # manual export target
      - paperless_consume:/usr/src/paperless/consume  # drop files here to auto-ingest
    networks:
      - dokploy-network
      - paperless_internal
    # Traefik routing: HTTPS for docs.manohargupta.com, forwarded to container
    # port 8000. These container-level labels apply under plain Docker Compose.
    labels:
      - "traefik.enable=true"
      - "traefik.docker.network=dokploy-network"
      - "traefik.http.routers.paperless.rule=Host(`docs.manohargupta.com`)"
      - "traefik.http.routers.paperless.entrypoints=websecure"
      - "traefik.http.routers.paperless.tls.certresolver=letsencrypt"
      - "traefik.http.services.paperless.loadbalancer.server.port=8000"
    # Same labels again at service level, which is where a Swarm stack deploy
    # attaches them. Keep both lists identical.
    # NOTE(review): presumably kept so the file works in either mode -- confirm
    # which deploy mode is actually used and drop the other copy if unneeded.
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.docker.network=dokploy-network"
        - "traefik.http.routers.paperless.rule=Host(`docs.manohargupta.com`)"
        - "traefik.http.routers.paperless.entrypoints=websecure"
        - "traefik.http.routers.paperless.tls.certresolver=letsencrypt"
        - "traefik.http.services.paperless.loadbalancer.server.port=8000"

# Named volumes referenced by the services above. Each is declared with an
# explicit empty mapping, i.e. no driver or options are overridden.
volumes:
  paperless_redis: {}
  paperless_db_data: {}
  paperless_data: {}
  paperless_media: {}
  paperless_export: {}
  paperless_consume: {}

networks:
  # Declared external: it must already exist and is not created by this file.
  # Only the paperless service joins it, and its Traefik labels name it as the
  # network to route over.
  dokploy-network:
    external: true
  # Bridge network joined by all five services for traffic between them.
  paperless_internal:
    driver: bridge