#! /bin/bash

# "scan2pdf.sh"
# A script to scan one or multiple pages and convert them to pdf
# -----------------------------------------------------------------
# Copyright (c) 2004-2023 Joerg Hau <hau.joerg(at)gmail.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# -----------------------------------------------------------------
# Revisions:
#   2004-xx-xx, first operational version (JHa)
#   2004-12-10, cmd line parsing and A4/Letter options (JHa)
#   2005-03-02, Zip compression (instead of "Fax") (JHa)
#   2005-03-18, fix in checkfor() error message (JHa)
#   2005-03-30, added resolution on cmd line (JHa)
#   2005-10-04, added option for color and grayscale (JHa)
#   2005-11-30, added option for multi-page scanning (JHa)
#   2006-11-28, grayscale now default (JHa)
#   2006-12-28, workaround for broken "page" instruction
#               in some versions of Imagemagick (JHa)
#   2009-10-31, re-introduced "page" instruction, this fixes
#               also the page size of the final PDF (JHa)
#   2009-11-07, added rotation (JHa)
#   2010-09-05, added pdf2ps && ps2pdf compression step (JHa)
#   2012-10-31, added possibility to wait (for editing slides)
#   2015-03-03, some changes to accommodate the EPSON GT-1500 (JHa)
#   2017-01-30, check for existing output file; improved contrast; 
#               use of ADF; changes for Fujitsu SP-1120  (JHa)
#   2017-02-08, select scanner type via cmd line; added OCR  (JHa)
#   2017-02-12, SP-1120 now defaults  to multi-page scan  (JHa)
#   2017-02-13, added -sPAPERSIZE=a4 -dPDFFitPage (JHa)
#   2017-09-02, bugfix in button-wait for standard flatbed scanners (JHa)
#   2017-09-03, parallel processing works (JHa)
#   2018-06-19, chgd GT1500 switch from -E to -G, added Epson 1660 Photo,
#               addressing two different scanners OK (JHa)
#   2018-07-15, fixed the tesseract page size issue (JHa)
#   2018-08-31, added scanning over the network (JHa)
#   2018-09-15, not using pdfopt anymore due to some unidentified bug (JHa)
#   2018-10-01, minor fixes caused by the removal of pdfopt (JHa)
#   2019-02-23, switch for contrast option (JHa)
#   2019-03-28, switch for tesseract options (JHa)
#   2020-01-30, corrected last message, added explicit device (JHa)
#   2020-02-05, added export OMP_THREAD_LIMIT=1 to limit CPU resource
#               conflicts on processors that support AVX2
#   2020-03-16, added LC_ALL to get rid of error messages in Debian 10 KDE
#   2020-04-05, refined checkfor() error message
#   2021-11-05, modified defaults
#   2022-02-09, now uses temp files (and deletes these upon error)
#   2022-02-16, option to launch PDF viewer after scanning
#   2022-07-08, added "set -e" in case some command does not work as intended
#   2022-08-09, all echo commands now go to stderr
#   2022-08-25, revised settings to allow 1-page scanning on Fujitsu (explicitly 
#               specify "-p 1"). Added some time measurement for OCR process.
#   2023-04-15, added info for 1-page scanning on Fujitsu
#   2024-01-15, changed order of commands to avoid "scanimage: rounded value
#               of br-y from 380 to 297" errors for Custom page size
#   2024-01-17, corrected some errors I introduced with the last change ;-)
# -----------------------------------------------------------------

# -----------------------------------------------------------------
# Idea: options could be queried with scanimage -h but this is slow
# -----------------------------------------------------------------

# Force the plain "C" locale; Debian 10 and 11 otherwise have problems
# with the LC_* settings.
#
export LC_ALL="C"

# Paper size and scan resolution used when the command line does not
# say otherwise. Some scanners support only a fixed set of resolutions.
#
SIZE='A4'
RES='225'

# Names of the scan modes. The exact syntax depends a bit on the scanner,
# e.g. "binary" (Epson) may have to be replaced by "lineart" (HP);
# 'scanimage --help' shows what your device accepts.
#
MODE_BW='Binary'
MODE_GRAY='Gray'
MODE_COL='Color'

# Scan in color unless another mode is selected.
#
MODE="${MODE_COL}"

# Number of pages to scan.
#
PAGES='1'

# Becomes non-empty once a page count was passed on the command line;
# empty means "not passed".
#
PAGES_CMD=''

# scanimage option for the scanner button. The exact syntax depends on
# the scanner, see also "--batch-prompt". For one page the button is
# not waited for.
#
BUTTON_CMD='--wait-for-button=no'

# Non-empty = pause after scanning so that pages can be edited.
#
PAUSE=''

# Language used for OCR, given as a tesseract code ("eng", "fra", "deu").
# Empty = no OCR is performed.
#
OCRLANG='fra'

# Compression options for convert; PDF supports embedded zip compression.
#
COMPR='-compress Zip'

# Rotation options for convert. Empty = portrait, do not rotate.
#
ROT=''

# Scan source. Empty = flatbed (otherwise "--source ADF" or the like).
#
SRC=''

# Contrast options for convert. You may play with 15%,85%; for faint
# text leave this out (option "-C").
#
CONTRAST='-level 3%,97%'

# Additional tesseract options.
# "-c debug_file=/dev/null" invokes quiet mode, see
# https://stackoverflow.com/questions/31806648/tesseract-quiet-mode
#
TESSOPT='-c debug_file=/dev/null'

# Program used to display the finished PDF.
#
VIEWER='atril'


# --------------------------------------------------------------
# subroutine to check for some required executables
# arguments: one or more program names to test for
# outputs:   an error message on stderr for the first program
#            that cannot be found
# will exit with rc=1 if any program was not found
# --------------------------------------------------------------
function checkfor()
{
    local prog
    for prog in "$@"; do
        # 'command -v' is a shell builtin, so the check itself does not
        # depend on the external 'which' utility being installed.
        if ! command -v "$prog" >/dev/null 2>&1 ; then
            echo "Problem: '$prog' command not found, please install it." >&2
            exit 1
        fi
    done
}

# Programs that every run needs. The PDF viewer is not listed here:
# it is checked separately further below, when viewing is requested.
checkfor scanimage convert pdftk pdf2ps ps2pdf rm parallel mktemp

# ------------------------------------------------------------
# subroutine to print usage mode
# arguments: none
# outputs:   the help text on stderr; the current values of
#            $MODE, $RES, $SIZE and $OCRLANG are inserted
# ------------------------------------------------------------
function usage()
{
cat >&2 << eof
${0##*/} - scan to PDF

A script to scan one or more pages and convert them to PDF.

Copyright (c) 2004...2023 Joerg Hau <hau.joerg(at)gmail.com>.

This program is free software; you can redistribute it and/or
modify it under the terms of version 2 of the GNU General Public
License as published by the Free Software Foundation.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Usage: ${0##*/} [options] path/to/outfile[.pdf]

Options:
    -4, --A4          use A4 paper size.
    -l, --letter      use Letter paper size.
    -s, --sizeOther   use 'other' paper size (as specified inside this script)
    -p, --pages x     scan x pages and combine them (with ADF, use big numbers like "199").
    -r resolution     use this resolution.
    -b, --bw          scan in black/white.
    -g, --gray        scan in grayscale
    -c, --color       scan in color (default).
    -C, --contrast    do NOT use contrast adjustment (use this for faint documents)
    -L, --Left        rotate 90 degrees.
    -R, --Right       rotate 270 degrees.
    -e, --edit        pause to edit pages before compressing
    -o, --ocr lang    run OCR with language 'lang' (deu, fra, eng / default = fra)
    -E, --Epson       use Epson 1660 Photo scanner
    -X, --ENet        use Epson 1660 Photo scanner over network
    -H, --HP          use HP all-in-one scanner
    -G, --GT1500      use ADF of Epson GT1500 scanner
    -F, --SP1120      use ADF of Fujitsu SP1120 document scanner 
    -n, --nomulti     do NOT use SP1120 multifeed detection (implies -F)
    -v, --view        launch PDF viewer after scanning
    -? | -h | --help  this help ;-)

    Invoking "${0##*/}" with no options will
    - initiate a single-page scan in $MODE at $RES dpi in $SIZE size,
    - perform OCR in $OCRLANG,
    - and convert the resulting file to compressed PDF.

eof
}

# --------------------------------------------------------------
# parse & handle command line parameters (flags)
#
# Options are consumed one by one from the front of the argument
# list. The loop ends at the first word that is not an option; that
# word stays in $1 and is used as the output file name further below.
# --------------------------------------------------------------
while true; do
    case "${1-}" in
        -4 | --A4 )         SIZE="A4"
                            ;;
        -l | --letter )     SIZE="letter"
                            ;;
        -s | --sizeOther | --other )
                            # both long spellings are accepted
                            SIZE="other"
                            ;;
        -p | --pages )      shift
                            PAGES="${1-}"
                            PAGES_CMD="1"   # any non-empty string will do
                            ;;
        -r | --resolution ) shift
                            RES="${1-}"
                            ;;
        -b | --bw )         MODE=$MODE_BW
                            ;;
        -g | --gray )       MODE=$MODE_GRAY
                            ;;
        -c | --color )      MODE=$MODE_COL
                            ;;
        -C | --contrast )   CONTRAST=""
                            ;;
        -L | --Left )       ROT="-rotate 270"
                            ;;
        -R | --Right )      ROT="-rotate 90"
                            ;;
        -e | --edit )       PAUSE="pause"
                            ;;
        -o | --ocr )        shift
                            OCRLANG="${1-}"
                            ;;
        -E | --Epson )      SCANNER="EPSON"
                            SRC=""
                            MULTIFEED=""
                            ;;
        -X | --ENet )       SCANNER="EPSON-NET"
                            SRC=""
                            MULTIFEED=""
                            ;;
        -G | --GT1500 )     SCANNER="GT1500"
                            SRC="ADF"
                            MULTIFEED=""
                            ;;
        -F | --SP1120 )     SCANNER="SP1120"
                            SRC="ADF"
                            MULTIFEED="Stop"
                            ;;
        -n | --nomulti )    SCANNER="SP1120"
                            SRC="ADF"
                            MULTIFEED="Do-not-detect"
                            ;;
        -H | --HP )         SCANNER="HP"
                            SRC=""
                            MULTIFEED=""
                            ;;
        -v | --view )       DO_VIEW="1"     # any non-empty string will do
                            ;;
        -h | -? | --help )  # the unquoted '?' is a glob, so every other
                            # single-letter option ends up here as well
                            usage
                            exit 2
                            ;;
        -?* )               # unknown long option: refuse it instead of
                            # taking it for the output file name
                            echo "Problem: unknown option '$1'." >&2
                            usage
                            exit 2
                            ;;
        * )                 break
                            ;;
    esac
    shift
done

# --------------------------------------------------------------
# if OCR is requested, check for tesseract and ghostscript
# (an OCR language that is empty or consists of blanks only
# means that no OCR is requested)
# -------------------------------------------------------------
if [ -n "${OCRLANG// }" ] ; then
    checkfor tesseract gs
fi

# --------------------------------------------------------------
# if view is requested, check for viewer
# (the variable must be quoted: an unquoted empty value would make
# the test succeed)
# -------------------------------------------------------------
if [ -n "${DO_VIEW-}" ] ; then
    checkfor "$VIEWER"
fi

# --------------------------------------------------------------
# if script is called w/o remaining arguments, give instructions
# --------------------------------------------------------------
if [ -z "${1-}" ] ; then
    usage
    exit 1
fi

# --------------------------------------------------------------
# if the output file exists, do not ask but exit
# (quoted, so that file names containing blanks are tested correctly)
# --------------------------------------------------------------
if [ -e "${1%.*}.pdf" ] ; then
    echo "File ${1%.*}.pdf exists! Exiting." >&2
    exit 2
fi

# --------------------------------------------------------------
# Translate the paper size name into the scan area X and Y.
# The values of "other" can be anything that you assign here.
# Any other name leaves X and Y untouched.
# --------------------------------------------------------------
if [ "$SIZE" = "A4" ] ; then
    X="210" ; Y="297"
elif [ "$SIZE" = "letter" ] ; then
    X="215" ; Y="280"
elif [ "$SIZE" = "other" ] ; then
    X="165" ; Y="240"
fi

# --------------------------------------------------------------
# Assign other variables depending on scanner and cmd line
# The syntax varies between scanners!
# Note: we could actually run all scanners over the network ;-)
#
# Sets RES_CMD (resolution / paper options), ADF_CMD (device and
# source options), MULTIFEED_CMD and BUTTON_CMD; these strings are
# handed to scanimage later on.
# --------------------------------------------------------------
case "${SCANNER-}" in
    "GT1500" )  RES_CMD="--x-resolution ${RES}dpi --y-resolution ${RES}dpi"
                ADF_CMD="--source Automatic"
                MULTIFEED_CMD=""
                BUTTON_CMD="--wait-for-button=no"
                ;;
    "SP1120" )  RES_CMD="--resolution ${RES}dpi --paper-size Custom --page-width $X --page-height $Y"
                # if you want to scan single-side only, use '--source Adf-front'
                ADF_CMD="-d pfusp --source Adf-duplex --blank-page-skip=yes --blank-page-skip-sensitivity 5 --page-auto=yes --autofeed=yes "
                MULTIFEED_CMD="--multifeed-detection ${MULTIFEED}"
                BUTTON_CMD=""
                # if number of pages was NOT passed on command line, set default to 300 for ADF
                if [ -z "$PAGES_CMD" ] ; then PAGES="300" ; fi
                ;;
    "HP" )      RES_CMD="--resolution ${RES}dpi"
                ADF_CMD=""
                MULTIFEED_CMD=""
                BUTTON_CMD=""
                if [ "$Y" == "297" ] ; then Y="296.9" ; fi  # many HP cannot scan full A4 size
                ;;
    "EPSON" )   RES_CMD="--resolution ${RES}dpi"
                ADF_CMD="-d epkowa "  # we're re-using that variable to assign the device
                MULTIFEED_CMD=""
                BUTTON_CMD="--wait-for-button=yes"
                # a single page is scanned right away, without waiting for the button
                if [ "$PAGES" == "1" ] ; then BUTTON_CMD="" ; fi
                ;;
    "EPSON-NET" )
                RES_CMD="--resolution ${RES}dpi"
                ADF_CMD="-d net:192.168.11.99:epkowa "  # we're re-using that variable to assign the device
                MULTIFEED_CMD=""
                BUTTON_CMD="--wait-for-button=yes"
                if [ "$PAGES" == "1" ] ; then BUTTON_CMD="" ; fi
                ;;
    * )         # No scanner was selected on the command line, so no device
                # is named here. The requested resolution is passed anyway,
                # because the messages and the PDF conversion further below
                # assume that the scan was made at $RES dpi.
                # BUTTON_CMD is left as it is.
                RES_CMD="--resolution ${RES}dpi"
                ADF_CMD=""
                MULTIFEED_CMD=""
                ;;
esac

# Tell the user (on stderr) what is about to happen.
printf '%s\n' "Scanning ${PAGES} page(s) of ${SIZE} paper ($Y) in ${MODE} at ${RES} dpi, output file is '${1%.*}.pdf'." >&2

if [[ -n "$SRC" ]] ; then
    printf '%s\n' "Using Auto Document Feeder (ADF), button disabled." >&2
fi
if [[ -n "$ROT" ]] ; then
    printf '%s\n' "Image will be rotated." >&2
fi
if [[ -n "$PAUSE" ]] ; then
    printf '%s\n' "Will pause for editing before compression." >&2
fi
if [[ -n "${OCRLANG// }" ]] ; then
    printf '%s\n' "OCR in $OCRLANG requested." >&2
fi

# Multi-page mode with a button command set (non-ADF scan): ask for the
# first page, framed by one empty line above and below.
if (( $PAGES > 1 )) && [[ -n "$BUTTON_CMD" ]] ; then
    printf '\n%s\n\n' "=== Put first page on scanner and press the scan button when you're ready. ===" >&2
fi

# Create a temporary file in the current directory. Its name is used as
# the common prefix of all intermediate files (<prefix>001.pnm, ...).
# Abort if this fails: with an empty prefix, the "${OUT}*" patterns used
# for cleanup would match every file in the current directory.
#
OUT=$(mktemp "${0##*/}.XXXXXXXXXX") || {
    echo "Problem: could not create a temporary file in '$PWD'." >&2
    exit 1
}

# make sure that these files are deleted later on
# (${OUT:?} refuses to expand if the prefix is empty)
#
trap 'rm -f -- "${OUT:?}"*' EXIT

# If you use a non-standard paper size, "$ADF_CMD --paper-size Custom"
# must come BEFORE you specify the page size.
#
# $ADF_CMD, $RES_CMD, $MULTIFEED_CMD and $BUTTON_CMD are left unquoted on
# purpose: each of them holds several blank-separated options, or nothing.
#
scanimage $ADF_CMD $RES_CMD -x "$X" -y "$Y" --mode "$MODE" $MULTIFEED_CMD \
          "--batch=${OUT}%03d.pnm" "--batch-count=$PAGES" $BUTTON_CMD

# If problem => exit // this does not work when the scanner runs out of pages
#
# if [ $? != 0 ]; then
#     echo "*** Some problem occurred while scanning. Abort - status: $?"
#     exit 1
# fi

# Note: In ADF mode, we cannot use errorlevel checking at this point.
#       this bails out as soon as the scanner runs out of paper ...

# From here on, leave the script when a command does not work as intended.
set -o errexit

# If you copy and edit the pnm files during the pause below, you can use
# the following commands to convert them to a single pdf afterwards:
#
# for i in out*.pnm; do convert -page A4 -compress Zip  $i ${i%.*}.pdf && rm $i; done
# pdftk out*.pdf cat output xx.pdf
# pdfopt xx.pdf yy.pdf
# rm out*.pdf xx.pdf
# pdf2ps yy.pdf yy.ps && ps2pdf yy.ps yy.pdf
# rm yy.ps
# mv yy.pdf /path/to/file.pdf

# Optional pause for editing; the conversion start time is taken only
# after the pause (if there was one) returned successfully.
{ [ -z "$PAUSE" ] || read -p "Scanning finished, press [Enter] to continue: " >&2 ; } &&
    start_time="$(date '+%s.%2N')"

printf '%s' "Conversion to pdf in progress ... " >&2 &&
if [ -z "${OCRLANG// }" ] ; then
    # No OCR language set: just convert every scanned page to PDF,
    # then remove the scans.
    ls -1 ${OUT}*.pnm | parallel convert $CONTRAST $COMPR $ROT -page $SIZE '{}' '{.}.pdf' &&
        rm ${OUT}*.pnm
else
    # OCR: convert the scans to PNG and run tesseract on them.
    #
    # OMP_THREAD_LIMIT=1 limits CPU resource conflicts on processors
    # that support AVX2, see
    # https://github.com/the-paperless-project/paperless/issues/438
    #
    export OMP_THREAD_LIMIT=1

    # Processing through tesseract will cause the page size to be wrong
    # (too big), so the page size is added with -density and -units
    # before running tesseract, see
    # https://github.com/tesseract-ocr/tesseract/issues/150
    #
    ls -1 ${OUT}*.pnm | parallel convert $CONTRAST $COMPR -density ${RES}x${RES} -units PixelsPerInch $ROT '{}' '{.}.png' &&
        ls -1 ${OUT}*.png | parallel tesseract $TESSOPT -l $OCRLANG '{}' '{.}' pdf &&
        rm ${OUT}*.png ${OUT}*.pnm
fi

# if we have more than 1 page, invoke pdftk to combine the per-page
# PDFs; a single page is simply moved to the output file.
# All file names are quoted so that an output path containing blanks
# reaches pdftk and mv as one argument.
#
if (( $PAGES > 1 )) ; then
    pdftk "${OUT}"*.pdf cat output "${1%.*}.tmp.pdf" && \
    mv -- "${1%.*}.tmp.pdf" "${1%.*}.pdf" && \
    rm -- "${OUT}"*.pdf
else
    mv -- "${OUT}001.pdf" "${1%.*}.pdf"
fi
echo -n "done. " >&2

# measure conversion end time and report the difference to start_time
end_time=$(date +%s.%2N)
echo "This took $(echo "scale=2; $end_time - $start_time" | bc) seconds." >&2

# Compress the output file in place. File names are quoted so that an
# output path containing blanks is passed on as one argument.
echo -n "Compressing file '${1%.*}.pdf' " >&2 && \
if [ -n "${OCRLANG// }" ] ; then
    echo -n "using Ghostscript ... " >&2
    # If OCR was requested, we need to rewrite carefully:
    # write to a temporary file first, then replace the output file
    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default \
       -dNOPAUSE -dQUIET -dBATCH -dDetectDuplicateImages -dNOTRANSPARENCY \
       -dCompressFonts=true "-sOutputFile=${OUT}.rewrite.pdf" "${1%.*}.pdf" && \
    mv -- "${OUT}.rewrite.pdf" "${1%.*}.pdf"
else
    echo -n "... " >&2
    # Very efficient in terms of filesize, but destroys all OCR information:
    pdf2ps "${1%.*}.pdf" "${OUT}.ps" && \
    ps2pdf "${OUT}.ps" "${1%.*}.pdf" && \
    rm -- "${OUT}.ps"
fi
echo "done.">&2

# Launch the PDF viewer in the background, but only if this was
# requested. The variable must be quoted: unquoted and empty, the test
# would see the single word "-n" and always succeed.
if [ -n "${DO_VIEW-}" ] ; then
    $VIEWER "${1%.*}.pdf" &
    echo "done.">&2
fi

# leave with the status of the command that ran last
#
exit $?
