fix wrong git ignore

2025-12-15 17:44:00 +01:00
parent ed9f31bb96
commit 8f313c00f0
126 changed files with 70589 additions and 1 deletions
--- a/internal/pdf/core/const.go
+++ b/internal/pdf/core/const.go
@@ -0,0 +1,13 @@
+package core
+
+import "errors"
+
+var (
+	// ErrUnsupportedEncodingParameters error indicates that encoding/decoding was attempted with unsupported
+	// encoding parameters.
+	// For example when trying to encode with an unsupported Predictor (flate).
+	ErrUnsupportedEncodingParameters = errors.New("unsupported encoding parameters")
+
+	// ErrNoCCITTFaxDecode, ErrNoJBIG2Decode and ErrNoJPXDecode are sentinel errors whose messages state
+	// that the named stream filter is not yet implemented. The messages carry no leading whitespace so
+	// that they read cleanly when logged or wrapped by a caller.
+	ErrNoCCITTFaxDecode = errors.New("CCITTFaxDecode encoding is not yet implemented")
+	ErrNoJBIG2Decode    = errors.New("JBIG2Decode encoding is not yet implemented")
+	ErrNoJPXDecode      = errors.New("JPXDecode encoding is not yet implemented")
+)
--- a/internal/pdf/core/crossrefs.go
+++ b/internal/pdf/core/crossrefs.go
@@ -0,0 +1,372 @@
+package core
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"os"
+	"strings"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// TODO (v3): Create a new type xrefType which can be an integer and can be used for improved type checking.
+// TODO (v3): Unexport these constants and rename with camelCase.
+// The constants below are the values stored in XrefObject.xtype.
+const (
+	// XREF_TABLE_ENTRY indicates a normal xref table entry (value 0). Such an entry is resolved by seeking
+	// to a file offset.
+	XREF_TABLE_ENTRY = iota
+
+	// XREF_OBJECT_STREAM indicates an xref entry in an xref object stream (value 1). Such an entry is
+	// resolved by reading the object out of another, containing object stream.
+	XREF_OBJECT_STREAM = iota
+)
+
+// XrefObject defines a cross reference entry which is a map between object number (with generation number) and the
+// location of the actual object, either as a file offset (xref table entry), or as a location within an xref
+// stream object (xref object stream).
+// TODO (v3): Unexport.
+type XrefObject struct {
+	// xtype selects which of the location fields below is meaningful; it is compared against
+	// XREF_TABLE_ENTRY and XREF_OBJECT_STREAM during lookup.
+	xtype        int
+	// objectNumber and generation identify the object this entry describes.
+	objectNumber int
+	generation   int
+	// For normal xrefs (defined by OFFSET)
+	// offset is the position the underlying file is seeked to before parsing the object.
+	offset int64
+	// For xrefs to object streams.
+	// osObjNumber is the object number of the containing object stream. osObjIndex is only logged by the
+	// lookup code visible here; presumably it is the index within that stream - TODO confirm.
+	osObjNumber int
+	osObjIndex  int
+}
+
+// XrefTable is a map between object number and corresponding XrefObject.
+// Being a map, iteration over it yields entries in no particular order.
+// TODO (v3): Unexport.
+// TODO: Consider changing to a slice, so can maintain the object order without sorting when analyzing.
+type XrefTable map[int]XrefObject
+
+// ObjectStream represents an object stream's information which can contain multiple indirect objects.
+// The information specifies the number of objects and has information about offset locations for
+// each object.
+// TODO (v3): Unexport.
+type ObjectStream struct {
+	// N is the object count taken from the "N" entry of the stream dictionary.
+	N       int // TODO (v3): Unexport.
+	// ds holds the decoded stream bytes, kept so that later lookups do not decode again.
+	ds      []byte
+	// offsets maps an object number to its position within ds, i.e. the stream's "First" value plus the
+	// relative offset listed for that object.
+	offsets map[int]int64
+}
+
+// ObjectStreams defines a map between object numbers (object streams only) and underlying ObjectStream information.
+// The parser fills it lazily: an entry is added the first time an object is read out of that stream.
+type ObjectStreams map[int]ObjectStream
+
+// ObjectCache defines a map between object numbers and corresponding PdfObject. Serves as a cache for PdfObjects that
+// have already been parsed. The cache is keyed by object number only; generation numbers are not part of the key.
+// TODO (v3): Unexport.
+type ObjectCache map[int]PdfObject
+
+// lookupObjectViaOS reads object objNum out of the object stream whose own object number is sobjNumber.
+//
+// On first use of a given object stream the stream is looked up, validated (Type must be ObjStm, N and
+// First must be integers), decoded, and its offset table parsed; the result is cached in parser.objstms.
+// Later calls reuse the cached decoded bytes and offsets.
+//
+// While parsing, parser.reader is pointed at the decoded stream bytes; a deferred SetFileOffset points
+// it back at the underlying file when the function returns.
+//
+// The parsed object is returned wrapped in a new PdfIndirectObject carrying objNum. An error is returned
+// when the stream is missing, invalid, still encrypted, cannot be decoded or parsed, or when objNum is
+// not listed in the stream's offset table.
+func (parser *PdfParser) lookupObjectViaOS(sobjNumber int, objNum int) (PdfObject, error) {
+	var bufReader *bytes.Reader
+	var objstm ObjectStream
+	var cached bool
+
+	objstm, cached = parser.objstms[sobjNumber]
+	if !cached {
+		soi, err := parser.LookupByNumber(sobjNumber)
+		if err != nil {
+			common.Log.Debug("Missing object stream with number %d", sobjNumber)
+			return nil, err
+		}
+
+		so, ok := soi.(*PdfObjectStream)
+		if !ok {
+			return nil, errors.New("invalid object stream")
+		}
+
+		if parser.crypter != nil && !parser.crypter.isDecrypted(so) {
+			return nil, errors.New("need to decrypt the stream")
+		}
+
+		sod := so.PdfObjectDictionary
+		common.Log.Trace("so d: %s\n", *sod)
+		name, ok := sod.Get("Type").(*PdfObjectName)
+		if !ok {
+			common.Log.Debug("error: Object stream should always have a Type")
+			return nil, errors.New("object stream missing Type")
+		}
+		if strings.ToLower(string(*name)) != "objstm" {
+			common.Log.Debug("error: Object stream type shall always be ObjStm !")
+			return nil, errors.New("object stream type != ObjStm")
+		}
+
+		N, ok := sod.Get("N").(*PdfObjectInteger)
+		if !ok {
+			return nil, errors.New("invalid N in stream dictionary")
+		}
+		firstOffset, ok := sod.Get("First").(*PdfObjectInteger)
+		if !ok {
+			return nil, errors.New("invalid First in stream dictionary")
+		}
+
+		common.Log.Trace("type: %s number of objects: %d", name, *N)
+		ds, err := DecodeStream(so)
+		if err != nil {
+			return nil, err
+		}
+
+		common.Log.Trace("Decoded: %s", ds)
+
+		// Temporarily change the reader object to this decoded buffer.
+		// Change back afterwards.
+		bakOffset := parser.GetFileOffset()
+		defer func() { parser.SetFileOffset(bakOffset) }()
+
+		bufReader = bytes.NewReader(ds)
+		parser.reader = bufio.NewReader(bufReader)
+
+		common.Log.Trace("Parsing offset map")
+		// Load the offset map (relative to the beginning of the stream...)
+		offsets := map[int]int64{}
+		// Object list and offsets.
+		for i := 0; i < int(*N); i++ {
+			parser.skipSpaces()
+			// Object number.
+			obj, err := parser.parseNumber()
+			if err != nil {
+				return nil, err
+			}
+			onum, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, errors.New("invalid object stream offset table")
+			}
+
+			parser.skipSpaces()
+			// Offset.
+			obj, err = parser.parseNumber()
+			if err != nil {
+				return nil, err
+			}
+			offset, ok := obj.(*PdfObjectInteger)
+			if !ok {
+				return nil, errors.New("invalid object stream offset table")
+			}
+
+			common.Log.Trace("obj %d offset %d", *onum, *offset)
+			// Listed offsets are relative to First, so store the absolute position in the decoded data.
+			offsets[int(*onum)] = int64(*firstOffset + *offset)
+		}
+
+		objstm = ObjectStream{N: int(*N), ds: ds, offsets: offsets}
+		parser.objstms[sobjNumber] = objstm
+	} else {
+		// Temporarily change the reader object to this decoded buffer.
+		// Point back afterwards.
+		bakOffset := parser.GetFileOffset()
+		defer func() { parser.SetFileOffset(bakOffset) }()
+
+		bufReader = bytes.NewReader(objstm.ds)
+		// Temporarily change the reader object to this decoded buffer.
+		parser.reader = bufio.NewReader(bufReader)
+	}
+
+	// A missing map entry would read as offset 0, which points at the offset table rather than at an
+	// object; report it instead of parsing the table as if it were the requested object.
+	offset, listed := objstm.offsets[objNum]
+	if !listed {
+		common.Log.Debug("error Object %d is not listed in object stream %d", objNum, sobjNumber)
+		return nil, errors.New("object not found in object stream")
+	}
+	common.Log.Trace("ACTUAL offset[%d] = %d", objNum, offset)
+
+	// Seek fails for a negative position, which a malformed First/offset pair can produce.
+	if _, err := bufReader.Seek(offset, os.SEEK_SET); err != nil {
+		common.Log.Debug("error Invalid object stream offset %d (%s)", offset, err)
+		return nil, err
+	}
+	// A fresh buffered reader is needed because the previous one may hold bytes read before the seek.
+	parser.reader = bufio.NewReader(bufReader)
+
+	// Peek is for tracing only; its error (e.g. fewer than 100 bytes left) is irrelevant here.
+	bb, _ := parser.reader.Peek(100)
+	common.Log.Trace("OBJ peek \"%s\"", string(bb))
+
+	val, err := parser.parseObject()
+	if err != nil {
+		common.Log.Debug("error Fail to read object (%s)", err)
+		return nil, err
+	}
+	if val == nil {
+		return nil, errors.New("object cannot be null")
+	}
+
+	// Make an indirect object around it.
+	io := PdfIndirectObject{}
+	io.ObjectNumber = int64(objNum)
+	io.PdfObject = val
+
+	return &io, nil
+}
+
+// LookupByNumber looks up a PdfObject by object number.  Returns an error on failure.
+// TODO (v3): Unexport.
+func (parser *PdfParser) LookupByNumber(objNumber int) (PdfObject, error) {
+	// Outside interface for lookupByNumberWrapper.  Default attempts repairs of bad xref tables.
+	obj, _, err := parser.lookupByNumberWrapper(objNumber, true)
+	return obj, err
+}
+
+// lookupByNumberWrapper calls lookupByNumber and, when the parser has a crypter, decrypts the result
+// before handing it back. The boolean result tells whether the object came out of an object stream.
+func (parser *PdfParser) lookupByNumberWrapper(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
+	obj, inObjStream, err := parser.lookupByNumber(objNumber, attemptRepairs)
+	if err != nil {
+		return nil, inObjStream, err
+	}
+
+	// Objects inside object streams are never decrypted here; everything else is decrypted once.
+	needsDecrypt := !inObjStream && parser.crypter != nil && !parser.crypter.isDecrypted(obj)
+	if !needsDecrypt {
+		return obj, inObjStream, nil
+	}
+
+	if decErr := parser.crypter.Decrypt(obj, 0, 0); decErr != nil {
+		return nil, inObjStream, decErr
+	}
+	return obj, inObjStream, nil
+}
+
+func getObjectNumber(obj PdfObject) (int64, int64, error) {
+	if io, isIndirect := obj.(*PdfIndirectObject); isIndirect {
+		return io.ObjectNumber, io.GenerationNumber, nil
+	}
+	if so, isStream := obj.(*PdfObjectStream); isStream {
+		return so.ObjectNumber, so.GenerationNumber, nil
+	}
+	return 0, 0, errors.New("not an indirect/stream object")
+}
+
+// lookupByNumber resolves objNumber to a PdfObject using, in order, the object cache, the xref table
+// and - for compressed objects - the containing object stream.
+//
+// attemptRepairs signals whether to try rebuilding the xref table when the entry turns out to be broken
+// (the offset does not point at an object, or the object found there carries a different number).
+// After a repair the lookup is retried once with repairs disabled.
+//
+// The boolean result reports whether the object was loaded from an object stream. An object number
+// without an xref entry is not an error: a null object is returned for it.
+func (parser *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
+	obj, ok := parser.ObjCache[objNumber]
+	if ok {
+		common.Log.Trace("Returning cached object %d", objNumber)
+		return obj, false, nil
+	}
+
+	xref, ok := parser.xrefs[objNumber]
+	if !ok {
+		// An indirect reference to an undefined object shall not be
+		// considered an error by a conforming reader; it shall be
+		// treated as a reference to the null object.
+		common.Log.Trace("Unable to locate object in xrefs! - Returning null object")
+		var nullObj PdfObjectNull
+		return &nullObj, false, nil
+	}
+
+	common.Log.Trace("Lookup obj number %d", objNumber)
+	switch xref.xtype {
+	case XREF_TABLE_ENTRY:
+		common.Log.Trace("xrefobj obj num %d", xref.objectNumber)
+		common.Log.Trace("xrefobj gen %d", xref.generation)
+		common.Log.Trace("xrefobj offset %d", xref.offset)
+
+		// A failed seek is not checked here; the parse below then fails and takes the repair path.
+		parser.rs.Seek(xref.offset, os.SEEK_SET)
+		parser.reader = bufio.NewReader(parser.rs)
+
+		obj, err := parser.ParseIndirectObject()
+		if err != nil {
+			common.Log.Debug("error Failed reading xref (%s)", err)
+			// Offset pointing to a non-object.  Try to repair the file.
+			if attemptRepairs {
+				common.Log.Debug("Attempting to repair xrefs (top down)")
+				xrefTable, err := parser.repairRebuildXrefsTopDown()
+				if err != nil {
+					common.Log.Debug("error Failed repair (%s)", err)
+					return nil, false, err
+				}
+				parser.xrefs = *xrefTable
+				return parser.lookupByNumber(objNumber, false)
+			}
+			return nil, false, err
+		}
+
+		if attemptRepairs {
+			// Check the object number..
+			// If it does not match, then try to rebuild, i.e. loop through
+			// all the items in the xref and look each one up and correct.
+			// The error from getObjectNumber is dropped on purpose: it leaves realObjNum at 0, which
+			// then fails the comparison below for any non-zero objNumber.
+			realObjNum, _, _ := getObjectNumber(obj)
+			if int(realObjNum) != objNumber {
+				common.Log.Debug("invalid xrefs: Rebuilding")
+				err := parser.rebuildXrefTable()
+				if err != nil {
+					return nil, false, err
+				}
+				// Empty the cache.
+				parser.ObjCache = ObjectCache{}
+				// Try looking up again and return.
+				return parser.lookupByNumberWrapper(objNumber, false)
+			}
+		}
+
+		common.Log.Trace("Returning obj")
+		parser.ObjCache[objNumber] = obj
+		return obj, false, nil
+	case XREF_OBJECT_STREAM:
+		common.Log.Trace("xref from object stream!")
+		common.Log.Trace(">Load via OS!")
+		common.Log.Trace("Object stream available in object %d/%d", xref.osObjNumber, xref.osObjIndex)
+
+		// An object cannot live inside itself; following such an entry would never terminate.
+		if xref.osObjNumber == objNumber {
+			common.Log.Debug("error Circular reference!?!")
+			return nil, true, errors.New("xref circular reference")
+		}
+		if _, exists := parser.xrefs[xref.osObjNumber]; !exists {
+			common.Log.Debug("?? Belongs to a non-cross referenced object ...!")
+			return nil, true, errors.New("OS belongs to a non cross referenced object")
+		}
+
+		optr, err := parser.lookupObjectViaOS(xref.osObjNumber, objNumber) //xref.osObjIndex)
+		if err != nil {
+			common.Log.Debug("error Returning ERR (%s)", err)
+			return nil, true, err
+		}
+		common.Log.Trace("<Loaded via OS")
+		parser.ObjCache[objNumber] = optr
+		if parser.crypter != nil {
+			// Mark as decrypted (inside object stream) for caching.
+			// and avoid decrypting decrypted object.
+			parser.crypter.DecryptedObjects[optr] = true
+		}
+		return optr, true, nil
+	}
+	return nil, false, errors.New("unknown xref type")
+}
+
+// LookupByReference resolves a reference to the PdfObject it points at.
+// Only the object number of the reference is used for the lookup.
+func (parser *PdfParser) LookupByReference(ref PdfObjectReference) (PdfObject, error) {
+	common.Log.Trace("Looking up reference %s", ref.String())
+
+	objNumber := int(ref.ObjectNumber)
+	return parser.LookupByNumber(objNumber)
+}
+
+// Trace follows obj to a direct object, looking up the reference in the file when needed (unlike
+// TraceToDirect). A non-reference is returned untouched. The file offset in effect on entry is restored
+// before returning. Only one level is followed: when the resolved indirect object itself wraps another
+// reference, that indirect object is returned together with an error.
+// TODO (v3): Unexport.
+func (parser *PdfParser) Trace(obj PdfObject) (PdfObject, error) {
+	ref, ok := obj.(*PdfObjectReference)
+	if !ok {
+		// Direct object already.
+		return obj, nil
+	}
+
+	savedOffset := parser.GetFileOffset()
+	defer func() { parser.SetFileOffset(savedOffset) }()
+
+	resolved, err := parser.LookupByReference(*ref)
+	if err != nil {
+		return nil, err
+	}
+
+	indirect, ok := resolved.(*PdfIndirectObject)
+	if !ok {
+		// Not indirect (Stream or null object).
+		return resolved, nil
+	}
+
+	if _, nested := indirect.PdfObject.(*PdfObjectReference); nested {
+		return indirect, errors.New("multi depth trace pointer to pointer")
+	}
+	return indirect.PdfObject, nil
+}
+
+// printXrefTable writes every entry of xrefTable to the debug log, numbering the lines from 1.
+// The table is a map, so the order of the lines is not fixed.
+func printXrefTable(xrefTable XrefTable) {
+	common.Log.Debug("=X=X=X=")
+	common.Log.Debug("Xref table:")
+
+	line := 0
+	for _, entry := range xrefTable {
+		line++
+		common.Log.Debug("i+1: %d (obj num: %d gen: %d) -> %d", line, entry.objectNumber, entry.generation, entry.offset)
+	}
+}
--- a/internal/pdf/core/crypt.go
+++ b/internal/pdf/core/crypt.go
--- a/internal/pdf/core/crypt_filters.go
+++ b/internal/pdf/core/crypt_filters.go
@@ -0,0 +1,265 @@
+package core
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/md5"
+	"crypto/rand"
+	"crypto/rc4"
+	"fmt"
+	"io"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+var (
+	// cryptMethods is the registry of supported crypt filter methods, keyed by the name each method
+	// returns from CFM. It is written by registerCryptFilterMethod and read by getCryptFilterMethod.
+	cryptMethods = make(map[string]cryptFilterMethod)
+)
+
+// registerCryptFilterMethod stores m in the registry under the name reported by m.CFM().
+// A method registered earlier under the same name is replaced.
+func registerCryptFilterMethod(m cryptFilterMethod) {
+	name := m.CFM()
+	cryptMethods[name] = m
+}
+
+// getCryptFilterMethod checks whether a CFM with the specified name is supported and returns its
+// implementation. An unknown name yields an error that quotes the name.
+func getCryptFilterMethod(name string) (cryptFilterMethod, error) {
+	if method := cryptMethods[name]; method != nil {
+		return method, nil
+	}
+	return nil, fmt.Errorf("unsupported crypt filter: %q", name)
+}
+
+// init registers the crypt filter methods supported by this package.
+func init() {
+	supported := []cryptFilterMethod{
+		cryptFilterV2{},
+		cryptFilterAESV2{},
+		cryptFilterAESV3{},
+	}
+	for _, method := range supported {
+		registerCryptFilterMethod(method)
+	}
+}
+
+// cryptFilterMethod is a common interface for crypt filter methods.
+// Implementations are kept in the cryptMethods registry under the name returned by CFM.
+type cryptFilterMethod interface {
+	// CFM returns a name of the filter that should be used in CFM field of Encrypt dictionary.
+	CFM() string
+	// MakeKey generates an object encryption key based on file encryption key and object numbers.
+	// Used only for legacy filters - AESV3 doesn't change the key for each object.
+	MakeKey(objNum, genNum uint32, fkey []byte) ([]byte, error)
+	// EncryptBytes encrypts a buffer using object encryption key, as returned by MakeKey.
+	// Implementation may reuse a buffer and encrypt data in-place, so callers must not rely on p
+	// keeping its contents.
+	EncryptBytes(p []byte, okey []byte) ([]byte, error)
+	// DecryptBytes decrypts a buffer using object encryption key, as returned by MakeKey.
+	// Implementation may reuse a buffer and decrypt data in-place, so callers must not rely on p
+	// keeping its contents.
+	DecryptBytes(p []byte, okey []byte) ([]byte, error)
+}
+
+// makeKeyV2 is a common object key generation shared by V2 and AESV2 crypt filters.
+func makeKeyV2(objNum, genNum uint32, ekey []byte, isAES bool) ([]byte, error) {
+	key := make([]byte, len(ekey)+5)
+	copy(key, ekey)
+
+	for i := 0; i < 3; i++ {
+		b := byte((objNum >> uint32(8*i)) & 0xff)
+		key[i+len(ekey)] = b
+	}
+	for i := 0; i < 2; i++ {
+		b := byte((genNum >> uint32(8*i)) & 0xff)
+		key[i+len(ekey)+3] = b
+	}
+	if isAES {
+		// If using the AES algorithm, extend the encryption key an
+		// additional 4 bytes by adding the value “sAlT”, which
+		// corresponds to the hexadecimal values 0x73, 0x41, 0x6C, 0x54.
+		key = append(key, 0x73)
+		key = append(key, 0x41)
+		key = append(key, 0x6C)
+		key = append(key, 0x54)
+	}
+
+	// Take the MD5.
+	h := md5.New()
+	h.Write(key)
+	hashb := h.Sum(nil)
+
+	if len(ekey)+5 < 16 {
+		return hashb[0 : len(ekey)+5], nil
+	}
+
+	return hashb, nil
+}
+
+// cryptFilterV2 is the RC4-based crypt filter method.
+type cryptFilterV2 struct{}
+
+// CFM returns the name under which this method is registered.
+func (cryptFilterV2) CFM() string {
+	return CryptFilterV2
+}
+
+// MakeKey derives the per-object RC4 key from the file key and the object/generation numbers.
+func (f cryptFilterV2) MakeKey(objNum, genNum uint32, ekey []byte) ([]byte, error) {
+	const isAES = false
+	return makeKeyV2(objNum, genNum, ekey, isAES)
+}
+
+// EncryptBytes applies the RC4 key stream for okey to buf in place and returns buf.
+func (cryptFilterV2) EncryptBytes(buf []byte, okey []byte) ([]byte, error) {
+	stream, err := rc4.NewCipher(okey)
+	if err != nil {
+		return nil, err
+	}
+
+	common.Log.Trace("RC4 Encrypt: % x", buf)
+	stream.XORKeyStream(buf, buf)
+	common.Log.Trace("to: % x", buf)
+
+	return buf, nil
+}
+
+// DecryptBytes applies the RC4 key stream for okey to buf in place and returns buf.
+// RC4 is symmetric, so this is the same operation as EncryptBytes apart from the trace text.
+func (cryptFilterV2) DecryptBytes(buf []byte, okey []byte) ([]byte, error) {
+	stream, err := rc4.NewCipher(okey)
+	if err != nil {
+		return nil, err
+	}
+
+	common.Log.Trace("RC4 Decrypt: % x", buf)
+	stream.XORKeyStream(buf, buf)
+	common.Log.Trace("to: % x", buf)
+
+	return buf, nil
+}
+
+// cryptFilterAES carries the AES-CBC encryption and decryption shared by the AESV2 and AESV3 filter
+// methods, which embed it.
+type cryptFilterAES struct{}
+
+// EncryptBytes pads buf, encrypts it with AES in CBC mode under okey and returns a new slice holding a
+// random 16-byte initialization vector followed by the ciphertext.
+//
+// Padding: for a message of length M, 16 - (M mod 16) bytes are appended, each holding that same
+// count. A 9-byte message gets 7 bytes of 0x07; a message whose length is already a multiple of 16 gets
+// a full block of 0x10, so the pad can always be removed unambiguously on decryption.
+// The pad is appended to buf itself, so the caller's backing array may be written to.
+func (cryptFilterAES) EncryptBytes(buf []byte, okey []byte) ([]byte, error) {
+	blockCipher, err := aes.NewCipher(okey)
+	if err != nil {
+		return nil, err
+	}
+
+	common.Log.Trace("AES Encrypt (%d): % x", len(buf), buf)
+
+	const block = aes.BlockSize // 16
+
+	padLen := block - len(buf)%block
+	for left := padLen; left > 0; left-- {
+		buf = append(buf, byte(padLen))
+	}
+	common.Log.Trace("Padded to %d bytes", len(buf))
+
+	// The initialization vector is a 16-byte random number stored as the first 16 bytes of the
+	// encrypted stream or string.
+	out := make([]byte, block+len(buf))
+	if _, err := io.ReadFull(rand.Reader, out[:block]); err != nil {
+		return nil, err
+	}
+
+	cipher.NewCBCEncrypter(blockCipher, out[:block]).CryptBlocks(out[block:], buf)
+
+	common.Log.Trace("to (%d): % x", len(out), out)
+
+	return out, nil
+}
+
+// DecryptBytes decrypts an AES-CBC encrypted buffer under okey and strips the padding.
+//
+// buf must start with the 16-byte initialization vector, followed by a whole number of 16-byte cipher
+// blocks. Decryption is done in place, so the returned slice shares memory with buf.
+//
+// Padding: for an original message of length M the pad consists of 16 - (M mod 16) bytes whose value
+// is that same count, so the last decrypted byte tells how many bytes to drop. A message whose length
+// is a multiple of 16 - including the empty message - ends in a full block of 0x10.
+//
+// An error is returned when okey is not a valid AES key, when buf is shorter than the initialization
+// vector, when the remainder is not a multiple of 16, or when the pad length exceeds the data.
+func (cryptFilterAES) DecryptBytes(buf []byte, okey []byte) ([]byte, error) {
+	ciph, err := aes.NewCipher(okey)
+	if err != nil {
+		return nil, err
+	}
+
+	// If using the AES algorithm, the Cipher Block Chaining (CBC)
+	// mode, which requires an initialization vector, is used. The
+	// block size parameter is set to 16 bytes, and the initialization
+	// vector is a 16-byte random number that is stored as the first
+	// 16 bytes of the encrypted stream or string.
+	if len(buf) < 16 {
+		common.Log.Debug("error AES invalid buf %s", buf)
+		return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf))
+	}
+
+	iv := buf[:16]
+	buf = buf[16:]
+
+	if len(buf)%16 != 0 {
+		common.Log.Debug(" iv (%d): % x", len(iv), iv)
+		common.Log.Debug("buf (%d): % x", len(buf), buf)
+		return buf, fmt.Errorf("AES buf length not multiple of 16 (%d)", len(buf))
+	}
+
+	mode := cipher.NewCBCDecrypter(ciph, iv)
+
+	common.Log.Trace("AES Decrypt (%d): % x", len(buf), buf)
+	common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf)
+	mode.CryptBlocks(buf, buf)
+	common.Log.Trace("to (%d): % x", len(buf), buf)
+
+	if len(buf) == 0 {
+		common.Log.Trace("Empty buf, returning empty string")
+		return buf, nil
+	}
+
+	// The padded length is indicated by the last values.  Remove those.
+	//
+	// The pad may legitimately cover the whole buffer: an empty message is encrypted as one block of
+	// sixteen 0x10 bytes. Only a pad longer than the data is invalid.
+	padLen := int(buf[len(buf)-1])
+	if padLen > len(buf) {
+		common.Log.Debug("Illegal pad length (%d > %d)", padLen, len(buf))
+		return buf, fmt.Errorf("invalid pad length")
+	}
+	buf = buf[:len(buf)-padLen]
+
+	return buf, nil
+}
+
+// cryptFilterAESV2 is an AES-based filter (128 bit key, PDF 1.6)
+// EncryptBytes and DecryptBytes come from the embedded cryptFilterAES.
+type cryptFilterAESV2 struct {
+	cryptFilterAES
+}
+
+// CFM returns the name under which this method is registered.
+func (cryptFilterAESV2) CFM() string {
+	return CryptFilterAESV2
+}
+
+// MakeKey derives the per-object key via makeKeyV2 with the AES salt enabled.
+func (cryptFilterAESV2) MakeKey(objNum, genNum uint32, ekey []byte) ([]byte, error) {
+	return makeKeyV2(objNum, genNum, ekey, true)
+}
+
+// cryptFilterAESV3 is an AES-based filter (256 bit key, PDF 2.0)
+// EncryptBytes and DecryptBytes come from the embedded cryptFilterAES.
+type cryptFilterAESV3 struct {
+	cryptFilterAES
+}
+
+// CFM returns the name under which this method is registered.
+func (cryptFilterAESV3) CFM() string {
+	return CryptFilterAESV3
+}
+
+// MakeKey returns ekey unchanged: the object and generation numbers are ignored, so every object is
+// handled with the same key. The returned error is always nil.
+func (cryptFilterAESV3) MakeKey(_, _ uint32, ekey []byte) ([]byte, error) {
+	return ekey, nil
+}
--- a/internal/pdf/core/ecb.go
+++ b/internal/pdf/core/ecb.go
@@ -0,0 +1,61 @@
+package core
+
+import "crypto/cipher"
+
+// ecb implements an Electronic Codebook encryption mode.
+// This mode is used to compute or validate document permissions for R=6.
+type ecb struct {
+	b         cipher.Block
+	blockSize int
+}
+
+// newECB wraps the block cipher b and remembers its block size.
+func newECB(b cipher.Block) *ecb {
+	mode := new(ecb)
+	mode.b = b
+	mode.blockSize = b.BlockSize()
+	return mode
+}
+
+// ecbEncrypter is the encrypting cipher.BlockMode built on ecb.
+type ecbEncrypter ecb
+
+// newECBEncrypter returns a BlockMode that encrypts every block independently with b.
+func newECBEncrypter(b cipher.Block) cipher.BlockMode {
+	return (*ecbEncrypter)(newECB(b))
+}
+
+// BlockSize returns the block size of the wrapped cipher.
+func (x *ecbEncrypter) BlockSize() int { return x.blockSize }
+
+// CryptBlocks encrypts src block by block into dst. It panics when src is not a whole number of blocks
+// or when dst is shorter than src.
+func (x *ecbEncrypter) CryptBlocks(dst, src []byte) {
+	size := x.blockSize
+	switch {
+	case len(src)%size != 0:
+		panic("crypto/cipher: input not full blocks")
+	case len(dst) < len(src):
+		panic("crypto/cipher: output smaller than input")
+	}
+	for off := 0; off < len(src); off += size {
+		x.b.Encrypt(dst[off:], src[off:off+size])
+	}
+}
+
+// ecbDecrypter is the decrypting cipher.BlockMode built on ecb.
+type ecbDecrypter ecb
+
+// newECBDecrypter returns a BlockMode that decrypts every block independently with b.
+func newECBDecrypter(b cipher.Block) cipher.BlockMode {
+	return (*ecbDecrypter)(newECB(b))
+}
+
+// BlockSize returns the block size of the wrapped cipher.
+func (x *ecbDecrypter) BlockSize() int { return x.blockSize }
+
+// CryptBlocks decrypts src block by block into dst. It panics when src is not a whole number of blocks
+// or when dst is shorter than src.
+func (x *ecbDecrypter) CryptBlocks(dst, src []byte) {
+	size := x.blockSize
+	switch {
+	case len(src)%size != 0:
+		panic("crypto/cipher: input not full blocks")
+	case len(dst) < len(src):
+		panic("crypto/cipher: output smaller than input")
+	}
+	for off := 0; off < len(src); off += size {
+		x.b.Decrypt(dst[off:], src[off:off+size])
+	}
+}
--- a/internal/pdf/core/encoding.go
+++ b/internal/pdf/core/encoding.go
--- a/internal/pdf/core/io.go
+++ b/internal/pdf/core/io.go
@@ -0,0 +1,44 @@
+package core
+
+import (
+	"bufio"
+	"errors"
+	"os"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// ReadAtLeast reads at least n bytes into slice p.
+// Returns the number of bytes read (should always be >= n on success), and an error on failure.
+//
+// p must be able to hold n bytes. When it cannot, no read is attempted and an error is returned:
+// without this check the loop would eventually call Read with an empty slice, which makes no progress
+// and would spin forever. On a read failure the count includes any bytes that were stored in p
+// alongside the error.
+// TODO (v3): Unexport.
+func (parser *PdfParser) ReadAtLeast(p []byte, n int) (int, error) {
+	if n > len(p) {
+		common.Log.Debug("error Buffer too small for read (%d < %d)", len(p), n)
+		return 0, errors.New("failed reading: buffer too small")
+	}
+
+	remaining := n
+	start := 0
+	numRounds := 0
+	for remaining > 0 {
+		nRead, err := parser.reader.Read(p[start:])
+		if err != nil {
+			common.Log.Debug("error Failed reading (%d;%d) %s", nRead, numRounds, err.Error())
+			// A reader may hand back data together with an error; count it.
+			return start + nRead, errors.New("failed reading")
+		}
+		numRounds++
+		start += nRead
+		remaining -= nRead
+	}
+	return start, nil
+}
+
+// GetFileOffset returns the logical read position in the underlying file: the position of the seeker
+// minus the bytes that the buffered reader has fetched but not yet handed out.
+// The error from Seek is discarded.
+// TODO (v3): Unexport.
+func (parser *PdfParser) GetFileOffset() int64 {
+	pos, _ := parser.rs.Seek(0, os.SEEK_CUR)
+	unread := int64(parser.reader.Buffered())
+	return pos - unread
+}
+
+// SetFileOffset seeks the underlying file to the absolute position offset and replaces the buffered
+// reader with a fresh one on top of it, discarding anything the old reader had buffered.
+// The error from Seek is discarded.
+// TODO (v3): Unexport.
+func (parser *PdfParser) SetFileOffset(offset int64) {
+	parser.rs.Seek(offset, os.SEEK_SET)
+	// The old reader may still hold bytes from the previous position, so it cannot be reused.
+	parser.reader = bufio.NewReader(parser.rs)
+}
--- a/internal/pdf/core/parser.go
+++ b/internal/pdf/core/parser.go
--- a/internal/pdf/core/primitives.go
+++ b/internal/pdf/core/primitives.go
@@ -0,0 +1,570 @@
+package core
+
+import (
+	"bytes"
+	"fmt"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// PdfObject is an interface which all primitive PDF objects must implement.
+// It offers two textual forms of an object: one meant for people, one meant for the output file.
+type PdfObject interface {
+	// Output a string representation of the primitive (for debugging).
+	String() string
+
+	// Output the PDF primitive as written to file as expected by the standard.
+	DefaultWriteString() string
+}
+
+// PdfObjectBool represents the primitive PDF boolean object. It is written as "true" or "false".
+type PdfObjectBool bool
+
+// PdfObjectInteger represents the primitive PDF integer numerical object, held as an int64.
+type PdfObjectInteger int64
+
+// PdfObjectFloat represents the primitive PDF floating point numerical object, held as a float64.
+type PdfObjectFloat float64
+
+// PdfObjectString represents the primitive PDF string object. The value holds the unescaped content;
+// escaping and the surrounding parentheses are added by DefaultWriteString.
+// TODO (v3): Change to a struct and add a flag for hex/plaintext.
+type PdfObjectString string
+
+// PdfObjectName represents the primitive PDF name object. The value holds the name without the leading
+// slash; the slash and any #xx escapes are added by DefaultWriteString.
+type PdfObjectName string
+
+// PdfObjectArray represents the primitive PDF array object, an ordered list of PdfObjects.
+type PdfObjectArray []PdfObject
+
+// PdfObjectDictionary represents the primitive PDF dictionary/map object.
+// Use MakeDict to create one; it initializes both fields.
+type PdfObjectDictionary struct {
+	// dict holds the value stored under each name.
+	dict map[PdfObjectName]PdfObject
+	// keys lists names alongside dict - presumably in insertion order so that output is stable;
+	// TODO confirm against Set/Keys.
+	keys []PdfObjectName
+}
+
+// PdfObjectNull represents the primitive PDF null object. It carries no data.
+type PdfObjectNull struct{}
+
+// PdfObjectReference represents the primitive PDF reference object: the object number and generation
+// number that together name an indirect object.
+type PdfObjectReference struct {
+	ObjectNumber     int64
+	GenerationNumber int64
+}
+
+// PdfIndirectObject represents the primitive PDF indirect object: the embedded reference supplies its
+// object and generation numbers, the embedded PdfObject is the value it wraps.
+type PdfIndirectObject struct {
+	PdfObjectReference
+	PdfObject
+}
+
+// PdfObjectStream represents the primitive PDF Object stream: the embedded reference supplies its
+// object and generation numbers, the embedded dictionary is the stream dictionary, and Stream holds
+// the stream data in its encoded form (MakeStream stores the encoder's output there).
+type PdfObjectStream struct {
+	PdfObjectReference
+	*PdfObjectDictionary
+	Stream []byte
+}
+
+// MakeDict returns a new, empty PdfObjectDictionary that is ready for use.
+func MakeDict() *PdfObjectDictionary {
+	return &PdfObjectDictionary{
+		dict: map[PdfObjectName]PdfObject{},
+		keys: []PdfObjectName{},
+	}
+}
+
+// MakeName returns a pointer to a new PdfObjectName holding s.
+func MakeName(s string) *PdfObjectName {
+	name := new(PdfObjectName)
+	*name = PdfObjectName(s)
+	return name
+}
+
+// MakeInteger returns a pointer to a new PdfObjectInteger holding val.
+func MakeInteger(val int64) *PdfObjectInteger {
+	num := new(PdfObjectInteger)
+	*num = PdfObjectInteger(val)
+	return num
+}
+
+// MakeArray returns a new PdfObjectArray holding the given objects in order.
+// With no arguments the result is an empty, non-nil array.
+func MakeArray(objects ...PdfObject) *PdfObjectArray {
+	array := make(PdfObjectArray, 0, len(objects))
+	array = append(array, objects...)
+	return &array
+}
+
+// MakeArrayFromIntegers returns a new PdfObjectArray with one PdfObjectInteger per element of vals,
+// in the same order.
+func MakeArrayFromIntegers(vals []int) *PdfObjectArray {
+	array := make(PdfObjectArray, 0, len(vals))
+	for i := range vals {
+		array = append(array, MakeInteger(int64(vals[i])))
+	}
+	return &array
+}
+
+// MakeArrayFromIntegers64 returns a new PdfObjectArray with one PdfObjectInteger per element of vals,
+// in the same order.
+func MakeArrayFromIntegers64(vals []int64) *PdfObjectArray {
+	array := make(PdfObjectArray, 0, len(vals))
+	for i := range vals {
+		array = append(array, MakeInteger(vals[i]))
+	}
+	return &array
+}
+
+// MakeArrayFromFloats returns a new PdfObjectArray with one PdfObjectFloat per element of vals,
+// in the same order.
+func MakeArrayFromFloats(vals []float64) *PdfObjectArray {
+	array := make(PdfObjectArray, 0, len(vals))
+	for i := range vals {
+		array = append(array, MakeFloat(vals[i]))
+	}
+	return &array
+}
+
+// MakeBool returns a pointer to a new PdfObjectBool holding val.
+func MakeBool(val bool) *PdfObjectBool {
+	b := new(PdfObjectBool)
+	*b = PdfObjectBool(val)
+	return b
+}
+
+// MakeFloat returns a pointer to a new PdfObjectFloat holding val.
+func MakeFloat(val float64) *PdfObjectFloat {
+	f := new(PdfObjectFloat)
+	*f = PdfObjectFloat(val)
+	return f
+}
+
+// MakeString returns a pointer to a new PdfObjectString holding s.
+func MakeString(s string) *PdfObjectString {
+	str := new(PdfObjectString)
+	*str = PdfObjectString(s)
+	return str
+}
+
+// MakeNull returns a pointer to a new PdfObjectNull.
+func MakeNull() *PdfObjectNull {
+	return &PdfObjectNull{}
+}
+
+// MakeIndirectObject returns a new PdfIndirectObject wrapping obj. Its object and generation numbers
+// are left at zero.
+func MakeIndirectObject(obj PdfObject) *PdfIndirectObject {
+	return &PdfIndirectObject{PdfObject: obj}
+}
+
+// MakeStream builds a PdfObjectStream from contents using encoder. A nil encoder selects the raw
+// encoder (i.e. no encoding applied). The stream dictionary comes from the encoder and gets a "Length"
+// entry holding the size of the encoded data. An encoding failure is returned as the error, with a nil
+// stream.
+func MakeStream(contents []byte, encoder StreamEncoder) (*PdfObjectStream, error) {
+	if encoder == nil {
+		encoder = NewRawEncoder()
+	}
+
+	// The dictionary is requested before the bytes are encoded, in that order.
+	dict := encoder.MakeStreamDict()
+
+	data, err := encoder.EncodeBytes(contents)
+	if err != nil {
+		return nil, err
+	}
+	dict.Set("Length", MakeInteger(int64(len(data))))
+
+	return &PdfObjectStream{PdfObjectDictionary: dict, Stream: data}, nil
+}
+
+// String returns "true" or "false".
+func (b *PdfObjectBool) String() string {
+	if *b {
+		return "true"
+	}
+	return "false"
+}
+
+// DefaultWriteString outputs the object as it is to be written to file.
+func (b *PdfObjectBool) DefaultWriteString() string {
+	if *b {
+		return "true"
+	}
+	return "false"
+}
+
+// String returns the integer in decimal notation.
+func (num *PdfObjectInteger) String() string {
+	value := *num
+	return fmt.Sprintf("%d", value)
+}
+
+// DefaultWriteString outputs the object as it is to be written to file.
+func (num *PdfObjectInteger) DefaultWriteString() string {
+	value := *num
+	return fmt.Sprintf("%d", value)
+}
+
+// String returns the number in fixed-point notation as produced by the %f verb.
+func (num *PdfObjectFloat) String() string {
+	value := *num
+	return fmt.Sprintf("%f", value)
+}
+
+// DefaultWriteString outputs the object as it is to be written to file.
+func (num *PdfObjectFloat) DefaultWriteString() string {
+	value := *num
+	return fmt.Sprintf("%f", value)
+}
+
+// String returns the raw string content, without parentheses or escaping.
+func (str *PdfObjectString) String() string {
+	return string(*str)
+}
+
+// DefaultWriteString outputs the object as it is to be written to file: the content enclosed in
+// parentheses, with newline, carriage return, tab, backspace, form feed, both parentheses and the
+// backslash written as backslash escapes. All other bytes are copied through unchanged.
+func (str *PdfObjectString) DefaultWriteString() string {
+	var out bytes.Buffer
+
+	out.WriteString("(")
+	for i := 0; i < len(*str); i++ {
+		switch c := (*str)[i]; c {
+		case '\n':
+			out.WriteString("\\n")
+		case '\r':
+			out.WriteString("\\r")
+		case '\t':
+			out.WriteString("\\t")
+		case '\b':
+			out.WriteString("\\b")
+		case '\f':
+			out.WriteString("\\f")
+		case '(':
+			out.WriteString("\\(")
+		case ')':
+			out.WriteString("\\)")
+		case '\\':
+			out.WriteString("\\\\")
+		default:
+			out.WriteByte(c)
+		}
+	}
+	out.WriteString(")")
+
+	return out.String()
+}
+
+// String returns the name without the leading slash and without escaping.
+func (name *PdfObjectName) String() string {
+	return string(*name)
+}
+
+// DefaultWriteString outputs the object as it is to be written to file: a slash followed by the name,
+// where every byte that is not printable, is a delimiter, or is '#' is written as '#' plus two hex
+// digits. A name longer than 127 bytes is only reported in the debug log; it is still written.
+func (name *PdfObjectName) DefaultWriteString() string {
+	if len(*name) > 127 {
+		common.Log.Debug("error: Name too long (%s)", *name)
+	}
+
+	var out bytes.Buffer
+	out.WriteString("/")
+	for i := 0; i < len(*name); i++ {
+		c := (*name)[i]
+		needsEscape := !IsPrintable(c) || c == '#' || IsDelimiter(c)
+		if needsEscape {
+			fmt.Fprintf(&out, "#%.2x", c)
+			continue
+		}
+		out.WriteByte(c)
+	}
+
+	return out.String()
+}
+
+// ToFloat64Array returns a slice of all elements in the array as a float64 slice.  An error is returned if the array
+// contains non-numeric objects (each element can be either PdfObjectInteger or PdfObjectFloat).
+func (array *PdfObjectArray) ToFloat64Array() ([]float64, error) {
+	vals := []float64{}
+
+	for _, obj := range *array {
+		if number, is := obj.(*PdfObjectInteger); is {
+			vals = append(vals, float64(*number))
+		} else if number, is := obj.(*PdfObjectFloat); is {
+			vals = append(vals, float64(*number))
+		} else {
+			return nil, fmt.Errorf("type error")
+		}
+	}
+
+	return vals, nil
+}
+
+// ToIntegerArray converts the array to a []int. Every element must be a PdfObjectInteger; the first
+// element of any other kind makes the call fail with an error.
+func (array *PdfObjectArray) ToIntegerArray() ([]int, error) {
+	vals := make([]int, 0, len(*array))
+
+	for _, obj := range *array {
+		number, isInt := obj.(*PdfObjectInteger)
+		if !isInt {
+			return nil, fmt.Errorf("type error")
+		}
+		vals = append(vals, int(*number))
+	}
+
+	return vals, nil
+}
+
+func (array *PdfObjectArray) String() string {
+	outStr := "["
+	for ind, o := range *array {
+		outStr += o.String()
+		if ind < (len(*array) - 1) {
+			outStr += ", "
+		}
+	}
+	outStr += "]"
+	return outStr
+}
+
+// DefaultWriteString outputs the object as it is to be written to file.
+func (array *PdfObjectArray) DefaultWriteString() string {
+	outStr := "["
+	for ind, o := range *array {
+		outStr += o.DefaultWriteString()
+		if ind < (len(*array) - 1) {
+			outStr += " "
+		}
+	}
+	outStr += "]"
+	return outStr
+}
+
+// Append adds a PdfObject to the end of the array, modifying the array in place.
+func (array *PdfObjectArray) Append(obj PdfObject) {
+	*array = append(*array, obj)
+}
+
+func getNumberAsFloat(obj PdfObject) (float64, error) {
+	if fObj, ok := obj.(*PdfObjectFloat); ok {
+		return float64(*fObj), nil
+	}
+
+	if iObj, ok := obj.(*PdfObjectInteger); ok {
+		return float64(*iObj), nil
+	}
+
+	return 0, fmt.Errorf("not a number")
+}
+
+// GetAsFloat64Slice returns the array contents as a []float64.
+// Each element is first traced to its direct object, so numbers wrapped in indirect objects are
+// accepted. An error is returned as soon as an element is neither a PdfObjectInteger nor a
+// PdfObjectFloat.
+func (array *PdfObjectArray) GetAsFloat64Slice() ([]float64, error) {
+	slice := []float64{}
+
+	for _, elem := range *array {
+		number, err := getNumberAsFloat(TraceToDirectObject(elem))
+		if err != nil {
+			return nil, fmt.Errorf("array element not a number")
+		}
+		slice = append(slice, number)
+	}
+
+	return slice, nil
+}
+
+// Merge copies every key/value pair of another into d, in the key order of another.
+// Entries already present in d under the same key are overwritten. A nil argument is a no-op.
+func (d *PdfObjectDictionary) Merge(another *PdfObjectDictionary) {
+	if another == nil {
+		return
+	}
+
+	for _, key := range another.Keys() {
+		d.Set(key, another.Get(key))
+	}
+}
+
+func (d *PdfObjectDictionary) String() string {
+	outStr := "Dict("
+	for _, k := range d.keys {
+		v := d.dict[k]
+		outStr += fmt.Sprintf("\"%s\": %s, ", k, v.String())
+	}
+	outStr += ")"
+	return outStr
+}
+
+// DefaultWriteString outputs the object as it is to be written to file.
+func (d *PdfObjectDictionary) DefaultWriteString() string {
+	outStr := "<<"
+	for _, k := range d.keys {
+		v := d.dict[k]
+		common.Log.Trace("Writing k: %s %T %v %v", k, v, k, v)
+		outStr += k.DefaultWriteString()
+		outStr += " "
+		outStr += v.DefaultWriteString()
+	}
+	outStr += ">>"
+	return outStr
+}
+
+// Set sets the dictionary's key -> val mapping entry. Overwrites if key already set.
+//
+// A key that is not yet present is appended to the ordered key list, so insertion order is
+// preserved; overwriting an existing key keeps its original position.
+func (d *PdfObjectDictionary) Set(key PdfObjectName, val PdfObject) {
+	// The map holds exactly the keys listed in d.keys (Set and Remove update both together), so
+	// a map lookup answers "is the key known?" without scanning the key list on every insert.
+	if _, found := d.dict[key]; !found {
+		d.keys = append(d.keys, key)
+	}
+
+	d.dict[key] = val
+}
+
+// Get returns the PdfObject stored under key, or nil when the key is not set.
+//
+// Only a single value is returned on purpose, so that the result can be fed directly into a
+// type assertion:
+// name, ok := dict.Get("mykey").(*PdfObjectName)
+// if !ok ....
+func (d *PdfObjectDictionary) Get(key PdfObjectName) PdfObject {
+	if val, has := d.dict[key]; has {
+		return val
+	}
+	return nil
+}
+
+// Keys returns the list of keys in the dictionary, in insertion order.
+// The returned slice is the dictionary's internal key slice, not a copy.
+func (d *PdfObjectDictionary) Keys() []PdfObjectName {
+	return d.keys
+}
+
+// Remove deletes the entry stored under key from both the ordered key list and the map.
+// Nothing happens when the key is not in the key list.
+func (d *PdfObjectDictionary) Remove(key PdfObjectName) {
+	for i, k := range d.keys {
+		if k != key {
+			continue
+		}
+
+		// Found. Remove from key list and map.
+		d.keys = append(d.keys[:i], d.keys[i+1:]...)
+		delete(d.dict, key)
+		return
+	}
+}
+
+// SetIfNotNil sets the dictionary's key -> val mapping entry only when val is not nil.
+//
+// A plain `val != nil` check is not sufficient: an interface holding a typed nil pointer, e.g.
+// (*PdfObjectArray)(nil), compares unequal to nil. The type switch therefore inspects the
+// concrete pointer for every known primitive type. Values of an unknown type are logged as an
+// error and never stored.
+func (d *PdfObjectDictionary) SetIfNotNil(key PdfObjectName, val PdfObject) {
+	if val == nil {
+		return
+	}
+
+	var nonNil bool
+	switch t := val.(type) {
+	case *PdfObjectName:
+		nonNil = t != nil
+	case *PdfObjectDictionary:
+		nonNil = t != nil
+	case *PdfObjectStream:
+		nonNil = t != nil
+	case *PdfObjectString:
+		nonNil = t != nil
+	case *PdfObjectNull:
+		nonNil = t != nil
+	case *PdfObjectInteger:
+		nonNil = t != nil
+	case *PdfObjectArray:
+		nonNil = t != nil
+	case *PdfObjectBool:
+		nonNil = t != nil
+	case *PdfObjectFloat:
+		nonNil = t != nil
+	case *PdfObjectReference:
+		nonNil = t != nil
+	case *PdfIndirectObject:
+		nonNil = t != nil
+	default:
+		common.Log.Error("error: Unknown type: %T - should never happen!", val)
+		return
+	}
+
+	if nonNil {
+		d.Set(key, val)
+	}
+}
+
+// String returns a debug representation of the reference of the form
+// "Ref(<object number> <generation number>)".
+func (ref *PdfObjectReference) String() string {
+	return fmt.Sprintf("Ref(%d %d)", ref.ObjectNumber, ref.GenerationNumber)
+}
+
+// DefaultWriteString outputs the object as it is to be written to file, i.e. as
+// "<object number> <generation number> R".
+func (ref *PdfObjectReference) DefaultWriteString() string {
+	return fmt.Sprintf("%d %d R", ref.ObjectNumber, ref.GenerationNumber)
+}
+
+// String returns a short debug representation of the form "IObject:<object number>".
+func (ind *PdfIndirectObject) String() string {
+	// Avoid printing out the object, can cause problems with circular
+	// references.
+	return fmt.Sprintf("IObject:%d", (*ind).ObjectNumber)
+}
+
+// DefaultWriteString returns the indirect object as it is written when referred to from another
+// object: a reference "<object number> 0 R". The generation number written is always 0.
+func (ind *PdfIndirectObject) DefaultWriteString() string {
+	return fmt.Sprintf("%d 0 R", ind.ObjectNumber)
+}
+
+// String returns a debug representation consisting of the stream's object number and its
+// dictionary. The stream data itself is not included.
+func (stream *PdfObjectStream) String() string {
+	return fmt.Sprintf("Object stream %d: %s", stream.ObjectNumber, stream.PdfObjectDictionary)
+}
+
+// DefaultWriteString returns the stream as it is written when referred to from another object:
+// a reference "<object number> 0 R". The generation number written is always 0.
+func (stream *PdfObjectStream) DefaultWriteString() string {
+	return fmt.Sprintf("%d 0 R", stream.ObjectNumber)
+}
+
+// String returns the fixed text "null".
+func (null *PdfObjectNull) String() string {
+	return "null"
+}
+
+// DefaultWriteString outputs the object as it is to be written to file, which for the null
+// object is the keyword "null".
+func (null *PdfObjectNull) DefaultWriteString() string {
+	return "null"
+}
+
+// Handy functions to work with primitive objects.
+
+// TraceMaxDepth specifies the maximum number of nested indirect objects that
+// TraceToDirectObject unwraps before giving up.
+const TraceMaxDepth = 20
+
+// TraceToDirectObject unwraps obj until it is no longer a *PdfIndirectObject and returns the
+// object found there. Indirect objects may be nested several levels deep; each level is
+// unwrapped in turn. An object that is not indirect is returned unchanged.
+//
+// If more than TraceMaxDepth levels have to be unwrapped, an error is logged and nil is returned.
+//
+// Note: This function does not trace/resolve references. That needs to be done beforehand.
+func TraceToDirectObject(obj PdfObject) PdfObject {
+	for depth := 1; ; depth++ {
+		iobj, isIndirectObj := obj.(*PdfIndirectObject)
+		if !isIndirectObj {
+			return obj
+		}
+
+		obj = iobj.PdfObject
+		if depth > TraceMaxDepth {
+			common.Log.Error("error: Trace depth level beyond %d - not going deeper!", TraceMaxDepth)
+			return nil
+		}
+	}
+}
--- a/internal/pdf/core/repairs.go
+++ b/internal/pdf/core/repairs.go
@@ -0,0 +1,281 @@
+// Routines related to repairing malformed pdf files.
+
+package core
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"regexp"
+
+	"bufio"
+	"io"
+	"strconv"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// repairReXrefTable matches the "xref" keyword standing on its own line: preceded by a line
+// break and optional whitespace, and followed by optional whitespace and a line break.
+var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
+
+// Locates a standard Xref table by looking for the "xref" entry.
+// Xref object stream not supported.
+func (parser *PdfParser) repairLocateXref() (int64, error) {
+	readBuf := int64(1000)
+	parser.rs.Seek(-readBuf, os.SEEK_CUR)
+
+	curOffset, err := parser.rs.Seek(0, os.SEEK_CUR)
+	if err != nil {
+		return 0, err
+	}
+	b2 := make([]byte, readBuf)
+	parser.rs.Read(b2)
+
+	results := repairReXrefTable.FindAllStringIndex(string(b2), -1)
+	if len(results) < 1 {
+		common.Log.Debug("error: Repair: xref not found!")
+		return 0, errors.New("repair: xref not found")
+	}
+
+	localOffset := int64(results[len(results)-1][0])
+	xrefOffset := curOffset + localOffset
+	return xrefOffset, nil
+}
+
+// Renumbers the xref table.
+// Useful when the cross reference is pointing to an object with the wrong number.
+//
+// Every entry of the current table is looked up, and the entry is re-registered under the
+// object and generation number reported by getObjectNumber for the object found. If any lookup
+// fails, the table is considered broken and is replaced by the result of a top-down rebuild
+// (repairRebuildXrefsTopDown) instead.
+func (parser *PdfParser) rebuildXrefTable() error {
+	rebuilt := XrefTable{}
+
+	for num, entry := range parser.xrefs {
+		obj, _, lookupErr := parser.lookupByNumberWrapper(num, false)
+		if lookupErr != nil {
+			common.Log.Debug("error: Unable to look up object (%s)", lookupErr)
+			common.Log.Debug("error: Xref table completely broken - attempting to repair ")
+			repaired, repairErr := parser.repairRebuildXrefsTopDown()
+			if repairErr != nil {
+				common.Log.Debug("error: Failed xref rebuild repair (%s)", repairErr)
+				return repairErr
+			}
+			parser.xrefs = *repaired
+			common.Log.Debug("Repaired xref table built")
+			return nil
+		}
+
+		actualNum, actualGen, numErr := getObjectNumber(obj)
+		if numErr != nil {
+			return numErr
+		}
+
+		// Re-key the entry by the numbers reported for the object.
+		entry.objectNumber = int(actualNum)
+		entry.generation = int(actualGen)
+		rebuilt[int(actualNum)] = entry
+	}
+
+	parser.xrefs = rebuilt
+	common.Log.Debug("New xref table built")
+	printXrefTable(parser.xrefs)
+	return nil
+}
+
+// Parses and returns the object and generation number from a string such as "12 0 obj" -> (12,0,nil).
+// An error is returned when reIndirectObject does not match with at least two capture groups.
+// Conversion errors of the captured numbers are discarded.
+func parseObjectNumberFromString(str string) (int, int, error) {
+	match := reIndirectObject.FindStringSubmatch(str)
+	if len(match) < 3 {
+		return 0, 0, errors.New("unable to detect indirect object signature")
+	}
+
+	objNum, _ := strconv.Atoi(match[1])
+	genNum, _ := strconv.Atoi(match[2])
+	return objNum, genNum, nil
+}
+
+// Parse the entire file from top down.
+// Goes through the file byte-by-byte looking for "<num> <generation> obj" patterns.
+// N.B. This collects the XREF_TABLE_ENTRY data only.
+//
+// The repair is attempted at most once per parser: a second call returns an error immediately.
+// On success a newly built xref table is returned; when the same object number occurs several
+// times, the entry with the highest generation number (first occurrence on ties) is kept.
+func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
+	if parser.repairsAttempted {
+		// Avoid multiple repairs (only try once).
+		return nil, fmt.Errorf("repair failed")
+	}
+	parser.repairsAttempted = true
+
+	// Go to beginning, reset reader.
+	parser.rs.Seek(0, os.SEEK_SET)
+	parser.reader = bufio.NewReader(parser.rs)
+
+	// Keep a running buffer of last bytes.
+	// The buffer always holds the bufLen bytes read before the current byte b (zero bytes until
+	// enough input has been consumed).
+	bufLen := 20
+	last := make([]byte, bufLen)
+
+	xrefTable := XrefTable{}
+	for {
+		b, err := parser.reader.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				break
+			} else {
+				return nil, err
+			}
+		}
+
+		// Format:
+		// object number - whitespace - generation number - obj
+		// e.g. "12 0 obj"
+		// The check triggers on the final 'j' of a whitespace-preceded "obj"; the header is
+		// then scanned backwards through the buffer, with i ending on the byte just before the
+		// object number.
+		// NOTE(review): the `continue` statements below skip the buffer update at the bottom of
+		// the loop, so in those cases the current 'j' is never recorded in `last` - confirm
+		// whether that is intended.
+		if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
+			i := bufLen - 4
+			// Go past whitespace
+			for IsWhiteSpace(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsDecimalDigit(last[i]) {
+				continue
+			}
+			// Go past generation number
+			for IsDecimalDigit(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsWhiteSpace(last[i]) {
+				continue
+			}
+			// Go past whitespace
+			for IsWhiteSpace(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 || !IsDecimalDigit(last[i]) {
+				continue
+			}
+			// Go past object number.
+			for IsDecimalDigit(last[i]) && i > 0 {
+				i--
+			}
+			if i == 0 {
+				continue // Probably too long to be a valid object...
+			}
+
+			// Offset of the header: step back from the current file offset by the number of
+			// buffered bytes from position i to the end of the buffer.
+			// NOTE(review): presumably GetFileOffset reports the position just after b - confirm.
+			objOffset := parser.GetFileOffset() - int64(bufLen-i)
+
+			// Header text, e.g. "12 0 obj": the buffered bytes after position i plus the 'j'.
+			objstr := append(last[i+1:], b)
+			objNum, genNum, err := parseObjectNumberFromString(string(objstr))
+			if err != nil {
+				common.Log.Debug("Unable to parse object number: %v", err)
+				return nil, err
+			}
+
+			// Create and insert the XREF entry if not existing, or the generation number is higher.
+			if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum {
+				// Make the entry for the cross ref table.
+				xrefEntry := XrefObject{}
+				xrefEntry.xtype = XREF_TABLE_ENTRY
+				xrefEntry.objectNumber = int(objNum)
+				xrefEntry.generation = int(genNum)
+				xrefEntry.offset = objOffset
+				xrefTable[objNum] = xrefEntry
+			}
+		}
+
+		// Slide the window: drop the oldest byte and append the current one.
+		last = append(last[1:bufLen], b)
+	}
+
+	return &xrefTable, nil
+}
+
+// Look for first sign of xref table from end of file.
+//
+// The file is scanned backwards in chunks of up to 1000 bytes. In the first chunk (seen from the
+// end of the file) that contains a whitespace-preceded "xref", the last such occurrence is
+// selected, and the parser's buffered reader is re-created and positioned on the 'x' of "xref".
+//
+// An error is returned if seeking or reading fails, or if no chunk contains the marker.
+// NOTE(review): a marker that straddles a chunk boundary is not detected.
+func (parser *PdfParser) repairSeekXrefMarker() error {
+	// Get the file size.
+	fSize, err := parser.rs.Seek(0, io.SeekEnd)
+	if err != nil {
+		return err
+	}
+
+	reXrefTableStart := regexp.MustCompile(`\sxref\s*`)
+
+	// Define the starting point (from the end of the file) to search from.
+	var offset int64
+
+	// Define an buffer length in terms of how many bytes to read from the end of the file.
+	var buflen int64 = 1000
+
+	for offset < fSize {
+		// Shrink the last chunk so that it does not reach before the start of the file.
+		if fSize <= (buflen + offset) {
+			buflen = fSize - offset
+		}
+
+		// Move back enough (as we need to read forward).
+		if _, err := parser.rs.Seek(-offset-buflen, io.SeekEnd); err != nil {
+			return err
+		}
+
+		// Read the data. A short chunk is tolerated; only the bytes actually read are searched.
+		b1 := make([]byte, buflen)
+		n, err := io.ReadFull(parser.rs, b1)
+		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
+			return err
+		}
+		b1 = b1[:n]
+
+		common.Log.Trace("Looking for xref : \"%s\"", string(b1))
+		ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
+		if ind == nil {
+			common.Log.Debug("warning: xref marker not found! - continue seeking")
+			offset += buflen
+			continue
+		}
+
+		// Found it. Position the reader at the start of the last match in this chunk.
+		lastInd := ind[len(ind)-1]
+		common.Log.Trace("Ind: % d", ind)
+		if _, err := parser.rs.Seek(-offset-buflen+int64(lastInd[0]), io.SeekEnd); err != nil {
+			return err
+		}
+		parser.reader = bufio.NewReader(parser.rs)
+
+		// Go past whitespace, finish at 'x'.
+		for {
+			bb, err := parser.reader.Peek(1)
+			if err != nil {
+				return err
+			}
+			common.Log.Trace("B: %d %c", bb[0], bb[0])
+			if !IsWhiteSpace(bb[0]) {
+				break
+			}
+			if _, err := parser.reader.Discard(1); err != nil {
+				return err
+			}
+		}
+
+		return nil
+	}
+
+	common.Log.Debug("error: Xref table marker was not found.")
+	return errors.New("xref not found ")
+}
+
+// Called when Pdf version not found normally.  Looks for the PDF version by scanning top-down.
+// %PDF-1.7
+func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) {
+	// Go to beginning, reset reader.
+	parser.rs.Seek(0, os.SEEK_SET)
+	parser.reader = bufio.NewReader(parser.rs)
+
+	// Keep a running buffer of last bytes.
+	bufLen := 20
+	last := make([]byte, bufLen)
+
+	for {
+		b, err := parser.reader.ReadByte()
+		if err != nil {
+			if err == io.EOF {
+				break
+			} else {
+				return 0, 0, err
+			}
+		}
+
+		// Format:
+		// object number - whitespace - generation number - obj
+		// e.g. "12 0 obj"
+		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
+			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
+			major := int(last[bufLen-2] - '0')
+			minor := int(b - '0')
+			return major, minor, nil
+		}
+
+		last = append(last[1:bufLen], b)
+	}
+
+	return 0, 0, errors.New("version not found")
+}
--- a/internal/pdf/core/stream.go
+++ b/internal/pdf/core/stream.go
@@ -0,0 +1,129 @@
+package core
+
+import (
+	"fmt"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// NewEncoderFromStream creates a StreamEncoder based on the stream's dictionary.
+func NewEncoderFromStream(streamObj *PdfObjectStream) (StreamEncoder, error) {
+	filterObj := TraceToDirectObject(streamObj.PdfObjectDictionary.Get("Filter"))
+	if filterObj == nil {
+		// No filter, return raw data back.
+		return NewRawEncoder(), nil
+	}
+
+	if _, isNull := filterObj.(*PdfObjectNull); isNull {
+		// Filter is null -> raw data.
+		return NewRawEncoder(), nil
+	}
+
+	// The filter should be a name or an array with a list of filter names.
+	method, ok := filterObj.(*PdfObjectName)
+	if !ok {
+		array, ok := filterObj.(*PdfObjectArray)
+		if !ok {
+			return nil, fmt.Errorf("filter not a Name or Array object")
+		}
+		if len(*array) == 0 {
+			// Empty array -> indicates raw filter (no filter).
+			return NewRawEncoder(), nil
+		}
+
+		if len(*array) != 1 {
+			menc, err := newMultiEncoderFromStream(streamObj)
+			if err != nil {
+				common.Log.Error("Failed creating multi encoder: %v", err)
+				return nil, err
+			}
+
+			common.Log.Trace("Multi enc: %s\n", menc)
+			return menc, nil
+		}
+
+		// Single element.
+		filterObj = (*array)[0]
+		method, ok = filterObj.(*PdfObjectName)
+		if !ok {
+			return nil, fmt.Errorf("filter array member not a Name object")
+		}
+	}
+
+	switch *method {
+	case StreamEncodingFilterNameFlate:
+		return newFlateEncoderFromStream(streamObj, nil)
+	case StreamEncodingFilterNameLZW:
+		return newLZWEncoderFromStream(streamObj, nil)
+	case StreamEncodingFilterNameDCT:
+		return newDCTEncoderFromStream(streamObj, nil)
+	case StreamEncodingFilterNameRunLength:
+		return newRunLengthEncoderFromStream()
+	case StreamEncodingFilterNameASCIIHex:
+		return NewASCIIHexEncoder(), nil
+	case StreamEncodingFilterNameASCII85, "A85":
+		return NewASCII85Encoder(), nil
+	case StreamEncodingFilterNameCCITTFax:
+		return NewCCITTFaxEncoder(), nil
+	case StreamEncodingFilterNameJBIG2:
+		return NewJBIG2Encoder(), nil
+	case StreamEncodingFilterNameJPX:
+		return NewJPXEncoder(), nil
+	default:
+		common.Log.Debug("error: Unsupported encoding method!")
+		return nil, fmt.Errorf("unsupported encoding method (%s)", *method)
+	}
+}
+
+// DecodeStream decodes the stream data with the encoder selected by the stream's dictionary and
+// returns the decoded bytes. An error is returned if no encoder can be created for the stream or
+// if decoding fails.
+func DecodeStream(streamObj *PdfObjectStream) ([]byte, error) {
+	common.Log.Trace("Decode stream")
+
+	enc, encErr := NewEncoderFromStream(streamObj)
+	if encErr != nil {
+		common.Log.Debug("Stream decoding failed: %v", encErr)
+		return nil, encErr
+	}
+	common.Log.Trace("Encoder: %#v\n", enc)
+
+	data, decErr := enc.DecodeStream(streamObj)
+	if decErr != nil {
+		common.Log.Debug("Stream decoding failed: %v", decErr)
+		return nil, decErr
+	}
+
+	return data, nil
+}
+
+// EncodeStream encodes the stream data using the encoder specified by the stream's dictionary.
+//
+// On success streamObj.Stream is replaced by the encoded bytes and the dictionary's "Length"
+// entry is set to the encoded size. For LZW streams, EarlyChange is forced to 0 on both the
+// encoder and the dictionary. An error is returned if no encoder can be created for the stream
+// or if encoding fails; in the latter case the stream data is left untouched.
+func EncodeStream(streamObj *PdfObjectStream) error {
+	common.Log.Trace("Encode stream")
+
+	encoder, err := NewEncoderFromStream(streamObj)
+	if err != nil {
+		common.Log.Debug("Stream encoding failed: %v", err)
+		return err
+	}
+
+	if lzwenc, is := encoder.(*LZWEncoder); is {
+		// If LZW:
+		// Make sure to use EarlyChange 0.. We do not have write support for 1 yet.
+		lzwenc.EarlyChange = 0
+		streamObj.PdfObjectDictionary.Set("EarlyChange", MakeInteger(0))
+	}
+
+	common.Log.Trace("Encoder: %+v\n", encoder)
+	encoded, err := encoder.EncodeBytes(streamObj.Stream)
+	if err != nil {
+		common.Log.Debug("Stream encoding failed: %v", err)
+		return err
+	}
+
+	streamObj.Stream = encoded
+
+	// Update length
+	streamObj.PdfObjectDictionary.Set("Length", MakeInteger(int64(len(encoded))))
+
+	return nil
+}
--- a/internal/pdf/core/symbols.go
+++ b/internal/pdf/core/symbols.go
@@ -0,0 +1,74 @@
+package core
+
+// IsWhiteSpace reports whether ch is one of the PDF white-space characters
+// (Table 1, 7.2.2 Character Set): NUL, horizontal tab, line feed, form feed, carriage return
+// or space.
+// TODO (v3): Unexport.
+func IsWhiteSpace(ch byte) bool {
+	switch ch {
+	case 0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20:
+		return true
+	default:
+		return false
+	}
+}
+
+// IsFloatDigit reports whether c can be part of a float number string, i.e. whether it is a
+// decimal digit or the decimal point.
+// TODO (v3): Unexport.
+func IsFloatDigit(c byte) bool {
+	if c == '.' {
+		return true
+	}
+	return c >= '0' && c <= '9'
+}
+
+// IsDecimalDigit reports whether c is a decimal digit ('0' through '9').
+// TODO (v3): Unexport.
+func IsDecimalDigit(c byte) bool {
+	return '0' <= c && c <= '9'
+}
+
+// IsOctalDigit reports whether c is an octal digit ('0' through '7').
+// TODO (v3): Unexport.
+func IsOctalDigit(c byte) bool {
+	return '0' <= c && c <= '7'
+}
+
+// IsPrintable reports whether char lies in the range EXCLAMATION MARK (21h) (!) to
+// TILDE (7Eh) (~), inclusive. Regular characters outside that range should be written using
+// the hexadecimal notation.
+// TODO (v3): Unexport.
+func IsPrintable(char byte) bool {
+	return char >= 0x21 && char <= 0x7E
+}
+
+// IsDelimiter reports whether char is one of the delimiter characters
+// ( ) < > [ ] { } / %.
+// TODO (v3): Unexport.
+func IsDelimiter(char byte) bool {
+	switch char {
+	case '(', ')', '<', '>', '[', ']', '{', '}', '/', '%':
+		return true
+	default:
+		return false
+	}
+}
--- a/internal/pdf/core/utils.go
+++ b/internal/pdf/core/utils.go
@@ -0,0 +1,172 @@
+package core
+
+import (
+	"errors"
+	"fmt"
+	"sort"
+
+	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
+)
+
+// checkBounds verifies that the slice expression
+//
+//	slice[a:b] where sliceLen=len(slice)
+//
+// is within bounds, i.e. that 0 <= a <= b <= sliceLen. The first violated condition is
+// reported as an error; nil is returned when the range is valid.
+func checkBounds(sliceLen, a, b int) error {
+	switch {
+	case a < 0 || a > sliceLen:
+		return errors.New("slice index a out of bounds")
+	case b < a:
+		return errors.New("invalid slice index b < a")
+	case b > sliceLen:
+		return errors.New("slice index b out of bounds")
+	}
+
+	return nil
+}
+
+// Inspect analyzes the document object structure.
+// It is the exported entry point for inspect and returns its result unchanged: a map from
+// object type name to the number of objects counted for that type, or an error.
+func (parser *PdfParser) Inspect() (map[string]int, error) {
+	return parser.inspect()
+}
+
+// GetObjectNums returns the object numbers of all entries in the parser's xref table, sorted in
+// ascending order.
+func (parser *PdfParser) GetObjectNums() []int {
+	nums := make([]int, 0, len(parser.xrefs))
+	for _, entry := range parser.xrefs {
+		nums = append(nums, entry.objectNumber)
+	}
+
+	// parser.xrefs is a map, so iteration order is random; sorting gives a consistent ordering
+	// of PDF objects in output.
+	sort.Ints(nums)
+
+	return nums
+}
+
+/*
+ * Inspect object types.
+ * Go through all objects in the cross ref table and detect the types.
+ * Mostly for debugging purposes and inspecting odd PDF files.
+ */
+func (parser *PdfParser) inspect() (map[string]int, error) {
+	common.Log.Trace("--------INSPECT ----------")
+	common.Log.Trace("Xref table:")
+
+	objTypes := map[string]int{}
+	objCount := 0
+	failedCount := 0
+
+	keys := []int{}
+	for k := range parser.xrefs {
+		keys = append(keys, k)
+	}
+	sort.Ints(keys)
+
+	i := 0
+	for _, k := range keys {
+		xref := parser.xrefs[k]
+		if xref.objectNumber == 0 {
+			continue
+		}
+		objCount++
+		common.Log.Trace("==========")
+		common.Log.Trace("Looking up object number: %d", xref.objectNumber)
+		o, err := parser.LookupByNumber(xref.objectNumber)
+		if err != nil {
+			common.Log.Trace("error: Fail to lookup obj %d (%s)", xref.objectNumber, err)
+			failedCount++
+			continue
+		}
+
+		common.Log.Trace("obj: %s", o)
+
+		iobj, isIndirect := o.(*PdfIndirectObject)
+		if isIndirect {
+			common.Log.Trace("IND OOBJ %d: %s", xref.objectNumber, iobj)
+			dict, isDict := iobj.PdfObject.(*PdfObjectDictionary)
+			if isDict {
+				// Check if has Type parameter.
+				if ot, has := dict.Get("Type").(*PdfObjectName); has {
+					otype := string(*ot)
+					common.Log.Trace("---> Obj type: %s", otype)
+					_, isDefined := objTypes[otype]
+					if isDefined {
+						objTypes[otype]++
+					} else {
+						objTypes[otype] = 1
+					}
+				} else if ot, has := dict.Get("Subtype").(*PdfObjectName); has {
+					// Check if subtype
+					otype := string(*ot)
+					common.Log.Trace("---> Obj subtype: %s", otype)
+					_, isDefined := objTypes[otype]
+					if isDefined {
+						objTypes[otype]++
+					} else {
+						objTypes[otype] = 1
+					}
+				}
+				if val, has := dict.Get("S").(*PdfObjectName); has && *val == "JavaScript" {
+					// Check if Javascript.
+					_, isDefined := objTypes["JavaScript"]
+					if isDefined {
+						objTypes["JavaScript"]++
+					} else {
+						objTypes["JavaScript"] = 1
+					}
+				}
+
+			}
+		} else if sobj, isStream := o.(*PdfObjectStream); isStream {
+			if otype, ok := sobj.PdfObjectDictionary.Get("Type").(*PdfObjectName); ok {
+				common.Log.Trace("--> Stream object type: %s", *otype)
+				k := string(*otype)
+				objTypes[k]++
+			}
+		} else { // Direct.
+			dict, isDict := o.(*PdfObjectDictionary)
+			if isDict {
+				ot, isName := dict.Get("Type").(*PdfObjectName)
+				if isName {
+					otype := string(*ot)
+					common.Log.Trace("--- obj type %s", otype)
+					objTypes[otype]++
+				}
+			}
+			common.Log.Trace("DIRECT OBJ %d: %s", xref.objectNumber, o)
+		}
+
+		i++
+	}
+	common.Log.Trace("--------EOF INSPECT ----------")
+	common.Log.Trace("=======")
+	common.Log.Trace("Object count: %d", objCount)
+	common.Log.Trace("Failed lookup: %d", failedCount)
+	for t, c := range objTypes {
+		common.Log.Trace("%s: %d", t, c)
+	}
+	common.Log.Trace("=======")
+
+	if len(parser.xrefs) < 1 {
+		common.Log.Debug("error: This document is invalid (xref table missing!)")
+		return nil, fmt.Errorf("invalid document (xref table missing)")
+	}
+
+	fontObjs, ok := objTypes["Font"]
+	if !ok || fontObjs < 2 {
+		common.Log.Trace("This document is probably scanned!")
+	} else {
+		common.Log.Trace("This document is valid for extraction!")
+	}
+
+	return objTypes, nil
+}
+
+// absInt returns the absolute value of x.
+func absInt(x int) int {
+	if x >= 0 {
+		return x
+	}
+	return -x
+}