fix wrong git ignore

This commit is contained in:
Adrian Zürcher
2025-12-15 17:44:00 +01:00
parent ed9f31bb96
commit 8f313c00f0
126 changed files with 70589 additions and 1 deletions

View File

@@ -0,0 +1,13 @@
package core
import "errors"
// Sentinel errors returned by the stream encoding/decoding code.
var (
// ErrUnsupportedEncodingParameters error indicates that encoding/decoding was attempted with unsupported
// encoding parameters.
// For example when trying to encode with an unsupported Predictor (flate).
ErrUnsupportedEncodingParameters = errors.New("unsupported encoding parameters")
// ErrNoCCITTFaxDecode indicates that the CCITTFaxDecode filter is not implemented.
// NOTE(review): the message starts with a space; it is left as is because callers may match on the text.
ErrNoCCITTFaxDecode = errors.New(" CCITTFaxDecode encoding is not yet implemented")
// ErrNoJBIG2Decode indicates that the JBIG2Decode filter is not implemented.
ErrNoJBIG2Decode = errors.New(" JBIG2Decode encoding is not yet implemented")
// ErrNoJPXDecode indicates that the JPXDecode filter is not implemented.
ErrNoJPXDecode = errors.New(" JPXDecode encoding is not yet implemented")
)

View File

@@ -0,0 +1,372 @@
package core
import (
"bufio"
"bytes"
"errors"
"os"
"strings"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// TODO (v3): Create a new type xrefType which can be an integer and can be used for improved type checking.
// TODO (v3): Unexport these constants and rename with camelCase.
// Kinds of cross reference entries, stored in XrefObject.xtype.
// iota advances per constant in the block, so the values are 0 and 1 respectively.
const (
// XREF_TABLE_ENTRY indicates a normal xref table entry.
XREF_TABLE_ENTRY = iota
// XREF_OBJECT_STREAM indicates an xref entry in an xref object stream.
XREF_OBJECT_STREAM = iota
)
// XrefObject defines a cross reference entry which is a map between object number (with generation number) and the
// location of the actual object, either as a file offset (xref table entry), or as a location within an xref
// stream object (xref object stream).
// TODO (v3): Unexport.
type XrefObject struct {
// Entry kind: XREF_TABLE_ENTRY or XREF_OBJECT_STREAM.
xtype int
// Object and generation number of the object this entry describes.
objectNumber int
generation int
// For normal xrefs (defined by OFFSET)
// Absolute position that lookupByNumber seeks to before parsing the object.
offset int64
// For xrefs to object streams.
// osObjNumber is the object number of the object stream holding the object.
osObjNumber int
// NOTE(review): presumably the object's index inside that stream; in the code shown here it is only logged.
osObjIndex int
}
// XrefTable is a map between object number and corresponding XrefObject.
// Being a map, it has no defined iteration order.
// TODO (v3): Unexport.
// TODO: Consider changing to a slice, so can maintain the object order without sorting when analyzing.
type XrefTable map[int]XrefObject
// ObjectStream represents an object stream's information which can contain multiple indirect objects.
// The information specifies the number of objects and has information about offset locations for
// each object.
// TODO (v3): Unexport.
type ObjectStream struct {
// N is the number of objects, taken from the stream dictionary's "N" entry.
N int // TODO (v3): Unexport.
// ds holds the decoded stream bytes.
ds []byte
// offsets maps an object number to its byte position within ds ("First" plus the relative offset).
offsets map[int]int64
}
// ObjectStreams defines a map between object numbers (object streams only) and underlying ObjectStream information.
// The parser uses it to cache object streams that have already been decoded.
type ObjectStreams map[int]ObjectStream
// ObjectCache defines a map between object numbers and corresponding PdfObject. Serves as a cache for PdfObjects that
// have already been parsed.
// The key is the object number only; the generation number is not part of the key.
// TODO (v3): Unexport.
type ObjectCache map[int]PdfObject
// lookupObjectViaOS loads object objNum from the object stream whose own object number is sobjNumber and
// returns it wrapped in a new PdfIndirectObject (generation number left at zero).
//
// The first time a stream is requested it is looked up, validated (Type must be ObjStm, N and First must be
// integers), decoded, and its offset table is parsed; the result is cached in parser.objstms. Later requests
// reuse the cached decoded bytes and offsets.
//
// While working, parser.reader is pointed at the decoded stream bytes. A deferred SetFileOffset restores the
// previous file position (and with it a reader over the underlying file) when the function returns.
//
// An error is returned if the stream cannot be found, is not a stream object, is still encrypted, has an
// invalid dictionary or offset table, or if the object itself fails to parse or parses to nil.
func (parser *PdfParser) lookupObjectViaOS(sobjNumber int, objNum int) (PdfObject, error) {
var bufReader *bytes.Reader
var objstm ObjectStream
var cached bool
objstm, cached = parser.objstms[sobjNumber]
if !cached {
// Not decoded yet: fetch the stream object itself first.
soi, err := parser.LookupByNumber(sobjNumber)
if err != nil {
common.Log.Debug("Missing object stream with number %d", sobjNumber)
return nil, err
}
so, ok := soi.(*PdfObjectStream)
if !ok {
return nil, errors.New("invalid object stream")
}
// The stream contents must already be decrypted before they can be decoded.
if parser.crypter != nil && !parser.crypter.isDecrypted(so) {
return nil, errors.New("need to decrypt the stream")
}
sod := so.PdfObjectDictionary
common.Log.Trace("so d: %s\n", *sod)
name, ok := sod.Get("Type").(*PdfObjectName)
if !ok {
common.Log.Debug("error: Object stream should always have a Type")
return nil, errors.New("object stream missing Type")
}
// The type name is compared case-insensitively.
if strings.ToLower(string(*name)) != "objstm" {
common.Log.Debug("error: Object stream type shall always be ObjStm !")
return nil, errors.New("object stream type != ObjStm")
}
// N: number of objects in the stream.
N, ok := sod.Get("N").(*PdfObjectInteger)
if !ok {
return nil, errors.New("invalid N in stream dictionary")
}
// First: offset added to every table offset to get the position inside the decoded data.
firstOffset, ok := sod.Get("First").(*PdfObjectInteger)
if !ok {
return nil, errors.New("invalid First in stream dictionary")
}
common.Log.Trace("type: %s number of objects: %d", name, *N)
ds, err := DecodeStream(so)
if err != nil {
return nil, err
}
common.Log.Trace("Decoded: %s", ds)
// Temporarily change the reader object to this decoded buffer.
// Change back afterwards.
bakOffset := parser.GetFileOffset()
defer func() { parser.SetFileOffset(bakOffset) }()
bufReader = bytes.NewReader(ds)
parser.reader = bufio.NewReader(bufReader)
common.Log.Trace("Parsing offset map")
// Load the offset map (relative to the beginning of the stream...)
offsets := map[int]int64{}
// Object list and offsets.
// The table consists of N pairs: object number followed by its relative offset.
for i := 0; i < int(*N); i++ {
parser.skipSpaces()
// Object number.
obj, err := parser.parseNumber()
if err != nil {
return nil, err
}
onum, ok := obj.(*PdfObjectInteger)
if !ok {
return nil, errors.New("invalid object stream offset table")
}
parser.skipSpaces()
// Offset.
obj, err = parser.parseNumber()
if err != nil {
return nil, err
}
offset, ok := obj.(*PdfObjectInteger)
if !ok {
return nil, errors.New("invalid object stream offset table")
}
common.Log.Trace("obj %d offset %d", *onum, *offset)
// Store the position relative to the start of the decoded data.
offsets[int(*onum)] = int64(*firstOffset + *offset)
}
// Cache the decoded stream so later lookups skip the decoding and table parsing.
objstm = ObjectStream{N: int(*N), ds: ds, offsets: offsets}
parser.objstms[sobjNumber] = objstm
} else {
// Temporarily change the reader object to this decoded buffer.
// Point back afterwards.
bakOffset := parser.GetFileOffset()
defer func() { parser.SetFileOffset(bakOffset) }()
bufReader = bytes.NewReader(objstm.ds)
// Temporarily change the reader object to this decoded buffer.
parser.reader = bufio.NewReader(bufReader)
}
// NOTE(review): if objNum is not in the offset table, the map lookup yields 0 and parsing starts at
// the beginning of the decoded data rather than failing.
offset := objstm.offsets[objNum]
common.Log.Trace("ACTUAL offset[%d] = %d", objNum, offset)
// The Seek error is not checked here.
bufReader.Seek(offset, os.SEEK_SET)
// A fresh buffered reader is needed so that no data buffered before the seek is used.
parser.reader = bufio.NewReader(bufReader)
// Peek is used for trace output only; its error is deliberately ignored.
bb, _ := parser.reader.Peek(100)
common.Log.Trace("OBJ peek \"%s\"", string(bb))
val, err := parser.parseObject()
if err != nil {
common.Log.Debug("error Fail to read object (%s)", err)
return nil, err
}
if val == nil {
return nil, errors.New("object cannot be null")
}
// Make an indirect object around it.
io := PdfIndirectObject{}
io.ObjectNumber = int64(objNum)
io.PdfObject = val
return &io, nil
}
// LookupByNumber resolves the object with the given object number.
// It is the public entry point for lookupByNumberWrapper: repairs of broken xref tables are
// always attempted, and the "found inside an object stream" flag is dropped from the result.
// TODO (v3): Unexport.
func (parser *PdfParser) LookupByNumber(objNumber int) (PdfObject, error) {
	const attemptRepairs = true
	obj, _, err := parser.lookupByNumberWrapper(objNumber, attemptRepairs)
	return obj, err
}
// lookupByNumberWrapper looks up an object via lookupByNumber and decrypts it when needed.
//
// Decryption only happens when a crypter is configured, the object did not come from an object
// stream, and the crypter does not already consider it decrypted. The boolean result reports
// whether the object was loaded from an object stream; it is passed through on errors too.
func (parser *PdfParser) lookupByNumberWrapper(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
	obj, inObjStream, err := parser.lookupByNumber(objNumber, attemptRepairs)
	if err != nil {
		return nil, inObjStream, err
	}

	// Objects inside object streams are never decrypted individually.
	needsDecrypt := !inObjStream && parser.crypter != nil && !parser.crypter.isDecrypted(obj)
	if !needsDecrypt {
		return obj, inObjStream, nil
	}

	if err := parser.crypter.Decrypt(obj, 0, 0); err != nil {
		return nil, inObjStream, err
	}
	return obj, inObjStream, nil
}
// getObjectNumber returns the object and generation number carried by an indirect object or a
// stream object. Any other kind of object yields an error.
func getObjectNumber(obj PdfObject) (int64, int64, error) {
	switch t := obj.(type) {
	case *PdfIndirectObject:
		return t.ObjectNumber, t.GenerationNumber, nil
	case *PdfObjectStream:
		return t.ObjectNumber, t.GenerationNumber, nil
	}
	return 0, 0, errors.New("not an indirect/stream object")
}
// lookupByNumber resolves object objNumber using the object cache and the xref table.
//
// attemptRepairs signals whether a broken xref table may be repaired: when set, a failed parse at the
// recorded offset triggers a top-down xref rebuild, and an object number mismatch triggers a rebuild of the
// xref table; in both cases the lookup is retried once with repairs disabled.
//
// The boolean result reports whether the object was loaded from an object stream. Note that a cache hit
// always reports false, including for objects that were originally loaded from an object stream.
// An object number that has no xref entry resolves to a null object rather than an error.
func (parser *PdfParser) lookupByNumber(objNumber int, attemptRepairs bool) (PdfObject, bool, error) {
obj, ok := parser.ObjCache[objNumber]
if ok {
common.Log.Trace("Returning cached object %d", objNumber)
return obj, false, nil
}
xref, ok := parser.xrefs[objNumber]
if !ok {
// An indirect reference to an undefined object shall not be
// considered an error by a conforming reader; it shall be
// treated as a reference to the null object.
common.Log.Trace("Unable to locate object in xrefs! - Returning null object")
var nullObj PdfObjectNull
return &nullObj, false, nil
}
common.Log.Trace("Lookup obj number %d", objNumber)
switch xref.xtype {
case XREF_TABLE_ENTRY:
common.Log.Trace("xrefobj obj num %d", xref.objectNumber)
common.Log.Trace("xrefobj gen %d", xref.generation)
common.Log.Trace("xrefobj offset %d", xref.offset)
// Position the file at the recorded offset and start a fresh buffered reader there.
// The Seek error is not checked here.
parser.rs.Seek(xref.offset, os.SEEK_SET)
parser.reader = bufio.NewReader(parser.rs)
obj, err := parser.ParseIndirectObject()
if err != nil {
common.Log.Debug("error Failed reading xref (%s)", err)
// Offset pointing to a non-object. Try to repair the file.
if attemptRepairs {
common.Log.Debug("Attempting to repair xrefs (top down)")
xrefTable, err := parser.repairRebuildXrefsTopDown()
if err != nil {
common.Log.Debug("error Failed repair (%s)", err)
return nil, false, err
}
parser.xrefs = *xrefTable
// Retry once against the rebuilt table, without further repairs.
return parser.lookupByNumber(objNumber, false)
}
return nil, false, err
}
if attemptRepairs {
// Check the object number..
// If it does not match, then try to rebuild, i.e. loop through
// all the items in the xref and look each one up and correct.
// The error from getObjectNumber is ignored; on failure realObjNum is 0.
realObjNum, _, _ := getObjectNumber(obj)
if int(realObjNum) != objNumber {
common.Log.Debug("invalid xrefs: Rebuilding")
err := parser.rebuildXrefTable()
if err != nil {
return nil, false, err
}
// Empty the cache.
parser.ObjCache = ObjectCache{}
// Try looking up again and return.
return parser.lookupByNumberWrapper(objNumber, false)
}
}
common.Log.Trace("Returning obj")
parser.ObjCache[objNumber] = obj
return obj, false, nil
case XREF_OBJECT_STREAM:
common.Log.Trace("xref from object stream!")
common.Log.Trace(">Load via OS!")
common.Log.Trace("Object stream available in object %d/%d", xref.osObjNumber, xref.osObjIndex)
// An object cannot live inside an object stream that is the object itself.
if xref.osObjNumber == objNumber {
common.Log.Debug("error Circular reference!?!")
return nil, true, errors.New(" Xref circular reference")
}
// The containing object stream must itself be listed in the xref table.
_, exists := parser.xrefs[xref.osObjNumber]
if exists {
optr, err := parser.lookupObjectViaOS(xref.osObjNumber, objNumber) //xref.osObjIndex)
if err != nil {
common.Log.Debug("error Returning ERR (%s)", err)
return nil, true, err
}
common.Log.Trace("<Loaded via OS")
parser.ObjCache[objNumber] = optr
if parser.crypter != nil {
// Mark as decrypted (inside object stream) for caching.
// and avoid decrypting decrypted object.
parser.crypter.DecryptedObjects[optr] = true
}
return optr, true, nil
} else {
common.Log.Debug("?? Belongs to a non-cross referenced object ...!")
return nil, true, errors.New("OS belongs to a non cross referenced object")
}
}
return nil, false, errors.New("unknown xref type")
}
// LookupByReference resolves the object a reference points to.
// Only the reference's object number is used for the lookup; its generation number is ignored.
func (parser *PdfParser) LookupByReference(ref PdfObjectReference) (PdfObject, error) {
	common.Log.Trace("Looking up reference %s", ref.String())
	objNumber := int(ref.ObjectNumber)
	return parser.LookupByNumber(objNumber)
}
// Trace resolves obj to a direct object, looking up references as needed (unlike TraceToDirect).
//
// Anything that is not a reference is returned unchanged. A reference is looked up; if the
// result is an indirect object its contained object is returned, otherwise (for example a stream
// or null object) the result itself is returned. The parser's file position is restored before
// returning. If the contained object is again a reference, the indirect object is returned
// together with an error.
// TODO (v3): Unexport.
func (parser *PdfParser) Trace(obj PdfObject) (PdfObject, error) {
	ref, isRef := obj.(*PdfObjectReference)
	if !isRef {
		// Direct object already.
		return obj, nil
	}

	// The lookup moves the file position; put it back when done.
	savedOffset := parser.GetFileOffset()
	defer parser.SetFileOffset(savedOffset)

	resolved, err := parser.LookupByReference(*ref)
	if err != nil {
		return nil, err
	}

	switch ind := resolved.(type) {
	case *PdfIndirectObject:
		if _, nested := ind.PdfObject.(*PdfObjectReference); nested {
			return ind, errors.New("multi depth trace pointer to pointer")
		}
		return ind.PdfObject, nil
	default:
		// Not indirect (Stream or null object).
		return resolved, nil
	}
}
// printXrefTable writes every entry of the xref table to the debug log, numbered from 1.
// Entries are visited in map iteration order, which is not stable between runs.
func printXrefTable(xrefTable XrefTable) {
	common.Log.Debug("=X=X=X=")
	common.Log.Debug("Xref table:")
	seq := 1
	for _, entry := range xrefTable {
		common.Log.Debug("i+1: %d (obj num: %d gen: %d) -> %d", seq, entry.objectNumber, entry.generation, entry.offset)
		seq++
	}
}

1732
internal/pdf/core/crypt.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,265 @@
package core
import (
"crypto/aes"
"crypto/cipher"
"crypto/md5"
"crypto/rand"
"crypto/rc4"
"fmt"
"io"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// cryptMethods is the registry of supported crypt filter methods, keyed by CFM name.
var cryptMethods = map[string]cryptFilterMethod{}
// registerCryptFilterMethod adds a crypt filter method to the registry under the name reported
// by its CFM method, replacing any method previously registered under that name.
func registerCryptFilterMethod(m cryptFilterMethod) {
	name := m.CFM()
	cryptMethods[name] = m
}
// getCryptFilterMethod checks whether a CFM with the specified name is supported and returns its
// implementation. An error naming the filter is returned when nothing is registered under it.
func getCryptFilterMethod(name string) (cryptFilterMethod, error) {
	if method := cryptMethods[name]; method != nil {
		return method, nil
	}
	return nil, fmt.Errorf("unsupported crypt filter: %q", name)
}
func init() {
// register supported crypt filter methods
registerCryptFilterMethod(cryptFilterV2{})
registerCryptFilterMethod(cryptFilterAESV2{})
registerCryptFilterMethod(cryptFilterAESV3{})
}
// cryptFilterMethod is a common interface for crypt filter methods.
// Implementations are registered with registerCryptFilterMethod and retrieved by CFM name.
type cryptFilterMethod interface {
// CFM returns a name of the filter that should be used in CFM field of Encrypt dictionary.
CFM() string
// MakeKey generates an object encryption key based on file encryption key and object numbers.
// Used only for legacy filters - AESV3 doesn't change the key for each object.
MakeKey(objNum, genNum uint32, fkey []byte) ([]byte, error)
// EncryptBytes encrypts a buffer using object encryption key, as returned by MakeKey.
// Implementation may reuse a buffer and encrypt data in-place.
EncryptBytes(p []byte, okey []byte) ([]byte, error)
// DecryptBytes decrypts a buffer using object encryption key, as returned by MakeKey.
// Implementation may reuse a buffer and decrypt data in-place.
DecryptBytes(p []byte, okey []byte) ([]byte, error)
}
// makeKeyV2 derives the per-object encryption key shared by the V2 and AESV2 crypt filters.
//
// The file key ekey is extended with the three low-order bytes of objNum and the two low-order
// bytes of genNum (least significant byte first), plus the bytes "sAlT" when isAES is set. The
// result is the MD5 digest of that data, truncated to len(ekey)+5 bytes when that is less than
// the 16-byte digest size. The returned error is always nil.
func makeKeyV2(objNum, genNum uint32, ekey []byte, isAES bool) ([]byte, error) {
	n := len(ekey)
	key := make([]byte, n+5)
	copy(key, ekey)
	key[n], key[n+1], key[n+2] = byte(objNum), byte(objNum>>8), byte(objNum>>16)
	key[n+3], key[n+4] = byte(genNum), byte(genNum>>8)

	if isAES {
		// If using the AES algorithm, extend the encryption key an
		// additional 4 bytes by adding the value "sAlT", which
		// corresponds to the hexadecimal values 0x73, 0x41, 0x6C, 0x54.
		key = append(key, "sAlT"...)
	}

	digest := md5.Sum(key)
	if n+5 < md5.Size {
		return digest[:n+5], nil
	}
	return digest[:], nil
}
// cryptFilterV2 is the RC4-based crypt filter method.
type cryptFilterV2 struct{}

// CFM returns the crypt filter method name of this filter.
func (cryptFilterV2) CFM() string {
	return CryptFilterV2
}

// MakeKey derives the per-object key from the file key using makeKeyV2 without the AES salt.
func (f cryptFilterV2) MakeKey(objNum, genNum uint32, ekey []byte) ([]byte, error) {
	const isAES = false
	return makeKeyV2(objNum, genNum, ekey, isAES)
}

// EncryptBytes applies the RC4 key stream for okey to buf in place and returns buf.
// An error is returned if okey is not accepted by rc4.NewCipher.
func (cryptFilterV2) EncryptBytes(buf []byte, okey []byte) ([]byte, error) {
	stream, err := rc4.NewCipher(okey)
	if err != nil {
		return nil, err
	}

	common.Log.Trace("RC4 Encrypt: % x", buf)
	stream.XORKeyStream(buf, buf)
	common.Log.Trace("to: % x", buf)
	return buf, nil
}

// DecryptBytes applies the RC4 key stream for okey to buf in place and returns buf.
// Apart from the trace output it performs the same operation as EncryptBytes.
func (cryptFilterV2) DecryptBytes(buf []byte, okey []byte) ([]byte, error) {
	stream, err := rc4.NewCipher(okey)
	if err != nil {
		return nil, err
	}

	common.Log.Trace("RC4 Decrypt: % x", buf)
	stream.XORKeyStream(buf, buf)
	common.Log.Trace("to: % x", buf)
	return buf, nil
}
// cryptFilterAES implements the generic AES encryption and decryption shared by the AESV2 and
// AESV3 filter methods.
type cryptFilterAES struct{}

// EncryptBytes pads buf, encrypts it with AES in CBC mode under okey, and returns a new buffer
// holding a random 16-byte initialization vector followed by the ciphertext.
//
// Padding follows the scheme described in RFC 2898 (PKCS #5): for a message of length M the pad
// is 16 - (M mod 16) bytes, each with that same value. A 9-byte message therefore gets 7 bytes
// of 0x07, and a message whose length is a multiple of 16 gets a full block of 0x10, so the pad
// can always be removed unambiguously when decrypting.
//
// The pad is appended to buf, which may write into spare capacity of the caller's slice.
// An error is returned if okey is not a valid AES key or no random IV can be read.
func (cryptFilterAES) EncryptBytes(buf []byte, okey []byte) ([]byte, error) {
	blockCipher, err := aes.NewCipher(okey)
	if err != nil {
		return nil, err
	}
	common.Log.Trace("AES Encrypt (%d): % x", len(buf), buf)

	padLen := aes.BlockSize - len(buf)%aes.BlockSize
	for n := padLen; n > 0; n-- {
		buf = append(buf, byte(padLen))
	}
	common.Log.Trace("Padded to %d bytes", len(buf))

	// The output starts with the random IV; the encrypted data follows it.
	out := make([]byte, aes.BlockSize+len(buf))
	iv, body := out[:aes.BlockSize], out[aes.BlockSize:]
	if _, err := io.ReadFull(rand.Reader, iv); err != nil {
		return nil, err
	}

	cipher.NewCBCEncrypter(blockCipher, iv).CryptBlocks(body, buf)
	common.Log.Trace("to (%d): % x", len(out), out)
	return out, nil
}
// DecryptBytes decrypts an AES-CBC encrypted buffer and strips its padding.
//
// buf must hold the 16-byte initialization vector followed by ciphertext whose length is a
// multiple of 16; okey is the AES key. The ciphertext is decrypted in place, so the contents of
// buf are overwritten and the returned slice aliases it. A buffer that contains only the IV
// yields an empty result.
//
// Padding follows the scheme described in RFC 2898 (PKCS #5): the last byte of the decrypted
// data gives the number of pad bytes to remove (1..16). A message that was empty before
// encryption decrypts to exactly one block of 0x10 bytes and therefore yields an empty result.
//
// An error is returned when okey is not a valid AES key, when buf is shorter than the IV, when
// the ciphertext is not block aligned, or when the pad length exceeds the decrypted data. In the
// last three cases the buffer is returned alongside the error.
func (cryptFilterAES) DecryptBytes(buf []byte, okey []byte) ([]byte, error) {
	ciph, err := aes.NewCipher(okey)
	if err != nil {
		return nil, err
	}

	// If using the AES algorithm, the Cipher Block Chaining (CBC)
	// mode, which requires an initialization vector, is used. The
	// block size parameter is set to 16 bytes, and the initialization
	// vector is a 16-byte random number that is stored as the first
	// 16 bytes of the encrypted stream or string.
	if len(buf) < 16 {
		common.Log.Debug("error AES invalid buf %s", buf)
		return buf, fmt.Errorf("AES: Buf len < 16 (%d)", len(buf))
	}

	iv := buf[:16]
	buf = buf[16:]

	if len(buf)%16 != 0 {
		common.Log.Debug(" iv (%d): % x", len(iv), iv)
		common.Log.Debug("buf (%d): % x", len(buf), buf)
		return buf, fmt.Errorf("AES buf length not multiple of 16 (%d)", len(buf))
	}

	mode := cipher.NewCBCDecrypter(ciph, iv)

	common.Log.Trace("AES Decrypt (%d): % x", len(buf), buf)
	common.Log.Trace("chop AES Decrypt (%d): % x", len(buf), buf)
	mode.CryptBlocks(buf, buf)
	common.Log.Trace("to (%d): % x", len(buf), buf)

	if len(buf) == 0 {
		common.Log.Trace("Empty buf, returning empty string")
		return buf, nil
	}

	// The padded length is indicated by the last values. Remove those.
	// A pad covering the whole buffer is legal (it is what an empty message encrypts to), so
	// only a pad length larger than the buffer is rejected.
	padLen := int(buf[len(buf)-1])
	if padLen > len(buf) {
		common.Log.Debug("Illegal pad length")
		return buf, fmt.Errorf("invalid pad length")
	}
	buf = buf[:len(buf)-padLen]

	return buf, nil
}
// cryptFilterAESV2 is an AES-based filter (128 bit key, PDF 1.6)
// Encryption and decryption come from the embedded cryptFilterAES.
type cryptFilterAESV2 struct {
cryptFilterAES
}
// CFM returns the crypt filter method name of this filter.
func (cryptFilterAESV2) CFM() string {
return CryptFilterAESV2
}
// MakeKey derives the per-object key from the file key using makeKeyV2 with the AES salt enabled.
func (cryptFilterAESV2) MakeKey(objNum, genNum uint32, ekey []byte) ([]byte, error) {
return makeKeyV2(objNum, genNum, ekey, true)
}
// cryptFilterAESV3 is an AES-based filter (256 bit key, PDF 2.0)
// Encryption and decryption come from the embedded cryptFilterAES.
type cryptFilterAESV3 struct {
cryptFilterAES
}
// CFM returns the crypt filter method name of this filter.
func (cryptFilterAESV3) CFM() string {
return CryptFilterAESV3
}
// MakeKey returns the file key unchanged; the object and generation numbers are ignored.
func (cryptFilterAESV3) MakeKey(_, _ uint32, ekey []byte) ([]byte, error) {
return ekey, nil
}

61
internal/pdf/core/ecb.go Normal file
View File

@@ -0,0 +1,61 @@
package core
import "crypto/cipher"
// ecb implements an Electronic Codebook encryption mode.
// This mode is used to compute or validate document permissions for R=6.
type ecb struct {
	b         cipher.Block
	blockSize int
}

// newECB wraps a block cipher and records its block size.
func newECB(b cipher.Block) *ecb {
	return &ecb{b: b, blockSize: b.BlockSize()}
}

// ecbCryptBlocks validates the buffers and then applies op to src one block at a time, writing
// each result at the same position in dst. It panics if src is not a whole number of blocks or
// if dst is shorter than src.
func ecbCryptBlocks(blockSize int, dst, src []byte, op func(dst, src []byte)) {
	if len(src)%blockSize != 0 {
		panic("crypto/cipher: input not full blocks")
	}
	if len(dst) < len(src) {
		panic("crypto/cipher: output smaller than input")
	}
	for off := 0; off < len(src); off += blockSize {
		op(dst[off:], src[off:off+blockSize])
	}
}

// ecbEncrypter is the encrypting cipher.BlockMode for ECB.
type ecbEncrypter ecb

// newECBEncrypter returns a cipher.BlockMode that encrypts in ECB mode using b.
func newECBEncrypter(b cipher.Block) cipher.BlockMode {
	return (*ecbEncrypter)(newECB(b))
}

// BlockSize returns the block size of the underlying cipher.
func (x *ecbEncrypter) BlockSize() int { return x.blockSize }

// CryptBlocks encrypts every block of src independently into dst.
func (x *ecbEncrypter) CryptBlocks(dst, src []byte) {
	ecbCryptBlocks(x.blockSize, dst, src, x.b.Encrypt)
}

// ecbDecrypter is the decrypting cipher.BlockMode for ECB.
type ecbDecrypter ecb

// newECBDecrypter returns a cipher.BlockMode that decrypts in ECB mode using b.
func newECBDecrypter(b cipher.Block) cipher.BlockMode {
	return (*ecbDecrypter)(newECB(b))
}

// BlockSize returns the block size of the underlying cipher.
func (x *ecbDecrypter) BlockSize() int { return x.blockSize }

// CryptBlocks decrypts every block of src independently into dst.
func (x *ecbDecrypter) CryptBlocks(dst, src []byte) {
	ecbCryptBlocks(x.blockSize, dst, src, x.b.Decrypt)
}

File diff suppressed because it is too large Load Diff

44
internal/pdf/core/io.go Normal file
View File

@@ -0,0 +1,44 @@
package core
import (
"bufio"
"errors"
"os"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// ReadAtLeast reads at least n bytes into slice p.
//
// It reads from the parser's buffered reader until n or more bytes have been stored in p and
// returns the number of bytes stored. Each read is offered the whole remainder of p, so more
// than n bytes may be returned when p is longer than n.
//
// An error is returned, together with the number of bytes stored so far, when a read fails
// (including reaching the end of the input early). An error is also returned, without reading
// anything, when p is shorter than n: in that case the request can never be satisfied, and
// reading into the exhausted slice would otherwise loop without making progress.
// TODO (v3): Unexport.
func (parser *PdfParser) ReadAtLeast(p []byte, n int) (int, error) {
	if n > len(p) {
		common.Log.Debug("error Failed reading: buffer of %d bytes cannot hold %d bytes", len(p), n)
		return 0, errors.New("read buffer too small")
	}

	remaining := n
	start := 0
	numRounds := 0
	for remaining > 0 {
		nRead, err := parser.reader.Read(p[start:])
		if err != nil {
			common.Log.Debug("error Failed reading (%d;%d) %s", nRead, numRounds, err.Error())
			return start, errors.New("failed reading")
		}
		numRounds++
		start += nRead
		remaining -= nRead
	}
	return start, nil
}
// GetFileOffset returns the current logical read position in the file: the position of the
// underlying seeker minus the bytes the buffered reader has read ahead but not yet handed out.
// A failure to query the seeker's position is ignored.
// TODO (v3): Unexport.
func (parser *PdfParser) GetFileOffset() int64 {
	pos, _ := parser.rs.Seek(0, os.SEEK_CUR)
	return pos - int64(parser.reader.Buffered())
}
// SetFileOffset moves the underlying file to the given absolute offset and replaces the
// parser's buffered reader with a fresh one, discarding anything buffered before the move.
// A failure to seek is ignored.
// TODO (v3): Unexport.
func (parser *PdfParser) SetFileOffset(offset int64) {
	src := parser.rs
	src.Seek(offset, os.SEEK_SET)
	parser.reader = bufio.NewReader(src)
}

1644
internal/pdf/core/parser.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,570 @@
package core
import (
"bytes"
"fmt"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// PdfObject is an interface which all primitive PDF objects must implement.
type PdfObject interface {
// String outputs a string representation of the primitive (for debugging).
String() string
// DefaultWriteString outputs the PDF primitive as written to file as expected by the standard.
DefaultWriteString() string
}
// PdfObjectBool represents the primitive PDF boolean object.
type PdfObjectBool bool
// PdfObjectInteger represents the primitive PDF integer numerical object.
type PdfObjectInteger int64
// PdfObjectFloat represents the primitive PDF floating point numerical object.
type PdfObjectFloat float64
// PdfObjectString represents the primitive PDF string object.
// The value holds the string contents; escaping is applied when it is written out.
// TODO (v3): Change to a struct and add a flag for hex/plaintext.
type PdfObjectString string
// PdfObjectName represents the primitive PDF name object.
// The value holds the name without the leading slash, which is added when it is written out.
type PdfObjectName string
// PdfObjectArray represents the primitive PDF array object.
type PdfObjectArray []PdfObject
// PdfObjectDictionary represents the primitive PDF dictionary/map object.
// Entries are stored in a map, with the keys additionally kept in insertion order.
type PdfObjectDictionary struct {
// dict holds the key -> value entries.
dict map[PdfObjectName]PdfObject
// keys lists the keys in the order they were first set; output is written in this order.
keys []PdfObjectName
}
// PdfObjectNull represents the primitive PDF null object.
type PdfObjectNull struct{}
// PdfObjectReference represents the primitive PDF reference object.
// It identifies an object by its object number and generation number.
type PdfObjectReference struct {
ObjectNumber int64
GenerationNumber int64
}
// PdfIndirectObject represents the primitive PDF indirect object.
// It pairs the object/generation number (embedded PdfObjectReference) with the contained object
// (embedded PdfObject).
type PdfIndirectObject struct {
PdfObjectReference
PdfObject
}
// PdfObjectStream represents the primitive PDF Object stream.
// It carries its object/generation number, the stream dictionary, and the stream bytes.
type PdfObjectStream struct {
PdfObjectReference
*PdfObjectDictionary
// Stream holds the stream data as stored, i.e. after any encoding has been applied.
Stream []byte
}
// MakeDict creates and returns an empty PdfObjectDictionary that is ready for use: both the
// entry map and the ordered key list are initialized.
func MakeDict() *PdfObjectDictionary {
	return &PdfObjectDictionary{
		dict: map[PdfObjectName]PdfObject{},
		keys: []PdfObjectName{},
	}
}
// MakeName creates a PdfObjectName from a string and returns a pointer to it.
func MakeName(s string) *PdfObjectName {
	name := new(PdfObjectName)
	*name = PdfObjectName(s)
	return name
}

// MakeInteger creates a PdfObjectInteger from an int64 and returns a pointer to it.
func MakeInteger(val int64) *PdfObjectInteger {
	num := new(PdfObjectInteger)
	*num = PdfObjectInteger(val)
	return num
}
// MakeArray creates a PdfObjectArray holding the given objects in order.
// The result is never nil, even when no objects are passed.
func MakeArray(objects ...PdfObject) *PdfObjectArray {
	array := make(PdfObjectArray, len(objects))
	copy(array, objects)
	return &array
}
// MakeArrayFromIntegers creates a PdfObjectArray from a slice of ints, where each array element
// is a PdfObjectInteger. The result is never nil.
func MakeArrayFromIntegers(vals []int) *PdfObjectArray {
	array := make(PdfObjectArray, len(vals))
	for i, v := range vals {
		array[i] = MakeInteger(int64(v))
	}
	return &array
}

// MakeArrayFromIntegers64 creates a PdfObjectArray from a slice of int64s, where each array
// element is a PdfObjectInteger. The result is never nil.
func MakeArrayFromIntegers64(vals []int64) *PdfObjectArray {
	array := make(PdfObjectArray, len(vals))
	for i, v := range vals {
		array[i] = MakeInteger(v)
	}
	return &array
}

// MakeArrayFromFloats creates a PdfObjectArray from a slice of float64s, where each array
// element is a PdfObjectFloat. The result is never nil.
func MakeArrayFromFloats(vals []float64) *PdfObjectArray {
	array := make(PdfObjectArray, len(vals))
	for i, v := range vals {
		array[i] = MakeFloat(v)
	}
	return &array
}
// MakeBool creates a PdfObjectBool from a bool and returns a pointer to it.
func MakeBool(val bool) *PdfObjectBool {
	b := new(PdfObjectBool)
	*b = PdfObjectBool(val)
	return b
}

// MakeFloat creates a PdfObjectFloat from a float64 and returns a pointer to it.
func MakeFloat(val float64) *PdfObjectFloat {
	f := new(PdfObjectFloat)
	*f = PdfObjectFloat(val)
	return f
}

// MakeString creates a PdfObjectString from a string and returns a pointer to it.
func MakeString(s string) *PdfObjectString {
	str := new(PdfObjectString)
	*str = PdfObjectString(s)
	return str
}

// MakeNull creates a PdfObjectNull and returns a pointer to it.
func MakeNull() *PdfObjectNull {
	return &PdfObjectNull{}
}
// MakeIndirectObject creates a PdfIndirectObject wrapping the given direct object.
// The object and generation numbers are left at zero.
func MakeIndirectObject(obj PdfObject) *PdfIndirectObject {
	return &PdfIndirectObject{PdfObject: obj}
}
// MakeStream creates a PdfObjectStream with the specified contents and encoding. If encoder is
// nil, raw encoding is used (i.e. no encoding applied).
//
// The stream dictionary is obtained from the encoder, the contents are encoded, and the
// dictionary's "Length" entry is set to the length of the encoded data. An encoding failure is
// returned as the error, with a nil stream.
func MakeStream(contents []byte, encoder StreamEncoder) (*PdfObjectStream, error) {
	if encoder == nil {
		encoder = NewRawEncoder()
	}

	// The dictionary is requested before encoding, in the same order as before.
	dict := encoder.MakeStreamDict()
	data, err := encoder.EncodeBytes(contents)
	if err != nil {
		return nil, err
	}
	dict.Set("Length", MakeInteger(int64(len(data))))

	return &PdfObjectStream{PdfObjectDictionary: dict, Stream: data}, nil
}
// String returns "true" or "false" (for debugging).
func (b *PdfObjectBool) String() string {
	if *b {
		return "true"
	}
	return "false"
}

// DefaultWriteString outputs the object as it is to be written to file.
func (b *PdfObjectBool) DefaultWriteString() string {
	// The file representation is the same keyword used for debugging output.
	return b.String()
}
// String returns the integer in decimal notation (for debugging).
func (num *PdfObjectInteger) String() string {
	return fmt.Sprintf("%d", *num)
}

// DefaultWriteString outputs the object as it is to be written to file.
func (num *PdfObjectInteger) DefaultWriteString() string {
	// The file representation is the same decimal notation used for debugging output.
	return num.String()
}
// String returns the number in fixed-point notation with six decimals (for debugging).
func (f *PdfObjectFloat) String() string {
	return fmt.Sprintf("%f", *f)
}

// DefaultWriteString outputs the object as it is to be written to file.
func (f *PdfObjectFloat) DefaultWriteString() string {
	// The file representation is the same fixed-point notation used for debugging output.
	return f.String()
}
// String returns the raw string contents, without delimiters or escaping (for debugging).
func (str *PdfObjectString) String() string {
	return string(*str)
}

// DefaultWriteString outputs the object as it is to be written to file: the contents wrapped in
// parentheses, with newline, carriage return, tab, backspace, form feed, parentheses and
// backslash written as backslash escapes. All other bytes are written unchanged.
func (str *PdfObjectString) DefaultWriteString() string {
	var out bytes.Buffer

	out.WriteByte('(')
	for _, c := range []byte(*str) {
		switch c {
		case '\n':
			out.WriteString("\\n")
		case '\r':
			out.WriteString("\\r")
		case '\t':
			out.WriteString("\\t")
		case '\b':
			out.WriteString("\\b")
		case '\f':
			out.WriteString("\\f")
		case '(':
			out.WriteString("\\(")
		case ')':
			out.WriteString("\\)")
		case '\\':
			out.WriteString("\\\\")
		default:
			out.WriteByte(c)
		}
	}
	out.WriteByte(')')

	return out.String()
}
// String returns the name without the leading slash (for debugging).
func (name *PdfObjectName) String() string {
	return string(*name)
}

// DefaultWriteString outputs the object as it is to be written to file: a slash followed by the
// name, where every byte that is not printable, is '#', or is a delimiter is written as '#'
// plus its two-digit lowercase hexadecimal code. A name longer than 127 bytes is logged but
// still written.
func (name *PdfObjectName) DefaultWriteString() string {
	if len(*name) > 127 {
		common.Log.Debug("error: Name too long (%s)", *name)
	}

	var out bytes.Buffer
	out.WriteByte('/')
	for _, c := range []byte(*name) {
		if IsPrintable(c) && c != '#' && !IsDelimiter(c) {
			out.WriteByte(c)
			continue
		}
		fmt.Fprintf(&out, "#%.2x", c)
	}
	return out.String()
}
// ToFloat64Array returns all elements of the array as a float64 slice. Each element must be a
// PdfObjectInteger or a PdfObjectFloat; any other element results in an error and a nil slice.
// An empty array yields an empty, non-nil slice.
func (array *PdfObjectArray) ToFloat64Array() ([]float64, error) {
	vals := make([]float64, 0, len(*array))
	for _, obj := range *array {
		switch num := obj.(type) {
		case *PdfObjectInteger:
			vals = append(vals, float64(*num))
		case *PdfObjectFloat:
			vals = append(vals, float64(*num))
		default:
			return nil, fmt.Errorf("type error")
		}
	}
	return vals, nil
}

// ToIntegerArray returns all elements of the array as an int slice. Each element must be a
// PdfObjectInteger; any other element results in an error and a nil slice.
// An empty array yields an empty, non-nil slice.
func (array *PdfObjectArray) ToIntegerArray() ([]int, error) {
	vals := make([]int, 0, len(*array))
	for _, obj := range *array {
		num, isInt := obj.(*PdfObjectInteger)
		if !isInt {
			return nil, fmt.Errorf("type error")
		}
		vals = append(vals, int(*num))
	}
	return vals, nil
}
// String returns the debugging representation of the array: the String form of every element,
// separated by ", " and enclosed in square brackets.
func (array *PdfObjectArray) String() string {
	out := "["
	for i, elem := range *array {
		if i > 0 {
			out += ", "
		}
		out += elem.String()
	}
	return out + "]"
}

// DefaultWriteString outputs the object as it is to be written to file: the file form of every
// element, separated by single spaces and enclosed in square brackets.
func (array *PdfObjectArray) DefaultWriteString() string {
	out := "["
	for i, elem := range *array {
		if i > 0 {
			out += " "
		}
		out += elem.DefaultWriteString()
	}
	return out + "]"
}
// Append adds a PdfObject to the end of the array.
func (array *PdfObjectArray) Append(obj PdfObject) {
	grown := append(*array, obj)
	*array = grown
}
// getNumberAsFloat converts a PdfObjectFloat or PdfObjectInteger to a float64.
// Any other object yields an error.
func getNumberAsFloat(obj PdfObject) (float64, error) {
	switch num := obj.(type) {
	case *PdfObjectFloat:
		return float64(*num), nil
	case *PdfObjectInteger:
		return float64(*num), nil
	}
	return 0, fmt.Errorf("not a number")
}
// GetAsFloat64Slice returns the array as a []float64 slice. Every element is first passed
// through TraceToDirectObject and must then be a PdfObjectInteger or a PdfObjectFloat; otherwise
// an error and a nil slice are returned. An empty array yields an empty, non-nil slice.
func (array *PdfObjectArray) GetAsFloat64Slice() ([]float64, error) {
	values := []float64{}
	for _, elem := range *array {
		value, err := getNumberAsFloat(TraceToDirectObject(elem))
		if err != nil {
			return nil, fmt.Errorf("array element not a number")
		}
		values = append(values, value)
	}
	return values, nil
}
// Merge copies every key/value pair of another dictionary into d, in the other dictionary's key
// order. Values of keys that already exist in d are overwritten. A nil argument is a no-op.
func (d *PdfObjectDictionary) Merge(another *PdfObjectDictionary) {
	if another == nil {
		return
	}
	for _, key := range another.Keys() {
		d.Set(key, another.Get(key))
	}
}
// String returns the debugging representation of the dictionary: `Dict(` followed by a
// `"key": value, ` item for every entry in key order, followed by `)`.
func (d *PdfObjectDictionary) String() string {
	out := "Dict("
	for _, key := range d.keys {
		out += fmt.Sprintf("\"%s\": %s, ", key, d.dict[key].String())
	}
	return out + ")"
}
// DefaultWriteString outputs the object as it is to be written to file: `<<`, then for every
// entry in key order the written form of the key, a space and the written form of the value,
// then `>>`. Nothing is written between consecutive entries.
func (d *PdfObjectDictionary) DefaultWriteString() string {
	out := "<<"
	for _, key := range d.keys {
		val := d.dict[key]
		common.Log.Trace("Writing k: %s %T %v %v", key, val, key, val)
		out += key.DefaultWriteString() + " " + val.DefaultWriteString()
	}
	return out + ">>"
}
// Set sets the dictionary's key -> val mapping entry. Overwrites if key already set.
//
// A key that is set for the first time is appended to the ordered key list; overwriting an
// existing key keeps its original position. Whether the key already exists is decided by a
// lookup in the entry map instead of a scan of the key list, which relies on the two being kept
// in step (as Set and Remove do).
//
// Set also works on a dictionary whose entry map has not been initialized (for example a
// zero-value PdfObjectDictionary): the map is created on first use.
func (d *PdfObjectDictionary) Set(key PdfObjectName, val PdfObject) {
	if d.dict == nil {
		// Writing to a nil map would panic.
		d.dict = map[PdfObjectName]PdfObject{}
	}
	if _, found := d.dict[key]; !found {
		d.keys = append(d.keys, key)
	}
	d.dict[key] = val
}
// Get returns the PdfObject stored under key, or nil if the key is not set.
//
// Only a single value is returned on purpose, so that the result can be fed
// straight into a type assertion:
//
//	name, ok := dict.Get("mykey").(*PdfObjectName)
//	if !ok ....
func (d *PdfObjectDictionary) Get(key PdfObjectName) PdfObject {
	if val, ok := d.dict[key]; ok {
		return val
	}
	return nil
}
// Keys returns the list of keys in the dictionary.
// The returned slice is the dictionary's own d.keys slice, not a copy.
func (d *PdfObjectDictionary) Keys() []PdfObjectName {
return d.keys
}
// Remove deletes the entry stored under key, dropping it from both the key
// list and the map. It does nothing if key is not listed in d.keys.
func (d *PdfObjectDictionary) Remove(key PdfObjectName) {
	for pos := range d.keys {
		if d.keys[pos] != key {
			continue
		}
		// Found. Remove from key list and map; only the first match is handled.
		d.keys = append(d.keys[:pos], d.keys[pos+1:]...)
		delete(d.dict, key)
		return
	}
}
// SetIfNotNil stores val under key only if val is not nil.
//
// A plain interface comparison is not sufficient: a nil pointer of a concrete
// type, e.g. (*PdfObjectArray)(nil), is not equal to PdfObject(nil) and would
// otherwise get set. Each known concrete type is therefore checked on its own.
// An unknown type is logged as an error and not stored.
func (d *PdfObjectDictionary) SetIfNotNil(key PdfObjectName, val PdfObject) {
	if val == nil {
		return
	}

	var present bool
	switch t := val.(type) {
	case *PdfObjectName:
		present = t != nil
	case *PdfObjectDictionary:
		present = t != nil
	case *PdfObjectStream:
		present = t != nil
	case *PdfObjectString:
		present = t != nil
	case *PdfObjectNull:
		present = t != nil
	case *PdfObjectInteger:
		present = t != nil
	case *PdfObjectArray:
		present = t != nil
	case *PdfObjectBool:
		present = t != nil
	case *PdfObjectFloat:
		present = t != nil
	case *PdfObjectReference:
		present = t != nil
	case *PdfIndirectObject:
		present = t != nil
	default:
		common.Log.Error("error: Unknown type: %T - should never happen!", val)
		return
	}

	if present {
		d.Set(key, val)
	}
}
// String returns a description of the reference in the form
// "Ref(<object number> <generation number>)".
func (ref *PdfObjectReference) String() string {
return fmt.Sprintf("Ref(%d %d)", ref.ObjectNumber, ref.GenerationNumber)
}
// DefaultWriteString outputs the object as it is to be written to file:
// "<object number> <generation number> R".
func (ref *PdfObjectReference) DefaultWriteString() string {
return fmt.Sprintf("%d %d R", ref.ObjectNumber, ref.GenerationNumber)
}
// String returns a short description of the form "IObject:<object number>".
func (ind *PdfIndirectObject) String() string {
	// The wrapped object is deliberately left out: printing it can cause
	// problems with circular references.
	return fmt.Sprintf("IObject:%d", ind.ObjectNumber)
}
// DefaultWriteString outputs the object as it is to be written to file: a
// reference of the form "<object number> 0 R" (the generation is always
// written as 0).
func (ind *PdfIndirectObject) DefaultWriteString() string {
	return fmt.Sprintf("%d 0 R", ind.ObjectNumber)
}
// String returns "Object stream <object number>: " followed by the stream's
// PdfObjectDictionary formatted with %s.
func (stream *PdfObjectStream) String() string {
return fmt.Sprintf("Object stream %d: %s", stream.ObjectNumber, stream.PdfObjectDictionary)
}
// DefaultWriteString outputs the object as it is to be written to file: a
// reference of the form "<object number> 0 R" (the generation is always
// written as 0), not the stream contents.
func (stream *PdfObjectStream) DefaultWriteString() string {
	return fmt.Sprintf("%d 0 R", stream.ObjectNumber)
}
// String returns the literal text "null".
func (null *PdfObjectNull) String() string {
return "null"
}
// DefaultWriteString outputs the object as it is to be written to file,
// which is the literal text "null".
func (null *PdfObjectNull) DefaultWriteString() string {
return "null"
}
// Handy functions to work with primitive objects.

// TraceMaxDepth specifies the maximum nesting depth of indirect objects that
// TraceToDirectObject follows before it gives up.
const TraceMaxDepth = 20
// TraceToDirectObject unwraps a PdfObject until a direct object is reached,
// e.g. a direct object contained in an indirect object (possibly wrapped more
// than once). If more than TraceMaxDepth levels have to be unwrapped, an error
// is logged and nil is returned.
//
// Note: This function does not trace/resolve references. That needs to be done beforehand.
func TraceToDirectObject(obj PdfObject) PdfObject {
	for depth := 1; ; depth++ {
		wrapper, isIndirect := obj.(*PdfIndirectObject)
		if !isIndirect {
			return obj
		}
		obj = wrapper.PdfObject
		if depth > TraceMaxDepth {
			common.Log.Error("error: Trace depth level beyond %d - not going deeper!", TraceMaxDepth)
			return nil
		}
	}
}

View File

@@ -0,0 +1,281 @@
// Routines related to repairing malformed pdf files.
package core
import (
"errors"
"fmt"
"os"
"regexp"
"bufio"
"io"
"strconv"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// repairReXrefTable matches the "xref" keyword preceded and followed by a line
// break character (CR or LF), with optional whitespace in between. The match
// starts at the leading line break; "xref" itself is capture group 1.
var repairReXrefTable = regexp.MustCompile(`[\r\n]\s*(xref)\s*[\r\n]`)
// repairLocateXref locates a standard xref table by looking for the "xref"
// keyword in the (up to) 1000 bytes that precede the current position of
// parser.rs. Xref object streams are not supported.
//
// It returns the absolute offset at which the last match of repairReXrefTable
// begins, i.e. the line break that precedes the "xref" keyword. An error is
// returned when seeking or reading fails, or when the window has no match.
func (parser *PdfParser) repairLocateXref() (int64, error) {
	const readBuf = int64(1000)

	// Find out where we are, so that the look-back window can be clamped to
	// the start of the file instead of seeking to a negative position.
	endOffset, err := parser.rs.Seek(0, os.SEEK_CUR)
	if err != nil {
		return 0, err
	}
	curOffset := endOffset - readBuf
	if curOffset < 0 {
		curOffset = 0
	}
	if _, err := parser.rs.Seek(curOffset, os.SEEK_SET); err != nil {
		return 0, err
	}

	// Read the whole window; a single Read call may legally return less.
	b2 := make([]byte, endOffset-curOffset)
	n, err := io.ReadFull(parser.rs, b2)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return 0, err
	}
	// Only search the bytes that were actually read.
	b2 = b2[:n]

	results := repairReXrefTable.FindAllIndex(b2, -1)
	if len(results) < 1 {
		common.Log.Debug("error: Repair: xref not found!")
		return 0, errors.New("repair: xref not found")
	}

	// Use the last match, which is the one closest to the starting position.
	localOffset := int64(results[len(results)-1][0])
	xrefOffset := curOffset + localOffset
	return xrefOffset, nil
}
// rebuildXrefTable renumbers the xref table.
// Useful when the cross reference is pointing to an object with the wrong number.
//
// Every entry of parser.xrefs is looked up and stored in a new table under the
// object number that getObjectNumber reports for the object found; the entry's
// objectNumber and generation fields are updated to match. The new table then
// replaces parser.xrefs.
//
// If any lookup fails, the table is treated as completely broken: the entries
// collected so far are discarded and parser.xrefs is replaced by the result of
// repairRebuildXrefsTopDown instead.
//
// Returns an error if the top-down rebuild fails or if getObjectNumber fails.
func (parser *PdfParser) rebuildXrefTable() error {
newXrefs := XrefTable{}
for objNum, xref := range parser.xrefs {
obj, _, err := parser.lookupByNumberWrapper(objNum, false)
if err != nil {
common.Log.Debug("error: Unable to look up object (%s)", err)
common.Log.Debug("error: Xref table completely broken - attempting to repair ")
// Fall back to scanning the whole file for object headers.
xrefTable, err := parser.repairRebuildXrefsTopDown()
if err != nil {
common.Log.Debug("error: Failed xref rebuild repair (%s)", err)
return err
}
parser.xrefs = *xrefTable
common.Log.Debug("Repaired xref table built")
return nil
}
// Use the numbers carried by the object itself rather than the table key.
actObjNum, actGenNum, err := getObjectNumber(obj)
if err != nil {
return err
}
xref.objectNumber = int(actObjNum)
xref.generation = int(actGenNum)
newXrefs[int(actObjNum)] = xref
}
parser.xrefs = newXrefs
common.Log.Debug("New xref table built")
printXrefTable(parser.xrefs)
return nil
}
// parseObjectNumberFromString parses and returns the object and generation
// number from a string such as "12 0 obj" -> (12, 0, nil).
//
// An error is returned if reIndirectObject does not match str, or if either
// captured number cannot be converted to an int (for example because it is
// out of range) instead of silently returning a bogus number.
func parseObjectNumberFromString(str string) (int, int, error) {
	result := reIndirectObject.FindStringSubmatch(str)
	if len(result) < 3 {
		return 0, 0, errors.New("unable to detect indirect object signature")
	}

	on, err := strconv.Atoi(result[1])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid object number %q: %v", result[1], err)
	}
	gn, err := strconv.Atoi(result[2])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid generation number %q: %v", result[2], err)
	}

	return on, gn, nil
}
// repairRebuildXrefsTopDown parses the entire file from the top down.
// It goes through the file byte-by-byte looking for "<num> <generation> obj"
// patterns and records one xref entry per object number, keeping the entry
// with the highest generation number.
// N.B. This collects the XREF_TABLE_ENTRY data only.
//
// The repair is attempted at most once per parser (parser.repairsAttempted);
// any later call fails with "repair failed". parser.rs is rewound and
// parser.reader is replaced as a side effect. An error is returned if
// rewinding or reading fails, or if a detected object header cannot be parsed.
func (parser *PdfParser) repairRebuildXrefsTopDown() (*XrefTable, error) {
	if parser.repairsAttempted {
		// Avoid multiple repairs (only try once).
		return nil, fmt.Errorf("repair failed")
	}
	parser.repairsAttempted = true

	// Go to beginning, reset reader.
	if _, err := parser.rs.Seek(0, os.SEEK_SET); err != nil {
		return nil, err
	}
	parser.reader = bufio.NewReader(parser.rs)

	// Keep a running buffer of last bytes.
	bufLen := 20
	last := make([]byte, bufLen)

	xrefTable := XrefTable{}
	for {
		b, err := parser.reader.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}

		// Format:
		// object number - whitespace - generation number - obj
		// e.g. "12 0 obj"
		//
		// NOTE(review): the early "continue"s below skip the buffer update at
		// the end of the loop, so the current 'j' is not recorded in last.
		// The dropped byte always lies before any later candidate header.
		if b == 'j' && last[bufLen-1] == 'b' && last[bufLen-2] == 'o' && IsWhiteSpace(last[bufLen-3]) {
			i := bufLen - 4
			// Go past whitespace
			for IsWhiteSpace(last[i]) && i > 0 {
				i--
			}
			if i == 0 || !IsDecimalDigit(last[i]) {
				continue
			}
			// Go past generation number
			for IsDecimalDigit(last[i]) && i > 0 {
				i--
			}
			if i == 0 || !IsWhiteSpace(last[i]) {
				continue
			}
			// Go past whitespace
			for IsWhiteSpace(last[i]) && i > 0 {
				i--
			}
			if i == 0 || !IsDecimalDigit(last[i]) {
				continue
			}
			// Go past object number.
			for IsDecimalDigit(last[i]) && i > 0 {
				i--
			}
			if i == 0 {
				continue // Probably too long to be a valid object...
			}

			// last[i+1] is the first digit of the object number. Its file
			// offset is derived from the current offset, presumably the
			// position just past b (TODO confirm against GetFileOffset).
			objOffset := parser.GetFileOffset() - int64(bufLen-i)
			objstr := append(last[i+1:], b)
			objNum, genNum, err := parseObjectNumberFromString(string(objstr))
			if err != nil {
				common.Log.Debug("Unable to parse object number: %v", err)
				return nil, err
			}

			// Create and insert the XREF entry if not existing, or the generation number is higher.
			if curXref, has := xrefTable[objNum]; !has || curXref.generation < genNum {
				// Make the entry for the cross ref table.
				xrefTable[objNum] = XrefObject{
					xtype:        XREF_TABLE_ENTRY,
					objectNumber: objNum,
					generation:   genNum,
					offset:       objOffset,
				}
			}
		}

		// Slide the window forward by one byte.
		last = append(last[1:bufLen], b)
	}

	return &xrefTable, nil
}
// repairSeekXrefMarker looks for the first sign of an xref table, searching
// backwards from the end of the file in windows of up to 1000 bytes.
//
// On success parser.reader is replaced by a reader positioned on the 'x' of
// the last "xref" keyword found in the first window that contains one, and
// nil is returned. An error is returned if seeking or reading fails, or if no
// window contains the marker.
//
// NOTE(review): the windows do not overlap, so a marker that straddles the
// boundary between two windows is not detected.
func (parser *PdfParser) repairSeekXrefMarker() error {
	// Get the file size.
	fSize, err := parser.rs.Seek(0, os.SEEK_END)
	if err != nil {
		return err
	}

	reXrefTableStart := regexp.MustCompile(`\sxref\s*`)

	// Define the starting point (from the end of the file) to search from.
	var offset int64
	// Define a buffer length in terms of how many bytes to read from the end of the file.
	var buflen int64 = 1000

	for offset < fSize {
		// Shrink the last window so that it does not reach before the file start.
		if fSize <= (buflen + offset) {
			buflen = fSize - offset
		}

		// Move back enough (as we need to read forward).
		if _, err := parser.rs.Seek(-offset-buflen, os.SEEK_END); err != nil {
			return err
		}

		// Read the data. The window lies entirely inside the file, so anything
		// short of a full read is an error.
		b1 := make([]byte, buflen)
		if _, err := io.ReadFull(parser.rs, b1); err != nil {
			return err
		}
		common.Log.Trace("Looking for xref : \"%s\"", string(b1))

		ind := reXrefTableStart.FindAllStringIndex(string(b1), -1)
		if ind != nil {
			// Found it.
			lastInd := ind[len(ind)-1]
			common.Log.Trace("Ind: % d", ind)
			if _, err := parser.rs.Seek(-offset-buflen+int64(lastInd[0]), os.SEEK_END); err != nil {
				return err
			}
			parser.reader = bufio.NewReader(parser.rs)

			// Go past whitespace, finish at 'x'.
			for {
				bb, err := parser.reader.Peek(1)
				if err != nil {
					return err
				}
				common.Log.Trace("B: %d %c", bb[0], bb[0])
				if !IsWhiteSpace(bb[0]) {
					break
				}
				parser.reader.Discard(1)
			}

			return nil
		}

		common.Log.Debug("warning: xref marker not found in this window - continue seeking")
		offset += buflen
	}

	common.Log.Debug("error: Xref table marker was not found.")
	return errors.New("xref not found ")
}
// seekPdfVersionTopDown is called when the PDF version is not found normally.
// It looks for the version by scanning top-down for the first occurrence of
// "PDF-<digit>.<digit>", as in the header line:
// %PDF-1.7
//
// parser.rs is rewound and parser.reader is replaced as a side effect.
// Returns the major and minor version digits, or an error if rewinding or
// reading fails or if no version marker is found before the end of the file.
func (parser *PdfParser) seekPdfVersionTopDown() (int, int, error) {
	// Go to beginning, reset reader.
	if _, err := parser.rs.Seek(0, os.SEEK_SET); err != nil {
		return 0, 0, err
	}
	parser.reader = bufio.NewReader(parser.rs)

	// Keep a running buffer of last bytes.
	bufLen := 20
	last := make([]byte, bufLen)

	for {
		b, err := parser.reader.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return 0, 0, err
		}

		// Format:
		// "PDF-" - major digit - '.' - minor digit
		// e.g. "PDF-1.7", where b is the candidate minor digit and the
		// preceding bytes are taken from the end of the running buffer.
		if IsDecimalDigit(b) && last[bufLen-1] == '.' && IsDecimalDigit(last[bufLen-2]) && last[bufLen-3] == '-' &&
			last[bufLen-4] == 'F' && last[bufLen-5] == 'D' && last[bufLen-6] == 'P' {
			major := int(last[bufLen-2] - '0')
			minor := int(b - '0')
			return major, minor, nil
		}

		// Slide the window forward by one byte.
		last = append(last[1:bufLen], b)
	}

	return 0, 0, errors.New("version not found")
}

129
internal/pdf/core/stream.go Normal file
View File

@@ -0,0 +1,129 @@
package core
import (
"fmt"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// NewEncoderFromStream creates a StreamEncoder based on the "Filter" entry of
// the stream's dictionary.
//
// A missing or null filter, or an empty filter array, yields a raw encoder.
// A filter array with more than one element yields a multi encoder. A name,
// or an array holding a single name, selects the matching encoder. An error
// is returned when the filter has an unexpected type or names an unsupported
// encoding method.
func NewEncoderFromStream(streamObj *PdfObjectStream) (StreamEncoder, error) {
	var method *PdfObjectName

	// The filter should be a name or an array with a list of filter names.
	switch filter := TraceToDirectObject(streamObj.PdfObjectDictionary.Get("Filter")).(type) {
	case nil:
		// No filter, return raw data back.
		return NewRawEncoder(), nil
	case *PdfObjectNull:
		// Filter is null -> raw data.
		return NewRawEncoder(), nil
	case *PdfObjectName:
		method = filter
	case *PdfObjectArray:
		switch len(*filter) {
		case 0:
			// Empty array -> indicates raw filter (no filter).
			return NewRawEncoder(), nil
		case 1:
			// Single element.
			name, isName := (*filter)[0].(*PdfObjectName)
			if !isName {
				return nil, fmt.Errorf("filter array member not a Name object")
			}
			method = name
		default:
			menc, err := newMultiEncoderFromStream(streamObj)
			if err != nil {
				common.Log.Error("Failed creating multi encoder: %v", err)
				return nil, err
			}
			common.Log.Trace("Multi enc: %s\n", menc)
			return menc, nil
		}
	default:
		return nil, fmt.Errorf("filter not a Name or Array object")
	}

	switch *method {
	case StreamEncodingFilterNameFlate:
		return newFlateEncoderFromStream(streamObj, nil)
	case StreamEncodingFilterNameLZW:
		return newLZWEncoderFromStream(streamObj, nil)
	case StreamEncodingFilterNameDCT:
		return newDCTEncoderFromStream(streamObj, nil)
	case StreamEncodingFilterNameRunLength:
		return newRunLengthEncoderFromStream()
	case StreamEncodingFilterNameASCIIHex:
		return NewASCIIHexEncoder(), nil
	case StreamEncodingFilterNameASCII85, "A85":
		return NewASCII85Encoder(), nil
	case StreamEncodingFilterNameCCITTFax:
		return NewCCITTFaxEncoder(), nil
	case StreamEncodingFilterNameJBIG2:
		return NewJBIG2Encoder(), nil
	case StreamEncodingFilterNameJPX:
		return NewJPXEncoder(), nil
	default:
		common.Log.Debug("error: Unsupported encoding method!")
		return nil, fmt.Errorf("unsupported encoding method (%s)", *method)
	}
}
// DecodeStream decodes the stream data with the encoder selected by
// NewEncoderFromStream and returns the decoded bytes.
// Any failure to create the encoder or to decode is logged and returned.
func DecodeStream(streamObj *PdfObjectStream) ([]byte, error) {
	common.Log.Trace("Decode stream")

	enc, encErr := NewEncoderFromStream(streamObj)
	if encErr != nil {
		common.Log.Debug("Stream decoding failed: %v", encErr)
		return nil, encErr
	}
	common.Log.Trace("Encoder: %#v\n", enc)

	data, decErr := enc.DecodeStream(streamObj)
	if decErr != nil {
		common.Log.Debug("Stream decoding failed: %v", decErr)
		return nil, decErr
	}
	return data, nil
}
// EncodeStream encodes the stream data using the encoder specified by the
// stream's dictionary.
//
// On success streamObj.Stream is replaced by the encoded bytes and the
// dictionary's "Length" entry is set to their length. For an LZW encoder,
// EarlyChange is forced to 0 on both the encoder and the dictionary.
// An error is returned if the encoder cannot be created or encoding fails.
func EncodeStream(streamObj *PdfObjectStream) error {
	common.Log.Trace("Encode stream")

	encoder, err := NewEncoderFromStream(streamObj)
	if err != nil {
		common.Log.Debug("Stream encoding failed: %v", err)
		return err
	}

	if lzwenc, is := encoder.(*LZWEncoder); is {
		// If LZW:
		// Make sure to use EarlyChange 0.. We do not have write support for 1 yet.
		lzwenc.EarlyChange = 0
		streamObj.PdfObjectDictionary.Set("EarlyChange", MakeInteger(0))
	}

	common.Log.Trace("Encoder: %+v\n", encoder)
	encoded, err := encoder.EncodeBytes(streamObj.Stream)
	if err != nil {
		common.Log.Debug("Stream encoding failed: %v", err)
		return err
	}

	streamObj.Stream = encoded

	// Update length
	streamObj.PdfObjectDictionary.Set("Length", MakeInteger(int64(len(encoded))))

	return nil
}

View File

@@ -0,0 +1,74 @@
package core
// IsWhiteSpace reports whether ch is one of the PDF white-space characters
// (Table 1 white-space characters, 7.2.2 Character Set):
// NUL, TAB, LF, FF, CR and SPACE.
// TODO (v3): Unexport.
func IsWhiteSpace(ch byte) bool {
	switch ch {
	case 0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20:
		return true
	}
	return false
}
// IsFloatDigit reports whether c can be part of a float number string, i.e.
// whether it is a decimal digit or a period.
// TODO (v3): Unexport.
func IsFloatDigit(c byte) bool {
	if c == '.' {
		return true
	}
	return c >= '0' && c <= '9'
}
// IsDecimalDigit reports whether c is a decimal digit ('0' through '9').
// TODO (v3): Unexport.
func IsDecimalDigit(c byte) bool {
	return '0' <= c && c <= '9'
}
// IsOctalDigit reports whether c is an octal digit ('0' through '7').
// TODO (v3): Unexport.
func IsOctalDigit(c byte) bool {
	return '0' <= c && c <= '7'
}
// IsPrintable reports whether char lies in the range EXCLAMATION MARK (21h)
// (!) to TILDE (7Eh) (~), inclusive.
// Regular characters that are outside that range should be written using the
// hexadecimal notation.
// TODO (v3): Unexport.
func IsPrintable(char byte) bool {
	return char >= 0x21 && char <= 0x7E
}
// IsDelimiter reports whether char is one of the delimiter characters
// ( ) < > [ ] { } / %.
// TODO (v3): Unexport.
func IsDelimiter(char byte) bool {
	switch char {
	case '(', ')', '<', '>', '[', ']', '{', '}', '/', '%':
		return true
	}
	return false
}

172
internal/pdf/core/utils.go Normal file
View File

@@ -0,0 +1,172 @@
package core
import (
"errors"
"fmt"
"sort"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
)
// checkBounds verifies that the range [a:b] is valid for slicing:
//
//	slice[a:b] where sliceLen=len(slice).
//
// It returns nil when 0 <= a <= b <= sliceLen, and otherwise an error naming
// the first violated condition.
func checkBounds(sliceLen, a, b int) error {
	switch {
	case a < 0 || a > sliceLen:
		return errors.New("slice index a out of bounds")
	case b < a:
		return errors.New("invalid slice index b < a")
	case b > sliceLen:
		return errors.New("slice index b out of bounds")
	}
	return nil
}
// Inspect analyzes the document object structure.
// It is the exported entry point for inspect and returns its result
// unchanged: a map from object type name to the number of such objects.
func (parser *PdfParser) Inspect() (map[string]int, error) {
return parser.inspect()
}
// GetObjectNums returns the object numbers recorded in the parser's xref
// table, sorted in ascending order.
func (parser *PdfParser) GetObjectNums() []int {
	objNums := make([]int, 0, len(parser.xrefs))
	for key := range parser.xrefs {
		objNums = append(objNums, parser.xrefs[key].objectNumber)
	}

	// parser.xrefs is a map, so iteration order is not stable; sorting gives a
	// consistent ordering of PDF objects in output.
	sort.Ints(objNums)

	return objNums
}
// inspect goes through all objects in the cross reference table and counts
// them by type. Mostly for debugging purposes and inspecting odd PDF files.
//
// For indirect objects wrapping a dictionary, the "Type" name is counted, or
// the "Subtype" name when there is no "Type" name; a dictionary whose "S"
// entry is the name JavaScript is additionally counted under "JavaScript".
// For streams and direct dictionaries only the "Type" name is counted.
// Entries with object number 0 are skipped and failed lookups are only
// tallied for the trace log.
//
// Returns the map of type name to count, or an error if the xref table is
// empty.
func (parser *PdfParser) inspect() (map[string]int, error) {
	common.Log.Trace("--------INSPECT ----------")
	common.Log.Trace("Xref table:")

	objTypes := map[string]int{}
	objCount := 0
	failedCount := 0

	// Visit the xref entries in ascending key order.
	keys := []int{}
	for k := range parser.xrefs {
		keys = append(keys, k)
	}
	sort.Ints(keys)

	for _, k := range keys {
		xref := parser.xrefs[k]
		if xref.objectNumber == 0 {
			continue
		}
		objCount++
		common.Log.Trace("==========")
		common.Log.Trace("Looking up object number: %d", xref.objectNumber)
		o, err := parser.LookupByNumber(xref.objectNumber)
		if err != nil {
			common.Log.Trace("error: Fail to lookup obj %d (%s)", xref.objectNumber, err)
			failedCount++
			continue
		}

		common.Log.Trace("obj: %s", o)

		switch obj := o.(type) {
		case *PdfIndirectObject:
			common.Log.Trace("IND OOBJ %d: %s", xref.objectNumber, obj)
			dict, isDict := obj.PdfObject.(*PdfObjectDictionary)
			if !isDict {
				break
			}
			// Check if has Type parameter.
			if ot, has := dict.Get("Type").(*PdfObjectName); has {
				otype := string(*ot)
				common.Log.Trace("---> Obj type: %s", otype)
				objTypes[otype]++
			} else if ot, has := dict.Get("Subtype").(*PdfObjectName); has {
				// Check if subtype
				otype := string(*ot)
				common.Log.Trace("---> Obj subtype: %s", otype)
				objTypes[otype]++
			}
			if val, has := dict.Get("S").(*PdfObjectName); has && *val == "JavaScript" {
				// Check if Javascript.
				objTypes["JavaScript"]++
			}
		case *PdfObjectStream:
			if otype, ok := obj.PdfObjectDictionary.Get("Type").(*PdfObjectName); ok {
				common.Log.Trace("--> Stream object type: %s", *otype)
				objTypes[string(*otype)]++
			}
		default: // Direct.
			if dict, isDict := o.(*PdfObjectDictionary); isDict {
				if ot, isName := dict.Get("Type").(*PdfObjectName); isName {
					otype := string(*ot)
					common.Log.Trace("--- obj type %s", otype)
					objTypes[otype]++
				}
			}
			common.Log.Trace("DIRECT OBJ %d: %s", xref.objectNumber, o)
		}
	}
	common.Log.Trace("--------EOF INSPECT ----------")
	common.Log.Trace("=======")
	common.Log.Trace("Object count: %d", objCount)
	common.Log.Trace("Failed lookup: %d", failedCount)
	for t, c := range objTypes {
		common.Log.Trace("%s: %d", t, c)
	}
	common.Log.Trace("=======")

	if len(parser.xrefs) < 1 {
		common.Log.Debug("error: This document is invalid (xref table missing!)")
		return nil, fmt.Errorf("invalid document (xref table missing)")
	}

	// Fewer than two font objects is taken as a hint of a scanned document.
	if fontObjs, ok := objTypes["Font"]; ok && fontObjs >= 2 {
		common.Log.Trace("This document is valid for extraction!")
	} else {
		common.Log.Trace("This document is probably scanned!")
	}

	return objTypes, nil
}
// absInt returns x when it is non-negative and -x otherwise.
func absInt(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}