764 lines
22 KiB
Go
764 lines
22 KiB
Go
package model
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
|
|
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
|
|
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
|
)
|
|
|
|
// PdfReader represents a PDF file reader. It is a frontend to the lower level parsing mechanism and provides
|
|
// a higher level access to work with PDF structure and information, such as the page structure etc.
|
|
type PdfReader struct {
|
|
parser *core.PdfParser
|
|
root core.PdfObject
|
|
pages *core.PdfObjectDictionary
|
|
pageList []*core.PdfIndirectObject
|
|
PageList []*PdfPage
|
|
pageCount int
|
|
catalog *core.PdfObjectDictionary
|
|
outlineTree *PdfOutlineTreeNode
|
|
AcroForm *PdfAcroForm
|
|
|
|
modelManager *ModelManager
|
|
|
|
// For tracking traversal (cache).
|
|
traversed map[core.PdfObject]bool
|
|
}
|
|
|
|
// NewPdfReader returns a new PdfReader for an input io.ReadSeeker interface. Can be used to read PDF from
|
|
// memory or file. Immediately loads and traverses the PDF structure including pages and page contents (if
|
|
// not encrypted).
|
|
func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
|
|
pdfReader := &PdfReader{}
|
|
pdfReader.traversed = map[core.PdfObject]bool{}
|
|
|
|
pdfReader.modelManager = NewModelManager()
|
|
|
|
// Create the parser, loads the cross reference table and trailer.
|
|
parser, err := core.NewParser(rs)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
pdfReader.parser = parser
|
|
|
|
isEncrypted, err := pdfReader.IsEncrypted()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Load pdf doc structure if not encrypted.
|
|
if !isEncrypted {
|
|
err = pdfReader.loadStructure()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return pdfReader, nil
|
|
}
|
|
|
|
// IsEncrypted returns true if the PDF file is encrypted.
|
|
func (pp *PdfReader) IsEncrypted() (bool, error) {
|
|
return pp.parser.IsEncrypted()
|
|
}
|
|
|
|
// GetEncryptionMethod returns a string containing some information about the encryption method used.
|
|
// XXX/TODO: May be better to return a standardized struct with information.
|
|
func (pp *PdfReader) GetEncryptionMethod() string {
|
|
crypter := pp.parser.GetCrypter()
|
|
str := crypter.Filter + " - "
|
|
|
|
if crypter.V == 0 {
|
|
str += "Undocumented algorithm"
|
|
} else if crypter.V == 1 {
|
|
// RC4 or AES (bits: 40)
|
|
str += "RC4: 40 bits"
|
|
} else if crypter.V == 2 {
|
|
str += fmt.Sprintf("RC4: %d bits", crypter.Length)
|
|
} else if crypter.V == 3 {
|
|
str += "Unpublished algorithm"
|
|
} else if crypter.V >= 4 {
|
|
// Look at CF, StmF, StrF
|
|
str += fmt.Sprintf("Stream filter: %s - String filter: %s", crypter.StreamFilter, crypter.StringFilter)
|
|
str += "; Crypt filters:"
|
|
for name, cf := range crypter.CryptFilters {
|
|
str += fmt.Sprintf(" - %s: %s (%d)", name, cf.Cfm, cf.Length)
|
|
}
|
|
}
|
|
perms := crypter.GetAccessPermissions()
|
|
str += fmt.Sprintf(" - %#v", perms)
|
|
|
|
return str
|
|
}
|
|
|
|
// Decrypt decrypts the PDF file with a specified password. Also tries to
|
|
// decrypt with an empty password. Returns true if successful,
|
|
// false otherwise.
|
|
func (pp *PdfReader) Decrypt(password []byte) (bool, error) {
|
|
success, err := pp.parser.Decrypt(password)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if !success {
|
|
return false, nil
|
|
}
|
|
|
|
err = pp.loadStructure()
|
|
if err != nil {
|
|
common.Log.Debug("error: Fail to load structure (%s)", err)
|
|
return false, err
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// CheckAccessRights checks access rights and permissions for a specified password. If either user/owner
|
|
// password is specified, full rights are granted, otherwise the access rights are specified by the
|
|
// Permissions flag.
|
|
//
|
|
// The bool flag indicates that the user can access and view the file.
|
|
// The AccessPermissions shows what access the user has for editing etc.
|
|
// An error is returned if there was a problem performing the authentication.
|
|
func (pp *PdfReader) CheckAccessRights(password []byte) (bool, core.AccessPermissions, error) {
|
|
return pp.parser.CheckAccessRights(password)
|
|
}
|
|
|
|
// Loads the structure of the pdf file: pages, outlines, etc.
|
|
func (pp *PdfReader) loadStructure() error {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
trailerDict := pp.parser.GetTrailer()
|
|
if trailerDict == nil {
|
|
return fmt.Errorf("missing trailer")
|
|
}
|
|
|
|
// Catalog.
|
|
root, ok := trailerDict.Get("Root").(*core.PdfObjectReference)
|
|
if !ok {
|
|
return fmt.Errorf("invalid Root (trailer: %s)", *trailerDict)
|
|
}
|
|
oc, err := pp.parser.LookupByReference(*root)
|
|
if err != nil {
|
|
common.Log.Debug("error: Failed to read root element catalog: %s", err)
|
|
return err
|
|
}
|
|
pcatalog, ok := oc.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
common.Log.Debug("error: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict)
|
|
return errors.New("missing catalog")
|
|
}
|
|
catalog, ok := (*pcatalog).PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
common.Log.Debug("error: Invalid catalog (%s)", pcatalog.PdfObject)
|
|
return errors.New("invalid catalog")
|
|
}
|
|
common.Log.Trace("Catalog: %s", catalog)
|
|
|
|
// Pages.
|
|
pagesRef, ok := catalog.Get("Pages").(*core.PdfObjectReference)
|
|
if !ok {
|
|
return errors.New("pages in catalog should be a reference")
|
|
}
|
|
op, err := pp.parser.LookupByReference(*pagesRef)
|
|
if err != nil {
|
|
common.Log.Debug("error: Failed to read pages")
|
|
return err
|
|
}
|
|
ppages, ok := op.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
common.Log.Debug("error: Pages object invalid")
|
|
common.Log.Debug("op: %p", ppages)
|
|
return errors.New("pages object invalid")
|
|
}
|
|
pages, ok := ppages.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
common.Log.Debug("error: Pages object invalid (%s)", ppages)
|
|
return errors.New("pages object invalid")
|
|
}
|
|
pageCount, ok := pages.Get("Count").(*core.PdfObjectInteger)
|
|
if !ok {
|
|
common.Log.Debug("error: Pages count object invalid")
|
|
return errors.New("pages count invalid")
|
|
}
|
|
|
|
pp.root = root
|
|
pp.catalog = catalog
|
|
pp.pages = pages
|
|
pp.pageCount = int(*pageCount)
|
|
pp.pageList = []*core.PdfIndirectObject{}
|
|
|
|
traversedPageNodes := map[core.PdfObject]bool{}
|
|
err = pp.buildPageList(ppages, nil, traversedPageNodes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
common.Log.Trace("---")
|
|
common.Log.Trace("TOC")
|
|
common.Log.Trace("Pages")
|
|
common.Log.Trace("%d: %s", len(pp.pageList), pp.pageList)
|
|
|
|
// Outlines.
|
|
pp.outlineTree, err = pp.loadOutlines()
|
|
if err != nil {
|
|
common.Log.Debug("error: Failed to build outline tree (%s)", err)
|
|
return err
|
|
}
|
|
|
|
// Load interactive forms and fields.
|
|
pp.AcroForm, err = pp.loadForms()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Trace to object. Keeps a list of already visited references to avoid circular references.
|
|
//
|
|
// Example circular reference.
|
|
// 1 0 obj << /Next 2 0 R >>
|
|
// 2 0 obj << /Next 1 0 R >>
|
|
func (pp *PdfReader) traceToObjectWrapper(obj core.PdfObject, refList map[*core.PdfObjectReference]bool) (core.PdfObject, error) {
|
|
// Keep a list of references to avoid circular references.
|
|
|
|
ref, isRef := obj.(*core.PdfObjectReference)
|
|
if isRef {
|
|
// Make sure not already visited (circular ref).
|
|
if _, alreadyTraversed := refList[ref]; alreadyTraversed {
|
|
return nil, errors.New("circular reference")
|
|
}
|
|
refList[ref] = true
|
|
obj, err := pp.parser.LookupByReference(*ref)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return pp.traceToObjectWrapper(obj, refList)
|
|
}
|
|
|
|
// Not a reference, an object. Can be indirect or any direct pdf object (other than reference).
|
|
return obj, nil
|
|
}
|
|
|
|
func (pp *PdfReader) traceToObject(obj core.PdfObject) (core.PdfObject, error) {
|
|
refList := map[*core.PdfObjectReference]bool{}
|
|
return pp.traceToObjectWrapper(obj, refList)
|
|
}
|
|
|
|
func (pp *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
// Has outlines? Otherwise return an empty outlines structure.
|
|
catalog := pp.catalog
|
|
outlinesObj := catalog.Get("Outlines")
|
|
if outlinesObj == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
common.Log.Trace("-Has outlines")
|
|
// Trace references to the object.
|
|
outlineRootObj, err := pp.traceToObject(outlinesObj)
|
|
if err != nil {
|
|
common.Log.Debug("error: Failed to read outlines")
|
|
return nil, err
|
|
}
|
|
common.Log.Trace("Outline root: %v", outlineRootObj)
|
|
|
|
if _, isNull := outlineRootObj.(*core.PdfObjectNull); isNull {
|
|
common.Log.Trace("Outline root is null - no outlines")
|
|
return nil, nil
|
|
}
|
|
|
|
outlineRoot, ok := outlineRootObj.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
return nil, errors.New("outline root should be an indirect object")
|
|
}
|
|
|
|
dict, ok := outlineRoot.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return nil, errors.New("outline indirect object should contain a dictionary")
|
|
}
|
|
|
|
common.Log.Trace("Outline root dict: %v", dict)
|
|
|
|
outlineTree, _, err := pp.buildOutlineTree(outlineRoot, nil, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
common.Log.Trace("Resulting outline tree: %v", outlineTree)
|
|
|
|
return outlineTree, nil
|
|
}
|
|
|
|
// Recursive build outline tree.
|
|
// prev PdfObject,
|
|
// Input: The indirect object containing an Outlines or Outline item dictionary.
|
|
// Parent, Prev are the parent or previous node in the hierarchy.
|
|
// The function returns the corresponding tree node and the last node which is used
|
|
// for setting the Last pointer of the tree node structures.
|
|
func (pp *PdfReader) buildOutlineTree(obj core.PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) {
|
|
container, isInd := obj.(*core.PdfIndirectObject)
|
|
if !isInd {
|
|
return nil, nil, fmt.Errorf("outline container not an indirect object %T", obj)
|
|
}
|
|
dict, ok := container.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return nil, nil, errors.New("not a dictionary object")
|
|
}
|
|
common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container)
|
|
|
|
if obj := dict.Get("Title"); obj != nil {
|
|
// Outline item has a title. (required)
|
|
outlineItem, err := pp.newPdfOutlineItemFromIndirectObject(container)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outlineItem.Parent = parent
|
|
outlineItem.Prev = prev
|
|
|
|
if firstObj := dict.Get("First"); firstObj != nil {
|
|
firstObj, err = pp.traceToObject(firstObj)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if _, isNull := firstObj.(*core.PdfObjectNull); !isNull {
|
|
first, last, err := pp.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outlineItem.First = first
|
|
outlineItem.Last = last
|
|
}
|
|
}
|
|
|
|
// Resolve the reference to next
|
|
if nextObj := dict.Get("Next"); nextObj != nil {
|
|
nextObj, err = pp.traceToObject(nextObj)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if _, isNull := nextObj.(*core.PdfObjectNull); !isNull {
|
|
next, last, err := pp.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outlineItem.Next = next
|
|
return &outlineItem.PdfOutlineTreeNode, last, nil
|
|
}
|
|
}
|
|
|
|
return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil
|
|
} else {
|
|
// Outline dictionary (structure element).
|
|
|
|
outline, err := newPdfOutlineFromIndirectObject(container)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outline.Parent = parent
|
|
//outline.Prev = parent
|
|
|
|
if firstObj := dict.Get("First"); firstObj != nil {
|
|
// Has children...
|
|
firstObj, err = pp.traceToObject(firstObj)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if _, isNull := firstObj.(*core.PdfObjectNull); !isNull {
|
|
first, last, err := pp.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
outline.First = first
|
|
outline.Last = last
|
|
}
|
|
}
|
|
return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil
|
|
}
|
|
}
|
|
|
|
// GetOutlineTree returns the outline tree.
|
|
func (pp *PdfReader) GetOutlineTree() *PdfOutlineTreeNode {
|
|
return pp.outlineTree
|
|
}
|
|
|
|
// GetOutlinesFlattened returns a flattened list of tree nodes and titles.
|
|
func (pp *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) {
|
|
outlineNodeList := []*PdfOutlineTreeNode{}
|
|
flattenedTitleList := []string{}
|
|
|
|
// Recursive flattening function.
|
|
var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int)
|
|
flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) {
|
|
if node == nil {
|
|
return
|
|
}
|
|
if node.context == nil {
|
|
common.Log.Debug("error: Missing node.context") // Should not happen ever.
|
|
return
|
|
}
|
|
|
|
if item, isItem := node.context.(*PdfOutlineItem); isItem {
|
|
*outlineList = append(*outlineList, &item.PdfOutlineTreeNode)
|
|
title := strings.Repeat(" ", depth*2) + string(*item.Title)
|
|
*titleList = append(*titleList, title)
|
|
if item.Next != nil {
|
|
flattenFunc(item.Next, outlineList, titleList, depth)
|
|
}
|
|
}
|
|
|
|
if node.First != nil {
|
|
title := strings.Repeat(" ", depth*2) + "+"
|
|
*titleList = append(*titleList, title)
|
|
flattenFunc(node.First, outlineList, titleList, depth+1)
|
|
}
|
|
}
|
|
flattenFunc(pp.outlineTree, &outlineNodeList, &flattenedTitleList, 0)
|
|
return outlineNodeList, flattenedTitleList, nil
|
|
}
|
|
|
|
// loadForms loads the AcroForm.
|
|
func (pp *PdfReader) loadForms() (*PdfAcroForm, error) {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
|
|
// Has forms?
|
|
catalog := pp.catalog
|
|
obj := catalog.Get("AcroForm")
|
|
if obj == nil {
|
|
// Nothing to load.
|
|
return nil, nil
|
|
}
|
|
var err error
|
|
obj, err = pp.traceToObject(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
obj = core.TraceToDirectObject(obj)
|
|
if _, isNull := obj.(*core.PdfObjectNull); isNull {
|
|
common.Log.Trace("Acroform is a null object (empty)\n")
|
|
return nil, nil
|
|
}
|
|
|
|
formsDict, ok := obj.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
common.Log.Debug("invalid AcroForm entry %T", obj)
|
|
common.Log.Debug("Does not have forms")
|
|
return nil, fmt.Errorf("invalid acroform entry %T", obj)
|
|
}
|
|
common.Log.Trace("Has Acro forms")
|
|
// Load it.
|
|
|
|
// Ensure we have access to everything.
|
|
common.Log.Trace("Traverse the Acroforms structure")
|
|
err = pp.traverseObjectData(formsDict)
|
|
if err != nil {
|
|
common.Log.Debug("error: Unable to traverse AcroForms (%s)", err)
|
|
return nil, err
|
|
}
|
|
|
|
// Create the acro forms object.
|
|
acroForm, err := pp.newPdfAcroFormFromDict(formsDict)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return acroForm, nil
|
|
}
|
|
|
|
// Build the table of contents.
|
|
// tree, ex: Pages -> Pages -> Pages -> Page
|
|
// Traverse through the whole thing recursively.
|
|
func (pp *PdfReader) buildPageList(node *core.PdfIndirectObject, parent *core.PdfIndirectObject, traversedPageNodes map[core.PdfObject]bool) error {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
|
|
if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed {
|
|
common.Log.Debug("Cyclic recursion, skipping")
|
|
return nil
|
|
}
|
|
traversedPageNodes[node] = true
|
|
|
|
nodeDict, ok := node.PdfObject.(*core.PdfObjectDictionary)
|
|
if !ok {
|
|
return errors.New("node not a dictionary")
|
|
}
|
|
|
|
objType, ok := (*nodeDict).Get("Type").(*core.PdfObjectName)
|
|
if !ok {
|
|
return errors.New("node missing Type (Required)")
|
|
}
|
|
common.Log.Trace("buildPageList node type: %s", *objType)
|
|
if *objType == "Page" {
|
|
p, err := pp.newPdfPageFromDict(nodeDict)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.setContainer(node)
|
|
|
|
if parent != nil {
|
|
// Set the parent (in case missing or incorrect).
|
|
nodeDict.Set("Parent", parent)
|
|
}
|
|
pp.pageList = append(pp.pageList, node)
|
|
pp.PageList = append(pp.PageList, p)
|
|
|
|
return nil
|
|
}
|
|
if *objType != "Pages" {
|
|
common.Log.Debug("error: Table of content containing non Page/Pages object! (%s)", objType)
|
|
return errors.New("table of content containing non Page/Pages object")
|
|
}
|
|
|
|
// A Pages object. Update the parent.
|
|
if parent != nil {
|
|
nodeDict.Set("Parent", parent)
|
|
}
|
|
|
|
// Resolve the object recursively.
|
|
err := pp.traverseObjectData(node)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
kidsObj, err := pp.parser.Trace(nodeDict.Get("Kids"))
|
|
if err != nil {
|
|
common.Log.Debug("error: Failed loading Kids object")
|
|
return err
|
|
}
|
|
|
|
var kids *core.PdfObjectArray
|
|
kids, ok = kidsObj.(*core.PdfObjectArray)
|
|
if !ok {
|
|
kidsIndirect, isIndirect := kidsObj.(*core.PdfIndirectObject)
|
|
if !isIndirect {
|
|
return errors.New("invalid Kids object")
|
|
}
|
|
kids, ok = kidsIndirect.PdfObject.(*core.PdfObjectArray)
|
|
if !ok {
|
|
return errors.New("invalid Kids indirect object")
|
|
}
|
|
}
|
|
common.Log.Trace("Kids: %s", kids)
|
|
for idx, child := range *kids {
|
|
child, ok := child.(*core.PdfIndirectObject)
|
|
if !ok {
|
|
common.Log.Debug("error: Page not indirect object - (%s)", child)
|
|
return errors.New("page not indirect object")
|
|
}
|
|
(*kids)[idx] = child
|
|
err = pp.buildPageList(child, node, traversedPageNodes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetNumPages returns the number of pages in the document.
|
|
func (pp *PdfReader) GetNumPages() (int, error) {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return 0, fmt.Errorf("file need to be decrypted first")
|
|
}
|
|
return len(pp.pageList), nil
|
|
}
|
|
|
|
// Resolves a reference, returning the object and indicates whether or not
|
|
// it was cached.
|
|
func (pp *PdfReader) resolveReference(ref *core.PdfObjectReference) (core.PdfObject, bool, error) {
|
|
cachedObj, isCached := pp.parser.ObjCache[int(ref.ObjectNumber)]
|
|
if !isCached {
|
|
common.Log.Trace("Reader Lookup ref: %s", ref)
|
|
obj, err := pp.parser.LookupByReference(*ref)
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
pp.parser.ObjCache[int(ref.ObjectNumber)] = obj
|
|
return obj, false, nil
|
|
}
|
|
return cachedObj, true, nil
|
|
}
|
|
|
|
/*
|
|
* Recursively traverse through the page object data and look up
|
|
* references to indirect objects.
|
|
*
|
|
* GH: Are we fully protected against circular references? (Add tests).
|
|
*/
|
|
func (pp *PdfReader) traverseObjectData(o core.PdfObject) error {
|
|
common.Log.Trace("Traverse object data")
|
|
if _, isTraversed := pp.traversed[o]; isTraversed {
|
|
common.Log.Trace("-Already traversed...")
|
|
return nil
|
|
}
|
|
pp.traversed[o] = true
|
|
|
|
if io, isIndirectObj := o.(*core.PdfIndirectObject); isIndirectObj {
|
|
common.Log.Trace("io: %s", io)
|
|
common.Log.Trace("- %s", io.PdfObject)
|
|
err := pp.traverseObjectData(io.PdfObject)
|
|
return err
|
|
}
|
|
|
|
if so, isStreamObj := o.(*core.PdfObjectStream); isStreamObj {
|
|
err := pp.traverseObjectData(so.PdfObjectDictionary)
|
|
return err
|
|
}
|
|
|
|
if dict, isDict := o.(*core.PdfObjectDictionary); isDict {
|
|
common.Log.Trace("- dict: %s", dict)
|
|
for _, name := range dict.Keys() {
|
|
v := dict.Get(name)
|
|
if ref, isRef := v.(*core.PdfObjectReference); isRef {
|
|
resolvedObj, _, err := pp.resolveReference(ref)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
dict.Set(name, resolvedObj)
|
|
err = pp.traverseObjectData(resolvedObj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
err := pp.traverseObjectData(v)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if arr, isArray := o.(*core.PdfObjectArray); isArray {
|
|
common.Log.Trace("- array: %s", arr)
|
|
for idx, v := range *arr {
|
|
if ref, isRef := v.(*core.PdfObjectReference); isRef {
|
|
resolvedObj, _, err := pp.resolveReference(ref)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
(*arr)[idx] = resolvedObj
|
|
|
|
err = pp.traverseObjectData(resolvedObj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
err := pp.traverseObjectData(v)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if _, isRef := o.(*core.PdfObjectReference); isRef {
|
|
common.Log.Debug("error: Reader tracing a reference!")
|
|
return errors.New("reader tracing a reference")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetPageAsIndirectObject returns an indirect object containing the page dictionary for a specified page number.
|
|
func (pp *PdfReader) GetPageAsIndirectObject(pageNumber int) (core.PdfObject, error) {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file needs to be decrypted first")
|
|
}
|
|
if len(pp.pageList) < pageNumber {
|
|
return nil, errors.New("invalid page number (page count too short)")
|
|
}
|
|
page := pp.pageList[pageNumber-1]
|
|
|
|
// Look up all references related to page and load everything.
|
|
err := pp.traverseObjectData(page)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
common.Log.Trace("Page: %T %s", page, page)
|
|
common.Log.Trace("- %T %s", page.PdfObject, page.PdfObject)
|
|
|
|
return page, nil
|
|
}
|
|
|
|
// GetPage returns the PdfPage model for the specified page number.
|
|
func (pp *PdfReader) GetPage(pageNumber int) (*PdfPage, error) {
|
|
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
|
return nil, fmt.Errorf("file needs to be decrypted first")
|
|
}
|
|
if len(pp.pageList) < pageNumber {
|
|
return nil, errors.New("invalid page number (page count too short)")
|
|
}
|
|
idx := pageNumber - 1
|
|
if idx < 0 {
|
|
return nil, fmt.Errorf("page numbering must start at 1")
|
|
}
|
|
page := pp.PageList[idx]
|
|
|
|
return page, nil
|
|
}
|
|
|
|
// GetOCProperties returns the optional content properties PdfObject.
|
|
func (pp *PdfReader) GetOCProperties() (core.PdfObject, error) {
|
|
dict := pp.catalog
|
|
obj := dict.Get("OCProperties")
|
|
var err error
|
|
obj, err = pp.traceToObject(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Resolve all references...
|
|
// Should be pretty safe. Should not be referencing to pages or
|
|
// any large structures. Local structures and references
|
|
// to OC Groups.
|
|
err = pp.traverseObjectData(obj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return obj, nil
|
|
}
|
|
|
|
// Inspect inspects the object types, subtypes and content in the PDF file returning a map of
|
|
// object type to number of instances of each.
|
|
func (pp *PdfReader) Inspect() (map[string]int, error) {
|
|
return pp.parser.Inspect()
|
|
}
|
|
|
|
// GetObjectNums returns the object numbers of the PDF objects in the file
|
|
// Numbered objects are either indirect objects or stream objects.
|
|
// e.g. objNums := pdfReader.GetObjectNums()
|
|
// The underlying objects can then be accessed with
|
|
// pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object.
|
|
func (r *PdfReader) GetObjectNums() []int {
|
|
return r.parser.GetObjectNums()
|
|
}
|
|
|
|
// GetIndirectObjectByNumber retrieves and returns a specific PdfObject by object number.
|
|
func (pp *PdfReader) GetIndirectObjectByNumber(number int) (core.PdfObject, error) {
|
|
obj, err := pp.parser.LookupByNumber(number)
|
|
return obj, err
|
|
}
|
|
|
|
// GetTrailer returns the PDF's trailer dictionary.
|
|
func (pp *PdfReader) GetTrailer() (*core.PdfObjectDictionary, error) {
|
|
trailerDict := pp.parser.GetTrailer()
|
|
if trailerDict == nil {
|
|
return nil, errors.New("trailer missing")
|
|
}
|
|
|
|
return trailerDict, nil
|
|
}
|