fix wrong git ignore
This commit is contained in:
763
internal/pdf/model/reader.go
Normal file
763
internal/pdf/model/reader.go
Normal file
@@ -0,0 +1,763 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
||||
)
|
||||
|
||||
// PdfReader represents a PDF file reader. It is a frontend to the lower level parsing mechanism and provides
// a higher level access to work with PDF structure and information, such as the page structure etc.
type PdfReader struct {
	// parser is the low-level parser that object lookups and encryption queries are delegated to.
	parser *core.PdfParser
	// root is the trailer's Root reference (the reference to the catalog), recorded by loadStructure.
	root core.PdfObject
	// pages is the dictionary of the top-level Pages node referenced by the catalog.
	pages *core.PdfObjectDictionary
	// pageList holds the indirect object of every page, in the order buildPageList visits them.
	pageList []*core.PdfIndirectObject
	// PageList holds the PdfPage model built for each entry that buildPageList appends to pageList.
	PageList []*PdfPage
	// pageCount is the Count entry read from the top-level Pages dictionary.
	pageCount int
	// catalog is the document catalog dictionary, set by loadStructure.
	catalog *core.PdfObjectDictionary
	// outlineTree is the root of the tree built by loadOutlines; nil when the document has no outlines.
	outlineTree *PdfOutlineTreeNode
	// AcroForm is the interactive form built by loadForms; nil when the catalog has no usable AcroForm entry.
	AcroForm *PdfAcroForm

	// modelManager is created with NewModelManager in NewPdfReader.
	// NOTE(review): presumably a cache tying primitive objects to their models - confirm against ModelManager.
	modelManager *ModelManager

	// For tracking traversal (cache): objects already visited by traverseObjectData.
	traversed map[core.PdfObject]bool
}
|
||||
|
||||
// NewPdfReader returns a new PdfReader for an input io.ReadSeeker interface. Can be used to read PDF from
|
||||
// memory or file. Immediately loads and traverses the PDF structure including pages and page contents (if
|
||||
// not encrypted).
|
||||
func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
|
||||
pdfReader := &PdfReader{}
|
||||
pdfReader.traversed = map[core.PdfObject]bool{}
|
||||
|
||||
pdfReader.modelManager = NewModelManager()
|
||||
|
||||
// Create the parser, loads the cross reference table and trailer.
|
||||
parser, err := core.NewParser(rs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pdfReader.parser = parser
|
||||
|
||||
isEncrypted, err := pdfReader.IsEncrypted()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Load pdf doc structure if not encrypted.
|
||||
if !isEncrypted {
|
||||
err = pdfReader.loadStructure()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return pdfReader, nil
|
||||
}
|
||||
|
||||
// IsEncrypted returns true if the PDF file is encrypted.
|
||||
func (pp *PdfReader) IsEncrypted() (bool, error) {
|
||||
return pp.parser.IsEncrypted()
|
||||
}
|
||||
|
||||
// GetEncryptionMethod returns a string containing some information about the encryption method used.
|
||||
// XXX/TODO: May be better to return a standardized struct with information.
|
||||
func (pp *PdfReader) GetEncryptionMethod() string {
|
||||
crypter := pp.parser.GetCrypter()
|
||||
str := crypter.Filter + " - "
|
||||
|
||||
if crypter.V == 0 {
|
||||
str += "Undocumented algorithm"
|
||||
} else if crypter.V == 1 {
|
||||
// RC4 or AES (bits: 40)
|
||||
str += "RC4: 40 bits"
|
||||
} else if crypter.V == 2 {
|
||||
str += fmt.Sprintf("RC4: %d bits", crypter.Length)
|
||||
} else if crypter.V == 3 {
|
||||
str += "Unpublished algorithm"
|
||||
} else if crypter.V >= 4 {
|
||||
// Look at CF, StmF, StrF
|
||||
str += fmt.Sprintf("Stream filter: %s - String filter: %s", crypter.StreamFilter, crypter.StringFilter)
|
||||
str += "; Crypt filters:"
|
||||
for name, cf := range crypter.CryptFilters {
|
||||
str += fmt.Sprintf(" - %s: %s (%d)", name, cf.Cfm, cf.Length)
|
||||
}
|
||||
}
|
||||
perms := crypter.GetAccessPermissions()
|
||||
str += fmt.Sprintf(" - %#v", perms)
|
||||
|
||||
return str
|
||||
}
|
||||
|
||||
// Decrypt decrypts the PDF file with a specified password. Also tries to
|
||||
// decrypt with an empty password. Returns true if successful,
|
||||
// false otherwise.
|
||||
func (pp *PdfReader) Decrypt(password []byte) (bool, error) {
|
||||
success, err := pp.parser.Decrypt(password)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if !success {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
err = pp.loadStructure()
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Fail to load structure (%s)", err)
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// CheckAccessRights checks access rights and permissions for a specified password. If either user/owner
|
||||
// password is specified, full rights are granted, otherwise the access rights are specified by the
|
||||
// Permissions flag.
|
||||
//
|
||||
// The bool flag indicates that the user can access and view the file.
|
||||
// The AccessPermissions shows what access the user has for editing etc.
|
||||
// An error is returned if there was a problem performing the authentication.
|
||||
func (pp *PdfReader) CheckAccessRights(password []byte) (bool, core.AccessPermissions, error) {
|
||||
return pp.parser.CheckAccessRights(password)
|
||||
}
|
||||
|
||||
// Loads the structure of the pdf file: pages, outlines, etc.
|
||||
func (pp *PdfReader) loadStructure() error {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return fmt.Errorf("file need to be decrypted first")
|
||||
}
|
||||
|
||||
trailerDict := pp.parser.GetTrailer()
|
||||
if trailerDict == nil {
|
||||
return fmt.Errorf("missing trailer")
|
||||
}
|
||||
|
||||
// Catalog.
|
||||
root, ok := trailerDict.Get("Root").(*core.PdfObjectReference)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid Root (trailer: %s)", *trailerDict)
|
||||
}
|
||||
oc, err := pp.parser.LookupByReference(*root)
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Failed to read root element catalog: %s", err)
|
||||
return err
|
||||
}
|
||||
pcatalog, ok := oc.(*core.PdfIndirectObject)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict)
|
||||
return errors.New("missing catalog")
|
||||
}
|
||||
catalog, ok := (*pcatalog).PdfObject.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Invalid catalog (%s)", pcatalog.PdfObject)
|
||||
return errors.New("invalid catalog")
|
||||
}
|
||||
common.Log.Trace("Catalog: %s", catalog)
|
||||
|
||||
// Pages.
|
||||
pagesRef, ok := catalog.Get("Pages").(*core.PdfObjectReference)
|
||||
if !ok {
|
||||
return errors.New("pages in catalog should be a reference")
|
||||
}
|
||||
op, err := pp.parser.LookupByReference(*pagesRef)
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Failed to read pages")
|
||||
return err
|
||||
}
|
||||
ppages, ok := op.(*core.PdfIndirectObject)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Pages object invalid")
|
||||
common.Log.Debug("op: %p", ppages)
|
||||
return errors.New("pages object invalid")
|
||||
}
|
||||
pages, ok := ppages.PdfObject.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Pages object invalid (%s)", ppages)
|
||||
return errors.New("pages object invalid")
|
||||
}
|
||||
pageCount, ok := pages.Get("Count").(*core.PdfObjectInteger)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Pages count object invalid")
|
||||
return errors.New("pages count invalid")
|
||||
}
|
||||
|
||||
pp.root = root
|
||||
pp.catalog = catalog
|
||||
pp.pages = pages
|
||||
pp.pageCount = int(*pageCount)
|
||||
pp.pageList = []*core.PdfIndirectObject{}
|
||||
|
||||
traversedPageNodes := map[core.PdfObject]bool{}
|
||||
err = pp.buildPageList(ppages, nil, traversedPageNodes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
common.Log.Trace("---")
|
||||
common.Log.Trace("TOC")
|
||||
common.Log.Trace("Pages")
|
||||
common.Log.Trace("%d: %s", len(pp.pageList), pp.pageList)
|
||||
|
||||
// Outlines.
|
||||
pp.outlineTree, err = pp.loadOutlines()
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Failed to build outline tree (%s)", err)
|
||||
return err
|
||||
}
|
||||
|
||||
// Load interactive forms and fields.
|
||||
pp.AcroForm, err = pp.loadForms()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Trace to object. Keeps a list of already visited references to avoid circular references.
|
||||
//
|
||||
// Example circular reference.
|
||||
// 1 0 obj << /Next 2 0 R >>
|
||||
// 2 0 obj << /Next 1 0 R >>
|
||||
func (pp *PdfReader) traceToObjectWrapper(obj core.PdfObject, refList map[*core.PdfObjectReference]bool) (core.PdfObject, error) {
|
||||
// Keep a list of references to avoid circular references.
|
||||
|
||||
ref, isRef := obj.(*core.PdfObjectReference)
|
||||
if isRef {
|
||||
// Make sure not already visited (circular ref).
|
||||
if _, alreadyTraversed := refList[ref]; alreadyTraversed {
|
||||
return nil, errors.New("circular reference")
|
||||
}
|
||||
refList[ref] = true
|
||||
obj, err := pp.parser.LookupByReference(*ref)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return pp.traceToObjectWrapper(obj, refList)
|
||||
}
|
||||
|
||||
// Not a reference, an object. Can be indirect or any direct pdf object (other than reference).
|
||||
return obj, nil
|
||||
}
|
||||
|
||||
func (pp *PdfReader) traceToObject(obj core.PdfObject) (core.PdfObject, error) {
|
||||
refList := map[*core.PdfObjectReference]bool{}
|
||||
return pp.traceToObjectWrapper(obj, refList)
|
||||
}
|
||||
|
||||
func (pp *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return nil, fmt.Errorf("file need to be decrypted first")
|
||||
}
|
||||
|
||||
// Has outlines? Otherwise return an empty outlines structure.
|
||||
catalog := pp.catalog
|
||||
outlinesObj := catalog.Get("Outlines")
|
||||
if outlinesObj == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
common.Log.Trace("-Has outlines")
|
||||
// Trace references to the object.
|
||||
outlineRootObj, err := pp.traceToObject(outlinesObj)
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Failed to read outlines")
|
||||
return nil, err
|
||||
}
|
||||
common.Log.Trace("Outline root: %v", outlineRootObj)
|
||||
|
||||
if _, isNull := outlineRootObj.(*core.PdfObjectNull); isNull {
|
||||
common.Log.Trace("Outline root is null - no outlines")
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
outlineRoot, ok := outlineRootObj.(*core.PdfIndirectObject)
|
||||
if !ok {
|
||||
return nil, errors.New("outline root should be an indirect object")
|
||||
}
|
||||
|
||||
dict, ok := outlineRoot.PdfObject.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
return nil, errors.New("outline indirect object should contain a dictionary")
|
||||
}
|
||||
|
||||
common.Log.Trace("Outline root dict: %v", dict)
|
||||
|
||||
outlineTree, _, err := pp.buildOutlineTree(outlineRoot, nil, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
common.Log.Trace("Resulting outline tree: %v", outlineTree)
|
||||
|
||||
return outlineTree, nil
|
||||
}
|
||||
|
||||
// Recursive build outline tree.
|
||||
// prev PdfObject,
|
||||
// Input: The indirect object containing an Outlines or Outline item dictionary.
|
||||
// Parent, Prev are the parent or previous node in the hierarchy.
|
||||
// The function returns the corresponding tree node and the last node which is used
|
||||
// for setting the Last pointer of the tree node structures.
|
||||
func (pp *PdfReader) buildOutlineTree(obj core.PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) {
|
||||
container, isInd := obj.(*core.PdfIndirectObject)
|
||||
if !isInd {
|
||||
return nil, nil, fmt.Errorf("outline container not an indirect object %T", obj)
|
||||
}
|
||||
dict, ok := container.PdfObject.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
return nil, nil, errors.New("not a dictionary object")
|
||||
}
|
||||
common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container)
|
||||
|
||||
if obj := dict.Get("Title"); obj != nil {
|
||||
// Outline item has a title. (required)
|
||||
outlineItem, err := pp.newPdfOutlineItemFromIndirectObject(container)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
outlineItem.Parent = parent
|
||||
outlineItem.Prev = prev
|
||||
|
||||
if firstObj := dict.Get("First"); firstObj != nil {
|
||||
firstObj, err = pp.traceToObject(firstObj)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if _, isNull := firstObj.(*core.PdfObjectNull); !isNull {
|
||||
first, last, err := pp.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
outlineItem.First = first
|
||||
outlineItem.Last = last
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve the reference to next
|
||||
if nextObj := dict.Get("Next"); nextObj != nil {
|
||||
nextObj, err = pp.traceToObject(nextObj)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if _, isNull := nextObj.(*core.PdfObjectNull); !isNull {
|
||||
next, last, err := pp.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
outlineItem.Next = next
|
||||
return &outlineItem.PdfOutlineTreeNode, last, nil
|
||||
}
|
||||
}
|
||||
|
||||
return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil
|
||||
} else {
|
||||
// Outline dictionary (structure element).
|
||||
|
||||
outline, err := newPdfOutlineFromIndirectObject(container)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
outline.Parent = parent
|
||||
//outline.Prev = parent
|
||||
|
||||
if firstObj := dict.Get("First"); firstObj != nil {
|
||||
// Has children...
|
||||
firstObj, err = pp.traceToObject(firstObj)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if _, isNull := firstObj.(*core.PdfObjectNull); !isNull {
|
||||
first, last, err := pp.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
outline.First = first
|
||||
outline.Last = last
|
||||
}
|
||||
}
|
||||
return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil
|
||||
}
|
||||
}
|
||||
|
||||
// GetOutlineTree returns the outline tree.
|
||||
func (pp *PdfReader) GetOutlineTree() *PdfOutlineTreeNode {
|
||||
return pp.outlineTree
|
||||
}
|
||||
|
||||
// GetOutlinesFlattened returns a flattened list of tree nodes and titles.
|
||||
func (pp *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) {
|
||||
outlineNodeList := []*PdfOutlineTreeNode{}
|
||||
flattenedTitleList := []string{}
|
||||
|
||||
// Recursive flattening function.
|
||||
var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int)
|
||||
flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) {
|
||||
if node == nil {
|
||||
return
|
||||
}
|
||||
if node.context == nil {
|
||||
common.Log.Debug("error: Missing node.context") // Should not happen ever.
|
||||
return
|
||||
}
|
||||
|
||||
if item, isItem := node.context.(*PdfOutlineItem); isItem {
|
||||
*outlineList = append(*outlineList, &item.PdfOutlineTreeNode)
|
||||
title := strings.Repeat(" ", depth*2) + string(*item.Title)
|
||||
*titleList = append(*titleList, title)
|
||||
if item.Next != nil {
|
||||
flattenFunc(item.Next, outlineList, titleList, depth)
|
||||
}
|
||||
}
|
||||
|
||||
if node.First != nil {
|
||||
title := strings.Repeat(" ", depth*2) + "+"
|
||||
*titleList = append(*titleList, title)
|
||||
flattenFunc(node.First, outlineList, titleList, depth+1)
|
||||
}
|
||||
}
|
||||
flattenFunc(pp.outlineTree, &outlineNodeList, &flattenedTitleList, 0)
|
||||
return outlineNodeList, flattenedTitleList, nil
|
||||
}
|
||||
|
||||
// loadForms loads the AcroForm.
|
||||
func (pp *PdfReader) loadForms() (*PdfAcroForm, error) {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return nil, fmt.Errorf("file need to be decrypted first")
|
||||
}
|
||||
|
||||
// Has forms?
|
||||
catalog := pp.catalog
|
||||
obj := catalog.Get("AcroForm")
|
||||
if obj == nil {
|
||||
// Nothing to load.
|
||||
return nil, nil
|
||||
}
|
||||
var err error
|
||||
obj, err = pp.traceToObject(obj)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
obj = core.TraceToDirectObject(obj)
|
||||
if _, isNull := obj.(*core.PdfObjectNull); isNull {
|
||||
common.Log.Trace("Acroform is a null object (empty)\n")
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
formsDict, ok := obj.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
common.Log.Debug("invalid AcroForm entry %T", obj)
|
||||
common.Log.Debug("Does not have forms")
|
||||
return nil, fmt.Errorf("invalid acroform entry %T", obj)
|
||||
}
|
||||
common.Log.Trace("Has Acro forms")
|
||||
// Load it.
|
||||
|
||||
// Ensure we have access to everything.
|
||||
common.Log.Trace("Traverse the Acroforms structure")
|
||||
err = pp.traverseObjectData(formsDict)
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Unable to traverse AcroForms (%s)", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Create the acro forms object.
|
||||
acroForm, err := pp.newPdfAcroFormFromDict(formsDict)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return acroForm, nil
|
||||
}
|
||||
|
||||
// Build the table of contents.
|
||||
// tree, ex: Pages -> Pages -> Pages -> Page
|
||||
// Traverse through the whole thing recursively.
|
||||
func (pp *PdfReader) buildPageList(node *core.PdfIndirectObject, parent *core.PdfIndirectObject, traversedPageNodes map[core.PdfObject]bool) error {
|
||||
if node == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed {
|
||||
common.Log.Debug("Cyclic recursion, skipping")
|
||||
return nil
|
||||
}
|
||||
traversedPageNodes[node] = true
|
||||
|
||||
nodeDict, ok := node.PdfObject.(*core.PdfObjectDictionary)
|
||||
if !ok {
|
||||
return errors.New("node not a dictionary")
|
||||
}
|
||||
|
||||
objType, ok := (*nodeDict).Get("Type").(*core.PdfObjectName)
|
||||
if !ok {
|
||||
return errors.New("node missing Type (Required)")
|
||||
}
|
||||
common.Log.Trace("buildPageList node type: %s", *objType)
|
||||
if *objType == "Page" {
|
||||
p, err := pp.newPdfPageFromDict(nodeDict)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p.setContainer(node)
|
||||
|
||||
if parent != nil {
|
||||
// Set the parent (in case missing or incorrect).
|
||||
nodeDict.Set("Parent", parent)
|
||||
}
|
||||
pp.pageList = append(pp.pageList, node)
|
||||
pp.PageList = append(pp.PageList, p)
|
||||
|
||||
return nil
|
||||
}
|
||||
if *objType != "Pages" {
|
||||
common.Log.Debug("error: Table of content containing non Page/Pages object! (%s)", objType)
|
||||
return errors.New("table of content containing non Page/Pages object")
|
||||
}
|
||||
|
||||
// A Pages object. Update the parent.
|
||||
if parent != nil {
|
||||
nodeDict.Set("Parent", parent)
|
||||
}
|
||||
|
||||
// Resolve the object recursively.
|
||||
err := pp.traverseObjectData(node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
kidsObj, err := pp.parser.Trace(nodeDict.Get("Kids"))
|
||||
if err != nil {
|
||||
common.Log.Debug("error: Failed loading Kids object")
|
||||
return err
|
||||
}
|
||||
|
||||
var kids *core.PdfObjectArray
|
||||
kids, ok = kidsObj.(*core.PdfObjectArray)
|
||||
if !ok {
|
||||
kidsIndirect, isIndirect := kidsObj.(*core.PdfIndirectObject)
|
||||
if !isIndirect {
|
||||
return errors.New("invalid Kids object")
|
||||
}
|
||||
kids, ok = kidsIndirect.PdfObject.(*core.PdfObjectArray)
|
||||
if !ok {
|
||||
return errors.New("invalid Kids indirect object")
|
||||
}
|
||||
}
|
||||
common.Log.Trace("Kids: %s", kids)
|
||||
for idx, child := range *kids {
|
||||
child, ok := child.(*core.PdfIndirectObject)
|
||||
if !ok {
|
||||
common.Log.Debug("error: Page not indirect object - (%s)", child)
|
||||
return errors.New("page not indirect object")
|
||||
}
|
||||
(*kids)[idx] = child
|
||||
err = pp.buildPageList(child, node, traversedPageNodes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetNumPages returns the number of pages in the document.
|
||||
func (pp *PdfReader) GetNumPages() (int, error) {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return 0, fmt.Errorf("file need to be decrypted first")
|
||||
}
|
||||
return len(pp.pageList), nil
|
||||
}
|
||||
|
||||
// Resolves a reference, returning the object and indicates whether or not
|
||||
// it was cached.
|
||||
func (pp *PdfReader) resolveReference(ref *core.PdfObjectReference) (core.PdfObject, bool, error) {
|
||||
cachedObj, isCached := pp.parser.ObjCache[int(ref.ObjectNumber)]
|
||||
if !isCached {
|
||||
common.Log.Trace("Reader Lookup ref: %s", ref)
|
||||
obj, err := pp.parser.LookupByReference(*ref)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
pp.parser.ObjCache[int(ref.ObjectNumber)] = obj
|
||||
return obj, false, nil
|
||||
}
|
||||
return cachedObj, true, nil
|
||||
}
|
||||
|
||||
/*
|
||||
* Recursively traverse through the page object data and look up
|
||||
* references to indirect objects.
|
||||
*
|
||||
* GH: Are we fully protected against circular references? (Add tests).
|
||||
*/
|
||||
func (pp *PdfReader) traverseObjectData(o core.PdfObject) error {
|
||||
common.Log.Trace("Traverse object data")
|
||||
if _, isTraversed := pp.traversed[o]; isTraversed {
|
||||
common.Log.Trace("-Already traversed...")
|
||||
return nil
|
||||
}
|
||||
pp.traversed[o] = true
|
||||
|
||||
if io, isIndirectObj := o.(*core.PdfIndirectObject); isIndirectObj {
|
||||
common.Log.Trace("io: %s", io)
|
||||
common.Log.Trace("- %s", io.PdfObject)
|
||||
err := pp.traverseObjectData(io.PdfObject)
|
||||
return err
|
||||
}
|
||||
|
||||
if so, isStreamObj := o.(*core.PdfObjectStream); isStreamObj {
|
||||
err := pp.traverseObjectData(so.PdfObjectDictionary)
|
||||
return err
|
||||
}
|
||||
|
||||
if dict, isDict := o.(*core.PdfObjectDictionary); isDict {
|
||||
common.Log.Trace("- dict: %s", dict)
|
||||
for _, name := range dict.Keys() {
|
||||
v := dict.Get(name)
|
||||
if ref, isRef := v.(*core.PdfObjectReference); isRef {
|
||||
resolvedObj, _, err := pp.resolveReference(ref)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dict.Set(name, resolvedObj)
|
||||
err = pp.traverseObjectData(resolvedObj)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
err := pp.traverseObjectData(v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if arr, isArray := o.(*core.PdfObjectArray); isArray {
|
||||
common.Log.Trace("- array: %s", arr)
|
||||
for idx, v := range *arr {
|
||||
if ref, isRef := v.(*core.PdfObjectReference); isRef {
|
||||
resolvedObj, _, err := pp.resolveReference(ref)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
(*arr)[idx] = resolvedObj
|
||||
|
||||
err = pp.traverseObjectData(resolvedObj)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
err := pp.traverseObjectData(v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if _, isRef := o.(*core.PdfObjectReference); isRef {
|
||||
common.Log.Debug("error: Reader tracing a reference!")
|
||||
return errors.New("reader tracing a reference")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetPageAsIndirectObject returns an indirect object containing the page dictionary for a specified page number.
|
||||
func (pp *PdfReader) GetPageAsIndirectObject(pageNumber int) (core.PdfObject, error) {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return nil, fmt.Errorf("file needs to be decrypted first")
|
||||
}
|
||||
if len(pp.pageList) < pageNumber {
|
||||
return nil, errors.New("invalid page number (page count too short)")
|
||||
}
|
||||
page := pp.pageList[pageNumber-1]
|
||||
|
||||
// Look up all references related to page and load everything.
|
||||
err := pp.traverseObjectData(page)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
common.Log.Trace("Page: %T %s", page, page)
|
||||
common.Log.Trace("- %T %s", page.PdfObject, page.PdfObject)
|
||||
|
||||
return page, nil
|
||||
}
|
||||
|
||||
// GetPage returns the PdfPage model for the specified page number.
|
||||
func (pp *PdfReader) GetPage(pageNumber int) (*PdfPage, error) {
|
||||
if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() {
|
||||
return nil, fmt.Errorf("file needs to be decrypted first")
|
||||
}
|
||||
if len(pp.pageList) < pageNumber {
|
||||
return nil, errors.New("invalid page number (page count too short)")
|
||||
}
|
||||
idx := pageNumber - 1
|
||||
if idx < 0 {
|
||||
return nil, fmt.Errorf("page numbering must start at 1")
|
||||
}
|
||||
page := pp.PageList[idx]
|
||||
|
||||
return page, nil
|
||||
}
|
||||
|
||||
// GetOCProperties returns the optional content properties PdfObject.
|
||||
func (pp *PdfReader) GetOCProperties() (core.PdfObject, error) {
|
||||
dict := pp.catalog
|
||||
obj := dict.Get("OCProperties")
|
||||
var err error
|
||||
obj, err = pp.traceToObject(obj)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Resolve all references...
|
||||
// Should be pretty safe. Should not be referencing to pages or
|
||||
// any large structures. Local structures and references
|
||||
// to OC Groups.
|
||||
err = pp.traverseObjectData(obj)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return obj, nil
|
||||
}
|
||||
|
||||
// Inspect inspects the object types, subtypes and content in the PDF file returning a map of
|
||||
// object type to number of instances of each.
|
||||
func (pp *PdfReader) Inspect() (map[string]int, error) {
|
||||
return pp.parser.Inspect()
|
||||
}
|
||||
|
||||
// GetObjectNums returns the object numbers of the PDF objects in the file
|
||||
// Numbered objects are either indirect objects or stream objects.
|
||||
// e.g. objNums := pdfReader.GetObjectNums()
|
||||
// The underlying objects can then be accessed with
|
||||
// pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object.
|
||||
func (r *PdfReader) GetObjectNums() []int {
|
||||
return r.parser.GetObjectNums()
|
||||
}
|
||||
|
||||
// GetIndirectObjectByNumber retrieves and returns a specific PdfObject by object number.
|
||||
func (pp *PdfReader) GetIndirectObjectByNumber(number int) (core.PdfObject, error) {
|
||||
obj, err := pp.parser.LookupByNumber(number)
|
||||
return obj, err
|
||||
}
|
||||
|
||||
// GetTrailer returns the PDF's trailer dictionary.
|
||||
func (pp *PdfReader) GetTrailer() (*core.PdfObjectDictionary, error) {
|
||||
trailerDict := pp.parser.GetTrailer()
|
||||
if trailerDict == nil {
|
||||
return nil, errors.New("trailer missing")
|
||||
}
|
||||
|
||||
return trailerDict, nil
|
||||
}
|
||||
Reference in New Issue
Block a user