package model

import (
	"errors"
	"fmt"
	"io"
	"strings"

	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
	"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
)

// PdfReader is the high-level entry point for reading a PDF document. It sits on top of the
// low-level core parser and keeps the loaded document structure (catalog, page tree, outlines,
// forms) that the accessor methods of this type hand out.
type PdfReader struct {
	parser       *core.PdfParser
	root         core.PdfObject
	pages        *core.PdfObjectDictionary
	pageList     []*core.PdfIndirectObject
	PageList     []*PdfPage
	pageCount    int
	catalog      *core.PdfObjectDictionary
	outlineTree  *PdfOutlineTreeNode
	AcroForm     *PdfAcroForm
	modelManager *ModelManager

	// For tracking traversal (cache).
	traversed map[core.PdfObject]bool
}

// NewPdfReader creates a PdfReader on top of rs, which may be backed by a file or by memory.
// The parser is created right away. If the document is not encrypted, the document structure
// is loaded before returning; for encrypted documents this is left to Decrypt.
func NewPdfReader(rs io.ReadSeeker) (*PdfReader, error) {
	reader := &PdfReader{
		traversed:    map[core.PdfObject]bool{},
		modelManager: NewModelManager(),
	}

	// Creating the parser loads the cross reference table and trailer.
	parser, err := core.NewParser(rs)
	if err != nil {
		return nil, err
	}
	reader.parser = parser

	encrypted, err := reader.IsEncrypted()
	if err != nil {
		return nil, err
	}
	if encrypted {
		// The structure cannot be loaded before the caller has decrypted the file.
		return reader, nil
	}

	if err := reader.loadStructure(); err != nil {
		return nil, err
	}
	return reader, nil
}

// IsEncrypted reports whether the PDF file is encrypted, as determined by the parser.
func (pp *PdfReader) IsEncrypted() (bool, error) {
	return pp.parser.IsEncrypted()
}

// GetEncryptionMethod returns a string containing some information about the encryption method used.
// XXX/TODO: May be better to return a standardized struct with information.
func (pp *PdfReader) GetEncryptionMethod() string { crypter := pp.parser.GetCrypter() str := crypter.Filter + " - " if crypter.V == 0 { str += "Undocumented algorithm" } else if crypter.V == 1 { // RC4 or AES (bits: 40) str += "RC4: 40 bits" } else if crypter.V == 2 { str += fmt.Sprintf("RC4: %d bits", crypter.Length) } else if crypter.V == 3 { str += "Unpublished algorithm" } else if crypter.V >= 4 { // Look at CF, StmF, StrF str += fmt.Sprintf("Stream filter: %s - String filter: %s", crypter.StreamFilter, crypter.StringFilter) str += "; Crypt filters:" for name, cf := range crypter.CryptFilters { str += fmt.Sprintf(" - %s: %s (%d)", name, cf.Cfm, cf.Length) } } perms := crypter.GetAccessPermissions() str += fmt.Sprintf(" - %#v", perms) return str } // Decrypt decrypts the PDF file with a specified password. Also tries to // decrypt with an empty password. Returns true if successful, // false otherwise. func (pp *PdfReader) Decrypt(password []byte) (bool, error) { success, err := pp.parser.Decrypt(password) if err != nil { return false, err } if !success { return false, nil } err = pp.loadStructure() if err != nil { common.Log.Debug("error: Fail to load structure (%s)", err) return false, err } return true, nil } // CheckAccessRights checks access rights and permissions for a specified password. If either user/owner // password is specified, full rights are granted, otherwise the access rights are specified by the // Permissions flag. // // The bool flag indicates that the user can access and view the file. // The AccessPermissions shows what access the user has for editing etc. // An error is returned if there was a problem performing the authentication. func (pp *PdfReader) CheckAccessRights(password []byte) (bool, core.AccessPermissions, error) { return pp.parser.CheckAccessRights(password) } // Loads the structure of the pdf file: pages, outlines, etc. 
func (pp *PdfReader) loadStructure() error { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return fmt.Errorf("file need to be decrypted first") } trailerDict := pp.parser.GetTrailer() if trailerDict == nil { return fmt.Errorf("missing trailer") } // Catalog. root, ok := trailerDict.Get("Root").(*core.PdfObjectReference) if !ok { return fmt.Errorf("invalid Root (trailer: %s)", *trailerDict) } oc, err := pp.parser.LookupByReference(*root) if err != nil { common.Log.Debug("error: Failed to read root element catalog: %s", err) return err } pcatalog, ok := oc.(*core.PdfIndirectObject) if !ok { common.Log.Debug("error: Missing catalog: (root %q) (trailer %s)", oc, *trailerDict) return errors.New("missing catalog") } catalog, ok := (*pcatalog).PdfObject.(*core.PdfObjectDictionary) if !ok { common.Log.Debug("error: Invalid catalog (%s)", pcatalog.PdfObject) return errors.New("invalid catalog") } common.Log.Trace("Catalog: %s", catalog) // Pages. pagesRef, ok := catalog.Get("Pages").(*core.PdfObjectReference) if !ok { return errors.New("pages in catalog should be a reference") } op, err := pp.parser.LookupByReference(*pagesRef) if err != nil { common.Log.Debug("error: Failed to read pages") return err } ppages, ok := op.(*core.PdfIndirectObject) if !ok { common.Log.Debug("error: Pages object invalid") common.Log.Debug("op: %p", ppages) return errors.New("pages object invalid") } pages, ok := ppages.PdfObject.(*core.PdfObjectDictionary) if !ok { common.Log.Debug("error: Pages object invalid (%s)", ppages) return errors.New("pages object invalid") } pageCount, ok := pages.Get("Count").(*core.PdfObjectInteger) if !ok { common.Log.Debug("error: Pages count object invalid") return errors.New("pages count invalid") } pp.root = root pp.catalog = catalog pp.pages = pages pp.pageCount = int(*pageCount) pp.pageList = []*core.PdfIndirectObject{} traversedPageNodes := map[core.PdfObject]bool{} err = pp.buildPageList(ppages, nil, traversedPageNodes) if err != nil { 
return err } common.Log.Trace("---") common.Log.Trace("TOC") common.Log.Trace("Pages") common.Log.Trace("%d: %s", len(pp.pageList), pp.pageList) // Outlines. pp.outlineTree, err = pp.loadOutlines() if err != nil { common.Log.Debug("error: Failed to build outline tree (%s)", err) return err } // Load interactive forms and fields. pp.AcroForm, err = pp.loadForms() if err != nil { return err } return nil } // Trace to object. Keeps a list of already visited references to avoid circular references. // // Example circular reference. // 1 0 obj << /Next 2 0 R >> // 2 0 obj << /Next 1 0 R >> func (pp *PdfReader) traceToObjectWrapper(obj core.PdfObject, refList map[*core.PdfObjectReference]bool) (core.PdfObject, error) { // Keep a list of references to avoid circular references. ref, isRef := obj.(*core.PdfObjectReference) if isRef { // Make sure not already visited (circular ref). if _, alreadyTraversed := refList[ref]; alreadyTraversed { return nil, errors.New("circular reference") } refList[ref] = true obj, err := pp.parser.LookupByReference(*ref) if err != nil { return nil, err } return pp.traceToObjectWrapper(obj, refList) } // Not a reference, an object. Can be indirect or any direct pdf object (other than reference). return obj, nil } func (pp *PdfReader) traceToObject(obj core.PdfObject) (core.PdfObject, error) { refList := map[*core.PdfObjectReference]bool{} return pp.traceToObjectWrapper(obj, refList) } func (pp *PdfReader) loadOutlines() (*PdfOutlineTreeNode, error) { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return nil, fmt.Errorf("file need to be decrypted first") } // Has outlines? Otherwise return an empty outlines structure. catalog := pp.catalog outlinesObj := catalog.Get("Outlines") if outlinesObj == nil { return nil, nil } common.Log.Trace("-Has outlines") // Trace references to the object. 
outlineRootObj, err := pp.traceToObject(outlinesObj) if err != nil { common.Log.Debug("error: Failed to read outlines") return nil, err } common.Log.Trace("Outline root: %v", outlineRootObj) if _, isNull := outlineRootObj.(*core.PdfObjectNull); isNull { common.Log.Trace("Outline root is null - no outlines") return nil, nil } outlineRoot, ok := outlineRootObj.(*core.PdfIndirectObject) if !ok { return nil, errors.New("outline root should be an indirect object") } dict, ok := outlineRoot.PdfObject.(*core.PdfObjectDictionary) if !ok { return nil, errors.New("outline indirect object should contain a dictionary") } common.Log.Trace("Outline root dict: %v", dict) outlineTree, _, err := pp.buildOutlineTree(outlineRoot, nil, nil) if err != nil { return nil, err } common.Log.Trace("Resulting outline tree: %v", outlineTree) return outlineTree, nil } // Recursive build outline tree. // prev PdfObject, // Input: The indirect object containing an Outlines or Outline item dictionary. // Parent, Prev are the parent or previous node in the hierarchy. // The function returns the corresponding tree node and the last node which is used // for setting the Last pointer of the tree node structures. func (pp *PdfReader) buildOutlineTree(obj core.PdfObject, parent *PdfOutlineTreeNode, prev *PdfOutlineTreeNode) (*PdfOutlineTreeNode, *PdfOutlineTreeNode, error) { container, isInd := obj.(*core.PdfIndirectObject) if !isInd { return nil, nil, fmt.Errorf("outline container not an indirect object %T", obj) } dict, ok := container.PdfObject.(*core.PdfObjectDictionary) if !ok { return nil, nil, errors.New("not a dictionary object") } common.Log.Trace("build outline tree: dict: %v (%v) p: %p", dict, container, container) if obj := dict.Get("Title"); obj != nil { // Outline item has a title. 
(required) outlineItem, err := pp.newPdfOutlineItemFromIndirectObject(container) if err != nil { return nil, nil, err } outlineItem.Parent = parent outlineItem.Prev = prev if firstObj := dict.Get("First"); firstObj != nil { firstObj, err = pp.traceToObject(firstObj) if err != nil { return nil, nil, err } if _, isNull := firstObj.(*core.PdfObjectNull); !isNull { first, last, err := pp.buildOutlineTree(firstObj, &outlineItem.PdfOutlineTreeNode, nil) if err != nil { return nil, nil, err } outlineItem.First = first outlineItem.Last = last } } // Resolve the reference to next if nextObj := dict.Get("Next"); nextObj != nil { nextObj, err = pp.traceToObject(nextObj) if err != nil { return nil, nil, err } if _, isNull := nextObj.(*core.PdfObjectNull); !isNull { next, last, err := pp.buildOutlineTree(nextObj, parent, &outlineItem.PdfOutlineTreeNode) if err != nil { return nil, nil, err } outlineItem.Next = next return &outlineItem.PdfOutlineTreeNode, last, nil } } return &outlineItem.PdfOutlineTreeNode, &outlineItem.PdfOutlineTreeNode, nil } else { // Outline dictionary (structure element). outline, err := newPdfOutlineFromIndirectObject(container) if err != nil { return nil, nil, err } outline.Parent = parent //outline.Prev = parent if firstObj := dict.Get("First"); firstObj != nil { // Has children... firstObj, err = pp.traceToObject(firstObj) if err != nil { return nil, nil, err } if _, isNull := firstObj.(*core.PdfObjectNull); !isNull { first, last, err := pp.buildOutlineTree(firstObj, &outline.PdfOutlineTreeNode, nil) if err != nil { return nil, nil, err } outline.First = first outline.Last = last } } return &outline.PdfOutlineTreeNode, &outline.PdfOutlineTreeNode, nil } } // GetOutlineTree returns the outline tree. func (pp *PdfReader) GetOutlineTree() *PdfOutlineTreeNode { return pp.outlineTree } // GetOutlinesFlattened returns a flattened list of tree nodes and titles. 
func (pp *PdfReader) GetOutlinesFlattened() ([]*PdfOutlineTreeNode, []string, error) { outlineNodeList := []*PdfOutlineTreeNode{} flattenedTitleList := []string{} // Recursive flattening function. var flattenFunc func(*PdfOutlineTreeNode, *[]*PdfOutlineTreeNode, *[]string, int) flattenFunc = func(node *PdfOutlineTreeNode, outlineList *[]*PdfOutlineTreeNode, titleList *[]string, depth int) { if node == nil { return } if node.context == nil { common.Log.Debug("error: Missing node.context") // Should not happen ever. return } if item, isItem := node.context.(*PdfOutlineItem); isItem { *outlineList = append(*outlineList, &item.PdfOutlineTreeNode) title := strings.Repeat(" ", depth*2) + string(*item.Title) *titleList = append(*titleList, title) if item.Next != nil { flattenFunc(item.Next, outlineList, titleList, depth) } } if node.First != nil { title := strings.Repeat(" ", depth*2) + "+" *titleList = append(*titleList, title) flattenFunc(node.First, outlineList, titleList, depth+1) } } flattenFunc(pp.outlineTree, &outlineNodeList, &flattenedTitleList, 0) return outlineNodeList, flattenedTitleList, nil } // loadForms loads the AcroForm. func (pp *PdfReader) loadForms() (*PdfAcroForm, error) { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return nil, fmt.Errorf("file need to be decrypted first") } // Has forms? catalog := pp.catalog obj := catalog.Get("AcroForm") if obj == nil { // Nothing to load. return nil, nil } var err error obj, err = pp.traceToObject(obj) if err != nil { return nil, err } obj = core.TraceToDirectObject(obj) if _, isNull := obj.(*core.PdfObjectNull); isNull { common.Log.Trace("Acroform is a null object (empty)\n") return nil, nil } formsDict, ok := obj.(*core.PdfObjectDictionary) if !ok { common.Log.Debug("invalid AcroForm entry %T", obj) common.Log.Debug("Does not have forms") return nil, fmt.Errorf("invalid acroform entry %T", obj) } common.Log.Trace("Has Acro forms") // Load it. // Ensure we have access to everything. 
common.Log.Trace("Traverse the Acroforms structure") err = pp.traverseObjectData(formsDict) if err != nil { common.Log.Debug("error: Unable to traverse AcroForms (%s)", err) return nil, err } // Create the acro forms object. acroForm, err := pp.newPdfAcroFormFromDict(formsDict) if err != nil { return nil, err } return acroForm, nil } // Build the table of contents. // tree, ex: Pages -> Pages -> Pages -> Page // Traverse through the whole thing recursively. func (pp *PdfReader) buildPageList(node *core.PdfIndirectObject, parent *core.PdfIndirectObject, traversedPageNodes map[core.PdfObject]bool) error { if node == nil { return nil } if _, alreadyTraversed := traversedPageNodes[node]; alreadyTraversed { common.Log.Debug("Cyclic recursion, skipping") return nil } traversedPageNodes[node] = true nodeDict, ok := node.PdfObject.(*core.PdfObjectDictionary) if !ok { return errors.New("node not a dictionary") } objType, ok := (*nodeDict).Get("Type").(*core.PdfObjectName) if !ok { return errors.New("node missing Type (Required)") } common.Log.Trace("buildPageList node type: %s", *objType) if *objType == "Page" { p, err := pp.newPdfPageFromDict(nodeDict) if err != nil { return err } p.setContainer(node) if parent != nil { // Set the parent (in case missing or incorrect). nodeDict.Set("Parent", parent) } pp.pageList = append(pp.pageList, node) pp.PageList = append(pp.PageList, p) return nil } if *objType != "Pages" { common.Log.Debug("error: Table of content containing non Page/Pages object! (%s)", objType) return errors.New("table of content containing non Page/Pages object") } // A Pages object. Update the parent. if parent != nil { nodeDict.Set("Parent", parent) } // Resolve the object recursively. 
err := pp.traverseObjectData(node) if err != nil { return err } kidsObj, err := pp.parser.Trace(nodeDict.Get("Kids")) if err != nil { common.Log.Debug("error: Failed loading Kids object") return err } var kids *core.PdfObjectArray kids, ok = kidsObj.(*core.PdfObjectArray) if !ok { kidsIndirect, isIndirect := kidsObj.(*core.PdfIndirectObject) if !isIndirect { return errors.New("invalid Kids object") } kids, ok = kidsIndirect.PdfObject.(*core.PdfObjectArray) if !ok { return errors.New("invalid Kids indirect object") } } common.Log.Trace("Kids: %s", kids) for idx, child := range *kids { child, ok := child.(*core.PdfIndirectObject) if !ok { common.Log.Debug("error: Page not indirect object - (%s)", child) return errors.New("page not indirect object") } (*kids)[idx] = child err = pp.buildPageList(child, node, traversedPageNodes) if err != nil { return err } } return nil } // GetNumPages returns the number of pages in the document. func (pp *PdfReader) GetNumPages() (int, error) { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return 0, fmt.Errorf("file need to be decrypted first") } return len(pp.pageList), nil } // Resolves a reference, returning the object and indicates whether or not // it was cached. func (pp *PdfReader) resolveReference(ref *core.PdfObjectReference) (core.PdfObject, bool, error) { cachedObj, isCached := pp.parser.ObjCache[int(ref.ObjectNumber)] if !isCached { common.Log.Trace("Reader Lookup ref: %s", ref) obj, err := pp.parser.LookupByReference(*ref) if err != nil { return nil, false, err } pp.parser.ObjCache[int(ref.ObjectNumber)] = obj return obj, false, nil } return cachedObj, true, nil } /* * Recursively traverse through the page object data and look up * references to indirect objects. * * GH: Are we fully protected against circular references? (Add tests). 
*/ func (pp *PdfReader) traverseObjectData(o core.PdfObject) error { common.Log.Trace("Traverse object data") if _, isTraversed := pp.traversed[o]; isTraversed { common.Log.Trace("-Already traversed...") return nil } pp.traversed[o] = true if io, isIndirectObj := o.(*core.PdfIndirectObject); isIndirectObj { common.Log.Trace("io: %s", io) common.Log.Trace("- %s", io.PdfObject) err := pp.traverseObjectData(io.PdfObject) return err } if so, isStreamObj := o.(*core.PdfObjectStream); isStreamObj { err := pp.traverseObjectData(so.PdfObjectDictionary) return err } if dict, isDict := o.(*core.PdfObjectDictionary); isDict { common.Log.Trace("- dict: %s", dict) for _, name := range dict.Keys() { v := dict.Get(name) if ref, isRef := v.(*core.PdfObjectReference); isRef { resolvedObj, _, err := pp.resolveReference(ref) if err != nil { return err } dict.Set(name, resolvedObj) err = pp.traverseObjectData(resolvedObj) if err != nil { return err } } else { err := pp.traverseObjectData(v) if err != nil { return err } } } return nil } if arr, isArray := o.(*core.PdfObjectArray); isArray { common.Log.Trace("- array: %s", arr) for idx, v := range *arr { if ref, isRef := v.(*core.PdfObjectReference); isRef { resolvedObj, _, err := pp.resolveReference(ref) if err != nil { return err } (*arr)[idx] = resolvedObj err = pp.traverseObjectData(resolvedObj) if err != nil { return err } } else { err := pp.traverseObjectData(v) if err != nil { return err } } } return nil } if _, isRef := o.(*core.PdfObjectReference); isRef { common.Log.Debug("error: Reader tracing a reference!") return errors.New("reader tracing a reference") } return nil } // GetPageAsIndirectObject returns an indirect object containing the page dictionary for a specified page number. 
func (pp *PdfReader) GetPageAsIndirectObject(pageNumber int) (core.PdfObject, error) { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return nil, fmt.Errorf("file needs to be decrypted first") } if len(pp.pageList) < pageNumber { return nil, errors.New("invalid page number (page count too short)") } page := pp.pageList[pageNumber-1] // Look up all references related to page and load everything. err := pp.traverseObjectData(page) if err != nil { return nil, err } common.Log.Trace("Page: %T %s", page, page) common.Log.Trace("- %T %s", page.PdfObject, page.PdfObject) return page, nil } // GetPage returns the PdfPage model for the specified page number. func (pp *PdfReader) GetPage(pageNumber int) (*PdfPage, error) { if pp.parser.GetCrypter() != nil && !pp.parser.IsAuthenticated() { return nil, fmt.Errorf("file needs to be decrypted first") } if len(pp.pageList) < pageNumber { return nil, errors.New("invalid page number (page count too short)") } idx := pageNumber - 1 if idx < 0 { return nil, fmt.Errorf("page numbering must start at 1") } page := pp.PageList[idx] return page, nil } // GetOCProperties returns the optional content properties PdfObject. func (pp *PdfReader) GetOCProperties() (core.PdfObject, error) { dict := pp.catalog obj := dict.Get("OCProperties") var err error obj, err = pp.traceToObject(obj) if err != nil { return nil, err } // Resolve all references... // Should be pretty safe. Should not be referencing to pages or // any large structures. Local structures and references // to OC Groups. err = pp.traverseObjectData(obj) if err != nil { return nil, err } return obj, nil } // Inspect inspects the object types, subtypes and content in the PDF file returning a map of // object type to number of instances of each. 
func (pp *PdfReader) Inspect() (map[string]int, error) {
	return pp.parser.Inspect()
}

// GetObjectNums returns the object numbers of the PDF objects in the file.
// Numbered objects are either indirect objects or stream objects.
// e.g. objNums := pdfReader.GetObjectNums()
// The underlying objects can then be accessed with
// pdfReader.GetIndirectObjectByNumber(objNums[0]) for the first available object.
func (pp *PdfReader) GetObjectNums() []int {
	return pp.parser.GetObjectNums()
}

// GetIndirectObjectByNumber looks up a PdfObject by its object number via the parser and
// returns the parser's result unchanged.
func (pp *PdfReader) GetIndirectObjectByNumber(number int) (core.PdfObject, error) {
	return pp.parser.LookupByNumber(number)
}

// GetTrailer returns the PDF's trailer dictionary, or an error when the parser has none.
func (pp *PdfReader) GetTrailer() (*core.PdfObjectDictionary, error) {
	if trailerDict := pp.parser.GetTrailer(); trailerDict != nil {
		return trailerDict, nil
	}
	return nil, errors.New("trailer missing")
}