Files
pdfmerge/internal/pdf/model/writer.go
2025-12-15 17:44:00 +01:00

640 lines
17 KiB
Go

package model
import (
"bufio"
"crypto/md5"
"crypto/rand"
"errors"
"fmt"
"io"
"math"
"os"
"time"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
)
// pdfCreator is the application name placed in the "Creator" entry of the
// document information dictionary. It stays empty until SetPdfCreator is
// called.
var pdfCreator string

// getPdfCreator returns the creator string configured through SetPdfCreator,
// or the (empty) default when none has been configured.
func getPdfCreator() string {
	if pdfCreator == "" {
		// Nothing configured: return default.
		return ""
	}
	return pdfCreator
}

// SetPdfCreator sets the creator string that NewPdfWriter stores in the info
// dictionary of writers created afterwards.
func SetPdfCreator(creator string) {
	pdfCreator = creator
}
// PdfWriter collects PDF objects (pages, outlines, forms, encryption data)
// and serializes them, followed by a cross-reference table and trailer, when
// Write is called. Instances are set up by NewPdfWriter.
type PdfWriter struct {
// Indirect object holding the document catalog; referenced as "Root" in the trailer.
root *core.PdfIndirectObject
// Indirect object holding the page tree root; AddPage appends to its "Kids"
// array and makes it the "Parent" of every added page.
pages *core.PdfIndirectObject
// Objects to be written, in insertion order. The object number assigned at
// write time is the slice index + 1.
objects []core.PdfObject
objectsMap map[core.PdfObject]bool // Quick lookup table.
// Buffered writer around the output; assigned at the start of Write.
writer *bufio.Writer
// Outline tree to emit under the catalog's "Outlines" key; nil when unset.
outlineTree *PdfOutlineTreeNode
// Dictionary of the catalog object held in root.
catalog *core.PdfObjectDictionary
// Document information object; referenced as "Info" in the trailer.
infoObj *core.PdfIndirectObject
// Encryption
// State produced by Encrypt. A nil crypter means the output is not encrypted.
crypter *core.PdfCrypt
encryptDict *core.PdfObjectDictionary
// Indirect object wrapping encryptDict; it is written unencrypted and
// referenced as "Encrypt" in the trailer.
encryptObj *core.PdfIndirectObject
// File identifier pair written as "ID" in the trailer of encrypted output.
ids *core.PdfObjectArray
// PDF version
// Written in the file header and in the catalog's "Version" entry.
majorVersion int
minorVersion int
// Objects to be followed up on prior to writing.
// These are objects that are added and reference objects that are not included
// for writing.
// The map stores the object and the dictionary it is contained in.
// Only way so we can access the dictionary entry later.
pendingObjects map[core.PdfObject]*core.PdfObjectDictionary
// Forms.
// Interactive form to emit under the catalog's "AcroForm" key; nil when unset.
acroForm *PdfAcroForm
}
// NewPdfWriter returns a writer pre-populated with the three objects every
// output file needs, registered in this order: the document information
// dictionary, the root catalog and an empty page tree. The PDF version
// defaults to 1.3 and can be changed with SetVersion.
func NewPdfWriter() PdfWriter {
	w := PdfWriter{
		objectsMap:     map[core.PdfObject]bool{},
		objects:        []core.PdfObject{},
		pendingObjects: map[core.PdfObject]*core.PdfObjectDictionary{},
		// PDF version. Can be raised when more advanced features are used.
		majorVersion: 1,
		minorVersion: 3,
	}

	// Creation info.
	info := core.MakeDict()
	info.Set("Producer", core.MakeString(""))
	info.Set("Creator", core.MakeString(getPdfCreator()))
	w.infoObj = &core.PdfIndirectObject{PdfObject: info}
	w.addObject(w.infoObj)

	// Root catalog.
	catalogDict := core.MakeDict()
	catalogDict.Set("Type", core.MakeName("Catalog"))
	catalogObj := &core.PdfIndirectObject{PdfObject: catalogDict}
	w.root = catalogObj
	w.addObject(catalogObj)

	// Page tree root, initially without kids.
	pagesDict := core.MakeDict()
	pagesDict.Set("Type", core.MakeName("Pages"))
	pagesDict.Set("Kids", &core.PdfObjectArray{})
	pagesDict.Set("Count", core.MakeInteger(0))
	pagesObj := &core.PdfIndirectObject{PdfObject: pagesDict}
	w.pages = pagesObj
	w.addObject(pagesObj)

	// Link the catalog to the page tree.
	catalogDict.Set("Pages", pagesObj)
	w.catalog = catalogDict

	common.Log.Trace("Catalog %s", *catalogObj)
	return w
}
// SetVersion sets the PDF version (major.minor) that Write puts in the file
// header and in the catalog's "Version" entry.
func (pw *PdfWriter) SetVersion(majorVersion, minorVersion int) {
	pw.majorVersion, pw.minorVersion = majorVersion, minorVersion
}
// SetOCProperties stores the optional content properties in the catalog under
// the "OCProperties" key and registers every object reachable from them for
// writing.
//
// A nil ocProperties is a no-op and returns nil. Otherwise the result of
// registering the object graph is returned, so that an unresolved reference
// inside ocProperties is reported to the caller instead of being dropped.
func (pw *PdfWriter) SetOCProperties(ocProperties core.PdfObject) error {
	if ocProperties == nil {
		return nil
	}

	common.Log.Trace("Setting OC Properties...")
	pw.catalog.Set("OCProperties", ocProperties)

	// Any risk of infinite loops?
	return pw.addObjects(ocProperties)
}
// objectIndex returns the membership set mirroring pw.objects. The set is
// rebuilt from pw.objects (the source of truth) whenever it is missing or its
// size disagrees with the slice, which covers a zero-value PdfWriter as well
// as objects appended to the slice without going through addObject.
func (pw *PdfWriter) objectIndex() map[core.PdfObject]bool {
	if pw.objectsMap == nil || len(pw.objectsMap) != len(pw.objects) {
		index := make(map[core.PdfObject]bool, len(pw.objects))
		for _, o := range pw.objects {
			index[o] = true
		}
		pw.objectsMap = index
	}
	return pw.objectsMap
}

// hasObject reports whether obj has already been registered for writing.
// The check is a map lookup rather than a scan of pw.objects, so registering
// n objects no longer costs O(n^2) comparisons.
func (pw *PdfWriter) hasObject(obj core.PdfObject) bool {
	return pw.objectIndex()[obj]
}

// addObject appends obj to the list of objects to write.
// It returns true if obj was added and false if it had been added before.
func (pw *PdfWriter) addObject(obj core.PdfObject) bool {
	index := pw.objectIndex()
	if index[obj] {
		return false
	}
	pw.objects = append(pw.objects, obj)
	index[obj] = true
	return true
}
// addObjects walks the object graph rooted at obj and registers every
// indirect object and stream it finds for writing.
//
// Dictionary entries are followed recursively, except for "Parent": a parent
// is never descended into (that would walk back up the tree). A non-null
// parent that has not been registered yet is remembered in pendingObjects so
// that Write can deal with it if it never gets added.
//
// An error is returned when an unresolved reference or a nil array is
// encountered.
func (pw *PdfWriter) addObjects(obj core.PdfObject) error {
	common.Log.Trace("Adding objects!")

	switch t := obj.(type) {
	case *core.PdfIndirectObject:
		common.Log.Trace("Indirect")
		common.Log.Trace("- %s (%p)", obj, t)
		common.Log.Trace("- %s", t.PdfObject)
		if !pw.addObject(t) {
			// Seen before; its contents were already traversed.
			return nil
		}
		return pw.addObjects(t.PdfObject)

	case *core.PdfObjectStream:
		common.Log.Trace("Stream")
		common.Log.Trace("- %s %p", obj, obj)
		if !pw.addObject(t) {
			return nil
		}
		return pw.addObjects(t.PdfObjectDictionary)

	case *core.PdfObjectDictionary:
		common.Log.Trace("Dict")
		common.Log.Trace("- %s", obj)
		for _, key := range t.Keys() {
			val := t.Get(key)
			common.Log.Trace("Key %s", key)

			if key != "Parent" {
				if err := pw.addObjects(val); err != nil {
					return err
				}
				continue
			}

			if _, isNull := val.(*core.PdfObjectNull); isNull {
				// Parent is null. We can ignore it.
				continue
			}

			if !pw.hasObject(val) {
				common.Log.Debug("Parent obj is missing!! %T %p %v", val, val, val)
				// Although it is missing at this point, it could be added later...
				pw.pendingObjects[val] = t
			}

			// A parent that is still a reference could point outside the scope
			// of the output document; the reader should have resolved it.
			if ref, isRef := val.(*core.PdfObjectReference); isRef {
				common.Log.Debug("error: Parent is a reference object - Cannot be in writer (needs to be resolved)")
				return fmt.Errorf("parent is a reference object - Cannot be in writer (needs to be resolved) - %s", ref)
			}
		}
		return nil

	case *core.PdfObjectArray:
		common.Log.Trace("Array")
		common.Log.Trace("- %s", obj)
		if t == nil {
			return errors.New("array is nil")
		}
		for _, elem := range *t {
			if err := pw.addObjects(elem); err != nil {
				return err
			}
		}
		return nil

	case *core.PdfObjectReference:
		// Should never be a reference, should already be resolved.
		common.Log.Debug("error: Cannot be a reference!")
		return errors.New("reference not allowed")
	}

	// Any other object has nothing to traverse.
	return nil
}
// AddPage appends page to the writer's page tree.
//
// The page must convert to an indirect object whose dictionary has
// Type == "Page". Attributes the page inherits from its original ancestors
// (Resources, MediaBox, CropBox, Rotate) are copied onto the page itself where
// missing, because the page is re-parented to the writer's own Pages node.
// The page object is reused and modified in place.
func (pw *PdfWriter) AddPage(page *PdfPage) error {
	obj := page.ToPdfObject()
	common.Log.Trace("==========")
	common.Log.Trace("Appending to page list %T", obj)

	pageObj, isIndirect := obj.(*core.PdfIndirectObject)
	if !isIndirect {
		return errors.New("page should be an indirect object")
	}
	common.Log.Trace("%s", pageObj)
	common.Log.Trace("%s", pageObj.PdfObject)

	pageDict, isDict := pageObj.PdfObject.(*core.PdfObjectDictionary)
	if !isDict {
		return errors.New("page object should be a dictionary")
	}

	typeName, isName := pageDict.Get("Type").(*core.PdfObjectName)
	if !isName {
		return fmt.Errorf("page should have a Type key with a value of type name (%T)", pageDict.Get("Type"))
	}
	if *typeName != "Page" {
		return errors.New("type != Page (Required)")
	}

	// Walk up the original ancestor chain, copying inheritable fields that the
	// page does not define itself. The nearest ancestor wins because a field
	// is only copied while the page still lacks it.
	inheritable := []core.PdfObjectName{"Resources", "MediaBox", "CropBox", "Rotate"}
	ancestor, more := pageDict.Get("Parent").(*core.PdfIndirectObject)
	common.Log.Trace("Page Parent: %T (%v)", pageDict.Get("Parent"), more)
	for more {
		common.Log.Trace("Page Parent: %T", ancestor)
		ancestorDict, valid := ancestor.PdfObject.(*core.PdfObjectDictionary)
		if !valid {
			return errors.New("invalid Parent object")
		}

		for _, name := range inheritable {
			common.Log.Trace("Field %s", name)
			if pageDict.Get(name) != nil {
				common.Log.Trace("- page has already")
				continue
			}
			inherited := ancestorDict.Get(name)
			if inherited == nil {
				continue
			}
			// Parent has the field. Inherit, pass to the new page.
			common.Log.Trace("Inheriting field %s", name)
			pageDict.Set(name, inherited)
		}

		ancestor, more = ancestorDict.Get("Parent").(*core.PdfIndirectObject)
		common.Log.Trace("Next parent: %T", ancestorDict.Get("Parent"))
	}
	common.Log.Trace("Traversal done")

	// Re-parent the page to the writer's page tree, reusing the input object.
	pageDict.Set("Parent", pw.pages)
	pageObj.PdfObject = pageDict

	// Add to Pages.
	pagesDict, isDict := pw.pages.PdfObject.(*core.PdfObjectDictionary)
	if !isDict {
		return errors.New("invalid Pages obj (not a dict)")
	}
	kids, isArray := pagesDict.Get("Kids").(*core.PdfObjectArray)
	if !isArray {
		return errors.New("invalid Pages Kids obj (not an array)")
	}
	*kids = append(*kids, pageObj)

	pageCount, isInt := pagesDict.Get("Count").(*core.PdfObjectInteger)
	if !isInt {
		return errors.New("invalid Pages Count object (not an integer)")
	}
	// Update the count.
	*pageCount++

	pw.addObject(pageObj)

	// Traverse the page and record all object references.
	return pw.addObjects(pageDict)
}
// AddOutlineTree sets the outline tree to include in the output, replacing
// any tree set earlier. Nothing is added to the object list here: Write
// converts the tree to a PDF object, stores it under the catalog's "Outlines"
// key and registers its objects.
func (pw *PdfWriter) AddOutlineTree(outlineTree *PdfOutlineTreeNode) {
pw.outlineTree = outlineTree
}
// SetForms sets the interactive form (AcroForm) to include in the output,
// replacing any form set earlier. The form is only converted and registered
// when Write is called, where it is stored under the catalog's "AcroForm"
// key. The returned error is always nil.
func (pw *PdfWriter) SetForms(form *PdfAcroForm) error {
pw.acroForm = form
return nil
}
// writeObject serializes obj to pw.writer using num as its object number and
// 0 as its generation number.
//
// Indirect objects are wrapped in "N 0 obj ... endobj"; streams additionally
// emit their raw bytes between "stream" and "endstream". Any other object is
// written as its plain string form without a wrapper. Write errors are not
// reported here.
func (pw *PdfWriter) writeObject(num int, obj core.PdfObject) {
	common.Log.Trace("Write obj #%d\n", num)

	switch o := obj.(type) {
	case *core.PdfIndirectObject:
		body := o.PdfObject.DefaultWriteString()
		pw.writer.WriteString(fmt.Sprintf("%d 0 obj\n%s\nendobj\n", num, body))

	case *core.PdfObjectStream:
		// XXX/TODO: Add a default encoder if Filter not specified?
		// Still need to make sure is encrypted.
		dict := o.PdfObjectDictionary.DefaultWriteString()
		pw.writer.WriteString(fmt.Sprintf("%d 0 obj\n%s\nstream\n", num, dict))
		pw.writer.Write(o.Stream)
		pw.writer.WriteString("\nendstream\nendobj\n")

	default:
		pw.writer.WriteString(obj.DefaultWriteString())
	}
}
// updateObjectNumbers renumbers every indirect object and stream prior to
// writing: the object at index i of pw.objects gets object number i+1 and
// generation number 0. Objects of any other kind are left untouched.
func (pw *PdfWriter) updateObjectNumbers() {
	for i := range pw.objects {
		num := int64(i + 1)
		switch o := pw.objects[i].(type) {
		case *core.PdfIndirectObject:
			o.ObjectNumber = num
			o.GenerationNumber = 0
		case *core.PdfObjectStream:
			o.ObjectNumber = num
			o.GenerationNumber = 0
		}
	}
}
// EncryptOptions holds the optional settings accepted by PdfWriter.Encrypt.
// When Encrypt is given nil options it uses RC4_128bit and leaves the
// permission value at math.MaxUint32.
type EncryptOptions struct {
// Permissions supplies the value written to the "P" entry of the encryption
// dictionary (obtained through GetP).
Permissions core.AccessPermissions
// Algorithm selects the encryption scheme; the zero value is RC4_128bit.
Algorithm EncryptionAlgorithm
}
// EncryptionAlgorithm is used in EncryptOptions to change the default algorithm used to encrypt the document.
type EncryptionAlgorithm int
// Supported encryption algorithms. RC4_128bit is the zero value and therefore
// the default of an EncryptOptions that does not set Algorithm.
const (
// RC4_128bit uses RC4 encryption (128 bit)
RC4_128bit = EncryptionAlgorithm(iota)
// AES_128bit uses AES encryption (128 bit, PDF 1.6)
AES_128bit
// AES_256bit uses AES encryption (256 bit, PDF 2.0)
AES_256bit
)
// Encrypt configures the writer to encrypt the output with the given user and
// owner passwords. It builds the encryption dictionary and the trailer file
// identifiers and registers the encryption dictionary for writing; the actual
// encryption of objects happens in Write.
//
// options may be nil, in which case RC4 (128 bit) is used and the permission
// value is left at math.MaxUint32. Selecting AES_128bit sets the PDF version
// to 1.5 and AES_256bit sets it to 2.0.
//
// An error is returned for an unknown algorithm, when random data for the
// file identifier cannot be obtained, or when deriving the encryption
// parameters fails. The writer is only modified once everything has
// succeeded, so a failed call does not leave it half-configured for
// encryption.
func (pw *PdfWriter) Encrypt(userPass, ownerPass []byte, options *EncryptOptions) error {
	algo := RC4_128bit
	if options != nil {
		algo = options.Algorithm
	}

	crypter := &core.PdfCrypt{}
	crypter.EncryptedObjects = map[core.PdfObject]bool{}
	crypter.CryptFilters = core.CryptFilters{}

	// The version is applied at the very end, together with the rest of the
	// encryption state.
	major, minor := pw.majorVersion, pw.minorVersion

	var cf core.CryptFilter
	switch algo {
	case RC4_128bit:
		crypter.V = 2
		crypter.R = 3
		cf = core.NewCryptFilterV2(16)
	case AES_128bit:
		major, minor = 1, 5
		crypter.V = 4
		crypter.R = 4
		cf = core.NewCryptFilterAESV2()
	case AES_256bit:
		major, minor = 2, 0
		crypter.V = 5
		crypter.R = 6 // TODO(dennwc): a way to set R=5?
		cf = core.NewCryptFilterAESV3()
	default:
		return fmt.Errorf("unsupported algorithm: %v", algo)
	}
	// cf.Length is multiplied by 8 for the dictionary's "Length" entry.
	crypter.Length = cf.Length * 8

	const (
		defaultFilter = core.StandardCryptFilter
	)
	crypter.CryptFilters[defaultFilter] = cf
	if crypter.V >= 4 {
		crypter.StreamFilter = defaultFilter
		crypter.StringFilter = defaultFilter
	}

	// Permissions: everything allowed unless options say otherwise.
	crypter.P = math.MaxUint32
	crypter.EncryptMetadata = true
	if options != nil {
		crypter.P = int(options.Permissions.GetP())
	}

	// Generate the encryption dictionary.
	ed := core.MakeDict()
	ed.Set("Filter", core.MakeName("Standard"))
	ed.Set("P", core.MakeInteger(int64(crypter.P)))
	ed.Set("V", core.MakeInteger(int64(crypter.V)))
	ed.Set("R", core.MakeInteger(int64(crypter.R)))
	ed.Set("Length", core.MakeInteger(int64(crypter.Length)))

	// Prepare the ID object for the trailer: the first identifier is derived
	// from the current time, the second from random bytes.
	hashcode := md5.Sum([]byte(time.Now().Format(time.RFC850)))
	id0 := core.PdfObjectString(hashcode[:])
	b := make([]byte, 100)
	if _, err := rand.Read(b); err != nil {
		return fmt.Errorf("generating file identifier: %w", err)
	}
	hashcode = md5.Sum(b)
	id1 := core.PdfObjectString(hashcode[:])
	common.Log.Trace("Random b: % x", b)
	ids := &core.PdfObjectArray{&id0, &id1}
	common.Log.Trace("Gen Id 0: % x", id0)

	// Generate encryption parameters.
	if crypter.R < 5 {
		crypter.Id0 = string(id0)

		// Make the O and U objects.
		ownerEntry, err := crypter.Alg3(userPass, ownerPass)
		if err != nil {
			common.Log.Debug("error: Error generating O for encryption (%s)", err)
			return err
		}
		crypter.O = []byte(ownerEntry)
		common.Log.Trace("gen O: % x", ownerEntry)

		userEntry, key, err := crypter.Alg5(userPass)
		if err != nil {
			common.Log.Debug("error: Error generating U for encryption (%s)", err)
			return err
		}
		common.Log.Trace("gen U: % x", userEntry)
		crypter.U = []byte(userEntry)
		crypter.EncryptionKey = key

		ed.Set("O", &ownerEntry)
		ed.Set("U", &userEntry)
	} else { // R >= 5
		if err := crypter.GenerateParams(userPass, ownerPass); err != nil {
			return err
		}
		ed.Set("O", core.MakeString(string(crypter.O)))
		ed.Set("U", core.MakeString(string(crypter.U)))
		ed.Set("OE", core.MakeString(string(crypter.OE)))
		ed.Set("UE", core.MakeString(string(crypter.UE)))
		ed.Set("EncryptMetadata", core.MakeBool(crypter.EncryptMetadata))
		if crypter.R > 5 {
			ed.Set("Perms", core.MakeString(string(crypter.Perms)))
		}
	}
	if crypter.V >= 4 {
		if err := crypter.SaveCryptFilters(ed); err != nil {
			return err
		}
	}

	// Everything succeeded: commit the encryption state to the writer and
	// register the object that contains the encryption dictionary.
	encObj := core.MakeIndirectObject(ed)
	pw.SetVersion(major, minor)
	pw.crypter = crypter
	pw.encryptDict = ed
	pw.ids = ids
	pw.encryptObj = encObj
	pw.addObject(encObj)
	return nil
}
// flushedOffset flushes the buffered writer and returns the current position
// of ws, i.e. the byte offset at which the next write will land.
func (pw *PdfWriter) flushedOffset(ws io.Seeker) (int64, error) {
	if err := pw.writer.Flush(); err != nil {
		return 0, fmt.Errorf("flushing output: %w", err)
	}
	// os.SEEK_CUR is deprecated in favour of io.SeekCurrent; it is retained
	// because it appears to be this file's only use of the os import.
	offset, err := ws.Seek(0, os.SEEK_CUR)
	if err != nil {
		return 0, fmt.Errorf("querying output position: %w", err)
	}
	return offset, nil
}

// Write serializes the document to ws: header, all registered objects
// (encrypted first when Encrypt was called), the cross-reference table and
// the trailer.
//
// Before writing, the outline tree and AcroForm (if set) are attached to the
// catalog and registered, and any "Parent" entry that points to an object
// which was never registered is replaced with null.
//
// An error is returned when registering outlines or forms fails, when
// encrypting an object fails, or when writing to / seeking in ws fails.
// Individual buffered writes are not checked: a bufio.Writer keeps its first
// write error and reports it from every later Flush, and each Flush is
// checked.
func (pw *PdfWriter) Write(ws io.WriteSeeker) error {
	// Outlines.
	if pw.outlineTree != nil {
		common.Log.Trace("OutlineTree: %+v", pw.outlineTree)
		outlines := pw.outlineTree.ToPdfObject()
		common.Log.Trace("Outlines: %+v (%T, p:%p)", outlines, outlines, outlines)
		pw.catalog.Set("Outlines", outlines)
		if err := pw.addObjects(outlines); err != nil {
			return err
		}
	}

	// Form fields.
	if pw.acroForm != nil {
		common.Log.Trace("Writing acro forms")
		indObj := pw.acroForm.ToPdfObject()
		common.Log.Trace("AcroForm: %+v", indObj)
		pw.catalog.Set("AcroForm", indObj)
		if err := pw.addObjects(indObj); err != nil {
			return err
		}
	}

	// Check pending objects prior to write: a parent that never made it into
	// the object list would be unresolvable in the output, so null it out.
	for pendingObj, pendingObjDict := range pw.pendingObjects {
		if pw.hasObject(pendingObj) {
			continue
		}
		common.Log.Debug("error Pending object %+v %T (%p) never added for writing", pendingObj, pendingObj, pendingObj)
		for _, key := range pendingObjDict.Keys() {
			if pendingObjDict.Get(key) == pendingObj {
				common.Log.Debug("Pending object found! and replaced with null")
				pendingObjDict.Set(key, core.MakeNull())
				break
			}
		}
	}

	// Set version in the catalog.
	pw.catalog.Set("Version", core.MakeName(fmt.Sprintf("%d.%d", pw.majorVersion, pw.minorVersion)))

	w := bufio.NewWriter(ws)
	pw.writer = w

	// Header, followed by a comment line of non-ASCII characters.
	w.WriteString(fmt.Sprintf("%%PDF-%d.%d\n", pw.majorVersion, pw.minorVersion))
	w.WriteString("%âãÏÓ\n")

	pw.updateObjectNumbers()

	// Write objects, recording the byte offset of each for the xref table.
	offsets := make([]int64, 0, len(pw.objects))
	common.Log.Trace("Writing %d obj", len(pw.objects))
	for idx, obj := range pw.objects {
		common.Log.Trace("Writing %d", idx)

		offset, err := pw.flushedOffset(ws)
		if err != nil {
			return err
		}
		offsets = append(offsets, offset)

		// Encrypt prior to writing.
		// Encrypt dictionary should not be encrypted.
		if pw.crypter != nil && obj != pw.encryptObj {
			if err := pw.crypter.Encrypt(obj, int64(idx+1), 0); err != nil {
				common.Log.Debug("error: Failed encrypting (%s)", err)
				return err
			}
		}
		pw.writeObject(idx+1, obj)
	}

	xrefOffset, err := pw.flushedOffset(ws)
	if err != nil {
		return err
	}

	// Write xref table: one subsection covering object 0 (the free-list head)
	// plus all written objects. Every entry is exactly 20 bytes long.
	w.WriteString("xref\r\n")
	w.WriteString(fmt.Sprintf("%d %d\r\n", 0, len(pw.objects)+1))
	w.WriteString(fmt.Sprintf("%.10d %.5d f\r\n", 0, 65535))
	for _, offset := range offsets {
		w.WriteString(fmt.Sprintf("%.10d %.5d n\r\n", offset, 0))
	}

	// Generate & write trailer.
	trailer := core.MakeDict()
	trailer.Set("Info", pw.infoObj)
	trailer.Set("Root", pw.root)
	trailer.Set("Size", core.MakeInteger(int64(len(pw.objects)+1)))
	// If encrypted!
	if pw.crypter != nil {
		trailer.Set("Encrypt", pw.encryptObj)
		trailer.Set("ID", pw.ids)
		common.Log.Trace("Ids: %s", pw.ids)
	}
	w.WriteString("trailer\n")
	w.WriteString(trailer.DefaultWriteString())
	w.WriteString("\n")

	// Make offset reference.
	w.WriteString(fmt.Sprintf("startxref\n%d\n", xrefOffset))
	// WriteString does no formatting: both percent signs of the end-of-file
	// marker are written out.
	w.WriteString("%%EOF\n")

	if err := w.Flush(); err != nil {
		return fmt.Errorf("flushing output: %w", err)
	}
	return nil
}