Files
pdfmerge/internal/pdf/contentstream/contentstream.go
Adrian Zürcher ef0778c8b3 some improvments
2026-01-01 11:00:23 +01:00

199 lines
4.3 KiB
Go

package contentstream
import (
"bytes"
"fmt"
"strings"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
)
// ContentStreamOperation is a single content stream instruction: a list of
// parameters together with the name of the operation they belong to.
type ContentStreamOperation struct {
// Params holds the parameters of the operation. Bytes writes them in slice
// order, each followed by a space, before the operation name.
Params []core.PdfObject
// Operand is the operation name, e.g. "q", "Q", "BT" or "BI". Note that the
// field is called Operand although it carries the name that is written
// after the parameters.
Operand string
}
// ContentStreamOperations is an ordered list of content stream operations.
// Elements are pointers; Bytes skips nil elements when serializing.
type ContentStreamOperations []*ContentStreamOperation
// isWrapped reports whether the operations are fully enclosed in balanced
// q ... Q pairs.
//
// It returns true only when all of the following hold:
//   - the list has at least two entries,
//   - every operation other than "q" and "Q" occurs at nesting depth >= 1,
//   - the depth never drops below zero (no "Q" before its matching "q"),
//   - the depth is back at zero after the last operation.
//
// Nil entries are skipped, in line with how Bytes treats them.
func (s *ContentStreamOperations) isWrapped() bool {
	if len(*s) < 2 {
		return false
	}

	depth := 0
	for _, op := range *s {
		if op == nil {
			continue
		}
		switch op.Operand {
		case "q":
			depth++
		case "Q":
			depth--
			if depth < 0 {
				// A restore without a preceding save. A sequence such as
				// "Q q" nets out to zero but is not a wrapped stream.
				return false
			}
		default:
			if depth < 1 {
				// Content outside of any q ... Q pair.
				return false
			}
		}
	}

	// Every save must have been matched by a restore.
	return depth == 0
}
// WrapIfNeeded encloses the whole operation list in a q ... Q pair so that
// content added afterwards starts from the default graphics state.
//
// An empty list, or one that isWrapped already accepts, is returned untouched.
// Otherwise a "q" is inserted at the front and one "Q" is appended for every
// "q" that is still unmatched after that insertion. The receiver is modified
// in place and returned to allow chaining.
func (s *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
	if len(*s) == 0 || s.isWrapped() {
		// Nothing to wrap, or wrapping is already in place.
		return s
	}

	// Build the new list with the leading "q" in front of the existing content.
	ops := make(ContentStreamOperations, 0, len(*s)+2)
	ops = append(ops, &ContentStreamOperation{Operand: "q"})
	ops = append(ops, *s...)
	*s = ops

	// Net number of saves that have no matching restore.
	unmatched := 0
	for _, op := range ops {
		if op.Operand == "q" {
			unmatched++
		} else if op.Operand == "Q" {
			unmatched--
		}
	}

	// Close whatever is still open; a zero or negative balance adds nothing.
	for i := 0; i < unmatched; i++ {
		*s = append(*s, &ContentStreamOperation{Operand: "Q"})
	}
	return s
}
// Bytes serializes the operations into content stream syntax, i.e. the byte
// representation that can be stored in a PDF stream or string.
//
// Each regular operation is written as its parameters, every one followed by
// a single space, then the operation name and a newline. A "BI" (inline
// image) operation is written as "BI" and a newline followed by the output of
// its first parameter; if it has no parameters only "BI" and the newline are
// written. Nil entries in the list are skipped.
func (s *ContentStreamOperations) Bytes() []byte {
	var buf bytes.Buffer

	for _, op := range *s {
		if op == nil {
			continue
		}

		if op.Operand == "BI" {
			// Inline image: the operation name comes first and the first
			// parameter supplies everything that follows it.
			buf.WriteString(op.Operand)
			buf.WriteByte('\n')
			// Guard against a malformed BI without parameters instead of
			// panicking with an index out of range.
			if len(op.Params) > 0 {
				buf.WriteString(op.Params[0].DefaultWriteString())
			}
			continue
		}

		// Default form: parameters first, operation name last.
		for _, param := range op.Params {
			buf.WriteString(param.DefaultWriteString())
			buf.WriteByte(' ')
		}
		buf.WriteString(op.Operand)
		buf.WriteByte('\n')
	}

	return buf.Bytes()
}
// ExtractText parses the content stream and returns the text shown by its
// "Tj" and "TJ" operations as one string. No encoding table is applied; the
// output consists of the raw character codes.
//
// Layout is approximated as follows:
//   - "Td", "TD" and "T*" each add a newline;
//   - "Tm" with six numeric parameters adds a newline when its y value (sixth
//     parameter) is lower than the remembered one, otherwise a tab when its x
//     value (fifth parameter) is greater than the remembered one;
//   - inside a "TJ" array, a number below -100 adds a space.
//
// "Tj" and "TJ" are only honoured between "BT" and "ET". An error is returned
// when parsing fails or when the first parameter of "Tj"/"TJ" has an
// unexpected type.
//
// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
func (s *ContentStreamParser) ExtractText() (string, error) {
	ops, err := s.Parse()
	if err != nil {
		return "", err
	}

	// number reads a float or integer object as float64; ok is false for any
	// other object type.
	number := func(obj core.PdfObject) (val float64, ok bool) {
		switch n := obj.(type) {
		case *core.PdfObjectFloat:
			return float64(*n), true
		case *core.PdfObjectInteger:
			return float64(*n), true
		}
		return 0, false
	}

	var (
		out        strings.Builder
		insideText bool
		// -1 marks "no position seen yet".
		lastX = float64(-1)
		lastY = float64(-1)
	)

	for _, op := range *ops {
		switch op.Operand {
		case "BT":
			insideText = true

		case "ET":
			insideText = false

		case "Td", "TD", "T*":
			// Move to next line...
			out.WriteString("\n")

		case "Tm":
			if len(op.Params) != 6 {
				continue
			}
			x, ok := number(op.Params[4])
			if !ok {
				continue
			}
			y, ok := number(op.Params[5])
			if !ok {
				continue
			}

			if lastY == -1 {
				lastY = y
			} else if lastY > y {
				// Lower y than before: treat as a new line and restart the
				// column tracking from this position.
				out.WriteString("\n")
				lastX, lastY = x, y
				continue
			}

			if lastX == -1 {
				lastX = x
			} else if lastX < x {
				// Further to the right than before: treat as a new column.
				out.WriteString("\t")
				lastX = x
			}

		case "TJ":
			if !insideText || len(op.Params) < 1 {
				continue
			}
			items, ok := op.Params[0].(*core.PdfObjectArray)
			if !ok {
				return "", fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
			}
			for _, item := range *items {
				switch v := item.(type) {
				case *core.PdfObjectString:
					out.WriteString(string(*v))
				case *core.PdfObjectFloat:
					if *v < -100 {
						out.WriteString(" ")
					}
				case *core.PdfObjectInteger:
					if *v < -100 {
						out.WriteString(" ")
					}
				}
			}

		case "Tj":
			if !insideText || len(op.Params) < 1 {
				continue
			}
			str, ok := op.Params[0].(*core.PdfObjectString)
			if !ok {
				return "", fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
			}
			out.WriteString(string(*str))
		}
	}

	return out.String(), nil
}