198 lines
4.2 KiB
Go
198 lines
4.2 KiB
Go
package contentstream
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
|
|
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
|
)
|
|
|
|
type ContentStreamOperation struct {
|
|
Params []core.PdfObject
|
|
Operand string
|
|
}
|
|
|
|
type ContentStreamOperations []*ContentStreamOperation
|
|
|
|
// Check if the content stream operations are fully wrapped (within q ... Q)
|
|
func (s *ContentStreamOperations) isWrapped() bool {
|
|
if len(*s) < 2 {
|
|
return false
|
|
}
|
|
|
|
depth := 0
|
|
for _, op := range *s {
|
|
switch op.Operand {
|
|
case "q":
|
|
depth++
|
|
case "Q":
|
|
depth--
|
|
default:
|
|
if depth < 1 {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Should end at depth == 0
|
|
return depth == 0
|
|
}
|
|
|
|
// Wrap entire contents within q ... Q. If unbalanced, then adds extra Qs at the end.
|
|
// Only does if needed. Ensures that when adding new content, one start with all states
|
|
// in the default condition.
|
|
func (s *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
|
|
if len(*s) == 0 {
|
|
// No need to wrap if empty.
|
|
return s
|
|
}
|
|
if s.isWrapped() {
|
|
return s
|
|
}
|
|
|
|
*s = append([]*ContentStreamOperation{{Operand: "q"}}, *s...)
|
|
|
|
depth := 0
|
|
for _, op := range *s {
|
|
switch op.Operand {
|
|
case "q":
|
|
depth++
|
|
case "Q":
|
|
depth--
|
|
}
|
|
}
|
|
|
|
for depth > 0 {
|
|
*s = append(*s, &ContentStreamOperation{Operand: "Q"})
|
|
depth--
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
// Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be
|
|
// stored as a PDF stream or string format.
|
|
func (s *ContentStreamOperations) Bytes() []byte {
|
|
var buf bytes.Buffer
|
|
|
|
for _, op := range *s {
|
|
if op == nil {
|
|
continue
|
|
}
|
|
|
|
if op.Operand == "BI" {
|
|
// Inline image requires special handling.
|
|
buf.WriteString(op.Operand + "\n")
|
|
buf.WriteString(op.Params[0].DefaultWriteString())
|
|
|
|
} else {
|
|
// Default handler.
|
|
for _, param := range op.Params {
|
|
buf.WriteString(param.DefaultWriteString())
|
|
buf.WriteString(" ")
|
|
|
|
}
|
|
|
|
buf.WriteString(op.Operand + "\n")
|
|
}
|
|
}
|
|
|
|
return buf.Bytes()
|
|
}
|
|
|
|
// ExtractText parses and extracts all text data in content streams and returns as a string.
|
|
// Does not take into account Encoding table, the output is simply the character codes.
|
|
//
|
|
// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
|
|
func (s *ContentStreamParser) ExtractText() (string, error) {
|
|
operations, err := s.Parse()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
inText := false
|
|
xPos, yPos := float64(-1), float64(-1)
|
|
txt := ""
|
|
for _, op := range *operations {
|
|
switch op.Operand {
|
|
case "BT":
|
|
inText = true
|
|
case "ET":
|
|
inText = false
|
|
}
|
|
|
|
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
|
|
// Move to next line...
|
|
txt += "\n"
|
|
}
|
|
if op.Operand == "Tm" {
|
|
if len(op.Params) != 6 {
|
|
continue
|
|
}
|
|
xfloat, ok := op.Params[4].(*core.PdfObjectFloat)
|
|
if !ok {
|
|
xint, ok := op.Params[4].(*core.PdfObjectInteger)
|
|
if !ok {
|
|
continue
|
|
}
|
|
xfloat = core.MakeFloat(float64(*xint))
|
|
}
|
|
yfloat, ok := op.Params[5].(*core.PdfObjectFloat)
|
|
if !ok {
|
|
yint, ok := op.Params[5].(*core.PdfObjectInteger)
|
|
if !ok {
|
|
continue
|
|
}
|
|
yfloat = core.MakeFloat(float64(*yint))
|
|
}
|
|
if yPos == -1 {
|
|
yPos = float64(*yfloat)
|
|
} else if yPos > float64(*yfloat) {
|
|
txt += "\n"
|
|
xPos = float64(*xfloat)
|
|
yPos = float64(*yfloat)
|
|
continue
|
|
}
|
|
if xPos == -1 {
|
|
xPos = float64(*xfloat)
|
|
} else if xPos < float64(*xfloat) {
|
|
txt += "\t"
|
|
xPos = float64(*xfloat)
|
|
}
|
|
}
|
|
if inText && op.Operand == "TJ" {
|
|
if len(op.Params) < 1 {
|
|
continue
|
|
}
|
|
paramList, ok := op.Params[0].(*core.PdfObjectArray)
|
|
if !ok {
|
|
return "", fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
|
|
}
|
|
for _, obj := range *paramList {
|
|
switch v := obj.(type) {
|
|
case *core.PdfObjectString:
|
|
txt += string(*v)
|
|
case *core.PdfObjectFloat:
|
|
if *v < -100 {
|
|
txt += " "
|
|
}
|
|
case *core.PdfObjectInteger:
|
|
if *v < -100 {
|
|
txt += " "
|
|
}
|
|
}
|
|
}
|
|
} else if inText && op.Operand == "Tj" {
|
|
if len(op.Params) < 1 {
|
|
continue
|
|
}
|
|
param, ok := op.Params[0].(*core.PdfObjectString)
|
|
if !ok {
|
|
return "", fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
|
|
}
|
|
txt += string(*param)
|
|
}
|
|
}
|
|
|
|
return txt, nil
|
|
}
|