fix wrong git ignore
This commit is contained in:
197
internal/pdf/contentstream/contentstream.go
Normal file
197
internal/pdf/contentstream/contentstream.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package contentstream
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
||||
)
|
||||
|
||||
type ContentStreamOperation struct {
|
||||
Params []core.PdfObject
|
||||
Operand string
|
||||
}
|
||||
|
||||
// ContentStreamOperations is an ordered list of content stream operations.
type ContentStreamOperations []*ContentStreamOperation
|
||||
|
||||
// Check if the content stream operations are fully wrapped (within q ... Q)
|
||||
func (s *ContentStreamOperations) isWrapped() bool {
|
||||
if len(*s) < 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
depth := 0
|
||||
for _, op := range *s {
|
||||
switch op.Operand {
|
||||
case "q":
|
||||
depth++
|
||||
case "Q":
|
||||
depth--
|
||||
default:
|
||||
if depth < 1 {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Should end at depth == 0
|
||||
return depth == 0
|
||||
}
|
||||
|
||||
// Wrap entire contents within q ... Q. If unbalanced, then adds extra Qs at the end.
|
||||
// Only does if needed. Ensures that when adding new content, one start with all states
|
||||
// in the default condition.
|
||||
func (s *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations {
|
||||
if len(*s) == 0 {
|
||||
// No need to wrap if empty.
|
||||
return s
|
||||
}
|
||||
if s.isWrapped() {
|
||||
return s
|
||||
}
|
||||
|
||||
*s = append([]*ContentStreamOperation{{Operand: "q"}}, *s...)
|
||||
|
||||
depth := 0
|
||||
for _, op := range *s {
|
||||
switch op.Operand {
|
||||
case "q":
|
||||
depth++
|
||||
case "Q":
|
||||
depth--
|
||||
}
|
||||
}
|
||||
|
||||
for depth > 0 {
|
||||
*s = append(*s, &ContentStreamOperation{Operand: "Q"})
|
||||
depth--
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be
|
||||
// stored as a PDF stream or string format.
|
||||
func (s *ContentStreamOperations) Bytes() []byte {
|
||||
var buf bytes.Buffer
|
||||
|
||||
for _, op := range *s {
|
||||
if op == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if op.Operand == "BI" {
|
||||
// Inline image requires special handling.
|
||||
buf.WriteString(op.Operand + "\n")
|
||||
buf.WriteString(op.Params[0].DefaultWriteString())
|
||||
|
||||
} else {
|
||||
// Default handler.
|
||||
for _, param := range op.Params {
|
||||
buf.WriteString(param.DefaultWriteString())
|
||||
buf.WriteString(" ")
|
||||
|
||||
}
|
||||
|
||||
buf.WriteString(op.Operand + "\n")
|
||||
}
|
||||
}
|
||||
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// ExtractText parses and extracts all text data in content streams and returns as a string.
|
||||
// Does not take into account Encoding table, the output is simply the character codes.
|
||||
//
|
||||
// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
|
||||
func (s *ContentStreamParser) ExtractText() (string, error) {
|
||||
operations, err := s.Parse()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
inText := false
|
||||
xPos, yPos := float64(-1), float64(-1)
|
||||
txt := ""
|
||||
for _, op := range *operations {
|
||||
switch op.Operand {
|
||||
case "BT":
|
||||
inText = true
|
||||
case "ET":
|
||||
inText = false
|
||||
}
|
||||
|
||||
if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" {
|
||||
// Move to next line...
|
||||
txt += "\n"
|
||||
}
|
||||
if op.Operand == "Tm" {
|
||||
if len(op.Params) != 6 {
|
||||
continue
|
||||
}
|
||||
xfloat, ok := op.Params[4].(*core.PdfObjectFloat)
|
||||
if !ok {
|
||||
xint, ok := op.Params[4].(*core.PdfObjectInteger)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
xfloat = core.MakeFloat(float64(*xint))
|
||||
}
|
||||
yfloat, ok := op.Params[5].(*core.PdfObjectFloat)
|
||||
if !ok {
|
||||
yint, ok := op.Params[5].(*core.PdfObjectInteger)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
yfloat = core.MakeFloat(float64(*yint))
|
||||
}
|
||||
if yPos == -1 {
|
||||
yPos = float64(*yfloat)
|
||||
} else if yPos > float64(*yfloat) {
|
||||
txt += "\n"
|
||||
xPos = float64(*xfloat)
|
||||
yPos = float64(*yfloat)
|
||||
continue
|
||||
}
|
||||
if xPos == -1 {
|
||||
xPos = float64(*xfloat)
|
||||
} else if xPos < float64(*xfloat) {
|
||||
txt += "\t"
|
||||
xPos = float64(*xfloat)
|
||||
}
|
||||
}
|
||||
if inText && op.Operand == "TJ" {
|
||||
if len(op.Params) < 1 {
|
||||
continue
|
||||
}
|
||||
paramList, ok := op.Params[0].(*core.PdfObjectArray)
|
||||
if !ok {
|
||||
return "", fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
|
||||
}
|
||||
for _, obj := range *paramList {
|
||||
switch v := obj.(type) {
|
||||
case *core.PdfObjectString:
|
||||
txt += string(*v)
|
||||
case *core.PdfObjectFloat:
|
||||
if *v < -100 {
|
||||
txt += " "
|
||||
}
|
||||
case *core.PdfObjectInteger:
|
||||
if *v < -100 {
|
||||
txt += " "
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if inText && op.Operand == "Tj" {
|
||||
if len(op.Params) < 1 {
|
||||
continue
|
||||
}
|
||||
param, ok := op.Params[0].(*core.PdfObjectString)
|
||||
if !ok {
|
||||
return "", fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
|
||||
}
|
||||
txt += string(*param)
|
||||
}
|
||||
}
|
||||
|
||||
return txt, nil
|
||||
}
|
||||
Reference in New Issue
Block a user