fix wrong git ignore

This commit is contained in:
Adrian Zürcher
2025-12-15 17:44:00 +01:00
parent ed9f31bb96
commit 8f313c00f0
126 changed files with 70589 additions and 1 deletions

View File

@@ -0,0 +1,3 @@
package extractor
// isTesting is a package-level flag that starts out false.
// NOTE(review): nothing in the visible sources reads or sets it; presumably
// test code flips it to alter behaviour — confirm against the test files.
var isTesting bool

View File

@@ -0,0 +1,23 @@
package extractor
import "gitea.tecamino.com/paadi/pdfmerge/internal/pdf/model"
// Extractor stores and offers functionality for extracting content from PDF pages.
// It is created by New from a single page and holds that page's content stream
// data together with the resources needed to interpret it.
type Extractor struct {
// contents is the page content as returned by PdfPage.GetAllContentStreams.
contents string
// resources is the page's Resources; it is handed to the content stream
// processor, where ExtractText uses it to resolve fonts by name.
resources *model.PdfPageResources
}
// New builds an Extractor for the given PDF page.
//
// It reads the page's content via GetAllContentStreams and keeps a reference
// to the page's Resources. If the content streams cannot be obtained, the
// error is returned unchanged and the Extractor is nil.
func New(page *model.PdfPage) (*Extractor, error) {
	streams, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	return &Extractor{
		contents:  streams,
		resources: page.Resources,
	}, nil
}

View File

@@ -0,0 +1,225 @@
package extractor
import (
"bytes"
"errors"
"fmt"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/cmap"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/contentstream"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/model"
)
// ExtractText processes and extracts all text data in content streams and returns as a string. Takes into
// account character encoding via CMaps in the PDF file.
// The text is processed linearly e.g. in the order in which it appears. A best effort is done to add
// spaces and newlines.
//
// Operators handled (all others are ignored):
//   - BT / ET: enter / leave a text object; text operators outside one are skipped.
//   - Tf: selects a font; if the font dictionary has a ToUnicode stream, its CMap is
//     loaded and used to decode subsequent strings.
//   - T*: emits a newline.
//   - Td / TD: emits a space when tx > 0 and a newline when ty < 0.
//   - Tm: compares the translation (e, f) with that of the previous Tm; moving down
//     emits a newline, otherwise moving right emits a tab.
//   - TJ / Tj: emits the shown strings; in TJ a numeric adjustment below -100 emits a space.
//
// On failure the text collected so far is returned together with the error.
func (e *Extractor) ExtractText() (string, error) {
	var buf bytes.Buffer

	cstreamParser := contentstream.NewContentStreamParser(e.contents)
	operations, err := cstreamParser.Parse()
	if err != nil {
		return buf.String(), err
	}
	processor := contentstream.NewContentStreamProcessor(*operations)

	// State carried by the handler from one operation to the next.
	var codemap *cmap.CMap // ToUnicode map of the current font, nil if none.
	inText := false        // True between BT and ET.

	// Translation of the most recent Tm. havePos replaces a -1 sentinel so that
	// a genuine coordinate of -1 is not mistaken for "no position seen yet".
	var lastX, lastY float64
	havePos := false

	// writeText appends a shown string, decoded through the current font's
	// ToUnicode CMap when one is loaded, raw otherwise.
	writeText := func(s *core.PdfObjectString) {
		if codemap != nil {
			buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*s)))
			return
		}
		buf.WriteString(string(*s))
	}

	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
			switch op.Operand {
			case "BT":
				inText = true

			case "ET":
				inText = false

			case "Tf":
				// NOTE(review): the PDF specification allows text state operators such
				// as Tf outside BT/ET. They are skipped here, so a font selected outside
				// a text object does not load its ToUnicode map — confirm this is intended.
				if !inText {
					common.Log.Debug("Tf operand outside text")
					return nil
				}
				if len(op.Params) != 2 {
					common.Log.Debug("error Tf should only get 2 input params, got %d", len(op.Params))
					return errors.New("Incorrect parameter count")
				}

				// A new font invalidates the previous font's mapping.
				codemap = nil

				fontName, ok := op.Params[0].(*core.PdfObjectName)
				if !ok {
					common.Log.Debug("error Tf font input not a name")
					return errors.New("Tf range error")
				}
				if resources == nil {
					return nil
				}
				fontObj, found := resources.GetFontByName(*fontName)
				if !found {
					common.Log.Debug("Font not found...")
					return errors.New("Font not in resources")
				}

				fontDict, isDict := core.TraceToDirectObject(fontObj).(*core.PdfObjectDictionary)
				if !isDict {
					return nil
				}
				toUnicode := fontDict.Get("ToUnicode")
				if toUnicode == nil {
					// No mapping: strings are emitted as-is.
					return nil
				}
				toUnicodeStream, ok := core.TraceToDirectObject(toUnicode).(*core.PdfObjectStream)
				if !ok {
					return errors.New("invalid ToUnicode entry - not a stream")
				}
				decoded, err := core.DecodeStream(toUnicodeStream)
				if err != nil {
					return err
				}
				codemap, err = cmap.LoadCmapFromData(decoded)
				if err != nil {
					return err
				}

			case "T*":
				if !inText {
					common.Log.Debug("T* operand outside text")
					return nil
				}
				buf.WriteString("\n")

			case "Td", "TD":
				if !inText {
					common.Log.Debug("Td/TD operand outside text")
					return nil
				}
				// Params: [tx ty], corresponds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm
				if len(op.Params) != 2 {
					common.Log.Debug("Td/TD invalid arguments")
					return nil
				}
				tx, err := getNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("Td Float parse error")
					return nil
				}
				ty, err := getNumberAsFloat(op.Params[1])
				if err != nil {
					common.Log.Debug("Td Float parse error")
					return nil
				}
				if tx > 0 {
					buf.WriteString(" ")
				}
				if ty < 0 {
					// TODO: More flexible space characters?
					buf.WriteString("\n")
				}

			case "Tm":
				if !inText {
					common.Log.Debug("Tm operand outside text")
					return nil
				}
				// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1].
				// The last two (e,f) represent translation.
				if len(op.Params) != 6 {
					return errors.New("Tm: Invalid number of inputs")
				}
				// Non-numeric translation operands are silently ignored.
				x, err := getNumberAsFloat(op.Params[4])
				if err != nil {
					return nil
				}
				y, err := getNumberAsFloat(op.Params[5])
				if err != nil {
					return nil
				}

				switch {
				case !havePos:
					// First Tm: nothing to compare against yet.
				case y < lastY:
					// Moved down the page: start a new line.
					buf.WriteString("\n")
				case x > lastX:
					// Same line or higher, further right: separate with a tab.
					buf.WriteString("\t")
				}
				// Always remember the latest position. Previously an upward or
				// leftward move left the stored value stale, so later Tm operations
				// were compared against an old position and breaks were missed.
				lastX, lastY, havePos = x, y, true

			case "TJ":
				if !inText {
					common.Log.Debug("TJ operand outside text")
					return nil
				}
				if len(op.Params) < 1 {
					return nil
				}
				paramList, ok := op.Params[0].(*core.PdfObjectArray)
				if !ok {
					return fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
				}
				for _, obj := range *paramList {
					switch v := obj.(type) {
					case *core.PdfObjectString:
						writeText(v)
					case *core.PdfObjectFloat, *core.PdfObjectInteger:
						// Numbers below -100 are treated as a word gap.
						if adj, err := getNumberAsFloat(v); err == nil && adj < -100 {
							buf.WriteString(" ")
						}
					}
				}

			case "Tj":
				if !inText {
					common.Log.Debug("Tj operand outside text")
					return nil
				}
				if len(op.Params) < 1 {
					return nil
				}
				param, ok := op.Params[0].(*core.PdfObjectString)
				if !ok {
					return fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
				}
				writeText(param)
			}

			return nil
		})

	err = processor.Process(e.resources)
	if err != nil {
		common.Log.Error("error processing: %v", err)
		return buf.String(), err
	}

	return buf.String(), nil
}

View File

@@ -0,0 +1,20 @@
package extractor
import (
"errors"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
)
// getNumberAsFloat converts a numeric PdfObject to a float64.
//
// Both *core.PdfObjectFloat and *core.PdfObjectInteger are accepted; any other
// object type yields 0 and a "Not a number" error.
func getNumberAsFloat(obj core.PdfObject) (float64, error) {
	switch num := obj.(type) {
	case *core.PdfObjectFloat:
		return float64(*num), nil
	case *core.PdfObjectInteger:
		return float64(*num), nil
	default:
		return 0, errors.New("Not a number")
	}
}