fix wrong git ignore
This commit is contained in:
3
internal/pdf/extractor/const.go
Normal file
3
internal/pdf/extractor/const.go
Normal file
@@ -0,0 +1,3 @@
|
||||
package extractor
|
||||
|
||||
// isTesting is a package-level flag that defaults to false.
// NOTE(review): nothing in the visible part of this package reads or sets it;
// presumably test code flips it - confirm before relying on it.
var isTesting = false
|
||||
23
internal/pdf/extractor/extractor.go
Normal file
23
internal/pdf/extractor/extractor.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package extractor
|
||||
|
||||
import "gitea.tecamino.com/paadi/pdfmerge/internal/pdf/model"
|
||||
|
||||
// Extractor stores and offers functionality for extracting content from PDF pages.
|
||||
type Extractor struct {
|
||||
contents string
|
||||
resources *model.PdfPageResources
|
||||
}
|
||||
|
||||
// New returns an Extractor instance for extracting content from the input PDF page.
|
||||
func New(page *model.PdfPage) (*Extractor, error) {
|
||||
contents, err := page.GetAllContentStreams()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
e := &Extractor{}
|
||||
e.contents = contents
|
||||
e.resources = page.Resources
|
||||
|
||||
return e, nil
|
||||
}
|
||||
225
internal/pdf/extractor/text.go
Normal file
225
internal/pdf/extractor/text.go
Normal file
@@ -0,0 +1,225 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/cmap"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/contentstream"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/model"
|
||||
)
|
||||
|
||||
// ExtractText processes and extracts all text data in content streams and returns as a string. Takes into
|
||||
// account character encoding via CMaps in the PDF file.
|
||||
// The text is processed linearly e.g. in the order in which it appears. A best effort is done to add
|
||||
// spaces and newlines.
|
||||
func (e *Extractor) ExtractText() (string, error) {
|
||||
var buf bytes.Buffer
|
||||
|
||||
cstreamParser := contentstream.NewContentStreamParser(e.contents)
|
||||
operations, err := cstreamParser.Parse()
|
||||
if err != nil {
|
||||
return buf.String(), err
|
||||
}
|
||||
|
||||
processor := contentstream.NewContentStreamProcessor(*operations)
|
||||
|
||||
var codemap *cmap.CMap
|
||||
inText := false
|
||||
xPos, yPos := float64(-1), float64(-1)
|
||||
|
||||
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
|
||||
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
|
||||
operand := op.Operand
|
||||
switch operand {
|
||||
case "BT":
|
||||
inText = true
|
||||
case "ET":
|
||||
inText = false
|
||||
case "Tf":
|
||||
if !inText {
|
||||
common.Log.Debug("Tf operand outside text")
|
||||
return nil
|
||||
}
|
||||
|
||||
if len(op.Params) != 2 {
|
||||
common.Log.Debug("error Tf should only get 2 input params, got %d", len(op.Params))
|
||||
return errors.New("Incorrect parameter count")
|
||||
}
|
||||
|
||||
codemap = nil
|
||||
|
||||
fontName, ok := op.Params[0].(*core.PdfObjectName)
|
||||
if !ok {
|
||||
common.Log.Debug("error Tf font input not a name")
|
||||
return errors.New("Tf range error")
|
||||
}
|
||||
|
||||
if resources == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
fontObj, found := resources.GetFontByName(*fontName)
|
||||
if !found {
|
||||
common.Log.Debug("Font not found...")
|
||||
return errors.New("Font not in resources")
|
||||
}
|
||||
|
||||
fontObj = core.TraceToDirectObject(fontObj)
|
||||
if fontDict, isDict := fontObj.(*core.PdfObjectDictionary); isDict {
|
||||
toUnicode := fontDict.Get("ToUnicode")
|
||||
if toUnicode != nil {
|
||||
toUnicode = core.TraceToDirectObject(toUnicode)
|
||||
toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
|
||||
if !ok {
|
||||
return errors.New("invalid ToUnicode entry - not a stream")
|
||||
}
|
||||
decoded, err := core.DecodeStream(toUnicodeStream)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
codemap, err = cmap.LoadCmapFromData(decoded)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
case "T*":
|
||||
if !inText {
|
||||
common.Log.Debug("T* operand outside text")
|
||||
return nil
|
||||
}
|
||||
buf.WriteString("\n")
|
||||
case "Td", "TD":
|
||||
if !inText {
|
||||
common.Log.Debug("Td/TD operand outside text")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Params: [tx ty], corresponeds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm
|
||||
if len(op.Params) != 2 {
|
||||
common.Log.Debug("Td/TD invalid arguments")
|
||||
return nil
|
||||
}
|
||||
tx, err := getNumberAsFloat(op.Params[0])
|
||||
if err != nil {
|
||||
common.Log.Debug("Td Float parse error")
|
||||
return nil
|
||||
}
|
||||
ty, err := getNumberAsFloat(op.Params[1])
|
||||
if err != nil {
|
||||
common.Log.Debug("Td Float parse error")
|
||||
return nil
|
||||
}
|
||||
|
||||
if tx > 0 {
|
||||
buf.WriteString(" ")
|
||||
}
|
||||
if ty < 0 {
|
||||
// TODO: More flexible space characters?
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
case "Tm":
|
||||
if !inText {
|
||||
common.Log.Debug("Tm operand outside text")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1].
|
||||
// The last two (e,f) represent translation.
|
||||
if len(op.Params) != 6 {
|
||||
return errors.New("Tm: Invalid number of inputs")
|
||||
}
|
||||
xfloat, ok := op.Params[4].(*core.PdfObjectFloat)
|
||||
if !ok {
|
||||
xint, ok := op.Params[4].(*core.PdfObjectInteger)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
xfloat = core.MakeFloat(float64(*xint))
|
||||
}
|
||||
yfloat, ok := op.Params[5].(*core.PdfObjectFloat)
|
||||
if !ok {
|
||||
yint, ok := op.Params[5].(*core.PdfObjectInteger)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
yfloat = core.MakeFloat(float64(*yint))
|
||||
}
|
||||
if yPos == -1 {
|
||||
yPos = float64(*yfloat)
|
||||
} else if yPos > float64(*yfloat) {
|
||||
buf.WriteString("\n")
|
||||
xPos = float64(*xfloat)
|
||||
yPos = float64(*yfloat)
|
||||
return nil
|
||||
}
|
||||
if xPos == -1 {
|
||||
xPos = float64(*xfloat)
|
||||
} else if xPos < float64(*xfloat) {
|
||||
buf.WriteString("\t")
|
||||
xPos = float64(*xfloat)
|
||||
}
|
||||
case "TJ":
|
||||
if !inText {
|
||||
common.Log.Debug("TJ operand outside text")
|
||||
return nil
|
||||
}
|
||||
if len(op.Params) < 1 {
|
||||
return nil
|
||||
}
|
||||
paramList, ok := op.Params[0].(*core.PdfObjectArray)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0])
|
||||
}
|
||||
for _, obj := range *paramList {
|
||||
switch v := obj.(type) {
|
||||
case *core.PdfObjectString:
|
||||
if codemap != nil {
|
||||
buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*v)))
|
||||
} else {
|
||||
buf.WriteString(string(*v))
|
||||
}
|
||||
case *core.PdfObjectFloat:
|
||||
if *v < -100 {
|
||||
buf.WriteString(" ")
|
||||
}
|
||||
case *core.PdfObjectInteger:
|
||||
if *v < -100 {
|
||||
buf.WriteString(" ")
|
||||
}
|
||||
}
|
||||
}
|
||||
case "Tj":
|
||||
if !inText {
|
||||
common.Log.Debug("Tj operand outside text")
|
||||
return nil
|
||||
}
|
||||
if len(op.Params) < 1 {
|
||||
return nil
|
||||
}
|
||||
param, ok := op.Params[0].(*core.PdfObjectString)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0])
|
||||
}
|
||||
if codemap != nil {
|
||||
buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*param)))
|
||||
} else {
|
||||
buf.WriteString(string(*param))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
err = processor.Process(e.resources)
|
||||
if err != nil {
|
||||
common.Log.Error("error processing: %v", err)
|
||||
return buf.String(), err
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
20
internal/pdf/extractor/utils.go
Normal file
20
internal/pdf/extractor/utils.go
Normal file
@@ -0,0 +1,20 @@
|
||||
package extractor
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
||||
)
|
||||
|
||||
// getNumberAsFloat can retrieve numeric values from PdfObject (both integer/float).
|
||||
func getNumberAsFloat(obj core.PdfObject) (float64, error) {
|
||||
if fObj, ok := obj.(*core.PdfObjectFloat); ok {
|
||||
return float64(*fObj), nil
|
||||
}
|
||||
|
||||
if iObj, ok := obj.(*core.PdfObjectInteger); ok {
|
||||
return float64(*iObj), nil
|
||||
}
|
||||
|
||||
return 0, errors.New("Not a number")
|
||||
}
|
||||
Reference in New Issue
Block a user