fix wrong git ignore
This commit is contained in:
586
internal/pdf/contentstream/parser.go
Normal file
586
internal/pdf/contentstream/parser.go
Normal file
@@ -0,0 +1,586 @@
|
||||
package contentstream
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
|
||||
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
|
||||
)
|
||||
|
||||
// Content stream parser.
|
||||
type ContentStreamParser struct {
|
||||
reader *bufio.Reader
|
||||
}
|
||||
|
||||
// Create a new instance of the content stream parser from an input content
|
||||
// stream string.
|
||||
func NewContentStreamParser(contentStr string) *ContentStreamParser {
|
||||
// Each command has parameters and an operand (command).
|
||||
parser := ContentStreamParser{}
|
||||
|
||||
buffer := bytes.NewBufferString(contentStr + "\n") // Add newline at end to get last operand without EOF error.
|
||||
parser.reader = bufio.NewReader(buffer)
|
||||
|
||||
return &parser
|
||||
}
|
||||
|
||||
// Parses all commands in content stream, returning a list of operation data.
|
||||
func (sp *ContentStreamParser) Parse() (*ContentStreamOperations, error) {
|
||||
operations := ContentStreamOperations{}
|
||||
|
||||
for {
|
||||
operation := ContentStreamOperation{}
|
||||
|
||||
for {
|
||||
obj, err, isOperand := sp.parseObject()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
// End of data. Successful exit point.
|
||||
return &operations, nil
|
||||
}
|
||||
return &operations, err
|
||||
}
|
||||
if isOperand {
|
||||
operation.Operand = string(*obj.(*core.PdfObjectString))
|
||||
operations = append(operations, &operation)
|
||||
break
|
||||
} else {
|
||||
operation.Params = append(operation.Params, obj)
|
||||
}
|
||||
}
|
||||
|
||||
if operation.Operand == "BI" {
|
||||
// Parse an inline image, reads everything between the "BI" and "EI".
|
||||
// The image is stored as the parameter.
|
||||
im, err := sp.ParseInlineImage()
|
||||
if err != nil {
|
||||
return &operations, err
|
||||
}
|
||||
operation.Params = append(operation.Params, im)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip over any spaces. Returns the number of spaces skipped and
|
||||
// an error if any.
|
||||
func (sp *ContentStreamParser) skipSpaces() (int, error) {
|
||||
cnt := 0
|
||||
for {
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if core.IsWhiteSpace(bb[0]) {
|
||||
sp.reader.ReadByte()
|
||||
cnt++
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return cnt, nil
|
||||
}
|
||||
|
||||
// Skip over comments and spaces. Can handle multi-line comments.
|
||||
func (sp *ContentStreamParser) skipComments() error {
|
||||
if _, err := sp.skipSpaces(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
isFirst := true
|
||||
for {
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
common.Log.Debug("error %s", err.Error())
|
||||
return err
|
||||
}
|
||||
if isFirst && bb[0] != '%' {
|
||||
// Not a comment clearly.
|
||||
return nil
|
||||
} else {
|
||||
isFirst = false
|
||||
}
|
||||
if (bb[0] != '\r') && (bb[0] != '\n') {
|
||||
sp.reader.ReadByte()
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Call recursively to handle multiline comments.
|
||||
return sp.skipComments()
|
||||
}
|
||||
|
||||
// Parse a name starting with '/'.
|
||||
func (sp *ContentStreamParser) parseName() (core.PdfObjectName, error) {
|
||||
name := ""
|
||||
nameStarted := false
|
||||
for {
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err == io.EOF {
|
||||
break // Can happen when loading from object stream.
|
||||
}
|
||||
if err != nil {
|
||||
return core.PdfObjectName(name), err
|
||||
}
|
||||
|
||||
if !nameStarted {
|
||||
// Should always start with '/', otherwise not valid.
|
||||
if bb[0] == '/' {
|
||||
nameStarted = true
|
||||
sp.reader.ReadByte()
|
||||
} else {
|
||||
common.Log.Error("Name starting with %s (% x)", bb, bb)
|
||||
return core.PdfObjectName(name), fmt.Errorf("invalid name: (%c)", bb[0])
|
||||
}
|
||||
} else {
|
||||
if core.IsWhiteSpace(bb[0]) {
|
||||
break
|
||||
} else if (bb[0] == '/') || (bb[0] == '[') || (bb[0] == '(') || (bb[0] == ']') || (bb[0] == '<') || (bb[0] == '>') {
|
||||
break // Looks like start of next statement.
|
||||
} else if bb[0] == '#' {
|
||||
hexcode, err := sp.reader.Peek(3)
|
||||
if err != nil {
|
||||
return core.PdfObjectName(name), err
|
||||
}
|
||||
sp.reader.Discard(3)
|
||||
|
||||
code, err := hex.DecodeString(string(hexcode[1:3]))
|
||||
if err != nil {
|
||||
return core.PdfObjectName(name), err
|
||||
}
|
||||
name += string(code)
|
||||
} else {
|
||||
b, _ := sp.reader.ReadByte()
|
||||
name += string(b)
|
||||
}
|
||||
}
|
||||
}
|
||||
return core.PdfObjectName(name), nil
|
||||
}
|
||||
|
||||
// Numeric objects.
|
||||
// Section 7.3.3.
|
||||
// Integer or Float.
|
||||
//
|
||||
// An integer shall be written as one or more decimal digits optionally
|
||||
// preceded by a sign. The value shall be interpreted as a signed
|
||||
// decimal integer and shall be converted to an integer object.
|
||||
//
|
||||
// A real value shall be written as one or more decimal digits with an
|
||||
// optional sign and a leading, trailing, or embedded PERIOD (2Eh)
|
||||
// (decimal point). The value shall be interpreted as a real number
|
||||
// and shall be converted to a real object.
|
||||
//
|
||||
// Regarding exponential numbers: 7.3.3 Numeric Objects:
|
||||
// A conforming writer shall not use the PostScript syntax for numbers
|
||||
// with non-decimal radices (such as 16#FFFE) or in exponential format
|
||||
// (such as 6.02E23).
|
||||
// Nonetheless, we sometimes get numbers with exponential format, so
|
||||
// we will support it in the reader (no confusion with other types, so
|
||||
// no compromise).
|
||||
func (sp *ContentStreamParser) parseNumber() (core.PdfObject, error) {
|
||||
isFloat := false
|
||||
allowSigns := true
|
||||
numStr := ""
|
||||
for {
|
||||
common.Log.Trace("Parsing number \"%s\"", numStr)
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err == io.EOF {
|
||||
// GH: EOF handling. Handle EOF like end of line. Can happen with
|
||||
// encoded object streams that the object is at the end.
|
||||
// In other cases, we will get the EOF error elsewhere at any rate.
|
||||
break // Handle like EOF
|
||||
}
|
||||
if err != nil {
|
||||
common.Log.Error("error %s", err)
|
||||
return nil, err
|
||||
}
|
||||
if allowSigns && (bb[0] == '-' || bb[0] == '+') {
|
||||
// Only appear in the beginning, otherwise serves as a delimiter.
|
||||
b, _ := sp.reader.ReadByte()
|
||||
numStr += string(b)
|
||||
allowSigns = false // Only allowed in beginning, and after e (exponential).
|
||||
} else if core.IsDecimalDigit(bb[0]) {
|
||||
b, _ := sp.reader.ReadByte()
|
||||
numStr += string(b)
|
||||
} else if bb[0] == '.' {
|
||||
b, _ := sp.reader.ReadByte()
|
||||
numStr += string(b)
|
||||
isFloat = true
|
||||
} else if bb[0] == 'e' {
|
||||
// Exponential number format.
|
||||
b, _ := sp.reader.ReadByte()
|
||||
numStr += string(b)
|
||||
isFloat = true
|
||||
allowSigns = true
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if isFloat {
|
||||
fVal, err := strconv.ParseFloat(numStr, 64)
|
||||
if err != nil {
|
||||
common.Log.Debug("error parsing number %q err=%v. Using 0.0. Output may be incorrect", numStr, err)
|
||||
fVal = 0.0
|
||||
err = nil
|
||||
}
|
||||
o := core.PdfObjectFloat(fVal)
|
||||
return &o, err
|
||||
} else {
|
||||
intVal, err := strconv.ParseInt(numStr, 10, 64)
|
||||
if err != nil {
|
||||
common.Log.Debug("error parsing integer %q err=%v. Using 0. Output may be incorrect", numStr, err)
|
||||
intVal = 0
|
||||
err = nil
|
||||
}
|
||||
o := core.PdfObjectInteger(intVal)
|
||||
return &o, err
|
||||
}
|
||||
}
|
||||
|
||||
// A string starts with '(' and ends with ')'.
|
||||
func (sp *ContentStreamParser) parseString() (core.PdfObjectString, error) {
|
||||
sp.reader.ReadByte()
|
||||
|
||||
bytes := []byte{}
|
||||
count := 1
|
||||
for {
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
return core.PdfObjectString(bytes), err
|
||||
}
|
||||
|
||||
if bb[0] == '\\' { // Escape sequence.
|
||||
sp.reader.ReadByte() // Skip the escape \ byte.
|
||||
b, err := sp.reader.ReadByte()
|
||||
if err != nil {
|
||||
return core.PdfObjectString(bytes), err
|
||||
}
|
||||
|
||||
// Octal '\ddd' number (base 8).
|
||||
if core.IsOctalDigit(b) {
|
||||
bb, err := sp.reader.Peek(2)
|
||||
if err != nil {
|
||||
return core.PdfObjectString(bytes), err
|
||||
}
|
||||
|
||||
numeric := []byte{}
|
||||
numeric = append(numeric, b)
|
||||
for _, val := range bb {
|
||||
if core.IsOctalDigit(val) {
|
||||
numeric = append(numeric, val)
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
sp.reader.Discard(len(numeric) - 1)
|
||||
|
||||
common.Log.Trace("Numeric string \"%s\"", numeric)
|
||||
code, err := strconv.ParseUint(string(numeric), 8, 32)
|
||||
if err != nil {
|
||||
return core.PdfObjectString(bytes), err
|
||||
}
|
||||
bytes = append(bytes, byte(code))
|
||||
continue
|
||||
}
|
||||
|
||||
switch b {
|
||||
case 'n':
|
||||
bytes = append(bytes, '\n')
|
||||
case 'r':
|
||||
bytes = append(bytes, '\r')
|
||||
case 't':
|
||||
bytes = append(bytes, '\t')
|
||||
case 'b':
|
||||
bytes = append(bytes, '\b')
|
||||
case 'f':
|
||||
bytes = append(bytes, '\f')
|
||||
case '(':
|
||||
bytes = append(bytes, '(')
|
||||
case ')':
|
||||
bytes = append(bytes, ')')
|
||||
case '\\':
|
||||
bytes = append(bytes, '\\')
|
||||
}
|
||||
|
||||
continue
|
||||
} else if bb[0] == '(' {
|
||||
count++
|
||||
} else if bb[0] == ')' {
|
||||
count--
|
||||
if count == 0 {
|
||||
sp.reader.ReadByte()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
b, _ := sp.reader.ReadByte()
|
||||
bytes = append(bytes, b)
|
||||
}
|
||||
|
||||
return core.PdfObjectString(bytes), nil
|
||||
}
|
||||
|
||||
// Starts with '<' ends with '>'.
|
||||
func (sp *ContentStreamParser) parseHexString() (core.PdfObjectString, error) {
|
||||
sp.reader.ReadByte()
|
||||
|
||||
hextable := []byte("0123456789abcdefABCDEF")
|
||||
|
||||
tmp := []byte{}
|
||||
for {
|
||||
sp.skipSpaces()
|
||||
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
return core.PdfObjectString(""), err
|
||||
}
|
||||
|
||||
if bb[0] == '>' {
|
||||
sp.reader.ReadByte()
|
||||
break
|
||||
}
|
||||
|
||||
b, _ := sp.reader.ReadByte()
|
||||
if bytes.IndexByte(hextable, b) >= 0 {
|
||||
tmp = append(tmp, b)
|
||||
}
|
||||
}
|
||||
|
||||
if len(tmp)%2 == 1 {
|
||||
tmp = append(tmp, '0')
|
||||
}
|
||||
|
||||
buf, _ := hex.DecodeString(string(tmp))
|
||||
return core.PdfObjectString(buf), nil
|
||||
}
|
||||
|
||||
// Starts with '[' ends with ']'. Can contain any kinds of direct objects.
|
||||
func (sp *ContentStreamParser) parseArray() (core.PdfObjectArray, error) {
|
||||
arr := make(core.PdfObjectArray, 0)
|
||||
|
||||
sp.reader.ReadByte()
|
||||
|
||||
for {
|
||||
sp.skipSpaces()
|
||||
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
return arr, err
|
||||
}
|
||||
|
||||
if bb[0] == ']' {
|
||||
sp.reader.ReadByte()
|
||||
break
|
||||
}
|
||||
|
||||
obj, err, _ := sp.parseObject()
|
||||
if err != nil {
|
||||
return arr, err
|
||||
}
|
||||
arr = append(arr, obj)
|
||||
}
|
||||
|
||||
return arr, nil
|
||||
}
|
||||
|
||||
// Parse bool object.
|
||||
func (sp *ContentStreamParser) parseBool() (core.PdfObjectBool, error) {
|
||||
bb, err := sp.reader.Peek(4)
|
||||
if err != nil {
|
||||
return core.PdfObjectBool(false), err
|
||||
}
|
||||
if (len(bb) >= 4) && (string(bb[:4]) == "true") {
|
||||
sp.reader.Discard(4)
|
||||
return core.PdfObjectBool(true), nil
|
||||
}
|
||||
|
||||
bb, err = sp.reader.Peek(5)
|
||||
if err != nil {
|
||||
return core.PdfObjectBool(false), err
|
||||
}
|
||||
if (len(bb) >= 5) && (string(bb[:5]) == "false") {
|
||||
sp.reader.Discard(5)
|
||||
return core.PdfObjectBool(false), nil
|
||||
}
|
||||
|
||||
return core.PdfObjectBool(false), errors.New("unexpected boolean string")
|
||||
}
|
||||
|
||||
// Parse null object.
|
||||
func (sp *ContentStreamParser) parseNull() (core.PdfObjectNull, error) {
|
||||
_, err := sp.reader.Discard(4)
|
||||
return core.PdfObjectNull{}, err
|
||||
}
|
||||
|
||||
func (sp *ContentStreamParser) parseDict() (*core.PdfObjectDictionary, error) {
|
||||
common.Log.Trace("Reading content stream dict!")
|
||||
|
||||
dict := core.MakeDict()
|
||||
|
||||
// Pass the '<<'
|
||||
c, _ := sp.reader.ReadByte()
|
||||
if c != '<' {
|
||||
return nil, errors.New("invalid dict")
|
||||
}
|
||||
c, _ = sp.reader.ReadByte()
|
||||
if c != '<' {
|
||||
return nil, errors.New("invalid dict")
|
||||
}
|
||||
|
||||
for {
|
||||
sp.skipSpaces()
|
||||
|
||||
bb, err := sp.reader.Peek(2)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
common.Log.Trace("Dict peek: %s (% x)!", string(bb), string(bb))
|
||||
if (bb[0] == '>') && (bb[1] == '>') {
|
||||
common.Log.Trace("EOF dictionary")
|
||||
sp.reader.ReadByte()
|
||||
sp.reader.ReadByte()
|
||||
break
|
||||
}
|
||||
common.Log.Trace("Parse the name!")
|
||||
|
||||
keyName, err := sp.parseName()
|
||||
common.Log.Trace("Key: %s", keyName)
|
||||
if err != nil {
|
||||
common.Log.Debug("error Returning name err %s", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(keyName) > 4 && keyName[len(keyName)-4:] == "null" {
|
||||
// Some writers have a bug where the null is appended without
|
||||
// space. For example "\Boundsnull"
|
||||
newKey := keyName[0 : len(keyName)-4]
|
||||
common.Log.Trace("Taking care of null bug (%s)", keyName)
|
||||
common.Log.Trace("New key \"%s\" = null", newKey)
|
||||
sp.skipSpaces()
|
||||
bb, _ := sp.reader.Peek(1)
|
||||
if bb[0] == '/' {
|
||||
dict.Set(newKey, core.MakeNull())
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
sp.skipSpaces()
|
||||
|
||||
val, err, _ := sp.parseObject()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dict.Set(keyName, val)
|
||||
|
||||
common.Log.Trace("dict[%s] = %s", keyName, val.String())
|
||||
}
|
||||
|
||||
return dict, nil
|
||||
}
|
||||
|
||||
// An operand is a text command represented by a word.
|
||||
func (sp *ContentStreamParser) parseOperand() (core.PdfObjectString, error) {
|
||||
bytes := []byte{}
|
||||
for {
|
||||
bb, err := sp.reader.Peek(1)
|
||||
if err != nil {
|
||||
return core.PdfObjectString(bytes), err
|
||||
}
|
||||
if core.IsDelimiter(bb[0]) {
|
||||
break
|
||||
}
|
||||
if core.IsWhiteSpace(bb[0]) {
|
||||
break
|
||||
}
|
||||
|
||||
b, _ := sp.reader.ReadByte()
|
||||
bytes = append(bytes, b)
|
||||
}
|
||||
|
||||
return core.PdfObjectString(bytes), nil
|
||||
}
|
||||
|
||||
// Parse a generic object. Returns the object, an error code, and a bool
|
||||
// value indicating whether the object is an operand. An operand
|
||||
// is contained in a pdf string object.
|
||||
func (sp *ContentStreamParser) parseObject() (core.PdfObject, error, bool) {
|
||||
// Determine the kind of object.
|
||||
// parse it!
|
||||
// make a list of operands, then once operand arrives put into a package.
|
||||
|
||||
sp.skipSpaces()
|
||||
for {
|
||||
bb, err := sp.reader.Peek(2)
|
||||
if err != nil {
|
||||
return nil, err, false
|
||||
}
|
||||
|
||||
common.Log.Trace("Peek string: %s", string(bb))
|
||||
// Determine type.
|
||||
if bb[0] == '%' {
|
||||
sp.skipComments()
|
||||
continue
|
||||
} else if bb[0] == '/' {
|
||||
name, err := sp.parseName()
|
||||
common.Log.Trace("->Name: '%s'", name)
|
||||
return &name, err, false
|
||||
} else if bb[0] == '(' {
|
||||
common.Log.Trace("->String!")
|
||||
str, err := sp.parseString()
|
||||
common.Log.Trace("(%s)\n", str.String())
|
||||
return &str, err, false
|
||||
} else if bb[0] == '<' && bb[1] != '<' {
|
||||
common.Log.Trace("->Hex String!")
|
||||
str, err := sp.parseHexString()
|
||||
return &str, err, false
|
||||
} else if bb[0] == '[' {
|
||||
common.Log.Trace("->Array!")
|
||||
arr, err := sp.parseArray()
|
||||
return &arr, err, false
|
||||
} else if core.IsFloatDigit(bb[0]) || (bb[0] == '-' && core.IsFloatDigit(bb[1])) {
|
||||
common.Log.Trace("->Number!")
|
||||
number, err := sp.parseNumber()
|
||||
return number, err, false
|
||||
} else if bb[0] == '<' && bb[1] == '<' {
|
||||
dict, err := sp.parseDict()
|
||||
return dict, err, false
|
||||
} else {
|
||||
// Otherwise, can be: keyword such as "null", "false", "true" or an operand...
|
||||
common.Log.Trace("->Operand or bool?")
|
||||
// Let's peek farther to find out.
|
||||
bb, _ = sp.reader.Peek(5)
|
||||
peekStr := string(bb)
|
||||
common.Log.Trace("cont Peek str: %s", peekStr)
|
||||
|
||||
if (len(peekStr) > 3) && (peekStr[:4] == "null") {
|
||||
null, err := sp.parseNull()
|
||||
return &null, err, false
|
||||
} else if (len(peekStr) > 4) && (peekStr[:5] == "false") {
|
||||
b, err := sp.parseBool()
|
||||
return &b, err, false
|
||||
} else if (len(peekStr) > 3) && (peekStr[:4] == "true") {
|
||||
b, err := sp.parseBool()
|
||||
return &b, err, false
|
||||
}
|
||||
|
||||
operand, err := sp.parseOperand()
|
||||
if err != nil {
|
||||
return &operand, err, false
|
||||
}
|
||||
if len(operand.String()) < 1 {
|
||||
return &operand, ErrInvalidOperand, false
|
||||
}
|
||||
return &operand, nil, true
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user