fix wrong git ignore

This commit is contained in:
Adrian Zürcher
2025-12-15 17:44:00 +01:00
parent ed9f31bb96
commit 8f313c00f0
126 changed files with 70589 additions and 1 deletions

401
internal/pdf/cmap/cmap.go Normal file
View File

@@ -0,0 +1,401 @@
package cmap
import (
"bytes"
"errors"
"io"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/model/textencoding"
)
// CMap represents a character code to unicode mapping used in PDF files.
// It embeds the parser that reads the raw CMap data and holds the tables that
// the parse* methods fill in.
type CMap struct {
*cMapParser
// Text encoder to look up runes from input glyph names.
// NOTE(review): nothing in this file assigns this field; parseBfchar falls
// back to "?" for glyph-name targets when it is nil.
encoder textencoding.TextEncoder
// map of character code to string (sequence of runes) for 1-4 byte codes separately.
// Index i holds the codes that are i+1 bytes long.
codeMap [4]map[uint64]string
// Value following the /CMapName entry, if one was parsed.
name string
// Value following the /CMapType entry, if one was parsed.
ctype int
// Ranges collected from begincodespacerange sections.
codespaces []codespace
}
// codespace represents a single codespace range used in the CMap.
type codespace struct {
// Number of bytes of the hex strings that delimit the range.
numBytes int
// Lower bound of the range as parsed from the first hex string.
low uint64
// Upper bound of the range as parsed from the second hex string.
high uint64
}
// Name returns the name of the CMap, i.e. the value recorded for the
// /CMapName entry during parsing (empty if none was found).
func (cmap *CMap) Name() string {
return cmap.name
}
// Type returns the type of the CMap, i.e. the integer recorded for the
// /CMapType entry during parsing (zero if none was found).
func (cmap *CMap) Type() int {
return cmap.ctype
}
// CharcodeBytesToUnicode converts a byte array of charcodes to a unicode string representation.
//
// At every position the shortest code that has a mapping wins: a 1-byte code
// is tried first, then 2, 3 and 4 bytes. When no prefix of up to four bytes is
// mapped, all bytes that were tried are skipped without producing output.
func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
	// Maximum number of possible bytes per code.
	const maxCodeLen = 4

	var out bytes.Buffer
	for pos := 0; pos < len(src); {
		// Longest candidate that still fits into the remaining input.
		limit := len(src) - pos
		if limit > maxCodeLen {
			limit = maxCodeLen
		}

		// Without a match every candidate byte is consumed.
		consumed := limit
		var code uint64
		for n := 1; n <= limit; n++ {
			code = code<<8 | uint64(src[pos+n-1])
			if mapped, ok := cmap.codeMap[n-1][code]; ok {
				out.WriteString(mapped)
				consumed = n
				break
			}
		}
		pos += consumed
	}
	return out.String()
}
// CharcodeToUnicode converts a single character code to unicode string.
// The tables for 1-byte up to 4-byte codes are consulted in that order and the
// first hit is returned; "?" is returned when no table knows the code.
// Note that CharcodeBytesToUnicode is typically more efficient.
func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
	for _, table := range cmap.codeMap {
		if mapped, ok := table[srcCode]; ok {
			return mapped
		}
	}
	// Not found.
	return "?"
}
// newCMap returns an initialized CMap: an empty codespace list and one empty
// lookup table per supported code length (1-4 bytes). Allocating all four up
// front is cheap and spares nil checks later; 1-2 byte codes are the common case.
func newCMap() *CMap {
	var tables [4]map[uint64]string
	for i := range tables {
		tables[i] = map[uint64]string{}
	}
	return &CMap{
		codespaces: []codespace{},
		codeMap:    tables,
	}
}
// LoadCmapFromData parses CMap data in memory through a byte vector and returns a CMap which
// can be used for character code to unicode conversion.
//
// The CMap is returned even when parsing fails; in that case it holds whatever
// was loaded before the error occurred.
func LoadCmapFromData(data []byte) (*CMap, error) {
	loaded := newCMap()
	loaded.cMapParser = newCMapParser(data)

	err := loaded.parse()
	return loaded, err
}
// parse parses the CMap file and loads into the CMap structure.
//
// Objects are read one at a time until the input is exhausted:
//   - the operands begincodespacerange, beginbfchar and beginbfrange hand over
//     to the matching section parser,
//   - the names /CMapName and /CMapType are followed by their value, which is
//     stored as the CMap name and type,
//   - all other operands and names are skipped, other objects are only traced.
//
// Reaching EOF, also while waiting for a name/type value, ends parsing
// without an error.
func (cmap *CMap) parse() error {
	for {
		obj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			common.Log.Debug("error parsing CMap: %v", err)
			return err
		}

		switch v := obj.(type) {
		case cmapOperand:
			common.Log.Trace("Operand: %s", v.Operand)

			var sectionErr error
			switch v.Operand {
			case begincodespacerange:
				sectionErr = cmap.parseCodespaceRange()
			case beginbfchar:
				sectionErr = cmap.parseBfchar()
			case beginbfrange:
				sectionErr = cmap.parseBfrange()
			}
			if sectionErr != nil {
				return sectionErr
			}

		case cmapName:
			switch v.Name {
			case cmapname:
				value, err := cmap.parseObject()
				if err == io.EOF {
					return nil
				}
				if err != nil {
					return err
				}
				nameObj, isName := value.(cmapName)
				if !isName {
					return errors.New("CMap name not a name")
				}
				cmap.name = nameObj.Name

			case cmaptype:
				value, err := cmap.parseObject()
				if err == io.EOF {
					return nil
				}
				if err != nil {
					return err
				}
				intObj, isInt := value.(cmapInt)
				if !isInt {
					return errors.New("CMap type not an integer")
				}
				cmap.ctype = int(intObj.val)
			}

		default:
			common.Log.Trace("Unhandled object: %T %#v", obj, obj)
		}
	}
}
// parseCodespaceRange parses the codespace range section of a CMap.
//
// The section is a sequence of <low> <high> hex string pairs terminated by the
// endcodespacerange operand. Every pair is appended to cmap.codespaces.
//
// Errors are returned for any other operand, for a bound that is not a hex
// string and for bounds of different byte length. Reaching EOF before the
// terminating operand ends the section without an error.
func (cmap *CMap) parseCodespaceRange() error {
	for {
		o, err := cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		var hexLow cmapHexString
		switch v := o.(type) {
		case cmapHexString:
			hexLow = v
		case cmapOperand:
			if v.Operand == endcodespacerange {
				return nil
			}
			return errors.New("unexpected operand")
		default:
			// Anything else (name, number, array, ...) used to slip through as
			// a zero-valued low bound; reject it like a non-hex high bound.
			return errors.New("non-hex low")
		}

		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		hexHigh, ok := o.(cmapHexString)
		if !ok {
			return errors.New("non-hex high")
		}

		if hexLow.numBytes != hexHigh.numBytes {
			return errors.New("unequal number of bytes in range")
		}

		low := hexToUint64(hexLow)
		high := hexToUint64(hexHigh)
		cspace := codespace{numBytes: hexLow.numBytes, low: low, high: high}
		cmap.codespaces = append(cmap.codespaces, cspace)

		common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
	}

	return nil
}
// parseBfchar parses a bfchar section of a CMap file.
//
// The section consists of <srcCode> target pairs terminated by the endbfchar
// operand. The target is either a hex string, decoded with hexToString, or a
// glyph name, resolved through the CMap's text encoder ("?" when there is no
// encoder or the glyph is unknown). Each pair is stored in the table that
// matches the byte length of the source code.
//
// Reaching EOF inside the section ends it without an error.
func (cmap *CMap) parseBfchar() error {
	for {
		// Src code.
		srcObj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		var (
			code    uint64
			codeLen int
		)
		switch src := srcObj.(type) {
		case cmapHexString:
			code, codeLen = hexToUint64(src), src.numBytes
		case cmapOperand:
			if src.Operand != endbfchar {
				return errors.New("unexpected operand")
			}
			return nil
		default:
			return errors.New("unexpected type")
		}

		// Target code.
		dstObj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		var mapped string
		switch dst := dstObj.(type) {
		case cmapHexString:
			mapped = hexToString(dst)
		case cmapName:
			mapped = "?"
			if cmap.encoder != nil {
				if r, ok := cmap.encoder.GlyphToRune(dst.Name); ok {
					mapped = string(r)
				}
			}
		case cmapOperand:
			if dst.Operand != endbfchar {
				return errors.New("unexpected operand")
			}
			return nil
		default:
			return errors.New("unexpected type")
		}

		// Only 1-4 byte codes have a table.
		if codeLen <= 0 || codeLen > 4 {
			return errors.New("invalid code length")
		}
		cmap.codeMap[codeLen-1][code] = mapped
	}
}
// parseBfrange parses a bfrange section of a CMap file.
//
// The section consists of triples terminated by the endbfrange operand:
//
//	<srcCodeFrom> <srcCodeTo> <target>
//
// where target is either a hex string <dstCode>, mapping [from,to] onto the
// code points [dstCode, dstCode+to-from], or an array holding one hex string
// per source code. Entries are stored in the table that matches the byte
// length of srcCodeFrom.
//
// Reaching EOF inside the section ends it without an error.
func (cmap *CMap) parseBfrange() error {
	for {
		// Src code from.
		o, err := cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		var srcCodeFrom uint64
		var numBytes int
		switch v := o.(type) {
		case cmapOperand:
			if v.Operand == endbfrange {
				return nil
			}
			return errors.New("unexpected operand")
		case cmapHexString:
			srcCodeFrom = hexToUint64(v)
			numBytes = v.numBytes
		default:
			return errors.New("unexpected type")
		}

		// Src code to.
		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		var srcCodeTo uint64
		switch v := o.(type) {
		case cmapOperand:
			if v.Operand == endbfrange {
				return nil
			}
			return errors.New("unexpected operand")
		case cmapHexString:
			srcCodeTo = hexToUint64(v)
		default:
			return errors.New("unexpected type")
		}

		// target(s).
		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		// Only 1-4 byte codes have a table.
		if numBytes <= 0 || numBytes > 4 {
			return errors.New("invalid code length")
		}
		table := cmap.codeMap[numBytes-1]

		switch v := o.(type) {
		case cmapArray:
			sc := srcCodeFrom
			for _, item := range v.Array {
				hexs, ok := item.(cmapHexString)
				if !ok {
					return errors.New("non-hex string in array")
				}
				table[sc] = hexToString(hexs)
				sc++
			}
			if sc != srcCodeTo+1 {
				return errors.New("invalid number of items in array")
			}
		case cmapHexString:
			// <srcCodeFrom> <srcCodeTo> <dstCode>, maps [from,to] to [dstCode,dstCode+to-from].
			// in hex format.
			target := hexToUint64(v)
			if srcCodeFrom <= srcCodeTo {
				for sc := srcCodeFrom; ; sc++ {
					r := target + (sc - srcCodeFrom)
					// Converting an integer to a string yields U+FFFD for
					// anything that is not a valid code point. Clamp before
					// narrowing to rune so that values above 32 bits cannot
					// wrap around into a valid one.
					if r > 0x10FFFF {
						r = 0xFFFD
					}
					table[sc] = string(rune(r))

					// Stop on the last code instead of testing sc <= srcCodeTo,
					// which can never become false when srcCodeTo is the
					// largest uint64.
					if sc == srcCodeTo {
						break
					}
				}
			}
		default:
			return errors.New("unexpected type")
		}
	}

	return nil
}

View File

@@ -0,0 +1,13 @@
package cmap
// Keywords recognised while parsing a CMap.
const (
// Operands that open and close a codespace range section.
begincodespacerange = "begincodespacerange"
endcodespacerange = "endcodespacerange"
// Operands that open and close a bfchar (single code mapping) section.
beginbfchar = "beginbfchar"
endbfchar = "endbfchar"
// Operands that open and close a bfrange (code range mapping) section.
beginbfrange = "beginbfrange"
endbfrange = "endbfrange"
// Names whose following object holds the CMap name and the CMap type.
cmapname = "CMapName"
cmaptype = "CMapType"
)

467
internal/pdf/cmap/parser.go Normal file
View File

@@ -0,0 +1,467 @@
package cmap
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"strconv"
"encoding/hex"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/common"
"gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core"
)
// cMapParser parses CMap character to unicode mapping files.
type cMapParser struct {
// Buffered reader over the CMap data; every parse method peeks at and
// consumes bytes from it.
reader *bufio.Reader
}
// newCMapParser creates a new instance of the PDF CMap parser that reads from
// the given in-memory CMap data.
func newCMapParser(content []byte) *cMapParser {
	return &cMapParser{
		reader: bufio.NewReader(bytes.NewBuffer(content)),
	}
}
// parseObject detects the signature at the current file position and parses
// the corresponding object.
//
// Leading whitespace and '%' comments are skipped. The object kind is chosen
// from the next two bytes: '/' name, '(' string, '[' array, '<<' dictionary,
// '<' hex string, a digit or "-digit" number; anything else is read as an
// operand. Two bytes of lookahead are required, so an error (io.EOF at the end
// of the data) is returned when fewer remain.
func (p *cMapParser) parseObject() (cmapObject, error) {
	p.skipSpaces()
	for {
		bb, err := p.reader.Peek(2)
		if err != nil {
			return nil, err
		}
		first, second := bb[0], bb[1]

		switch {
		case first == '%':
			// Drop the comment and look again.
			p.parseComment()
			p.skipSpaces()
		case first == '/':
			return p.parseName()
		case first == '(':
			return p.parseString()
		case first == '[':
			return p.parseArray()
		case first == '<' && second == '<':
			return p.parseDict()
		case first == '<':
			return p.parseHexString()
		case core.IsDecimalDigit(first) || (first == '-' && core.IsDecimalDigit(second)):
			number, err := p.parseNumber()
			if err != nil {
				return nil, err
			}
			return number, nil
		default:
			// Operand?
			operand, err := p.parseOperand()
			if err != nil {
				return nil, err
			}
			return operand, nil
		}
	}
}
// skipSpaces consumes whitespace up to the next non-whitespace byte and
// returns the number of bytes skipped. When the reader fails (for example at
// EOF) the error is returned together with a count of zero.
func (p *cMapParser) skipSpaces() (int, error) {
	skipped := 0
	for {
		next, err := p.reader.Peek(1)
		if err != nil {
			return 0, err
		}
		if !core.IsWhiteSpace(next[0]) {
			return skipped, nil
		}
		p.reader.ReadByte()
		skipped++
	}
}
// parseComment reads a comment line starting with '%'.
//
// Leading whitespace is skipped first. The returned text includes the '%' and
// runs up to, but does not consume, the terminating '\r' or '\n'. An error is
// returned when the first byte is not '%' or when the data ends before a line
// break is seen; the text collected so far is returned alongside it.
func (p *cMapParser) parseComment() (string, error) {
	var text bytes.Buffer

	if _, err := p.skipSpaces(); err != nil {
		return text.String(), err
	}

	for first := true; ; first = false {
		next, err := p.reader.Peek(1)
		if err != nil {
			common.Log.Debug("error %s", err.Error())
			return text.String(), err
		}
		c := next[0]

		if first && c != '%' {
			return text.String(), errors.New("comment should start with %")
		}
		if c == '\r' || c == '\n' {
			return text.String(), nil
		}

		p.reader.ReadByte()
		text.WriteByte(c)
	}
}
// parseName parses a name starting with '/'.
//
// The name ends at whitespace, at one of the bytes / [ ( ] < > or at EOF; the
// terminating byte is not consumed. A '#' followed by two hex digits is
// replaced by the byte it encodes. An error is returned when the input does
// not start with '/' or when a '#' escape is incomplete or not valid hex; the
// part of the name read so far is returned alongside it.
func (p *cMapParser) parseName() (cmapName, error) {
	name := ""
	started := false

scan:
	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			break // Can happen when loading from object stream.
		}
		if err != nil {
			return cmapName{name}, err
		}
		c := next[0]

		if !started {
			// Should always start with '/', otherwise not valid.
			if c != '/' {
				common.Log.Debug("error Name starting with %s (% x)", next, next)
				return cmapName{name}, fmt.Errorf("invalid name: (%c)", c)
			}
			started = true
			p.reader.ReadByte()
			continue
		}

		if core.IsWhiteSpace(c) {
			break
		}

		switch c {
		case '/', '[', '(', ']', '<', '>':
			// Looks like start of next statement.
			break scan
		case '#':
			escaped, err := p.reader.Peek(3)
			if err != nil {
				return cmapName{name}, err
			}
			digits := string(escaped[1:3])
			p.reader.Discard(3)

			decoded, err := hex.DecodeString(digits)
			if err != nil {
				return cmapName{name}, err
			}
			name += string(decoded)
		default:
			p.reader.ReadByte()
			name += string(c)
		}
	}

	return cmapName{name}, nil
}
// parseString parses a literal string, which starts with '(' and ends with the
// matching ')'. Unescaped parentheses nest and are kept in the result.
//
// Backslash escapes: \n \r \t \b \f \( \) \\ produce the corresponding byte, a
// backslash followed by one to three octal digits produces the byte with that
// value, and any other escaped byte is dropped. An error is returned together
// with the text read so far when the data ends before the closing ')'.
func (p *cMapParser) parseString() (cmapString, error) {
	p.reader.ReadByte() // Opening '('.

	var out bytes.Buffer
	depth := 1
	for {
		next, err := p.reader.Peek(1)
		if err != nil {
			return cmapString{out.String()}, err
		}
		c := next[0]

		if c != '\\' {
			if c == '(' {
				depth++
			} else if c == ')' {
				depth--
				if depth == 0 {
					p.reader.ReadByte()
					return cmapString{out.String()}, nil
				}
			}
			p.reader.ReadByte()
			out.WriteByte(c)
			continue
		}

		// Escape sequence: skip the backslash and look at what it escapes.
		p.reader.ReadByte()
		esc, err := p.reader.ReadByte()
		if err != nil {
			return cmapString{out.String()}, err
		}

		if !core.IsOctalDigit(esc) {
			switch esc {
			case 'n':
				out.WriteByte('\n')
			case 'r':
				out.WriteByte('\r')
			case 't':
				out.WriteByte('\t')
			case 'b':
				out.WriteByte('\b')
			case 'f':
				out.WriteByte('\f')
			case '(', ')', '\\':
				out.WriteByte(esc)
			}
			continue
		}

		// Octal '\ddd' number (base 8): up to two more digits may follow.
		ahead, err := p.reader.Peek(2)
		if err != nil {
			return cmapString{out.String()}, err
		}
		digits := []byte{esc}
		for _, d := range ahead {
			if !core.IsOctalDigit(d) {
				break
			}
			digits = append(digits, d)
		}
		p.reader.Discard(len(digits) - 1)

		common.Log.Trace("Numeric string \"%s\"", digits)
		code, err := strconv.ParseUint(string(digits), 8, 32)
		if err != nil {
			return cmapString{out.String()}, err
		}
		out.WriteByte(byte(code))
	}
}
// parseHexString parses a hex string, which starts with '<' and ends with '>'.
//
// Whitespace and any byte that is not a hex digit are ignored. An odd number
// of digits is padded with a trailing '0'. The result carries the decoded
// bytes and their count; the hex codes are not converted to characters here.
// An error is returned when the data ends before the closing '>'.
func (p *cMapParser) parseHexString() (cmapHexString, error) {
	p.reader.ReadByte() // Opening '<'.

	var digits bytes.Buffer
	for {
		p.skipSpaces()

		next, err := p.reader.Peek(1)
		if err != nil {
			return cmapHexString{numBytes: 0, b: []byte("")}, err
		}
		c := next[0]
		p.reader.ReadByte()

		if c == '>' {
			break
		}
		isHexDigit := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
		if isHexDigit {
			digits.WriteByte(c)
		}
	}

	if digits.Len()%2 == 1 {
		digits.WriteByte('0')
	}

	// Cannot fail: only hex digits were collected and their count is even.
	decoded, _ := hex.DecodeString(digits.String())
	return cmapHexString{numBytes: digits.Len() / 2, b: decoded}, nil
}
// parseArray parses an array, which starts with '[' and ends with ']' and can
// contain any kinds of direct objects. On failure the elements parsed so far
// are returned together with the error.
func (p *cMapParser) parseArray() (cmapArray, error) {
	arr := cmapArray{Array: []cmapObject{}}

	p.reader.ReadByte() // Opening '['.
	for {
		p.skipSpaces()

		next, err := p.reader.Peek(1)
		if err != nil {
			return arr, err
		}
		if next[0] == ']' {
			p.reader.ReadByte()
			return arr, nil
		}

		elem, err := p.parseObject()
		if err != nil {
			return arr, err
		}
		arr.Array = append(arr.Array, elem)
	}
}
// parseDict reads and parses a PDF dictionary object enclosed with '<<' and '>>'.
//
// Entries are /Name value pairs; a "def" keyword after the value, as written
// in CMap files, is skipped. On failure the entries parsed so far are returned
// together with the error.
func (p *cMapParser) parseDict() (cmapDict, error) {
	common.Log.Trace("Reading PDF Dict!")

	dict := makeDict()

	// Pass the '<<'
	c, _ := p.reader.ReadByte()
	if c != '<' {
		return dict, errors.New("invalid dict")
	}
	c, _ = p.reader.ReadByte()
	if c != '<' {
		return dict, errors.New("invalid dict")
	}

	for {
		p.skipSpaces()

		bb, err := p.reader.Peek(2)
		if err != nil {
			return dict, err
		}

		if (bb[0] == '>') && (bb[1] == '>') {
			p.reader.ReadByte()
			p.reader.ReadByte()
			break
		}

		key, err := p.parseName()
		common.Log.Trace("Key: %s", key.Name)
		if err != nil {
			common.Log.Debug("error Returning name err %s", err)
			return dict, err
		}

		p.skipSpaces()
		val, err := p.parseObject()
		if err != nil {
			return dict, err
		}
		dict.Dict[key.Name] = val

		// Skip "def" which optionally follows key value dict definitions in CMaps.
		p.skipSpaces()
		bb, err = p.reader.Peek(3)
		if err != nil && err != io.EOF {
			return dict, err
		}
		// A short read only means that fewer than three bytes are left, which
		// is fine when they are the closing '>>'. The check at the top of the
		// loop reports truly truncated input.
		if string(bb) == "def" {
			p.reader.Discard(3)
		}
	}

	return dict, nil
}
// parseNumber parses an integer or real number at the current position.
//
// A sign is accepted only at the very start and directly after an 'e'
// exponent marker; anywhere else it ends the number. A '.' or an 'e' makes the
// result a cmapFloat, otherwise a cmapInt is returned. Parsing stops at the
// first byte that cannot belong to the number or at EOF. The conversion error
// of strconv is returned when the collected text is not a valid number.
func (p *cMapParser) parseNumber() (cmapObject, error) {
	isFloat := false
	allowSigns := true
	numStr := bytes.Buffer{}

loop:
	for {
		bb, err := p.reader.Peek(1)
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		c := bb[0]

		switch {
		case allowSigns && (c == '-' || c == '+'):
			// Only allowed in beginning, and after e (exponential).
			allowSigns = false
		case core.IsDecimalDigit(c):
			// Once a digit was read a sign serves as a delimiter, so that
			// "12-34" is read as 12 followed by -34 rather than one token.
			allowSigns = false
		case c == '.':
			isFloat = true
			allowSigns = false
		case c == 'e':
			// Exponential number format.
			isFloat = true
			allowSigns = true
		default:
			break loop
		}

		p.reader.ReadByte()
		numStr.WriteByte(c)
	}

	if isFloat {
		fVal, err := strconv.ParseFloat(numStr.String(), 64)
		return cmapFloat{fVal}, err
	}
	intVal, err := strconv.ParseInt(numStr.String(), 10, 64)
	return cmapInt{intVal}, err
}
// parseOperand reads an operand, a text command represented by a word. The
// word ends at a delimiter, at whitespace or at EOF; the terminating byte is
// not consumed. An error is returned when no byte could be read into the word.
func (p *cMapParser) parseOperand() (cmapOperand, error) {
	var word bytes.Buffer
	for {
		next, err := p.reader.Peek(1)
		if err == io.EOF {
			break
		}
		if err != nil {
			return cmapOperand{}, err
		}

		c := next[0]
		if core.IsDelimiter(c) || core.IsWhiteSpace(c) {
			break
		}

		p.reader.ReadByte()
		word.WriteByte(c)
	}

	if word.Len() == 0 {
		return cmapOperand{}, fmt.Errorf("invalid operand (empty)")
	}
	return cmapOperand{Operand: word.String()}, nil
}

View File

@@ -0,0 +1,43 @@
package cmap
// cmapObject is any object produced by the CMap parser; it is one of the
// concrete types declared below.
type cmapObject interface {
}
// cmapName is a name object; Name holds the text after the leading '/'.
type cmapName struct {
Name string
}
// cmapOperand is an operand, i.e. a bare keyword such as "beginbfchar".
type cmapOperand struct {
Operand string
}
// cmapHexString is a hex string object written as <...>; b holds the decoded bytes.
type cmapHexString struct {
numBytes int // original number of bytes in the raw representation
b []byte
}
// cmapString is a literal string object written as (...).
type cmapString struct {
String string
}
// cmapArray is an array object written as [...]; Array holds its elements in order.
type cmapArray struct {
Array []cmapObject
}
// cmapDict is a dictionary object written as <<...>>, keyed by entry name.
type cmapDict struct {
Dict map[string]cmapObject
}
// cmapFloat is a real number object.
type cmapFloat struct {
val float64
}
// cmapInt is an integer number object.
type cmapInt struct {
val int64
}
// makeDict returns a cmapDict whose map is allocated and ready to be written to.
func makeDict() cmapDict {
	return cmapDict{Dict: make(map[string]cmapObject)}
}

View File

@@ -0,0 +1,29 @@
package cmap
import (
	"bytes"
	"unicode/utf16"
)
// hexToUint64 interprets the bytes of the hex string as one big-endian
// unsigned integer. With more than eight bytes the leading ones are shifted
// out of the result.
func hexToUint64(shex cmapHexString) uint64 {
	var acc uint64
	for i := 0; i < len(shex.b); i++ {
		acc = acc<<8 | uint64(shex.b[i])
	}
	return acc
}
// hexToString converts the bytes of a hex string into the unicode text they
// encode.
//
// The bytes are read as big-endian UTF-16 code units, <HHLL>, and surrogate
// pairs are combined so that characters outside the Basic Multilingual Plane
// are decoded too; an unpaired surrogate becomes U+FFFD. A single byte is
// taken as the code point with that value. A trailing byte that does not
// complete a 16 bit unit is ignored.
func hexToString(shex cmapHexString) string {
	var buf bytes.Buffer

	if len(shex.b) == 1 {
		buf.WriteRune(rune(shex.b[0]))
		return buf.String()
	}

	units := make([]uint16, 0, len(shex.b)/2)
	for i := 0; i+1 < len(shex.b); i += 2 {
		units = append(units, uint16(shex.b[i])<<8|uint16(shex.b[i+1]))
	}
	for _, r := range utf16.Decode(units) {
		buf.WriteRune(r)
	}
	return buf.String()
}