package contentstream import ( "bytes" "fmt" "gitea.tecamino.com/paadi/pdfmerge/internal/pdf/core" ) type ContentStreamOperation struct { Params []core.PdfObject Operand string } type ContentStreamOperations []*ContentStreamOperation // Check if the content stream operations are fully wrapped (within q ... Q) func (s *ContentStreamOperations) isWrapped() bool { if len(*s) < 2 { return false } depth := 0 for _, op := range *s { switch op.Operand { case "q": depth++ case "Q": depth-- default: if depth < 1 { return false } } } // Should end at depth == 0 return depth == 0 } // Wrap entire contents within q ... Q. If unbalanced, then adds extra Qs at the end. // Only does if needed. Ensures that when adding new content, one start with all states // in the default condition. func (s *ContentStreamOperations) WrapIfNeeded() *ContentStreamOperations { if len(*s) == 0 { // No need to wrap if empty. return s } if s.isWrapped() { return s } *s = append([]*ContentStreamOperation{{Operand: "q"}}, *s...) depth := 0 for _, op := range *s { switch op.Operand { case "q": depth++ case "Q": depth-- } } for depth > 0 { *s = append(*s, &ContentStreamOperation{Operand: "Q"}) depth-- } return s } // Convert a set of content stream operations to a content stream byte presentation, i.e. the kind that can be // stored as a PDF stream or string format. func (s *ContentStreamOperations) Bytes() []byte { var buf bytes.Buffer for _, op := range *s { if op == nil { continue } if op.Operand == "BI" { // Inline image requires special handling. buf.WriteString(op.Operand + "\n") buf.WriteString(op.Params[0].DefaultWriteString()) } else { // Default handler. for _, param := range op.Params { buf.WriteString(param.DefaultWriteString()) buf.WriteString(" ") } buf.WriteString(op.Operand + "\n") } } return buf.Bytes() } // ExtractText parses and extracts all text data in content streams and returns as a string. // Does not take into account Encoding table, the output is simply the character codes. // // Deprecated: More advanced text extraction is offered in package extractor with character encoding support. func (s *ContentStreamParser) ExtractText() (string, error) { operations, err := s.Parse() if err != nil { return "", err } inText := false xPos, yPos := float64(-1), float64(-1) txt := "" for _, op := range *operations { switch op.Operand { case "BT": inText = true case "ET": inText = false } if op.Operand == "Td" || op.Operand == "TD" || op.Operand == "T*" { // Move to next line... txt += "\n" } if op.Operand == "Tm" { if len(op.Params) != 6 { continue } xfloat, ok := op.Params[4].(*core.PdfObjectFloat) if !ok { xint, ok := op.Params[4].(*core.PdfObjectInteger) if !ok { continue } xfloat = core.MakeFloat(float64(*xint)) } yfloat, ok := op.Params[5].(*core.PdfObjectFloat) if !ok { yint, ok := op.Params[5].(*core.PdfObjectInteger) if !ok { continue } yfloat = core.MakeFloat(float64(*yint)) } if yPos == -1 { yPos = float64(*yfloat) } else if yPos > float64(*yfloat) { txt += "\n" xPos = float64(*xfloat) yPos = float64(*yfloat) continue } if xPos == -1 { xPos = float64(*xfloat) } else if xPos < float64(*xfloat) { txt += "\t" xPos = float64(*xfloat) } } if inText && op.Operand == "TJ" { if len(op.Params) < 1 { continue } paramList, ok := op.Params[0].(*core.PdfObjectArray) if !ok { return "", fmt.Errorf("invalid parameter type, no array (%T)", op.Params[0]) } for _, obj := range *paramList { switch v := obj.(type) { case *core.PdfObjectString: txt += string(*v) case *core.PdfObjectFloat: if *v < -100 { txt += " " } case *core.PdfObjectInteger: if *v < -100 { txt += " " } } } } else if inText && op.Operand == "Tj" { if len(op.Params) < 1 { continue } param, ok := op.Params[0].(*core.PdfObjectString) if !ok { return "", fmt.Errorf("invalid parameter type, not string (%T)", op.Params[0]) } txt += string(*param) } } return txt, nil }