@@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
6161 pageText := & PageText {}
6262 state := newTextState ()
6363 fontStack := fontStacker {}
64- var to * textObject
64+ to := newTextObject (e , resources , contentstream.GraphicsState {}, & state , & fontStack )
65+ var inTextObj bool
6566
6667 cstreamParser := contentstream .NewContentStreamParser (contents )
6768 operations , err := cstreamParser .Parse ()
@@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
102103 state .tfont = fontStack .pop ()
103104 }
104105 case "BT" : // Begin text
105- // Begin a text object, initializing the text matrix, Tm, and the text line matrix,
106- // Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
107- // not appear before an ET.
108- if to != nil {
106+ // Begin a text object, initializing the text matrix, Tm, and
107+ // the text line matrix, Tlm, to the identity matrix. Text
108+ // objects shall not be nested. A second BT shall not appear
109+ // before an ET. However, if that happens, all existing marks
110+ // are added to the page marks, in order to avoid losing content.
111+ if inTextObj {
109112 common .Log .Debug ("BT called while in a text object" )
113+ pageText .marks = append (pageText .marks , to .marks ... )
110114 }
115+ inTextObj = true
111116 to = newTextObject (e , resources , gs , & state , & fontStack )
112117 case "ET" : // End Text
118+ // End text object, discarding text matrix. If the current
119+ // text object contains text marks, they are added to the
120+ // page text marks collection.
121+ // The ET operator should always have a matching BT operator.
122+ // However, if ET appears outside of a text object, the behavior
123+ // does not change: the text matrices are discarded and all
124+ // existing marks in the text object are added to the page marks.
125+ if ! inTextObj {
126+ common .Log .Debug ("ET called outside of a text object" )
127+ }
128+ inTextObj = false
113129 pageText .marks = append (pageText .marks , to .marks ... )
114- to = nil
130+ to . reset ()
115131 case "T*" : // Move to start of next text line
116132 to .nextLine ()
117133 case "Td" : // Move text location
@@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
202218 }
203219 to .setCharSpacing (y )
204220 case "Tf" : // Set font.
205- if to == nil {
206- // This is needed for 26-Hazard-Thermal-environment.pdf
207- to = newTextObject (e , resources , gs , & state , & fontStack )
208- }
209221 if ok , err := to .checkOp (op , 2 , true ); ! ok {
210222 common .Log .Debug ("ERROR: Tf err=%v" , err )
211223 return err
@@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst
659671 }
660672}
661673
674+ // reset sets the text matrix `Tm` and the text line matrix `Tlm` of the text
675+ // object to the identity matrix. In addition, the marks collection is cleared.
676+ func (to * textObject ) reset () {
677+ to .tm = transform .IdentityMatrix ()
678+ to .tlm = transform .IdentityMatrix ()
679+ to .marks = nil
680+ }
681+
662682// renderText processes and renders byte array `data` for extraction purposes.
663683func (to * textObject ) renderText (data []byte ) error {
664684 font := to .getCurrentFont ()
@@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) {
12051225 if pt .marks [i - 1 ].orient != pt .marks [i ].orient {
12061226 cluster ++
12071227 } else {
1208- if pt .marks [i - 1 ].orientedStart .Y - pt .marks [i ].orientedStart .Y > tol {
1228+ if pt .marks [i - 1 ].orientedStart .Y - pt .marks [i ].orientedStart .Y > tol {
12091229 cluster ++
12101230 }
12111231 }
0 commit comments