From fd593eb5e9bbd8d8e129a4ca7d95b8753ddd2cf0 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 24 Dec 2023 01:24:44 -0800 Subject: [PATCH] Reversing character order for RTL helps output --- src/ocrmypdf/hocrtransform/_hocr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ocrmypdf/hocrtransform/_hocr.py b/src/ocrmypdf/hocrtransform/_hocr.py index efa39598..75c699e5 100644 --- a/src/ocrmypdf/hocrtransform/_hocr.py +++ b/src/ocrmypdf/hocrtransform/_hocr.py @@ -367,10 +367,15 @@ class HocrTransform: self._debug_draw_word_bbox(canvas, box) # If this word is 0 units wide, our best bet seems to be to suppress this text + if text_direction == TextDirection.RTL: + log.info("RTL: %s", elemtxt) if font_width > 0: text.text_transform(Matrix(1, 0, 0, -1, box.llx, 0)) text.horiz_scale(100 * box.width / font_width) - text.show(self._font.text_encode(elemtxt)) + if text_direction == TextDirection.LTR: + text.show(self._font.text_encode(elemtxt)) + elif text_direction == TextDirection.RTL: + text.show(self._font.text_encode(elemtxt[::-1])) # Get coordinates of the next word (if there is one) hocr_next_box = (