From c87221a4e66140225b4dfba0d2e2b985b703f838 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 23 May 2021 01:14:15 -0700 Subject: [PATCH] validation: add proper list of languages supported by hocr Based on Latin-1 support in default PDF fonts. --- src/ocrmypdf/_validation.py | 55 ++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index ab48195e..cd55ef73 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -37,7 +37,60 @@ from ocrmypdf.subprocess import check_external_program # ------------- # External dependencies -HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por']) +# According to Wikipedia these languages are supported in the ISO-8859-1 character +# set, meaning reportlab can generate them and they are compatible with hocr, +# assuming Tesseract has the necessary languages installed. Note that there may +# not be language packs for them. 
+HOCR_OK_LANGS = frozenset( + [ + # Languages fully covered by Latin-1: + 'afr', # Afrikaans + 'alb', # Albanian + 'ast', # Leonese + 'baq', # Basque + 'bre', # Breton + 'cos', # Corsican + 'eng', # English + 'eus', # Basque + 'fao', # Faroese + 'gla', # Scottish Gaelic + 'glg', # Galician + 'glv', # Manx + 'ice', # Icelandic + 'ind', # Indonesian + 'isl', # Icelandic + 'ita', # Italian + 'ltz', # Luxembourgish + 'mal', # Malay Rumi (NOTE(review): ISO 639-2 'mal' appears to be Malayalam; Malay is 'may'/'msa'; confirm) + 'mga', # Irish (NOTE(review): ISO 639-2 'mga' appears to be Middle Irish; modern Irish is 'gle'; confirm) + 'nor', # Norwegian + 'oci', # Occitan + 'por', # Portuguese + 'roh', # Romansh + 'sco', # Scots + 'sma', # Sami + 'spa', # Spanish + 'sqi', # Albanian + 'swa', # Swahili + 'swe', # Swedish + 'tgl', # Tagalog + 'wln', # Walloon + # Languages supported by Latin-1 except for a few rare characters that OCR + # is probably not trained to recognize anyway: + 'cat', # Catalan + 'cym', # Welsh + 'dan', # Danish + 'deu', # German + 'dut', # Dutch + 'est', # Estonian + 'fin', # Finnish + 'fra', # French + 'hun', # Hungarian + 'kur', # Kurdish + 'nld', # Dutch + 'wel', # Welsh + ] +) DEFAULT_LANGUAGE = 'eng' # Enforce English hegemony log = logging.getLogger(__name__)