mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
validation: add proper list of languages supported by hocr
Based on Latin-1 support in default PDF fonts.
This commit is contained in:
@@ -37,7 +37,60 @@ from ocrmypdf.subprocess import check_external_program
|
||||
# -------------
|
||||
# External dependencies
|
||||
|
||||
HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
|
||||
# According to Wikipedia these languages are supported in the ISO-8859-1 character
|
||||
# set, meaning reportlab can generate them and they are compatible with hocr,
|
||||
# assuming Tesseract has the necessary languages installed. Note that there may
|
||||
# not be language packs for them.
|
||||
HOCR_OK_LANGS = frozenset(
|
||||
[
|
||||
# Languages fully covered by Latin-1:
|
||||
'afr', # Afrikaans
|
||||
'alb', # Albanian
|
||||
'ast', # Leonese
|
||||
'baq', # Basque
|
||||
'bre', # Breton
|
||||
'cos', # Corsican
|
||||
'eng', # English
|
||||
'eus', # Basque
|
||||
'fao', # Faroese
|
||||
'gla', # Scottish Gaelic
|
||||
'glg', # Galician
|
||||
'glv', # Manx
|
||||
'ice', # Icelandic
|
||||
'ind', # Indonesian
|
||||
'isl', # Icelandic
|
||||
'ita', # Italian
|
||||
'ltz', # Luxembourgish
|
||||
'mal', # Malay Rumi -- NOTE(review): ISO 639-2 'mal' is Malayalam; Malay is 'msa'/'may' -- confirm intended code
|
||||
'mga', # Irish -- NOTE(review): ISO 639 'mga' is Middle Irish; modern Irish is 'gle' -- confirm intended code
|
||||
'nor', # Norwegian
|
||||
'oci', # Occitan
|
||||
'por', # Portuguese
|
||||
'roh', # Romansh
|
||||
'sco', # Scots
|
||||
'sma', # Sami
|
||||
'spa', # Spanish
|
||||
'sqi', # Albanian
|
||||
'swa', # Swahili
|
||||
'swe', # Swedish
|
||||
'tgl', # Tagalog
|
||||
'wln', # Walloon
|
||||
# Languages supported by Latin-1 except for a few rare characters that OCR
|
||||
# is probably not trained to recognize anyway:
|
||||
'cat', # Catalan
|
||||
'cym', # Welsh
|
||||
'dan', # Danish
|
||||
'deu', # German
|
||||
'dut', # Dutch
|
||||
'est', # Estonian
|
||||
'fin', # Finnish
|
||||
'fra', # French
|
||||
'hun', # Hungarian
|
||||
'kur', # Kurdish
|
||||
'nld', # Dutch
|
||||
'wel', # Welsh
|
||||
]
|
||||
)
|
||||
DEFAULT_LANGUAGE = 'eng' # Enforce English hegemony
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
Reference in New Issue
Block a user