From c87221a4e66140225b4dfba0d2e2b985b703f838 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <james@purplerock.ca>
Date: Sun, 23 May 2021 01:14:15 -0700
Subject: [PATCH] validation: add proper list of languages supported by hocr

Based on Latin-1 support in default PDF fonts.
---
 src/ocrmypdf/_validation.py | 55 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py
index ab48195e..cd55ef73 100644
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -37,7 +37,60 @@ from ocrmypdf.subprocess import check_external_program
 # -------------
 # External dependencies
 
-HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
+# According to Wikipedia these languages can be written with the ISO-8859-1
+# (Latin-1) character set, meaning reportlab can generate them and they are
+# compatible with hocr, assuming Tesseract has the necessary languages installed.
+# Note that Tesseract may not provide a language pack for every code listed.
+HOCR_OK_LANGS = frozenset(
+    [
+        # Languages fully covered by Latin-1:
+        'afr',  # Afrikaans
+        'alb',  # Albanian
+        'ast',  # Asturian (Leonese)
+        'baq',  # Basque
+        'bre',  # Breton
+        'cos',  # Corsican
+        'eng',  # English
+        'eus',  # Basque
+        'fao',  # Faroese
+        'gla',  # Scottish Gaelic
+        'gle',  # Irish (not 'mga', which is Middle Irish)
+        'glg',  # Galician
+        'glv',  # Manx
+        'ice',  # Icelandic
+        'ind',  # Indonesian
+        'isl',  # Icelandic
+        'ita',  # Italian
+        'ltz',  # Luxembourgish
+        'msa',  # Malay Rumi (not 'mal', which is Malayalam)
+        'nor',  # Norwegian
+        'oci',  # Occitan
+        'por',  # Portuguese
+        'roh',  # Romansh
+        'sco',  # Scots
+        'sma',  # Southern Sami
+        'spa',  # Spanish
+        'sqi',  # Albanian
+        'swa',  # Swahili
+        'swe',  # Swedish
+        'tgl',  # Tagalog
+        'wln',  # Walloon
+        # Languages supported by Latin-1 except for a few rare characters that OCR
+        # is probably not trained to recognize anyway:
+        'cat',  # Catalan
+        'cym',  # Welsh
+        'dan',  # Danish
+        'deu',  # German
+        'dut',  # Dutch
+        'est',  # Estonian
+        'fin',  # Finnish
+        'fra',  # French
+        'hun',  # Hungarian
+        'kur',  # Kurdish
+        'nld',  # Dutch
+        'wel',  # Welsh
+    ]
+)
 DEFAULT_LANGUAGE = 'eng'  # Enforce English hegemony
 
 log = logging.getLogger(__name__)