diff --git a/scripts/doc_parsing_utils.py b/scripts/doc_parsing_utils.py
index ff9033660..ee6f00738 100644
--- a/scripts/doc_parsing_utils.py
+++ b/scripts/doc_parsing_utils.py
@@ -20,10 +20,15 @@ MARKDOWN_LINK_RE = re.compile(
)
HTML_LINK_RE = re.compile(r"<a [^>]*>.*?</a>")
-HTML_LINK_TEXT = re.compile(r"<a ([^>]*)>(.*?)</a>")
+HTML_LINK_TEXT_RE = re.compile(r"<a ([^>]*)>(.*?)</a>")
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a ([^>]*)>")
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
+CODE_BLOCK_LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)
+
+SLASHES_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*// .*)?$")
+HASH_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*# .*)?$")
+
class CodeIncludeInfo(TypedDict):
line_no: int
@@ -57,6 +62,12 @@ class HtmlLinkInfo(TypedDict):
text: str
+class MultilineCodeBlockInfo(TypedDict):
+ lang: str
+ start_line_no: int
+ content: list[str]
+
+
# Code includes
# -----------------------------------------------------------------------------------------
@@ -82,10 +93,11 @@ def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
Replace code includes with placeholders.
"""
+ modified_text = text.copy()
includes = extract_code_includes(text)
for include in includes:
- text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
- return text
+ modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
+ return modified_text
def replace_placeholders_with_code_includes(
@@ -274,7 +286,7 @@ def _construct_markdown_link(
link = f"[{text}]({url})"
if attributes:
- link += f" {{{attributes}}}"
+ link += f"{{{attributes}}}"
return link
@@ -345,7 +357,7 @@ def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
for html_link in HTML_LINK_RE.finditer(line):
link_str = html_link.group(0)
- link_text_match = HTML_LINK_TEXT.match(link_str)
+ link_text_match = HTML_LINK_TEXT_RE.match(link_str)
assert link_text_match is not None
link_text = link_text_match.group(2)
assert isinstance(link_text, str)
@@ -442,3 +454,188 @@ def replace_html_links(
)
return modified_text
+
+
+# Multiline code blocks
+# -----------------------------------------------------------------------------------------
+
+
+def get_code_block_lang(line: str) -> str:
+ match = CODE_BLOCK_LANG_RE.match(line)
+ if match:
+ return match.group(1)
+ return ""
+
+
+def extract_multiline_code_blocks(text: list[str]) -> list[MultilineCodeBlockInfo]:
+ blocks: list[MultilineCodeBlockInfo] = []
+
+ in_code_block3 = False
+ in_code_block4 = False
+ current_block_lang = ""
+ current_block_start_line = -1
+ current_block_lines = []
+
+ for line_no, line in enumerate(text, start=1):
+ stripped = line.lstrip()
+
+ # --- Detect opening fence ---
+ if not (in_code_block3 or in_code_block4):
+ if stripped.startswith("```"):
+ current_block_start_line = line_no
+ count = len(stripped) - len(stripped.lstrip("`"))
+ if count == 3:
+ in_code_block3 = True
+ current_block_lang = get_code_block_lang(stripped)
+ current_block_lines = [line]
+ continue
+ elif count >= 4:
+ in_code_block4 = True
+ current_block_lang = get_code_block_lang(stripped)
+ current_block_lines = [line]
+ continue
+
+ # --- Detect closing fence ---
+ elif in_code_block3:
+ if stripped.startswith("```"):
+ count = len(stripped) - len(stripped.lstrip("`"))
+ if count == 3:
+ current_block_lines.append(line)
+ blocks.append(
+ MultilineCodeBlockInfo(
+ lang=current_block_lang,
+ start_line_no=current_block_start_line,
+ content=current_block_lines,
+ )
+ )
+ in_code_block3 = False
+ current_block_lang = ""
+ current_block_start_line = -1
+ current_block_lines = []
+ continue
+ current_block_lines.append(line)
+
+ elif in_code_block4:
+ if stripped.startswith("````"):
+ count = len(stripped) - len(stripped.lstrip("`"))
+ if count >= 4:
+ current_block_lines.append(line)
+ blocks.append(
+ MultilineCodeBlockInfo(
+ lang=current_block_lang,
+ start_line_no=current_block_start_line,
+ content=current_block_lines,
+ )
+ )
+ in_code_block4 = False
+ current_block_lang = ""
+ current_block_start_line = -1
+ current_block_lines = []
+ continue
+ current_block_lines.append(line)
+
+ return blocks
+
+
+def _split_hash_comment(line: str) -> tuple[str, str | None]:
+ match = HASH_COMMENT_RE.match(line)
+ if match:
+ code = match.group("code").rstrip()
+ comment = match.group("comment")
+ return code, comment
+ return line.rstrip(), None
+
+
+def _split_slashes_comment(line: str) -> tuple[str, str | None]:
+ match = SLASHES_COMMENT_RE.match(line)
+ if match:
+ code = match.group("code").rstrip()
+ comment = match.group("comment")
+ return code, comment
+ return line, None
+
+
+def replace_multiline_code_block(
+ block_a: MultilineCodeBlockInfo, block_b: MultilineCodeBlockInfo
+) -> list[str]:
+ """
+ Replace multiline code block a with block b leaving comments intact.
+
+ Syntax of comments depends on the language of the code block.
+ Raises ValueError if the blocks are not compatible (different languages or different number of lines).
+ """
+
+ if block_a["lang"] != block_b["lang"]:
+ raise ValueError("Code blocks have different languages")
+ if len(block_a["content"]) != len(block_b["content"]):
+ raise ValueError("Code blocks have different number of lines")
+
+ block_language = block_a["lang"].lower()
+ if block_language in {"mermaid"}:
+ return block_a["content"].copy() # We don't handle mermaid code blocks for now
+
+ code_block: list[str] = []
+ for line_a, line_b in zip(block_a["content"], block_b["content"]):
+ line_a_comment: str | None = None
+ line_b_comment: str | None = None
+
+ # Handle comments based on language
+ if block_language in {
+ "python",
+ "py",
+ "sh",
+ "bash",
+ "dockerfile",
+ "requirements",
+ "gitignore",
+ "toml",
+ "yaml",
+ "yml",
+ }:
+ _line_a_code, line_a_comment = _split_hash_comment(line_a)
+ line_b_code, line_b_comment = _split_hash_comment(line_b)
+ res_line = line_b
+ if line_b_comment:
+ res_line = res_line.replace(line_b_comment, line_a_comment, 1)
+ code_block.append(res_line)
+ elif block_language in {"console", "json"}:
+ _line_a_code, line_a_comment = _split_slashes_comment(line_a)
+ line_b_code, line_b_comment = _split_slashes_comment(line_b)
+ res_line = line_b
+ if line_b_comment:
+ print(f"Replacing comment: {line_b_comment} with {line_a_comment}")
+ res_line = res_line.replace(line_b_comment, line_a_comment, 1)
+ print(f"Resulting line: {res_line}")
+ code_block.append(res_line)
+ else:
+ code_block.append(line_b)
+
+ return code_block
+
+
+def replace_multiline_code_blocks_in_text(
+ text: list[str],
+ code_blocks: list[MultilineCodeBlockInfo],
+ original_code_blocks: list[MultilineCodeBlockInfo],
+) -> list[str]:
+ """
+ Update each code block in `text` with the corresponding code block from
+ `original_code_blocks` with comments taken from `code_blocks`.
+
+ Raises ValueError if the number, language, or shape of code blocks do not match.
+ """
+
+ if len(code_blocks) != len(original_code_blocks):
+ raise ValueError(
+ "Number of code blocks does not match the number of original code blocks"
+ )
+
+ modified_text = text.copy()
+ for block, original_block in zip(code_blocks, original_code_blocks):
+ updated_content = replace_multiline_code_block(block, original_block)
+
+ start_line_index = block["start_line_no"] - 1
+ for i, updated_line in enumerate(updated_content):
+ modified_text[start_line_index + i] = updated_line
+
+ return modified_text
diff --git a/scripts/translation_fixer.py b/scripts/translation_fixer.py
index c1f036dcc..ed9c2969a 100644
--- a/scripts/translation_fixer.py
+++ b/scripts/translation_fixer.py
@@ -1,3 +1,6 @@
+import difflib
+import os
+from collections.abc import Iterable
from pathlib import Path
from typing import Annotated
@@ -8,13 +11,27 @@ from scripts.doc_parsing_utils import (
extract_header_permalinks,
extract_html_links,
extract_markdown_links,
+ extract_multiline_code_blocks,
replace_code_includes_with_placeholders,
replace_header_permalinks,
replace_html_links,
replace_markdown_links,
+ replace_multiline_code_blocks_in_text,
replace_placeholders_with_code_includes,
)
+non_translated_sections = (
+ f"reference{os.sep}",
+ "release-notes.md",
+ "fastapi-people.md",
+ "external-links.md",
+ "newsletter.md",
+ "management-tasks.md",
+ "management.md",
+ "contributing.md",
+)
+
+
cli = typer.Typer()
@@ -23,6 +40,53 @@ def callback():
pass
+def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
+ """
+ Iterate on the markdown files to translate in order of priority.
+ """
+
+ first_dirs = [
+ lang_path_root / "learn",
+ lang_path_root / "tutorial",
+ lang_path_root / "advanced",
+ lang_path_root / "about",
+ lang_path_root / "how-to",
+ ]
+ first_parent = lang_path_root
+ yield from first_parent.glob("*.md")
+ for dir_path in first_dirs:
+ yield from dir_path.rglob("*.md")
+ first_dirs_str = tuple(str(d) for d in first_dirs)
+ for path in lang_path_root.rglob("*.md"):
+ if str(path).startswith(first_dirs_str):
+ continue
+ if path.parent == first_parent:
+ continue
+ yield path
+
+
+def get_all_paths(lang: str):
+ res: list[str] = []
+ lang_docs_root = Path("docs") / lang / "docs"
+ for path in iter_all_lang_paths(lang_docs_root):
+ relpath = path.relative_to(lang_docs_root)
+ if not str(relpath).startswith(non_translated_sections):
+ res.append(str(relpath))
+ return res
+
+
+@cli.command()
+def fix_all(ctx: typer.Context, language: str):
+ docs = get_all_paths(language)
+
+ for page in docs:
+ doc_path = Path("docs") / language / "docs" / page
+ try:
+ fix_pages(doc_paths=[doc_path])
+ except ValueError as e:
+ print(f"Error processing {doc_path}: {e}")
+
+
@cli.command()
def fix_pages(
doc_paths: Annotated[
@@ -49,6 +113,11 @@ def fix_pages(
)
if fixed_doc_lines != doc_lines:
print(f"Fixing code includes in: {path}")
+ diff = difflib.unified_diff(
+ doc_lines, fixed_doc_lines, fromfile="translation", tofile="fixed"
+ )
+ print("\n".join(diff))
+
doc_lines = fixed_doc_lines
# Fix permalinks
@@ -75,7 +144,14 @@ def fix_pages(
doc_lines = fixed_doc_lines
# Fix multiline code blocks
- # TODO: Implement
+ en_code_blocks = extract_multiline_code_blocks(en_doc_lines)
+ doc_code_blocks = extract_multiline_code_blocks(doc_lines)
+ fixed_doc_lines = replace_multiline_code_blocks_in_text(
+ doc_lines, doc_code_blocks, en_code_blocks
+ )
+ if fixed_doc_lines != doc_lines:
+ print(f"Fixing multiline code blocks in: {path}")
+ doc_lines = fixed_doc_lines
# Write back the fixed document
doc_lines.append("") # Ensure file ends with a newline