# HTML-link parsing regexes.
# NOTE(review): the literals were destroyed by HTML-tag stripping in transit
# (only the remnants `]*>.*?`, `]*)>(.*?)` survived); reconstructed from those
# remnants as anchor-tag patterns — confirm against the original file.
HTML_LINK_RE = re.compile(r"<a[^>]*>.*?</a>")
HTML_LINK_TEXT_RE = re.compile(r"<a([^>]*)>(.*?)</a>")
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a([^>]*)>")
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')

# Captures the language token right after an opening ``` fence,
# e.g. "```python" -> "python"; an unlabeled fence yields "".
CODE_BLOCK_LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)

# Split a source line into its code part and an optional trailing comment.
# The named groups were eaten by the same tag stripping; restored as
# `code` / `comment`. The comment group requires a space after the marker
# ("# " / "// ") so shebangs, URLs, anchors etc. are not treated as comments.
SLASHES_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*// .*)?$")
HASH_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*# .*)?$")


class MultilineCodeBlockInfo(TypedDict):
    # Language from the opening fence info-string ("" when none given)
    lang: str
    # 1-based line number of the opening fence in the source text
    start_line_no: int
    # All lines of the block, including both fence lines
    content: list[str]
""" + modified_text = text.copy() includes = extract_code_includes(text) for include in includes: - text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER - return text + modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER + return modified_text def replace_placeholders_with_code_includes( @@ -274,7 +286,7 @@ def _construct_markdown_link( link = f"[{text}]({url})" if attributes: - link += f" {{{attributes}}}" + link += f"{{{attributes}}}" return link @@ -345,7 +357,7 @@ def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]: for html_link in HTML_LINK_RE.finditer(line): link_str = html_link.group(0) - link_text_match = HTML_LINK_TEXT.match(link_str) + link_text_match = HTML_LINK_TEXT_RE.match(link_str) assert link_text_match is not None link_text = link_text_match.group(2) assert isinstance(link_text, str) @@ -442,3 +454,188 @@ def replace_html_links( ) return modified_text + + +# Multiline code blocks +# ----------------------------------------------------------------------------------------- + + +def get_code_block_lang(line: str) -> str: + match = CODE_BLOCK_LANG_RE.match(line) + if match: + return match.group(1) + return "" + + +def extract_multiline_code_blocks(text: list[str]) -> list[MultilineCodeBlockInfo]: + blocks: list[MultilineCodeBlockInfo] = [] + + in_code_block3 = False + in_code_block4 = False + current_block_lang = "" + current_block_start_line = -1 + current_block_lines = [] + + for line_no, line in enumerate(text, start=1): + stripped = line.lstrip() + + # --- Detect opening fence --- + if not (in_code_block3 or in_code_block4): + if stripped.startswith("```"): + current_block_start_line = line_no + count = len(stripped) - len(stripped.lstrip("`")) + if count == 3: + in_code_block3 = True + current_block_lang = get_code_block_lang(stripped) + current_block_lines = [line] + continue + elif count >= 4: + in_code_block4 = True + current_block_lang = get_code_block_lang(stripped) + current_block_lines = [line] + continue + + # 
--- Detect closing fence --- + elif in_code_block3: + if stripped.startswith("```"): + count = len(stripped) - len(stripped.lstrip("`")) + if count == 3: + current_block_lines.append(line) + blocks.append( + MultilineCodeBlockInfo( + lang=current_block_lang, + start_line_no=current_block_start_line, + content=current_block_lines, + ) + ) + in_code_block3 = False + current_block_lang = "" + current_block_start_line = -1 + current_block_lines = [] + continue + current_block_lines.append(line) + + elif in_code_block4: + if stripped.startswith("````"): + count = len(stripped) - len(stripped.lstrip("`")) + if count >= 4: + current_block_lines.append(line) + blocks.append( + MultilineCodeBlockInfo( + lang=current_block_lang, + start_line_no=current_block_start_line, + content=current_block_lines, + ) + ) + in_code_block4 = False + current_block_lang = "" + current_block_start_line = -1 + current_block_lines = [] + continue + current_block_lines.append(line) + + return blocks + + +def _split_hash_comment(line: str) -> tuple[str, str | None]: + match = HASH_COMMENT_RE.match(line) + if match: + code = match.group("code").rstrip() + comment = match.group("comment") + return code, comment + return line.rstrip(), None + + +def _split_slashes_comment(line: str) -> tuple[str, str | None]: + match = SLASHES_COMMENT_RE.match(line) + if match: + code = match.group("code").rstrip() + comment = match.group("comment") + return code, comment + return line, None + + +def replace_multiline_code_block( + block_a: MultilineCodeBlockInfo, block_b: MultilineCodeBlockInfo +) -> list[str]: + """ + Replace multiline code block a with block b leaving comments intact. + + Syntax of comments depends on the language of the code block. + Raises ValueError if the blocks are not compatible (different languages or different number of lines). 
+ """ + + if block_a["lang"] != block_b["lang"]: + raise ValueError("Code blocks have different languages") + if len(block_a["content"]) != len(block_b["content"]): + raise ValueError("Code blocks have different number of lines") + + block_language = block_a["lang"].lower() + if block_language in {"mermaid"}: + return block_a["content"].copy() # We don't handle mermaid code blocks for now + + code_block: list[str] = [] + for line_a, line_b in zip(block_a["content"], block_b["content"]): + line_a_comment: str | None = None + line_b_comment: str | None = None + + # Handle comments based on language + if block_language in { + "python", + "py", + "sh", + "bash", + "dockerfile", + "requirements", + "gitignore", + "toml", + "yaml", + "yml", + }: + _line_a_code, line_a_comment = _split_hash_comment(line_a) + line_b_code, line_b_comment = _split_hash_comment(line_b) + res_line = line_b + if line_b_comment: + res_line = res_line.replace(line_b_comment, line_a_comment, 1) + code_block.append(res_line) + elif block_language in {"console", "json"}: + _line_a_code, line_a_comment = _split_slashes_comment(line_a) + line_b_code, line_b_comment = _split_slashes_comment(line_b) + res_line = line_b + if line_b_comment: + print(f"Replacing comment: {line_b_comment} with {line_a_comment}") + res_line = res_line.replace(line_b_comment, line_a_comment, 1) + print(f"Resulting line: {res_line}") + code_block.append(res_line) + else: + code_block.append(line_b) + + return code_block + + +def replace_multiline_code_blocks_in_text( + text: list[str], + code_blocks: list[MultilineCodeBlockInfo], + original_code_blocks: list[MultilineCodeBlockInfo], +) -> list[MultilineCodeBlockInfo]: + """ + Update each code block in `text` with the corresponding code block from + `original_code_blocks` with comments taken from `code_blocks`. + + Raises ValueError if the number, language, or shape of code blocks do not match. 
+ """ + + if len(code_blocks) != len(original_code_blocks): + raise ValueError( + "Number of code blocks does not match the number of original code blocks" + ) + + modified_text = text.copy() + for block, original_block in zip(code_blocks, original_code_blocks): + updated_content = replace_multiline_code_block(block, original_block) + + start_line_index = block["start_line_no"] - 1 + for i, updated_line in enumerate(updated_content): + modified_text[start_line_index + i] = updated_line + + return modified_text diff --git a/scripts/translation_fixer.py b/scripts/translation_fixer.py index c1f036dcc..ed9c2969a 100644 --- a/scripts/translation_fixer.py +++ b/scripts/translation_fixer.py @@ -1,3 +1,6 @@ +import difflib +import os +from collections.abc import Iterable from pathlib import Path from typing import Annotated @@ -8,13 +11,27 @@ from scripts.doc_parsing_utils import ( extract_header_permalinks, extract_html_links, extract_markdown_links, + extract_multiline_code_blocks, replace_code_includes_with_placeholders, replace_header_permalinks, replace_html_links, replace_markdown_links, + replace_multiline_code_blocks_in_text, replace_placeholders_with_code_includes, ) +non_translated_sections = ( + f"reference{os.sep}", + "release-notes.md", + "fastapi-people.md", + "external-links.md", + "newsletter.md", + "management-tasks.md", + "management.md", + "contributing.md", +) + + cli = typer.Typer() @@ -23,6 +40,53 @@ def callback(): pass +def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]: + """ + Iterate on the markdown files to translate in order of priority. 
+ """ + + first_dirs = [ + lang_path_root / "learn", + lang_path_root / "tutorial", + lang_path_root / "advanced", + lang_path_root / "about", + lang_path_root / "how-to", + ] + first_parent = lang_path_root + yield from first_parent.glob("*.md") + for dir_path in first_dirs: + yield from dir_path.rglob("*.md") + first_dirs_str = tuple(str(d) for d in first_dirs) + for path in lang_path_root.rglob("*.md"): + if str(path).startswith(first_dirs_str): + continue + if path.parent == first_parent: + continue + yield path + + +def get_all_paths(lang: str): + res: list[str] = [] + lang_docs_root = Path("docs") / lang / "docs" + for path in iter_all_lang_paths(lang_docs_root): + relpath = path.relative_to(lang_docs_root) + if not str(relpath).startswith(non_translated_sections): + res.append(str(relpath)) + return res + + +@cli.command() +def fix_all(ctx: typer.Context, language: str): + docs = get_all_paths(language) + + for page in docs: + doc_path = Path("docs") / language / "docs" / page + try: + fix_pages(doc_paths=[doc_path]) + except ValueError as e: + print(f"Error processing {doc_path}: {e}") + + @cli.command() def fix_pages( doc_paths: Annotated[ @@ -49,6 +113,11 @@ def fix_pages( ) if fixed_doc_lines != doc_lines: print(f"Fixing code includes in: {path}") + diff = difflib.unified_diff( + doc_lines, fixed_doc_lines, fromfile="translation", tofile="fixed" + ) + print("\n".join(diff)) + doc_lines = fixed_doc_lines # Fix permalinks @@ -75,7 +144,14 @@ def fix_pages( doc_lines = fixed_doc_lines # Fix multiline code blocks - # TODO: Implement + en_code_blocks = extract_multiline_code_blocks(en_doc_lines) + doc_code_blocks = extract_multiline_code_blocks(doc_lines) + fixed_doc_lines = replace_multiline_code_blocks_in_text( + doc_lines, doc_code_blocks, en_code_blocks + ) + if fixed_doc_lines != doc_lines: + print(f"Fixing multiline code blocks in: {path}") + doc_lines = fixed_doc_lines # Write back the fixed document doc_lines.append("") # Ensure file ends with a 
newline