Handle code blocks, fix some bugs, add fix-all command

This commit is contained in:
Yurii Motov
2025-12-30 17:44:57 +01:00
parent 0339277673
commit beff498743
2 changed files with 279 additions and 6 deletions

View File

@@ -20,10 +20,15 @@ MARKDOWN_LINK_RE = re.compile(
)
HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
HTML_LINK_TEXT_RE = re.compile(r"<a\b([^>]*)>(.*?)</a>")
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
CODE_BLOCK_LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)
SLASHES_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*// .*)?$")
HASH_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*# .*)?$")
class CodeIncludeInfo(TypedDict):
line_no: int
@@ -57,6 +62,12 @@ class HtmlLinkInfo(TypedDict):
text: str
class MultilineCodeBlockInfo(TypedDict):
lang: str
start_line_no: int
content: list[str]
# Code includes
# -----------------------------------------------------------------------------------------
@@ -82,10 +93,11 @@ def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
Replace code includes with placeholders.
"""
modified_text = text.copy()
includes = extract_code_includes(text)
for include in includes:
text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
return text
modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
return modified_text
def replace_placeholders_with_code_includes(
@@ -274,7 +286,7 @@ def _construct_markdown_link(
link = f"[{text}]({url})"
if attributes:
link += f" {{{attributes}}}"
link += f"{{{attributes}}}"
return link
@@ -345,7 +357,7 @@ def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
for html_link in HTML_LINK_RE.finditer(line):
link_str = html_link.group(0)
link_text_match = HTML_LINK_TEXT.match(link_str)
link_text_match = HTML_LINK_TEXT_RE.match(link_str)
assert link_text_match is not None
link_text = link_text_match.group(2)
assert isinstance(link_text, str)
@@ -442,3 +454,188 @@ def replace_html_links(
)
return modified_text
# Multiline code blocks
# -----------------------------------------------------------------------------------------
def get_code_block_lang(line: str) -> str:
match = CODE_BLOCK_LANG_RE.match(line)
if match:
return match.group(1)
return ""
def extract_multiline_code_blocks(text: list[str]) -> list[MultilineCodeBlockInfo]:
blocks: list[MultilineCodeBlockInfo] = []
in_code_block3 = False
in_code_block4 = False
current_block_lang = ""
current_block_start_line = -1
current_block_lines = []
for line_no, line in enumerate(text, start=1):
stripped = line.lstrip()
# --- Detect opening fence ---
if not (in_code_block3 or in_code_block4):
if stripped.startswith("```"):
current_block_start_line = line_no
count = len(stripped) - len(stripped.lstrip("`"))
if count == 3:
in_code_block3 = True
current_block_lang = get_code_block_lang(stripped)
current_block_lines = [line]
continue
elif count >= 4:
in_code_block4 = True
current_block_lang = get_code_block_lang(stripped)
current_block_lines = [line]
continue
# --- Detect closing fence ---
elif in_code_block3:
if stripped.startswith("```"):
count = len(stripped) - len(stripped.lstrip("`"))
if count == 3:
current_block_lines.append(line)
blocks.append(
MultilineCodeBlockInfo(
lang=current_block_lang,
start_line_no=current_block_start_line,
content=current_block_lines,
)
)
in_code_block3 = False
current_block_lang = ""
current_block_start_line = -1
current_block_lines = []
continue
current_block_lines.append(line)
elif in_code_block4:
if stripped.startswith("````"):
count = len(stripped) - len(stripped.lstrip("`"))
if count >= 4:
current_block_lines.append(line)
blocks.append(
MultilineCodeBlockInfo(
lang=current_block_lang,
start_line_no=current_block_start_line,
content=current_block_lines,
)
)
in_code_block4 = False
current_block_lang = ""
current_block_start_line = -1
current_block_lines = []
continue
current_block_lines.append(line)
return blocks
def _split_hash_comment(line: str) -> tuple[str, str | None]:
match = HASH_COMMENT_RE.match(line)
if match:
code = match.group("code").rstrip()
comment = match.group("comment")
return code, comment
return line.rstrip(), None
def _split_slashes_comment(line: str) -> tuple[str, str | None]:
match = SLASHES_COMMENT_RE.match(line)
if match:
code = match.group("code").rstrip()
comment = match.group("comment")
return code, comment
return line, None
def replace_multiline_code_block(
block_a: MultilineCodeBlockInfo, block_b: MultilineCodeBlockInfo
) -> list[str]:
"""
Replace multiline code block a with block b leaving comments intact.
Syntax of comments depends on the language of the code block.
Raises ValueError if the blocks are not compatible (different languages or different number of lines).
"""
if block_a["lang"] != block_b["lang"]:
raise ValueError("Code blocks have different languages")
if len(block_a["content"]) != len(block_b["content"]):
raise ValueError("Code blocks have different number of lines")
block_language = block_a["lang"].lower()
if block_language in {"mermaid"}:
return block_a["content"].copy() # We don't handle mermaid code blocks for now
code_block: list[str] = []
for line_a, line_b in zip(block_a["content"], block_b["content"]):
line_a_comment: str | None = None
line_b_comment: str | None = None
# Handle comments based on language
if block_language in {
"python",
"py",
"sh",
"bash",
"dockerfile",
"requirements",
"gitignore",
"toml",
"yaml",
"yml",
}:
_line_a_code, line_a_comment = _split_hash_comment(line_a)
line_b_code, line_b_comment = _split_hash_comment(line_b)
res_line = line_b
if line_b_comment:
res_line = res_line.replace(line_b_comment, line_a_comment, 1)
code_block.append(res_line)
elif block_language in {"console", "json"}:
_line_a_code, line_a_comment = _split_slashes_comment(line_a)
line_b_code, line_b_comment = _split_slashes_comment(line_b)
res_line = line_b
if line_b_comment:
print(f"Replacing comment: {line_b_comment} with {line_a_comment}")
res_line = res_line.replace(line_b_comment, line_a_comment, 1)
print(f"Resulting line: {res_line}")
code_block.append(res_line)
else:
code_block.append(line_b)
return code_block
def replace_multiline_code_blocks_in_text(
text: list[str],
code_blocks: list[MultilineCodeBlockInfo],
original_code_blocks: list[MultilineCodeBlockInfo],
) -> list[MultilineCodeBlockInfo]:
"""
Update each code block in `text` with the corresponding code block from
`original_code_blocks` with comments taken from `code_blocks`.
Raises ValueError if the number, language, or shape of code blocks do not match.
"""
if len(code_blocks) != len(original_code_blocks):
raise ValueError(
"Number of code blocks does not match the number of original code blocks"
)
modified_text = text.copy()
for block, original_block in zip(code_blocks, original_code_blocks):
updated_content = replace_multiline_code_block(block, original_block)
start_line_index = block["start_line_no"] - 1
for i, updated_line in enumerate(updated_content):
modified_text[start_line_index + i] = updated_line
return modified_text

View File

@@ -1,3 +1,6 @@
import difflib
import os
from collections.abc import Iterable
from pathlib import Path
from typing import Annotated
@@ -8,13 +11,27 @@ from scripts.doc_parsing_utils import (
extract_header_permalinks,
extract_html_links,
extract_markdown_links,
extract_multiline_code_blocks,
replace_code_includes_with_placeholders,
replace_header_permalinks,
replace_html_links,
replace_markdown_links,
replace_multiline_code_blocks_in_text,
replace_placeholders_with_code_includes,
)
non_translated_sections = (
f"reference{os.sep}",
"release-notes.md",
"fastapi-people.md",
"external-links.md",
"newsletter.md",
"management-tasks.md",
"management.md",
"contributing.md",
)
cli = typer.Typer()
@@ -23,6 +40,53 @@ def callback():
pass
def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
"""
Iterate on the markdown files to translate in order of priority.
"""
first_dirs = [
lang_path_root / "learn",
lang_path_root / "tutorial",
lang_path_root / "advanced",
lang_path_root / "about",
lang_path_root / "how-to",
]
first_parent = lang_path_root
yield from first_parent.glob("*.md")
for dir_path in first_dirs:
yield from dir_path.rglob("*.md")
first_dirs_str = tuple(str(d) for d in first_dirs)
for path in lang_path_root.rglob("*.md"):
if str(path).startswith(first_dirs_str):
continue
if path.parent == first_parent:
continue
yield path
def get_all_paths(lang: str):
res: list[str] = []
lang_docs_root = Path("docs") / lang / "docs"
for path in iter_all_lang_paths(lang_docs_root):
relpath = path.relative_to(lang_docs_root)
if not str(relpath).startswith(non_translated_sections):
res.append(str(relpath))
return res
@cli.command()
def fix_all(ctx: typer.Context, language: str):
docs = get_all_paths(language)
for page in docs:
doc_path = Path("docs") / language / "docs" / page
try:
fix_pages(doc_paths=[doc_path])
except ValueError as e:
print(f"Error processing {doc_path}: {e}")
@cli.command()
def fix_pages(
doc_paths: Annotated[
@@ -49,6 +113,11 @@ def fix_pages(
)
if fixed_doc_lines != doc_lines:
print(f"Fixing code includes in: {path}")
diff = difflib.unified_diff(
doc_lines, fixed_doc_lines, fromfile="translation", tofile="fixed"
)
print("\n".join(diff))
doc_lines = fixed_doc_lines
# Fix permalinks
@@ -75,7 +144,14 @@ def fix_pages(
doc_lines = fixed_doc_lines
# Fix multiline code blocks
# TODO: Implement
en_code_blocks = extract_multiline_code_blocks(en_doc_lines)
doc_code_blocks = extract_multiline_code_blocks(doc_lines)
fixed_doc_lines = replace_multiline_code_blocks_in_text(
doc_lines, doc_code_blocks, en_code_blocks
)
if fixed_doc_lines != doc_lines:
print(f"Fixing multiline code blocks in: {path}")
doc_lines = fixed_doc_lines
# Write back the fixed document
doc_lines.append("") # Ensure file ends with a newline