mirror of
https://github.com/EbookFoundation/free-programming-books.git
synced 2025-12-10 12:13:30 +08:00
* Fix: scripts- Incorrect <div> Tag Handling logic * added comments for <div> tags handling logic Co-authored-by: Gabriele Ciccotelli <gabrieleciccotelli98@gmail.com> --------- Co-authored-by: Gabriele Ciccotelli <gabrieleciccotelli98@gmail.com>
606 lines
26 KiB
Python
606 lines
26 KiB
Python
#!/usr/bin/env python3
"""
RTL/LTR Markdown Linter.

This script analyzes Markdown files to identify potential issues
in the display of mixed Right-To-Left (RTL) and Left-To-Right (LTR) text.
It reads configuration from a `rtl_linter_config.yml` file located in the same
directory as the script.

Key Features:
- Line-by-line parsing of Markdown list items.
- Detection of HTML 'dir' attributes to switch text direction context.
- Handling of nested 'dir' contexts within '<span>' tags.
- Detection of LTR keywords and symbols that might require Unicode markers.
- BIDI (Bidirectional Algorithm) visual analysis using the 'python-bidi' library.
- Parsing of metadata for book items (title, author, meta).
- Configurable severity levels for detected issues (error, warning, notice).
- Filters to ignore code blocks, inline code, and text within parentheses.
- Specific check for RTL authors followed by LTR metadata.
"""

import argparse
import os
import re
import subprocess
import sys

import yaml
from bidi.algorithm import get_display


def load_config(path):
    """
    Load the linter configuration from a YAML file.

    The settings are read from the top-level ``rtl_config`` mapping of the
    file and layered over the built-in defaults. The ``severity`` mapping is
    merged key by key, so a file that overrides only some severities keeps the
    default level for the others.

    If the file does not exist, or anything goes wrong while reading or
    merging it, a ``::warning`` line is printed and the defaults are used.

    Args:
        path (str): The path to the YAML configuration file. May be empty or
            None, in which case only the defaults are returned.

    Returns:
        dict: A new dictionary containing the configuration parameters.
              Values loaded from the file take precedence over the defaults.
    """
    # Default configuration values
    default = {
        'ltr_keywords': [],
        'ltr_symbols': [],
        'pure_ltr_pattern': r"^[\u0000-\u007F]+$",  # Matches ASCII characters (Basic Latin)
        'rtl_chars_pattern': r"[\u0590-\u08FF]",    # Matches RTL characters (Arabic, Hebrew, etc.)
        'severity': {
            'bidi_mismatch': 'error',   # Displayed order differs from the logical order of the text
            'keyword': 'warning',       # An LTR keyword (e.g., "HTML") in an RTL context might need an &rlm;
            'symbol': 'warning',        # An LTR symbol (e.g., "C#") in an RTL context might need an &lrm;
            'pure_ltr': 'notice',       # A purely LTR segment in an RTL context might need a trailing &rlm;
            'author_meta': 'notice'     # Specific rules for LTR authors/metadata in RTL contexts.
        },
        'ignore_meta': ['PDF', 'EPUB', 'HTML', 'podcast', 'videocast'],
        'min_ltr_length': 3,
        # Textual HTML spellings of the direction marks. They must stay as
        # entity text: the raw U+200F / U+200E characters are added by the
        # caller, and identical invisible characters here would match nothing
        # that is actually written as an entity in the Markdown sources.
        'rlm_entities': ['&rlm;', '&#x200F;', '&#8207;'],
        'lrm_entities': ['&lrm;', '&#x200E;', '&#8206;']
    }

    # If a path is specified and the file exists, attempt to load it
    if path and os.path.exists(path):
        try:
            with open(path, encoding='utf-8') as f:
                data = yaml.safe_load(f) or {}

            # 'rtl_config:' with no value loads as None; treat it as empty.
            overrides = dict(data.get('rtl_config') or {})

            # Merge severities per key instead of replacing the whole mapping,
            # otherwise a partial override would drop the remaining defaults.
            # A non-mapping value raises here and falls back to the defaults.
            severity = overrides.pop('severity', None)
            if severity is not None:
                overrides['severity'] = {**default['severity'], **severity}

            default.update(overrides)
        except Exception as e:
            # Deliberately broad: any I/O, YAML or structural problem must not
            # abort the lint run. Output goes to stdout for GitHub Actions.
            print(f"::warning file={path}::Could not load config: {e}. Using defaults.")

    # Return the configuration (updated defaults or just defaults)
    return default


# Filename endings (lower-case) that mark a Markdown file as written in an
# RTL language: Arabic, Hebrew, Persian and Urdu language codes.
_RTL_FILENAME_SUFFIXES = (
    '-ar.md', '_ar.md',
    '-he.md', '_he.md',
    '-fa.md', '_fa.md',
    '-ur.md', '_ur.md',
)


def is_rtl_filename(path):
    """
    Check whether the given filename indicates an RTL-language file.

    Only the final path component is inspected, case-insensitively.

    Args:
        path (str): The path to the file.

    Returns:
        bool: True if the filename ends with one of the RTL language
              suffixes (e.g. ``-ar.md``), False otherwise.
    """
    name = os.path.basename(path).lower()
    # str.endswith accepts a tuple, so one call covers every suffix.
    return name.endswith(_RTL_FILENAME_SUFFIXES)


# Regex to identify a Markdown list item (e.g., "* text", "- text", "+ text").
# Group 1 is the item text after the bullet.
LIST_ITEM_RE = re.compile(r'^\s*[\*\-\+]\s+(.*)')

# Regex to extract title, URL, author, and metadata from a formatted book item
# Example: [Book Title](https://example.com) - Author (Metadata)
BOOK_ITEM_RE = re.compile(
    r"^\s*\[(?P<title>.+?)\]\((?P<url>.+?)\)"     # Title and URL (required)
    r"(?:\s*[-–—]\s*(?P<author>[^\(\n\[]+?))?"    # Author (optional), separated by -, –, —
    r"(?:\s*[\(\[](?P<meta>.*?)[\)\]])?\s*$"      # Metadata (optional), enclosed in () or []
)

# Regex to find the dir="rtl" or dir="ltr" attribute in an HTML tag.
# Group 1 is the quote character, group 2 the direction.
HTML_DIR_ATTR_RE = re.compile(r"dir\s*=\s*(['\"])(rtl|ltr)\1", re.IGNORECASE)

# Regex to find <span> tags with a dir attribute; group 1 is the direction
SPAN_DIR_RE = re.compile(r'<span[^>]*dir=["\'](rtl|ltr)["\'][^>]*>', re.IGNORECASE)

# Regex to identify inline code (text enclosed in single backticks)
INLINE_CODE_RE = re.compile(r'^`.*`$')

# Regex to identify the start of a code block (```)
# Can be preceded by spaces or a '>' character (for blockquotes)
CODE_FENCE_START = re.compile(r'^\s*>?\s*```')

# Regex to identify text entirely enclosed in parentheses or square brackets.
# Useful for skipping segments like "(PDF)" or "[Free]" during analysis.
BRACKET_CONTENT_RE = re.compile(r'''
    (?:^|\W)        # Start of line or non-word character
    (\[|\()         # Open square or round bracket
    ([^\n\)\]]*?)   # Content
    (\]|\))         # Close square or round bracket
    (?:\W|$)        # End of line or non-word character
''', re.VERBOSE | re.UNICODE)  # VERBOSE for comments, UNICODE for correct matching


# Tokenizer for split_by_span: matches ANY opening <span ...> tag or closing
# </span> tag, case-insensitively. Being a single capture group, re.split
# returns text and tag tokens alternately (tags at the odd indexes).
_SPAN_TAG_SPLIT_RE = re.compile(r'(<span\b[^>]*>|</span\s*>)', re.IGNORECASE)

# dir="rtl" / dir="ltr" attribute inside an opening tag; group 1 is the direction.
# The leading \s keeps attributes such as data-dir="..." from matching.
_SPAN_DIR_ATTR_RE = re.compile(r'\sdir\s*=\s*["\'](rtl|ltr)["\']', re.IGNORECASE)


def split_by_span(text, base_ctx):
    """
    Split text into segments based on nested <span> tags with dir attributes.

    Every opening <span> pushes a context and every closing </span> pops one,
    so the tags always stay balanced: a span with dir="rtl"/"ltr" pushes that
    direction, while a span without a usable dir attribute pushes a copy of
    the current context. Tag matching is case-insensitive. The span tags
    themselves are not part of the returned segments.

    Args:
        text (str): The input string to split.
        base_ctx (str): The base directionality context ('rtl' or 'ltr').

    Returns:
        list: A list of tuples, where each tuple contains a non-empty text
              segment (str) and its directionality context ('rtl' or 'ltr').

    Example of stack behavior:
        Input: "Text <span dir='rtl'>RTL <span dir='ltr'>LTR</span> RTL</span> Text"
        base_ctx: 'ltr'

        Initial stack: ['ltr']
        Tokens: ["Text ", "<span dir='rtl'>", "RTL ", "<span dir='ltr'>", "LTR",
                 "</span>", " RTL", "</span>", " Text"]

        Processing:
        1. "Text ": segments.append(("Text ", 'ltr')), stack: ['ltr']
        2. "<span dir='rtl'>": stack.append('rtl'), stack: ['ltr', 'rtl']
        3. "RTL ": segments.append(("RTL ", 'rtl')), stack: ['ltr', 'rtl']
        4. "<span dir='ltr'>": stack.append('ltr'), stack: ['ltr', 'rtl', 'ltr']
        5. "LTR": segments.append(("LTR", 'ltr')), stack: ['ltr', 'rtl', 'ltr']
        6. "</span>": stack.pop(), stack: ['ltr', 'rtl']
        7. " RTL": segments.append((" RTL", 'rtl')), stack: ['ltr', 'rtl']
        8. "</span>": stack.pop(), stack: ['ltr']
        9. " Text": segments.append((" Text", 'ltr')), stack: ['ltr']

        Resulting segments:
        [("Text ", 'ltr'), ("RTL ", 'rtl'), ("LTR", 'ltr'), (" RTL", 'rtl'), (" Text", 'ltr')]
    """
    # The bottom of the stack is the base context and is never popped.
    stack = [base_ctx]
    segments = []

    for index, tok in enumerate(_SPAN_TAG_SPLIT_RE.split(text)):
        # Even indexes are plain text, odd indexes are the captured tags.
        if index % 2 == 0:
            if tok:
                segments.append((tok, stack[-1]))
            continue

        if tok[1] == '/':
            # Closing tag: leave the innermost span. A stray </span> with no
            # matching opening tag must not remove the base context.
            if len(stack) > 1:
                stack.pop()
            continue

        # Opening tag: switch direction if it declares one, otherwise inherit
        # the current one so that its </span> pops the right entry.
        dir_attr = _SPAN_DIR_ATTR_RE.search(tok)
        stack.append(dir_attr.group(1).lower() if dir_attr else stack[-1])

    return segments


# Any opening <div ...> or closing </div> tag, case-insensitively.
_DIV_TAG_RE = re.compile(r'<div\b[^>]*>|</div\s*>', re.IGNORECASE)

# dir="rtl" / dir="ltr" attribute inside an opening <div>; group 1 is the direction.
_DIV_DIR_ATTR_RE = re.compile(r'''dir=['"](rtl|ltr)['"]''', re.IGNORECASE)

# Basic Latin letters and digits, used as a proxy for "contains LTR characters".
_LATIN_ALNUM_RE = re.compile(r"[A-Za-z0-9]")


def _update_block_stack(line, block_stack):
    """
    Apply every <div> / </div> tag found on `line` to `block_stack`, in order.

    Each stack entry is a ``(direction, is_directional)`` tuple. A <div> that
    carries both a dir attribute and markdown="1" opens a directional block.
    Any other <div> pushes a copy of the current direction so that its
    </div> does not close an enclosing directional block by mistake.
    The first entry (the file's base context) is never popped.
    """
    for tag in _DIV_TAG_RE.findall(line):
        if tag[1] == '/':
            if len(block_stack) > 1:
                block_stack.pop()
            continue

        dir_attr = _DIV_DIR_ATTR_RE.search(tag)
        if dir_attr and 'markdown="1"' in tag.lower():
            block_stack.append((dir_attr.group(1).lower(), True))
        else:
            block_stack.append((block_stack[-1][0], False))


def lint_file(path, cfg):
    """
    Analyze a single Markdown file for RTL/LTR issues.

    Only Markdown list items are analyzed. Lines inside fenced code blocks
    are skipped. The direction context of a line comes from the filename
    (see `is_rtl_filename`) and from enclosing
    ``<div dir="..." markdown="1">`` blocks; inside an item, ``<span dir>``
    tags refine the context per segment (see `split_by_span`).

    Args:
        path (str): The path to the Markdown file to analyze.
        cfg (dict): The configuration dictionary, as returned by `load_config`.

    Returns:
        list: A list of strings, where each string represents a detected issue,
              formatted for GitHub Actions output
              (``::<severity> file=<path>,line=<n>::<message>``).
              If the file cannot be read, the list holds a single ``::error``.
    """
    # Read the whole file up front; the context manager closes the handle
    # even when decoding fails.
    try:
        with open(path, encoding='utf-8') as f:
            lines = f.read().splitlines()
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        return [f"::error file={path},line=1::Cannot read file: {e}"]

    issues = []

    # Extract configuration parameters for easier access and readability
    keywords = cfg['ltr_keywords']
    symbols = cfg['ltr_symbols']
    pure_ltr_re = re.compile(cfg['pure_ltr_pattern'])
    rtl_char_re = re.compile(cfg['rtl_chars_pattern'])
    sev = cfg['severity']
    ignore_meta = set(cfg['ignore_meta'])
    min_len = cfg['min_ltr_length']

    # chr(0x200F) = RLM Unicode character
    # chr(0x200E) = LRM Unicode character
    # These control characters are added here in the code and not in the YAML
    # configuration file: in the YAML file they would be invisible and would
    # make it less readable.
    rlm_markers = [chr(0x200F)] + cfg['rlm_entities']
    lrm_markers = [chr(0x200E)] + cfg['lrm_entities']
    all_markers = rlm_markers + lrm_markers
    rlm_suffixes = tuple(rlm_markers)

    # Drop keywords that are contained in a symbol (a symbol can contain a
    # keyword) to avoid reporting the same text twice. This depends only on
    # the configuration, so it is computed once per file.
    filtered_keywords = [
        kw for kw in keywords if not any(kw in sym for sym in symbols)
    ]

    # Determine the directionality context of the file based on the filename
    file_direction_ctx = 'rtl' if is_rtl_filename(path) else 'ltr'

    # Stack of (direction, is_directional) entries for nested <div> blocks,
    # initialized with the file's base direction context.
    block_stack = [(file_direction_ctx, False)]

    in_code_block = False

    for idx, line in enumerate(lines, 1):

        # The context that applies to this line is the one in force BEFORE
        # any <div> tag on the line itself is processed.
        active_ctx = block_stack[-1][0]

        # Fence lines are never linted. An odd number of ``` on the line opens
        # or closes a block; an even number means it opens and closes on the
        # same line, so the state is left alone.
        if CODE_FENCE_START.match(line):
            if line.count('```') % 2 == 1:
                in_code_block = not in_code_block
            continue

        # Nothing inside a fenced code block is Markdown or HTML to us.
        if in_code_block:
            continue

        # Update the block context for the following lines.
        _update_block_stack(line, block_stack)

        # Only Markdown list items are analyzed.
        list_item = LIST_ITEM_RE.match(line)
        if not list_item:
            continue

        text = list_item.group(1).strip()

        # Extract item parts (title, author, metadata) if it matches the book format
        book_item = BOOK_ITEM_RE.match(text)
        if book_item:
            title = book_item.group('title')
            author = (book_item.group('author') or '').strip()
            meta = (book_item.group('meta') or '').strip()

            # A bare link such as the entries of an "### Index" section
            is_link_only_item = not author and not meta
        else:
            title, author, meta = text, '', ''
            is_link_only_item = False

        # Specific check: RTL author followed by LTR metadata
        if (active_ctx == 'rtl'
                and author and meta
                and rtl_char_re.search(author) and pure_ltr_re.match(meta)
                and len(meta) >= min_len
                and not author.endswith(rlm_suffixes)):
            issues.append(
                f"::{sev['author_meta'].lower()} file={path},line={idx}::RTL author '{author}' followed by LTR meta '{meta}' may need '&rlm;' after author."
            )

        # Analyze individual parts of the item (title, author, metadata)
        for part, raw_text in (('title', title), ('author', author), ('meta', meta)):

            # Skip empty parts and metadata to be ignored (e.g., "PDF")
            if not raw_text or (part == 'meta' and raw_text in ignore_meta):
                continue

            for segment_text, segment_ctx in split_by_span(raw_text, active_ctx):
                s = segment_text.strip()

                # A segment entirely enclosed in parentheses or brackets whose
                # content is purely LTR or purely RTL is well isolated by the
                # brackets and is skipped. Mixed content is still checked.
                m_bracket = BRACKET_CONTENT_RE.fullmatch(s)
                if m_bracket:
                    inner = m_bracket.group(2)
                    is_pure_ltr_inner = pure_ltr_re.match(inner) is not None
                    is_pure_rtl_inner = (
                        rtl_char_re.search(inner) is not None
                        and _LATIN_ALNUM_RE.search(inner) is None
                    )
                    if is_pure_ltr_inner or is_pure_rtl_inner:
                        continue

                # Skip inline code (`...`) and segments that already contain
                # a directionality marker.
                if INLINE_CODE_RE.match(s) or any(mk in s for mk in all_markers):
                    continue

                # BIDI mismatch: the text mixes RTL and LTR characters and the
                # computed visual order differs from the logical order.
                if rtl_char_re.search(s) and _LATIN_ALNUM_RE.search(s):
                    disp = get_display(s)
                    if disp != s:
                        issues.append(
                            f"::{sev['bidi_mismatch'].lower()} file={path},line={idx}::BIDI mismatch in {part}: the text '{s}' is displayed as '{disp}'"
                        )

                # The remaining checks only make sense in an RTL context.
                if segment_ctx != 'rtl':
                    continue

                # Keyword and symbol checks are skipped for titles of link-only items
                if not (part == 'title' and is_link_only_item):

                    # LTR symbol present without any LRM marker
                    for sym in symbols:
                        if sym in s and not any(m in s for m in lrm_markers):
                            issues.append(
                                f"::{sev['symbol'].lower()} file={path},line={idx}::Symbol '{sym}' in {part} '{s}' may need trailing '&lrm;' marker."
                            )

                    # LTR keyword present without any RLM marker
                    for kw in filtered_keywords:
                        if kw in s and not any(m in s for m in rlm_markers):
                            issues.append(
                                f"::{sev['keyword'].lower()} file={path},line={idx}::Keyword '{kw}' in {part} '{s}' may need trailing '&rlm;' marker."
                            )

                # "Pure LTR" text: an entirely LTR segment that is not a title
                # and has a minimum length might need a trailing RLM.
                if (part != 'title' and pure_ltr_re.match(s)
                        and not rtl_char_re.search(s) and len(s) >= min_len):
                    issues.append(
                        f"::{sev['pure_ltr'].lower()} file={path},line={idx}::Pure LTR text '{s}' in {part} of RTL context may need trailing '&rlm;' marker."
                    )

    # A directional <div> still open at the end of the file is an error.
    if any(is_directional for _, is_directional in block_stack[1:]):
        issues.append(
            f"::error file={path},line={len(lines)}::Found unclosed <div dir='...'> tag. "
            f"The final block context is '{block_stack[-1][0]}', not the file's base '{file_direction_ctx}'."
        )

    return issues


# New-file range of a unified-diff hunk header, e.g. "+11,3" in
# "@@ -10,0 +11,3 @@". Group 1 is the start line, group 2 the optional count.
_HUNK_ADDED_RANGE_RE = re.compile(r'\+(\d+)(?:,(\d+))?')


def get_changed_lines_for_file(filepath, base_ref='origin/main'):
    """
    Return the set of line numbers (1-based) changed in the given file in the current PR.

    This function runs ``git diff --unified=0 <base_ref>... -- <filepath>``
    and collects the new-file line ranges from the hunk headers. It is used to
    restrict PR annotations to lines that were added or modified.

    Args:
        filepath (str): The path to the file to check for changes.
        base_ref (str): The Git ref to compare against. Defaults to
            'origin/main'.

    Returns:
        set: A set of 1-based line numbers that were added or modified in the
             file. Empty when there are no changes or when git fails.

    Note:
        - Requires that the script is run inside a Git repository.
        - If git cannot be run or exits with an error (e.g., the merge base
          cannot be found), an empty set is returned and nothing is printed;
          git's own stderr is discarded as well.
    """
    changed_lines = set()

    try:
        # unified=0 produces no context lines, so every hunk range consists
        # only of added/modified lines.
        diff = subprocess.check_output(
            ['git', 'diff', '--unified=0', f'{base_ref}...', '--', filepath],
            encoding='utf-8', errors='ignore',
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # Best effort by design: without a usable diff no line is "changed".
        return changed_lines

    for line in diff.splitlines():
        if not line.startswith('@@'):
            continue

        # Example: @@ -10,0 +11,3 @@
        m = _HUNK_ADDED_RANGE_RE.search(line)
        if m:
            start = int(m.group(1))
            # A missing count means a single line; a count of 0 (pure
            # deletion) yields an empty range.
            count = int(m.group(2) or '1')
            changed_lines.update(range(start, start + count))

    return changed_lines


# Line number of an issue string formatted as "::sev file=<path>,line=<n>::msg".
# Anchoring on the surrounding ',' and '::' avoids matching a "line=<n>"
# fragment that happens to be part of the file path.
_ISSUE_LINE_RE = re.compile(r',line=(\d+)::')


def _iter_markdown_files(paths):
    """Yield the normalized path of every '.md' file named by, or found under, `paths`."""
    for raw_path in paths:
        # Normalize the scan path to ensure consistent handling (e.g., slashes)
        scan_path = os.path.normpath(raw_path)

        if os.path.isdir(scan_path):
            # Directories are scanned recursively
            for root, _, files in os.walk(scan_path):
                for fn in files:
                    if fn.lower().endswith('.md'):
                        yield os.path.normpath(os.path.join(root, fn))
        elif scan_path.lower().endswith('.md'):
            yield scan_path


def main():
    """
    Main entry point for the RTL/LTR Markdown linter.

    Parses command-line arguments, loads configuration, and scans the specified files or directories
    for Markdown files. For each file, it detects RTL/LTR issues and writes all findings to a log file.
    For files changed in the current PR, only issues on changed lines are printed to stdout as GitHub
    Actions annotations. If no issue is found at all, the log file is removed.

    Exit code is 1 if any error is found on changed lines, otherwise 0. Warnings and notices
    never affect the exit code.

    Command-line arguments:
        paths_to_scan: List of files or directories to scan for issues.
        --changed-files: List of files changed in the PR (for annotation filtering).
        --log-file: Path to the output log file (default: rtl-linter-output.log).
    """
    parser = argparse.ArgumentParser(
        description="Lints Markdown files for RTL/LTR issues, with PR annotation support."
    )

    # Argument for files/directories to scan
    parser.add_argument(
        'paths_to_scan',
        nargs='+',
        help="List of files or directories to scan for all issues."
    )

    # Optional argument for changed files (for PR annotation filtering)
    parser.add_argument(
        '--changed-files',
        nargs='*',
        default=None,
        help="List of changed files to generate PR annotations for."
    )

    # Optional argument for the log file path
    parser.add_argument(
        '--log-file',
        default='rtl-linter-output.log',
        help="File to write all linter output to."
    )

    args = parser.parse_args()

    # The config file lives next to this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cfg = load_config(os.path.join(script_dir, 'rtl_linter_config.yml'))

    # total: files processed; errs: all errors and warnings found (for the summary)
    total = errs = 0

    # Errors found ONLY on changed/added lines; this alone drives the exit code
    annotated_errs = 0

    # Normalize changed file paths for consistent comparison
    if args.changed_files:
        changed_files_set = {os.path.normpath(f) for f in args.changed_files}
    else:
        changed_files_set = set()

    # Map {filepath: set(line_numbers)} for changed files
    changed_lines_map = {f: get_changed_lines_for_file(f) for f in changed_files_set}

    any_issues = False

    with open(args.log_file, 'w', encoding='utf-8') as log_f:
        for file_path in _iter_markdown_files(args.paths_to_scan):
            total += 1

            # None means the file is not part of the PR: log only, no annotations
            changed_lines = changed_lines_map.get(file_path)

            for issue_str in lint_file(file_path, cfg):

                # Always write the issue to the log file for full reporting
                log_f.write(issue_str + '\n')
                any_issues = True

                # PR annotations: print only issues located on a line that
                # was actually modified or added in the PR
                if changed_lines is not None:
                    m = _ISSUE_LINE_RE.search(issue_str)
                    if m and int(m.group(1)) in changed_lines:
                        print(issue_str)

                        # Only errors on changed lines fail the run
                        if issue_str.startswith("::error"):
                            annotated_errs += 1

                # Count all errors/warnings for reporting/debugging purposes
                if issue_str.startswith(("::error", "::warning")):
                    errs += 1

    # If no issues were found, remove the (empty) log file. This is done after
    # the file has been closed; failure to remove it is not worth aborting for.
    if not any_issues:
        try:
            os.remove(args.log_file)
        except OSError:
            pass

    # Summary, emitted on stdout as a GitHub Actions notice
    print(f"::notice ::Processed {total} files, found {errs} issues.")

    # Exit code: 1 only if there are errors on changed lines
    sys.exit(1 if annotated_errs else 0)


# Run the linter only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()