# caddy-waf/get_owasp_rules.py

import requests  # For making HTTP requests to GitHub API and downloading files
import re        # For regular expression pattern matching
import json      # For JSON serialization
import time      # For adding delays between API requests

def download_owasp_rules(repo_url, rules_dir, output_path):
    """
    Downloads and processes OWASP ModSecurity Core Rule Set (CRS) files from GitHub.

    The rules directory is listed through the GitHub contents API, every
    ``.conf`` file in it is downloaded and passed to ``extract_rules``, and the
    combined result is written to ``output_path`` as indented JSON.

    If any HTTP request fails (connection error, timeout, or non-2xx status),
    the error is printed and the function returns without writing the output
    file, even if some rule files were already processed.

    Args:
        repo_url (str): GitHub repository path (e.g., 'coreruleset/coreruleset')
        rules_dir (str): Directory containing rule files in the repository
        output_path (str): Path where processed rules will be saved as JSON

    Returns:
        None
    """
    all_rules = []
    headers = {}  # Can be used to add GitHub API token if needed
    request_timeout = 30  # Seconds; without a timeout a stalled connection blocks forever

    try:
        # Construct GitHub API URL to list contents of rules directory
        api_url = f"https://api.github.com/repos/{repo_url}/contents/{rules_dir}"
        listing = requests.get(api_url, headers=headers, timeout=request_timeout)
        listing.raise_for_status()

        # Process each .conf file in the directory
        for file in listing.json():
            if not file['name'].endswith('.conf'):
                continue

            # Add delay to avoid hitting GitHub API rate limits
            time.sleep(1)
            file_response = requests.get(
                file["download_url"], headers=headers, timeout=request_timeout
            )
            # Fail loudly on 4xx/5xx instead of parsing an error page as rules
            file_response.raise_for_status()
            print(f"Processing rule file: {file['name']}")

            # Extract rules from file content and add to collection
            all_rules.extend(extract_rules(file_response.text))

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

    # Save processed rules to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_rules, f, indent=2)
    print(f"Saved {len(all_rules)} rules to {output_path}")

# Backslash line continuation together with the indentation of the following
# line. Removing it turns a rule wrapped over several physical lines into one
# logical line.
_LINE_CONTINUATION_RE = re.compile(r'\\[ \t]*\r?\n[ \t]*')

# One SecRule directive on a single logical line:
#   group 1 - variables, group 2 - quoted operator/pattern, group 3 - the rest
#   of the line (the actions string).
# The pattern group consumes backslash-escaped characters as pairs so that an
# escaped quote (\") does not terminate the pattern early.
_SECRULE_RE = re.compile(
    r'SecRule\s+([^"]*)"((?:\\.|[^"\\])+)"\s*(.*?)(?=\s*SecRule|\s*$)',
    re.MULTILINE | re.DOTALL,
)

# Variable names looked for (as substrings) in the SecRule variables field.
_TARGET_NAMES = (
    "ARGS", "BODY", "URL", "HEADERS", "REQUEST_HEADERS",
    "RESPONSE_HEADERS", "REQUEST_COOKIES", "USER_AGENT",
    "CONTENT_TYPE", "X-FORWARDED-FOR", "X-REAL-IP",
)


def extract_rules(rule_text):
    """
    Extracts individual ModSecurity rules from rule file content using regex.

    Backslash line continuations are folded first, so the actions of a rule
    that is wrapped over several lines (id, phase, severity, msg on separate
    lines) are all visible to the property lookups. Rules without an ``id``
    action and rules whose pattern does not compile as a Python regular
    expression are skipped (the latter with a printed notice).

    Args:
        rule_text (str): Content of ModSecurity rule file

    Returns:
        list: List of dictionaries containing parsed rule information. Each
        dictionary has the keys ``id`` (str), ``phase`` (int, default 2),
        ``pattern`` (str), ``targets`` (list of str), ``severity`` (str,
        default "LOW"), ``action`` (str, default "log"), ``score`` (int) and
        ``description`` (str).
    """
    if not rule_text:
        return []

    # Join continued lines so each directive sits on one logical line
    logical_text = _LINE_CONTINUATION_RE.sub('', rule_text)

    rules = []
    for match in _SECRULE_RE.finditer(logical_text):
        variables, pattern, actions = match.groups()

        # A rule without an id cannot be referenced later; skip it
        rule_id = re.search(r'id:(\d+)', actions)
        if not rule_id:
            continue

        # Extract the remaining rule properties using regex
        severity = re.search(r'severity:\'?([^,\'\s]+)', actions)
        # NOTE(review): this only matches an explicit "action:<name>" pair;
        # bare action keywords in the actions string are not detected here.
        action = re.search(r'action:\'?([^,\'\s]+)', actions)
        phase = re.search(r'phase:(\d+)', actions)
        description = re.search(r'msg:\'?([^\']+)\'', actions)

        # Handle special characters in pattern
        pattern = pattern.replace('[CDATA[', '\\[CDATA\\[')

        # Validate regex pattern
        try:
            re.compile(pattern)
        except re.error:
            print(f"Invalid regex pattern in rule {rule_id.group(1)}: {pattern}")
            continue

        # Extract targeted variables from rule (substring match, so e.g.
        # REQUEST_HEADERS also reports HEADERS)
        variables_upper = variables.upper()
        targets = [name for name in _TARGET_NAMES if name in variables_upper]

        # Set default values if properties are missing
        severity_val = severity.group(1) if severity else "LOW"
        action_val = action.group(1) if action else "log"
        description_val = description.group(1) if description else "No description provided."

        # Calculate rule score: action takes precedence over severity
        severity_upper = severity_val.upper()
        if action_val == "pass":
            score = 0
        elif action_val == "block":
            score = 5
        elif severity_upper == "HIGH":
            score = 4
        elif severity_upper == "MEDIUM":
            score = 3
        else:
            score = 1

        # Create rule dictionary with extracted information
        rules.append({
            "id": rule_id.group(1),
            "phase": int(phase.group(1)) if phase else 2,
            "pattern": pattern,
            "targets": targets,
            "severity": severity_val,
            "action": action_val,
            "score": score,
            "description": description_val,
        })

    return rules

if __name__ == "__main__":
    # Script entry point: fetch the OWASP Core Rule Set from its GitHub
    # repository and write the processed rules to rules.json.
    download_owasp_rules(
        repo_url="coreruleset/coreruleset",
        rules_dir="rules",
        output_path="rules.json",
    )