"""
Test Scanner — Standalone script to scan a repository for test files and export to CSV.

Works independently of TestIntel. Only requires Python 3.11+ standard library.
Scans for tests written in BDD/Gherkin, Robot Framework, pytest, JUnit/TestNG,
and Playwright/Cypress, then outputs a CSV compatible with TestIntel inventory import.

Usage:
    # Scan entire repo with defaults
    python scan_tests.py --path /path/to/repo

    # Custom output file
    python scan_tests.py --path /path/to/repo --output my-tests.csv

    # Only scan specific frameworks
    python scan_tests.py --path /path/to/repo --frameworks bdd,pytest

    # Custom exclude patterns
    python scan_tests.py --path /path/to/repo --exclude node_modules,.venv,dist

Supported frameworks:
    bdd        — .feature files (Gherkin scenarios, tags, steps)
    robot      — .robot files (test cases, tags, documentation)
    pytest     — test_*.py / *_test.py (functions, classes, markers)
    junit      — *Test.java / *Tests.java / *IT.java (methods, annotations)
    playwright — *.spec.ts / *.spec.js / *.test.ts / *.test.js (describe/it/test blocks)

Output CSV columns:
    name, type, priority, tags, steps, automation_status, source_file
"""
# Copyright (c) 2026 OctoBlue Technologies LLC. All rights reserved.
# Proprietary and confidential. See LICENSE for terms.

from __future__ import annotations

import argparse
import ast
import csv
import os
import re
import sys
from pathlib import Path


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Directory names that are pruned from the walk unless the caller overrides
# the exclude set; matching is on the bare directory name.
DEFAULT_EXCLUDES = {
    "node_modules",
    ".venv",
    "venv",
    "__pycache__",
    ".git",
    "dist",
    "build",
    ".tox",
    ".pytest_cache",
}

# Framework keys accepted by --frameworks and used as bucket/result keys.
FRAMEWORK_CHOICES = {
    "bdd",
    "robot",
    "pytest",
    "junit",
    "playwright",
}

# Header row of the exported CSV, in the order TestEntry.as_row() emits values.
CSV_COLUMNS = [
    "name",
    "type",
    "priority",
    "tags",
    "steps",
    "automation_status",
    "source_file",
]


# ---------------------------------------------------------------------------
# Data container
# ---------------------------------------------------------------------------

class TestEntry:
    """One discovered test, holding the values of a single CSV row.

    The slots are declared in CSV column order; ``test_type`` is the value
    written under the ``type`` column.
    """

    __slots__ = (
        "name",
        "test_type",
        "priority",
        "tags",
        "steps",
        "automation_status",
        "source_file",
    )

    def __init__(
        self,
        name: str,
        test_type: str = "E2E",
        priority: str = "P2",
        tags: str = "",
        steps: str = "",
        automation_status: str = "Automated",
        source_file: str = "",
    ):
        # The tuple below follows the slot order, so zip pairs each slot
        # with the argument of the same name.
        values = (
            name, test_type, priority, tags,
            steps, automation_status, source_file,
        )
        for slot, value in zip(TestEntry.__slots__, values):
            setattr(self, slot, value)

    def as_row(self) -> list[str]:
        """Return the stored values as a list in CSV column order."""
        return [getattr(self, slot) for slot in TestEntry.__slots__]


# ---------------------------------------------------------------------------
# File discovery
# ---------------------------------------------------------------------------

def discover_files(
    root: Path,
    excludes: set[str],
) -> dict[str, list[Path]]:
    """Walk the directory tree under *root* and bucket test files by framework.

    Directories and files are visited in sorted order so the result (and
    therefore the exported CSV) is the same on every run and file system.

    Args:
        root: Directory to scan recursively.
        excludes: Directory names to skip entirely (matched on the bare name).

    Returns:
        A dict with the keys ``bdd``, ``robot``, ``pytest``, ``junit`` and
        ``playwright``, each mapping to the list of matching file paths.
    """
    buckets: dict[str, list[Path]] = {
        "bdd": [],
        "robot": [],
        "pytest": [],
        "junit": [],
        "playwright": [],
    }
    js_test_re = re.compile(r".*\.(spec|test)\.(ts|js)$")

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk never descends into excluded directories;
        # sorting the survivors fixes the traversal order.
        dirnames[:] = sorted(d for d in dirnames if d not in excludes)
        directory = Path(dirpath)

        for fname in sorted(filenames):
            lower = fname.lower()

            if lower.endswith(".feature"):
                framework = "bdd"
            elif lower.endswith(".robot"):
                framework = "robot"
            elif lower.endswith(".py") and (
                lower.startswith("test_") or lower.endswith("_test.py")
            ):
                framework = "pytest"
            elif lower.endswith(("test.java", "tests.java")) or fname.endswith("IT.java"):
                # The "IT" suffix is matched case-sensitively: compared in
                # lower case it would also catch ordinary classes such as
                # Audit.java, Unit.java or Commit.java.
                framework = "junit"
            elif js_test_re.match(lower):
                framework = "playwright"
            else:
                continue

            buckets[framework].append(directory / fname)

    return buckets


# ---------------------------------------------------------------------------
# Framework parsers
# ---------------------------------------------------------------------------

def _read_file_safe(path: Path) -> str | None:
    """Read *path* as text, returning ``None`` (after a warning) on failure.

    Decoding is attempted as UTF-8 first and Latin-1 second.  A leading
    UTF-8 byte-order mark is removed, so callers never see ``\\ufeff`` at
    the start of the content.

    Args:
        path: File to read.

    Returns:
        The decoded file content, or ``None`` if the file could not be read.
    """
    # "utf-8-sig" decodes plain UTF-8 as well and additionally drops a BOM;
    # trying plain "utf-8" first would succeed on BOM files and keep the
    # marker in the text.  Latin-1 maps every byte to a code point, so it
    # serves as the catch-all for legacy encodings.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            return path.read_text(encoding=encoding)
        except (UnicodeDecodeError, ValueError):
            continue
        except OSError as exc:
            _warn(f"Cannot read {path}: {exc}")
            return None
    _warn(f"Cannot decode {path}, skipping")
    return None


def _warn(msg: str) -> None:
    """Print *msg* to stderr as an indented ``WARNING:`` line."""
    text = "  WARNING: {}".format(msg)
    print(text, file=sys.stderr)


# --- BDD / Gherkin ---

def parse_bdd(files: list[Path], root: Path) -> list[TestEntry]:
    """Parse .feature files into one entry per scenario.

    Recognised scenario keywords are ``Scenario``, ``Scenario Outline``,
    ``Scenario Template`` and ``Example`` (matched case-insensitively).
    Tags placed before the ``Feature:`` line apply to every scenario of the
    file; tags placed directly before a scenario apply to that scenario.
    Step lines (``Given``/``When``/``Then``/``And``/``But``/``*``) following
    a scenario are collected into its ``steps`` value, joined with ``"; "``.

    Args:
        files: Feature files to read.
        root: Scan root; ``source_file`` is reported relative to it.

    Returns:
        One ``TestEntry`` (type ``E2E``) per scenario, in file order.
    """
    entries: list[TestEntry] = []
    tag_re = re.compile(r"@(\S+)")
    feature_re = re.compile(r"^\s*Feature\s*:", re.IGNORECASE)
    # "Example" is Gherkin's synonym for "Scenario".  The colon must follow
    # the keyword directly, so an "Examples:" table header does not match.
    scenario_re = re.compile(
        r"^\s*(Scenario Outline|Scenario Template|Scenario|Example)\s*:\s*(.+)",
        re.IGNORECASE,
    )
    # "*" is Gherkin's keyword-neutral step prefix.
    step_re = re.compile(
        r"^\s*(Given|When|Then|And|But|\*)\s+(.+)", re.IGNORECASE
    )

    for fpath in files:
        content = _read_file_safe(fpath)
        if content is None:
            continue

        rel = _relative(fpath, root)
        feature_tags: list[str] = []
        pending_tags: list[str] = []
        # (name, tags, steps) per scenario; the last item is the scenario
        # currently being filled with steps.
        scenarios: list[tuple[str, list[str], list[str]]] = []

        for line in content.splitlines():
            stripped = line.strip()

            # Tag lines accumulate until the next Feature/Scenario line.
            if stripped.startswith("@"):
                pending_tags.extend(tag_re.findall(stripped))
                continue

            # Tags seen before the Feature line are feature-level.
            if feature_re.match(stripped):
                feature_tags = pending_tags
                pending_tags = []
                continue

            scenario = scenario_re.match(stripped)
            if scenario:
                scenarios.append(
                    (scenario.group(2).strip(), feature_tags + pending_tags, [])
                )
                pending_tags = []
                continue

            # Steps only count once a scenario has started (this skips
            # Background steps that precede the first scenario).
            if scenarios:
                step = step_re.match(stripped)
                if step:
                    scenarios[-1][2].append(f"{step.group(1)} {step.group(2)}")
                    continue

            # Any other non-empty, non-comment line detaches pending tags.
            if stripped and not stripped.startswith("#"):
                pending_tags = []

        for name, tags, steps in scenarios:
            entries.append(TestEntry(
                name=name,
                test_type="E2E",
                tags=", ".join(tags),
                steps="; ".join(steps),
                source_file=rel,
            ))

    return entries


# --- Robot Framework ---

def parse_robot(files: list[Path], root: Path) -> list[TestEntry]:
    """Parse .robot files for test case names, tags and documentation.

    Only sections whose header contains "test case" are examined.  Inside
    such a section every non-indented, non-comment line starts a new test
    case; indented ``[Tags]`` and ``[Documentation]`` settings are attached
    to the test case above them.  The documentation text is stored in the
    entry's ``steps`` value.

    Args:
        files: Robot Framework files to read.
        root: Scan root; ``source_file`` is reported relative to it.

    Returns:
        One ``TestEntry`` (type ``E2E``) per test case, in file order.
    """
    entries: list[TestEntry] = []
    # A header starts with one or more asterisks; trailing asterisks are
    # optional ("*** Test Cases ***", "*** Test Cases", "* Test Cases *").
    section_re = re.compile(r"^\*+\s*([^*]+?)\s*(?:\*|$)")
    # Cells are separated by two or more spaces or by a tab.
    cell_sep_re = re.compile(r"\s{2,}|\t")

    for fpath in files:
        content = _read_file_safe(fpath)
        if content is None:
            continue

        rel = _relative(fpath, root)
        in_test_cases = False
        cases: list[dict] = []
        current: dict | None = None  # test case receiving settings, if any

        for line in content.splitlines():
            header = section_re.match(line)
            if header:
                in_test_cases = "test case" in header.group(1).lower()
                current = None
                continue

            if not in_test_cases:
                continue

            # A comment at column 0 is not a test case name.
            if line.startswith("#"):
                continue

            # Test case name: non-indented, non-empty line.
            if line and not line[0].isspace():
                current = {"name": line.strip(), "tags": [], "doc": ""}
                cases.append(current)
                continue

            # Settings that appear before any test case name are dropped.
            if current is None:
                continue

            stripped = line.strip()
            lowered = stripped.lower()
            if lowered.startswith("[tags]"):
                cells = cell_sep_re.split(stripped[len("[tags]"):].strip())
                current["tags"] = [c.strip() for c in cells if c.strip()]
            elif lowered.startswith("[documentation]"):
                current["doc"] = stripped[len("[documentation]"):].strip()

        for case in cases:
            entries.append(TestEntry(
                name=case["name"],
                test_type="E2E",
                tags=", ".join(case["tags"]),
                steps=case["doc"],
                source_file=rel,
            ))

    return entries


# --- pytest ---

def parse_pytest(files: list[Path], root: Path) -> list[TestEntry]:
    """Parse test_*.py / *_test.py with ``ast`` to find test functions.

    Reported are module-level functions named ``test_*`` and ``test_*``
    methods of module-level classes named ``Test*``; both ``def`` and
    ``async def`` count.  Methods are named ``ClassName::method_name`` and
    carry the class's markers followed by their own.  Files that cannot be
    parsed are skipped with a warning.

    Args:
        files: Python test files to read.
        root: Scan root; ``source_file`` is reported relative to it.

    Returns:
        One ``TestEntry`` per test function or method, in file order.
    """
    entries: list[TestEntry] = []
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)

    for fpath in files:
        content = _read_file_safe(fpath)
        if content is None:
            continue

        rel = _relative(fpath, root)
        test_type = _infer_type_from_path(rel)

        try:
            tree = ast.parse(content, filename=str(fpath))
        except (SyntaxError, ValueError) as exc:
            # ValueError is what some Python versions raise for source
            # containing null bytes; treat it like any unparsable file.
            _warn(f"Syntax error in {fpath}: {exc}")
            continue

        for node in tree.body:
            if isinstance(node, function_nodes) and node.name.startswith("test_"):
                entries.append(TestEntry(
                    name=node.name,
                    test_type=test_type,
                    tags=", ".join(_extract_pytest_markers(node)),
                    source_file=rel,
                ))
            elif isinstance(node, ast.ClassDef) and node.name.startswith("Test"):
                class_markers = _extract_pytest_markers(node)
                for item in node.body:
                    if not (
                        isinstance(item, function_nodes)
                        and item.name.startswith("test_")
                    ):
                        continue
                    all_markers = class_markers + _extract_pytest_markers(item)
                    entries.append(TestEntry(
                        name=f"{node.name}::{item.name}",
                        test_type=test_type,
                        tags=", ".join(all_markers),
                        source_file=rel,
                    ))

    return entries


def _extract_pytest_markers(node: ast.FunctionDef | ast.ClassDef) -> list[str]:
    """Extract pytest marker names from decorators."""
    markers: list[str] = []
    for dec in node.decorator_list:
        marker_name = _get_marker_name(dec)
        if marker_name:
            markers.append(marker_name)
    return markers


def _get_marker_name(node: ast.expr) -> str | None:
    """Return the marker name of a ``pytest.mark.<name>`` decorator node.

    Both the bare form (``@pytest.mark.slow``) and the called form
    (``@pytest.mark.parametrize(...)``) are recognised; any other
    decorator yields ``None``.
    """
    # Peel off call wrappers to reach the expression being called.
    target = node
    while isinstance(target, ast.Call):
        target = target.func

    if not isinstance(target, ast.Attribute):
        return None

    parts = _unpack_attribute(target)
    if len(parts) < 3 or parts[0] != "pytest" or parts[1] != "mark":
        return None
    return parts[2]


def _unpack_attribute(node: ast.expr) -> list[str]:
    """Flatten a dotted expression such as ``a.b.c`` into ``['a', 'b', 'c']``.

    If the chain is rooted in something other than a plain name (a call,
    a subscript, ...), only the attribute names are returned.
    """
    if isinstance(node, ast.Attribute):
        return _unpack_attribute(node.value) + [node.attr]
    if isinstance(node, ast.Name):
        return [node.id]
    return []


def _infer_type_from_path(rel_path: str) -> str:
    """Infer the test type from path heuristics.

    Checked in this order:

    * ``E2E`` - the path contains ``e2e``, ``end_to_end`` or ``end-to-end``.
    * ``Integration`` - the path contains ``integration``, or has ``integ``
      as a whole word.
    * ``API`` - the path has ``api`` or ``apis`` as a whole word.
    * ``Unit`` - everything else.

    A "word" is a run of letters, with camelCase humps and acronyms split
    apart, so ``UserApiTest.java`` and ``rest_api/`` count as API while
    ``test_capitalize.py`` and ``test_integer_math.py`` do not.

    Args:
        rel_path: Path of the test file (any case, ``/``-separated).

    Returns:
        One of ``"E2E"``, ``"Integration"``, ``"API"`` or ``"Unit"``.
    """
    lower = rel_path.lower()
    if "e2e" in lower or "end_to_end" in lower or "end-to-end" in lower:
        return "E2E"

    # Split on the original casing: an acronym run ("API" in "UserAPITest")
    # or an optionally capitalised lowercase run ("User", "api").
    words = {
        word.lower()
        for word in re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+", rel_path)
    }

    if "integration" in lower or "integ" in words:
        return "Integration"
    if "api" in words or "apis" in words:
        return "API"
    return "Unit"


# --- JUnit / TestNG ---

def parse_junit(files: list[Path], root: Path) -> list[TestEntry]:
    """Parse Java test files for ``@Test`` methods.

    This is a line-based, best-effort scan rather than a Java parser.  For
    every non-comment line carrying an ``@Test`` annotation, that line and
    the four lines after it are searched for a method signature; the first
    one found becomes an entry named ``ClassName.methodName``.  The class
    name is the first class declaration found at the start of a line, or
    the file stem if there is none.

    Args:
        files: Java test files to read.
        root: Scan root; ``source_file`` is reported relative to it.

    Returns:
        One ``TestEntry`` per detected test method, in file order.
    """
    entries: list[TestEntry] = []
    test_annotation_re = re.compile(r"@Test\b")
    method_re = re.compile(
        r"(?:public|protected|private)?\s*(?:static\s+)?(?:void|[\w<>\[\]]+)\s+(\w+)\s*\("
    )
    # Anchored to the start of a line so the word "class" inside a comment
    # ("Test class for Foo") is not taken for a declaration.
    class_re = re.compile(
        r"^\s*(?:(?:public|protected|private|abstract|final|static)\s+)*"
        r"class\s+(\w+)",
        re.MULTILINE,
    )
    string_literal_re = re.compile(r'"(?:\\.|[^"\\])*"')
    comment_prefixes = ("//", "/*", "*")

    for fpath in files:
        content = _read_file_safe(fpath)
        if content is None:
            continue

        rel = _relative(fpath, root)
        test_type = _infer_type_from_path(rel)

        class_match = class_re.search(content)
        class_name = class_match.group(1) if class_match else fpath.stem

        lines = content.splitlines()
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            is_annotation = (
                test_annotation_re.search(line)
                and not line.startswith(comment_prefixes)
            )
            if is_annotation:
                # Look ahead for the method signature.
                for j in range(i, min(i + 5, len(lines))):
                    # Blank out string literals first: text inside annotation
                    # arguments, e.g. @DisplayName("Verify login (happy path)"),
                    # can otherwise look like "type name(".
                    candidate = string_literal_re.sub('""', lines[j])
                    mm = method_re.search(candidate)
                    if mm:
                        entries.append(TestEntry(
                            name=f"{class_name}.{mm.group(1)}",
                            test_type=test_type,
                            source_file=rel,
                        ))
                        # Resume after the signature line.
                        i = j
                        break
            i += 1

    return entries


# --- Playwright / Cypress ---

def parse_playwright(files: list[Path], root: Path) -> list[TestEntry]:
    """Parse .spec.ts/.spec.js/.test.ts/.test.js for describe/it/test blocks.

    Each ``it(...)`` / ``test(...)`` call with a literal title becomes one
    entry.  Titles of the enclosing ``describe(...)`` blocks are prepended,
    joined with ``" > "``.  Nesting is tracked by counting curly braces per
    line, which is a best-effort approximation: braces inside comments or
    non-title strings are counted as well.

    Args:
        files: JavaScript/TypeScript test files to read.
        root: Scan root; ``source_file`` is reported relative to it.

    Returns:
        One ``TestEntry`` (type ``E2E``) per test block, in file order.
    """
    entries: list[TestEntry] = []
    # Match describe('...'), it('...'), test('...')
    # Handles single quotes, double quotes, and backticks
    block_re = re.compile(
        r"""\b(describe|it|test)\s*\(\s*(['"`])(.*?)\2""",
    )

    for fpath in files:
        content = _read_file_safe(fpath)
        if content is None:
            continue

        rel = _relative(fpath, root)
        # Open describe blocks as (title, brace depth at which they opened).
        describe_stack: list[tuple[str, int]] = []
        depth = 0

        for line in content.splitlines():
            for match in block_re.finditer(line):
                keyword = match.group(1)
                title = match.group(3)

                if keyword == "describe":
                    describe_stack.append((title, depth))
                else:
                    path = [outer for outer, _ in describe_stack] + [title]
                    entries.append(TestEntry(
                        name=" > ".join(path),
                        test_type="E2E",
                        source_file=rel,
                    ))

            # Count braces with the matched titles removed, so a brace that
            # is part of a test title does not shift the depth.
            code = block_re.sub("", line)
            opened = code.count("{")
            closed = code.count("}")
            depth = max(0, depth + opened - closed)

            # A describe ends only when the depth falls back to the level
            # at which it was opened.  Popping on every "});" would also
            # fire at the end of each it()/test() body and strip the
            # describe prefix from all later tests.
            if closed and closed >= opened:
                while describe_stack and describe_stack[-1][1] >= depth:
                    describe_stack.pop()

    return entries


# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------

def _relative(path: Path, root: Path) -> str:
    """Return *path* relative to *root* as a string with forward slashes.

    If *path* is not located under *root*, the path itself is used.
    """
    try:
        shown = path.relative_to(root)
    except ValueError:
        shown = path
    return str(shown).replace("\\", "/")


# ---------------------------------------------------------------------------
# CSV output
# ---------------------------------------------------------------------------

def write_csv(entries: list[TestEntry], output: Path) -> None:
    """Write the header row and one row per entry to *output* as UTF-8 CSV.

    An existing file at *output* is overwritten.
    """
    with open(output, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(CSV_COLUMNS)
        sheet.writerows(item.as_row() for item in entries)


# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

def print_summary(
    results: dict[str, list[TestEntry]],
    file_counts: dict[str, int],
    output: Path,
) -> None:
    """Print a human-readable summary of the scan to stdout.

    One line is printed per known framework, with the count starting in
    the same column on every line, followed by the overall total.

    Args:
        results: Parsed entries per framework key; missing keys count as 0.
        file_counts: Number of files found per framework key.
        output: Path of the CSV that was written (shown in the total line).
    """
    labels = {
        "bdd": "BDD/Gherkin",
        "robot": "Robot Framework",
        "pytest": "pytest",
        "junit": "JUnit/TestNG",
        "playwright": "Playwright/Cypress",
    }
    total = 0
    print("\nScan complete:")
    for key in ("bdd", "pytest", "robot", "junit", "playwright"):
        count = len(results.get(key, []))
        label = f"  {labels[key]}:"
        if count > 0:
            fcount = file_counts.get(key, 0)
            print(f"{label:<25}{count} tests from {fcount} files")
        else:
            # No extra space after the padded label, so the zero lines up
            # with the counts on the other lines.
            print(f"{label:<25}0 tests")
        total += count

    print(f"\nTotal: {total} tests exported to {output}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Command-line entry point: parse arguments, scan, write CSV, summarise.

    Exits with status 1, after printing a message to stderr, when
    ``--path`` is not a directory, when ``--frameworks`` names an unknown
    framework, or when the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Scan a repository for test files and export to CSV for TestIntel inventory import.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scan_tests.py --path /path/to/repo
  python scan_tests.py --path /path/to/repo --output my-tests.csv
  python scan_tests.py --path /path/to/repo --frameworks bdd,pytest
  python scan_tests.py --path /path/to/repo --exclude node_modules,.venv,dist
        """,
    )
    parser.add_argument(
        "--path", required=True,
        help="Path to the repository root to scan",
    )
    parser.add_argument(
        "--output", "-o", default="test-scan.csv",
        help="Output CSV file path (default: test-scan.csv)",
    )
    parser.add_argument(
        "--frameworks", default=None,
        help="Comma-separated list of frameworks to scan (default: all). "
             "Choices: bdd, robot, pytest, junit, playwright",
    )
    parser.add_argument(
        "--exclude", default=None,
        help="Comma-separated directory names to exclude (added to defaults)",
    )
    args = parser.parse_args()

    root = Path(args.path).resolve()
    if not root.is_dir():
        print(f"Error: {args.path} is not a valid directory.", file=sys.stderr)
        sys.exit(1)

    # Determine frameworks to scan.  Empty items (from a trailing or doubled
    # comma) are ignored; a list with nothing left means "all", like
    # omitting the option.
    requested = {
        item.strip().lower()
        for item in (args.frameworks or "").split(",")
        if item.strip()
    }
    invalid = requested - FRAMEWORK_CHOICES
    if invalid:
        print(
            f"Error: unknown frameworks: {', '.join(sorted(invalid))}. "
            f"Valid choices: {', '.join(sorted(FRAMEWORK_CHOICES))}",
            file=sys.stderr,
        )
        sys.exit(1)
    frameworks = requested or set(FRAMEWORK_CHOICES)

    # Build exclude set: defaults plus any user-supplied directory names.
    excludes = set(DEFAULT_EXCLUDES)
    if args.exclude:
        excludes.update(
            item.strip() for item in args.exclude.split(",") if item.strip()
        )

    # Discover files
    print(f"Scanning {root} ...")
    buckets = discover_files(root, excludes)

    # Parse each selected framework, in a fixed (alphabetical) order.
    parsers = {
        "bdd": parse_bdd,
        "robot": parse_robot,
        "pytest": parse_pytest,
        "junit": parse_junit,
        "playwright": parse_playwright,
    }

    results: dict[str, list[TestEntry]] = {}
    file_counts: dict[str, int] = {}
    all_entries: list[TestEntry] = []

    for fw in sorted(frameworks):
        files = buckets.get(fw, [])
        file_counts[fw] = len(files)
        results[fw] = parsers[fw](files, root) if files else []
        all_entries.extend(results[fw])

    # Write the CSV; report an unwritable destination as a normal error
    # rather than a traceback.
    output = Path(args.output)
    try:
        write_csv(all_entries, output)
    except OSError as exc:
        print(f"Error: cannot write {output}: {exc}", file=sys.stderr)
        sys.exit(1)

    # Print summary
    print_summary(results, file_counts, output)


# Run the scanner only when this file is executed as a script; importing
# the module must not trigger a scan.
if __name__ == "__main__":
    main()
