# youtube-summarizer/backend/test_runner/core/test_discovery.py
"""
Test Discovery System
Intelligently discovers and categorizes tests across the YouTube Summarizer project.
Handles different test types, patterns, and dependency analysis for smart test selection.
"""
import ast
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Pattern
import logging
from .models import TestCategory, TestInfo, TestSuite
# TestInfo and TestSuite are now imported from models module
class TestDiscovery:
"""
Intelligent test discovery and categorization.
Analyzes test files to understand their category, dependencies, and requirements
for optimal execution planning.
"""
def __init__(self, project_root: Path, config):
    """
    Set up the discovery engine for a project.

    Args:
        project_root: Root directory of the project
        config: Test configuration object
    """
    self.project_root = project_root
    self.config = config
    self.logger = logging.getLogger("TestDiscovery")

    # Raw path regexes per category; compiled in one pass below.
    raw_category_patterns = {
        TestCategory.UNIT: [
            r"backend/tests/unit/.*\.py$",
            r"tests/unit/.*\.py$",
        ],
        TestCategory.INTEGRATION: [
            r"backend/tests/integration/.*\.py$",
            r"tests/integration/.*\.py$",
        ],
        TestCategory.API: [
            r"backend/tests/integration/test.*api.*\.py$",
            r"backend/tests/api/.*\.py$",
        ],
        TestCategory.FRONTEND: [
            r"frontend/src/.*\.test\.(ts|tsx|js|jsx)$",
            r"frontend/tests/.*\.(ts|tsx|js|jsx)$",
            r"frontend/src/test/.*\.tsx?$",
        ],
        TestCategory.E2E: [
            r".*/e2e/.*\.py$",
            r".*/e2e/.*\.spec\.(ts|js)$",
        ],
        TestCategory.PERFORMANCE: [
            r".*/performance/.*\.py$",
            r".*test.*perf.*\.py$",
        ],
        TestCategory.DATABASE: [
            r".*test.*database.*\.py$",
            r".*test.*db.*\.py$",
        ],
    }
    self._category_patterns = {
        category: [re.compile(raw) for raw in raws]
        for category, raws in raw_category_patterns.items()
    }

    # Keywords that hint at a category when path patterns are inconclusive.
    self._category_keywords = {
        TestCategory.AUTH: ["auth", "login", "token", "jwt", "password"],
        TestCategory.PIPELINE: ["pipeline", "summary", "workflow", "orchestrat"],
        TestCategory.DATABASE: ["database", "db", "model", "migration"],
        TestCategory.API: ["api", "endpoint", "route", "client"],
        TestCategory.INTEGRATION: ["integration", "service", "external"],
    }

    # Regex fragments signalling external dependencies of a test.
    self._dependency_patterns = {
        "database": [r"@pytest\.mark\.asyncio.*db", r"TestClient", r"test_db", r"Session"],
        "network": [r"requests\.", r"httpx\.", r"aiohttp", r"urllib"],
        "auth": [r"auth", r"login", r"jwt", r"token", r"password"],
    }
async def discover_by_category(self, category: TestCategory) -> Optional[TestSuite]:
    """
    Discover every test that belongs to one category.

    Args:
        category: Test category to discover

    Returns:
        TestSuite with all discovered tests, or None when the category is empty
    """
    self.logger.info(f"Discovering {category.value} tests...")

    discovered = []
    for candidate in self._find_test_files(category):
        discovered.extend(await self._analyze_test_file(candidate, category))

    if not discovered:
        return None

    suite = TestSuite(category=category, tests=discovered)
    self.logger.info(f"Found {len(discovered)} {category.value} tests "
                     f"(estimated duration: {suite.total_estimated_duration:.1f}s)")
    return suite
async def discover_by_patterns(self, patterns: List[str]) -> Dict[TestCategory, TestSuite]:
    """
    Discover tests matching the given glob or test-name patterns.

    Args:
        patterns: Glob-style file patterns or bare test-name fragments

    Returns:
        Mapping of category to the TestSuite of matching tests
    """
    self.logger.info(f"Discovering tests matching patterns: {patterns}")

    def is_path_pattern(p: str) -> bool:
        # Heuristic: a slash or a .py suffix means "file pattern".
        return "/" in p or p.endswith(".py")

    candidate_files = []
    for pattern in patterns:
        if is_path_pattern(pattern):
            candidate_files.extend(self.project_root.glob(pattern))
        else:
            # Name fragment: consider every known test file.
            candidate_files.extend(self._find_all_test_files())

    # Name-based filtering applies only when no pattern is a file pattern.
    name_search = bool(patterns) and not any(is_path_pattern(p) for p in patterns)

    grouped: Dict[TestCategory, List[TestInfo]] = {}
    for candidate in set(candidate_files):
        category = self._categorize_file(candidate)
        file_tests = await self._analyze_test_file(candidate, category)
        if name_search:
            file_tests = [
                t for t in file_tests
                if any(p.lower() in t.name.lower() for p in patterns)
            ]
        grouped.setdefault(category, []).extend(file_tests)

    return {
        category: TestSuite(category=category, tests=tests)
        for category, tests in grouped.items()
        if tests
    }
def _find_test_files(self, category: TestCategory) -> List[Path]:
    """Find test files whose project-relative path matches *category*.

    Walks the project tree once per extension instead of once per pattern:
    the previous implementation called ``rglob("*.py")`` inside the pattern
    loop, rescanning the whole tree for every pattern (O(patterns x files)).

    Args:
        category: Category whose path patterns should be applied

    Returns:
        De-duplicated list of matching file paths (order unspecified)
    """
    patterns = self._category_patterns.get(category, [])
    if not patterns:
        return []

    matched: Set[Path] = set()

    # Single scan over Python files, testing every pattern per file.
    for candidate in self.project_root.rglob("*.py"):
        rel = str(candidate.relative_to(self.project_root))
        if any(p.search(rel) for p in patterns):
            matched.add(candidate)

    # Frontend tests also live in TypeScript/JavaScript sources.
    if category == TestCategory.FRONTEND:
        for ext in ("*.ts", "*.tsx", "*.js", "*.jsx"):
            for candidate in self.project_root.rglob(ext):
                rel = str(candidate.relative_to(self.project_root))
                if any(p.search(rel) for p in patterns):
                    matched.add(candidate)

    return list(matched)
def _find_all_test_files(self) -> List[Path]:
    """Collect every Python and frontend test file under the project root."""
    # Python naming conventions plus the common JS/TS test suffixes.
    test_globs = (
        "test_*.py", "*_test.py",
        "*.test.ts", "*.test.tsx", "*.test.js", "*.test.jsx",
    )
    collected: Set[Path] = set()
    for glob_pattern in test_globs:
        collected.update(self.project_root.rglob(glob_pattern))
    return list(collected)
def _categorize_file(self, file_path: Path) -> TestCategory:
    """Map a test file to a category via path patterns, then path keywords."""
    rel = str(file_path.relative_to(self.project_root))

    # Explicit regex patterns win over keyword heuristics.
    for category, patterns in self._category_patterns.items():
        for pattern in patterns:
            if pattern.search(rel):
                return category

    # Keyword fallback on the lower-cased relative path (order matters).
    lowered = rel.lower()
    for needle, category in (
        ("integration", TestCategory.INTEGRATION),
        ("unit", TestCategory.UNIT),
        ("api", TestCategory.API),
    ):
        if needle in lowered:
            return category
    if "frontend" in lowered or file_path.suffix in ['.ts', '.tsx', '.js', '.jsx']:
        return TestCategory.FRONTEND
    if "e2e" in lowered:
        return TestCategory.E2E
    return TestCategory.UNIT  # Last-resort default
async def _analyze_test_file(self, file_path: Path, category: TestCategory) -> List[TestInfo]:
    """Dispatch analysis by file extension; on failure log and return []."""
    suffix = file_path.suffix
    try:
        if suffix == ".py":
            return await self._analyze_python_test_file(file_path, category)
        if suffix in (".ts", ".tsx", ".js", ".jsx"):
            return await self._analyze_frontend_test_file(file_path, category)
    except Exception as e:
        self.logger.warning(f"Failed to analyze {file_path}: {e}")
    # Unknown extension or analysis failure: nothing discovered.
    return []
async def _analyze_python_test_file(self, file_path: Path, category: TestCategory) -> List[TestInfo]:
    """Parse a Python test file and build TestInfo entries for its tests.

    Fixes two discovery bugs present in the previous implementation:

    - ``ast.walk`` also yields the methods of each ``Test*`` class after the
      class itself was processed, so every test method was recorded twice
      (once with its class, once as a "standalone" function).  We now walk
      only the module's top level.
    - ``async def`` tests (``ast.AsyncFunctionDef``) were invisible; they
      are now discovered alongside synchronous ones.

    Args:
        file_path: Path of the Python test file
        category: Category assigned to discovered tests (may be refined later)

    Returns:
        One TestInfo per discovered test; empty list on parse failure.
    """
    tests: List[TestInfo] = []
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        tree = ast.parse(content, filename=str(file_path))

        for node in tree.body:
            if isinstance(node, ast.ClassDef) and node.name.startswith('Test'):
                # Test methods grouped under a Test* class.
                for item in node.body:
                    if isinstance(item, function_nodes) and item.name.startswith('test_'):
                        tests.append(self._create_test_info(
                            file_path, category, content, node.name, item.name
                        ))
            elif isinstance(node, function_nodes) and node.name.startswith('test_'):
                # Module-level test function.
                tests.append(self._create_test_info(
                    file_path, category, content, None, node.name
                ))
    except Exception as e:
        self.logger.warning(f"Error parsing {file_path}: {e}")
    return tests
async def _analyze_frontend_test_file(self, file_path: Path, category: TestCategory) -> List[TestInfo]:
    """Extract test entries from a TS/JS test file via lightweight regexes."""
    found: List[TestInfo] = []
    # Declarations of the form it('...'), test('...') and describe('...').
    declaration_regexes = (
        r"it\s*\(\s*['\"]([^'\"]+)['\"]",
        r"test\s*\(\s*['\"]([^'\"]+)['\"]",
        r"describe\s*\(\s*['\"]([^'\"]+)['\"]"
    )
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        for regex in declaration_regexes:
            for match in re.finditer(regex, content):
                label = match.group(1)
                found.append(TestInfo(
                    name=f"{file_path.stem}::{label}",
                    file_path=file_path,
                    category=category,
                    function_name=label,
                    estimated_duration=2.0,  # Frontend tests typically take longer
                    markers=self._extract_frontend_markers(content)
                ))
    except Exception as e:
        self.logger.warning(f"Error analyzing frontend test {file_path}: {e}")
    return found
def _create_test_info(
    self,
    file_path: Path,
    category: TestCategory,
    content: str,
    class_name: Optional[str],
    function_name: str
) -> TestInfo:
    """Assemble a TestInfo record from static analysis of the file content."""
    # Qualified name: Class::method, or file::function for module-level tests.
    owner = class_name if class_name else file_path.stem

    # Markers feed the duration estimate, so extract them first.
    markers = self._extract_markers(content)

    return TestInfo(
        name=f"{owner}::{function_name}",
        file_path=file_path,
        category=self._refine_category(category, content, file_path),
        class_name=class_name,
        function_name=function_name,
        markers=markers,
        estimated_duration=self._estimate_test_duration(category, markers, content),
        requires_database=self._requires_database(content),
        requires_network=self._requires_network(content),
        requires_auth=self._requires_auth(content)
    )
def _extract_markers(self, content: str) -> List[str]:
    """Extract pytest marker names from test source text.

    The previous implementation matched the generic ``@pytest.mark.<name>``
    pattern *and* separate per-marker patterns, so markers such as
    ``asyncio`` and ``parametrize`` were appended twice.  A single generic
    pattern plus order-preserving de-duplication yields each marker once.

    Args:
        content: Full source text of the test file

    Returns:
        Marker names in first-seen order, without duplicates.
    """
    found = re.findall(r"@pytest\.mark\.(\w+)", content)
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(found))
def _extract_frontend_markers(self, content: str) -> List[str]:
    """Derive coarse markers for a frontend test from its source text."""
    markers = []
    # "async" is checked case-sensitively (matches the async/await keywords).
    if "async" in content or "await" in content:
        markers.append("async")
    lowered = content.lower()
    for needle in ("mock", "timeout"):
        if needle in lowered:
            markers.append(needle)
    return markers
def _requires_database(self, content: str) -> bool:
    """Heuristically decide whether a test needs database setup.

    Most indicators are plain case-insensitive substrings.  The two-letter
    token ``db`` is matched on word boundaries instead: as a bare substring
    it fired on unrelated words such as "feedback".

    Args:
        content: Full source text of the test file

    Returns:
        True when any database indicator is present.
    """
    content_lower = content.lower()
    substring_indicators = [
        "session", "test_db", "database", "engine",
        "create_all", "drop_all", "transaction", "commit"
    ]
    if any(indicator in content_lower for indicator in substring_indicators):
        return True
    # Word-bounded check for the short "db" token only.
    return re.search(r"\bdb\b", content_lower) is not None
def _requires_network(self, content: str) -> bool:
    """Check whether a test appears to touch the network (case-sensitive)."""
    network_indicators = (
        "requests.", "httpx.", "aiohttp", "urllib", "http://", "https://",
        "TestClient", "client.post", "client.get", "mock_response"
    )
    for indicator in network_indicators:
        if indicator in content:
            return True
    return False
def _requires_auth(self, content: str) -> bool:
    """Check whether a test appears to exercise authentication paths."""
    lowered = content.lower()
    # Indicators are pre-lowercased; comparison is case-insensitive.
    auth_indicators = (
        "auth", "login", "jwt", "token", "password", "authenticate",
        "authorization", "bearer", "session"
    )
    return any(indicator in lowered for indicator in auth_indicators)
def _estimate_test_duration(self, category: TestCategory, markers: List[str], content: str) -> float:
    """Rough per-test duration estimate in seconds.

    Starts from a per-category baseline and applies multiplicative
    adjustments for markers and source complexity.
    """
    category_baselines = {
        TestCategory.UNIT: 0.1,
        TestCategory.INTEGRATION: 2.0,
        TestCategory.API: 1.5,
        TestCategory.FRONTEND: 3.0,
        TestCategory.E2E: 30.0,
        TestCategory.PERFORMANCE: 60.0,
        TestCategory.DATABASE: 5.0,
        TestCategory.AUTH: 2.0,
        TestCategory.PIPELINE: 10.0
    }
    estimate = category_baselines.get(category, 1.0)

    # Marker-driven multipliers.
    if "slow" in markers:
        estimate *= 5.0
    if "asyncio" in markers:
        estimate *= 1.5
    if "parametrize" in markers:
        # Assume roughly three cases per parametrize decorator.
        param_count = content.count("@pytest.mark.parametrize") * 3
        estimate *= max(param_count, 1)

    # Content-complexity multipliers.
    if content.count("await") > 10:
        estimate *= 2.0
    if content.count("mock") > 5:
        estimate *= 1.5
    return estimate
def _refine_category(self, base_category: TestCategory, content: str, file_path: Path) -> TestCategory:
    """Upgrade a UNIT classification when content keywords point elsewhere."""
    haystacks = (content.lower(), file_path.name.lower())
    # Only these categories may override a generic UNIT classification.
    overridable = (TestCategory.AUTH, TestCategory.PIPELINE, TestCategory.DATABASE)
    for category, keywords in self._category_keywords.items():
        keyword_hit = any(kw in hay for kw in keywords for hay in haystacks)
        if keyword_hit and base_category == TestCategory.UNIT and category in overridable:
            return category
    return base_category
async def get_test_dependencies(self, test_info: "TestInfo") -> List[str]:
    """Collect project modules a test imports, for execution ordering.

    Scans the test file's import statements and keeps modules whose dotted
    path mentions "service", "model" or "api".  Results are de-duplicated
    in first-seen order (the previous version returned one entry per
    matching import line, including repeats) and capped at 10.

    Args:
        test_info: Test whose source file is analyzed

    Returns:
        Up to 10 unique dependency module names; empty on read failure.
    """
    collected: List[str] = []
    import_patterns = [
        r"from\s+(\S+)\s+import",
        r"import\s+(\S+)"
    ]
    try:
        with open(test_info.file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        for pattern in import_patterns:
            for match in re.finditer(pattern, content):
                module = match.group(1)
                if any(keyword in module for keyword in ("service", "model", "api")):
                    collected.append(module)
    except Exception as e:
        self.logger.warning(f"Error analyzing dependencies for {test_info.name}: {e}")
    # De-duplicate while preserving order, then cap the list size.
    return list(dict.fromkeys(collected))[:10]