<?php
class Crawler {
    private $db;
    private $gpt;

    /**
     * Set up the crawler's two collaborators: the connection returned by
     * Database::getConnection() and a new GPTClient instance.
     */
    public function __construct() {
        $this->db = (new Database())->getConnection();
        $this->gpt = new GPTClient();
    }

    /**
     * Crawl every active source once and aggregate the per-source outcomes.
     *
     * For each source this crawls it, stamps its last-crawled time, stores a
     * crawl history row and folds the counters into a running summary.
     *
     * @return array|false Summary with keys sources_processed, articles_found,
     *                     articles_created, articles_skipped and errors (a list
     *                     of "name: message" strings for failed sources), or
     *                     false when an exception reaches this method.
     */
    public function crawlSources() {
        $summary = [
            'sources_processed' => 0,
            'articles_found' => 0,
            'articles_created' => 0,
            'articles_skipped' => 0,
            'errors' => []
        ];
        $counterKeys = ['articles_found', 'articles_created', 'articles_skipped'];

        try {
            foreach ($this->getActiveSources() as $source) {
                $outcome = $this->crawlSource($source);
                $this->updateLastCrawled($source['id']);

                // Persist this run before folding it into the summary
                $this->recordCrawlHistory($source['id'], $outcome);

                $summary['sources_processed'] += 1;
                foreach ($counterKeys as $key) {
                    $summary[$key] += $outcome[$key];
                }

                if ($outcome['status'] !== 'failed') {
                    continue;
                }
                $summary['errors'][] = $source['name'] . ': ' . $outcome['error_message'];
            }

            return $summary;
        } catch (Exception $e) {
            error_log("Crawl sources error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Load every crawl_sources row whose is_active flag equals 1.
     *
     * @return array Matching rows, or an empty array when the query throws
     *               (the error is written to the error log).
     */
    private function getActiveSources() {
        try {
            $statement = $this->db->prepare("SELECT * FROM crawl_sources WHERE is_active = 1");
            $statement->execute();
            $rows = $statement->fetchAll();
            return $rows;
        } catch (Exception $e) {
            error_log("Get active sources error: " . $e->getMessage());
            return [];
        }
    }

    /**
     * Crawl a single source: fetch its page, extract candidate articles,
     * skip titles that already exist and save the remainder.
     *
     * A fetch failure (fetchContent() returning false) is reported as a
     * failed crawl instead of a successful crawl that found nothing, so the
     * crawl history and the caller's error list reflect unreachable sources.
     * A response that was fetched but is empty still counts as a successful
     * crawl with zero articles.
     *
     * @param array $source Source row; 'url' is read here and the whole row is
     *                      passed on to the extraction and save steps.
     * @return array Keys: status ('success' or 'failed'), articles_found,
     *               articles_created, articles_skipped, error_message (null
     *               unless the crawl failed) and execution_time in milliseconds.
     */
    private function crawlSource($source) {
        $startTime = microtime(true);
        $result = [
            'status' => 'success',
            'articles_found' => 0,
            'articles_created' => 0,
            'articles_skipped' => 0,
            'error_message' => null,
            'execution_time' => 0
        ];

        try {
            $content = $this->fetchContent($source['url']);

            if ($content === false) {
                // Routed through the catch block below so the failure is
                // recorded and logged exactly like any other crawl error.
                throw new RuntimeException('Failed to fetch content from source URL');
            }

            if ($content) {
                $articles = $this->extractArticles($content, $source);
                $result['articles_found'] = count($articles);

                foreach ($articles as $articleData) {
                    if ($this->isDuplicate($articleData['title'])) {
                        $result['articles_skipped']++;
                        continue;
                    }

                    $articleId = $this->processAndSaveArticle($articleData, $source);
                    if ($articleId) {
                        $result['articles_created']++;
                    }
                }
            }
        } catch (Exception $e) {
            $result['status'] = 'failed';
            $result['error_message'] = $e->getMessage();
            error_log("Crawl source error for {$source['url']}: " . $e->getMessage());
        }

        $result['execution_time'] = round((microtime(true) - $startTime) * 1000); // in milliseconds
        return $result;
    }

    /**
     * Download a page over HTTP(S) with cURL.
     *
     * TLS certificates and host names are verified, and both the initial
     * request and any redirects are limited to http/https so a stored or
     * redirected URL cannot reach other protocol handlers. Up to 3 redirects
     * are followed and the transfer is capped at 30 seconds.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body when the final status is 200,
     *                      false on any transport error or other status
     *                      (the reason is written to the error log).
     */
    private function fetchContent($url) {
        try {
            $ch = curl_init();
            if ($ch === false) {
                error_log("Fetch content error: unable to initialise cURL");
                return false;
            }

            curl_setopt_array($ch, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS => 3,
                CURLOPT_TIMEOUT => 30,
                CURLOPT_USERAGENT => 'Excellent Blog Crawler 1.0',
                CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                // Verification stays on: with it disabled any on-path host
                // could substitute the content that ends up published.
                CURLOPT_SSL_VERIFYPEER => true,
                CURLOPT_SSL_VERIFYHOST => 2
            ]);

            $content = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $errorCode = curl_errno($ch);
            $errorText = curl_error($ch);
            curl_close($ch);

            if ($errorCode || $httpCode !== 200) {
                error_log("Fetch content error for {$url}: HTTP {$httpCode}, cURL error {$errorCode} {$errorText}");
                return false;
            }

            return $content;
        } catch (Exception $e) {
            error_log("Fetch content error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Parse a fetched page and collect up to five article candidates.
     *
     * Candidate containers are <article> elements and <div> elements whose
     * class contains "post" or "article". A candidate is kept only when it has
     * a title and more than 200 bytes of extracted content.
     *
     * Markup that is valid UTF-8 is parsed as UTF-8. Without an explicit hint
     * DOMDocument::loadHTML() falls back to a single-byte encoding, which
     * garbles non-ASCII titles and content. Parser warnings for malformed HTML
     * are collected internally and discarded rather than emitted.
     *
     * @param string $html   Raw page markup.
     * @param array  $source Source row; 'url' and 'name' are read.
     * @return array List of arrays with keys title, content, source_url and
     *               source_name. Empty when the markup is empty or parsing throws.
     */
    private function extractArticles($html, $source) {
        $articles = [];

        // loadHTML() raises a ValueError on an empty string under PHP 8.
        if (!is_string($html) || $html === '') {
            return $articles;
        }

        try {
            // preg_match with the /u flag fails on invalid UTF-8, so this only
            // adds the hint when the bytes really are UTF-8. Pages in another
            // encoding keep their own <meta> declaration.
            if (preg_match('//u', $html) === 1) {
                $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html;
            }

            $dom = new DOMDocument();
            $previousErrorMode = libxml_use_internal_errors(true);
            try {
                $dom->loadHTML($html);
            } finally {
                libxml_clear_errors();
                libxml_use_internal_errors($previousErrorMode);
            }
            $xpath = new DOMXPath($dom);

            // Generic article extraction - can be enhanced with specific selectors per source
            $articleNodes = $xpath->query('//article | //div[contains(@class, "post")] | //div[contains(@class, "article")]');

            foreach ($articleNodes as $node) {
                $title = $this->extractTitle($xpath, $node);
                $content = $this->extractContent($xpath, $node);
                $link = $this->extractLink($xpath, $node, $source['url']);

                if ($title && $content && strlen($content) > 200) {
                    $articles[] = [
                        'title' => $title,
                        'content' => $content,
                        'source_url' => $link,
                        'source_name' => $source['name']
                    ];
                }

                // Limit to prevent overload
                if (count($articles) >= 5) {
                    break;
                }
            }
        } catch (Exception $e) {
            error_log("Extract articles error: " . $e->getMessage());
        }

        return $articles;
    }

    /**
     * Find the title of an article container.
     *
     * Looks below $node for h1/h2/h3 elements or any element whose class
     * contains "title" and uses the first match returned by the query.
     *
     * @param DOMXPath $xpath XPath evaluator for the parsed page.
     * @param DOMNode  $node  Article container to search within.
     * @return string|null Trimmed text of the first match, or null when
     *                     nothing matches.
     */
    private function extractTitle($xpath, $node) {
        $candidates = $xpath->query('.//h1 | .//h2 | .//h3 | .//*[contains(@class, "title")]', $node);

        foreach ($candidates as $candidate) {
            // Only the first match is ever used
            return trim($candidate->textContent);
        }

        return null;
    }

    /**
     * Collect the body text of an article container.
     *
     * Candidates are <p> elements and <div> elements whose class contains
     * "content". Blocks with 50 bytes of trimmed text or fewer are ignored
     * and the remaining blocks are joined with blank lines.
     *
     * A candidate nested inside a block that was already collected is
     * skipped: that block's text already includes it, so emitting it again
     * would duplicate the text (for example a <p> inside a div.content).
     *
     * @param DOMXPath $xpath XPath evaluator for the parsed page.
     * @param DOMNode  $node  Article container to search within.
     * @return string Collected text, or '' when nothing qualifies.
     */
    private function extractContent($xpath, $node) {
        $contentNodes = $xpath->query('.//p | .//div[contains(@class, "content")]', $node);
        $collectedPaths = [];
        $content = '';

        // Containers are visited before the nodes nested inside them, so an
        // enclosing block is always recorded before its descendants are seen.
        foreach ($contentNodes as $contentNode) {
            $alreadyCovered = false;
            for ($ancestor = $contentNode->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
                $ancestorPath = $ancestor->getNodePath();
                if ($ancestorPath !== null && isset($collectedPaths[$ancestorPath])) {
                    $alreadyCovered = true;
                    break;
                }
            }
            if ($alreadyCovered) {
                continue;
            }

            $text = trim($contentNode->textContent);
            if (strlen($text) > 50) {
                $content .= $text . "\n\n";

                $ownPath = $contentNode->getNodePath();
                if ($ownPath !== null) {
                    $collectedPaths[$ownPath] = true;
                }
            }
        }

        return trim($content);
    }

    /**
     * Determine the URL an article container links to.
     *
     * Uses the first anchor below $node whose href can be turned into an
     * http(s) URL. Empty hrefs, fragment-only hrefs and hrefs with another
     * scheme (javascript:, mailto:, ...) are skipped.
     *
     * @param DOMXPath $xpath   XPath evaluator for the parsed page.
     * @param DOMNode  $node    Article container to search within.
     * @param string   $baseUrl URL of the crawled page, used to resolve
     *                          relative hrefs and as the fallback.
     * @return string Absolute link, or $baseUrl when no usable anchor exists.
     */
    private function extractLink($xpath, $node, $baseUrl) {
        $linkNodes = $xpath->query('.//a[@href]', $node);

        foreach ($linkNodes as $linkNode) {
            $resolved = $this->resolveUrl(trim($linkNode->getAttribute('href')), $baseUrl);
            if ($resolved !== null) {
                return $resolved;
            }
        }

        return $baseUrl;
    }

    /**
     * Resolve an href against the URL of the page it was found on.
     *
     * @param string $href    Trimmed href attribute value.
     * @param string $baseUrl URL of the page containing the link.
     * @return string|null Absolute URL, or null when the href does not point
     *                     at an http(s) resource.
     */
    private function resolveUrl($href, $baseUrl) {
        if ($href === '' || $href[0] === '#') {
            return null;
        }
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }
        // Any other explicit scheme is not a crawlable article link.
        if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href) === 1) {
            return null;
        }

        $base = parse_url($baseUrl);
        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            // The base URL cannot be decomposed; fall back to joining the
            // two parts with a single slash.
            return rtrim($baseUrl, '/') . '/' . ltrim($href, '/');
        }

        // Protocol-relative link: inherit the scheme of the page.
        if (strpos($href, '//') === 0) {
            return $base['scheme'] . ':' . $href;
        }

        $origin = $base['scheme'] . '://' . $base['host'];
        if (isset($base['port'])) {
            $origin .= ':' . $base['port'];
        }
        $basePath = isset($base['path']) ? $base['path'] : '/';

        // Root-relative link: replaces the whole path of the page.
        if ($href[0] === '/') {
            return $origin . $href;
        }
        // Query-only link: keeps the path of the page.
        if ($href[0] === '?') {
            return $origin . $basePath . $href;
        }

        // Path-relative link: resolve against the directory of the page and
        // collapse "." and ".." segments. Query and fragment are carried over.
        $splitAt = strcspn($href, '?#');
        $relativePath = substr($href, 0, $splitAt);
        $suffix = substr($href, $splitAt);
        $directory = substr($basePath, 0, strrpos($basePath, '/') + 1);

        $rawSegments = explode('/', ltrim($directory . $relativePath, '/'));
        $segments = [];
        foreach ($rawSegments as $segment) {
            if ($segment === '..') {
                array_pop($segments);
            } elseif ($segment !== '.') {
                $segments[] = $segment;
            }
        }
        $lastSegment = end($rawSegments);
        if ($lastSegment === '.' || $lastSegment === '..') {
            // A trailing dot segment refers to a directory; keep the slash.
            $segments[] = '';
        }

        return $origin . '/' . implode('/', $segments) . $suffix;
    }

    /**
     * Check whether an article with exactly this title is already stored.
     *
     * @param string $title Title to look up in the articles table.
     * @return bool True when at least one row matches. False when none match
     *              or when the lookup throws (the error is logged).
     */
    private function isDuplicate($title) {
        try {
            $statement = $this->db->prepare("SELECT COUNT(*) as count FROM articles WHERE title = :title");
            $statement->execute([':title' => $title]);

            $row = $statement->fetch();
            $matches = $row['count'];
            return $matches > 0;
        } catch (Exception $e) {
            error_log("Check duplicate error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Run an extracted article through the AI rewrite and store it.
     *
     * The rewritten content is used when improveContent() returns something
     * non-empty; otherwise the extracted content is stored unchanged. The
     * excerpt (200 characters) and meta description (160 characters) are cut
     * from the tag-stripped text on character boundaries so multi-byte text
     * is never split in the middle of a character, and the excerpt gets a
     * trailing "..." only when text was actually cut off.
     *
     * @param array $articleData Keys read: title, content, source_url.
     * @param array $source      Source row (not read by this method).
     * @return mixed Value returned by Article::create() when it is truthy,
     *               false when creation fails or an exception is thrown.
     */
    private function processAndSaveArticle($articleData, $source) {
        try {
            // Use AI to clean and improve the content
            $improvedContent = $this->improveContent($articleData['content'], $articleData['title']);
            $finalContent = $improvedContent ?: $articleData['content'];
            $plainText = strip_tags($finalContent);

            $article = new Article();

            $data = [
                'title' => $articleData['title'],
                'content' => $finalContent,
                'excerpt' => $this->truncateText($plainText, 200, '...'),
                'type' => 'external',
                'status' => 'published',
                'author_id' => 1, // System user
                'category_id' => $this->getDefaultCategoryId(),
                'is_ai_generated' => true,
                'source_url' => $articleData['source_url'],
                'meta_title' => $articleData['title'],
                'meta_description' => $this->truncateText($plainText, 160),
                'featured_image' => null
            ];

            $articleId = $article->create($data);

            if ($articleId) {
                error_log("Crawled article saved: ID {$articleId}, Title: {$data['title']}");
                return $articleId;
            }

            return false;
        } catch (Exception $e) {
            error_log("Process and save article error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Shorten text to a maximum length, appending a suffix only when cut.
     *
     * Counts UTF-8 characters when the mbstring extension is loaded and
     * falls back to byte counting otherwise.
     *
     * @param string $text     Text to shorten.
     * @param int    $maxChars Maximum length before the suffix.
     * @param string $suffix   Appended when the text was shortened.
     * @return string The original text when it fits, else the shortened text.
     */
    private function truncateText($text, $maxChars, $suffix = '') {
        if (function_exists('mb_substr')) {
            if (mb_strlen($text, 'UTF-8') <= $maxChars) {
                return $text;
            }
            return mb_substr($text, 0, $maxChars, 'UTF-8') . $suffix;
        }

        if (strlen($text) <= $maxChars) {
            return $text;
        }
        return substr($text, 0, $maxChars) . $suffix;
    }

    /**
     * Ask the GPT client to rewrite an article as formatted HTML.
     *
     * Builds a prompt from the title and content and sends it through
     * generateArticle() with type 'external' and language 'zh-TW'.
     *
     * @param string $content Extracted article text.
     * @param string $title   Article title.
     * @return mixed The 'content' entry of the client's response, or null
     *               when that entry is absent or the call throws.
     */
    private function improveContent($content, $title) {
        try {
            $prompt = "Please improve and restructure the following article content. Make it well-formatted with proper HTML tags, improve readability, fix any grammar issues, and ensure it's engaging for readers. Keep the original meaning but enhance the presentation.

Title: {$title}

Content: {$content}

Please return only the improved content with proper HTML formatting.";

            $response = $this->gpt->generateArticle($prompt, 'external', 'zh-TW');
            $improved = $response['content'] ?? null;
            return $improved;
        } catch (Exception $e) {
            error_log("Improve content error: " . $e->getMessage());
            return null;
        }
    }

    /**
     * Look up the id of the category whose slug is 'news'.
     *
     * @return mixed The id column of the first matching row, or null when
     *               no such category exists or the query throws.
     */
    private function getDefaultCategoryId() {
        try {
            $statement = $this->db->prepare("SELECT id FROM categories WHERE slug = 'news' LIMIT 1");
            $statement->execute();

            $row = $statement->fetch();
            if (!$row) {
                return null;
            }
            return $row['id'];
        } catch (Exception $e) {
            error_log("Get default category error: " . $e->getMessage());
            return null;
        }
    }

    /**
     * Stamp a source's last_crawled column with the database's current time.
     *
     * @param mixed $sourceId Id of the crawl_sources row to update.
     * @return bool Result of executing the UPDATE, or false when it throws.
     */
    private function updateLastCrawled($sourceId) {
        try {
            $statement = $this->db->prepare("UPDATE crawl_sources SET last_crawled = NOW() WHERE id = :id");
            $updated = $statement->execute([':id' => $sourceId]);
            return $updated;
        } catch (Exception $e) {
            error_log("Update last crawled error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Register a new crawl source.
     *
     * @param string      $name     Display name of the source.
     * @param string      $url      Page to crawl.
     * @param string|null $selector Optional selector stored with the source.
     * @return string|false Id reported by the connection for the inserted
     *                      row, or false when the insert fails or throws.
     */
    public function addSource($name, $url, $selector = null) {
        try {
            $statement = $this->db->prepare("INSERT INTO crawl_sources (name, url, selector) VALUES (:name, :url, :selector)");
            $inserted = $statement->execute([
                ':name' => $name,
                ':url' => $url,
                ':selector' => $selector
            ]);

            return $inserted ? $this->db->lastInsertId() : false;
        } catch (Exception $e) {
            error_log("Add source error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Delete a crawl source by id.
     *
     * @param mixed $id Id of the crawl_sources row to delete.
     * @return bool Result of executing the DELETE, or false when it throws.
     */
    public function removeSource($id) {
        try {
            $statement = $this->db->prepare("DELETE FROM crawl_sources WHERE id = :id");
            $deleted = $statement->execute([':id' => $id]);
            return $deleted;
        } catch (Exception $e) {
            error_log("Remove source error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * List every crawl source, most recently created first.
     *
     * @return array All crawl_sources rows ordered by created_at descending,
     *               or an empty array when the query throws.
     */
    public function getAllSources() {
        try {
            $statement = $this->db->prepare("SELECT * FROM crawl_sources ORDER BY created_at DESC");
            $statement->execute();
            $rows = $statement->fetchAll();
            return $rows;
        } catch (Exception $e) {
            error_log("Get all sources error: " . $e->getMessage());
            return [];
        }
    }

    /**
     * Enable or disable a crawl source.
     *
     * @param mixed $id     Id of the crawl_sources row to update.
     * @param bool  $active New value for is_active, bound as a boolean.
     * @return bool Result of executing the UPDATE, or false when it throws.
     */
    public function toggleSource($id, $active) {
        try {
            $statement = $this->db->prepare("UPDATE crawl_sources SET is_active = :active WHERE id = :id");
            $statement->bindValue(':id', $id);
            $statement->bindValue(':active', $active, PDO::PARAM_BOOL);

            $updated = $statement->execute();
            return $updated;
        } catch (Exception $e) {
            error_log("Toggle source error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Store the outcome of one source crawl in crawl_history.
     *
     * @param mixed $sourceId Id of the crawled source.
     * @param array $result   Crawl outcome; the keys articles_found,
     *                        articles_created, articles_skipped, status,
     *                        error_message and execution_time are written.
     * @return bool Result of executing the INSERT, or false when it throws.
     */
    private function recordCrawlHistory($sourceId, $result) {
        try {
            $query = "INSERT INTO crawl_history (source_id, articles_found, articles_created, articles_skipped, status, error_message, execution_time, completed_at)
                      VALUES (:source_id, :articles_found, :articles_created, :articles_skipped, :status, :error_message, :execution_time, NOW())";
            $resultFields = [
                'articles_found',
                'articles_created',
                'articles_skipped',
                'status',
                'error_message',
                'execution_time'
            ];

            $statement = $this->db->prepare($query);
            $statement->bindValue(':source_id', $sourceId);
            // Each placeholder is named after the result key it stores
            foreach ($resultFields as $field) {
                $statement->bindValue(':' . $field, $result[$field]);
            }

            return $statement->execute();
        } catch (Exception $e) {
            error_log("Record crawl history error: " . $e->getMessage());
            return false;
        }
    }

    /**
     * Fetch the most recent crawl history rows together with the name of
     * the source each one belongs to.
     *
     * The limit is converted to a non-negative integer before binding. A
     * numeric string (for example a value taken from a query string) would
     * otherwise be sent as a quoted string when prepares are emulated, and a
     * negative number is not a valid LIMIT; in both cases the query fails and
     * the method returns an empty list.
     *
     * @param int $limit Maximum number of rows to return; values below 0
     *                   are treated as 0.
     * @return array Rows of crawl_history plus source_name, newest
     *               started_at first. Empty when the query throws.
     */
    public function getCrawlHistory($limit = 20) {
        try {
            $limit = max(0, (int) $limit);

            $query = "SELECT ch.*, cs.name as source_name
                      FROM crawl_history ch
                      LEFT JOIN crawl_sources cs ON ch.source_id = cs.id
                      ORDER BY ch.started_at DESC
                      LIMIT :limit";

            $stmt = $this->db->prepare($query);
            $stmt->bindValue(':limit', $limit, PDO::PARAM_INT);
            $stmt->execute();

            return $stmt->fetchAll();
        } catch (Exception $e) {
            error_log("Get crawl history error: " . $e->getMessage());
            return [];
        }
    }

    /**
     * Compute summary figures from crawl_history.
     *
     * @return array Keys: crawls_today (history rows started today),
     *               articles_today (articles created by those rows),
     *               success_rate (percentage of 'success' rows in the last
     *               24 hours, one decimal, 0 when there are none) and
     *               avg_execution_time (rounded average over the last 24
     *               hours). Empty array when any query throws.
     */
    public function getCrawlStats() {
        try {
            // Every figure comes from a parameterless query yielding one row
            $fetchRow = function ($sql) {
                $statement = $this->db->prepare($sql);
                $statement->execute();
                return $statement->fetch();
            };

            // Total crawls today
            $crawlsRow = $fetchRow("SELECT COUNT(*) as count FROM crawl_history WHERE DATE(started_at) = CURDATE()");
            $crawlsToday = $crawlsRow['count'];

            // Total articles created today
            $articlesRow = $fetchRow("SELECT SUM(articles_created) as count FROM crawl_history WHERE DATE(started_at) = CURDATE()");
            $articlesToday = $articlesRow['count'] ?: 0;

            // Success rate (last 24 hours)
            $rateSql = "SELECT
                        COUNT(*) as total,
                        SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successful
                      FROM crawl_history
                      WHERE started_at >= DATE_SUB(NOW(), INTERVAL 24 HOUR)";
            $rateRow = $fetchRow($rateSql);
            if ($rateRow['total'] > 0) {
                $successRate = round(($rateRow['successful'] / $rateRow['total']) * 100, 1);
            } else {
                $successRate = 0;
            }

            // Average execution time
            $timingRow = $fetchRow("SELECT AVG(execution_time) as avg_time FROM crawl_history WHERE started_at >= DATE_SUB(NOW(), INTERVAL 24 HOUR)");
            $averageTime = round($timingRow['avg_time'] ?: 0);

            return [
                'crawls_today' => $crawlsToday,
                'articles_today' => $articlesToday,
                'success_rate' => $successRate,
                'avg_execution_time' => $averageTime
            ];
        } catch (Exception $e) {
            error_log("Get crawl stats error: " . $e->getMessage());
            return [];
        }
    }
}
?>