<?php
/**
 * AiPress Chat — Indexer component (CLEAN PRODUCTION VERSION).
 *
 * @package aiPress_Chat
 */

namespace AIPress;

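/**
 * Builds and reports the aiPress content index.
 *
 * Collects the text of selected posts/pages, splits it into word chunks,
 * requests an OpenAI embedding for each chunk and stores chunk + embedding
 * in the `{prefix}aipress_chunks` table. Also exposes the
 * `wp_ajax_aipress_index_now` AJAX action and the `[aipress_status]` shortcode.
 */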
class Indexer {
	/**
	 * Unprefixed name of the table holding content chunks and their embeddings.
	 */
	private const TABLE_SUFFIX = 'aipress_chunks';

	/**
	 * Maximum number of items indexed per run when aipress_is_pro() is false.
	 */
	private const FREE_CONTENT_LIMIT = 5;

	/**
	 * Minimum length (bytes) a page text or a chunk must have to be indexed.
	 */
	private const MIN_CONTENT_LENGTH = 5;

	/**
	 * The HTTP-rendered text must be longer than this (bytes) to be trusted.
	 */
	private const MIN_RENDERED_LENGTH = 50;

	/**
	 * Number of characters shown in the "Sample Content" column of the status table.
	 */
	private const PREVIEW_LENGTH = 100;

	/**
	 * Registers the activation hook, the AJAX indexing action and the status shortcode.
	 */
	public function __construct() {
		register_activation_hook( AIPRESS_PLUGIN_FILE, array( $this, 'create_table' ) );
		add_action( 'wp_ajax_aipress_index_now', array( $this, 'index_selected' ) );
		add_shortcode( 'aipress_status', array( $this, 'status_shortcode' ) );
	}

	/**
	 * Creates (or upgrades) the chunk table on plugin activation.
	 *
	 * The statement follows the formatting rules dbDelta() documents: one column
	 * per line, `PRIMARY KEY` followed by two spaces, `KEY` instead of `INDEX`,
	 * and no `IF NOT EXISTS`. Column types match the previous one-line statement.
	 */
	public function create_table() {
		global $wpdb;

		$table           = $this->table_name();
		$charset_collate = $wpdb->get_charset_collate();

		$sql = "CREATE TABLE {$table} (
	id bigint(20) NOT NULL AUTO_INCREMENT,
	page_id bigint(20) DEFAULT NULL,
	chunk_index int(11) DEFAULT NULL,
	content longtext,
	embedding longtext,
	PRIMARY KEY  (id),
	KEY page_id (page_id)
) {$charset_collate};";

		require_once ABSPATH . 'wp-admin/includes/upgrade.php';
		dbDelta( $sql );
	}

	/**
	 * Returns the fully prefixed chunk table name.
	 *
	 * @return string Table name including the site's database prefix.
	 */
	private function table_name(): string {
		global $wpdb;

		return $wpdb->prefix . self::TABLE_SUFFIX;
	}

	/**
	 * Reads the plugin settings option, always as an array.
	 *
	 * @return array The `aipress_settings` option, or an empty array when unset/invalid.
	 */
	private function get_settings(): array {
		$settings = get_option( 'aipress_settings', array() );

		return is_array( $settings ) ? $settings : array();
	}

	/**
	 * Resolves the OpenAI API key.
	 *
	 * The saved setting wins; when it is missing or blank the optional
	 * AIPRESS_OPENAI_KEY constant is used. Every entry point shares this lookup so
	 * that a constant-only configuration is not rejected before embed() runs.
	 *
	 * @return string The API key, or '' when none is configured.
	 */
	private function get_api_key(): string {
		$settings = $this->get_settings();
		$key      = '';

		if ( isset( $settings['openai_key'] ) && is_string( $settings['openai_key'] ) ) {
			$key = trim( $settings['openai_key'] );
		}

		if ( '' === $key && defined( 'AIPRESS_OPENAI_KEY' ) ) {
			$key = trim( (string) constant( 'AIPRESS_OPENAI_KEY' ) );
		}

		return $key;
	}

	/**
	 * Extracts the list of selected content IDs from the settings.
	 *
	 * `selected_content` takes precedence whenever it is set; the older
	 * `selected_pages` key is read only for backward compatibility.
	 *
	 * @param array $settings Plugin settings as returned by get_settings().
	 * @return array Selected IDs (possibly empty).
	 */
	private function get_selected_ids( array $settings ): array {
		$ids = $settings['selected_content'] ?? $settings['selected_pages'] ?? array();

		return is_array( $ids ) ? $ids : array();
	}

	/**
	 * Applies the free-version cap on the number of items indexed per run.
	 *
	 * @param array $ids Candidate content IDs.
	 * @return array The IDs that may be indexed.
	 */
	private function apply_plan_limit( array $ids ): array {
		if ( aipress_is_pro() ) {
			return $ids;
		}

		return array_slice( $ids, 0, self::FREE_CONTENT_LIMIT );
	}

	/**
	 * Picks the text to index for a post, trying several sources in order.
	 *
	 * 1. The `_aipress_content` post meta (optionally appended to auto-detected
	 *    content when `_aipress_content_mode` is `add`).
	 * 2. The page as rendered over HTTP, when it yields more than 50 bytes of text.
	 * 3. The stored post content with shortcodes expanded.
	 * 4. Title plus excerpt.
	 *
	 * @param int $page_id The WordPress post/page ID.
	 * @return string The content to index; a fixed placeholder sentence when nothing is found.
	 */
	private function get_rendered_content( $page_id ) {
		$custom_content = get_post_meta( $page_id, '_aipress_content', true );

		if ( ! empty( $custom_content ) ) {
			$content_mode = get_post_meta( $page_id, '_aipress_content_mode', true ) ?: 'replace';

			if ( 'add' === $content_mode ) {
				$auto_content = $this->get_auto_detected_content( $page_id );
				if ( ! empty( $auto_content ) ) {
					return $auto_content . "\n\n" . $custom_content;
				}
			}

			return $custom_content;
		}

		// Text already reduced to plain text by extract_content_from_html().
		$rendered_content = $this->fetch_page_via_http( $page_id );
		if ( strlen( trim( $rendered_content ) ) > self::MIN_RENDERED_LENGTH ) {
			return $rendered_content;
		}

		$fallback = $this->get_fallback_content( $page_id );
		if ( '' !== $fallback ) {
			return $fallback;
		}

		return 'No content available for this page.';
	}

	/**
	 * Database-backed fallback: post content with shortcodes expanded, else "title. excerpt".
	 *
	 * @param int $page_id The WordPress post/page ID.
	 * @return string The fallback text, or '' when the post has neither content nor title.
	 */
	private function get_fallback_content( $page_id ): string {
		$post_content = (string) get_post_field( 'post_content', $page_id );
		if ( '' !== trim( $post_content ) ) {
			return (string) do_shortcode( $post_content );
		}

		$title   = (string) get_the_title( $page_id );
		$excerpt = (string) get_the_excerpt( $page_id );
		if ( '' !== $title ) {
			return $title . '. ' . $excerpt;
		}

		return '';
	}

	/**
	 * Fetches the public URL of a post and returns its visible text.
	 *
	 * Only a 200 response is accepted; otherwise an error page (for example the
	 * 404 returned for a draft) would be indexed instead of letting the caller
	 * fall back to the stored post content.
	 *
	 * @param int $page_id The post/page ID.
	 * @return string Plain text extracted from the rendered page, or '' on any failure.
	 */
	private function fetch_page_via_http( $page_id ) {
		$page_url = get_permalink( $page_id );
		if ( ! $page_url ) {
			return '';
		}

		$response = wp_remote_get(
			$page_url,
			array(
				'timeout'     => 60,
				'user-agent'  => 'aiPress Content Indexer',
				// Request to the site's own URL: certificate verification stays off by
				// default (as before) but can be enabled through the core filter.
				'sslverify'   => apply_filters( 'https_local_ssl_verify', false ),
				'redirection' => 5,
			)
		);

		if ( is_wp_error( $response ) ) {
			return '';
		}

		if ( 200 !== (int) wp_remote_retrieve_response_code( $response ) ) {
			return '';
		}

		$html = wp_remote_retrieve_body( $response );
		if ( empty( $html ) ) {
			return '';
		}

		return $this->extract_content_from_html( $html );
	}

	/**
	 * Reduces an HTML document to whitespace-normalised plain text.
	 *
	 * Script, style, head, nav and footer elements are dropped with their
	 * contents before the remaining tags are stripped.
	 *
	 * @param string $html The HTML content.
	 * @return string The extracted text content.
	 */
	private function extract_content_from_html( $html ) {
		$html = (string) $html;

		$remove_patterns = array(
			'/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi',
			'/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi',
			'/<head\b[^<]*(?:(?!<\/head>)<[^<]*)*<\/head>/mi',
			'/<nav\b[^>]*>.*?<\/nav>/is',
			'/<footer\b[^>]*>.*?<\/footer>/is',
		);

		foreach ( $remove_patterns as $pattern ) {
			$html = $this->strip_pattern( $pattern, $html );
		}

		$text = (string) wp_strip_all_tags( $html );

		return trim( $this->strip_pattern( '/\s+/', $text, ' ' ) );
	}

	/**
	 * preg_replace() wrapper that keeps the input when the regex engine fails.
	 *
	 * preg_replace() returns null on errors such as an exhausted backtrack limit;
	 * without this guard one failing pattern would discard the whole document.
	 *
	 * @param string $pattern     PCRE pattern.
	 * @param string $subject     Text to process.
	 * @param string $replacement Replacement text.
	 * @return string The processed text, or $subject unchanged on regex failure.
	 */
	private function strip_pattern( string $pattern, string $subject, string $replacement = '' ): string {
		$result = preg_replace( $pattern, $replacement, $subject );

		return null === $result ? $subject : $result;
	}

	/**
	 * Auto-detects content for a page without using the custom meta field.
	 *
	 * Gives themes a chance to supply content through the `aipress_page_content`
	 * filter (used when it holds more than 20 bytes of text), then falls back to
	 * the stored post content or title + excerpt.
	 *
	 * @param int $page_id The WordPress page ID.
	 * @return string The auto-detected content, or '' when nothing is available.
	 */
	private function get_auto_detected_content( $page_id ) {
		$theme_content = apply_filters( 'aipress_page_content', '', $page_id );
		if ( is_string( $theme_content ) && strlen( trim( wp_strip_all_tags( $theme_content ) ) ) > 20 ) {
			return $theme_content;
		}

		return $this->get_fallback_content( $page_id );
	}

	/**
	 * Indexes the given content IDs and returns a human-readable summary.
	 *
	 * @param array|int|string $content_ids Post IDs to index; a single ID is accepted too.
	 * @return string Status message. Starts with "Error:" when no API key is configured.
	 */
	public function index_specific_content( $content_ids ) {
		if ( empty( $content_ids ) ) {
			return 'No content selected for indexing.';
		}

		if ( '' === $this->get_api_key() ) {
			return 'Error: No API key set.';
		}

		$content_ids  = $this->apply_plan_limit( (array) $content_ids );
		$total_chunks = 0;
		$total_errors = 0;

		foreach ( $content_ids as $post_id ) {
			$outcome       = $this->index_post( $post_id );
			$total_chunks += $outcome['chunks'];
			$total_errors += $outcome['errors'];
		}

		update_option( 'aipress_last_indexed', time() );

		$processed = count( $content_ids );
		$message   = "Successfully indexed {$total_chunks} chunks from {$processed} page(s)";
		if ( $total_errors > 0 ) {
			// Failures were counted but never reported before; surface them.
			$message .= " ({$total_errors} skipped or failed)";
		}

		return $message;
	}

	/**
	 * Re-indexes a single post: gathers its text, embeds each chunk and replaces
	 * the rows stored for it.
	 *
	 * Embeddings are requested before the old rows are deleted, so a run in which
	 * every embedding request fails leaves the existing index for the post intact.
	 *
	 * @param int|string $post_id Post ID (normalised to an integer here).
	 * @return array {
	 *     @type int         $chunks    Number of rows stored.
	 *     @type int         $errors    Number of failures (missing post, no content,
	 *                                  failed embedding, failed insert).
	 *     @type string|null $post_type Post type of the post, or null when it was not found.
	 * }
	 */
	private function index_post( $post_id ): array {
		global $wpdb;

		$outcome = array(
			'chunks'    => 0,
			'errors'    => 0,
			'post_type' => null,
		);

		// get_post( 0 ) would silently resolve to the global post, so reject non-positive IDs.
		$post_id = (int) $post_id;
		$post    = $post_id > 0 ? get_post( $post_id ) : null;
		if ( ! $post ) {
			++$outcome['errors'];
			return $outcome;
		}

		$outcome['post_type'] = $post->post_type;

		$content = $this->get_rendered_content( $post_id );
		if ( ! is_string( $content ) || strlen( trim( $content ) ) < self::MIN_CONTENT_LENGTH ) {
			++$outcome['errors'];
			return $outcome;
		}

		$chunks = $this->chunk( $content );
		if ( empty( $chunks ) ) {
			++$outcome['errors'];
			return $outcome;
		}

		$rows           = array();
		$embed_failures = 0;

		foreach ( $chunks as $chunk ) {
			$chunk = trim( $chunk );
			if ( strlen( $chunk ) < self::MIN_CONTENT_LENGTH ) {
				continue;
			}

			$embedding = $this->embed( $chunk );
			$encoded   = empty( $embedding ) ? false : wp_json_encode( $embedding );
			if ( false === $encoded ) {
				++$embed_failures;
				continue;
			}

			$rows[] = array(
				'content'   => $chunk,
				'embedding' => $encoded,
			);
		}

		$outcome['errors'] += $embed_failures;

		if ( empty( $rows ) && $embed_failures > 0 ) {
			// Nothing could be embedded (API down, bad key, ...): keep the old index.
			return $outcome;
		}

		$table = $this->table_name();
		$wpdb->delete( $table, array( 'page_id' => $post_id ), array( '%d' ) );

		foreach ( $rows as $chunk_index => $row ) {
			$inserted = $wpdb->insert(
				$table,
				array(
					'page_id'     => $post_id,
					'chunk_index' => $chunk_index,
					'content'     => $row['content'],
					'embedding'   => $row['embedding'],
				),
				array( '%d', '%d', '%s', '%s' )
			);

			if ( $inserted ) {
				++$outcome['chunks'];
			} else {
				++$outcome['errors'];
			}
		}

		return $outcome;
	}

	/**
	 * Splits content into chunks of at most `$words` whitespace-separated words.
	 *
	 * @param string $content The content to chunk.
	 * @param int    $words   Number of words per chunk; values below 1 are treated as 1.
	 * @return string[]       Array of text chunks (empty when the content has no words).
	 */
	private function chunk( $content, $words = 300 ) {
		$clean_content = $this->clean_content( $content );
		if ( '' === $clean_content ) {
			return array();
		}

		$tokens = preg_split( '/\s+/', $clean_content, -1, PREG_SPLIT_NO_EMPTY );
		if ( empty( $tokens ) ) {
			return array();
		}

		// array_chunk() rejects sizes below 1.
		$size = max( 1, (int) $words );

		return array_map(
			static fn( $group ) => implode( ' ', $group ),
			array_chunk( $tokens, $size )
		);
	}

	/**
	 * Strips shortcodes and HTML tags and collapses redundant whitespace.
	 *
	 * @param string $content Content text.
	 * @return string Cleaned, trimmed content.
	 */
	private function clean_content( $content ) {
		$content = (string) wp_strip_all_tags( strip_shortcodes( (string) $content ) );

		// Runs of spaces/tabs become one space; blank lines collapse to one newline.
		$content = $this->strip_pattern( '/[ \t]+/', $content, ' ' );
		$content = $this->strip_pattern( '/\n\s*\n/', $content, "\n" );

		return trim( $content );
	}

	/**
	 * Requests an embedding vector for a text from the OpenAI embeddings endpoint.
	 *
	 * @param string $text Text to send to the embeddings API.
	 * @return float[]     Embedding vector, or an empty array on any failure
	 *                     (no key, transport error, non-200 response, unexpected body).
	 */
	private function embed( $text ) {
		$key = $this->get_api_key();
		if ( '' === $key ) {
			return array();
		}

		$response = wp_remote_post(
			'https://api.openai.com/v1/embeddings',
			array(
				'headers' => array(
					'Authorization' => 'Bearer ' . $key,
					'Content-Type'  => 'application/json',
				),
				'body'    => wp_json_encode(
					array(
						'model' => 'text-embedding-3-small',
						'input' => $text,
					)
				),
				'timeout' => 30,
			)
		);

		if ( is_wp_error( $response ) ) {
			return array();
		}

		if ( 200 !== (int) wp_remote_retrieve_response_code( $response ) ) {
			return array();
		}

		$data      = json_decode( wp_remote_retrieve_body( $response ), true );
		$embedding = $data['data'][0]['embedding'] ?? null;

		return is_array( $embedding ) ? $embedding : array();
	}

	/**
	 * AJAX handler (`wp_ajax_aipress_index_now`): indexes the content selected in
	 * the plugin settings and replies with a JSON summary.
	 *
	 * NOTE(review): the request is not nonce-verified because the nonce action used
	 * by the admin script is not visible from this file; add check_ajax_referer()
	 * once that action name is confirmed.
	 */
	public function index_selected() {
		// Indexing triggers paid API calls, so it must not be open to every logged-in user.
		$capability = apply_filters( 'aipress_index_capability', 'manage_options' );
		if ( ! current_user_can( $capability ) ) {
			wp_send_json_error( 'You do not have permission to index content.', 403 );
			return;
		}

		if ( '' === $this->get_api_key() ) {
			wp_send_json_error( 'Error: no API key set. Please return to Settings and save your OpenAI API Key first.' );
			return;
		}

		$ids = $this->get_selected_ids( $this->get_settings() );
		if ( empty( $ids ) ) {
			wp_send_json_success( 'No content selected for indexing.' );
			return;
		}

		$ids             = $this->apply_plan_limit( $ids );
		$total_chunks    = 0;
		$total_errors    = 0;
		$processed_types = array();

		foreach ( $ids as $post_id ) {
			$outcome       = $this->index_post( $post_id );
			$total_chunks += $outcome['chunks'];
			$total_errors += $outcome['errors'];

			$post_type = $outcome['post_type'];
			if ( null !== $post_type ) {
				$processed_types[ $post_type ] = ( $processed_types[ $post_type ] ?? 0 ) + 1;
			}
		}

		update_option( 'aipress_last_indexed', time() );

		// Summarise how many items of each post type were processed.
		$type_summary = array();
		foreach ( $processed_types as $type => $count ) {
			$type_obj       = get_post_type_object( $type );
			$type_name      = $type_obj ? $type_obj->label : $type;
			$type_summary[] = "$count $type_name";
		}

		$message = "Indexed $total_chunks chunks from " . implode( ', ', $type_summary ) . '.';
		if ( $total_errors > 0 ) {
			$message .= " ($total_errors errors occurred)";
		}

		wp_send_json_success( $message );
	}

	/**
	 * Builds the short content preview shown in the status table.
	 *
	 * Truncation is character-based: cutting a UTF-8 string in the middle of a
	 * multi-byte character produces invalid UTF-8, which esc_html() turns into an
	 * empty string.
	 *
	 * @param string|null $text Stored chunk content, or null when there is none.
	 * @return string Up to 100 characters (followed by '...' when truncated), or 'No content'.
	 */
	private function build_preview( $text ): string {
		$text = (string) $text;
		if ( '' === $text ) {
			return 'No content';
		}

		if ( function_exists( 'mb_substr' ) && function_exists( 'mb_strlen' ) ) {
			$is_truncated = mb_strlen( $text, 'UTF-8' ) > self::PREVIEW_LENGTH;
			$head         = mb_substr( $text, 0, self::PREVIEW_LENGTH, 'UTF-8' );
		} else {
			$is_truncated = strlen( $text ) > self::PREVIEW_LENGTH;
			$head         = substr( $text, 0, self::PREVIEW_LENGTH );
		}

		return $is_truncated ? $head . '...' : $head;
	}

	/**
	 * Shortcode [aipress_status]: renders the selected IDs with their chunk counts.
	 *
	 * Reads the same selection the indexer uses (`selected_content`, falling back
	 * to `selected_pages`).
	 *
	 * @return string HTML table with ID, title, chunk count and a content sample per item.
	 */
	public function status_shortcode(): string {
		$selected = $this->get_selected_ids( $this->get_settings() );
		if ( empty( $selected ) ) {
			return '<p><strong>aiPress Status:</strong> No pages are selected for indexing.</p>';
		}

		global $wpdb;
		$table = $this->table_name();
		$html  = '<h4>aiPress Index Status</h4><table style="border-collapse: collapse; width: 100%;"><thead><tr style="background: #f1f1f1;"><th style="border: 1px solid #ddd; padding: 8px;">Page ID</th><th style="border: 1px solid #ddd; padding: 8px;">Title</th><th style="border: 1px solid #ddd; padding: 8px;"># Chunks</th><th style="border: 1px solid #ddd; padding: 8px;">Sample Content</th></tr></thead><tbody>';

		foreach ( $selected as $pid ) {
			$pid = (int) $pid;
			if ( $pid <= 0 ) {
				// An ID of 0 would make the title lookup resolve to the current global post.
				continue;
			}

			$title = get_the_title( $pid ) ?: "(ID {$pid})";
			$count = (int) $wpdb->get_var(
				$wpdb->prepare(
					'SELECT COUNT(*) FROM %i WHERE page_id = %d',
					$table,
					$pid
				)
			);

			// First stored chunk, shown so the admin can verify what was indexed.
			$sample = $wpdb->get_var(
				$wpdb->prepare(
					'SELECT content FROM %i WHERE page_id = %d ORDER BY chunk_index ASC LIMIT 1',
					$table,
					$pid
				)
			);

			$has_custom   = get_post_meta( $pid, '_aipress_content', true );
			$status_color = $count > 0 ? '#4CAF50' : '#f44336';

			$html .= sprintf(
				'<tr><td style="border: 1px solid #ddd; padding: 8px;">%d</td><td style="border: 1px solid #ddd; padding: 8px;">%s%s</td><td style="border: 1px solid #ddd; padding: 8px; color: %s;"><strong>%d</strong></td><td style="border: 1px solid #ddd; padding: 8px;"><small>%s</small></td></tr>',
				$pid,
				esc_html( $title ),
				$has_custom ? ' 📝' : '',
				$status_color,
				$count,
				esc_html( $this->build_preview( $sample ) )
			);
		}

		$html .= '</tbody></table>';
		$html .= '<p><small>📝 = Has custom content added</small></p>';

		return $html;
	}
}