// chat-with-ai/src/utils/documentExtraction.ts

/**
 * Document text extraction utilities for various file types
 * (Browser-friendly version)
 */

import * as pdfjs from 'pdfjs-dist';

// Tell PDF.js where to load its worker script from (a cdnjs-hosted build).
// NOTE(review): the URL pins worker version 3.11.174 — presumably this has to
// match the installed pdfjs-dist version; confirm whenever the dependency is upgraded.
pdfjs.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

/**
 * Result object returned by the extractors in this module.
 */
export interface ExtractionResult {
  /** Extracted text; may instead hold a user-facing notice when extraction is limited or unsupported. */
  text: string;
  /** Extractor-specific extras (the CSV extractor stores `headers` and `totalRows` here). */
  metadata?: Record<string, any>;
  /** Total number of pages in the document (set by the PDF extractor). */
  pages?: number;
  /** Short type label such as 'pdf', 'docx', 'txt', 'csv', 'json' or 'unknown'. */
  fileType: string;
  /** Description of what went wrong when extraction failed or the input was invalid. */
  error?: string;
  /** True when the output was cut short by a page, row, or character limit. */
  truncated?: boolean;
  /** Length of `text` on success; 0 on most error paths. */
  charCount: number;
}

/**
 * Extract text from a PDF supplied as an ArrayBuffer.
 *
 * At most the first 50 pages are read; when the document is longer, a note is
 * appended to the text and `truncated` is set. `pages` always reports the
 * total page count of the document, not the number of pages processed.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The extracted text, or a result with `error` set (and empty text)
 *          when the PDF could not be loaded or read. Never rejects.
 */
export async function extractPdfText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  // Upper bound on pages processed, to avoid spending too long on huge PDFs.
  // This can be adjusted based on your needs.
  const MAX_PAGES = 50;

  // Kept outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjs.getDocument> | undefined;

  try {
    loadingTask = pdfjs.getDocument({ data: buffer });
    const pdf = await loadingTask.promise;
    const numPages = pdf.numPages;
    const processPages = Math.min(numPages, MAX_PAGES);
    const truncated = numPages > MAX_PAGES;

    // Pages are fetched sequentially; each page's text items are joined with spaces.
    const pageTexts: string[] = [];
    for (let i = 1; i <= processPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const pageText = content.items
        // Items without a `str` property (marked-content entries) contribute nothing.
        .map((item) => ('str' in item ? item.str : ''))
        .join(' ');
      pageTexts.push(`${pageText}\n\n`);
    }

    let text = pageTexts.join('');

    // Add a note if the document was truncated
    if (truncated) {
      text += `[Note: Document truncated. Only showing ${MAX_PAGES} of ${numPages} pages.]\n`;
    }

    return {
      text,
      pages: numPages,
      fileType: 'pdf',
      truncated,
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting PDF text:', error);
    return {
      text: '',
      fileType: 'pdf',
      error: error instanceof Error ? error.message : 'Unknown error extracting PDF text',
      charCount: 0
    };
  } finally {
    // Release the document and the worker resources PDF.js allocated for it;
    // without this every call leaks them for the lifetime of the page.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        // Cleanup failure must not mask the extraction result.
        console.warn('Error releasing PDF resources:', cleanupError);
      }
    }
  }
}

/** The five predefined XML entities, keyed by name. */
const DOCX_XML_ENTITIES: Record<string, string> = {
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  amp: '&'
};

/** Replace the predefined XML entities in `value` with their literal characters (single pass, no double-decoding). */
function decodeDocxXmlEntities(value: string): string {
  return value.replace(/&(lt|gt|quot|apos|amp);/g, (match, name: string) => DOCX_XML_ENTITIES[name] ?? match);
}

/**
 * Extract text from a DOCX file (best effort, browser-only).
 *
 * The buffer is NOT unzipped: it is decoded as UTF-8 and scanned for
 * `<w:t>…</w:t>` text runs. This therefore only finds text when the document
 * XML is readable in the raw bytes; when nothing is found, an explanatory
 * message is returned as the text instead.
 *
 * @param buffer - Raw bytes of the DOCX file.
 * @returns Each non-empty run followed by a single space, with XML entities
 *          decoded, or the fallback message when no runs were found. On an
 *          unexpected error a result with `error` set is returned; never rejects.
 */
export async function extractDocxText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  try {
    // Decode as UTF-8 so multi-byte characters survive; undecodable bytes
    // become U+FFFD and cannot hide the ASCII tag markers searched for below.
    const xml = new TextDecoder('utf-8').decode(buffer);

    const OPEN_PREFIX = '<w:t';
    const CLOSE_TAG = '</w:t>';
    const runs: string[] = [];
    let cursor = 0;

    while (cursor < xml.length) {
      const open = xml.indexOf(OPEN_PREFIX, cursor);
      if (open === -1) break;

      // Accept only the text-run element itself: the name must end here.
      // This rejects <w:tbl>, <w:tr>, <w:tc>, <w:tab/>, etc.
      const afterName = xml.charAt(open + OPEN_PREFIX.length);
      const isTextRunTag = afterName === '>' || afterName === '/' || /\s/.test(afterName);
      if (!isTextRunTag) {
        cursor = open + OPEN_PREFIX.length;
        continue;
      }

      const tagEnd = xml.indexOf('>', open + OPEN_PREFIX.length);
      if (tagEnd === -1) break;

      // Self-closing <w:t/> carries no text.
      if (xml.charAt(tagEnd - 1) === '/') {
        cursor = tagEnd + 1;
        continue;
      }

      const close = xml.indexOf(CLOSE_TAG, tagEnd + 1);
      // An unterminated run means the data is malformed or not XML; stop here
      // rather than treating the remainder of the buffer as text.
      if (close === -1) break;

      const rawRun = xml.slice(tagEnd + 1, close);
      if (rawRun) {
        runs.push(decodeDocxXmlEntities(rawRun));
      }
      cursor = close + CLOSE_TAG.length;
    }

    let text = runs.map((run) => `${run} `).join('');

    // If we couldn't extract any text, provide an informative message
    if (!text) {
      text = "DOCX text extraction in the browser is limited. " +
             "The file was uploaded successfully, but extracting its contents requires more advanced processing. " +
             "You can still use the file, but you may need to describe its contents in your message.";
    }

    return {
      text,
      fileType: 'docx',
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting DOCX text:', error);
    return {
      text: 'Error extracting text from DOCX file. DOCX parsing in the browser is limited.',
      fileType: 'docx',
      error: error instanceof Error ? error.message : 'Unknown error extracting DOCX text',
      charCount: 0
    };
  }
}

/**
 * Decode a plain-text file as UTF-8.
 *
 * @param buffer - Raw bytes of the file.
 * @returns The decoded text with `fileType` 'txt'; on failure, an empty-text
 *          result with `error` set.
 */
export function extractTextFileContent(buffer: ArrayBuffer): ExtractionResult {
  let decoded: string;

  try {
    decoded = new TextDecoder('utf-8').decode(buffer);
  } catch (error) {
    console.error('Error extracting text file content:', error);
    const message = error instanceof Error ? error.message : 'Unknown error extracting text';
    return { text: '', fileType: 'txt', error: message, charCount: 0 };
  }

  return { text: decoded, fileType: 'txt', charCount: decoded.length };
}

/**
 * Split one CSV record into trimmed fields.
 * Handles double-quoted fields (which may contain commas) and `""` as an
 * escaped quote; a quote that is not at the start of a field is kept literally.
 */
function splitCsvLine(line: string): string[] {
  const fields: string[] = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line[i];

    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line[i + 1] === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        current += '"';
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && current.trim() === '') {
      // Opening quote at the start of a field; drop any leading whitespace.
      inQuotes = true;
      current = '';
    } else if (ch === ',') {
      fields.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current.trim());
  return fields;
}

/**
 * Extract text from a CSV file as a labelled, line-per-row listing.
 *
 * The first non-blank line is treated as the header row. Blank lines are
 * ignored and do not count as data rows. At most 100 data rows are rendered;
 * when there are more, a note is appended and `truncated` is set.
 *
 * Limitation: records are split on line breaks, so quoted fields containing
 * embedded newlines are not supported.
 *
 * @param buffer - Raw bytes of the CSV file (decoded as UTF-8).
 * @returns Formatted text plus `metadata.headers` and `metadata.totalRows`
 *          (number of non-blank data rows); on failure, an empty-text result
 *          with `error` set.
 */
export function extractCsvContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const csvText = new TextDecoder('utf-8').decode(buffer);

    // Accept LF, CRLF and CR line endings, and drop blank lines up front so a
    // trailing newline is not counted as a data row.
    const lines = csvText.split(/\r\n|\n|\r/).filter((line) => line.trim() !== '');

    const headerLine = lines[0];
    const headers = headerLine === undefined ? [] : splitCsvLine(headerLine);
    const totalRows = Math.max(lines.length - 1, 0);

    // Process a sample of rows (to avoid extremely large outputs)
    const MAX_ROWS = 100;
    const processRows = Math.min(totalRows, MAX_ROWS);
    const truncated = totalRows > MAX_ROWS;

    const parts: string[] = ['CSV DATA:\n\n', `Headers: ${headers.join(', ')}\n\n`, 'Data:\n'];

    for (let i = 1; i <= processRows; i++) {
      const values = splitCsvLine(lines[i] ?? '');

      // Pair each header with its value; surplus values beyond the headers are ignored.
      const pairCount = Math.min(headers.length, values.length);
      let rowText = '';
      for (let j = 0; j < pairCount; j++) {
        rowText += `${headers[j]}: ${values[j]}; `;
      }

      parts.push(`Row ${i}: ${rowText}\n`);
    }

    if (truncated) {
      parts.push(`\n[Note: CSV truncated. Only showing ${MAX_ROWS} of ${totalRows} data rows.]\n`);
    }

    const formattedText = parts.join('');

    return {
      text: formattedText,
      fileType: 'csv',
      truncated,
      metadata: {
        headers,
        totalRows
      },
      charCount: formattedText.length
    };
  } catch (error) {
    console.error('Error extracting CSV content:', error);
    return {
      text: '',
      fileType: 'csv',
      error: error instanceof Error ? error.message : 'Unknown error extracting CSV',
      charCount: 0
    };
  }
}

/**
 * Extract JSON content as pretty-printed text.
 *
 * Valid JSON is re-serialised with two-space indentation and cut off at
 * 10000 characters (a note is appended and `truncated` is set). When the
 * input does not parse, the raw decoded text is returned with `error` set.
 *
 * @param buffer - Raw bytes of the JSON file (decoded as UTF-8).
 * @returns The formatted (possibly truncated) JSON, the raw text for invalid
 *          JSON, or an empty-text error result if even decoding fails.
 */
export function extractJsonContent(buffer: ArrayBuffer): ExtractionResult {
  const CHAR_LIMIT = 10000;
  const decodeRaw = (): string => new TextDecoder('utf-8').decode(buffer);

  try {
    // Parsing validates the input; stringifying normalises the layout.
    const pretty = JSON.stringify(JSON.parse(decodeRaw()), null, 2);
    const overLimit = pretty.length > CHAR_LIMIT;

    const text = overLimit
      ? pretty.substring(0, CHAR_LIMIT) +
        `\n\n[Note: JSON content truncated. Showing ${CHAR_LIMIT} of ${pretty.length} characters.]\n`
      : pretty;

    return { text, fileType: 'json', truncated: overLimit, charCount: text.length };
  } catch (error) {
    console.error('Error extracting JSON content:', error);

    // Parsing failed: fall back to handing over the undecorated file contents.
    try {
      const rawText = decodeRaw();
      return {
        text: rawText,
        fileType: 'json',
        error: 'Invalid JSON format, showing raw content',
        charCount: rawText.length
      };
    } catch (_fallbackError) {
      const message = error instanceof Error ? error.message : 'Unknown error extracting JSON';
      return { text: '', fileType: 'json', error: message, charCount: 0 };
    }
  }
}

/**
 * Classify a file using its MIME type and its file-name extension.
 *
 * Both are compared case-insensitively. Categories are tested in a fixed
 * order (pdf, docx, csv, json, text) and the first match wins.
 *
 * @param file - The file to classify; only `name` and `type` are read.
 * @returns 'pdf', 'docx', 'csv', 'json', 'text', or 'unknown' when nothing matches.
 */
export function detectFileType(file: File): string {
  const lowerName = file.name.toLowerCase();
  const mime = file.type.toLowerCase();

  const hasExtension = (...extensions: string[]): boolean =>
    extensions.some((ext) => lowerName.endsWith(`.${ext}`));

  // Ordered rule table: earlier entries take precedence over later ones.
  const rules: Array<{ label: string; matches: () => boolean }> = [
    { label: 'pdf', matches: () => mime.includes('pdf') || hasExtension('pdf') },
    { label: 'docx', matches: () => mime.includes('wordprocessingml') || hasExtension('docx', 'doc') },
    { label: 'csv', matches: () => mime.includes('csv') || hasExtension('csv') },
    { label: 'json', matches: () => mime.includes('json') || hasExtension('json') },
    {
      label: 'text',
      matches: () =>
        mime.startsWith('text/') ||
        hasExtension('txt', 'md', 'log', 'xml', 'html', 'js', 'ts', 'css', 'py', 'java')
    }
  ];

  for (const rule of rules) {
    if (rule.matches()) {
      return rule.label;
    }
  }

  return 'unknown';
}

/**
 * Entry point: read a file and run the extractor that matches its detected type.
 *
 * @param file - The file to read and extract text from.
 * @returns The chosen extractor's result. Unsupported types yield a result
 *          whose text explains the problem and whose `error` is
 *          'Unsupported file type'; unexpected failures yield an empty-text
 *          result with `fileType` 'unknown'. Never rejects.
 */
export async function extractTextFromFile(file: File): Promise<ExtractionResult> {
  try {
    // The bytes are read first, then the type decides which extractor gets them.
    const contents = await file.arrayBuffer();
    const kind = detectFileType(file);

    switch (kind) {
      case 'pdf':
        return await extractPdfText(contents);
      case 'docx':
        return await extractDocxText(contents);
      case 'csv':
        return extractCsvContent(contents);
      case 'json':
        return extractJsonContent(contents);
      case 'text':
        return extractTextFileContent(contents);
      default:
        return {
          text: `File type '${file.type}' is not supported for text extraction.`,
          fileType: kind,
          error: 'Unsupported file type',
          charCount: 0
        };
    }
  } catch (error) {
    console.error('Error extracting text from file:', error);
    const message = error instanceof Error ? error.message : 'Unknown error extracting text';
    return { text: '', fileType: 'unknown', error: message, charCount: 0 };
  }
}

/**
 * Produce a short preview of extracted text, useful for large documents.
 *
 * @param result - Extraction result whose `text` is previewed.
 * @param maxLength - Maximum number of characters taken from the start of the text.
 * @returns The leading `maxLength` characters, followed by '...' when the text
 *          is longer than that; a fixed placeholder when there is no text.
 */
export function getTextSummary(result: ExtractionResult, maxLength: number = 200): string {
  const { text } = result;

  if (!text) {
    return 'No text content extracted.';
  }

  const preview = text.substring(0, maxLength);
  if (text.length > maxLength) {
    return `${preview}...`;
  }
  return preview;
}