/**
 * Document text extraction utilities for various file types
 * (Browser-friendly version)
 */
import * as pdfjs from 'pdfjs-dist';

// Set worker path for PDF.js
// NOTE(review): the worker URL pins version 3.11.174; it presumably has to
// match the installed pdfjs-dist version — confirm when upgrading.
pdfjs.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

/** Outcome of a text-extraction attempt; `error` is set when extraction failed or fell back. */
export interface ExtractionResult {
  /** Extracted (possibly truncated or formatted) text, or a user-facing message. */
  text: string;
  /** Extra, type-specific information (e.g. CSV headers and row count). */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept loose for backward compatibility with existing callers
  metadata?: Record<string, any>;
  /** Total page count of the source document (PDF only). */
  pages?: number;
  /** Short type tag such as 'pdf', 'docx', 'txt', 'csv', 'json' or 'unknown'. */
  fileType: string;
  /** Human-readable failure description, when something went wrong. */
  error?: string;
  /** True when only part of the input is represented in `text`. */
  truncated?: boolean;
  /** Length of the extracted text in UTF-16 code units. */
  charCount: number;
}

/** Upper bound on PDF pages processed, to avoid very large documents. */
const MAX_PDF_PAGES = 50;
/** Upper bound on CSV data rows rendered into the formatted output. */
const MAX_CSV_ROWS = 100;
/** Upper bound on pretty-printed JSON characters returned. */
const MAX_JSON_CHARS = 10000;

/** File extensions that are treated as plain text when the MIME type is inconclusive. */
const TEXT_EXTENSIONS = ['txt', 'md', 'log', 'xml', 'html', 'js', 'ts', 'css', 'py', 'java'];

// Byte values used while scanning raw DOCX bytes.
const BYTE_GT = 62; // '>'
const BYTE_SLASH = 47; // '/'
const DOCX_TEXT_OPEN: readonly number[] = [60, 119, 58, 116]; // "<w:t"
const DOCX_TEXT_CLOSE: readonly number[] = [60, 47, 119, 58, 116, 62]; // "</w:t>"

/** The five predefined XML entities. */
const XML_ENTITIES: Readonly<Record<string, string>> = {
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  amp: '&',
};

/** Returns the message of a thrown value, or `fallback` when it is not an `Error`. */
function describeError(error: unknown, fallback: string): string {
  return error instanceof Error ? error.message : fallback;
}

/** True when `pattern` occurs in `bytes` starting exactly at `offset`. */
function bytesMatchAt(bytes: Uint8Array, offset: number, pattern: readonly number[]): boolean {
  if (offset + pattern.length > bytes.length) return false;
  for (let k = 0; k < pattern.length; k++) {
    if (bytes[offset + k] !== pattern[k]) return false;
  }
  return true;
}

/**
 * True when a `<w:t` element (and not `<w:tbl`, `<w:tc`, `<w:tab`, ...) opens at `offset`:
 * the tag name must be followed by `>`, `/` or whitespace.
 */
function isTextRunOpenTag(bytes: Uint8Array, offset: number): boolean {
  if (!bytesMatchAt(bytes, offset, DOCX_TEXT_OPEN)) return false;
  const next = bytes[offset + DOCX_TEXT_OPEN.length];
  return next === BYTE_GT || next === BYTE_SLASH || next === 32 || next === 9 || next === 10 || next === 13;
}

/** Replaces the predefined XML entities in a single pass (so `&amp;lt;` becomes `&lt;`, not `<`). */
function decodeXmlEntities(value: string): string {
  return value.replace(/&(lt|gt|quot|apos|amp);/g, (match, name: string) => XML_ENTITIES[name] ?? match);
}

/**
 * Splits one CSV line into trimmed fields. Commas inside double-quoted fields are kept,
 * and a doubled quote (`""`) inside a quoted field yields a literal quote.
 * Quoted fields spanning several lines are not supported.
 */
function splitCsvLine(line: string): string[] {
  const fields: string[] = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line.charAt(i);
    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line.charAt(i + 1) === '"') {
        current += '"';
        i++; // consume the second quote of the escaped pair
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && current.trim() === '') {
      // A quote only opens a quoted field at the start of the field.
      inQuotes = true;
      current = '';
    } else if (ch === ',') {
      fields.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current.trim());
  return fields;
}

/**
 * Extract text from a PDF Buffer
 *
 * At most {@link MAX_PDF_PAGES} pages are processed; when the document is longer, a note
 * is appended to the text and `truncated` is set. Failures are logged and reported through
 * the `error` field rather than thrown.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The extraction result; `pages` holds the document's total page count.
 */
export async function extractPdfText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  let loadingTask: ReturnType<typeof pdfjs.getDocument> | undefined;

  try {
    // Load the PDF document
    loadingTask = pdfjs.getDocument({ data: buffer });
    const pdf = await loadingTask.promise;
    const numPages = pdf.numPages;

    const processPages = Math.min(numPages, MAX_PDF_PAGES);
    const truncated = numPages > MAX_PDF_PAGES;
    let text = '';

    // Pages are read sequentially, one page at a time.
    for (let pageNumber = 1; pageNumber <= processPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      // Items without a `str` property contribute an empty string, which is
      // what joining an undefined value produced before.
      const pageText = content.items
        .map((item) => ('str' in item ? item.str : ''))
        .join(' ');
      text += `${pageText}\n\n`;
    }

    // Add a note if the document was truncated
    if (truncated) {
      text += `[Note: Document truncated. Only showing ${MAX_PDF_PAGES} of ${numPages} pages.]\n`;
    }

    return {
      text,
      pages: numPages,
      fileType: 'pdf',
      truncated,
      charCount: text.length,
    };
  } catch (error) {
    console.error('Error extracting PDF text:', error);
    return {
      text: '',
      fileType: 'pdf',
      error: describeError(error, 'Unknown error extracting PDF text'),
      charCount: 0,
    };
  } finally {
    // Destroy the loading task so the document is not left alive after extraction;
    // a failure here must not mask the result.
    await loadingTask?.destroy().catch(() => undefined);
  }
}

/**
 * Extract text from a DOCX file
 * Note: This is a simplified version for browser compatibility
 * Full DOCX parsing is difficult in the browser
 *
 * The raw bytes are scanned for `<w:t ...>text</w:t>` elements. DOCX files are ZIP
 * archives, so this only finds text in parts that are stored uncompressed; when nothing
 * is found an explanatory message is returned as the text instead.
 *
 * @param buffer - Raw bytes of the DOCX file.
 * @returns The extraction result; failures are logged and reported through `error`.
 */
export async function extractDocxText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  try {
    const decoder = new TextDecoder('utf-8');
    const bytes = new Uint8Array(buffer);
    let text = '';

    let pos = 0;
    while (pos < bytes.length) {
      if (!isTextRunOpenTag(bytes, pos)) {
        pos++;
        continue;
      }

      // Find the end of the opening tag (it may carry attributes such as xml:space).
      let tagEnd = pos + DOCX_TEXT_OPEN.length;
      while (tagEnd < bytes.length && bytes[tagEnd] !== BYTE_GT) tagEnd++;
      if (tagEnd >= bytes.length) break;

      // Self-closing <w:t/> carries no text.
      if (bytes[tagEnd - 1] === BYTE_SLASH) {
        pos = tagEnd + 1;
        continue;
      }

      const contentStart = tagEnd + 1;
      let contentEnd = contentStart;
      while (contentEnd < bytes.length && !bytesMatchAt(bytes, contentEnd, DOCX_TEXT_CLOSE)) {
        contentEnd++;
      }
      // No closing tag anywhere after this point: nothing further can be extracted,
      // and the remainder of the file must not be treated as text.
      if (contentEnd >= bytes.length) break;

      // Decode the whole chunk at once so multi-byte UTF-8 sequences stay intact.
      const chunk = decodeXmlEntities(decoder.decode(bytes.subarray(contentStart, contentEnd)));
      if (chunk) {
        text += chunk + ' ';
      }
      pos = contentEnd + DOCX_TEXT_CLOSE.length;
    }

    // If we couldn't extract any text, provide an informative message
    if (!text) {
      text = "DOCX text extraction in the browser is limited. " +
        "The file was uploaded successfully, but extracting its contents requires more advanced processing. " +
        "You can still use the file, but you may need to describe its contents in your message.";
    }

    return {
      text,
      fileType: 'docx',
      charCount: text.length,
    };
  } catch (error) {
    console.error('Error extracting DOCX text:', error);
    return {
      text: 'Error extracting text from DOCX file. DOCX parsing in the browser is limited.',
      fileType: 'docx',
      error: describeError(error, 'Unknown error extracting DOCX text'),
      charCount: 0,
    };
  }
}

/**
 * Extract text from a plain text file
 *
 * @param buffer - Raw bytes of the file, decoded as UTF-8.
 * @returns The decoded text with `fileType` set to 'txt'.
 */
export function extractTextFileContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const text = new TextDecoder('utf-8').decode(buffer);
    return {
      text,
      fileType: 'txt',
      charCount: text.length,
    };
  } catch (error) {
    console.error('Error extracting text file content:', error);
    return {
      text: '',
      fileType: 'txt',
      error: describeError(error, 'Unknown error extracting text'),
      charCount: 0,
    };
  }
}

/**
 * Extract text from CSV file
 *
 * The first line is treated as the header row. Blank lines are ignored, and at most
 * {@link MAX_CSV_ROWS} data rows are rendered as `Row n: header: value; ...` lines.
 * `metadata` carries the parsed `headers` and `totalRows` (non-blank data rows).
 *
 * @param buffer - Raw bytes of the CSV file, decoded as UTF-8.
 * @returns The formatted text; failures are logged and reported through `error`.
 */
export function extractCsvContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const csvText = new TextDecoder('utf-8').decode(buffer);

    const [headerLine = '', ...remainingLines] = csvText.split(/\r?\n/);
    const headers = splitCsvLine(headerLine);

    // Drop blank lines up front so a trailing newline does not count as a data row.
    const dataRows = remainingLines.filter((line) => line.trim() !== '');
    const totalRows = dataRows.length;
    const processRows = Math.min(totalRows, MAX_CSV_ROWS);
    const truncated = totalRows > MAX_CSV_ROWS;

    let formattedText = 'CSV DATA:\n\n';
    formattedText += `Headers: ${headers.join(', ')}\n\n`;
    formattedText += 'Data:\n';

    for (let i = 1; i <= processRows; i++) {
      const values = splitCsvLine(dataRows[i - 1] ?? '');
      let rowText = '';
      // Only columns that have both a header and a value are rendered.
      const columnCount = Math.min(headers.length, values.length);
      for (let j = 0; j < columnCount; j++) {
        rowText += `${headers[j]}: ${values[j]}; `;
      }
      formattedText += `Row ${i}: ${rowText}\n`;
    }

    if (truncated) {
      formattedText += `\n[Note: CSV truncated. Only showing ${MAX_CSV_ROWS} of ${totalRows} data rows.]\n`;
    }

    return {
      text: formattedText,
      fileType: 'csv',
      truncated,
      metadata: {
        headers,
        totalRows,
      },
      charCount: formattedText.length,
    };
  } catch (error) {
    console.error('Error extracting CSV content:', error);
    return {
      text: '',
      fileType: 'csv',
      error: describeError(error, 'Unknown error extracting CSV'),
      charCount: 0,
    };
  }
}

/**
 * Extract JSON content
 *
 * Valid JSON is re-serialized with 2-space indentation and cut to
 * {@link MAX_JSON_CHARS} characters (plus a note) when longer. Invalid JSON is
 * returned as raw text with `error` set.
 *
 * @param buffer - Raw bytes of the JSON file, decoded as UTF-8.
 * @returns The formatted or raw text.
 */
export function extractJsonContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const jsonText = new TextDecoder('utf-8').decode(buffer);

    // Parse JSON to validate it, then pretty-print it
    const jsonData: unknown = JSON.parse(jsonText);
    const jsonString = JSON.stringify(jsonData, null, 2);

    const truncated = jsonString.length > MAX_JSON_CHARS;
    const text = truncated
      ? jsonString.substring(0, MAX_JSON_CHARS) +
        `\n\n[Note: JSON content truncated. Showing ${MAX_JSON_CHARS} of ${jsonString.length} characters.]\n`
      : jsonString;

    return {
      text,
      fileType: 'json',
      truncated,
      charCount: text.length,
    };
  } catch (error) {
    console.error('Error extracting JSON content:', error);

    // If JSON parsing fails, return the raw text
    try {
      const rawText = new TextDecoder('utf-8').decode(buffer);
      return {
        text: rawText,
        fileType: 'json',
        error: 'Invalid JSON format, showing raw content',
        charCount: rawText.length,
      };
    } catch (fallbackError) {
      return {
        text: '',
        fileType: 'json',
        error: describeError(error, 'Unknown error extracting JSON'),
        charCount: 0,
      };
    }
  }
}

/**
 * Detect file type from file extension
 *
 * Both the MIME type and the lower-cased file name are consulted; the first matching
 * category wins, in the order pdf, docx, csv, json, text.
 *
 * @param file - The file to classify.
 * @returns One of 'pdf', 'docx', 'csv', 'json', 'text' or 'unknown'.
 */
export function detectFileType(file: File): string {
  const fileName = file.name.toLowerCase();
  const mimeType = file.type.toLowerCase();
  const hasExtension = (ext: string): boolean => fileName.endsWith(`.${ext}`);

  if (mimeType.includes('pdf') || hasExtension('pdf')) {
    return 'pdf';
  }
  // NOTE(review): legacy .doc files are routed to the DOCX extractor as well.
  if (mimeType.includes('wordprocessingml') || hasExtension('docx') || hasExtension('doc')) {
    return 'docx';
  }
  if (mimeType.includes('csv') || hasExtension('csv')) {
    return 'csv';
  }
  if (mimeType.includes('json') || hasExtension('json')) {
    return 'json';
  }
  if (mimeType.startsWith('text/') || TEXT_EXTENSIONS.some(hasExtension)) {
    return 'text';
  }
  return 'unknown';
}

/**
 * Main function to extract text from a file based on its type
 *
 * @param file - The file to read and extract text from.
 * @returns The extraction result; unsupported types and read failures are reported
 *   through the `error` field rather than thrown.
 */
export async function extractTextFromFile(file: File): Promise<ExtractionResult> {
  try {
    // Read file as ArrayBuffer
    const buffer = await file.arrayBuffer();

    // Dispatch on the detected file type
    const fileExtType = detectFileType(file);
    switch (fileExtType) {
      case 'pdf':
        return await extractPdfText(buffer);
      case 'docx':
        return await extractDocxText(buffer);
      case 'csv':
        return extractCsvContent(buffer);
      case 'json':
        return extractJsonContent(buffer);
      case 'text':
        return extractTextFileContent(buffer);
      default:
        return {
          text: `File type '${file.type}' is not supported for text extraction.`,
          fileType: fileExtType,
          error: 'Unsupported file type',
          charCount: 0,
        };
    }
  } catch (error) {
    console.error('Error extracting text from file:', error);
    return {
      text: '',
      fileType: 'unknown',
      error: describeError(error, 'Unknown error extracting text'),
      charCount: 0,
    };
  }
}

/**
 * Get a summary of the extracted text, useful for large documents
 *
 * @param result - A previously obtained extraction result.
 * @param maxLength - Maximum number of characters to keep before appending '...'.
 * @returns The leading part of the text, or a placeholder when there is no text.
 */
export function getTextSummary(result: ExtractionResult, maxLength = 200): string {
  if (!result.text) return 'No text content extracted.';

  const summary = result.text.substring(0, maxLength);
  return summary + (result.text.length > maxLength ? '...' : '');
}