chat-with-ai/src/utils/documentExtraction.ts

354 lines
10 KiB
TypeScript

/**
* Document text extraction utilities for various file types
* (Browser-friendly version)
*/
import * as pdfjs from 'pdfjs-dist';
// Set worker path for PDF.js: point the library at a worker script hosted on the cdnjs CDN.
// This runs as a module-level side effect when this file is imported.
// NOTE(review): the URL pins worker build 3.11.174; presumably it has to match the
// installed pdfjs-dist version — confirm whenever the dependency is upgraded.
pdfjs.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
/**
 * Outcome of a text-extraction attempt, returned by every extractor in this module.
 *
 * Extractors report failures through `error` instead of throwing, so callers
 * should check `error` before relying on `text`.
 */
export interface ExtractionResult {
/** Extracted text. On some failure/unsupported paths this holds a human-readable notice instead of document content. */
text: string;
/** Optional extractor-specific details; in this module only the CSV extractor sets it (`headers`, `totalRows`). */
metadata?: Record<string, any>;
/** Total page count of the source document; only set by the PDF extractor. */
pages?: number;
/** Short label of the extractor that produced the result (e.g. 'pdf', 'docx', 'txt', 'csv', 'json', 'unknown'). */
fileType: string;
/** Present when extraction failed or the file type is unsupported. */
error?: string;
/** True when the extractor capped its output (page, row or character limit). */
truncated?: boolean;
/** Length of `text` on success; the error paths in this module report 0 even when `text` carries a notice. */
charCount: number;
}
/**
 * Extract text from a PDF held in an ArrayBuffer.
 *
 * Only the first 50 pages are read; when the document is longer, a note is
 * appended to the text and `truncated` is set. Text items of a page are joined
 * with single spaces and pages are separated by a blank line.
 *
 * The PDF.js loading task is destroyed before returning so the parsed document
 * and its worker-side resources do not accumulate across calls.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The extraction result. `pages` is the document's total page count
 *   (not the number of pages processed). This function does not reject:
 *   failures are logged and reported through `error` with empty text.
 */
export async function extractPdfText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  // Upper bound on processed pages, to avoid spending minutes on huge PDFs.
  const MAX_PAGES = 50;

  // Kept outside the try so the finally block can release it on every path.
  let loadingTask: { destroy(): Promise<void> } | undefined;

  try {
    const task = pdfjs.getDocument({ data: buffer });
    loadingTask = task;

    const pdf = await task.promise;
    const numPages = pdf.numPages;
    const processPages = Math.min(numPages, MAX_PAGES);
    const truncated = numPages > MAX_PAGES;

    let text = '';
    // PDF.js page numbers are 1-based.
    for (let i = 1; i <= processPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const pageText = content.items
        // Items without a string payload contribute an empty string, exactly
        // as an undefined entry would when joined.
        .map((item: any) => (typeof item.str === 'string' ? item.str : ''))
        .join(' ');
      text += `${pageText}\n\n`;
    }

    // Add a note if the document was truncated
    if (truncated) {
      text += `[Note: Document truncated. Only showing ${MAX_PAGES} of ${numPages} pages.]\n`;
    }

    return {
      text,
      pages: numPages,
      fileType: 'pdf',
      truncated,
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting PDF text:', error);
    return {
      text: '',
      fileType: 'pdf',
      error: error instanceof Error ? error.message : 'Unknown error extracting PDF text',
      charCount: 0
    };
  } finally {
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (destroyError) {
        // Cleanup is best-effort: a failure here must not replace the result
        // that was already computed above.
        console.error('Error releasing PDF resources:', destroyError);
      }
    }
  }
}
/**
 * Replace XML character references (the five predefined entities plus
 * decimal/hex numeric references) with the characters they stand for.
 * Unknown or out-of-range references are left untouched.
 */
function decodeXmlEntities(text: string): string {
  const named: Record<string, string> = { lt: '<', gt: '>', amp: '&', quot: '"', apos: "'" };
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|lt|gt|amp|quot|apos);/g, (match: string, ref: string) => {
    if (ref[0] !== '#') {
      return named[ref] ?? match;
    }
    const codePoint = ref[1] === 'x' ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
    // String.fromCodePoint throws above U+10FFFF, so keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Extract text from a DOCX file.
 *
 * Note: This is a simplified version for browser compatibility. A DOCX file is
 * a ZIP archive of XML parts; this function does not unzip anything, it only
 * scans the raw bytes for literal `<w:t>…</w:t>` text runs. It therefore finds
 * text only where that markup appears uncompressed in the buffer, and returns
 * an explanatory notice when nothing is found.
 *
 * Each run is decoded as UTF-8, has its XML entities resolved, and is appended
 * to the output followed by a single space.
 *
 * @param buffer - Raw bytes of the DOCX file.
 * @returns The extraction result; failures are logged and reported via `error`.
 */
export async function extractDocxText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  try {
    const decoder = new TextDecoder('utf-8');
    const bytes = new Uint8Array(buffer);

    const LT = 60; // '<'
    const GT = 62; // '>'
    const SLASH = 47; // '/'
    const OPEN_PREFIX = [60, 119, 58, 116]; // "<w:t"
    const CLOSE_TAG = [60, 47, 119, 58, 116, 62]; // "</w:t>"

    // True when `pattern` occurs at `pos`; reads past the end compare as a mismatch.
    const startsWithAt = (pos: number, pattern: readonly number[]): boolean =>
      pattern.every((byte, k) => bytes[pos + k] === byte);

    let text = '';
    let i = 0;
    while (i < bytes.length) {
      if (bytes[i] !== LT || !startsWithAt(i, OPEN_PREFIX)) {
        i++;
        continue;
      }

      // "<w:t" is also the prefix of <w:tbl>, <w:tc>, <w:tr>, <w:tab/>, ...
      // It is the text element only when the name ends right here.
      const afterName = bytes[i + 4];
      const isTextTag =
        afterName === GT || afterName === SLASH ||
        afterName === 32 || afterName === 9 || afterName === 10 || afterName === 13;
      if (!isTextTag) {
        i += OPEN_PREFIX.length;
        continue;
      }

      // Find the '>' that closes the opening tag (attributes may precede it).
      let tagEnd = i + OPEN_PREFIX.length;
      while (tagEnd < bytes.length && bytes[tagEnd] !== GT) tagEnd++;
      if (tagEnd >= bytes.length) break;

      // Self-closing <w:t/> carries no text and has no closing tag to look for.
      if (bytes[tagEnd - 1] === SLASH) {
        i = tagEnd + 1;
        continue;
      }

      const contentStart = tagEnd + 1;
      let contentEnd = contentStart;
      while (contentEnd < bytes.length &&
        !(bytes[contentEnd] === LT && startsWithAt(contentEnd, CLOSE_TAG))) {
        contentEnd++;
      }
      // An unterminated run means the markup is cut off; do not dump the rest
      // of the (likely binary) buffer into the output.
      if (contentEnd >= bytes.length) break;

      // Decode the whole run at once so multi-byte UTF-8 sequences stay intact.
      const textChunk = decodeXmlEntities(decoder.decode(bytes.subarray(contentStart, contentEnd)));
      if (textChunk) {
        text += textChunk + ' ';
      }
      i = contentEnd + CLOSE_TAG.length;
    }

    // If we couldn't extract any text, provide an informative message
    if (!text) {
      text = "DOCX text extraction in the browser is limited. " +
        "The file was uploaded successfully, but extracting its contents requires more advanced processing. " +
        "You can still use the file, but you may need to describe its contents in your message.";
    }

    return {
      text,
      fileType: 'docx',
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting DOCX text:', error);
    return {
      text: 'Error extracting text from DOCX file. DOCX parsing in the browser is limited.',
      fileType: 'docx',
      error: error instanceof Error ? error.message : 'Unknown error extracting DOCX text',
      charCount: 0
    };
  }
}
/**
 * Decode a plain-text file as UTF-8.
 *
 * @param buffer - Raw bytes of the text file.
 * @returns The decoded text with its length in `charCount`; if decoding
 *   throws, the error is logged and reported via `error` with empty text.
 */
export function extractTextFileContent(buffer: ArrayBuffer): ExtractionResult {
  let decoded: string;

  try {
    decoded = new TextDecoder('utf-8').decode(buffer);
  } catch (error) {
    console.error('Error extracting text file content:', error);
    const message = error instanceof Error ? error.message : 'Unknown error extracting text';
    return { text: '', fileType: 'txt', error: message, charCount: 0 };
  }

  return { text: decoded, fileType: 'txt', charCount: decoded.length };
}
/**
 * Split one CSV record into trimmed fields.
 * A field that starts with a double quote may contain commas; inside such a
 * field a doubled quote ("") stands for one literal quote. Quotes appearing in
 * the middle of an unquoted field are kept as ordinary characters.
 */
function splitCsvLine(line: string): string[] {
  const fields: string[] = [];
  let current = '';
  let inQuotes = false;

  for (let k = 0; k < line.length; k++) {
    const ch = line[k];
    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line[k + 1] === '"') {
        current += '"';
        k++; // consume the second quote of the escaped pair
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && current.trim() === '') {
      // Opening quote at the start of a field (leading blanks are dropped).
      inQuotes = true;
      current = '';
    } else if (ch === ',') {
      fields.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }

  fields.push(current.trim());
  return fields;
}

/**
 * Extract text from a CSV file as a readable listing.
 *
 * The first line supplies the headers; each following non-blank line becomes
 * `Row N: header: value; ...`, numbered by data row. At most 100 data rows
 * are listed; when there are more, a note is appended and `truncated` is set.
 * Blank lines (including the one produced by a trailing newline) are not
 * counted as data rows. Quoted fields may contain commas; fields spanning
 * several lines are not supported.
 *
 * @param buffer - Raw bytes of the CSV file, decoded as UTF-8.
 * @returns The formatted text; `metadata` carries `headers` and `totalRows`
 *   (the number of non-blank data rows). Failures are logged and reported
 *   via `error` with empty text.
 */
export function extractCsvContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const csvText = new TextDecoder('utf-8').decode(buffer);

    // Process a sample of rows (to avoid extremely large outputs)
    const MAX_ROWS = 100;

    const lines = csvText.split(/\r?\n/);
    const headers = splitCsvLine(lines[0] ?? '');

    // Only lines with content are data rows, so a trailing newline does not
    // inflate the row count or trigger a false truncation note.
    const dataLines = lines.slice(1).filter(line => line.trim() !== '');
    const totalRows = dataLines.length;
    const truncated = totalRows > MAX_ROWS;

    let formattedText = 'CSV DATA:\n\n';
    formattedText += `Headers: ${headers.join(', ')}\n\n`;
    formattedText += 'Data:\n';

    dataLines.slice(0, MAX_ROWS).forEach((line, index) => {
      const values = splitCsvLine(line);
      let rowText = '';
      // Pair values with headers; extra values beyond the header count are dropped.
      for (let j = 0; j < headers.length && j < values.length; j++) {
        rowText += `${headers[j]}: ${values[j]}; `;
      }
      formattedText += `Row ${index + 1}: ${rowText}\n`;
    });

    if (truncated) {
      formattedText += `\n[Note: CSV truncated. Only showing ${MAX_ROWS} of ${totalRows} data rows.]\n`;
    }

    return {
      text: formattedText,
      fileType: 'csv',
      truncated,
      metadata: {
        headers,
        totalRows
      },
      charCount: formattedText.length
    };
  } catch (error) {
    console.error('Error extracting CSV content:', error);
    return {
      text: '',
      fileType: 'csv',
      error: error instanceof Error ? error.message : 'Unknown error extracting CSV',
      charCount: 0
    };
  }
}
/**
 * Extract JSON content as pretty-printed text.
 *
 * The buffer is decoded as UTF-8, parsed, and re-serialised with two-space
 * indentation. Output longer than 10000 characters is cut at that length and
 * followed by a truncation note. If the content is not valid JSON, the raw
 * decoded text is returned together with an `error` message.
 *
 * @param buffer - Raw bytes of the JSON file.
 * @returns The extraction result described above.
 */
export function extractJsonContent(buffer: ArrayBuffer): ExtractionResult {
  // Longest pretty-printed output returned without cutting.
  const MAX_CHARS = 10000;

  try {
    const source = new TextDecoder('utf-8').decode(buffer);
    // Parsing doubles as validation; re-serialising normalises the layout.
    const pretty = JSON.stringify(JSON.parse(source), null, 2);

    if (pretty.length <= MAX_CHARS) {
      return {
        text: pretty,
        fileType: 'json',
        truncated: false,
        charCount: pretty.length
      };
    }

    const head = pretty.substring(0, MAX_CHARS);
    const clipped =
      head + `\n\n[Note: JSON content truncated. Showing ${MAX_CHARS} of ${pretty.length} characters.]\n`;
    return {
      text: clipped,
      fileType: 'json',
      truncated: true,
      charCount: clipped.length
    };
  } catch (error) {
    console.error('Error extracting JSON content:', error);

    // Invalid JSON: fall back to handing over the undecorated file content.
    try {
      const rawText = new TextDecoder('utf-8').decode(buffer);
      return {
        text: rawText,
        fileType: 'json',
        error: 'Invalid JSON format, showing raw content',
        charCount: rawText.length
      };
    } catch (fallbackError) {
      // Even decoding failed; report the original problem.
      const message = error instanceof Error ? error.message : 'Unknown error extracting JSON';
      return { text: '', fileType: 'json', error: message, charCount: 0 };
    }
  }
}
/**
 * Classify a file from its MIME type and its file-name extension.
 *
 * Both values are compared case-insensitively. Categories are tried in the
 * order pdf, docx, csv, json, text and the first one that matches wins, so
 * e.g. a `.json` file served as `text/plain` is reported as 'json'.
 *
 * @param file - The file to classify; only `name` and `type` are read.
 * @returns 'pdf', 'docx', 'csv', 'json', 'text', or 'unknown' when nothing matches.
 */
export function detectFileType(file: File): string {
  const name = file.name.toLowerCase();
  const mime = file.type.toLowerCase();

  const hasExtension = (...extensions: string[]): boolean =>
    extensions.some(ext => name.endsWith(`.${ext}`));

  // Ordered rule table: earlier entries take precedence over later ones.
  const rules: ReadonlyArray<{ kind: string; matches: () => boolean }> = [
    { kind: 'pdf', matches: () => mime.includes('pdf') || hasExtension('pdf') },
    { kind: 'docx', matches: () => mime.includes('wordprocessingml') || hasExtension('docx', 'doc') },
    { kind: 'csv', matches: () => mime.includes('csv') || hasExtension('csv') },
    { kind: 'json', matches: () => mime.includes('json') || hasExtension('json') },
    {
      kind: 'text',
      matches: () =>
        mime.startsWith('text/') ||
        hasExtension('txt', 'md', 'log', 'xml', 'html', 'js', 'ts', 'css', 'py', 'java')
    }
  ];

  const hit = rules.find(rule => rule.matches());
  return hit ? hit.kind : 'unknown';
}
/**
 * Main entry point: read a file and run the extractor matching its type.
 *
 * The file is read fully into memory, classified with `detectFileType`, and
 * handed to the corresponding extractor. Unsupported types yield a result
 * whose `error` is 'Unsupported file type'.
 *
 * @param file - The file to extract text from.
 * @returns The extractor's result. Errors raised while reading or dispatching
 *   are logged and reported via `error` with `fileType` 'unknown'.
 */
export async function extractTextFromFile(file: File): Promise<ExtractionResult> {
  try {
    // Read first, classify second — same order of operations as before.
    const contents = await file.arrayBuffer();
    const kind = detectFileType(file);

    switch (kind) {
      case 'pdf':
        return await extractPdfText(contents);
      case 'docx':
        return await extractDocxText(contents);
      case 'csv':
        return extractCsvContent(contents);
      case 'json':
        return extractJsonContent(contents);
      case 'text':
        return extractTextFileContent(contents);
      default:
        return {
          text: `File type '${file.type}' is not supported for text extraction.`,
          fileType: kind,
          error: 'Unsupported file type',
          charCount: 0
        };
    }
  } catch (error) {
    console.error('Error extracting text from file:', error);
    const message = error instanceof Error ? error.message : 'Unknown error extracting text';
    return { text: '', fileType: 'unknown', error: message, charCount: 0 };
  }
}
/**
 * Produce a short preview of an extraction result, useful for large documents.
 *
 * @param result - The extraction result to preview.
 * @param maxLength - Maximum number of characters taken from the text (default 200).
 * @returns The leading `maxLength` characters, followed by '...' when the
 *   text is longer than that; a fixed notice when there is no text at all.
 */
export function getTextSummary(result: ExtractionResult, maxLength: number = 200): string {
  const { text } = result;

  if (!text) {
    return 'No text content extracted.';
  }

  const preview = text.substring(0, maxLength);
  const wasCut = text.length > maxLength;
  return wasCut ? `${preview}...` : preview;
}