354 lines
10 KiB
TypeScript
354 lines
10 KiB
TypeScript
/**
 * Document text extraction utilities for various file types
 * (Browser-friendly version)
 */
|
|
|
|
import * as pdfjs from 'pdfjs-dist';
|
|
|
|
// Set worker path for PDF.js
|
|
// The worker script is fetched from cdnjs and pinned to pdf.js 3.11.174.
// NOTE(review): presumably this version has to match the installed `pdfjs-dist`
// package — confirm whenever the dependency is upgraded.
pdfjs.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
|
|
|
|
/**
 * Outcome of one text-extraction attempt.
 *
 * Extractors in this module report failures through `error` instead of
 * throwing, so callers should inspect `error` before trusting `text`.
 */
export interface ExtractionResult {
  /** Extracted text; may be empty or hold an explanatory fallback message. */
  text: string;
  /** Extractor-specific extras (the CSV extractor stores `headers` and `totalRows`). */
  metadata?: Record<string, any>;
  /** Total number of pages in the source document (set by the PDF extractor). */
  pages?: number;
  /** Short type tag chosen by the extractor, e.g. 'pdf', 'docx', 'csv', 'json', 'txt'. */
  fileType: string;
  /** Human-readable description of what went wrong; absent on success. */
  error?: string;
  /** True when content was cut short to respect an extractor's size limit. */
  truncated?: boolean;
  /** Character count reported by the extractor (0 on error paths). */
  charCount: number;
}
|
|
|
|
/**
 * Extract text from a PDF held in an ArrayBuffer.
 *
 * Reads at most the first 50 pages. Each page's text items are joined with a
 * single space and every page is followed by a blank line. When pages were
 * skipped, a bracketed note is appended and `truncated` is set.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The extraction result. This function does not reject: failures are
 *   logged and reported through the `error` field with empty `text`.
 */
export async function extractPdfText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  // Upper bound on pages read, to avoid processing extremely large PDFs.
  const MAX_PAGES = 50;

  try {
    const loadingTask = pdfjs.getDocument({ data: buffer });

    try {
      const pdf = await loadingTask.promise;
      const numPages = pdf.numPages;
      const pagesToRead = Math.min(numPages, MAX_PAGES);
      const truncated = numPages > MAX_PAGES;

      let text = '';
      for (let pageNumber = 1; pageNumber <= pagesToRead; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const content = await page.getTextContent();
        const pageText = content.items
          .map((item: unknown) => {
            // Not every content item carries text; those contribute an empty string.
            const str = (item as { str?: unknown }).str;
            return typeof str === 'string' ? str : '';
          })
          .join(' ');

        text += `${pageText}\n\n`;
      }

      if (truncated) {
        text += `[Note: Document truncated. Only showing ${MAX_PAGES} of ${numPages} pages.]\n`;
      }

      return {
        text,
        pages: numPages,
        fileType: 'pdf',
        truncated,
        charCount: text.length
      };
    } finally {
      // Release the document and its worker on success and failure alike.
      // A cleanup failure must not replace the extraction outcome.
      await loadingTask.destroy().catch(() => undefined);
    }
  } catch (error) {
    console.error('Error extracting PDF text:', error);
    return {
      text: '',
      fileType: 'pdf',
      error: error instanceof Error ? error.message : 'Unknown error extracting PDF text',
      charCount: 0
    };
  }
}
|
|
|
|
/** Decode the five predefined XML entities and numeric character references. */
function decodeDocxXmlEntities(value: string): string {
  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  return value.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (match: string, ref: string) => {
      if (ref[0] !== '#') return named[ref] ?? match;
      const codePoint = ref[1] === 'x' ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
      // Leave out-of-range references untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}

/**
 * Extract text from a DOCX file.
 *
 * Note: This is a simplified version for browser compatibility. A DOCX file is
 * a ZIP archive of XML parts and this function does NOT unzip it: it scans the
 * raw bytes for `<w:t>…</w:t>` text runs. That only finds text when the XML is
 * present uncompressed; otherwise an explanatory fallback message is returned
 * as `text`.
 *
 * Each text run found is decoded as UTF-8, has its XML entities decoded, and
 * is appended followed by a single space.
 *
 * @param buffer - Raw bytes of the DOCX file (or of its document XML).
 * @returns The extraction result; failures are logged and reported via `error`.
 */
export async function extractDocxText(buffer: ArrayBuffer): Promise<ExtractionResult> {
  try {
    const bytes = new Uint8Array(buffer);
    const decoder = new TextDecoder('utf-8');

    const GT = 62; // '>'
    const SLASH = 47; // '/'
    const OPEN_TAG: readonly number[] = [60, 119, 58, 116]; // "<w:t"
    const CLOSE_TAG: readonly number[] = [60, 47, 119, 58, 116]; // "</w:t"

    const matchesAt = (offset: number, pattern: readonly number[]): boolean =>
      pattern.every((byte, k) => bytes[offset + k] === byte);

    // The element name must end right after "w:t" ('>', '/', or whitespace);
    // otherwise the match is a longer name such as <w:tbl>, <w:tr>, <w:tc> or <w:tab/>.
    const endsTagName = (byte: number | undefined): boolean =>
      byte === GT || byte === SLASH || byte === 32 || byte === 9 || byte === 10 || byte === 13;

    let text = '';
    let i = 0;
    while (i < bytes.length) {
      if (!matchesAt(i, OPEN_TAG) || !endsTagName(bytes[i + 4])) {
        i++;
        continue;
      }

      // Find the '>' that closes the opening tag (attributes may precede it).
      let tagEnd = i + 4;
      while (tagEnd < bytes.length && bytes[tagEnd] !== GT) tagEnd++;

      // A self-closing <w:t/> has no content and no closing tag to search for.
      if (bytes[tagEnd - 1] === SLASH) {
        i = tagEnd + 1;
        continue;
      }

      // The run's content extends up to the next "</w:t".
      const start = tagEnd + 1;
      let end = start;
      while (end < bytes.length && !matchesAt(end, CLOSE_TAG)) end++;

      if (end > start) {
        // Decode the whole slice at once so multi-byte UTF-8 sequences stay intact.
        text += decodeDocxXmlEntities(decoder.decode(bytes.subarray(start, end))) + ' ';
      }
      i = end;
    }

    // If we couldn't extract any text, provide an informative message
    if (!text) {
      text = "DOCX text extraction in the browser is limited. " +
        "The file was uploaded successfully, but extracting its contents requires more advanced processing. " +
        "You can still use the file, but you may need to describe its contents in your message.";
    }

    return {
      text,
      fileType: 'docx',
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting DOCX text:', error);
    return {
      text: 'Error extracting text from DOCX file. DOCX parsing in the browser is limited.',
      fileType: 'docx',
      error: error instanceof Error ? error.message : 'Unknown error extracting DOCX text',
      charCount: 0
    };
  }
}
|
|
|
|
/**
 * Extract text from a plain text file by decoding its bytes as UTF-8.
 *
 * @param buffer - Raw bytes of the file.
 * @returns The decoded text tagged with `fileType: 'txt'`; if decoding throws,
 *   the error is logged and returned in `error` with empty `text`.
 */
export function extractTextFileContent(buffer: ArrayBuffer): ExtractionResult {
  try {
    const text = new TextDecoder('utf-8').decode(buffer);
    return {
      text,
      fileType: 'txt',
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting text file content:', error);
    return {
      text: '',
      fileType: 'txt',
      error: error instanceof Error ? error.message : 'Unknown error extracting text',
      charCount: 0
    };
  }
}
|
|
|
|
/**
 * Split CSV text into records of trimmed fields, dropping blank lines.
 *
 * Handles double-quoted fields (embedded commas, line breaks and `""` escapes)
 * and LF, CRLF or bare-CR record separators.
 */
function parseCsvRecords(csvText: string): string[][] {
  const records: string[][] = [];
  let record: string[] = [];
  let field = '';
  let inQuotes = false;

  const endField = (): void => {
    record.push(field.trim());
    field = '';
  };
  const endRecord = (): void => {
    endField();
    // A record holding one empty field is a blank line, not data.
    const isBlank = record.length === 1 && record[0] === '';
    if (!isBlank) records.push(record);
    record = [];
  };

  for (let i = 0; i < csvText.length; i++) {
    const ch = csvText[i];
    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
      } else if (csvText[i + 1] === '"') {
        field += '"'; // "" inside quotes is an escaped quote
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && field.trim() === '') {
      // A quote only opens a quoted field at the start of the field.
      field = '';
      inQuotes = true;
    } else if (ch === ',') {
      endField();
    } else if (ch === '\n') {
      endRecord();
    } else if (ch === '\r') {
      // In CRLF the following '\n' ends the record; a bare CR ends it here.
      if (csvText[i + 1] !== '\n') endRecord();
    } else {
      field += ch;
    }
  }
  endRecord(); // flush the last record (dropped if the text ended with a newline)

  return records;
}

/**
 * Extract text from a CSV file as a readable, labelled listing.
 *
 * The first non-blank record supplies the headers. Each following record is
 * rendered as `Row N: header: value; …`, pairing values with headers by
 * position; extra values without a header are ignored. At most 100 data rows
 * are rendered. Blank lines are not counted as rows, so `metadata.totalRows`,
 * the row numbers and `truncated` all refer to actual data rows.
 *
 * @param buffer - Raw bytes of the CSV file (decoded as UTF-8).
 * @returns The formatted listing with `metadata.headers` and
 *   `metadata.totalRows`; failures are logged and reported via `error`.
 */
export function extractCsvContent(buffer: ArrayBuffer): ExtractionResult {
  // Cap on rendered data rows, to avoid extremely large outputs.
  const MAX_ROWS = 100;

  try {
    const csvText = new TextDecoder('utf-8').decode(buffer);

    const records = parseCsvRecords(csvText);
    const headers = records[0] ?? [];
    const dataRows = records.slice(1);
    const totalRows = dataRows.length;
    const truncated = totalRows > MAX_ROWS;

    let formattedText = 'CSV DATA:\n\n';
    formattedText += `Headers: ${headers.join(', ')}\n\n`;
    formattedText += 'Data:\n';

    dataRows.slice(0, MAX_ROWS).forEach((values, index) => {
      const pairCount = Math.min(headers.length, values.length);
      let rowText = '';
      for (let j = 0; j < pairCount; j++) {
        rowText += `${headers[j]}: ${values[j]}; `;
      }
      formattedText += `Row ${index + 1}: ${rowText}\n`;
    });

    if (truncated) {
      formattedText += `\n[Note: CSV truncated. Only showing ${MAX_ROWS} of ${totalRows} data rows.]\n`;
    }

    return {
      text: formattedText,
      fileType: 'csv',
      truncated,
      metadata: {
        headers,
        totalRows
      },
      charCount: formattedText.length
    };
  } catch (error) {
    console.error('Error extracting CSV content:', error);
    return {
      text: '',
      fileType: 'csv',
      error: error instanceof Error ? error.message : 'Unknown error extracting CSV',
      charCount: 0
    };
  }
}
|
|
|
|
/**
 * Extract JSON content as pretty-printed text.
 *
 * Valid JSON is re-serialized with 2-space indentation. Input that does not
 * parse is returned as raw text with `error` set to
 * 'Invalid JSON format, showing raw content'. In both cases the text is
 * limited to 10000 characters, followed by a bracketed truncation note when
 * content was cut.
 *
 * @param buffer - Raw bytes of the JSON file (decoded as UTF-8).
 * @returns The extraction result; failures are logged and reported via `error`.
 */
export function extractJsonContent(buffer: ArrayBuffer): ExtractionResult {
  const MAX_CHARS = 10000;

  // Apply the size limit, reporting whether anything was cut.
  const limit = (content: string): { text: string; truncated: boolean } => {
    if (content.length <= MAX_CHARS) {
      return { text: content, truncated: false };
    }
    return {
      text: content.substring(0, MAX_CHARS) +
        `\n\n[Note: JSON content truncated. Showing ${MAX_CHARS} of ${content.length} characters.]\n`,
      truncated: true
    };
  };

  // Decode once; both the parse path and the raw fallback use this text.
  let rawText: string;
  try {
    rawText = new TextDecoder('utf-8').decode(buffer);
  } catch (error) {
    console.error('Error extracting JSON content:', error);
    return {
      text: '',
      fileType: 'json',
      error: error instanceof Error ? error.message : 'Unknown error extracting JSON',
      charCount: 0
    };
  }

  try {
    // Parsing validates the input and lets us normalize its formatting.
    const parsed: unknown = JSON.parse(rawText);
    const { text, truncated } = limit(JSON.stringify(parsed, null, 2));
    return {
      text,
      fileType: 'json',
      truncated,
      charCount: text.length
    };
  } catch (error) {
    console.error('Error extracting JSON content:', error);

    // Invalid JSON: fall back to the raw text, still honouring the size limit.
    const { text, truncated } = limit(rawText);
    return {
      text,
      fileType: 'json',
      truncated,
      error: 'Invalid JSON format, showing raw content',
      charCount: text.length
    };
  }
}
|
|
|
|
/**
 * Detect a file's type from its MIME type and file-name extension.
 *
 * Both are compared case-insensitively and rules are tried in this order:
 * pdf, docx (also `.doc` names), csv, json, then text (any `text/*` MIME type
 * or a known source/markup extension).
 *
 * @param file - The file to classify; only `name` and `type` are read.
 * @returns 'pdf', 'docx', 'csv', 'json', 'text', or 'unknown'.
 */
export function detectFileType(file: File): string {
  const fileName = file.name.toLowerCase();
  const mimeType = file.type.toLowerCase();

  const hasExtension = (...extensions: string[]): boolean =>
    extensions.some((ext) => fileName.endsWith(`.${ext}`));

  if (mimeType.includes('pdf') || hasExtension('pdf')) {
    return 'pdf';
  }
  if (mimeType.includes('wordprocessingml') || hasExtension('docx', 'doc')) {
    return 'docx';
  }
  if (mimeType.includes('csv') || hasExtension('csv')) {
    return 'csv';
  }
  if (mimeType.includes('json') || hasExtension('json')) {
    return 'json';
  }
  if (
    mimeType.startsWith('text/') ||
    hasExtension('txt', 'md', 'log', 'xml', 'html', 'js', 'ts', 'css', 'py', 'java')
  ) {
    return 'text';
  }

  return 'unknown';
}
|
|
|
|
/**
 * Main function to extract text from a file based on its type.
 *
 * The type is determined with `detectFileType` before any bytes are read, so
 * an unsupported file is rejected without loading it into memory. Supported
 * files are read as an ArrayBuffer and handed to the matching extractor.
 *
 * @param file - The file to extract text from.
 * @returns The extractor's result. For unsupported types, a result whose
 *   `error` is 'Unsupported file type'. If reading or extraction throws, the
 *   error is logged and returned in `error` with `fileType: 'unknown'`.
 */
export async function extractTextFromFile(file: File): Promise<ExtractionResult> {
  type Extractor = (buffer: ArrayBuffer) => ExtractionResult | Promise<ExtractionResult>;

  try {
    const extractors = new Map<string, Extractor>([
      ['pdf', extractPdfText],
      ['docx', extractDocxText],
      ['csv', extractCsvContent],
      ['json', extractJsonContent],
      ['text', extractTextFileContent],
    ]);

    const detectedType = detectFileType(file);
    const extract = extractors.get(detectedType);

    if (extract === undefined) {
      // `file.type` can be an empty string; avoid reporting "File type ''".
      const describedType = file.type || 'unknown';
      return {
        text: `File type '${describedType}' is not supported for text extraction.`,
        fileType: detectedType,
        error: 'Unsupported file type',
        charCount: 0
      };
    }

    // Read the bytes only once we know an extractor will use them.
    const buffer = await file.arrayBuffer();
    return await extract(buffer);
  } catch (error) {
    console.error('Error extracting text from file:', error);
    return {
      text: '',
      fileType: 'unknown',
      error: error instanceof Error ? error.message : 'Unknown error extracting text',
      charCount: 0
    };
  }
}
|
|
|
|
/**
 * Get a summary of the extracted text, useful for large documents.
 *
 * Returns the first `maxLength` UTF-16 code units of `result.text`, followed
 * by '...' when the text is longer than that. The cut never lands in the
 * middle of a surrogate pair, so the summary cannot end in half a character.
 *
 * @param result - Extraction result whose `text` is summarized.
 * @param maxLength - Maximum number of code units kept from the text (default 200).
 * @returns The summary, or 'No text content extracted.' when `text` is empty.
 */
export function getTextSummary(result: ExtractionResult, maxLength: number = 200): string {
  const { text } = result;
  if (!text) return 'No text content extracted.';

  let summary = text.substring(0, maxLength);

  // If the cut separated a high surrogate from its low surrogate, drop the
  // dangling half rather than emit a lone surrogate.
  const lastUnit = summary.charCodeAt(summary.length - 1);
  if (summary.length < text.length && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    const nextUnit = text.charCodeAt(summary.length);
    if (nextUnit >= 0xdc00 && nextUnit <= 0xdfff) {
      summary = summary.slice(0, -1);
    }
  }

  return summary + (text.length > maxLength ? '...' : '');
}
|