
import JSZip from 'jszip';
import { extractRawText } from 'mammoth';

/**
 * Maps the type identifiers a caller may supply (MIME types, reverse-DNS
 * style type identifiers, or bare file extensions) to the name of the
 * extractor that handles them.
 *
 * Every value must be one of 'docx', 'odt', 'pptx' or 'odp' — without a
 * leading dot — because `TextExtractionService.extract` dispatches on the
 * exact value. File extensions are accepted both with and without the dot.
 */
const MIME_TYPE_MAP = {
  // Word (OOXML)
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
  'org.openxmlformats.wordprocessingml.document': 'docx',
  'docx': 'docx',
  '.docx': 'docx',

  // OpenDocument text
  'application/vnd.oasis.opendocument.text': 'odt',
  'public.opentext': 'odt',
  'odt': 'odt',
  '.odt': 'odt',

  // PowerPoint (OOXML)
  'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
  'org.openxmlformats.presentationml.presentation': 'pptx',
  'pptx': 'pptx',
  '.pptx': 'pptx',

  // OpenDocument presentation
  'application/vnd.oasis.opendocument.presentation': 'odp',
  'org.oasis.opendocument.presentation': 'odp',
  'odp': 'odp',
  '.odp': 'odp',
};


/** Extractor method names that `extract` is allowed to dispatch to. */
const SUPPORTED_FILE_TYPES = ['docx', 'odp', 'odt', 'pptx'];

/** The five entities predefined by XML itself. */
const XML_NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

/** Upper bound on the spaces emitted for a single `<text:s text:c="N"/>`. */
const MAX_REPEATED_SPACES = 1024;

/**
 * Matches an ODF paragraph (`text:p`) or heading (`text:h`), either
 * self-closing or with a body (capture group 2). The tag name must be followed
 * by whitespace, `/>` or `>`, so `<text:page-number>` and friends do not match,
 * and `[\s\S]` lets a body span several lines.
 */
const ODF_PARAGRAPH_PATTERN = /<text:(p|h)(?:\s[^>]*?)?(?:\/>|>([\s\S]*?)<\/text:\1>)/g;

/** Matches a DrawingML text run `<a:t>…</a:t>`; the body is capture group 1. */
const PPTX_TEXT_RUN_PATTERN = /<a:t(?:\s[^>]*?)?(?:\/>|>([\s\S]*?)<\/a:t>)/g;

/**
 * Decodes the predefined XML entities and numeric character references in a
 * single pass (so `&amp;lt;` correctly becomes `&lt;`, not `<`).
 * References that do not name a valid code point are left untouched.
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (entity, body) => {
    if (body[0] !== '#') {
      return XML_NAMED_ENTITIES[body];
    }
    const codePoint = body[1] === 'x'
      ? Number.parseInt(body.slice(2), 16)
      : Number.parseInt(body.slice(1), 10);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Turns the `content.xml` of an OpenDocument file into plain text: one line
 * per paragraph/heading, each trimmed, and the final result trimmed.
 */
function extractOpenDocumentParagraphs(contentXml) {
  const paragraphs = [];

  for (const match of contentXml.matchAll(ODF_PARAGRAPH_PATTERN)) {
    // Self-closing paragraphs have no body and count as an empty line.
    const body = match[2] ?? '';

    // ODF encodes some whitespace as elements; convert those before the
    // generic tag strip so neighbouring words do not fuse together.
    const withWhitespace = body
      .replace(/<text:line-break\s*\/>/g, '\n')
      .replace(/<text:tab(?:\s[^>]*)?\/>/g, '\t')
      .replace(/<text:s(?:\s[^>]*)?\/>/g, (tag) => {
        const count = /text:c="(\d+)"/.exec(tag);
        return ' '.repeat(count ? Math.min(Number(count[1]), MAX_REPEATED_SPACES) : 1);
      });

    // Strip markup first, then decode, so a decoded '<' is never mistaken for a tag.
    const plainText = decodeXmlEntities(withWhitespace.replace(/<[^>]+>/g, ''));
    paragraphs.push(plainText.trim());
  }

  return paragraphs.join('\n').trim();
}

/**
 * Shared implementation of the ODT and ODP extractors.
 * `label` ('ODT' or 'ODP') is only used to build the error messages.
 */
async function extractOpenDocumentText(buffer, label) {
  if (!buffer) {
    throw new Error('No buffer provided.');
  }

  let zip;
  try {
    zip = await JSZip.loadAsync(buffer);
  } catch (error) {
    throw new Error(`Failed to load ${label} buffer.`, { cause: error });
  }

  const file = zip.file('content.xml');
  if (!file) {
    throw new Error(`No content.xml found in the ${label} file.`);
  }

  try {
    const contentXml = await file.async('string');
    return extractOpenDocumentParagraphs(contentXml);
  } catch (error) {
    throw new Error(`Failed to read content.xml from the ${label} file.`, { cause: error });
  }
}

/**
 * Sort key for a slide path: the number in `slideN.xml`, or Infinity when the
 * name has no such number (those entries sort last, by path).
 */
function slideOrder(path) {
  const match = /slide(\d+)\.xml$/.exec(path);
  return match ? Number(match[1]) : Number.POSITIVE_INFINITY;
}

/**
 * Plain-text extraction for office documents.
 *
 * Every method returns a Promise that resolves to the extracted text and
 * rejects with an `Error`. Where a lower-level failure is wrapped, the
 * original error is attached as `cause`.
 */
const TextExtractionService = {
  /**
   * Picks the extractor for `bufferData.mimeType` and runs it on
   * `bufferData.arrayBuffer`.
   *
   * @param {{ mimeType: string, arrayBuffer: * }} bufferData - `arrayBuffer` is
   *   passed through unchanged to the chosen extractor.
   * @returns {Promise<string>} The extracted text. Rejections from the chosen
   *   extractor are passed through unchanged.
   */
  async extract(bufferData) {
    if (!(bufferData && bufferData.arrayBuffer)) {
      throw new Error('No buffer provided.');
    }

    const { mimeType, arrayBuffer } = bufferData;

    // Own-property check so inherited keys such as 'constructor' are not
    // mistaken for a known type.
    const fileType = Object.prototype.hasOwnProperty.call(MIME_TYPE_MAP, mimeType)
      ? MIME_TYPE_MAP[mimeType]
      : undefined;

    if (!fileType) {
      throw new Error(`Cannot support text extraction for mime type: ${mimeType}`);
    }

    if (!SUPPORTED_FILE_TYPES.includes(fileType)) {
      throw new Error(`Unsupported file type for text extraction: ${fileType}`);
    }

    let pending;
    try {
      // Deliberately not awaited here: only a synchronous dispatch failure is
      // reported as "unexpected"; the extractor's own rejection is returned as-is.
      pending = this[fileType](arrayBuffer);
    } catch (error) {
      throw new Error('Unexpected error during text extraction', { cause: error });
    }
    return pending;
  },


  /*
  ==================================================
   EXTRACTORS
  ==================================================
  */

  /**
   * Extracts the raw text of a DOCX document via mammoth's `extractRawText`.
   *
   * @param {*} buffer - Passed to mammoth as `{ arrayBuffer: buffer }`.
   * @returns {Promise<string>} The text, or '' when mammoth yields no value.
   */
  async docx(buffer) {
    if (!buffer) {
      throw new Error('No buffer provided.');
    }

    let result;
    try {
      result = await extractRawText({ arrayBuffer: buffer });
    } catch (err) {
      throw new Error(`Failed to extract DOCX text. err: ${err}`, { cause: err });
    }
    return result?.value || '';
  },

  /**
   * Extracts the text of an ODP presentation from its `content.xml`.
   *
   * @param {*} buffer - Zip data accepted by `JSZip.loadAsync`.
   * @returns {Promise<string>} One line per paragraph/heading.
   */
  odp(buffer) {
    return extractOpenDocumentText(buffer, 'ODP');
  },

  /**
   * Extracts the text of an ODT document from its `content.xml`.
   *
   * @param {*} buffer - Zip data accepted by `JSZip.loadAsync`.
   * @returns {Promise<string>} One line per paragraph/heading.
   */
  odt(buffer) {
    return extractOpenDocumentText(buffer, 'ODT');
  },

  /**
   * Extracts the text of a PPTX presentation: every `<a:t>` run of every
   * `ppt/slides/slide*` entry, one run per line (each followed by '\n').
   *
   * Slides are visited in ascending slide-file number rather than zip-entry
   * order, so slide2 precedes slide10. NOTE(review): the file number is used
   * as an approximation of presentation order — the package's own slide list
   * is not consulted.
   *
   * @param {*} buffer - Zip data accepted by `JSZip.loadAsync`.
   * @returns {Promise<string>} The concatenated run text.
   */
  async pptx(buffer) {
    if (!buffer) {
      throw new Error('No buffer provided.');
    }

    let zip;
    try {
      zip = await JSZip.loadAsync(buffer);
    } catch (error) {
      throw new Error('Failed to load PPTX buffer', { cause: error });
    }

    try {
      const slides = [];
      zip.forEach((relativePath, file) => {
        if (relativePath.startsWith('ppt/slides/slide')) {
          slides.push({ path: relativePath, order: slideOrder(relativePath), file });
        }
      });

      slides.sort((a, b) => {
        if (a.order !== b.order) {
          return a.order < b.order ? -1 : 1;
        }
        if (a.path === b.path) {
          return 0;
        }
        return a.path < b.path ? -1 : 1;
      });

      const slideContents = await Promise.all(slides.map(({ file }) => file.async('string')));

      let extractedText = '';
      for (const slideXml of slideContents) {
        for (const run of slideXml.matchAll(PPTX_TEXT_RUN_PATTERN)) {
          // A self-closing <a:t/> has no body and contributes nothing.
          if (run[1] !== undefined) {
            extractedText += `${decodeXmlEntities(run[1])}\n`;
          }
        }
      }
      return extractedText;
    } catch (error) {
      throw new Error('Failed to process slide contents', { cause: error });
    }
  },
};
  
export default TextExtractionService;
