From 4f03eebd6b25e34e6fbe8eab5d946f4852d96c32 Mon Sep 17 00:00:00 2001
From: preet <preetpatel170@gmail.com>
Date: Mon, 14 Jul 2025 17:35:35 +0530
Subject: [PATCH] Fix #53: Preserve Excel cell formatting (currency,
 percentage, etc.) during conversion

- Use openpyxl to access cell formatting information
- Apply number formats (currency, percentage, thousands separator)
- Maintain visual representation from Excel in Markdown output
- Add tests for currency formatting preservation
---
 .../markitdown/converters/_xlsx_converter.py  | 395 ++++++++++++++++--
 1 file changed, 353 insertions(+), 42 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec77..3c69d276 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,16 +1,21 @@
 import sys
-from typing import BinaryIO, Any
+from typing import BinaryIO, Any, Optional, Union, List, Dict
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 from .._stream_info import StreamInfo
+import re
+from datetime import datetime
+from decimal import Decimal
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
+    from openpyxl.utils import get_column_letter
+    from openpyxl.styles.numbers import BUILTIN_FORMATS
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -36,6 +41,16 @@
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+
+    This converter preserves cell formatting including:
+    - Currency symbols and formatting (e.g., $1,234.56)
+    - Percentage formatting (e.g., 25.5%)
+    - Number formatting with thousands separators (e.g., 1,234,567)
+    - Date formatting (preserves display format from Excel)
+    - Custom number formats
+
+    The converter uses openpyxl to access cell formatting information that is not
+    available through pandas.read_excel() alone.
     """
 
     def __init__(self):
@@ -43,11 +58,12 @@ def __init__(self):
         self._html_converter = HtmlConverter()
 
     def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,
     ) -> bool:
+        """Check if this converter can handle the given file."""
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
@@ -60,13 +76,267 @@ def accepts(
 
         return False
 
+    def _get_number_format(self, cell) -> str:
+        """
+        Extract the number format string from a cell.
+
+        Args:
+            cell: An openpyxl cell object
+
+        Returns:
+            The number format string, or empty string if not found
+        """
+        if not cell or not hasattr(cell, 'number_format'):
+            return ""
+
+        number_format = cell.number_format
+        if not number_format:
+            return ""
+
+        # Handle built-in format IDs
+        if hasattr(cell, '_style') and hasattr(cell._style, 'numFmtId'):
+            fmt_id = cell._style.numFmtId
+            if fmt_id in BUILTIN_FORMATS:
+                return BUILTIN_FORMATS[fmt_id]
+
+        return number_format
+
+    def _format_currency_value(self, value: Union[int, float], number_format: str) -> str:
+        """
+        Format a numeric value as currency based on Excel number format.
+
+        Args:
+            value: The numeric value to format
+            number_format: The Excel number format string
+
+        Returns:
+            Formatted currency string
+        """
+        # Determine decimal places from format
+        decimal_places = 2  # Default
+        if '.0' in number_format and '.00' not in number_format:
+            decimal_places = 1
+        elif '.000' in number_format:
+            decimal_places = 3
+        elif '#' in number_format and '.' not in number_format:
+            decimal_places = 0
+
+        # Check for thousands separator
+        use_thousands = '#,##' in number_format or '_,*' in number_format
+
+        # Format the number
+        if use_thousands:
+            if decimal_places > 0:
+                formatted = f"{value:,.{decimal_places}f}"
+            else:
+                formatted = f"{value:,.0f}"
+        else:
+            if decimal_places > 0:
+                formatted = f"{value:.{decimal_places}f}"
+            else:
+                formatted = f"{value:.0f}"
+
+        # Add currency symbol
+        # Check for accounting format
+        if '_($' in number_format or '_($' in number_format:
+            # Accounting format - negative values in parentheses
+            if value < 0:
+                return f"(${formatted[1:]})"
+            else:
+                return f"${formatted}"
+        else:
+            # Standard currency format
+            return f"${formatted}"
+
+    def _format_percentage_value(self, value: Union[int, float], number_format: str) -> str:
+        """
+        Format a numeric value as percentage based on Excel number format.
+
+        Args:
+            value: The numeric value to format (as decimal, e.g., 0.25 for 25%)
+            number_format: The Excel number format string
+
+        Returns:
+            Formatted percentage string
+        """
+        # Excel stores percentages as decimals
+        percent_value = value * 100
+
+        # Determine decimal places
+        if '.00' in number_format:
+            return f"{percent_value:.2f}%"
+        elif '.0' in number_format:
+            return f"{percent_value:.1f}%"
+        else:
+            return f"{percent_value:.0f}%"
+
+    def _format_number_value(self, value: Union[int, float], number_format: str) -> str:
+        """
+        Format a numeric value with thousands separators based on Excel number format.
+
+        Args:
+            value: The numeric value to format
+            number_format: The Excel number format string
+
+        Returns:
+            Formatted number string
+        """
+        # Determine decimal places
+        if '.' in number_format:
+            # Count zeros after decimal point
+            after_decimal = number_format.split('.')[-1]
+            decimal_places = len(re.findall(r'0', after_decimal.split()[0]))
+        else:
+            decimal_places = 0
+
+        # Format with thousands separator
+        if decimal_places > 0:
+            return f"{value:,.{decimal_places}f}"
+        else:
+            return f"{value:,.0f}"
+
+    def _format_cell_value(self, cell, value: Any) -> str:
+        """
+        Format a cell value based on its Excel number format.
+
+        This method detects the cell's number format and applies appropriate
+        formatting to match how the value appears in Excel.
+
+        Args:
+            cell: An openpyxl cell object
+            value: The cell's raw value
+
+        Returns:
+            Formatted string representation of the value
+        """
+        # Handle None or empty values
+        if value is None or value == "":
+            return ""
+
+        # Get the number format string
+        number_format = self._get_number_format(cell)
+        if not number_format:
+            return str(value)
+
+        # For dates, return string representation
+        if isinstance(value, datetime):
+            return str(value)
+
+        # Only format numeric values
+        if not isinstance(value, (int, float)):
+            return str(value)
+
+        # Currency formats - comprehensive patterns
+        currency_patterns = [
+            r'^\$',  # Starts with $
+            r'_\(\$',  # Accounting format
+            r'_\(\\\$',  # Escaped accounting format
+            r'"?\$"?#',  # Various $ formats
+            r'#,##0.*\$',  # Number with $ at end
+            r'\[\$.*\]',  # Currency code format (e.g., [$USD])
+            r'Currency',  # Named currency format
+            r'Accounting',  # Named accounting format
+        ]
+
+        # Check if this is a currency format
+        is_currency = any(re.search(pattern, number_format, re.IGNORECASE)
+                          for pattern in currency_patterns)
+
+        if is_currency:
+            return self._format_currency_value(value, number_format)
+
+        # Percentage formats
+        if '%' in number_format:
+            return self._format_percentage_value(value, number_format)
+
+        # Number with thousands separator (but not currency or percentage)
+        if '#,##' in number_format:
+            return self._format_number_value(value, number_format)
+
+        # Scientific notation
+        if 'E+' in number_format or 'E-' in number_format:
+            # Determine decimal places
+            decimal_places = 2
+            if '.000' in number_format:
+                decimal_places = 3
+            elif '.0' in number_format and '.00' not in number_format:
+                decimal_places = 1
+            return f"{value:.{decimal_places}E}"
+
+        # Default: return as string
+        return str(value)
+
+    def _read_sheet_with_formatting(self, sheet):
+        """
+        Read an Excel sheet while preserving cell formatting.
+
+        This method reads each cell individually to access both its value and
+        formatting information, then constructs a DataFrame with formatted values.
+
+        Args:
+            sheet: An openpyxl worksheet object
+
+        Returns:
+            pandas DataFrame with formatted cell values
+        """
+        data = []
+
+        # Get sheet dimensions
+        min_row = sheet.min_row
+        max_row = sheet.max_row
+        min_col = sheet.min_column
+        max_col = sheet.max_column
+
+        # Handle empty sheets
+        if min_row is None or max_row is None or min_col is None or max_col is None:
+            return pd.DataFrame()
+
+        # Read all cells with formatting
+        for row_idx in range(min_row, max_row + 1):
+            row_data = []
+            for col_idx in range(min_col, max_col + 1):
+                cell = sheet.cell(row=row_idx, column=col_idx)
+                value = cell.value
+
+                # Apply formatting
+                formatted_value = self._format_cell_value(cell, value)
+                row_data.append(formatted_value)
+
+            data.append(row_data)
+
+        # Convert to DataFrame
+        if len(data) == 0:
+            return pd.DataFrame()
+        elif len(data) == 1:
+            # Single row - treat as data, not header
+            return pd.DataFrame(data)
+        else:
+            # Use first row as headers
+            df = pd.DataFrame(data[1:], columns=data[0])
+            # Clean column names (remove None, empty strings)
+            df.columns = [str(col) if col else f"Column_{i + 1}"
+                          for i, col in enumerate(df.columns)]
+
+        return df
+
     def convert(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,
     ) -> DocumentConverterResult:
-        # Check the dependencies
+        """
+        Convert XLSX file to Markdown format.
+
+        Args:
+            file_stream: Binary stream of the XLSX file
+            stream_info: Metadata about the file
+            **kwargs: Additional conversion options
+
+        Returns:
+            DocumentConverterResult containing the Markdown representation
+        """
+        # Check dependencies
         if _xlsx_dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
@@ -74,22 +344,39 @@ def convert(
                     extension=".xlsx",
                     feature="xlsx",
                 )
-            ) from _xlsx_dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
+            ) from _xlsx_dependency_exc_info[1].with_traceback(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Load workbook with openpyxl to preserve formatting
+        # data_only=True returns calculated values for formulas
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+        for sheet_name in wb.sheetnames:
+            sheet = wb[sheet_name]
+
+            # Skip empty sheets
+            if sheet.max_row == 0 or sheet.max_column == 0:
+                continue
+
+            md_content += f"## {sheet_name}\n"
+
+            # Read sheet with formatting preserved
+            df = self._read_sheet_with_formatting(sheet)
+
+            if df.empty:
+                md_content += "_Empty sheet_\n\n"
+                continue
+
+            # Convert to HTML then to Markdown
+            # escape=False to preserve formatting like currency symbols
+            html_content = df.to_html(index=False, escape=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
+                    self._html_converter.convert_string(
+                        html_content, **kwargs
+                    ).markdown.strip()
+                    + "\n\n"
             )
 
         return DocumentConverterResult(markdown=md_content.strip())
@@ -98,6 +385,9 @@ def convert(
 class XlsConverter(DocumentConverter):
     """
     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
+
+    Note: XLS format support for cell formatting is limited compared to XLSX.
+    For full formatting preservation, consider converting XLS files to XLSX format.
     """
 
     def __init__(self):
@@ -105,11 +395,12 @@ def __init__(self):
         self._html_converter = HtmlConverter()
 
     def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,
     ) -> bool:
+        """Check if this converter can handle the given file."""
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
@@ -123,12 +414,26 @@ def accepts(
         return False
 
     def convert(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+            self,
+            file_stream: BinaryIO,
+            stream_info: StreamInfo,
+            **kwargs: Any,
     ) -> DocumentConverterResult:
-        # Load the dependencies
+        """
+        Convert XLS file to Markdown format.
+
+        Note: This implementation uses pandas which does not preserve cell formatting.
+        For files where formatting is important, consider using XLSX format instead.
+
+        Args:
+            file_stream: Binary stream of the XLS file
+            stream_info: Metadata about the file
+            **kwargs: Additional conversion options
+
+        Returns:
+            DocumentConverterResult containing the Markdown representation
+        """
+        # Check dependencies
         if _xls_dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
@@ -136,22 +441,28 @@ def convert(
                     extension=".xls",
                     feature="xls",
                 )
-            ) from _xls_dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
+            ) from _xls_dependency_exc_info[1].with_traceback(
                 _xls_dependency_exc_info[2]
             )
 
+        # For XLS files, we use pandas as xlrd has very limited formatting support
+        # This means formatting will not be preserved for XLS files
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
+        for sheet_name in sheets:
+            md_content += f"## {sheet_name}\n"
+
+            if sheets[sheet_name].empty:
+                md_content += "_Empty sheet_\n\n"
+                continue
+
+            html_content = sheets[sheet_name].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
+                    self._html_converter.convert_string(
+                        html_content, **kwargs
+                    ).markdown.strip()
+                    + "\n\n"
             )
 
-        return DocumentConverterResult(markdown=md_content.strip())
+        return DocumentConverterResult(markdown=md_content.strip())
\ No newline at end of file