Source code for ooodev_xml.odxml

# coding: utf-8
# See Also: https://pypi.org/project/ooo-dev-tools/

# region Imports
from __future__ import annotations
import os
from typing import Sequence, Tuple, List, overload
from xml.dom.minidom import Node, parse, Document, parseString
import urllib.request
from xml.dom.minicompat import NodeList
from ooodev.utils.table_helper import TableHelper
from ooodev.utils import lo as mLo
from ooodev.utils import file_io as mFileIO
from ooodev.utils.type_var import PathOrStr
from ooodev.utils.inst.lo.doc_type import DocTypeStr
from lxml import etree as XML_ETREE

# endregion Imports


[docs]class XML:
    """XML method used for with LibreOffice Documents"""

    # region --------------- Load / Save ------------------------------

[docs]    @classmethod
    def load_doc(cls, fnm: PathOrStr) -> Document:
        """
        Loads an XML document from a file.

        The file is opened in binary mode so the XML parser itself determines
        the encoding (BOM / XML declaration, UTF-8 when nothing is declared)
        instead of the bytes first being decoded with the platform's default
        text encoding.

        After parsing, whitespace-only text nodes are blanked and the document
        is normalized.

        Args:
            fnm (PathOrStr): XML file to load.

        Raises:
            Exception: if unable to open document.

        Returns:
            Document: XML Document.
        """
        # sourcery skip: raise-specific-error
        try:
            pth = mFileIO.FileIO.get_absolute_path(fnm)
            # binary mode: let the parser honor the document's own encoding
            with open(pth, "rb") as file:
                doc = parse(file)
            cls._remove_whitespace(doc)
            doc.normalize()
            return doc
        except Exception as e:
            print(e)
            raise Exception(f"Opening of document failed: '{fnm}'") from e

[docs]    @classmethod
    def url_2_doc(cls, url: str) -> Document:
        """
        Gets a XML Document from remote source.

        Args:
            url (str): URL for a remote XML Document

        Raises:
            Exception: if unable to open document.

        Returns:
            Document: XML Document
        """
        # sourcery skip: raise-specific-error
        try:
            with urllib.request.urlopen(url) as url_data:
                doc = parseString(url_data.read().decode())
            cls._remove_whitespace(doc)
            doc.normalize()
            return doc
        except Exception as e:
            print(e)
            raise Exception(f"Opening of document failed: '{url}'") from e

[docs]    @classmethod
    def str_to_doc(cls, xml_str: str) -> Document:
        """
        Gets a XML document from xml string.

        Args:
            xml_str (str): XML string.

        Raises:
            Exception: if unable to create document from xml.

        Returns:
            Document: XML Document on successful load; Otherwise, None.
        """
        # sourcery skip: raise-specific-error
        try:
            doc = parseString(xml_str)
            cls._remove_whitespace(doc)
            doc.normalize()
            return doc
        except Exception as e:
            print(e)
            raise Exception("Error get xml document from xml string") from e

[docs]    @staticmethod
    def save_doc(doc: Document, xml_fnm: PathOrStr) -> None:
        """
        Save doc to xml file.

        The document is pretty printed with a two space indent, blank lines
        are dropped and the output ends with a single newline.

        The file is always written as UTF-8. The XML declaration produced
        here carries no ``encoding`` attribute, which XML readers interpret
        as UTF-8, so writing with the platform's default encoding could
        produce a file that other parsers cannot read correctly.

        Args:
            doc (Document): doc to save.
            xml_fnm (PathOrStr): Output file path.

        Raises:
            Exception: If unable to save document
        """
        # sourcery skip: raise-specific-error
        try:
            pth = mFileIO.FileIO.get_absolute_path(xml_fnm)
            # Serialize before opening the file so that a serialization error
            # does not leave a truncated file behind.
            sx: str = doc.toprettyxml(indent="  ")
            # remove any empty lines, there are often a lot with toprettyxml()
            lines = [line for line in sx.splitlines() if line.strip()]
            lines.append("")  # end with empty line
            clean_sx = "\n".join(lines)
            with open(pth, "w", encoding="utf-8") as file:
                file.write(clean_sx)
        except Exception as e:
            raise Exception(f"Unable to save document to {xml_fnm}") from e

    # endregion ------------ Load / Save ------------------------------

    # region --------------- DOM data extraction -----------------------

[docs]    @staticmethod
    def get_node(tag_name: str, nodes: NodeList) -> Node | None:
        """
        Gets the fist tag_name found in nodes.

        Args:
            tag_name (str): tag name to find in nodes.
            nodes (NodeList): Nodes to search

        Returns:
            Node | None: First found node; Otherwise, None
        """
        name = tag_name.casefold()
        return next(
            (node for node in nodes if node.nodeType == Node.ELEMENT_NODE and node.tagName.casefold() == name),
            None,
        )

    # region    get_node_value()
    @overload
    @classmethod
    def get_node_value(cls, node: Node) -> str:
        """
        Gets the text stored in the node.

        Args:
            node (Node): Node to get value of.

        Returns:
            str: Node value.
        """
        ...

    @overload
    @classmethod
    def get_node_value(cls, tag_name: str, nodes: NodeList) -> str:
        """
        Gets the first ``tag_name`` node in the list and returns its text.

        Args:
            tag_name (str): tag_name to search for.
            nodes (NodeList): List of nodes to search.

        Returns:
            str: Node value if found; Otherwise empty str.
        """
        ...

[docs]    @classmethod
    def get_node_value(cls, *args, **kwargs) -> str:
        """
        Gets first ``tag_name`` node in the list and returns it text.

        Args:
            node (Node): Node to get value of.
            tag_name (str): ``tag_name`` to search for.
            nodes (NodeList): List of nodes to search.

        Returns:
            str: Node value if found; Otherwise empty str.
        """
        ordered_keys = (1, 2)
        kargs_len = len(kwargs)
        count = len(args) + kargs_len

        def get_kwargs() -> dict:
            ka = {}
            if kargs_len == 0:
                return ka
            valid_keys = ("tag_name", "nodes", "node")
            check = all(key in valid_keys for key in kwargs)
            if not check:
                raise TypeError("get_node_value() got an unexpected keyword argument")
            keys = ("tag_name", "node")
            for key in keys:
                if key in kwargs:
                    ka[1] = kwargs[key]
                    break
            if count == 1:
                return ka
            ka[2] = kwargs.get("nodes", None)
            return ka

        if count not in (1, 2):
            raise TypeError("get_node_value() got an invalid number of arguments")

        kargs = get_kwargs()

        for i, arg in enumerate(args):
            kargs[ordered_keys[i]] = arg

        if count == 1:
            return cls._get_node_val(kargs[1])
        return cls._get_node_val2(kargs[1], kargs[2])

    @staticmethod
    def _get_node_val(node: Node) -> str:
        if node is None:
            return ""
        if not node.hasChildNodes():
            return ""
        child_nodes: NodeList = node.childNodes
        return next(
            (str(node.data).strip() for node in child_nodes if node.nodeType == Node.TEXT_NODE),
            "",
        )

    @classmethod
    def _get_node_val2(cls, tag_name: str, nodes: NodeList) -> str:
        if nodes is None:
            return ""
        name = tag_name.casefold()
        return next(
            (cls._get_node_val(node) for node in nodes if node.nodeName.casefold() == name),
            "",
        )

    # endregion get_node_value()

[docs]    @classmethod
    def get_node_values(cls, nodes: NodeList) -> Tuple[str, ...]:
        """
        Gets all the node values

        Args:
            nodes (NodeList): Nodes to get values of.

        Returns:
            Tuple[str, ...]: Node Values
        """
        vals = []
        for node in nodes:
            val = cls._get_node_val(node)
            if val != "":
                vals.append(val)
        return tuple(vals) if vals else ()

[docs]    @staticmethod
    def get_node_attr(attr_name: str, node: Node) -> str:
        """
        Get the named attribute value from node

        Args:
            attr_name (str): Attribute Name
            node (Node): Node to get attribute of.

        Returns:
            str: Attribute value if found; Otherwise empty str.
        """
        if node.attributes is None:  # type: ignore
            return ""
        # attrs is {} if there are no attributes
        attrs = dict(node.attributes.items())  # type: ignore
        name = attr_name.casefold()
        return next((str(v) for k, v in attrs.items() if str(k).casefold() == name), "")

[docs]    @classmethod
    def get_all_node_values(cls, row_nodes: NodeList, col_ids: Sequence[str]) -> List[list] | None:
        """
        Gets all node values.

        .. collapse:: Example XML

            XML is assumed to have structure that is similar

                .. include:: ../../resources/xml/pay.xml.rst

        The data from a sequence of <col> becomes one row in the
        generated 2D array.

        The first row of the 2D array contains the col ID strings, so the
        result holds ``len(row_nodes) + 1`` rows.

        Args:
            row_nodes (NodeList): rows
            col_ids (Sequence[str]): Column ids

        Returns:
            List[list] | None: 2D-list of values on success; Otherwise, None
            (when there are no rows or no column ids).

        Note:
            col_ids must match the column names:

            ``col_ids = ("purpose", "amount", "tax", "maturity")``

            Results for example xml:

            .. include:: ../../resources/xml/pay_all_notes_result.rst
        """
        num_rows = len(row_nodes)
        num_cols = len(col_ids)
        if num_cols == 0 or num_rows == 0:
            return None
        # One extra row is needed for the header; without it the first data
        # row would be written over the header row.
        data = TableHelper.make_2d_array(num_rows=num_rows + 1, num_cols=num_cols)
        # put column strings in first row of list
        for col, col_id in enumerate(col_ids):
            data[0][col] = mLo.Lo.capitalize(col_id)

        # data rows start at index 1, directly below the header
        for i, node in enumerate(row_nodes, start=1):
            # extract all the column strings for this row
            col_nodes = node.childNodes
            for col, col_id in enumerate(col_ids):
                data[i][col] = cls.get_node_value(col_id, col_nodes)
        return data

    # endregion ------------ DOM data extraction -----------------------

    # region ---------------- XLS transforming -------------------------

[docs]    @staticmethod
    def apply_xslt(xml_fnm: PathOrStr, xls_fnm: PathOrStr) -> str:
        """
        Runs an XSLT stylesheet over an xml file and returns the result.

        The xml source is parsed with a parser created with
        ``remove_blank_text=True``.

        Not available in macros at this time.

        Args:
            xml_fnm (PathOrStr): XML source file path.
            xls_fnm (PathOrStr): XSL source file path.

        Raises:
            Exception: If unable to apply xls

        Returns:
            str: String of XML that has been transformed.
        """
        # sourcery skip: raise-specific-error
        blank_stripping_parser = XML_ETREE.XMLParser(remove_blank_text=True)

        try:
            xml_path = mFileIO.FileIO.get_absolute_path(xml_fnm)
            xsl_path = mFileIO.FileIO.get_absolute_path(xls_fnm)
            print(f"Applying filter '{xls_fnm}' to '{xml_fnm}'")
            source_tree = XML_ETREE.parse(xml_path, parser=blank_stripping_parser)
            stylesheet_tree = XML_ETREE.parse(xsl_path)  # type: ignore
            run_transform = XML_ETREE.XSLT(stylesheet_tree)
            result_tree = run_transform(source_tree)
            # encoding="unicode" makes tostring() return str
            return XML_ETREE.tostring(result_tree, encoding="unicode")  # type: ignore
        except Exception as err:
            raise Exception(f"Unable to transform '{xml_fnm}' with '{xls_fnm}'") from err

[docs]    @staticmethod
    def apply_xslt_to_str(xml_str: str, xls_fnm: PathOrStr) -> str:
        """
        Runs an XSLT stylesheet over a string of xml and returns the result.

        The stylesheet file is parsed with a parser created with
        ``remove_blank_text=True``.

        Not available in macros at this time.

        Args:
            xml_str (str): Raw XML data.
            xls_fnm (PathOrStr): XSL source file path.

        Raises:
            Exception: If unable to apply xls

        Returns:
            str: String of XML that has been transformed.
        """
        # sourcery skip: raise-specific-error
        blank_stripping_parser = XML_ETREE.XMLParser(remove_blank_text=True)

        try:
            xsl_path = mFileIO.FileIO.get_absolute_path(xls_fnm)
            print(f"Applying the filter in '{xls_fnm}'")
            source_root = XML_ETREE.fromstring(xml_str)  # type: ignore
            stylesheet_tree = XML_ETREE.parse(xsl_path, parser=blank_stripping_parser)

            run_transform = XML_ETREE.XSLT(stylesheet_tree)
            result_tree = run_transform(source_root)
            # encoding="unicode" makes tostring() return str
            return XML_ETREE.tostring(result_tree, encoding="unicode")  # type: ignore
        except Exception as err:
            raise Exception("Unable to transform the string") from err

    # endregion ------------- XLS transforming -------------------------

    # region --------------- Filter ------------------------------------

[docs]    @classmethod
    def get_flat_filter_name(cls, doc_type: DocTypeStr) -> str:
        """
        Maps a document type to the name of its Flat XML filter.

        Document types without a Flat XML filter fall back to the text
        filter, and a notice is printed.

        Args:
            doc_type (Lo.DocTypeStr): Document type.

        Returns:
            str: Flat XML filter name.
        """
        # Checked in order with ``==`` so comparison semantics stay exactly
        # those of the document type values themselves.
        flat_filters = (
            (DocTypeStr.WRITER, "OpenDocument Text Flat XML"),
            (DocTypeStr.CALC, "OpenDocument Spreadsheet Flat XML"),
            (DocTypeStr.DRAW, "OpenDocument Drawing Flat XML"),
            (DocTypeStr.IMPRESS, "OpenDocument Presentation Flat XML"),
        )
        for known_type, filter_name in flat_filters:
            if doc_type == known_type:
                return filter_name
        print("No Flat XML filter for this document type; using Flat text")
        return "OpenDocument Text Flat XML"

    # spelling fix

    # endregion ------------ Filter ------------------------------------

    # region --------------- Formatting --------------------------------

    # region    indent()
    @overload
    @classmethod
    def indent(cls, src: str) -> str:
        """
        Indents xml.

        Args:
            src (str): raw xml data.

        Raises:
            TypeError: If src is not an expected type.
            Exception: If unable to indent.

        Returns:
            str: Indented xml as string.
        """
        ...

    @overload
    @classmethod
    def indent(cls, src: os.PathLike) -> str:
        """
        Indents xml.

        Args:
            src (PathLike): xml file path.

        Raises:
            TypeError: If src is not an expected type.
            Exception: If unable to indent.

        Returns:
            str: Indented xml as string.
        """
        ...

    @overload
    @classmethod
    def indent(cls, src: Document) -> str:
        """
        Indents xml.

        Args:
            src (Document): xml document.

        Raises:
            TypeError: If src is not an expected type.
            Exception: If unable to indent.

        Returns:
            str: Indented xml as string.
        """
        ...

[docs]    @classmethod
    def indent(cls, src: os.PathLike | str | Document) -> str:
        """
        Indents xml

        Args:
            src (str | PathLike | Document): raw xml data or xml file path or xml document.

        Raises:
            TypeError is src is not expected type
            Exception: If unable to indent

        Returns:
            str: Indented xml as string.
        """
        # sourcery skip: raise-specific-error
        try:
            if isinstance(src, os.PathLike):
                with open(mFileIO.FileIO.get_absolute_path(src), "r") as file:
                    doc = parse(file)
            elif isinstance(src, str):
                doc = parseString(src)
            elif isinstance(src, Document):
                # don't modify origin document
                doc = parseString(src.toxml())
            else:
                raise TypeError(
                    f"src is not recognized. Expected, str, PathLike or Document. Got {type(src).__name__}"
                )
            cls._remove_whitespace(doc)
            doc.normalize()
            # To parse string instead use: dom = md.parseString(xml_string)
            # remove the weird newline issue:
            # should not be needed with cls._remove_whitespace(doc)
            # pretty_xml = os.linesep.join([s for s in pretty_xml.splitlines() if s.strip()])
            return doc.toprettyxml()
        except TypeError:
            raise
        except Exception as e:
            if isinstance(src, (str, os.PathLike)):
                msg = f"Unable to indent '{src}'"
            else:
                msg = "Unable to indent document"
            raise Exception(msg) from e

    # endregion indent()

    @classmethod
    def _remove_whitespace(cls, node):
        """
        Removes whites from xml node

        Args:
            node (node): xml node, or xml document

        Note:
            it is necessary .normalize() the document to combine adjacent text nodes.
            Otherwise, you could end up with a bunch of redundant XML elements with just whitespace.
            Again, recursion is the only way to visit tree elements since you can’t iterate over the
            document and its elements with a loop. Finally, this should give you the expected result:
        """
        # https://realpython.com/python-xml-parser/
        # e.g.
        # document = parse("smiley.svg")
        # cls._remove_whitespace(document)
        # document.normalize()
        if node.nodeType == Node.TEXT_NODE and node.nodeValue.strip() == "":
            node.nodeValue = ""
        for child in node.childNodes:
            cls._remove_whitespace(child)

    # @classmethod
    # def _indent(cls, elem, level=0) -> None:
    #     # pretty print
    #     # https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python
    #     i = "\n" + level*"  "
    #     if len(elem):
    #         if not elem.text or not elem.text.strip():
    #             elem.text = i + "  "
    #         if not elem.tail or not elem.tail.strip():
    #             elem.tail = i
    #         for elem in elem:
    #             cls._indent(elem, level+1)
    #         if not elem.tail or not elem.tail.strip():
    #             elem.tail = i
    #     else:
    #         if level and (not elem.tail or not elem.tail.strip()):
    #             elem.tail = i

    # endregion ------------- Formatting --------------------------------