# coding: utf-8
# See Also: https://pypi.org/project/ooo-dev-tools/
# region Imports
from __future__ import annotations
import os
from typing import Sequence, Tuple, List, overload
from xml.dom.minidom import Node, parse, Document, parseString
import urllib.request
from xml.dom.minicompat import NodeList
from ooodev.utils.table_helper import TableHelper
from ooodev.utils import lo as mLo
from ooodev.utils import file_io as mFileIO
from ooodev.utils.type_var import PathOrStr
from ooodev.utils.inst.lo.doc_type import DocTypeStr
from lxml import etree as XML_ETREE
# endregion Imports
[docs]class XML:
"""XML method used for with LibreOffice Documents"""
# region --------------- Load / Save ------------------------------
[docs] @classmethod
def load_doc(cls, fnm: PathOrStr) -> Document:
"""
Gets a document from a file
Args:
fnm (PathOrStr): XML file to load.
Raises:
Exception: if unable to open document.
Returns:
Document: XML Document.
"""
# sourcery skip: raise-specific-error
try:
pth = mFileIO.FileIO.get_absolute_path(fnm)
with open(pth) as file:
doc = parse(file)
cls._remove_whitespace(doc)
doc.normalize()
return doc
except Exception as e:
print(e)
raise Exception(f"Opening of document failed: '{fnm}'") from e
[docs] @classmethod
def url_2_doc(cls, url: str) -> Document:
"""
Gets a XML Document from remote source.
Args:
url (str): URL for a remote XML Document
Raises:
Exception: if unable to open document.
Returns:
Document: XML Document
"""
# sourcery skip: raise-specific-error
try:
with urllib.request.urlopen(url) as url_data:
doc = parseString(url_data.read().decode())
cls._remove_whitespace(doc)
doc.normalize()
return doc
except Exception as e:
print(e)
raise Exception(f"Opening of document failed: '{url}'") from e
[docs] @classmethod
def str_to_doc(cls, xml_str: str) -> Document:
"""
Gets a XML document from xml string.
Args:
xml_str (str): XML string.
Raises:
Exception: if unable to create document from xml.
Returns:
Document: XML Document on successful load; Otherwise, None.
"""
# sourcery skip: raise-specific-error
try:
doc = parseString(xml_str)
cls._remove_whitespace(doc)
doc.normalize()
return doc
except Exception as e:
print(e)
raise Exception("Error get xml document from xml string") from e
[docs] @staticmethod
def save_doc(doc: Document, xml_fnm: PathOrStr) -> None:
"""
Save doc to xml file.
Args:
doc (Document): doc to save.
xml_fnm (PathOrStr): Output file path.
Raises:
Exception: If unable to save document
"""
# sourcery skip: raise-specific-error
try:
pth = mFileIO.FileIO.get_absolute_path(xml_fnm)
with open(pth, "w") as file:
sx: str = doc.toprettyxml(indent=" ")
# remove any empty lines, there if often a lot with toprettyxml()
lines = [line for line in sx.splitlines() if line.strip() != ""]
lines.append("") # end with empty line
clean_sx = "\n".join(lines)
file.write(clean_sx)
# doc.writexml(writer=file, indent=" ")
except Exception as e:
raise Exception(f"Unable to save document to {xml_fnm}") from e
# endregion ------------ Load / Save ------------------------------
# region --------------- DOM data extraction -----------------------
[docs] @staticmethod
def get_node(tag_name: str, nodes: NodeList) -> Node | None:
"""
Gets the fist tag_name found in nodes.
Args:
tag_name (str): tag name to find in nodes.
nodes (NodeList): Nodes to search
Returns:
Node | None: First found node; Otherwise, None
"""
name = tag_name.casefold()
return next(
(node for node in nodes if node.nodeType == Node.ELEMENT_NODE and node.tagName.casefold() == name),
None,
)
# region get_node_value()
@overload
@classmethod
def get_node_value(cls, node: Node) -> str:
"""
Get the text stored in the node
Args:
node (Node): Node to get value of.
Returns:
str: Node value.
"""
...
@overload
@classmethod
def get_node_value(cls, tag_name: str, nodes: NodeList) -> str:
"""
Gets first tag_name node in the list and returns it text.
Args:
tag_name (str): tag_name to search for.
nodes (NodeList): List of nodes to search.
Returns:
str: Node value if found; Otherwise empty str.
"""
...
[docs] @classmethod
def get_node_value(cls, *args, **kwargs) -> str:
"""
Gets first ``tag_name`` node in the list and returns it text.
Args:
node (Node): Node to get value of.
tag_name (str): ``tag_name`` to search for.
nodes (NodeList): List of nodes to search.
Returns:
str: Node value if found; Otherwise empty str.
"""
ordered_keys = (1, 2)
kargs_len = len(kwargs)
count = len(args) + kargs_len
def get_kwargs() -> dict:
ka = {}
if kargs_len == 0:
return ka
valid_keys = ("tag_name", "nodes", "node")
check = all(key in valid_keys for key in kwargs)
if not check:
raise TypeError("get_node_value() got an unexpected keyword argument")
keys = ("tag_name", "node")
for key in keys:
if key in kwargs:
ka[1] = kwargs[key]
break
if count == 1:
return ka
ka[2] = kwargs.get("nodes", None)
return ka
if count not in (1, 2):
raise TypeError("get_node_value() got an invalid number of arguments")
kargs = get_kwargs()
for i, arg in enumerate(args):
kargs[ordered_keys[i]] = arg
if count == 1:
return cls._get_node_val(kargs[1])
return cls._get_node_val2(kargs[1], kargs[2])
@staticmethod
def _get_node_val(node: Node) -> str:
if node is None:
return ""
if not node.hasChildNodes():
return ""
child_nodes: NodeList = node.childNodes
return next(
(str(node.data).strip() for node in child_nodes if node.nodeType == Node.TEXT_NODE),
"",
)
@classmethod
def _get_node_val2(cls, tag_name: str, nodes: NodeList) -> str:
if nodes is None:
return ""
name = tag_name.casefold()
return next(
(cls._get_node_val(node) for node in nodes if node.nodeName.casefold() == name),
"",
)
# endregion get_node_value()
[docs] @classmethod
def get_node_values(cls, nodes: NodeList) -> Tuple[str, ...]:
"""
Gets all the node values
Args:
nodes (NodeList): Nodes to get values of.
Returns:
Tuple[str, ...]: Node Values
"""
vals = []
for node in nodes:
val = cls._get_node_val(node)
if val != "":
vals.append(val)
return tuple(vals) if vals else ()
[docs] @staticmethod
def get_node_attr(attr_name: str, node: Node) -> str:
"""
Get the named attribute value from node
Args:
attr_name (str): Attribute Name
node (Node): Node to get attribute of.
Returns:
str: Attribute value if found; Otherwise empty str.
"""
if node.attributes is None: # type: ignore
return ""
# attrs is {} if there are no attributes
attrs = dict(node.attributes.items()) # type: ignore
name = attr_name.casefold()
return next((str(v) for k, v in attrs.items() if str(k).casefold() == name), "")
[docs] @classmethod
def get_all_node_values(cls, row_nodes: NodeList, col_ids: Sequence[str]) -> List[list] | None:
"""
Gets all node values.
.. collapse:: Example XML
XML is assumed to have structure that is similar
.. include:: ../../resources/xml/pay.xml.rst
The data from a sequence of <col> becomes one row in the
generated 2D array.
The first row of the 2D array contains the col ID strings.
Args:
row_nodes (NodeList): rows
col_ids (Sequence[str]): Column ids
Returns:
List[list] | None: 2D-list of values on success; Otherwise, None
Note:
col_ids must match the column names:
``col_ids = ("purpose", "amount", "tax", "maturity")``
Results for example xml:
.. include:: ../../resources/xml/pay_all_notes_result.rst
"""
num_rows = len(row_nodes)
num_cols = len(col_ids)
if num_cols == 0 or num_rows == 0:
return None
data = TableHelper.make_2d_array(num_rows=num_rows, num_cols=num_cols)
# data = [[1] * num_cols for _ in range(num_rows + 1)]
# put column strings in first row of list
for col, _ in enumerate(col_ids):
data[0][col] = mLo.Lo.capitalize(col_ids[col])
for i, node in enumerate(row_nodes):
# extract all the column strings for ith row
col_nodes = node.childNodes
for col in range(num_cols):
data[i][col] = cls.get_node_value(col_ids[col], col_nodes)
return data
# endregion ------------ DOM data extraction -----------------------
# region ---------------- XLS transforming -------------------------
[docs] @staticmethod
def apply_xslt(xml_fnm: PathOrStr, xls_fnm: PathOrStr) -> str:
"""
Transforms xml file using XLST.
Not available in macros at this time.
Args:
xml_fnm (PathOrStr): XML source file path.
xls_fnm (PathOrStr): XSL source file path.
Raises:
Exception: If unable to apply xls
Returns:
str: String of XML that has been transformed.
"""
# sourcery skip: raise-specific-error
_xml_parser = XML_ETREE.XMLParser(remove_blank_text=True)
try:
pth_xml = mFileIO.FileIO.get_absolute_path(xml_fnm)
pth_xls = mFileIO.FileIO.get_absolute_path(xls_fnm)
print(f"Applying filter '{xls_fnm}' to '{xml_fnm}'")
dom = XML_ETREE.parse(pth_xml, parser=_xml_parser)
xslt = XML_ETREE.parse(pth_xls) # type: ignore
transform = XML_ETREE.XSLT(xslt)
new_dom = transform(dom)
return XML_ETREE.tostring(new_dom, encoding="unicode") # type: ignore # unicode produces string
except Exception as e:
raise Exception(f"Unable to transform '{xml_fnm}' with '{xls_fnm}'") from e
[docs] @staticmethod
def apply_xslt_to_str(xml_str: str, xls_fnm: PathOrStr) -> str:
"""
Transforms xml using XLST.
Not available in macros at this time.
Args:
xml_str (str): Raw XML data.
xls_fnm (PathOrStr): XSL source file path.
Raises:
Exception: If unable to apply xls
Returns:
str: String of XML that has been transformed.
"""
# sourcery skip: raise-specific-error
_xml_parser = XML_ETREE.XMLParser(remove_blank_text=True)
try:
pth = mFileIO.FileIO.get_absolute_path(xls_fnm)
print(f"Applying the filter in '{xls_fnm}'")
dom = XML_ETREE.fromstring(xml_str) # type: ignore
xslt = XML_ETREE.parse(pth, parser=_xml_parser)
transform = XML_ETREE.XSLT(xslt)
new_dom = transform(dom)
return XML_ETREE.tostring(new_dom, encoding="unicode") # type: ignore # unicode produces string
except Exception as e:
raise Exception("Unable to transform the string") from e
# endregion ------------- XLS transforming -------------------------
# region --------------- Filter ------------------------------------
[docs] @classmethod
def get_flat_filter_name(cls, doc_type: DocTypeStr) -> str:
"""
Gets the Flat XML filter name for the doc type.
Args:
doc_type (Lo.DocTypeStr): Document type.
Returns:
str: Flat XML filter name.
"""
if doc_type == DocTypeStr.WRITER:
return "OpenDocument Text Flat XML"
elif doc_type == DocTypeStr.CALC:
return "OpenDocument Spreadsheet Flat XML"
elif doc_type == DocTypeStr.DRAW:
return "OpenDocument Drawing Flat XML"
elif doc_type == DocTypeStr.IMPRESS:
return "OpenDocument Presentation Flat XML"
else:
print("No Flat XML filter for this document type; using Flat text")
return "OpenDocument Text Flat XML"
# spelling fix
# endregion ------------ Filter ------------------------------------
# region --------------- Formatting --------------------------------
# region indent()
@overload
@classmethod
def indent(cls, src: str) -> str:
"""
Indents xml
Args:
src (str): raw xml data.
Raises:
TypeError is src is not expected type
Exception: If unable to indent
Returns:
str: Indented xml as string.
"""
...
@overload
@classmethod
def indent(cls, src: os.PathLike) -> str:
"""
Indents xml
Args:
src (PathLike): xml file path.
Raises:
TypeError is src is not expected type
Exception: If unable to indent
Returns:
str: Indented xml as string.
"""
...
@overload
@classmethod
def indent(cls, src: Document) -> str:
"""
Indents xml
Args:
src (Document): xml document.
Raises:
TypeError is src is not expected type
Exception: If unable to indent
Returns:
str: Indented xml as string.
"""
...
[docs] @classmethod
def indent(cls, src: os.PathLike | str | Document) -> str:
"""
Indents xml
Args:
src (str | PathLike | Document): raw xml data or xml file path or xml document.
Raises:
TypeError is src is not expected type
Exception: If unable to indent
Returns:
str: Indented xml as string.
"""
# sourcery skip: raise-specific-error
try:
if isinstance(src, os.PathLike):
with open(mFileIO.FileIO.get_absolute_path(src), "r") as file:
doc = parse(file)
elif isinstance(src, str):
doc = parseString(src)
elif isinstance(src, Document):
# don't modify origin document
doc = parseString(src.toxml())
else:
raise TypeError(
f"src is not recognized. Expected, str, PathLike or Document. Got {type(src).__name__}"
)
cls._remove_whitespace(doc)
doc.normalize()
# To parse string instead use: dom = md.parseString(xml_string)
# remove the weird newline issue:
# should not be needed with cls._remove_whitespace(doc)
# pretty_xml = os.linesep.join([s for s in pretty_xml.splitlines() if s.strip()])
return doc.toprettyxml()
except TypeError:
raise
except Exception as e:
if isinstance(src, (str, os.PathLike)):
msg = f"Unable to indent '{src}'"
else:
msg = "Unable to indent document"
raise Exception(msg) from e
# endregion indent()
@classmethod
def _remove_whitespace(cls, node):
"""
Removes whites from xml node
Args:
node (node): xml node, or xml document
Note:
it is necessary .normalize() the document to combine adjacent text nodes.
Otherwise, you could end up with a bunch of redundant XML elements with just whitespace.
Again, recursion is the only way to visit tree elements since you can’t iterate over the
document and its elements with a loop. Finally, this should give you the expected result:
"""
# https://realpython.com/python-xml-parser/
# e.g.
# document = parse("smiley.svg")
# cls._remove_whitespace(document)
# document.normalize()
if node.nodeType == Node.TEXT_NODE and node.nodeValue.strip() == "":
node.nodeValue = ""
for child in node.childNodes:
cls._remove_whitespace(child)
# @classmethod
# def _indent(cls, elem, level=0) -> None:
# # pretty print
# # https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python
# i = "\n" + level*" "
# if len(elem):
# if not elem.text or not elem.text.strip():
# elem.text = i + " "
# if not elem.tail or not elem.tail.strip():
# elem.tail = i
# for elem in elem:
# cls._indent(elem, level+1)
# if not elem.tail or not elem.tail.strip():
# elem.tail = i
# else:
# if level and (not elem.tail or not elem.tail.strip()):
# elem.tail = i
# endregion ------------- Formatting --------------------------------