Module gamslib.objectcsv.dublincore

Dublin Core metadata access for GAMS objects.

Provides the DublinCore class for parsing DC.xml files and accessing metadata elements with language preference and fallback logic. Includes utility methods for retrieving element values as lists or strings, and for removing linebreaks from text.

Features

  • Parse DC.xml and extract Dublin Core elements with language support.
  • Retrieve metadata values in preferred language, with fallback to alternatives.
  • Utility for joining multiple values and formatting rights statements.
  • Configurable lookup order for language fallback.

Classes

class DublinCore (path: pathlib._local.Path,
lookup_order: tuple = ('en', 'de', 'fr', 'es', 'it'))
Expand source code
class DublinCore:
    """Represents data from DC.xml and provides methods to access it."""

    UNSPECIFIED_LANG = "unspecified"

    def __init__(
        self, path: Path, lookup_order: tuple = ("en", "de", "fr", "es", "it")
    ):
        """
        Initialize and parse the DC.xml file.

        Args:
            path (Path): Path to the DC.xml file.
            lookup_order (tuple): Preferred language order for fallback.
        """
        self.path: Path = path
        self.lookup_order: list[str] = list(lookup_order)
        self._data: dict[str, Any] = {}  # [element][lang] = text
        self._parse(path)

    def _parse(self, path: Path):
        tree = ET.parse(path)
        root = tree.getroot()

        for elem in DC_ELEMENTS:
            for child in root.findall(f"dc:{elem}", namespaces=NAMESPACES):
                lang = child.attrib.get(
                    f"{{{NAMESPACES['xml']}}}lang", self.UNSPECIFIED_LANG
                )
                element = self._data.get(elem, {})
                values = element.get(lang, [])
                if child.text is not None:
                    values.append(child.text)
                element[lang] = values
                self._data[elem] = element
        # TODO: Add DC_TERMS and DCMI_TYPES?

    @classmethod
    def remove_linebreaks(cls, text: str) -> str:
        """
        Remove linebreaks from a string.

        Args:
            text (str): The string to remove linebreaks from.

        Returns:
            str: The string without linebreaks.
        """
        return re.sub(r"[\r\n]+", " ", text).strip()

    def get_en_element(self, name: str, default="") -> list[str]:
        """
        Return the value(s) of a Dublin Core element in English.

        Args:
            name (str): The name of the element without namespace (e.g. "title").
            default (str): Default value if element is missing.

        Returns:
            list[str]: The value(s) of the element as a list of strings.

        Raises:
            ValueError: If the element name is not a valid Dublin Core element.
        """
        if name not in DC_ELEMENTS:
            raise ValueError(f"Element {name} is not a Dublin Core element.")

        values = self._data[name].get("en", [])
        if not values and default != "":
            values = [default]
        return [self.remove_linebreaks(value) for value in values]

    def get_en_element_as_str(self, name: str, default="") -> str:
        """
        Return the joined value(s) of a Dublin Core element in English.

        Args:
            name (str): The name of the element without namespace (e.g. "title").
            default (str): Default value if element is missing.

        Returns:
            str: The joined value(s) of the element as a string. Multiple values are separated by ';'.

        Raises:
            ValueError: If the element name is not a valid Dublin Core element.
        """
        return "; ".join(self.get_en_element(name, default=default))

    def get_element(
        self, name: str, preferred_lang: str = "en", default: str = ""
    ) -> list[str]:
        """
        Return the value(s) of a Dublin Core element as a list of strings.

        Args:
            name (str): The name of the element without namespace (e.g. "title").
            preferred_lang (str): The preferred language of the element (e.g. "de").
            default (str): The default value to return if no value is found.

        Returns:
            list[str]: The value(s) of the element as a list of strings.

        Raises:
            ValueError: If the element name is not a valid Dublin Core element.

        Notes:
            - If no entry in the preferred language is available, the function will search
              for entries in another language, depending on the lookup_order set during
              object creation. If no entry is found with a specified language, the function
              checks for an entry with no 'xml:lang' attribute. If still no value is found,
              the default value will be returned (as a list).
        """
        if name not in DC_ELEMENTS:
            raise ValueError(f"Element {name} is not a Dublin Core element.")
        # element not in DC.xml
        if name not in self._data:
            logger.debug(
                "Element '%s{name}' not found in %s{self.path}. Returning default value: [%s]",
                name,
                self.path,
                default,
            )
            return [default]

        # search for an entry in desired language
        if preferred_lang not in self._data[name]:
            # search for another language in defined lookup order
            alternative_lang = self.UNSPECIFIED_LANG
            for lang in self.lookup_order:
                if lang in self._data[name]:
                    alternative_lang = lang
                    break
            if alternative_lang == self.UNSPECIFIED_LANG:
                # no entry for any lang in lookup_order, so we use the first entry without lang
                logger.debug(
                    "Preferred language '%s{preferred_lang}' not found in %s{self.path}. "
                    "Using first entry without xml:lang attribute instead.",
                    preferred_lang,
                    self.path,
                )
            # we found an alternative lang
            else:
                logger.debug(
                    "Preferred language '%s{preferred_lang}' not found in %s{self.path}. "
                    "Using value for language '%s{alternative_lang}' instead.",
                    preferred_lang,
                    self.path,
                    alternative_lang,
                )
            preferred_lang = alternative_lang

        return [
            self.remove_linebreaks(value) for value in self._data[name][preferred_lang]
        ]

    def get_element_as_str(
        self, name: str, preferred_lang: str = "en", default: str = ""
    ) -> str:
        """
        Return the value(s) of a Dublin Core element as a string.

        Args:
            name (str): The name of the element without namespace.
            preferred_lang (str): The preferred language of the element.
            default (str): The default value to return if no value is found.

        Returns:
            str: The value(s) as a single string. For 'rights', formats as "name (url)" if two values are present; otherwise, values are joined with ';'.
        """
        values = self.get_element(name, preferred_lang, default)
        if name == "rights":
            # we expect the licence name first, followed by the url in brackets
            str_value = values[0] if len(values) == 1 else f"{values[0]} ({values[1]})"
        else:
            str_value = "; ".join(values)
        return str_value

Represents data from DC.xml and provides methods to access it.

Initialize and parse the DC.xml file.

Args

path : Path
Path to the DC.xml file.
lookup_order : tuple
Preferred language order for fallback.

Class variables

var UNSPECIFIED_LANG

The type of the None singleton.

Static methods

def remove_linebreaks(text: str) ‑> str

Remove linebreaks from a string.

Args

text : str
The string to remove linebreaks from.

Returns

str
The string without linebreaks.

Methods

def get_element(self, name: str, preferred_lang: str = 'en', default: str = '') ‑> list[str]
Expand source code
def get_element(
    self, name: str, preferred_lang: str = "en", default: str = ""
) -> list[str]:
    """
    Return the value(s) of a Dublin Core element as a list of strings.

    Args:
        name (str): The name of the element without namespace (e.g. "title").
        preferred_lang (str): The preferred language of the element (e.g. "de").
        default (str): The default value to return if no value is found.

    Returns:
        list[str]: The value(s) of the element as a list of strings.

    Raises:
        ValueError: If the element name is not a valid Dublin Core element.

    Notes:
        - If no entry in the preferred language is available, the function will search
          for entries in another language, depending on the lookup_order set during
          object creation. If no entry is found with a specified language, the function
          checks for an entry with no 'xml:lang' attribute. If still no value is found,
          the default value will be returned (as a list).
    """
    if name not in DC_ELEMENTS:
        raise ValueError(f"Element {name} is not a Dublin Core element.")
    # element not in DC.xml
    if name not in self._data:
        logger.debug(
            "Element '%s{name}' not found in %s{self.path}. Returning default value: [%s]",
            name,
            self.path,
            default,
        )
        return [default]

    # search for an entry in desired language
    if preferred_lang not in self._data[name]:
        # search for another language in defined lookup order
        alternative_lang = self.UNSPECIFIED_LANG
        for lang in self.lookup_order:
            if lang in self._data[name]:
                alternative_lang = lang
                break
        if alternative_lang == self.UNSPECIFIED_LANG:
            # no entry for any lang in lookup_order, so we use the first entry without lang
            logger.debug(
                "Preferred language '%s{preferred_lang}' not found in %s{self.path}. "
                "Using first entry without xml:lang attribute instead.",
                preferred_lang,
                self.path,
            )
        # we found an alternative lang
        else:
            logger.debug(
                "Preferred language '%s{preferred_lang}' not found in %s{self.path}. "
                "Using value for language '%s{alternative_lang}' instead.",
                preferred_lang,
                self.path,
                alternative_lang,
            )
        preferred_lang = alternative_lang

    return [
        self.remove_linebreaks(value) for value in self._data[name][preferred_lang]
    ]

Return the value(s) of a Dublin Core element as a list of strings.

Args

name : str
The name of the element without namespace (e.g. "title").
preferred_lang : str
The preferred language of the element (e.g. "de").
default : str
The default value to return if no value is found.

Returns

list[str]
The value(s) of the element as a list of strings.

Raises

ValueError
If the element name is not a valid Dublin Core element.

Notes

  • If no entry in the preferred language is available, the function will search for entries in another language, depending on the lookup_order set during object creation. If no entry is found with a specified language, the function checks for an entry with no 'xml:lang' attribute. If still no value is found, the default value will be returned (as a list).
def get_element_as_str(self, name: str, preferred_lang: str = 'en', default: str = '') ‑> str
Expand source code
def get_element_as_str(
    self, name: str, preferred_lang: str = "en", default: str = ""
) -> str:
    """
    Return the value(s) of a Dublin Core element as a string.

    Args:
        name (str): The name of the element without namespace.
        preferred_lang (str): The preferred language of the element.
        default (str): The default value to return if no value is found.

    Returns:
        str: The value(s) as a single string. For 'rights', formats as "name (url)" if two values are present; otherwise, values are joined with ';'.
    """
    values = self.get_element(name, preferred_lang, default)
    if name == "rights":
        # we expect the licence name first, followed by the url in brackets
        str_value = values[0] if len(values) == 1 else f"{values[0]} ({values[1]})"
    else:
        str_value = "; ".join(values)
    return str_value

Return the value(s) of a Dublin Core element as a string.

Args

name : str
The name of the element without namespace.
preferred_lang : str
The preferred language of the element.
default : str
The default value to return if no value is found.

Returns

str
The value(s) as a single string. For 'rights', formats as "name (url)" if two values are present; otherwise, values are joined with ';'.
def get_en_element(self, name: str, default='') ‑> list[str]
Expand source code
def get_en_element(self, name: str, default="") -> list[str]:
    """
    Return the value(s) of a Dublin Core element in English.

    Args:
        name (str): The name of the element without namespace (e.g. "title").
        default (str): Default value if element is missing.

    Returns:
        list[str]: The value(s) of the element as a list of strings.

    Raises:
        ValueError: If the element name is not a valid Dublin Core element.
    """
    if name not in DC_ELEMENTS:
        raise ValueError(f"Element {name} is not a Dublin Core element.")

    values = self._data[name].get("en", [])
    if not values and default != "":
        values = [default]
    return [self.remove_linebreaks(value) for value in values]

Return the value(s) of a Dublin Core element in English.

Args

name : str
The name of the element without namespace (e.g. "title").
default : str
Default value if element is missing.

Returns

list[str]
The value(s) of the element as a list of strings.

Raises

ValueError
If the element name is not a valid Dublin Core element.
def get_en_element_as_str(self, name: str, default='') ‑> str
Expand source code
def get_en_element_as_str(self, name: str, default="") -> str:
    """
    Return the joined value(s) of a Dublin Core element in English.

    Args:
        name (str): The name of the element without namespace (e.g. "title").
        default (str): Default value if element is missing.

    Returns:
        str: The joined value(s) of the element as a string. Multiple values are separated by ';'.

    Raises:
        ValueError: If the element name is not a valid Dublin Core element.
    """
    return "; ".join(self.get_en_element(name, default=default))

Return the joined value(s) of a Dublin Core element in English.

Args

name : str
The name of the element without namespace (e.g. "title").
default : str
Default value if element is missing.

Returns

str
The joined value(s) of the element as a string. Multiple values are separated by ';'.

Raises

ValueError
If the element name is not a valid Dublin Core element.