Merge pull request #61 from telefonicasc/fix/normalizer

fgalan · web-flow · commit 9b15d449dee8 · 2023-05-08T08:40:23.000+02:00
issue #54 - add normalizer class
diff --git a/python-lib/tc_etl_lib/README.md b/python-lib/tc_etl_lib/README.md
@@ -176,9 +176,14 @@ except Exception as err:
 # send entities
 cbm: tc.cb.cbManager = tc.cb.cbManager(endpoint = 'http://<cb_endpoint>:<port>')
 
+# (opcional) solo es necesario usar normalizer si los datos que se usan para
+# construir el entity id pueden contener caracteres prohibidos por NGSI
+# (acentos, paréntesis, etc)
+normalize = tc.normalizer()
+
 entities = [
             {
-                "id": "myEntity1",
+                "id": normalize("myEntity1"),
                 "type": "myType",
                 "description": {
                     "value": "My first happy entity",
@@ -194,7 +199,7 @@ entities = [
                 }
             },
             {
-                "id": "myEntity2",
+                "id": normalize("myEntity2"),
                 "type": "myType",
                 "description": {
                     "value": "My second happy entity",
@@ -325,6 +330,34 @@ La librería está creada con diferentes clases dependiendo de la funcionalidad
         - :raises [ValueError](https://docs.python.org/3/library/exceptions.html#ValueError): Se lanza cuando le falta algún argumento o inicializar alguna varibale del objeto cbManager, para poder realizar la autenticación o envío de datos.
         - :raises FetchError: Se lanza cuando el servicio de Context Broker, responde con un error concreto.
 
+- Clase `normalizer`: Esta clase se encarga de normalizar cadenas unicode, reemplazando o eliminando cualquier carácter que no sea válido como parte de un ID de entidad NGSI.
+   - `__init__`: constructor de objetos de la clase.
+      - :param opcional `replacement`: define el carácter de reemplazo que sustituirá a todos los caracteres prohibidos (`&`, `?`, `/`, `#`, `<`, `>`, `"`, `'`, `=`, `;`, `(`, `)`). Esta lista de caracteres se ha extraído de https://github.com/telefonicaid/fiware-orion/blob/master/doc/manuals/orion-api.md#general-syntax-restrictions
+      - :param opcional `override`: diccionario de pares "carácter prohibido": "carácter de reemplazo", que permite especificar un reemplazo personalizado para caracteres especiales particulares. Si se usa como carácter de reemplazo `""` o `None`, el carácter prohibido se borra en lugar de reemplazarse.
+   - `__call__`: Función que ejecuta el reemplazo de los caracteres especiales.
+      - :param obligatorio `text`: Cadena de texto a normalizar. El normalizador devuelve una nueva cadena de texto con estos cambios:
+        - Convierte los caracteres acentuados (á, é, í, ó, ú) en sus variantes sin acento.
+        - Elimina otros caracteres unicode no disponibles en ascii.
+        - Elimina códigos de control ascii.
+        - Reemplaza los caracteres prohibidos por el carácter de reemplazo (por defecto `-`, puede cambiarse con los overrides que se indican en el constructor)
+        - Reemplaza todos los espacios en blanco consecutivos por el carácter de reemplazo.
+        - NOTA: Esta función no recorta la longitud de la cadena devuelta a 256 caracteres, porque el llamante puede querer conservar la cadena entera para por ejemplo guardarla en algún otro atributo, antes de truncarla.
+
+Algunos ejemplos de uso de `normalizer`:
+
+```
+# Reemplazar los espacios por "+", al estilo "url encoding".
+# El resto de caracteres especiales, sustituirlos por el carácter
+# de reemplazo por defecto.
+norm = tc.normalizer(override={" ": "+"})
+norm("text (with spaces)") # devuelve "text+-with+spaces-"
+
+# Eliminar directamente todos los caracteres especiales,
+# dejando solo los espacios (reemplazados por "-")
+norm = tc.normalizer(replacement="", override={" ": "-"})
+norm("text (with spaces)") # devuelve "text-with-spaces"
+```
+
 La librería además proporciona [context managers](https://docs.python.org/3/reference/datamodel.html#context-managers) para abstraer la escritura de entidades en formato NGSIv2 a distintos backends (`store`s). Estos son:
 
 - `orionStore`: Genera un store asociado a una instancia particular de `cbManager` y `authManager`. Todas las entidades que se envíen a este store, se almacenarán en el cbManager correspondiente.
@@ -442,6 +475,8 @@ TOTAL                        403    221    45%
 
 ## Changelog
 
+- Add: new class `normalizer` to clean up text strings to be used as NGSI entity IDs, by replacing or removing forbidden characters ([#54](https://github.com/telefonicasc/etl-framework/pull/54))
+
 0.8.0 (March 22nd, 2023)
 
 - Add: new optional parameter called `replace_id` in sqlFileStore context manager ([#58](https://github.com/telefonicasc/etl-framework/pull/58))
diff --git a/python-lib/tc_etl_lib/tc_etl_lib/__init__.py b/python-lib/tc_etl_lib/tc_etl_lib/__init__.py
@@ -22,3 +22,4 @@
 from .auth import authManager
 from .cb import FetchError, cbManager
 from .store import Store, orionStore, sqlFileStore
+from .normalizer import normalizer
diff --git a/python-lib/tc_etl_lib/tc_etl_lib/normalizer.py b/python-lib/tc_etl_lib/tc_etl_lib/normalizer.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 Telefónica Soluciones de Informática y Comunicaciones de España, S.A.U.
+#
+# This file is part of tc_etl_lib
+#
+# tc_etl_lib is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# tc_etl_lib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+# General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with tc_etl_lib. If not, see http://www.gnu.org/licenses/.
+#
+
+import unicodedata
+import re
+
+from typing import Mapping, Optional
+
+# Matches every run of whitespace; compiled once because all normalizer
+# instances share it.
+_whitespace_re = re.compile(r"\s+")
+
+class normalizer:
+    """
+    Normalize unicode strings into valid NGSI entity IDs.
+
+    The list of forbidden characters is taken from:
+
+    https://github.com/telefonicaid/fiware-orion/blob/master/doc/manuals/orion-api.md#general-syntax-restrictions
+
+    Instances are callable. Calling one with an input string will:
+
+    - Turn accented characters (á, é, í, ó, ú) into unaccented variants.
+    - Remove any other unicode character not available in ascii.
+    - Remove ascii control codes (this includes tabs and line breaks).
+    - Replace forbidden characters '&', '?', '/', '#', '<', '>', '"', ''', '=', ';', '(', ')'
+      with the replacement character (default "-", can be changed in the constructor).
+    - Strip leading and trailing whitespace.
+    - Merge consecutive whitespace and replace it with the replacement character.
+
+    You can also set a different replacement character for a specific forbidden
+    character, by adding the translation to the `override` optional argument of the
+    constructor.
+
+    E.g. if you want to replace " " with "+", you can call:
+
+    ```
+    norm = normalizer(override={" ": "+"})
+    norm("text (with spaces)")
+    ```
+
+    And you will get `"text+-with+spaces-"`.
+
+    You can also remove a forbidden character altogether, by setting its value to
+    `None` in the `override` argument. E.g. if you want to remove parentheses,
+    you can call:
+
+    ```
+    norm = normalizer(override={"(": None, ")": None})
+    norm("text (with parentheses)")
+    ```
+
+    If you want to remove ALL special characters (except whitespace):
+
+    ```
+    norm = normalizer(replacement="", override={ " ": "-" })
+    norm("text (with & special > characters)")
+    ```
+
+    And you will get `"text-with-special-characters"`.
+
+    The result is not trimmed to 256 characters, because you might want the
+    full normalized string to store it somewhere else before truncating.
+    """
+
+    # Forbidden characters, in the order in which their replacements are validated.
+    _FORBIDDEN_CHARS = "&?/#<>\"'=;()"
+
+    def __init__(self, replacement: str = "-", override: Optional[Mapping[str, Optional[str]]] = None):
+        """
+        Set the default replacement string and the custom override mapping.
+
+        :param replacement: character that replaces every forbidden character and
+            every run of whitespace. An empty string (or None) removes them instead.
+        :param override: optional mapping from a forbidden character (or " " for
+            whitespace) to its own replacement. A value of "" or None removes that
+            character. Keys other than the forbidden characters and " " are ignored.
+        :raises ValueError: if the replacement for a forbidden character is longer
+            than one character.
+        """
+        if override is None:
+            override = {}
+        source = []
+        target = []
+        remove = []
+        for char in self._FORBIDDEN_CHARS:
+            custom = override.get(char, replacement)
+            if not custom:
+                # Empty or None replacement: delete the character instead.
+                remove.append(char)
+                continue
+            if len(custom) > 1:
+                # str.maketrans needs a 1:1 mapping between source and target.
+                raise ValueError(f"wrong override '{custom}' for char '{char}': must be a single character")
+            source.append(char)
+            target.append(custom)
+        # Whitespace is replaced by regex in __call__, not by the translation table.
+        self.space_replacement = override.get(" ", replacement) or ""
+        self.table = str.maketrans(
+            "".join(source), "".join(target), "".join(remove))
+
+    def __call__(self, text: str) -> str:
+        """
+        Normalize text to an NGSI entity ID.
+
+        :param text: string to normalize.
+        :return: a new string with accents, non-ascii characters, control codes
+            and forbidden characters removed or replaced as configured.
+        """
+        # NFD splits accented letters into base letter + combining mark; the
+        # marks (and any other non-ascii character) are then dropped.
+        ascii_text = unicodedata.normalize('NFD', text).encode('ascii', errors='ignore').decode('ascii')
+        # Unicode categories starting with "C" are control codes. Tabs and line
+        # breaks belong to them, so only plain spaces reach the whitespace step.
+        without_control_chars = "".join(ch for ch in ascii_text if unicodedata.category(ch)[0] != "C")
+        without_specials = without_control_chars.translate(self.table).strip()
+        # A callable replacement inserts the string literally; a plain string
+        # would be parsed as a template, so a backslash would raise re.error.
+        return _whitespace_re.sub(lambda _match: self.space_replacement, without_specials)
diff --git a/python-lib/tc_etl_lib/tc_etl_lib/test_normalizer.py b/python-lib/tc_etl_lib/tc_etl_lib/test_normalizer.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2023 Telefónica Soluciones de Informática y Comunicaciones de España, S.A.U.
+#
+# This file is part of tc_etl_lib
+#
+# tc_etl_lib is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# tc_etl_lib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+# General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with tc_etl_lib. If not, see http://www.gnu.org/licenses/.
+
+'''
+Normalizer tests
+'''
+
+import unittest
+from tc_etl_lib import normalizer
+
+
+class TestNormalizer(unittest.TestCase):
+    '''Tests for the normalizer class'''
+
+    # Input shared by most tests: it mixes whitespace, forbidden characters
+    # and accented letters.
+    SAMPLE = "text (with & specials) > áéíóúñ"
+
+    def do_test(self, replacement="-", override=None, text="", expected=""):
+        '''Build a normalizer with the given options and check its output for text'''
+        norm = normalizer(replacement=replacement, override=override)
+        self.assertEqual(norm(text), expected)
+
+    def test_default_behaviour(self):
+        '''Forbidden chars and spaces become "-", accents are dropped'''
+        self.do_test(
+            replacement="-",
+            override=None,
+            text=self.SAMPLE,
+            expected="text--with---specials----aeioun"
+        )
+
+    def test_different_replacement(self):
+        '''A custom replacement applies to forbidden chars and spaces'''
+        self.do_test(
+            replacement="_",
+            override=None,
+            text=self.SAMPLE,
+            expected="text__with___specials____aeioun"
+        )
+
+    def test_space_override(self):
+        '''Overriding " " only changes the whitespace replacement'''
+        self.do_test(
+            replacement="-",
+            override={" ": "+"},
+            text=self.SAMPLE,
+            expected="text+-with+-+specials-+-+aeioun"
+        )
+
+    def test_forbidden_override(self):
+        '''Overriding a forbidden char only changes that char'''
+        self.do_test(
+            replacement="-",
+            override={">": "+"},
+            text=self.SAMPLE,
+            expected="text--with---specials--+-aeioun"
+        )
+
+    def test_forbidden_remove_some(self):
+        '''A None override removes that forbidden char'''
+        self.do_test(
+            replacement="-",
+            override={
+                "(": None,
+                ")": None,
+            },
+            text=self.SAMPLE,
+            expected="text-with---specials---aeioun"
+        )
+
+    def test_space_remove(self):
+        '''A None override for " " removes whitespace'''
+        self.do_test(
+            replacement="-",
+            override={" ": None},
+            text=self.SAMPLE,
+            expected="text-with-specials--aeioun"
+        )
+
+    def test_forbidden_remove_all(self):
+        '''An empty replacement removes forbidden chars; spaces keep their override'''
+        self.do_test(
+            replacement="",
+            override={" ": "-"},
+            text=self.SAMPLE,
+            expected="text-with-specials-aeioun"
+        )
+
+    def test_remove_all(self):
+        '''An empty replacement without overrides removes forbidden chars and spaces'''
+        self.do_test(
+            replacement="",
+            override=None,
+            text=self.SAMPLE,
+            expected="textwithspecialsaeioun"
+        )
+
+    def test_invalid_override(self):
+        '''An override longer than one character for a forbidden char is rejected'''
+        with self.assertRaises(ValueError):
+            normalizer(override={"(": "--"})